Index: sys/kern/subr_trap.c =================================================================== --- sys/kern/subr_trap.c +++ sys/kern/subr_trap.c @@ -140,6 +140,16 @@ #ifdef HWPMC_HOOKS if (PMC_THREAD_HAS_SAMPLES(td)) PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL); +#endif +#ifdef TCPHPTS + /* + * @gallatin is adamant that this needs to go here, I + * am not so sure. Running hpts is a lot like + * a lro_flush() that happens while a user process + * is running. But he may know best so I will go + * with his view of accounting. :-) + */ + tcp_run_hpts(); #endif /* * Let the scheduler adjust our priority etc. Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -258,6 +258,7 @@ volatile uint32_t inp_in_input; /* on input hpts (lock b) */ #endif volatile uint16_t inp_hpts_cpu; /* Lock (i) */ + volatile uint16_t inp_irq_cpu; /* Set by LRO on behalf of, or by, the driver */ u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2*/ @@ -266,7 +267,8 @@ inp_input_cpu_set : 1, /* on input hpts (i) */ inp_hpts_calls :1, /* (i) from output hpts */ inp_input_calls :1, /* (i) from input hpts */ - inp_spare_bits2 : 4; + inp_irq_cpu_set :1, /* (i) from LRO/Driver */ + inp_spare_bits2 : 3; uint8_t inp_numa_domain; /* numa domain */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ Index: sys/netinet/tcp_hpts.h =================================================================== --- sys/netinet/tcp_hpts.h +++ sys/netinet/tcp_hpts.h @@ -44,7 +44,7 @@ TAILQ_HEAD(hptsh, inpcb); /* Number of useconds in a hpts tick */ -#define HPTS_TICKS_PER_USEC 10 +#define HPTS_TICKS_PER_SLOT 10 #define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 @@ -56,7 +56,7 @@ uint32_t p_nxt_slot; /* bbr->flex1 x */ uint32_t p_cur_slot; /* bbr->flex2 x */ uint32_t p_prev_slot; /* bbr->delivered */ - uint32_t p_runningtick; /* bbr->inflight */ + uint32_t p_runningslot; /* bbr->inflight */ uint32_t slot_req; /* bbr->flex3 x */ uint32_t inp_hptsslot; /* bbr->flex4 x */ uint32_t slot_remaining; /* bbr->flex5 x */ @@ -64,8 +64,8 @@ uint32_t hpts_sleep_time; /* bbr->applimited x */ uint32_t yet_to_sleep; /* bbr->lt_epoch x */ uint32_t need_new_to; /* bbr->flex6 x */ - uint32_t wheel_tick; /* bbr->bw_inuse x */ - uint32_t maxticks; /* bbr->delRate x */ + uint32_t wheel_slot; /* bbr->bw_inuse x */ + uint32_t maxslots; /* bbr->delRate x */ uint32_t wheel_cts; /* bbr->rttProp x */ int32_t co_ret; /* bbr->pkts_out x */ uint32_t p_curtick; /* upper bbr->cur_del_rate */ @@ -83,16 +83,20 @@ #define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ #define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) +#define DEFAULT_CONNECTION_THRESHOLD 100 + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ + struct timeval p_mysleep; /* Our min sleep time */ + uint64_t syscall_cnt; + uint64_t sleeping; /* What the actual sleep was (if sleeping) */ uint16_t p_hpts_active; /* Flag that says hpts is awake */ - uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */ uint8_t p_wheel_complete; /* have we completed the wheel arc walk?
*/ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ - uint32_t p_runningtick; /* Current tick we are at if we are running */ + uint32_t p_runningslot; /* Current slot we are at if we are running */ uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of @@ -101,7 +105,8 @@ uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ p_on_min_sleep:1, /* boolean */ - p_avail:6; + p_hpts_wake_scheduled:1, /* boolean */ + p_avail:5; uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; @@ -109,8 +114,6 @@ /* Hptsi wheel */ struct hptsh *p_hptss; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ - uint32_t hit_no_enobuf; - uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ @@ -134,6 +137,7 @@ struct tcp_hptsi { struct proc *rp_proc; /* Process structure for hpts */ struct tcp_hpts_entry **rp_ent; /* Array of hptss */ + uint32_t *cts_last_ran; uint32_t rp_num_hptss; /* Number of hpts threads */ }; @@ -155,10 +159,37 @@ * be sent when a TCB is still around must be * sent from a routine like tcp_respond(). */ +#define LOWEST_SLEEP_ALLOWED 50 #define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep * this determines min granularity of the - * hpts. If 0, granularity is 10useconds at - * the cost of more CPU (context switching). */ + * hpts. If 1, granularity is 10 useconds at + * the cost of more CPU (context switching). + * Note do not set this to 0. + */ +#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP +#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */ +/* Number of connections at which we start aligning to the cpu from syscalls */ +#define OLDEST_THRESHOLD 1200 +/* Thresholds for raising/lowering sleep */ +#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */ +#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */ +/** + * + * Dynamic adjustment of sleeping times is done in "new" mode + * where we are depending on syscall returns and lro returns + * to push hpts forward mainly and the timer is only a backstop. + * + * When we are in the "new" mode i.e. conn_cnt > conn_cnt_thresh + * then we do a dynamic adjustment on the time we sleep. + * Our threshold is the lateness (in ticks) of the first client + * served. If that lateness is greater than + * ticks_indicate_less_sleep (1000 ticks, i.e. 10ms of work), the + * actual sleep time is cut in half, bounded below by + * dynamic_min_sleep. If it is less than + * ticks_indicate_more_sleep (100 ticks or 1000 usecs), the sleep + * time is doubled, bounded above by dynamic_max_sleep. + * + */ + + #ifdef _KERNEL #define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED) struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp); @@ -215,43 +246,61 @@ void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line); #define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__) +void tcp_run_hpts(void); + +uint16_t hpts_random_cpu(struct inpcb *inp); + extern int32_t tcp_min_hptsi_time; -static __inline uint32_t -tcp_tv_to_hptstick(struct timeval *sv) -{ - return ((sv->tv_sec * 100000) + (sv->tv_usec / 10)); -} +#endif /* _KERNEL */ +/* + * The following functions should also be available + * to userspace as well.
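+ * + * As a hypothetical sanity check (illustrative values only, assuming the + * default HPTS_TICKS_PER_SLOT of 10): with tv = { .tv_sec = 1, .tv_usec = 25 }, + * tcp_tv_to_hptstick() returns 100002 (100000 ticks per second plus 25 / 10), + * tcp_tv_to_usectick() and tcp_tv_to_lusectick() return 1000025, and + * tcp_tv_to_mssectick() returns 1000.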
*/ static __inline uint32_t -tcp_gethptstick(struct timeval *sv) +tcp_tv_to_hptstick(const struct timeval *sv) { - struct timeval tv; - - if (sv == NULL) - sv = &tv; - microuptime(sv); - return (tcp_tv_to_hptstick(sv)); + return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT)); } static __inline uint32_t -tcp_tv_to_usectick(struct timeval *sv) +tcp_tv_to_usectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); } static __inline uint32_t -tcp_tv_to_mssectick(struct timeval *sv) +tcp_tv_to_mssectick(const struct timeval *sv) { return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC))); } +static __inline uint64_t +tcp_tv_to_lusectick(const struct timeval *sv) +{ + return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec)); +} + +#ifdef _KERNEL + static __inline void tcp_hpts_unlock(struct tcp_hpts_entry *hpts) { mtx_unlock(&hpts->p_mtx); } +static __inline uint32_t +tcp_gethptstick(struct timeval *sv) +{ + struct timeval tv; + + if (sv == NULL) + sv = &tv; + microuptime(sv); + return (tcp_tv_to_hptstick(sv)); +} + static __inline uint32_t tcp_get_usecs(struct timeval *tv) { Index: sys/netinet/tcp_hpts.c =================================================================== --- sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -193,23 +193,29 @@ #else static int tcp_bind_threads = 2; #endif -TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); - +static int tcp_use_irq_cpu = 0; static struct tcp_hptsi tcp_pace; +static uint32_t *cts_last_ran; static int hpts_does_tp_logging = 0; +static int hpts_use_assigned_cpu = 1; +static int32_t hpts_uses_oldest = OLDEST_THRESHOLD; -static void tcp_wakehpts(struct tcp_hpts_entry *p); -static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); -static void tcp_hptsi(struct tcp_hpts_entry *hpts); +static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; -static int32_t tcp_hpts_callout_skip_swi = 0; +static int conn_cnt_thresh = DEFAULT_CONNECTION_THRESHOLD; +static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP; +static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP; + + SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Hpts controls"); +SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "TCP Hpts statistics"); #define timersub(tvp, uvp, vvp) \ do { \ @@ -230,44 +236,92 @@ struct hpts_domain_info hpts_domains[MAXMEMDOM]; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, - &tcp_hpts_precision, 120, - "Value for PRE() precision of callout"); - counter_u64_t hpts_hopelessly_behind; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD, &hpts_hopelessly_behind, "Number of times hpts could not catch up and was behind hopelessly"); counter_u64_t hpts_loops; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); counter_u64_t back_tosleep; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts
found no tcbs"); counter_u64_t combined_wheel_wrap; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); counter_u64_t wheel_wrap; -SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD, +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD, &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); -static int32_t out_ts_percision = 0; +counter_u64_t hpts_direct_call; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD, + &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry"); + +counter_u64_t hpts_wake_timeout; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD, + &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring"); + +counter_u64_t hpts_direct_awakening; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD, + &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, - &out_ts_percision, 0, - "Do we use a percise timestamp for every output cts"); +counter_u64_t hpts_back_tosleep; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD, + &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work"); + +counter_u64_t cpu_uses_flowid; +counter_u64_t cpu_uses_random; + +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD, + &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field"); +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD, + &cpu_uses_random, "Number of times when setting cpuid we used the a random value"); + +TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); +TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD, + &tcp_bind_threads, 2, + "Thread Binding tunable"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD, + &tcp_use_irq_cpu, 0, + "Use of irq CPU tunable"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, + &tcp_hpts_precision, 120, + "Value for PRE() precision of callout"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW, + &conn_cnt_thresh, 0, + "How many connections (below) make us use the callout based mechanism"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, &hpts_does_tp_logging, 0, "Do we add to any tp that has logging on pacer logs"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW, + &hpts_use_assigned_cpu, 0, + "Do we start any hpts timer on the assigned cpu?"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW, + &hpts_uses_oldest, OLDEST_THRESHOLD, + "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW, + &dynamic_min_sleep, 250, + "What is the dynamic minsleep value?"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW, + &dynamic_max_sleep, 5000, + "What is the dynamic maxsleep value?"); + + + + static int32_t max_pacer_loops = 10; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, 
loopmax, CTLFLAG_RW, @@ -287,7 +341,7 @@ new = hpts_sleep_max; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { - if ((new < (NUM_OF_HPTSI_SLOTS / 4)) || + if ((new < dynamic_min_sleep) || (new > HPTS_MAX_SLEEP_ALLOWED)) error = EINVAL; else @@ -296,26 +350,60 @@ return (error); } +static int +sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = tcp_min_hptsi_time; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if (new < LOWEST_SLEEP_ALLOWED) + error = EINVAL; + else + tcp_min_hptsi_time = new; + } + return (error); +} + SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &hpts_sleep_max, 0, &sysctl_net_inet_tcp_hpts_max_sleep, "IU", "Maximum time hpts will sleep"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, +SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep, + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &tcp_min_hptsi_time, 0, + &sysctl_net_inet_tcp_hpts_min_sleep, "IU", "The minimum time the hpts must sleep before processing more slots"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, - &tcp_hpts_callout_skip_swi, 0, - "Do we have the callout call directly to the hpts?"); +static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP; +static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP; +static int tcp_hpts_no_wake_over_thresh = 1; + +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW, + &ticks_indicate_more_sleep, 0, + "If we only process this many or less on a timeout, we need longer sleep on the next callout"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW, + &ticks_indicate_less_sleep, 0, + "If we process this many or more on a timeout, we need less sleep on the next callout"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW, + &tcp_hpts_no_wake_over_thresh, 0, + "When we are over the threshold on the pacer do we prohibit wakeups?"); static void tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, - int ticks_to_run, int idx) + int slots_to_run, int idx, int from_callout) { union tcp_log_stackspecific log; - + /* + * Unused logs are + * 64 bit - delRate, rttProp, bw_inuse + * 16 bit - cwnd_gain + * 8 bit - bbr_state, bbr_substate, inhpts, ininput; + */ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = hpts->p_nxt_slot; log.u_bbr.flex2 = hpts->p_cur_slot; @@ -323,8 +411,9 @@ log.u_bbr.flex4 = idx; log.u_bbr.flex5 = hpts->p_curtick; log.u_bbr.flex6 = hpts->p_on_queue_cnt; - log.u_bbr.use_lt_bw = 1; - log.u_bbr.inflight = ticks_to_run; + log.u_bbr.flex7 = hpts->p_cpu; + log.u_bbr.flex8 = (uint8_t)from_callout; + log.u_bbr.inflight = slots_to_run; log.u_bbr.applimited = hpts->overidden_sleep; log.u_bbr.delivered = hpts->saved_curtick; log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); @@ -332,7 +421,9 @@ log.u_bbr.lt_epoch = hpts->saved_prev_slot; log.u_bbr.pkts_out = hpts->p_delayed_by; log.u_bbr.lost = hpts->p_hpts_sleep_time; - log.u_bbr.cur_del_rate = hpts->p_runningtick; + log.u_bbr.pacing_gain = hpts->p_cpu; + log.u_bbr.pkt_epoch = hpts->p_runningslot; + log.u_bbr.use_lt_bw = 1; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, @@ -341,47 +432,40 @@ } static void -hpts_timeout_swi(void *arg) +tcp_wakehpts(struct tcp_hpts_entry *hpts) { - struct tcp_hpts_entry *hpts; + HPTS_MTX_ASSERT(hpts); - hpts = (struct tcp_hpts_entry *)arg; - 
swi_sched(hpts->ie_cookie, 0); + if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) { + hpts->p_direct_wake = 0; + return; + } + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); + } } static void -hpts_timeout_dir(void *arg) +hpts_timeout_swi(void *arg) { - tcp_hpts_thread(arg); + struct tcp_hpts_entry *hpts; + + hpts = (struct tcp_hpts_entry *)arg; + swi_sched(hpts->ie_cookie, 0); } static inline void hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_hpts_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if (inp->inp_in_hpts == 0) { - /* We are not on the hpts? */ - panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp)); TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; - if (hpts->p_on_queue_cnt < 0) { - /* Count should not go negative .. */ -#ifdef INVARIANTS - panic("Hpts goes negative inp:%p hpts:%p", - inp, hpts); -#endif - hpts->p_on_queue_cnt = 0; - } + KASSERT(hpts->p_on_queue_cnt >= 0, + ("Hpts goes negative inp:%p hpts:%p", + inp, hpts)); if (clear) { inp->inp_hpts_request = 0; inp->inp_in_hpts = 0; @@ -391,20 +475,13 @@ static inline void hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_hpts_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if ((noref == 0) && (inp->inp_in_hpts == 1)) { - /* We are already on the hpts? */ - panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, + ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) || + ((noref == 0) && (inp->inp_in_hpts == 0)), + ("%s: hpts:%p inp:%p already on the hpts?", + __FUNCTION__, hpts, inp)); TAILQ_INSERT_TAIL(head, inp, inp_hpts); inp->inp_in_hpts = 1; hpts->p_on_queue_cnt++; @@ -416,37 +493,20 @@ static inline void hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_input_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if (inp->inp_in_input == 0) { - /* We are not on the input hpts? 
*/ - panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, + ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(inp->inp_in_input != 0, + ("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp)); TAILQ_REMOVE(&hpts->p_input, inp, inp_input); hpts->p_on_inqueue_cnt--; - if (hpts->p_on_inqueue_cnt < 0) { -#ifdef INVARIANTS - panic("Hpts in goes negative inp:%p hpts:%p", - inp, hpts); -#endif - hpts->p_on_inqueue_cnt = 0; - } -#ifdef INVARIANTS - if (TAILQ_EMPTY(&hpts->p_input) && - (hpts->p_on_inqueue_cnt != 0)) { - /* We should not be empty with a queue count */ - panic("%s hpts:%p in_hpts input empty but cnt:%d", - __FUNCTION__, hpts, hpts->p_on_inqueue_cnt); - } -#endif + KASSERT(hpts->p_on_inqueue_cnt >= 0, + ("Hpts in goes negative inp:%p hpts:%p", + inp, hpts)); + KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + ("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch", + __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); if (clear) inp->inp_in_input = 0; } @@ -454,46 +514,17 @@ static inline void hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) { -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx) == 0) { - /* We don't own the mutex? */ - panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); - } - if (hpts->p_cpu != inp->inp_input_cpu) { - /* It is not the right cpu/mutex? */ - panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); - } - if (inp->inp_in_input == 1) { - /* We are already on the input hpts? */ - panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp); - } -#endif + HPTS_MTX_ASSERT(hpts); + KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, + ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp)); + KASSERT(inp->inp_in_input == 0, + ("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp)); TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); inp->inp_in_input = 1; hpts->p_on_inqueue_cnt++; in_pcbref(inp); } -static void -tcp_wakehpts(struct tcp_hpts_entry *hpts) -{ - HPTS_MTX_ASSERT(hpts); - if (hpts->p_hpts_wake_scheduled == 0) { - hpts->p_hpts_wake_scheduled = 1; - swi_sched(hpts->ie_cookie, 0); - } -} - -static void -tcp_wakeinput(struct tcp_hpts_entry *hpts) -{ - HPTS_MTX_ASSERT(hpts); - if (hpts->p_hpts_wake_scheduled == 0) { - hpts->p_hpts_wake_scheduled = 1; - swi_sched(hpts->ie_cookie, 0); - } -} - struct tcp_hpts_entry * tcp_cur_hpts(struct inpcb *inp) { @@ -514,12 +545,9 @@ again: hpts_num = inp->inp_hpts_cpu; hpts = tcp_pace.rp_ent[hpts_num]; -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_hpts_cpu) { mtx_unlock(&hpts->p_mtx); @@ -537,12 +565,9 @@ again: hpts_num = inp->inp_input_cpu; hpts = tcp_pace.rp_ent[hpts_num]; -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); mtx_lock(&hpts->p_mtx); if (hpts_num != inp->inp_input_cpu) { mtx_unlock(&hpts->p_mtx); @@ -555,6 +580,7 @@ 
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { int32_t add_freed; + int32_t ret; if (inp->inp_flags2 & INP_FREED) { /* @@ -567,26 +593,11 @@ add_freed = 0; } #ifndef INP_REF_DEBUG - if (in_pcbrele_wlocked(inp)) { - /* - * This should not happen. We have the inpcb referred to by - * the main socket (why we are called) and the hpts. It - * should always return 0. - */ - panic("inpcb:%p release ret 1", - inp); - } + ret = in_pcbrele_wlocked(inp); #else - if (__in_pcbrele_wlocked(inp, line)) { - /* - * This should not happen. We have the inpcb referred to by - * the main socket (why we are called) and the hpts. It - * should always return 0. - */ - panic("inpcb:%p release ret 1", - inp); - } + ret = __in_pcbrele_wlocked(inp, line); #endif + KASSERT(ret != 1, ("inpcb:%p release ret 1", inp)); if (add_freed) { inp->inp_flags2 |= INP_FREED; } @@ -642,73 +653,76 @@ } static inline int -hpts_tick(uint32_t wheel_tick, uint32_t plus) +hpts_slot(uint32_t wheel_slot, uint32_t plus) { /* * Given a slot on the wheel, what slot * is that plus ticks out? */ - KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick)); - return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS); + KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid slot %u not on wheel", wheel_slot)); + return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS); } static inline int tick_to_wheel(uint32_t cts_in_wticks) { /* - * Given a timestamp in wheel ticks (10usec inc's) - * map it to our limited space wheel. + * Given a timestamp in ticks (to get it back + * to real time, multiply by 10 by default, + * i.e. by the number of ticks in a slot), + * map it onto our limited space wheel. */ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); } static inline int -hpts_ticks_diff(int prev_tick, int tick_now) +hpts_slots_diff(int prev_slot, int slot_now) { /* - * Given two ticks that are someplace + * Given two slots that are someplace * on our wheel. How far are they apart? */ - if (tick_now > prev_tick) - return (tick_now - prev_tick); - else if (tick_now == prev_tick) + if (slot_now > prev_slot) + return (slot_now - prev_slot); + else if (slot_now == prev_slot) /* * Special case, same means we can go all of our * wheel less one slot. */ return (NUM_OF_HPTSI_SLOTS - 1); else - return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now); + return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now); } /* - * Given a tick on the wheel that is the current time - * mapped to the wheel (wheel_tick), what is the maximum + * Given a slot on the wheel that is the current time + * mapped to the wheel (wheel_slot), what is the maximum * distance forward that can be obtained without - * wrapping past either prev_tick or running_tick + * wrapping past either prev_slot or running_slot * depending on the htps state? Also if passed - * a uint32_t *, fill it with the tick location. + * a uint32_t *, fill it with the slot location. * * Note if you do not give this function the current - * time (that you think it is) mapped to the wheel + * time (that you think it is) mapped to the wheel slot * then the results will not be what you expect and * could lead to invalid inserts.
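+ * + * For example (hypothetical numbers): if the hpts is asleep with + * p_prev_slot == 100 and the caller's current time maps to + * wheel_slot == 110, then dis_to_travel is 10, *target_slot is set + * to 99 (one before p_prev_slot), and NUM_OF_HPTSI_SLOTS - 10 slots + * are reported available.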
*/ static inline int32_t -max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick) +max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot) { - uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel; + uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel; if ((hpts->p_hpts_active == 1) && (hpts->p_wheel_complete == 0)) { - end_tick = hpts->p_runningtick; + end_slot = hpts->p_runningslot; /* Back up one tick */ - if (end_tick == 0) - end_tick = NUM_OF_HPTSI_SLOTS - 1; + if (end_slot == 0) + end_slot = NUM_OF_HPTSI_SLOTS - 1; else - end_tick--; - if (target_tick) - *target_tick = end_tick; + end_slot--; + if (target_slot) + *target_slot = end_slot; } else { /* * For the case where we are @@ -718,26 +732,26 @@ * prev tick and subtract one from it. This puts us * as far out as possible on the wheel. */ - end_tick = hpts->p_prev_slot; - if (end_tick == 0) - end_tick = NUM_OF_HPTSI_SLOTS - 1; + end_slot = hpts->p_prev_slot; + if (end_slot == 0) + end_slot = NUM_OF_HPTSI_SLOTS - 1; else - end_tick--; - if (target_tick) - *target_tick = end_tick; + end_slot--; + if (target_slot) + *target_slot = end_slot; /* * Now we have close to the full wheel left minus the * time it has been since the pacer went to sleep. Note * that wheel_tick, passed in, should be the current time * from the perspective of the caller, mapped to the wheel. */ - if (hpts->p_prev_slot != wheel_tick) - dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + if (hpts->p_prev_slot != wheel_slot) + dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot); else dis_to_travel = 1; /* * dis_to_travel in this case is the space from when the - * pacer stopped (p_prev_slot) and where our wheel_tick + * pacer stopped (p_prev_slot) and where our wheel_slot * is now. To know how many slots we can put it in we * subtract from the wheel size. We would not want * to place something after p_prev_slot or it will @@ -746,21 +760,21 @@ return (NUM_OF_HPTSI_SLOTS - dis_to_travel); } /* - * So how many slots are open between p_runningtick -> p_cur_slot + * So how many slots are open between p_runningslot -> p_cur_slot * that is what is currently un-available for insertion. Special * case when we are at the last slot, this gets 1, so that * the answer to how many slots are available is all but 1. */ - if (hpts->p_runningtick == hpts->p_cur_slot) + if (hpts->p_runningslot == hpts->p_cur_slot) dis_to_travel = 1; else - dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); /* * How long has the pacer been running? */ - if (hpts->p_cur_slot != wheel_tick) { + if (hpts->p_cur_slot != wheel_slot) { /* The pacer is a bit late */ - pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick); + pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot); } else { /* The pacer is right on time, now == pacers start time */ pacer_to_now = 0; @@ -774,24 +788,24 @@ /* * Now how many of those we will eat due to the pacer's * time (p_cur_slot) of start being behind the - * real time (wheel_tick)? + * real time (wheel_slot)? */ if (avail_on_wheel <= pacer_to_now) { /* * Wheel wrap, we can't fit on the wheel, that * is unusual the system must be way overloaded! - * Insert into the assured tick, and return special + * Insert into the assured slot, and return special * "0". 
*/ counter_u64_add(combined_wheel_wrap, 1); - *target_tick = hpts->p_nxt_slot; + *target_slot = hpts->p_nxt_slot; return (0); } else { /* * We know how many slots are open * on the wheel (the reverse of what * is left to run. Take away the time - * the pacer started to now (wheel_tick) + * the pacer started to now (wheel_slot) * and that tells you how many slots are * open that can be inserted into that won't * be touched by the pacer until later. @@ -815,7 +829,7 @@ * A sleeping hpts we want in next slot to run * note that in this state p_prev_slot == p_cur_slot */ - inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1); + inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1); if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) need_wake = 1; } else if ((void *)inp == hpts->p_inp) { @@ -827,7 +841,7 @@ */ inp->inp_hptsslot = hpts->p_nxt_slot; } else - inp->inp_hptsslot = hpts->p_runningtick; + inp->inp_hptsslot = hpts->p_runningslot; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); if (need_wake) { /* @@ -862,9 +876,9 @@ * Sanity checks for the pacer with invariants * on insert. */ - if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS) - panic("hpts:%p inp:%p slot:%d > max", - hpts, inp, inp_hptsslot); + KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS, + ("hpts:%p inp:%p slot:%d > max", + hpts, inp, inp_hptsslot)); if ((hpts->p_hpts_active) && (hpts->p_wheel_complete == 0)) { /* @@ -875,17 +889,16 @@ */ int distance, yet_to_run; - distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot); - if (hpts->p_runningtick != hpts->p_cur_slot) - yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot); + if (hpts->p_runningslot != hpts->p_cur_slot) + yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot); else yet_to_run = 0; /* processing last slot */ - if (yet_to_run > distance) { - panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", - hpts, inp, inp_hptsslot, - distance, yet_to_run, - hpts->p_runningtick, hpts->p_cur_slot); - } + KASSERT(yet_to_run <= distance, + ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", + hpts, inp, inp_hptsslot, + distance, yet_to_run, + hpts->p_runningslot, hpts->p_cur_slot)); } } #endif @@ -895,8 +908,9 @@ struct hpts_diag *diag, struct timeval *tv) { uint32_t need_new_to = 0; - uint32_t wheel_cts, last_tick; - int32_t wheel_tick, maxticks; + uint32_t wheel_cts; + int32_t wheel_slot, maxslots, last_slot; + int cpu; int8_t need_wakeup = 0; HPTS_MTX_ASSERT(hpts); @@ -904,7 +918,7 @@ memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; diag->p_prev_slot = hpts->p_prev_slot; - diag->p_runningtick = hpts->p_runningtick; + diag->p_runningslot = hpts->p_runningslot; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; diag->p_curtick = hpts->p_curtick; @@ -913,131 +927,120 @@ diag->p_on_min_sleep = hpts->p_on_min_sleep; diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if (inp->inp_in_hpts == 0) { - if (slot == 0) { - /* Immediate */ - tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); - return; - } - /* Get the current time relative to the wheel */ - wheel_cts = tcp_tv_to_hptstick(tv); - /* Map it onto the wheel */ - wheel_tick = tick_to_wheel(wheel_cts); - /* Now what's the max we can place it at? 
*/ - maxticks = max_ticks_available(hpts, wheel_tick, &last_tick); - if (diag) { - diag->wheel_tick = wheel_tick; - diag->maxticks = maxticks; - diag->wheel_cts = wheel_cts; + KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp)); + if (slot == 0) { + /* Immediate */ + tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); + return; + } + /* Get the current time relative to the wheel */ + wheel_cts = tcp_tv_to_hptstick(tv); + /* Map it onto the wheel */ + wheel_slot = tick_to_wheel(wheel_cts); + /* Now what's the max we can place it at? */ + maxslots = max_slots_available(hpts, wheel_slot, &last_slot); + if (diag) { + diag->wheel_slot = wheel_slot; + diag->maxslots = maxslots; + diag->wheel_cts = wheel_cts; + } + if (maxslots == 0) { + /* The pacer is in a wheel wrap behind, yikes! */ + if (slot > 1) { + /* + * Reduce by 1 to prevent a forever loop in + * case something else is wrong. Note this + * probably does not hurt because the pacer, + * if this is true, is so far behind we will be + * > 1 second late calling anyway. + */ + slot--; } - if (maxticks == 0) { - /* The pacer is in a wheel wrap behind, yikes! */ - if (slot > 1) { - /* - * Reduce by 1 to prevent a forever loop in - * case something else is wrong. Note this - * probably does not hurt because the pacer - * if its true is so far behind we will be - * > 1second late calling anyway. - */ - slot--; - } - inp->inp_hptsslot = last_tick; - inp->inp_hpts_request = slot; - } else if (maxticks >= slot) { - /* It all fits on the wheel */ - inp->inp_hpts_request = 0; - inp->inp_hptsslot = hpts_tick(wheel_tick, slot); - } else { - /* It does not fit */ - inp->inp_hpts_request = slot - maxticks; - inp->inp_hptsslot = last_tick; + inp->inp_hptsslot = last_slot; + inp->inp_hpts_request = slot; + } else if (maxslots >= slot) { + /* It all fits on the wheel */ + inp->inp_hpts_request = 0; + inp->inp_hptsslot = hpts_slot(wheel_slot, slot); + } else { + /* It does not fit */ + inp->inp_hpts_request = slot - maxslots; + inp->inp_hptsslot = last_slot; + } + if (diag) { + diag->slot_remaining = inp->inp_hpts_request; + diag->inp_hptsslot = inp->inp_hptsslot; + } +#ifdef INVARIANTS + check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); +#endif + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); + if ((hpts->p_hpts_active == 0) && + (inp->inp_hpts_request == 0) && + (hpts->p_on_min_sleep == 0)) { + /* + * The hpts is sleeping and NOT on a minimum + * sleep time, we need to figure out where + * it will wake up at and if we need to reschedule + * its time-out. + */ + uint32_t have_slept, yet_to_sleep; + + /* Now do we need to restart the hpts's timer?
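+ * For example (hypothetical numbers): if the hpts planned to sleep + * for 1000 slots (p_hpts_sleep_time) and have_slept works out to + * 400, then yet_to_sleep is 600; a new entry needing service in + * 100 slots (slot == 100) triggers the reschedule below, since + * waiting 600 slots would oversleep it.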
*/ + have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot); + if (have_slept < hpts->p_hpts_sleep_time) + yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; + else { + /* We are over-due */ + yet_to_sleep = 0; + need_wakeup = 1; } if (diag) { - diag->slot_remaining = inp->inp_hpts_request; - diag->inp_hptsslot = inp->inp_hptsslot; + diag->have_slept = have_slept; + diag->yet_to_sleep = yet_to_sleep; } -#ifdef INVARIANTS - check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); -#endif - hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); - if ((hpts->p_hpts_active == 0) && - (inp->inp_hpts_request == 0) && - (hpts->p_on_min_sleep == 0)) { + if (yet_to_sleep && + (yet_to_sleep > slot)) { /* - * The hpts is sleeping and not on a minimum - * sleep time, we need to figure out where - * it will wake up at and if we need to reschedule - * its time-out. + * We need to reschedule the hpts's time-out. */ - uint32_t have_slept, yet_to_sleep; - - /* Now do we need to restart the hpts's timer? */ - have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); - if (have_slept < hpts->p_hpts_sleep_time) - yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; - else { - /* We are over-due */ - yet_to_sleep = 0; - need_wakeup = 1; - } - if (diag) { - diag->have_slept = have_slept; - diag->yet_to_sleep = yet_to_sleep; - } - if (yet_to_sleep && - (yet_to_sleep > slot)) { - /* - * We need to reschedule the hpts's time-out. - */ - hpts->p_hpts_sleep_time = slot; - need_new_to = slot * HPTS_TICKS_PER_USEC; - } + hpts->p_hpts_sleep_time = slot; + need_new_to = slot * HPTS_TICKS_PER_SLOT; } - /* - * Now how far is the hpts sleeping to? if active is 1, its - * up and ticking we do nothing, otherwise we may need to - * reschedule its callout if need_new_to is set from above. - */ - if (need_wakeup) { - hpts->p_direct_wake = 1; - tcp_wakehpts(hpts); - if (diag) { - diag->need_new_to = 0; - diag->co_ret = 0xffff0000; - } - } else if (need_new_to) { - int32_t co_ret; - struct timeval tv; - sbintime_t sb; + } + /* + * Now how far is the hpts sleeping to? if active is 1, its + * up and ticking we do nothing, otherwise we may need to + * reschedule its callout if need_new_to is set from above. + */ + if (need_wakeup) { + hpts->p_direct_wake = 1; + tcp_wakehpts(hpts); + if (diag) { + diag->need_new_to = 0; + diag->co_ret = 0xffff0000; + } + } else if (need_new_to) { + int32_t co_ret; + struct timeval tv; + sbintime_t sb; - tv.tv_sec = 0; - tv.tv_usec = 0; - while (need_new_to > HPTS_USEC_IN_SEC) { - tv.tv_sec++; - need_new_to -= HPTS_USEC_IN_SEC; - } - tv.tv_usec = need_new_to; - sb = tvtosbt(tv); - if (tcp_hpts_callout_skip_swi == 0) { - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_swi, hpts, hpts->p_cpu, - (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); - } else { - co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, - hpts_timeout_dir, hpts, - hpts->p_cpu, - C_PREL(tcp_hpts_precision)); - } - if (diag) { - diag->need_new_to = need_new_to; - diag->co_ret = co_ret; - } + tv.tv_sec = 0; + tv.tv_usec = 0; + while (need_new_to > HPTS_USEC_IN_SEC) { + tv.tv_sec++; + need_new_to -= HPTS_USEC_IN_SEC; + } + tv.tv_usec = need_new_to; + sb = tvtosbt(tv); + cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? 
hpts->p_cpu : curcpu; + co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, + hpts_timeout_swi, hpts, cpu, + (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); + if (diag) { + diag->need_new_to = need_new_to; + diag->co_ret = co_ret; } - } else { -#ifdef INVARIANTS - panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp); -#endif } } @@ -1066,6 +1069,7 @@ __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } + int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { @@ -1076,18 +1080,20 @@ /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); retval = 1; - if (hpts->p_hpts_active == 0) { + if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ /* * Activate the hpts if it is sleeping. */ retval = 2; hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } - } else if (hpts->p_hpts_active == 0) { + } else if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ retval = 4; hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } return (retval); } @@ -1115,22 +1121,24 @@ if (inp->inp_in_input == 0) { /* Ok we need to set it on the hpts in the current slot */ hpts_sane_input_insert(hpts, inp, line); - if (hpts->p_hpts_active == 0) { + if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ /* * Activate the hpts if it is sleeping. */ hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } - } else if (hpts->p_hpts_active == 0) { + } else if ((hpts->p_hpts_active == 0) && + (hpts->p_on_min_sleep == 0)){ hpts->p_direct_wake = 1; - tcp_wakeinput(hpts); + tcp_wakehpts(hpts); } inp->inp_hpts_drop_reas = reason; mtx_unlock(&hpts->p_mtx); } -static uint16_t +uint16_t hpts_random_cpu(struct inpcb *inp){ /* * No flow type set distribute the load randomly. @@ -1149,18 +1157,19 @@ } /* Nothing set use a random number */ ran = arc4random(); - cpuid = (ran & 0xffff) % mp_ncpus; + cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss); return (cpuid); } static uint16_t -hpts_cpuid(struct inpcb *inp) +hpts_cpuid(struct inpcb *inp, int *failed) { u_int cpuid; #if !defined(RSS) && defined(NUMA) struct hpts_domain_info *di; #endif + *failed = 0; /* * If one has been set use it i.e. we want both in and out on the * same hpts. @@ -1170,6 +1179,17 @@ } else if (inp->inp_hpts_cpu_set) { return (inp->inp_hpts_cpu); } + /* + * If we are using the irq cpu set by LRO or + * the driver then it overrides all other domains. + */ + if (tcp_use_irq_cpu) { + if (inp->inp_irq_cpu_set == 0) { + *failed = 1; + return(0); + } + return(inp->inp_irq_cpu); + } /* If one is set the other must be the same */ #ifdef RSS cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); @@ -1183,9 +1203,10 @@ * unknown cpuids to curcpu. Not the best, but apparently better * than defaulting to swi 0. */ - - if (inp->inp_flowtype == M_HASHTYPE_NONE) + if (inp->inp_flowtype == M_HASHTYPE_NONE) { + counter_u64_add(cpu_uses_random, 1); return (hpts_random_cpu(inp)); + } /* * Hash to a thread based on the flowid. If we are using numa, * then restrict the hash to the numa domain where the inp lives. 
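To make the selection order in hpts_cpuid() above concrete, the sketch below is a rough userland rendering of the non-RSS fallback it reduces to; example_hpts_cpuid(), has_flowid, and ncpus are hypothetical stand-ins (not kernel names) for the M_HASHTYPE_NONE check and mp_ncpus, and the result only applies when no inp_hpts_cpu/inp_input_cpu pin exists and no inp_irq_cpu overrides it via tcp_use_irq_cpu:

#include <stdint.h>
#include <stdlib.h>	/* arc4random() on FreeBSD */

/* Hypothetical sketch of the fallback mapping, not the kernel code. */
static uint16_t
example_hpts_cpuid(uint32_t flowid, int has_flowid, uint16_t ncpus)
{

	if (has_flowid == 0)
		return ((uint16_t)(arc4random() % ncpus));	/* the cpu_uses_random path */
	return ((uint16_t)(flowid % ncpus));			/* the cpu_uses_flowid path */
}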
@@ -1197,7 +1218,7 @@ } else #endif cpuid = inp->inp_flowid % mp_ncpus; - + counter_u64_add(cpu_uses_flowid, 1); return (cpuid); #endif } @@ -1323,7 +1344,7 @@ kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) { if (inp->inp_in_input) tcp_hpts_remove(inp, HPTS_REMOVE_INPUT); dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); @@ -1357,23 +1378,51 @@ } static void -tcp_hptsi(struct tcp_hpts_entry *hpts) +tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt) +{ + uint32_t t = 0, i, fnd = 0; + + if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { + /* + * Find next slot that is occupied and use that to + * be the sleep time. + */ + for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { + if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { + fnd = 1; + break; + } + t = (t + 1) % NUM_OF_HPTSI_SLOTS; + } + KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt)); + hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); + } else { + /* No one on the wheel sleep for all but 400 slots or sleep max */ + hpts->p_hpts_sleep_time = hpts_sleep_max; + } +} + +static int32_t +tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout) { struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; - int32_t ticks_to_run, i, error; + uint64_t total_slots_processed = 0; + int32_t slots_to_run, i, error; int32_t paced_cnt = 0; int32_t loop_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; int32_t wrap_loop_cnt = 0; + int32_t slot_pos_of_endpoint = 0; + int32_t orig_exit_slot; int16_t set_cpu; + int8_t completed_measure = 0, seen_endpoint = 0; HPTS_MTX_ASSERT(hpts); NET_EPOCH_ASSERT(); - /* record previous info for any logging */ hpts->saved_lasttick = hpts->p_lasttick; hpts->saved_curtick = hpts->p_curtick; @@ -1382,7 +1431,8 @@ hpts->p_lasttick = hpts->p_curtick; hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); + orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); if ((hpts->p_on_queue_cnt == 0) || (hpts->p_lasttick == hpts->p_curtick)) { /* @@ -1396,8 +1446,9 @@ again: hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); - ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot); - if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) && + slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot); + if (((hpts->p_curtick - hpts->p_lasttick) > + ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) && (hpts->p_on_queue_cnt != 0)) { /* * Wheel wrap is occuring, basically we @@ -1416,8 +1467,8 @@ * first slot at the head. */ wrap_loop_cnt++; - hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1); - hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2); + hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1); + hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2); /* * Adjust p_cur_slot to be where we are starting from * hopefully we will catch up (fat chance if something @@ -1438,58 +1489,61 @@ * INP lock and the pacer mutex to change the inp_hptsslot. 
*/ TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) { - inp->inp_hptsslot = hpts->p_runningtick; + inp->inp_hptsslot = hpts->p_runningslot; } #endif - TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick], + TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot], &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts); - ticks_to_run = NUM_OF_HPTSI_SLOTS - 1; + slots_to_run = NUM_OF_HPTSI_SLOTS - 1; counter_u64_add(wheel_wrap, 1); } else { /* - * Nxt slot is always one after p_runningtick though + * Nxt slot is always one after p_runningslot though * its not used usually unless we are doing wheel wrap. */ hpts->p_nxt_slot = hpts->p_prev_slot; - hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1); - } -#ifdef INVARIANTS - if (TAILQ_EMPTY(&hpts->p_input) && - (hpts->p_on_inqueue_cnt != 0)) { - panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1); } -#endif + KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + ("%s hpts:%p in_hpts cnt:%d and queue state mismatch", + __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); HPTS_MTX_ASSERT(hpts); if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); - for (i = 0; i < ticks_to_run; i++) { + for (i = 0; i < slots_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there - * was not any (i.e. if ticks_to_run == 1, no delay). + * was not any (i.e. if slots_to_run == 1, no delay). */ - hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; + hpts->p_delayed_by = (slots_to_run - (i + 1)) * HPTS_TICKS_PER_SLOT; HPTS_MTX_ASSERT(hpts); - while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { + while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) { + HPTS_MTX_ASSERT(hpts); /* For debugging */ + if (seen_endpoint == 0) { + seen_endpoint = 1; + orig_exit_slot = slot_pos_of_endpoint = hpts->p_runningslot; + } else if (completed_measure == 0) { + /* Record the new position */ + orig_exit_slot = hpts->p_runningslot; + } + total_slots_processed++; hpts->p_inp = inp; paced_cnt++; -#ifdef INVARIANTS - if (hpts->p_runningtick != inp->inp_hptsslot) { - panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", - hpts, inp, hpts->p_runningtick, inp->inp_hptsslot); - } -#endif + KASSERT(hpts->p_runningslot == inp->inp_hptsslot, + ("Hpts:%p inp:%p slot mis-aligned %u vs %u", + hpts, inp, hpts->p_runningslot, inp->inp_hptsslot)); /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } - hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0); - if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { + hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningslot], 0); + if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; @@ -1501,22 +1555,22 @@ * Push him back on the wheel or run it * depending. */ - uint32_t maxticks, last_tick, remaining_slots; + uint32_t maxslots, last_slot, remaining_slots; - remaining_slots = ticks_to_run - (i + 1); + remaining_slots = slots_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* * How far out can we go? 
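+ * For example (hypothetical numbers): an inp that still owes + * inp_hpts_request == 250000 slots when only maxslots == 90000 are + * open is parked at last_slot with its request cut to 160000, and + * later wheel passes work off the remainder.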
*/ - maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick); - if (maxticks >= inp->inp_hpts_request) { + maxslots = max_slots_available(hpts, hpts->p_cur_slot, &last_slot); + if (maxslots >= inp->inp_hpts_request) { /* we can place it finally to be processed */ - inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request); + inp->inp_hptsslot = hpts_slot(hpts->p_runningslot, inp->inp_hpts_request); inp->inp_hpts_request = 0; } else { /* Work off some more time */ - inp->inp_hptsslot = last_tick; - inp->inp_hpts_request-= maxticks; + inp->inp_hptsslot = last_slot; + inp->inp_hpts_request -= maxslots; } hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1); hpts->p_inp = NULL; @@ -1542,12 +1596,9 @@ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out_now: -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; @@ -1582,7 +1633,7 @@ #endif /* Lets do any logging that we might want to */ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { - tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i); + tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout); } /* * There is a hole here, we get the refcnt on the * tp in hpts_remove_hpts_ref, wait no, copy: we get the refcnt on the * inp so it will still be preserved but to * sure we can get at it when we are done we need * to do a couple of things. First if it is a dropped inp we * fini gets the lock first we are assured of having * a sane INP we can lock and test. */ -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx before tcp-output:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to tcp_output call line:%d", + hpts, __LINE__)); + if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } @@ -1653,20 +1702,27 @@ CURVNET_RESTORE(); #endif INP_UNLOCK_ASSERT(inp); - KASSERT(mtx_owned(&hpts->p_mtx) == 0, wait -#ifdef INVARIANTS - if (mtx_owned(&hpts->p_mtx)) { - panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); - } -#endif + KASSERT(mtx_owned(&hpts->p_mtx) == 0, + ("Hpts:%p owns mtx prior-to lock line:%d", + hpts, __LINE__)); mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; } + if (seen_endpoint) { + /* + * We now have an accurate distance between + * slot_pos_of_endpoint <-> orig_exit_slot + * to tell us how late we were, orig_exit_slot + * is where we calculated the end of our cycle to + * be when we first entered. + */ + completed_measure = 1; + } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; - hpts->p_runningtick++; - if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) { - hpts->p_runningtick = 0; + hpts->p_runningslot++; + if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) { + hpts->p_runningslot = 0; } } no_one: @@ -1676,16 +1732,13 @@ * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs).
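+ * (When we are called from the syscall/LRO path, from_callout is 0, + * so we deliberately do not loop again below; the backstop callout + * picks up anything left behind.)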
*/ -#ifdef INVARIANTS - if (TAILQ_EMPTY(&hpts->p_input) && - (hpts->p_on_inqueue_cnt != 0)) { - panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); - } -#endif + KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) || + ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))), + ("%s hpts:%p in_hpts cnt:%d queue state mismatch", + __FUNCTION__, hpts, hpts->p_on_inqueue_cnt)); hpts->p_prev_slot = hpts->p_cur_slot; hpts->p_lasttick = hpts->p_curtick; - if (loop_cnt > max_pacer_loops) { + if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) { /* * Something is serious slow we have * looped through processing the wheel * and by the time we cleared the * needs to run max_pacer_loops time * so we have a loss. We need to * have the hpts go forward and pick * up from where it is currently to * get back on track. The down-side is * that TCB's will be late being * correct. When it next awakens * it will find itself further behind. */ - counter_u64_add(hpts_hopelessly_behind, 1); + if (from_callout) + counter_u64_add(hpts_hopelessly_behind, 1); goto no_run; } hpts->p_curtick = tcp_gethptstick(&tv); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if (seen_endpoint == 0) { + /* We saw no endpoint but we may be looping */ + orig_exit_slot = hpts->p_cur_slot; + } if ((wrap_loop_cnt < 2) && (hpts->p_lasttick != hpts->p_curtick)) { counter_u64_add(hpts_loops, 1); @@ -1712,6 +1770,7 @@ goto again; } no_run: + cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv); /* * Set flag to tell that we are done for * any slot input that happens during @@ -1725,76 +1784,58 @@ if (!TAILQ_EMPTY(&hpts->p_input)) { tcp_input_data(hpts, &tv); /* - * Now did we spend too long running - * input and need to run more ticks? + * Now did we spend too long running input and need to run more ticks? + * Note that if wrap_loop_cnt < 2 then we should have the conditions + * in the KASSERTs true. But if the wheel is behind, i.e. wrap_loop_cnt + * is 2 or more, then the conditions most likely are *not* true. Also, + * if we are not called from the callout, we don't run the wheel multiple + * times, so the slots may not align either. */ - KASSERT(hpts->p_prev_slot == hpts->p_cur_slot, + KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) || + (wrap_loop_cnt >= 2) || (from_callout == 0)), ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, hpts->p_prev_slot, hpts->p_cur_slot)); - KASSERT(hpts->p_lasttick == hpts->p_curtick, + KASSERT(((hpts->p_lasttick == hpts->p_curtick) + || (wrap_loop_cnt >= 2) || (from_callout == 0)), ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, hpts->p_lasttick, hpts->p_curtick)); - hpts->p_curtick = tcp_gethptstick(&tv); - if (hpts->p_lasttick != hpts->p_curtick) { + if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) { + hpts->p_curtick = tcp_gethptstick(&tv); counter_u64_add(hpts_loops, 1); hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } } - { - uint32_t t = 0, i, fnd = 0; - - if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { - /* - * Find next slot that is occupied and use that to - * be the sleep time.
- */ - for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { - if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { - fnd = 1; - break; - } - t = (t + 1) % NUM_OF_HPTSI_SLOTS; - } - if (fnd) { - hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); - } else { -#ifdef INVARIANTS - panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt); -#endif - counter_u64_add(back_tosleep, 1); - hpts->p_on_queue_cnt = 0; - goto non_found; - } - } else if (wrap_loop_cnt >= 2) { - /* Special case handling */ - hpts->p_hpts_sleep_time = tcp_min_hptsi_time; - } else { - /* No one on the wheel sleep for all but 400 slots or sleep max */ - non_found: - hpts->p_hpts_sleep_time = hpts_sleep_max; - } + if (from_callout){ + tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt); } + if (seen_endpoint) + return(hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot)); + else + return (0); } void __tcp_set_hpts(struct inpcb *inp, int32_t line) { struct tcp_hpts_entry *hpts; + int failed; INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); if ((inp->inp_in_hpts == 0) && (inp->inp_hpts_cpu_set == 0)) { - inp->inp_hpts_cpu = hpts_cpuid(inp); - inp->inp_hpts_cpu_set = 1; + inp->inp_hpts_cpu = hpts_cpuid(inp, &failed); + if (failed == 0) + inp->inp_hpts_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); hpts = tcp_input_lock(inp); if ((inp->inp_input_cpu_set == 0) && (inp->inp_in_input == 0)) { - inp->inp_input_cpu = hpts_cpuid(inp); - inp->inp_input_cpu_set = 1; + inp->inp_input_cpu = hpts_cpuid(inp, &failed); + if (failed == 0) + inp->inp_input_cpu_set = 1; } mtx_unlock(&hpts->p_mtx); } @@ -1804,6 +1845,127 @@ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by); } +static void +__tcp_run_hpts(struct tcp_hpts_entry *hpts) +{ + int ticks_ran; + + if (hpts->p_hpts_active) { + /* Already active */ + return; + } + if (mtx_trylock(&hpts->p_mtx) == 0) { + /* Someone else got the lock */ + return; + } + if (hpts->p_hpts_active) + goto out_with_mtx; + hpts->syscall_cnt++; + counter_u64_add(hpts_direct_call, 1); + hpts->p_hpts_active = 1; + ticks_ran = tcp_hptsi(hpts, 0); + /* We may want to adjust the sleep values here */ + if (hpts->p_on_queue_cnt >= conn_cnt_thresh) { + if (ticks_ran > ticks_indicate_less_sleep) { + struct timeval tv; + sbintime_t sb; + int cpu; + + hpts->p_mysleep.tv_usec /= 2; + if (hpts->p_mysleep.tv_usec < dynamic_min_sleep) + hpts->p_mysleep.tv_usec = dynamic_min_sleep; + /* Reschedule with new to value */ + tcp_hpts_set_max_sleep(hpts, 0); + tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT; + /* Validate its in the right ranges */ + if (tv.tv_usec < hpts->p_mysleep.tv_usec) { + hpts->overidden_sleep = tv.tv_usec; + tv.tv_usec = hpts->p_mysleep.tv_usec; + } else if (tv.tv_usec > dynamic_max_sleep) { + /* Lets not let sleep get above this value */ + hpts->overidden_sleep = tv.tv_usec; + tv.tv_usec = dynamic_max_sleep; + } + /* + * In this mode the timer is a backstop to + * all the userret/lro_flushes so we use + * the dynamic value and set the on_min_sleep + * flag so we will not be awoken. + */ + sb = tvtosbt(tv); + cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? 
@@ -1804,6 +1845,127 @@
 	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
 }
 
+static void
+__tcp_run_hpts(struct tcp_hpts_entry *hpts)
+{
+	int ticks_ran;
+
+	if (hpts->p_hpts_active) {
+		/* Already active */
+		return;
+	}
+	if (mtx_trylock(&hpts->p_mtx) == 0) {
+		/* Someone else got the lock */
+		return;
+	}
+	if (hpts->p_hpts_active)
+		goto out_with_mtx;
+	hpts->syscall_cnt++;
+	counter_u64_add(hpts_direct_call, 1);
+	hpts->p_hpts_active = 1;
+	ticks_ran = tcp_hptsi(hpts, 0);
+	/* We may want to adjust the sleep values here */
+	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+		if (ticks_ran > ticks_indicate_less_sleep) {
+			struct timeval tv;
+			sbintime_t sb;
+			int cpu;
+
+			hpts->p_mysleep.tv_usec /= 2;
+			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+				hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+			/* Reschedule with new to value */
+			tcp_hpts_set_max_sleep(hpts, 0);
+			tv.tv_sec = 0;
+			tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+			/* Validate it's in the right ranges */
+			if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+				hpts->overidden_sleep = tv.tv_usec;
+				tv.tv_usec = hpts->p_mysleep.tv_usec;
+			} else if (tv.tv_usec > dynamic_max_sleep) {
+				/* Let's not let sleep get above this value */
+				hpts->overidden_sleep = tv.tv_usec;
+				tv.tv_usec = dynamic_max_sleep;
+			}
+			/*
+			 * In this mode the timer is a backstop to
+			 * all the userret/lro_flushes so we use
+			 * the dynamic value and set the on_min_sleep
+			 * flag so we will not be awoken.
+			 */
+			sb = tvtosbt(tv);
+			cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+			/* Store off to make visible the actual sleep time */
+			hpts->sleeping = tv.tv_usec;
+			callout_reset_sbt_on(&hpts->co, sb, 0,
+			    hpts_timeout_swi, hpts, cpu,
+			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+		} else if (ticks_ran < ticks_indicate_more_sleep) {
+			/* For the further sleep, don't reschedule hpts */
+			hpts->p_mysleep.tv_usec *= 2;
+			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+				hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+		}
+		hpts->p_on_min_sleep = 1;
+	}
+	hpts->p_hpts_active = 0;
+out_with_mtx:
+	HPTS_MTX_ASSERT(hpts);
+	mtx_unlock(&hpts->p_mtx);
+}
+
+static struct tcp_hpts_entry *
+tcp_choose_hpts_to_run(void)
+{
+	int i, oldest_idx;
+	uint32_t cts, time_since_ran, calc;
+
+	if ((hpts_uses_oldest == 0) ||
+	    ((hpts_uses_oldest > 1) &&
+	     (tcp_pace.rp_ent[(tcp_pace.rp_num_hptss - 1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
+		/*
+		 * We have either disabled the feature (0), or
+		 * we have crossed over the oldest threshold on the
+		 * last hpts. We use the last one for simplification
+		 * since we don't want to use the first one (it may
+		 * have starting connections that have not settled
+		 * on the cpu yet).
+		 */
+		return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+	}
+	/* Let's find the oldest hpts to attempt to run */
+	cts = tcp_get_usecs(NULL);
+	time_since_ran = 0;
+	oldest_idx = -1;
+	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+		if (TSTMP_GT(cts, cts_last_ran[i]))
+			calc = cts - cts_last_ran[i];
+		else
+			calc = 0;
+		if (calc > time_since_ran) {
+			oldest_idx = i;
+			time_since_ran = calc;
+		}
+	}
+	if (oldest_idx >= 0)
+		return (tcp_pace.rp_ent[oldest_idx]);
+	else
+		return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+}
+
+void
+tcp_run_hpts(void)
+{
+	struct tcp_hpts_entry *hpts;
+	struct epoch_tracker et;
+
+	NET_EPOCH_ENTER(et);
+	hpts = tcp_choose_hpts_to_run();
+	__tcp_run_hpts(hpts);
+	NET_EPOCH_EXIT(et);
+}
+
 static void
 tcp_hpts_thread(void *ctx)
 {
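The fallback scan in tcp_choose_hpts_to_run() above stays correct across 32-bit timestamp wrap because TSTMP_GT compares via signed subtraction. A standalone sketch of the same oldest-first selection, runnable in userland; TS_GT() and oldest_index() are illustrative names, not the kernel symbols:

    #include <stdint.h>
    #include <stdio.h>

    /* Wrap-safe "a is after b" on 32-bit usec stamps (cf. TSTMP_GT). */
    #define TS_GT(a, b) ((int32_t)((a) - (b)) > 0)

    /* Return the index whose last-ran stamp is furthest in the past. */
    static int
    oldest_index(uint32_t now, const uint32_t *last_ran, int n)
    {
        uint32_t age, oldest_age = 0;
        int i, oldest = -1;

        for (i = 0; i < n; i++) {
            age = TS_GT(now, last_ran[i]) ? now - last_ran[i] : 0;
            if (age > oldest_age) {
                oldest = i;
                oldest_age = age;
            }
        }
        return (oldest); /* -1 means no entry ran in the past */
    }

    int
    main(void)
    {
        uint32_t last[3] = { 1000, 400, 900 };

        printf("%d\n", oldest_index(1100, last, 3)); /* prints 1 */
        return (0);
    }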
@@ -1811,51 +1973,142 @@
 	struct epoch_tracker et;
 	struct timeval tv;
 	sbintime_t sb;
+	int cpu, ticks_ran;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
 	if (hpts->p_direct_wake) {
-		/* Signaled by input */
+		/* Signaled by input or output with low occupancy count. */
 		callout_stop(&hpts->co);
+		counter_u64_add(hpts_direct_awakening, 1);
 	} else {
-		/* Timed out */
+		/* Timed out, the normal case. */
+		counter_u64_add(hpts_wake_timeout, 1);
 		if (callout_pending(&hpts->co) ||
 		    !callout_active(&hpts->co)) {
 			mtx_unlock(&hpts->p_mtx);
 			return;
 		}
-		callout_deactivate(&hpts->co);
 	}
+	callout_deactivate(&hpts->co);
 	hpts->p_hpts_wake_scheduled = 0;
-	hpts->p_hpts_active = 1;
 	NET_EPOCH_ENTER(et);
-	tcp_hptsi(hpts);
-	NET_EPOCH_EXIT(et);
-	HPTS_MTX_ASSERT(hpts);
+	if (hpts->p_hpts_active) {
+		/*
+		 * We are active already. This means that a syscall
+		 * trap or LRO is running on behalf of hpts. In that case
+		 * we need to double our timeout since there seems to be
+		 * enough activity in the system that we don't need to
+		 * run as often (if we were not directly woken).
+		 */
+		if (hpts->p_direct_wake == 0) {
+			counter_u64_add(hpts_back_tosleep, 1);
+			tv.tv_sec = 0;
+			if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+				hpts->p_mysleep.tv_usec *= 2;
+				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+					hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+				tv.tv_usec = hpts->p_mysleep.tv_usec;
+				hpts->p_on_min_sleep = 1;
+			} else {
+				/*
+				 * Here we have low count on the wheel, but
+				 * somehow we still collided with one of the
+				 * connections. Let's go back to sleep for a
+				 * min sleep time, but clear the flag so we
+				 * can be awoken by insert.
+				 */
+				hpts->p_on_min_sleep = 0;
+				tv.tv_usec = tcp_min_hptsi_time;
+			}
+		} else {
+			/*
+			 * Directly woken most likely to reset the
+			 * callout time.
+			 */
+			tv.tv_sec = 0;
+			tv.tv_usec = hpts->p_mysleep.tv_usec;
+		}
+		goto back_to_sleep;
+	}
+	hpts->sleeping = 0;
+	hpts->p_hpts_active = 1;
+	ticks_ran = tcp_hptsi(hpts, 1);
 	tv.tv_sec = 0;
-	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
-	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
-		hpts->overidden_sleep = tv.tv_usec;
-		tv.tv_usec = tcp_min_hptsi_time;
+	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+	if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+		if (hpts->p_direct_wake == 0) {
+			/*
+			 * Only adjust sleep time if we were
+			 * called from the callout, i.e. direct_wake == 0.
+			 */
+			if (ticks_ran < ticks_indicate_more_sleep) {
+				hpts->p_mysleep.tv_usec *= 2;
+				if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+					hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+			} else if (ticks_ran > ticks_indicate_less_sleep) {
+				hpts->p_mysleep.tv_usec /= 2;
+				if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+					hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+			}
+		}
+		if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+			hpts->overidden_sleep = tv.tv_usec;
+			tv.tv_usec = hpts->p_mysleep.tv_usec;
+		} else if (tv.tv_usec > dynamic_max_sleep) {
+			/* Let's not let sleep get above this value */
+			hpts->overidden_sleep = tv.tv_usec;
+			tv.tv_usec = dynamic_max_sleep;
+		}
+		/*
+		 * In this mode the timer is a backstop to
+		 * all the userret/lro_flushes so we use
+		 * the dynamic value and set the on_min_sleep
+		 * flag so we will not be awoken.
+		 */
 		hpts->p_on_min_sleep = 1;
-	} else {
-		/* Clear the min sleep flag */
-		hpts->overidden_sleep = 0;
+	} else if (hpts->p_on_queue_cnt == 0) {
+		/*
+		 * No one on the wheel, please wake us up
+		 * if you insert on the wheel.
+		 */
 		hpts->p_on_min_sleep = 0;
-	}
-	hpts->p_hpts_active = 0;
-	sb = tvtosbt(tv);
-	if (tcp_hpts_callout_skip_swi == 0) {
-		callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_swi, hpts, hpts->p_cpu,
-		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+		hpts->overidden_sleep = 0;
 	} else {
-		callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_dir, hpts,
-		    hpts->p_cpu,
-		    C_PREL(tcp_hpts_precision));
+		/*
+		 * We hit here when we have a low number of
+		 * clients on the wheel (our else clause).
+		 * We may need to go on min sleep, if we set
+		 * the flag we will not be awoken if someone
+		 * is inserted ahead of us. Clearing the flag
+		 * means we can be awoken. This is "old mode"
+		 * where the timer is what runs hpts mainly.
+		 */
+		if (tv.tv_usec < tcp_min_hptsi_time) {
+			/*
+			 * Yes on min sleep, which means
+			 * we cannot be awoken.
+			 */
+			hpts->overidden_sleep = tv.tv_usec;
+			tv.tv_usec = tcp_min_hptsi_time;
+			hpts->p_on_min_sleep = 1;
+		} else {
+			/* Clear the min sleep flag */
+			hpts->overidden_sleep = 0;
+			hpts->p_on_min_sleep = 0;
+		}
 	}
+	HPTS_MTX_ASSERT(hpts);
+	hpts->p_hpts_active = 0;
+back_to_sleep:
 	hpts->p_direct_wake = 0;
+	sb = tvtosbt(tv);
+	cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+	/* Store off to make visible the actual sleep time */
+	hpts->sleeping = tv.tv_usec;
+	callout_reset_sbt_on(&hpts->co, sb, 0,
+	    hpts_timeout_swi, hpts, cpu,
+	    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+	NET_EPOCH_EXIT(et);
 	mtx_unlock(&hpts->p_mtx);
 }
@@ -1873,7 +2126,7 @@
 	cpuset_t cs;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
-	int count, domain;
+	int count, domain, cpu;
 
 	tcp_pace.rp_proc = NULL;
 	tcp_pace.rp_num_hptss = ncpus;
@@ -1882,8 +2135,18 @@
 	back_tosleep = counter_u64_alloc(M_WAITOK);
 	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
 	wheel_wrap = counter_u64_alloc(M_WAITOK);
+	hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+	hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+	hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+	hpts_direct_call = counter_u64_alloc(M_WAITOK);
+	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+	cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
+	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
+	cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
@@ -1933,19 +2196,41 @@
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "runtick", CTLFLAG_RD,
-		    &hpts->p_runningtick, 0,
+		    &hpts->p_runningslot, 0,
 		    "What the running pacers current slot is");
 		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
 		    SYSCTL_CHILDREN(hpts->hpts_root),
 		    OID_AUTO, "curtick", CTLFLAG_RD,
 		    &hpts->p_curtick, 0,
 		    "What the running pacers last tick mapped to the wheel was");
+		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "lastran", CTLFLAG_RD,
+		    &cts_last_ran[i], 0,
+		    "The last usec tick that this hpts ran");
+		SYSCTL_ADD_U64(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+		    &hpts->p_mysleep.tv_usec, 0,
+		    "What the running pacer is using for p_mysleep.tv_usec");
+		SYSCTL_ADD_U64(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "now_sleeping", CTLFLAG_RD,
+		    &hpts->sleeping, 0,
+		    "What the running pacer is actually sleeping for");
+		SYSCTL_ADD_U64(&hpts->hpts_ctx,
+		    SYSCTL_CHILDREN(hpts->hpts_root),
+		    OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+		    &hpts->syscall_cnt, 0,
+		    "How many times we had syscalls on this hpts");
+
 		hpts->p_hpts_sleep_time = hpts_sleep_max;
 		hpts->p_num = i;
 		hpts->p_curtick = tcp_gethptstick(&tv);
+		cts_last_ran[i] = tcp_tv_to_usectick(&tv);
 		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
 		hpts->p_cpu = 0xffff;
-		hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
+		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
 	}
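Each pacer's starting position above comes from turning wall time into 10-usec hpts ticks and then mapping the tick count onto the wheel. A userland sketch of that mapping, assuming a simple modulo placement for tick_to_wheel(); WHEEL_SLOTS is an illustrative size, not the kernel's NUM_OF_HPTSI_SLOTS:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/time.h>

    #define WHEEL_SLOTS 2048 /* illustrative wheel size */

    /* One hpts tick is 10 us, so each second contributes 100000 ticks. */
    static uint32_t
    tv_to_ticks(const struct timeval *tv)
    {
        return ((uint32_t)(tv->tv_sec * 100000 + tv->tv_usec / 10));
    }

    /* Assumed modulo mapping of a tick count onto a wheel slot. */
    static uint32_t
    tick_to_slot(uint32_t tick)
    {
        return (tick % WHEEL_SLOTS);
    }

    int
    main(void)
    {
        struct timeval tv = { .tv_sec = 1, .tv_usec = 250 };
        uint32_t tick = tv_to_ticks(&tv);

        printf("tick %u -> slot %u\n", tick, tick_to_slot(tick));
        return (0);
    }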
@@ -1956,17 +2241,18 @@
 	/*
 	 * Now lets start ithreads to handle the hptss.
 	 */
-	CPU_FOREACH(i) {
+	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
-		if (error) {
-			panic("Can't add hpts:%p i:%d err:%d",
-			    hpts, i, error);
-		}
+		KASSERT(error == 0,
+		    ("Can't add hpts:%p i:%d err:%d",
+		    hpts, i, error));
 		created++;
+		hpts->p_mysleep.tv_sec = 0;
+		hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
 		if (tcp_bind_threads == 1) {
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
@@ -1983,18 +2269,13 @@
 			}
 		}
 		tv.tv_sec = 0;
-		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+		hpts->sleeping = tv.tv_usec;
 		sb = tvtosbt(tv);
-		if (tcp_hpts_callout_skip_swi == 0) {
-			callout_reset_sbt_on(&hpts->co, sb, 0,
-			    hpts_timeout_swi, hpts, hpts->p_cpu,
-			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
-		} else {
-			callout_reset_sbt_on(&hpts->co, sb, 0,
-			    hpts_timeout_dir, hpts,
-			    hpts->p_cpu,
-			    C_PREL(tcp_hpts_precision));
-		}
+		cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+		callout_reset_sbt_on(&hpts->co, sb, 0,
+		    hpts_timeout_swi, hpts, cpu,
+		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	}
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
@@ -2006,11 +2287,13 @@
 			break;
 		}
 	}
-	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n", created, bound, tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
+#ifdef INVARIANTS
+	printf("HPTS is in INVARIANT mode!!\n");
+#endif
 }
 
-SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
+SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
 
 MODULE_VERSION(tcphpts, 1);
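The initial callout arming above converts a sleep expressed in wheel slots directly into tv_usec, which is safe because the sleep ceiling keeps the product well under one second. A normalized version of the conversion for reference; slots_to_tv() and SLOT_USECS are illustrative names, with SLOT_USECS mirroring HPTS_TICKS_PER_SLOT:

    #include <stdio.h>
    #include <sys/time.h>

    #define SLOT_USECS 10 /* one wheel slot is 10 us */

    /* Convert a sleep expressed in wheel slots into a timeval. */
    static struct timeval
    slots_to_tv(unsigned int slots)
    {
        struct timeval tv;
        unsigned long us = (unsigned long)slots * SLOT_USECS;

        tv.tv_sec = us / 1000000;
        tv.tv_usec = us % 1000000;
        return (tv);
    }

    int
    main(void)
    {
        struct timeval tv = slots_to_tv(25600); /* 25600 slots = 256 ms */

        printf("%ld.%06lds\n", (long)tv.tv_sec, (long)tv.tv_usec);
        return (0);
    }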
Index: sys/netinet/tcp_lro.h
===================================================================
--- sys/netinet/tcp_lro.h
+++ sys/netinet/tcp_lro.h
@@ -56,6 +56,11 @@
 #define TSTMP_LRO	0x0100
 #define TSTMP_HDWR	0x0200
 #define HAS_TSTMP	0x0400
+/*
+ * Default number of interrupts on the same cpu in a row
+ * that will cause us to declare an "affinity" cpu.
+ */
+#define TCP_LRO_CPU_DECLARATION_THRESH 50
 
 struct inpcb;
 
@@ -162,12 +167,15 @@
 	unsigned	lro_mbuf_count;
 	unsigned	lro_mbuf_max;
 	unsigned short	lro_ackcnt_lim;	/* max # of aggregated ACKs */
+	unsigned short	lro_cpu;	/* Guess at the cpu we have affinity to */
 	unsigned	lro_length_lim;	/* max len of aggregated data */
 
-	u_long		lro_hashsz;
+	uint32_t	lro_last_cpu;
+	uint32_t	lro_cnt_of_same_cpu;
 	struct lro_head	*lro_hash;
 	struct lro_head	lro_active;
 	struct lro_head	lro_free;
+	uint8_t		lro_cpu_is_set;	/* Flag to say it's ok to set the CPU on the inp */
 };
 
 struct tcp_ackent {
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -107,6 +107,11 @@
     CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
     "default number of LRO entries");
 
+static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
+    CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
+    "Number of interrupts in a row on the same CPU that will make us declare an 'affinity' cpu");
+
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
     &tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
 SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
@@ -631,12 +636,13 @@
 		log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
 	}
 	log.u_bbr.inflight = th_seq;
+	log.u_bbr.delivered = th_ack;
 	log.u_bbr.timeStamp = cts;
 	log.u_bbr.epoch = le->next_seq;
-	log.u_bbr.delivered = th_ack;
 	log.u_bbr.lt_epoch = le->ack_seq;
 	log.u_bbr.pacing_gain = th_win;
 	log.u_bbr.cwnd_gain = le->window;
+	log.u_bbr.lost = curcpu;
 	log.u_bbr.cur_del_rate = (uintptr_t)m;
 	log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
 	bintime2timeval(&lc->lro_last_queue_time, &btv);
@@ -1273,7 +1279,10 @@
 		INP_WUNLOCK(inp);
 		return (TCP_LRO_CANNOT);
 	}
-
+	if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
+		inp->inp_irq_cpu = lc->lro_last_cpu;
+		inp->inp_irq_cpu_set = 1;
+	}
 	/* Check if the transport doesn't support the needed optimizations. */
 	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
 		INP_WUNLOCK(inp);
@@ -1445,7 +1454,17 @@
 	/* check if no mbufs to flush */
 	if (lc->lro_mbuf_count == 0)
 		goto done;
-
+	if (lc->lro_cpu_is_set == 0) {
+		if (lc->lro_last_cpu == curcpu) {
+			lc->lro_cnt_of_same_cpu++;
+			/* Have we reached the threshold to declare a cpu? */
+			if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
+				lc->lro_cpu_is_set = 1;
+		} else {
+			lc->lro_last_cpu = curcpu;
+			lc->lro_cnt_of_same_cpu = 0;
+		}
+	}
 	CURVNET_SET(lc->ifp->if_vnet);
 
 	/* get current time */
@@ -1486,6 +1505,9 @@
 	/* flush active streams */
 	tcp_lro_rx_done(lc);
 
+#ifdef TCPHPTS
+	tcp_run_hpts();
+#endif
 	lc->lro_mbuf_count = 0;
 }
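The flush-side logic above declares an affinity CPU only after a run of consecutive flushes lands on the same CPU; any bounce resets the count. The same state machine in standalone form; struct affinity and note_flush_cpu() are illustrative names, with CPU_DECL_THRESH mirroring TCP_LRO_CPU_DECLARATION_THRESH:

    #include <stdint.h>
    #include <stdio.h>

    #define CPU_DECL_THRESH 50 /* mirrors TCP_LRO_CPU_DECLARATION_THRESH */

    struct affinity {
        uint32_t last_cpu;     /* cf. lro_last_cpu */
        uint32_t same_cpu_cnt; /* cf. lro_cnt_of_same_cpu */
        uint8_t  cpu_is_set;   /* cf. lro_cpu_is_set */
    };

    /* Run at each flush with the current CPU; a bounce resets the count. */
    static void
    note_flush_cpu(struct affinity *a, uint32_t cpu)
    {
        if (a->cpu_is_set)
            return;
        if (a->last_cpu == cpu) {
            a->same_cpu_cnt++;
            if (a->same_cpu_cnt > CPU_DECL_THRESH)
                a->cpu_is_set = 1;
        } else {
            a->last_cpu = cpu;
            a->same_cpu_cnt = 0;
        }
    }

    int
    main(void)
    {
        struct affinity a = { 0, 0, 0 };

        for (int i = 0; i < 60; i++)
            note_flush_cpu(&a, 3);
        printf("cpu %u set %u\n", a.last_cpu, a.cpu_is_set);
        return (0);
    }

Once declared, the guess is copied onto the inpcb (inp_irq_cpu) so hpts can schedule the connection near its interrupt CPU.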
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -2429,10 +2429,10 @@
 	log.u_bbr.pkts_out = diag->co_ret;
 	log.u_bbr.applimited = diag->hpts_sleep_time;
 	log.u_bbr.delivered = diag->p_prev_slot;
-	log.u_bbr.inflight = diag->p_runningtick;
-	log.u_bbr.bw_inuse = diag->wheel_tick;
+	log.u_bbr.inflight = diag->p_runningslot;
+	log.u_bbr.bw_inuse = diag->wheel_slot;
 	log.u_bbr.rttProp = diag->wheel_cts;
-	log.u_bbr.delRate = diag->maxticks;
+	log.u_bbr.delRate = diag->maxslots;
 	log.u_bbr.cur_del_rate = diag->p_curtick;
 	log.u_bbr.cur_del_rate <<= 32;
 	log.u_bbr.cur_del_rate |= diag->p_lasttick;
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -5609,11 +5609,11 @@
 	log.u_bbr.pkts_out = diag->co_ret;
 	log.u_bbr.applimited = diag->hpts_sleep_time;
 	log.u_bbr.delivered = diag->p_prev_slot;
-	log.u_bbr.inflight = diag->p_runningtick;
-	log.u_bbr.bw_inuse = diag->wheel_tick;
+	log.u_bbr.inflight = diag->p_runningslot;
+	log.u_bbr.bw_inuse = diag->wheel_slot;
 	log.u_bbr.rttProp = diag->wheel_cts;
 	log.u_bbr.timeStamp = cts;
-	log.u_bbr.delRate = diag->maxticks;
+	log.u_bbr.delRate = diag->maxslots;
 	log.u_bbr.cur_del_rate = diag->p_curtick;
 	log.u_bbr.cur_del_rate <<= 32;
 	log.u_bbr.cur_del_rate |= diag->p_lasttick;
@@ -5707,22 +5707,22 @@
 			 * on the clock. We always have a min
 			 * 10 slots (10 x 10 i.e. 100 usecs).
 			 */
-			if (slot <= HPTS_TICKS_PER_USEC) {
+			if (slot <= HPTS_TICKS_PER_SLOT) {
 				/* We gain delay */
-				rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
-				slot = HPTS_TICKS_PER_USEC;
+				rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
+				slot = HPTS_TICKS_PER_SLOT;
 			} else {
 				/* We take off some */
-				rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
-				slot = HPTS_TICKS_PER_USEC;
+				rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
+				slot = HPTS_TICKS_PER_SLOT;
 			}
 		} else {
 			slot -= rack->r_ctl.rc_agg_delayed;
 			rack->r_ctl.rc_agg_delayed = 0;
 			/* Make sure we have 100 useconds at minimum */
-			if (slot < HPTS_TICKS_PER_USEC) {
-				rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
-				slot = HPTS_TICKS_PER_USEC;
+			if (slot < HPTS_TICKS_PER_SLOT) {
+				rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
+				slot = HPTS_TICKS_PER_SLOT;
 			}
 			if (rack->r_ctl.rc_agg_delayed == 0)
 				rack->r_late = 0;
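A standalone illustration of the banked-delay arithmetic in the rack.c hunk above (the second branch, where accumulated lateness is paid down out of the requested slot and whatever the slot floor forces the connection to keep is banked again); pay_down() and MIN_SLOT are illustrative names, with MIN_SLOT mirroring HPTS_TICKS_PER_SLOT:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN_SLOT 10 /* one wheel slot, in usecs */

    /* Subtract banked delay from a requested slot, enforcing the floor. */
    static uint32_t
    pay_down(uint32_t slot, uint32_t *agg_delayed)
    {
        slot -= *agg_delayed;
        *agg_delayed = 0;
        if (slot < MIN_SLOT) {
            /* Bank what the floor forces us to keep. */
            *agg_delayed = MIN_SLOT - slot;
            slot = MIN_SLOT;
        }
        return (slot);
    }

    int
    main(void)
    {
        uint32_t agg = 4, slot;

        slot = pay_down(12, &agg); /* 12 - 4 = 8, floored to 10; agg = 2 */
        printf("slot %u agg %u\n", slot, agg);
        return (0);
    }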