diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -90,9 +90,8 @@
  * Note do not set this to 0.
  */
 #define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
-#define DYNAMIC_MAX_SLEEP 100000	/* 100ms */
-/* No of connections when wee start aligning to the cpu from syscalls */
-#define OLDEST_THRESHOLD 1200
+#define DYNAMIC_MAX_SLEEP 5000	/* 5ms */
+
 /* Thresholds for raising/lowering sleep */
 #define TICKS_INDICATE_MORE_SLEEP 100		/* This would be 1ms */
 #define TICKS_INDICATE_LESS_SLEEP 1000		/* This would indicate 10ms */
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -229,8 +229,10 @@
 } __aligned(CACHE_LINE_SIZE);
 
 static struct tcp_hptsi {
+	struct cpu_group **grps;
 	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
 	uint32_t *cts_last_ran;
+	uint32_t grp_cnt;
 	uint32_t rp_num_hptss;	/* Number of hpts threads */
 } tcp_pace;
 
@@ -243,8 +245,6 @@
 static int tcp_use_irq_cpu = 0;
 static uint32_t *cts_last_ran;
 static int hpts_does_tp_logging = 0;
-static int hpts_use_assigned_cpu = 1;
-static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
 
 static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
 static void tcp_hpts_thread(void *ctx);
@@ -256,7 +256,6 @@
 static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
-
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP Hpts controls");
 SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -355,12 +354,6 @@
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
     &hpts_does_tp_logging, 0,
     "Do we add to any tp that has logging on pacer logs");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
-    &hpts_use_assigned_cpu, 0,
-    "Do we start any hpts timer on the assigned cpu?");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
-    &hpts_uses_oldest, OLDEST_THRESHOLD,
-    "Do syscalls look for the hpts that has been the longest since running (or just use cpu no if 0)?");
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
     &dynamic_min_sleep, 250,
     "What is the dynamic minsleep value?");
@@ -368,10 +361,6 @@
     &dynamic_max_sleep, 5000,
     "What is the dynamic maxsleep value?");
-
-
-
-
 static int32_t max_pacer_loops = 10;
 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
     &max_pacer_loops, 10,
@@ -390,8 +379,8 @@
 	new = hpts_sleep_max;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
-		if ((new < dynamic_min_sleep) ||
-		    (new > HPTS_MAX_SLEEP_ALLOWED))
+		if ((new < (dynamic_min_sleep/HPTS_TICKS_PER_SLOT)) ||
+		    (new > HPTS_MAX_SLEEP_ALLOWED))
 			error = EINVAL;
 		else
 			hpts_sleep_max = new;
@@ -417,13 +406,13 @@
 }
 
 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
-    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+    CTLTYPE_UINT | CTLFLAG_RW,
     &hpts_sleep_max, 0,
     &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
-    "Maximum time hpts will sleep");
+    "Maximum time hpts will sleep in slots");
 
 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
-    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
+    CTLTYPE_UINT | CTLFLAG_RW,
     &tcp_min_hptsi_time, 0,
     &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
     "The minimum time the hpts must sleep before processing more slots");
@@ -818,7 +807,6 @@
 	struct timeval tv;
 	uint32_t slot_on, wheel_cts, last_slot, need_new_to = 0;
 	int32_t wheel_slot, maxslots;
-	int cpu;
 	bool need_wakeup = false;
 
 	INP_WLOCK_ASSERT(inp);
@@ -975,9 +963,8 @@
 		}
 		tv.tv_usec = need_new_to;
 		sb = tvtosbt(tv);
-		cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
 		co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_swi, hpts, cpu,
+		    hpts_timeout_swi, hpts, hpts->p_cpu,
 		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 		if (diag) {
 			diag->need_new_to = need_new_to;
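
The maxsleep handler above now takes its value in wheel slots rather than microseconds, which is why the lower bound divides dynamic_min_sleep (microseconds) by HPTS_TICKS_PER_SLOT and why the sysctl description gains "in slots". A minimal userspace sketch of the check, assuming the kernel's 10 usec per slot and a placeholder upper bound:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	#define HPTS_TICKS_PER_SLOT 10		/* 10 usec per wheel slot (assumed) */
	#define HPTS_MAX_SLEEP_ALLOWED 51200	/* placeholder upper bound, in slots */

	static uint32_t dynamic_min_sleep = 250;	/* usec, default from the patch */
	static uint32_t hpts_sleep_max;

	/* Mirror of the bounds check: 'new' is in slots, dynamic_min_sleep in usec. */
	static int
	set_sleep_max(uint32_t new)
	{
		if ((new < (dynamic_min_sleep / HPTS_TICKS_PER_SLOT)) ||
		    (new > HPTS_MAX_SLEEP_ALLOWED))
			return (EINVAL);
		hpts_sleep_max = new;
		return (0);
	}

	int
	main(void)
	{
		/* 250 usec / 10 usec-per-slot = 25 slots is the smallest legal value. */
		printf("20 slots -> %d\n", set_sleep_max(20));	/* EINVAL */
		printf("25 slots -> %d\n", set_sleep_max(25));	/* 0 */
		return (0);
	}
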
@@ -1054,17 +1041,24 @@
 	 * Hash to a thread based on the flowid. If we are using numa,
 	 * then restrict the hash to the numa domain where the inp lives.
 	 */
+
+#ifdef NUMA
+	if ((vm_ndomains == 1) ||
+	    (inp->inp_numa_domain == M_NODOM)) {
+#endif
+		cpuid = inp->inp_flowid % mp_ncpus;
 #ifdef NUMA
-	if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
+	} else {
+		/* Hash into the CPUs that use that domain */
 		di = &hpts_domains[inp->inp_numa_domain];
 		cpuid = di->cpu[inp->inp_flowid % di->count];
-	} else
+	}
 #endif
-		cpuid = inp->inp_flowid % mp_ncpus;
 	counter_u64_add(cpu_uses_flowid, 1);
 	return (cpuid);
 }
 
+#ifdef not_longer_used_gleb
 static void
 tcp_drop_in_pkts(struct tcpcb *tp)
 {
@@ -1083,11 +1077,12 @@
 		n = m->m_nextpkt;
 	}
 }
+#endif
 
 static void
 tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
 {
-	uint32_t t = 0, i, fnd = 0;
+	uint32_t t = 0, i;
 
 	if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
 		/*
@@ -1096,12 +1091,11 @@
 		 */
 		for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
 			if (TAILQ_EMPTY(&hpts->p_hptss[t].head) == 0) {
-				fnd = 1;
 				break;
 			}
 			t = (t + 1) % NUM_OF_HPTSI_SLOTS;
 		}
-		KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
+		KASSERT((i != NUM_OF_HPTSI_SLOTS), ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
 		hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
 	} else {
 		/* No one on the wheel sleep for all but 400 slots or sleep max */
@@ -1554,7 +1548,6 @@
 	if (ticks_ran > ticks_indicate_less_sleep) {
 		struct timeval tv;
 		sbintime_t sb;
-		int cpu;
 
 		hpts->p_mysleep.tv_usec /= 2;
 		if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
@@ -1578,11 +1571,10 @@
 		 * flag so we will not be awoken.
 		 */
 		sb = tvtosbt(tv);
-		cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
 		/* Store off to make visible the actual sleep time */
 		hpts->sleeping = tv.tv_usec;
 		callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_swi, hpts, cpu,
+		    hpts_timeout_swi, hpts, hpts->p_cpu,
 		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	} else if (ticks_ran < ticks_indicate_more_sleep) {
 		/* For the further sleep, don't reschedule hpts */
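
With use_assigned_cpu and use_oldest gone, placement is decided by the reworked hpts_cpuid() above: when there is no usable NUMA information the flowid is hashed across all CPUs, otherwise it is hashed across only the CPUs recorded for the inp's domain. A self-contained sketch of the two cases; the domain table here is a made-up stand-in for the hpts_domains[] array the kernel builds at init time:

	#include <stdint.h>
	#include <stdio.h>

	#define MP_NCPUS 8	/* stand-in for mp_ncpus */

	/* Simplified stand-in for the per-domain CPU table (hpts_domains[]). */
	struct domain_info {
		int count;
		int cpu[MP_NCPUS];
	};

	static struct domain_info domains[2] = {
		{ .count = 4, .cpu = { 0, 1, 2, 3 } },	/* NUMA domain 0 */
		{ .count = 4, .cpu = { 4, 5, 6, 7 } },	/* NUMA domain 1 */
	};

	/* domain < 0 plays the role of M_NODOM (no NUMA information). */
	static int
	choose_cpu(uint32_t flowid, int domain)
	{
		if (domain < 0)
			return (flowid % MP_NCPUS);
		/* Hash into only the CPUs that belong to that domain. */
		return (domains[domain].cpu[flowid % domains[domain].count]);
	}

	int
	main(void)
	{
		printf("no domain: cpu %d\n", choose_cpu(0x5eed, -1));
		printf("domain 1:  cpu %d\n", choose_cpu(0x5eed, 1));	/* always 4..7 */
		return (0);
	}
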
@@ -1601,27 +1593,29 @@
 static struct tcp_hpts_entry *
 tcp_choose_hpts_to_run()
 {
-	int i, oldest_idx;
+	int i, oldest_idx, start, end;
 	uint32_t cts, time_since_ran, calc;
 
-	if ((hpts_uses_oldest == 0) ||
-	    ((hpts_uses_oldest > 1) &&
-	     (tcp_pace.rp_ent[(tcp_pace.rp_num_hptss-1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
-		/*
-		 * We have either disabled the feature (0), or
-		 * we have crossed over the oldest threshold on the
-		 * last hpts. We use the last one for simplification
-		 * since we don't want to use the first one (it may
-		 * have starting connections that have not settled
-		 * on the cpu yet).
-		 */
-		return(tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
-	}
-	/* Lets find the oldest hpts to attempt to run */
 	cts = tcp_get_usecs(NULL);
 	time_since_ran = 0;
+	/* Default is all one group */
+	start = 0;
+	end = tcp_pace.rp_num_hptss;
+	/*
+	 * If we have more than one L3 group figure out which one
+	 * this CPU is in.
+	 */
+	if (tcp_pace.grp_cnt > 1) {
+		for (i = 0; i < tcp_pace.grp_cnt; i++) {
+			if (CPU_ISSET(curcpu, &tcp_pace.grps[i]->cg_mask)) {
+				start = tcp_pace.grps[i]->cg_first;
+				end = (tcp_pace.grps[i]->cg_last + 1);
+				break;
+			}
+		}
+	}
 	oldest_idx = -1;
-	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+	for (i = start; i < end; i++) {
 		if (TSTMP_GT(cts, cts_last_ran[i]))
 			calc = cts - cts_last_ran[i];
 		else
@@ -1658,7 +1652,7 @@
 	struct epoch_tracker et;
 	struct timeval tv;
 	sbintime_t sb;
-	int cpu, ticks_ran;
+	int ticks_ran;
 
 	hpts = (struct tcp_hpts_entry *)ctx;
 	mtx_lock(&hpts->p_mtx);
@@ -1787,11 +1781,10 @@
 back_to_sleep:
 	hpts->p_direct_wake = 0;
 	sb = tvtosbt(tv);
-	cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
 	/* Store off to make visible the actual sleep time */
 	hpts->sleeping = tv.tv_usec;
 	callout_reset_sbt_on(&hpts->co, sb, 0,
-	    hpts_timeout_swi, hpts, cpu,
+	    hpts_timeout_swi, hpts, hpts->p_cpu,
 	    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	NET_EPOCH_EXIT(et);
 	mtx_unlock(&hpts->p_mtx);
@@ -1799,20 +1792,62 @@
 
 #undef timersub
 
+static int32_t
+hpts_count_level(struct cpu_group *cg)
+{
+	int32_t count_l3, i;
+
+	count_l3 = 0;
+	if (cg->cg_level == CG_SHARE_L3)
+		count_l3++;
+	/* Walk all the children looking for L3 */
+	for (i = 0; i < cg->cg_children; i++) {
+		count_l3 += hpts_count_level(&cg->cg_child[i]);
+	}
+	return (count_l3);
+}
+
+static void
+hpts_gather_grps(struct cpu_group **grps, int32_t *at, int32_t max, struct cpu_group *cg)
+{
+	int32_t idx, i;
+
+	idx = *at;
+	if (cg->cg_level == CG_SHARE_L3) {
+		grps[idx] = cg;
+		idx++;
+		if (idx == max) {
+			*at = idx;
+			return;
+		}
+	}
+	*at = idx;
+	/* Walk all the children looking for L3 */
+	for (i = 0; i < cg->cg_children; i++) {
+		hpts_gather_grps(grps, at, max, &cg->cg_child[i]);
+	}
+}
+
 static void
 tcp_init_hptsi(void *st)
 {
-	int32_t i, j, error, bound = 0, created = 0;
+	struct cpu_group *cpu_top;
+	int32_t error __diagused;
+	int32_t i, j, bound = 0, created = 0;
 	size_t sz, asz;
 	struct timeval tv;
 	sbintime_t sb;
 	struct tcp_hpts_entry *hpts;
 	struct pcpu *pc;
-	cpuset_t cs;
 	char unit[16];
 	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
-	int count, domain, cpu;
+	int count, domain;
 
+#ifdef SMP
+	cpu_top = smp_topo();
+#else
+	cpu_top = NULL;
+#endif
 	tcp_pace.rp_num_hptss = ncpus;
 	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
 	hpts_loops = counter_u64_alloc(M_WAITOK);
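
hpts_count_level() and hpts_gather_grps() above are a two-pass walk over the scheduler topology: first count the CG_SHARE_L3 nodes so the grps[] array can be sized, then recurse again collecting pointers to them, stopping once max entries are filled. A self-contained userspace sketch of the same pattern, using a simplified mock of struct cpu_group that carries only the fields the walk touches:

	#include <stdio.h>
	#include <stdlib.h>

	#define CG_SHARE_NONE	0
	#define CG_SHARE_L3	3	/* mock of the kernel's cache-level constant */

	/* Simplified stand-in for the kernel's struct cpu_group. */
	struct cpu_group {
		int cg_level;
		int cg_children;
		struct cpu_group *cg_child;
	};

	/* Pass 1: count L3 nodes anywhere in the tree. */
	static int
	count_l3(struct cpu_group *cg)
	{
		int i, n = (cg->cg_level == CG_SHARE_L3);

		for (i = 0; i < cg->cg_children; i++)
			n += count_l3(&cg->cg_child[i]);
		return (n);
	}

	/* Pass 2: record a pointer to each L3 node, up to max entries. */
	static void
	gather_l3(struct cpu_group **grps, int *at, int max, struct cpu_group *cg)
	{
		int i;

		if (cg->cg_level == CG_SHARE_L3) {
			grps[(*at)++] = cg;
			if (*at == max)
				return;
		}
		for (i = 0; i < cg->cg_children; i++)
			gather_l3(grps, at, max, &cg->cg_child[i]);
	}

	int
	main(void)
	{
		/* A root with two L3 children, e.g. a two-socket machine. */
		struct cpu_group l3s[2] = {
			{ CG_SHARE_L3, 0, NULL }, { CG_SHARE_L3, 0, NULL },
		};
		struct cpu_group root = { CG_SHARE_NONE, 2, l3s };
		struct cpu_group **grps;
		int at = 0, cnt = count_l3(&root);

		grps = calloc(cnt, sizeof(*grps));
		gather_l3(grps, &at, cnt, &root);
		printf("found %d L3 group(s)\n", at);	/* prints 2 */
		free(grps);
		return (0);
	}
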
@@ -1826,17 +1861,46 @@
 	cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
 	cpu_uses_random = counter_u64_alloc(M_WAITOK);
-
 	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
 	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
 	sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
 	cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
+	tcp_pace.grp_cnt = 0;
+	if (cpu_top == NULL) {
+		tcp_pace.grp_cnt = 1;
+	} else {
+		/* Find out how many cache level 3 domains we have */
+		count = 0;
+		tcp_pace.grp_cnt = hpts_count_level(cpu_top);
+		if (tcp_pace.grp_cnt == 0) {
+			tcp_pace.grp_cnt = 1;
+		}
+		sz = (tcp_pace.grp_cnt * sizeof(struct cpu_group *));
+		tcp_pace.grps = malloc(sz, M_TCPHPTS, M_WAITOK);
+		/* Now populate the groups */
+		if (tcp_pace.grp_cnt == 1) {
+			/*
+			 * All we need is the top level; all CPUs are in
+			 * the same cache, so when we use grps[0]->cg_mask
+			 * with the cg_first <-> cg_last range it will
+			 * include every CPU. The level here is probably
+			 * zero, which is ok.
+			 */
+			tcp_pace.grps[0] = cpu_top;
+		} else {
+			/*
+			 * Here we must find all the level three cache domains
+			 * and set up our pointers to them.
+			 */
+			count = 0;
+			hpts_gather_grps(tcp_pace.grps, &count, tcp_pace.grp_cnt, cpu_top);
+		}
+	}
 	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
 		    M_TCPHPTS, M_WAITOK | M_ZERO);
-		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
-		    M_TCPHPTS, M_WAITOK);
+		tcp_pace.rp_ent[i]->p_hptss = malloc(asz, M_TCPHPTS, M_WAITOK);
 		hpts = tcp_pace.rp_ent[i];
 		/*
 		 * Init all the hpts structures that are not specifically
@@ -1913,7 +1977,6 @@
 		hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
 		callout_init(&hpts->co, 1);
 	}
-
 	/* Don't try to bind to NUMA domains if we don't have any */
 	if (vm_ndomains == 1 && tcp_bind_threads == 2)
 		tcp_bind_threads = 0;
@@ -1924,6 +1987,7 @@
 	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
 		hpts = tcp_pace.rp_ent[i];
 		hpts->p_cpu = i;
+
 		error = swi_add(&hpts->ie, "hpts",
 		    tcp_hpts_thread, (void *)hpts,
 		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
@@ -1937,24 +2001,28 @@
 			if (intr_event_bind(hpts->ie, i) == 0)
 				bound++;
 		} else if (tcp_bind_threads == 2) {
-			pc = pcpu_find(i);
-			domain = pc->pc_domain;
-			CPU_COPY(&cpuset_domain[domain], &cs);
-			if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
-			    == 0) {
-				bound++;
-				count = hpts_domains[domain].count;
-				hpts_domains[domain].cpu[count] = i;
-				hpts_domains[domain].count++;
+			/* Find the group for this CPU (i) and bind into it */
+			for (j = 0; j < tcp_pace.grp_cnt; j++) {
+				if (CPU_ISSET(i, &tcp_pace.grps[j]->cg_mask)) {
+					if (intr_event_bind_ithread_cpuset(hpts->ie,
+					    &tcp_pace.grps[j]->cg_mask) == 0) {
+						bound++;
+						pc = pcpu_find(i);
+						domain = pc->pc_domain;
+						count = hpts_domains[domain].count;
+						hpts_domains[domain].cpu[count] = i;
+						hpts_domains[domain].count++;
+						break;
+					}
+				}
 			}
 		}
 		tv.tv_sec = 0;
 		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
 		hpts->sleeping = tv.tv_usec;
 		sb = tvtosbt(tv);
-		cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
 		callout_reset_sbt_on(&hpts->co, sb, 0,
-		    hpts_timeout_swi, hpts, cpu,
+		    hpts_timeout_swi, hpts, hpts->p_cpu,
 		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
 	}
 	/*