diff --git a/sys/compat/linuxkpi/common/src/linux_current.c b/sys/compat/linuxkpi/common/src/linux_current.c
--- a/sys/compat/linuxkpi/common/src/linux_current.c
+++ b/sys/compat/linuxkpi/common/src/linux_current.c
@@ -45,7 +45,6 @@
 
 static eventhandler_tag linuxkpi_thread_dtor_tag;
 
-static atomic_t linux_current_allocs;
 static uma_zone_t linux_current_zone;
 static uma_zone_t linux_mm_zone;
 
@@ -147,10 +146,6 @@
 	/* free mm_struct pointer, if any */
 	uma_zfree(linux_mm_zone, mm);
 
-	/* keep track of number of allocations */
-	if (atomic_add_return(1, &linux_current_allocs) == INT_MAX)
-		panic("linux_alloc_current: Refcount too high!");
-
 	return (0);
 }
 
@@ -178,10 +173,6 @@
 {
 	mmput(ts->mm);
 	uma_zfree(linux_current_zone, ts);
-
-	/* keep track of number of allocations */
-	if (atomic_sub_return(1, &linux_current_allocs) < 0)
-		panic("linux_free_current: Negative refcount!");
 }
 
 static void
@@ -306,9 +297,9 @@
 
 	atomic_thread_fence_seq_cst();
 
-	lkpi_alloc_current = linux_alloc_current;
 	linuxkpi_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
 	    linuxkpi_thread_dtor, NULL, EVENTHANDLER_PRI_ANY);
+	lkpi_alloc_current = linux_alloc_current;
 }
 SYSINIT(linux_current, SI_SUB_EVENTHANDLER, SI_ORDER_SECOND,
     linux_current_init, NULL);
@@ -337,17 +328,10 @@
 	}
 	sx_sunlock(&allproc_lock);
 
-	/*
-	 * There is a window where threads are removed from the
-	 * process list and where the thread destructor is invoked.
-	 * Catch that window by waiting for all task_struct
-	 * allocations to be returned before freeing the UMA zone.
-	 */
-	while (atomic_read(&linux_current_allocs) != 0)
-		pause("W", 1);
+	thread_reap_barrier();
 
 	EVENTHANDLER_DEREGISTER(thread_dtor, linuxkpi_thread_dtor_tag);
-
+
 	uma_zdestroy(linux_current_zone);
 	uma_zdestroy(linux_mm_zone);
 }
diff --git a/sys/kern/kern_thread.c b/sys/kern/kern_thread.c
--- a/sys/kern/kern_thread.c
+++ b/sys/kern/kern_thread.c
@@ -541,7 +541,8 @@
 
 	TASK_INIT(&thread_reap_task, 0, thread_reap_task_cb, NULL);
 	callout_init(&thread_reap_callout, 1);
-	callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL);
+	callout_reset(&thread_reap_callout, 5 * hz,
+	    thread_reap_callout_cb, NULL);
 }
 
 /*
@@ -704,7 +705,40 @@
 
 	if (wantreap)
 		taskqueue_enqueue(taskqueue_thread, &thread_reap_task);
-	callout_reset(&thread_reap_callout, 5 * hz, thread_reap_callout_cb, NULL);
+	callout_reset(&thread_reap_callout, 5 * hz,
+	    thread_reap_callout_cb, NULL);
+}
+
+/*
+ * Calling this function guarantees that any thread that exited before
+ * the call is reaped when the function returns.  By 'exited' we mean
+ * a thread removed from the process linkage with thread_unlink().
+ * Practically this means that caller must lock/unlock corresponding
+ * process lock before the call, to synchronize with thread_exit().
+ */
+void
+thread_reap_barrier(void)
+{
+	struct thread *td;
+	struct task *t;
+
+	td = curthread;
+
+	/*
+	 * First do context switches to each CPU to ensure that all
+	 * PCPU pc_deadthreads are moved to zombie list.
+	 */
+	quiesce_all_cpus("", PDROP);
+
+	/*
+	 * Second, fire the task in the same thread as normal
+	 * thread_reap() is done, to serialize reaping.
+	 */
+	t = malloc(sizeof(*t), M_TEMP, M_WAITOK);
+	TASK_INIT(t, 0, thread_reap_task_cb, t);
+	taskqueue_enqueue(taskqueue_thread, t);
+	taskqueue_drain(taskqueue_thread, t);
+	free(t, M_TEMP);
 }
 
 /*
diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c
--- a/sys/kern/subr_smp.c
+++ b/sys/kern/subr_smp.c
@@ -943,25 +943,31 @@
 }
 
 /*
+ * If (prio & PDROP) == 0:
  * Wait for specified idle threads to switch once.  This ensures that even
  * preempted threads have cycled through the switch function once,
  * exiting their codepaths.  This allows us to change global pointers
  * with no other synchronization.
+ * If (prio & PDROP) != 0:
+ * Force the specified CPUs to switch context at least once.
  */
 int
 quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
 {
 	struct pcpu *pcpu;
-	u_int gen[MAXCPU];
+	u_int *gen;
 	int error;
 	int cpu;
 
 	error = 0;
-	for (cpu = 0; cpu <= mp_maxid; cpu++) {
-		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
-			continue;
-		pcpu = pcpu_find(cpu);
-		gen[cpu] = pcpu->pc_idlethread->td_generation;
+	if ((prio & PDROP) == 0) {
+		gen = malloc(sizeof(u_int) * MAXCPU, M_TEMP, M_WAITOK);
+		for (cpu = 0; cpu <= mp_maxid; cpu++) {
+			if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
+				continue;
+			pcpu = pcpu_find(cpu);
+			gen[cpu] = pcpu->pc_idlethread->td_generation;
+		}
 	}
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
@@ -970,8 +976,10 @@
 		thread_lock(curthread);
 		sched_bind(curthread, cpu);
 		thread_unlock(curthread);
+		if ((prio & PDROP) != 0)
+			continue;
 		while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
-			error = tsleep(quiesce_cpus, prio, wmesg, 1);
+			error = tsleep(quiesce_cpus, prio & ~PDROP, wmesg, 1);
 			if (error != EWOULDBLOCK)
 				goto out;
 			error = 0;
@@ -981,6 +989,8 @@
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
+	if ((prio & PDROP) == 0)
+		free(gen, M_TEMP);
 	return (error);
 }
 
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -1188,6 +1188,7 @@
 void	thread_exit(void) __dead2;
 void	thread_free(struct thread *td);
 void	thread_link(struct thread *td, struct proc *p);
+void	thread_reap_barrier(void);
 int	thread_single(struct proc *p, int how);
 void	thread_single_end(struct proc *p, int how);
 void	thread_stash(struct thread *td);
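
Usage note, not part of the patch: the linux_current.c hunk above is the motivating consumer of the new KPI. As a minimal sketch, any hypothetical subsystem that frees per-thread state from a thread_dtor eventhandler could tear down the same way; foo_zone and foo_thread_dtor_tag below are invented names, not existing kernel symbols.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/proc.h>

#include <vm/uma.h>

static uma_zone_t foo_zone;			/* hypothetical per-thread object zone */
static eventhandler_tag foo_thread_dtor_tag;	/* registered against thread_dtor */

static void
foo_uninit(void *arg __unused)
{
	/*
	 * Threads that already exited may still be waiting for the
	 * reaper, which is what runs the thread_dtor eventhandler and
	 * hence our uma_zfree().  The barrier closes that window
	 * before the backing zone is destroyed, replacing the old
	 * pattern of polling a private allocation counter.
	 */
	thread_reap_barrier();

	EVENTHANDLER_DEREGISTER(thread_dtor, foo_thread_dtor_tag);
	uma_zdestroy(foo_zone);
}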