Index: sys/kern/subr_epoch.c
===================================================================
--- sys/kern/subr_epoch.c
+++ sys/kern/subr_epoch.c
@@ -66,16 +66,18 @@
 #define EPOCH_ALIGN CACHE_LINE_SIZE
 #endif

-TAILQ_HEAD (epoch_tdlist, epoch_tracker);
+TAILQ_HEAD(epoch_tdlist, epoch_tracker);
 typedef struct epoch_record {
         ck_epoch_record_t er_record;
-        struct epoch_context er_drain_ctx;
         struct epoch *er_parent;
-        volatile struct epoch_tdlist er_tdlist;
-        volatile uint32_t er_gen;
+        struct epoch_tdlist er_tdlist;
+        struct thread *er_firsttd;
+        struct thread *er_blockedtd;
+        struct mtx er_lock;
+        struct lock_object er_lo;
         uint32_t er_cpuid;
         int er_drain_state;
-} __aligned(EPOCH_ALIGN)        *epoch_record_t;
+} __aligned(EPOCH_ALIGN) *epoch_record_t;

 #define EPOCH_DRAIN_START 2
 #define EPOCH_DRAIN_RUNNING 1
@@ -91,8 +93,6 @@
         const char *e_name;
 };

-/* arbitrary --- needs benchmarking */
-#define MAX_ADAPTIVE_SPIN 100
 #define MAX_EPOCHS 64

 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
@@ -101,33 +101,22 @@
 SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "epoch stats");

-/* Stats. */
-static counter_u64_t block_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
-    &block_count, "# of times a thread was in an epoch when epoch_wait was called");
-static counter_u64_t migrate_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
-    &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
-static counter_u64_t turnstile_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
-    &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
-static counter_u64_t switch_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
-    &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
-static counter_u64_t epoch_call_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
-    &epoch_call_count, "# of times a callback was deferred");
-static counter_u64_t epoch_call_task_count;
-
-SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
-    &epoch_call_task_count, "# of times a callback task was run");
-
-TAILQ_HEAD (threadlist, thread);
+static COUNTER_U64_DEFINE_EARLY(block_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, blocked, CTLFLAG_RW,
+    &block_count,
+    "Number of times a thread was in an epoch when epoch_wait was called");
+static COUNTER_U64_DEFINE_EARLY(turnstile_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, contended, CTLFLAG_RW,
+    &turnstile_count,
+    "Number of times a thread was blocked on a lock in an epoch during an epoch_wait");
+static COUNTER_U64_DEFINE_EARLY(call_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, calls, CTLFLAG_RW,
+    &call_count,
+    "Number of times a callback was deferred");
+static COUNTER_U64_DEFINE_EARLY(call_task_count);
+SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, call_tasks, CTLFLAG_RW,
+    &call_task_count,
+    "Number of times a callback task was run");

 CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
     ck_epoch_entry_container)
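For context on the counter conversion in the hunk above: COUNTER_U64_DEFINE_EARLY() gives a counter that is statically backed and safe to update from very early boot, which is what lets a later hunk drop the counter_u64_alloc() calls from epoch_init(). A minimal sketch of the pattern, not part of this diff (the example_count counter and the "example" sysctl name are hypothetical; it assumes the _kern_epoch_stats node declared above is in scope):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/sysctl.h>

/* Hypothetical counter following the same pattern as block_count above. */
static COUNTER_U64_DEFINE_EARLY(example_count);
SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, example, CTLFLAG_RW,
    &example_count, "Number of times the example event happened");

static void
example_event(void)
{
        /* Usable even before the SI_SUB_COUNTER SYSINIT has run. */
        counter_u64_add(example_count, 1);
}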
@@ -186,7 +175,6 @@
         va_list ap;
         struct stackentry se, *new;

-        stack_zero(&se.se_stack);        /* XXX: is it really needed? */
         stack_save(&se.se_stack);

         /* Tree is never reduced - go lockless. */
@@ -265,13 +253,6 @@
 {
         int cpu;

-        block_count = counter_u64_alloc(M_WAITOK);
-        migrate_count = counter_u64_alloc(M_WAITOK);
-        turnstile_count = counter_u64_alloc(M_WAITOK);
-        switch_count = counter_u64_alloc(M_WAITOK);
-        epoch_call_count = counter_u64_alloc(M_WAITOK);
-        epoch_call_task_count = counter_u64_alloc(M_WAITOK);
-
         pcpu_zone_record = uma_zcreate("epoch_record pcpu",
             sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
             UMA_ALIGN_PTR, UMA_ZONE_PCPU);
@@ -306,24 +287,39 @@
         epoch_record_t er;
         int cpu;

-        epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
+        epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK |
+            M_ZERO);
         CPU_FOREACH(cpu) {
                 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
-                bzero(er, sizeof(*er));
                 ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
-                TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
+                mtx_init(&er->er_lock, "epoch wait", NULL, MTX_DEF);
+                er->er_lo.lo_name = epoch->e_name;
+                TAILQ_INIT(&er->er_tdlist);
                 er->er_cpuid = cpu;
                 er->er_parent = epoch;
         }
 }

+/*
+ * Slow path for epoch_exit_preempt(): wake up blocked threads that have
+ * propagated their scheduling priority to us.
+ */
 static void
-epoch_adjust_prio(struct thread *td, u_char prio)
+epoch_unblock(epoch_record_t er)
 {
+        struct turnstile *ts;

-        thread_lock(td);
-        sched_prio(td, prio);
-        thread_unlock(td);
+        KASSERT(er->er_blockedtd == curthread,
+            ("%s: unblocking from wrong thread", __func__));
+
+        mtx_lock(&er->er_lock);
+        turnstile_chain_lock(&er->er_lo);
+        ts = turnstile_lookup(&er->er_lo);
+        turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+        turnstile_unpend(ts);
+        turnstile_chain_unlock(&er->er_lo);
+        er->er_blockedtd = NULL;
+        mtx_unlock(&er->er_lock);
 }

 epoch_t
@@ -391,9 +387,9 @@
         THREAD_NO_SLEEPING();
         critical_enter();
         sched_pin();
-        td->td_pre_epoch_prio = td->td_priority;
         er = epoch_currecord(epoch);
         TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
+        er->er_firsttd = TAILQ_FIRST(&er->er_tdlist)->et_td;
         ck_epoch_begin(&er->er_record, &et->et_section);
         critical_exit();
 }
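The enter path above records the oldest tracker's thread in er_firsttd so that the wait side can later target it. For reference, a minimal reader-side sketch of how these functions are used, not part of this diff (foo_epoch and foo_read are hypothetical; the epoch would come from epoch_alloc("foo", EPOCH_PREEMPT)):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>

static epoch_t foo_epoch;        /* hypothetical, from epoch_alloc() */

static void
foo_read(void)
{
        struct epoch_tracker et;        /* queued on this CPU's er_tdlist */

        epoch_enter_preempt(foo_epoch, &et);
        /* Lockless reads of epoch-protected data; sleeping is not allowed. */
        epoch_exit_preempt(foo_epoch, &et);
}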
@@ -414,26 +410,27 @@
 _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
 {
         struct epoch_record *er;
+        struct epoch_tracker *fet;
         struct thread *td;

         INIT_CHECK(epoch);
         td = curthread;
+        THREAD_SLEEPING_OK();
+
         critical_enter();
         sched_unpin();
-        THREAD_SLEEPING_OK();
         er = epoch_currecord(epoch);
+
         MPASS(epoch->e_flags & EPOCH_PREEMPT);
-        MPASS(et != NULL);
         MPASS(et->et_td == td);
-#ifdef INVARIANTS
-        et->et_td = (void*)0xDEADBEEF;
-#endif
+
         ck_epoch_end(&er->er_record, &et->et_section);
         TAILQ_REMOVE(&er->er_tdlist, et, et_link);
-        er->er_gen++;
-        if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
-                epoch_adjust_prio(td, td->td_pre_epoch_prio);
+        fet = TAILQ_FIRST(&er->er_tdlist);
+        er->er_firsttd = fet != NULL ? fet->et_td : NULL;
         critical_exit();
+        if (__predict_false(er->er_blockedtd == td))
+                epoch_unblock(er);
 #ifdef EPOCH_TRACE
         epoch_trace_exit(td, epoch, et, file, line);
 #endif
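The hunk below rewrites epoch_block_handler_preempt(), which ck_epoch_synchronize_wait() invokes from epoch_wait_preempt() while a grace period has not yet elapsed. For reference, a minimal writer-side sketch of the path that ends up there, not part of this diff (struct foo, foo_epoch and foo_destroy are hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/queue.h>

struct foo {
        LIST_ENTRY(foo) f_link;
};

static epoch_t foo_epoch;        /* hypothetical, from epoch_alloc() */

static void
foo_destroy(struct foo *f)
{
        /* Unpublish under the writer's lock (not shown). */
        LIST_REMOVE(f, f_link);

        /*
         * Block until every reader that could still see 'f' has left its
         * epoch section; the block handler below runs when such a reader
         * has been preempted off-CPU.
         */
        epoch_wait_preempt(foo_epoch);
        free(f, M_TEMP);
}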
@@ -458,148 +455,61 @@
 epoch_block_handler_preempt(struct ck_epoch *global __unused,
     ck_epoch_record_t *cr, void *arg __unused)
 {
-        epoch_record_t record;
-        struct thread *td, *owner, *curwaittd;
-        struct epoch_tracker *tdwait;
+        struct epoch_record *er;
+        struct thread *td;
         struct turnstile *ts;
-        struct lock_object *lock;
-        int spincount, gen;
-        int locksheld __unused;

-        record = __containerof(cr, struct epoch_record, er_record);
-        td = curthread;
-        locksheld = td->td_locks;
-        spincount = 0;
         counter_u64_add(block_count, 1);
-        /*
-         * We lost a race and there's no longer any threads
-         * on the CPU in an epoch section.
-         */
-        if (TAILQ_EMPTY(&record->er_tdlist))
-                return;

-        if (record->er_cpuid != curcpu) {
+        er = __containerof(cr, struct epoch_record, er_record);
+
+        td = er->er_firsttd;
+        if (td == NULL)
+                return;
+        if (TD_IS_RUNNING(td)) {
                 /*
-                 * If the head of the list is running, we can wait for it
-                 * to remove itself from the list and thus save us the
-                 * overhead of a migration
+                 * There is nothing useful we can do until this thread exits the
+                 * epoch.
                 */
-                gen = record->er_gen;
-                thread_unlock(td);
+                cpu_spinwait();
+                return;
+        }
+
+        mtx_lock(&er->er_lock);
+        if (er->er_blockedtd == NULL) {
                 /*
-                 * We can't actually check if the waiting thread is running
-                 * so we simply poll for it to exit before giving up and
-                 * migrating.
+                 * A thread in the target epoch is off-CPU.  Prepare to make it
+                 * the owner of this CPU's turnstile so that we can lend
+                 * priority.  Ensure that it will wake us up upon exiting the
+                 * section, using the thread lock to ensure that it doesn't get
+                 * scheduled and exit the section before we're ready.
                 */
-                do {
-                        cpu_spinwait();
-                } while (!TAILQ_EMPTY(&record->er_tdlist) &&
-                    gen == record->er_gen &&
-                    spincount++ < MAX_ADAPTIVE_SPIN);
                 thread_lock(td);
-                /*
-                 * If the generation has changed we can poll again
-                 * otherwise we need to migrate.
-                 */
-                if (gen != record->er_gen)
+                if (TD_IS_RUNNING(td) || td != er->er_firsttd) {
+                        thread_unlock(td);
+                        mtx_unlock(&er->er_lock);
                         return;
+                }
+                er->er_blockedtd = td;
+                thread_unlock(td);
+        } else {
                 /*
-                 * Being on the same CPU as that of the record on which
-                 * we need to wait allows us access to the thread
-                 * list associated with that CPU. We can then examine the
-                 * oldest thread in the queue and wait on its turnstile
-                 * until it resumes and so on until a grace period
-                 * elapses.
-                 *
-                 */
-                counter_u64_add(migrate_count, 1);
-                sched_bind(td, record->er_cpuid);
-                /*
-                 * At this point we need to return to the ck code
-                 * to scan to see if a grace period has elapsed.
-                 * We can't move on to check the thread list, because
-                 * in the meantime new threads may have arrived that
-                 * in fact belong to a different epoch.
+                 * At least one other thread is blocked waiting for a thread to
+                 * exit the target epoch.  Join it.
                 */
-                return;
+                td = er->er_blockedtd;
         }
-        /*
-         * Try to find a thread in an epoch section on this CPU
-         * waiting on a turnstile. Otherwise find the lowest
-         * priority thread (highest prio value) and drop our priority
-         * to match to allow it to run.
-         */
-        TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
-                /*
-                 * Propagate our priority to any other waiters to prevent us
-                 * from starving them. They will have their original priority
-                 * restore on exit from epoch_wait().
-                 */
-                curwaittd = tdwait->et_td;
-                if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
-                        critical_enter();
-                        thread_unlock(td);
-                        thread_lock(curwaittd);
-                        sched_prio(curwaittd, td->td_priority);
-                        thread_unlock(curwaittd);
-                        thread_lock(td);
-                        critical_exit();
-                }
-                if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
-                    ((ts = curwaittd->td_blocked) != NULL)) {
-                        /*
-                         * We unlock td to allow turnstile_wait to reacquire
-                         * the thread lock. Before unlocking it we enter a
-                         * critical section to prevent preemption after we
-                         * reenable interrupts by dropping the thread lock in
-                         * order to prevent curwaittd from getting to run.
-                         */
-                        critical_enter();
-                        thread_unlock(td);
+        ts = turnstile_trywait(&er->er_lo);
+        mtx_unlock(&er->er_lock);

-                        if (turnstile_lock(ts, &lock, &owner)) {
-                                if (ts == curwaittd->td_blocked) {
-                                        MPASS(TD_IS_INHIBITED(curwaittd) &&
-                                            TD_ON_LOCK(curwaittd));
-                                        critical_exit();
-                                        turnstile_wait(ts, owner,
-                                            curwaittd->td_tsqueue);
-                                        counter_u64_add(turnstile_count, 1);
-                                        thread_lock(td);
-                                        return;
-                                }
-                                turnstile_unlock(ts, lock);
-                        }
-                        thread_lock(td);
-                        critical_exit();
-                        KASSERT(td->td_locks == locksheld,
-                            ("%d extra locks held", td->td_locks - locksheld));
-                }
-        }
-        /*
-         * We didn't find any threads actually blocked on a lock
-         * so we have nothing to do except context switch away.
-         */
-        counter_u64_add(switch_count, 1);
-        mi_switch(SW_VOL | SWT_RELINQUISH);
-        /*
-         * It is important the thread lock is dropped while yielding
-         * to allow other threads to acquire the lock pointed to by
-         * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
-         * thread lock before returning. Else a deadlock like
-         * situation might happen.
-         */
-        thread_lock(td);
+        counter_u64_add(turnstile_count, 1);
+        turnstile_wait(ts, td, TS_EXCLUSIVE_QUEUE);
 }

 void
 epoch_wait_preempt(epoch_t epoch)
 {
         struct thread *td;
-        int was_bound;
-        int old_cpu;
-        int old_pinned;
-        u_char old_prio;
         int locks __unused;

         MPASS(cold || epoch != NULL);
@@ -615,34 +525,10 @@
             "of an epoch section of the same epoch"));
 #endif
         DROP_GIANT();
-        thread_lock(td);
-
-        old_cpu = PCPU_GET(cpuid);
-        old_pinned = td->td_pinned;
-        old_prio = td->td_priority;
-        was_bound = sched_is_bound(td);
-        sched_unbind(td);
-        td->td_pinned = 0;
-        sched_bind(td, old_cpu);

         ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
             NULL);

-        /* restore CPU binding, if any */
-        if (was_bound != 0) {
-                sched_bind(td, old_cpu);
-        } else {
-                /* get thread back to initial CPU, if any */
-                if (old_pinned != 0)
-                        sched_bind(td, old_cpu);
-                sched_unbind(td);
-        }
-        /* restore pinned after bind */
-        td->td_pinned = old_pinned;
-
-        /* restore thread priority */
-        sched_prio(td, old_prio);
-        thread_unlock(td);
         PICKUP_GIANT();
         KASSERT(td->td_locks == locks,
             ("%d residual locks held", td->td_locks - locks));
@@ -731,8 +617,8 @@
         *DPCPU_PTR(epoch_cb_count) -= total;
         critical_exit();

-        counter_u64_add(epoch_call_count, total);
-        counter_u64_add(epoch_call_task_count, 1);
+        counter_u64_add(call_count, total);
+        counter_u64_add(call_task_count, 1);

         head = ck_stack_batch_pop_npsc(&cb_stack);
         for (cursor = head; cursor != NULL; cursor = next) {
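The subr_turnstile.c and sys/sys/turnstile.h changes that follow remove turnstile_lock() and turnstile_unlock(), whose only consumer was the old block handler deleted above; the new scheme uses only the remaining turnstile interface. A condensed sketch of that sleep/wake pairing, not part of this diff (example_sleep and example_wakeup are hypothetical names; error handling such as turnstile_cancel() is omitted):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/turnstile.h>

/*
 * Waiter side, as in the new epoch_block_handler_preempt(): park on the
 * turnstile keyed by 'lo' and lend our priority to 'owner' until it wakes
 * us.  turnstile_trywait() locks the turnstile chain for 'lo';
 * turnstile_wait() blocks and releases it.
 */
static void
example_sleep(struct lock_object *lo, struct thread *owner)
{
        struct turnstile *ts;

        ts = turnstile_trywait(lo);
        turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
}

/*
 * Owner side, as in epoch_unblock(): wake every thread parked on 'lo'.
 */
static void
example_wakeup(struct lock_object *lo)
{
        struct turnstile *ts;

        turnstile_chain_lock(lo);
        ts = turnstile_lookup(lo);
        if (ts != NULL) {
                turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
                turnstile_unpend(ts);
        }
        turnstile_chain_unlock(lo);
}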
Index: sys/kern/subr_turnstile.c
===================================================================
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -590,41 +590,6 @@
         return (ts);
 }

-bool
-turnstile_lock(struct turnstile *ts, struct lock_object **lockp,
-    struct thread **tdp)
-{
-        struct turnstile_chain *tc;
-        struct lock_object *lock;
-        if ((lock = ts->ts_lockobj) == NULL)
-                return (false);
-        tc = TC_LOOKUP(lock);
-        mtx_lock_spin(&tc->tc_lock);
-        mtx_lock_spin(&ts->ts_lock);
-        if (__predict_false(lock != ts->ts_lockobj)) {
-                mtx_unlock_spin(&tc->tc_lock);
-                mtx_unlock_spin(&ts->ts_lock);
-                return (false);
-        }
-        *lockp = lock;
-        *tdp = ts->ts_owner;
-        return (true);
-}
-
-void
-turnstile_unlock(struct turnstile *ts, struct lock_object *lock)
-{
-        struct turnstile_chain *tc;
-
-        mtx_assert(&ts->ts_lock, MA_OWNED);
-        mtx_unlock_spin(&ts->ts_lock);
-        if (ts == curthread->td_turnstile)
-                ts->ts_lockobj = NULL;
-        tc = TC_LOOKUP(lock);
-        mtx_unlock_spin(&tc->tc_lock);
-}
-
 void
 turnstile_assert(struct turnstile *ts)
 {
Index: sys/sys/epoch.h
===================================================================
--- sys/sys/epoch.h
+++ sys/sys/epoch.h
@@ -61,7 +61,7 @@
         const char *et_file;
         int et_line;
 #endif
-} __aligned(sizeof(void *));
+};
 typedef struct epoch_tracker *epoch_tracker_t;

 epoch_t epoch_alloc(const char *name, int flags);
Index: sys/sys/proc.h
===================================================================
--- sys/sys/proc.h
+++ sys/sys/proc.h
@@ -316,7 +316,6 @@
         u_char          td_pri_class;   /* (t) Scheduling class. */
         u_char          td_user_pri;    /* (t) User pri from estcpu and nice. */
         u_char          td_base_user_pri; /* (t) Base user pri */
-        u_char          td_pre_epoch_prio; /* (k) User pri on entry to epoch */
         uintptr_t       td_rb_list;     /* (k) Robust list head. */
         uintptr_t       td_rbp_list;    /* (k) Robust priv list head. */
         uintptr_t       td_rb_inact;    /* (k) Current in-action mutex loc. */
Index: sys/sys/turnstile.h
===================================================================
--- sys/sys/turnstile.h
+++ sys/sys/turnstile.h
@@ -99,9 +99,7 @@
 struct turnstile *turnstile_trywait(struct lock_object *);
 void    turnstile_unpend(struct turnstile *);
 void    turnstile_wait(struct turnstile *, struct thread *, int);
-bool    turnstile_lock(struct turnstile *, struct lock_object **,
-            struct thread **);
-void    turnstile_unlock(struct turnstile *, struct lock_object *);
 void    turnstile_assert(struct turnstile *);
+
 #endif  /* _KERNEL */
 #endif  /* _SYS_TURNSTILE_H_ */