Index: head/sys/kern/kern_clock.c
===================================================================
--- head/sys/kern/kern_clock.c	(revision 333760)
+++ head/sys/kern/kern_clock.c	(revision 333761)
@@ -1,895 +1,898 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kdb.h"
 #include "opt_device_polling.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ntp.h"
 #include "opt_watchdog.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/callout.h>
+#include <sys/epoch.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/sysctl.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
 #include <sys/limits.h>
 #include <sys/timetc.h>
 
 #ifdef GPROF
 #include <sys/gmon.h>
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , clock, hard);
 PMC_SOFT_DEFINE( , , clock, stat);
 PMC_SOFT_DEFINE_EX( , , clock, prof, \
     cpu_startprofclock, cpu_stopprofclock);
 #endif
 
 #ifdef DEVICE_POLLING
 extern void hardclock_device_poll(void);
 #endif /* DEVICE_POLLING */
 
 static void initclocks(void *dummy);
 SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL);
 
 /* Mutex protecting profprocs and profile clock start/stop. */
 static struct mtx time_lock;
 
 SDT_PROVIDER_DECLARE(sched);
 SDT_PROBE_DEFINE2(sched, , , tick, "struct thread *", "struct proc *");
 
 static int
 sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	long cp_time[CPUSTATES];
 #ifdef SCTL_MASK32
 	int i;
 	unsigned int cp_time32[CPUSTATES];
 #endif
 
 	read_cpu_time(cp_time);
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32));
 		for (i = 0; i < CPUSTATES; i++)
 			cp_time32[i] = (unsigned int)cp_time[i];
 		error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
 	} else
 #endif
 	{
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time));
 		error = SYSCTL_OUT(req, cp_time, sizeof(cp_time));
 	}
 	return error;
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
 
 static long empty[CPUSTATES];
 
 /*
  * Sysctl handler behind kern.cp_times: emits one CPUSTATES-sized array of
  * counters for every CPU id from 0 through mp_maxid.  Ids for which
  * CPU_ABSENT() is true are reported from the empty[] array instead of a
  * pcpu structure.  Requests flagged SCTL_MASK32 get each counter converted
  * to unsigned int.  Returns the last SYSCTL_OUT() result.
  */
 static int
 sysctl_kern_cp_times(SYSCTL_HANDLER_ARGS)
 {
 	struct pcpu *pcpu;
 	int error;
 	int c;
 	long *cp_time;
 #ifdef SCTL_MASK32
 	unsigned int cp_time32[CPUSTATES];
 	int i;
 #endif
 
 	/* No old-value buffer: hand SYSCTL_OUT() only the total length. */
 	if (!req->oldptr) {
 #ifdef SCTL_MASK32
 		if (req->flags & SCTL_MASK32)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32) * (mp_maxid + 1));
 		else
 #endif
 			return SYSCTL_OUT(req, 0, sizeof(long) * CPUSTATES * (mp_maxid + 1));
 	}
 	/* One record per CPU id; the loop ends at the first SYSCTL_OUT() error. */
 	for (error = 0, c = 0; error == 0 && c <= mp_maxid; c++) {
 		if (!CPU_ABSENT(c)) {
 			pcpu = pcpu_find(c);
 			cp_time = pcpu->pc_cp_time;
 		} else {
 			/* Absent ids still occupy a slot in the output. */
 			cp_time = empty;
 		}
 #ifdef SCTL_MASK32
 		if (req->flags & SCTL_MASK32) {
 			for (i = 0; i < CPUSTATES; i++)
 				cp_time32[i] = (unsigned int)cp_time[i];
 			error = SYSCTL_OUT(req, cp_time32, sizeof(cp_time32));
 		} else
 #endif
 			error = SYSCTL_OUT(req, cp_time, sizeof(long) * CPUSTATES);
 	}
 	return error;
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, cp_times, CTLTYPE_LONG|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0,0, sysctl_kern_cp_times, "LU", "per-CPU time statistics");
 
 #ifdef DEADLKRES
 /*
  * Wait messages that deadlkres() compares against td_wmesg; a sleeping
  * thread whose message matches is never reported as deadlocked.
  */
 static const char *blessed[] = {
 	"getblk",
 	"so_snd_sx",
 	"so_rcv_sx",
 	NULL
 };
 /* Tunables in seconds; deadlkres() multiplies each of them by hz. */
 static int slptime_threshold = 1800;
 static int blktime_threshold = 900;
 static int sleepfreq = 3;
 
 /*
  * Body of the "deadlkres" kernel thread; it loops forever.  Each pass walks
  * every process and thread and panics when a thread has been blocked on a
  * turnstile for more than blktime_threshold seconds, or has been asleep on
  * a SLEEPQ_SX or SLEEPQ_LK sleepqueue for more than slptime_threshold
  * seconds with a wait message that is not listed in blessed[].  Between
  * passes it pauses for sleepfreq seconds.
  */
 static void
 deadlkres(void)
 {
 	struct proc *p;
 	struct thread *td;
 	void *wchan;
 	int blkticks, i, slpticks, slptype, tryl, tticks;
 
 	tryl = 0;
 	for (;;) {
 		/* Re-read the tunables on every pass and convert to ticks. */
 		blkticks = blktime_threshold * hz;
 		slpticks = slptime_threshold * hz;
 
 		/*
 		 * Do not sleep on allproc_lock, to avoid a possible priority
 		 * inversion leading to starvation.  If the lock still cannot
 		 * be taken after more than 100 failed tries, panic.
 		 */
 		if (!sx_try_slock(&allproc_lock)) {
 			if (tryl > 100)
 		panic("%s: possible deadlock detected on allproc_lock\n",
 				    __func__);
 			tryl++;
 			pause("allproc", sleepfreq * hz);
 			continue;
 		}
 		tryl = 0;
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			/* Processes still in state PRS_NEW are skipped. */
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			FOREACH_THREAD_IN_PROC(p, td) {
 
 				thread_lock(td);
 				if (TD_ON_LOCK(td)) {
 
 					/*
 					 * The thread should be blocked on a
 					 * turnstile, simply check if the
 					 * turnstile channel is in good state.
 					 */
 					MPASS(td->td_blocked != NULL);
 
 					tticks = ticks - td->td_blktick;
 					thread_unlock(td);
 					if (tticks > blkticks) {
 
 						/*
 						 * According to the configured
 						 * thresholds, this thread has
 						 * been stuck on a turnstile
 						 * for too long.
 						 */
 						PROC_UNLOCK(p);
 						sx_sunlock(&allproc_lock);
 	panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
 						    __func__, td, tticks);
 					}
 				} else if (TD_IS_SLEEPING(td) &&
 				    TD_ON_SLEEPQ(td)) {
 
 					/*
 					 * Check if the thread is sleeping on a
 					 * lock, otherwise skip the check.
 					 * Drop the thread lock in order to
 					 * avoid a LOR with the sleepqueue
 					 * spinlock.
 					 */
 					wchan = td->td_wchan;
 					tticks = ticks - td->td_slptick;
 					thread_unlock(td);
 					slptype = sleepq_type(wchan);
 					if ((slptype == SLEEPQ_SX ||
 					    slptype == SLEEPQ_LK) &&
 					    tticks > slpticks) {
 
 						/*
 						 * According to the configured
 						 * thresholds, this thread has
 						 * been stuck on a sleepqueue
 						 * for too long.  Since it is
 						 * on a sleepqueue, it may
 						 * still be excused by the
 						 * blessed list.  tryl is
 						 * reused below as the "found
 						 * in blessed[]" flag.
 						 */
 						tryl = 0;
 						for (i = 0; blessed[i] != NULL;
 						    i++) {
 							if (!strcmp(blessed[i],
 							    td->td_wmesg)) {
 								tryl = 1;
 								break;
 							}
 						}
 						/* Blessed: move on to the next thread. */
 						if (tryl != 0) {
 							tryl = 0;
 							continue;
 						}
 						PROC_UNLOCK(p);
 						sx_sunlock(&allproc_lock);
 	panic("%s: possible deadlock detected for %p, blocked for %d ticks\n",
 						    __func__, td, tticks);
 					}
 				} else
 					thread_unlock(td);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 
 		/* Sleep for sleepfreq seconds. */
 		pause("-", sleepfreq * hz);
 	}
 }
 
 static struct kthread_desc deadlkres_kd = {
 	"deadlkres",
 	deadlkres,
 	(struct thread **)NULL
 };
 
 SYSINIT(deadlkres, SI_SUB_CLOCKS, SI_ORDER_ANY, kthread_start, &deadlkres_kd);
 
 static SYSCTL_NODE(_debug, OID_AUTO, deadlkres, CTLFLAG_RW, 0,
     "Deadlock resolver");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, slptime_threshold, CTLFLAG_RW,
     &slptime_threshold, 0,
     "Number of seconds within is valid to sleep on a sleepqueue");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, blktime_threshold, CTLFLAG_RW,
     &blktime_threshold, 0,
     "Number of seconds within is valid to block on a turnstile");
 SYSCTL_INT(_debug_deadlkres, OID_AUTO, sleepfreq, CTLFLAG_RW, &sleepfreq, 0,
     "Number of seconds between any deadlock resolver thread run");
 #endif	/* DEADLKRES */
 
 /*
  * Fill cp_time[] (CPUSTATES entries) with the per-state counters summed
  * over every CPU visited by CPU_FOREACH().
  */
 void
 read_cpu_time(long *cp_time)
 {
 	struct pcpu *cpu_data;
 	int cpu, state;
 
 	/* Start from zero, then accumulate each CPU's pc_cp_time[]. */
 	bzero(cp_time, CPUSTATES * sizeof(long));
 	CPU_FOREACH(cpu) {
 		cpu_data = pcpu_find(cpu);
 		for (state = 0; state < CPUSTATES; state++)
 			cp_time[state] += cpu_data->pc_cp_time[state];
 	}
 }
 
 #include <sys/watchdog.h>
 
 static int watchdog_ticks;
 static int watchdog_enabled;
 static void watchdog_fire(void);
 static void watchdog_config(void *, u_int, int *);
 
 /*
  * Register watchdog_config() as a handler on the watchdog_list event so
  * that it is called when the watchdog is (re)configured.  initclocks()
  * either calls this directly or publishes it via wdog_software_attach.
  */
 static void
 watchdog_attach(void)
 {
 	EVENTHANDLER_REGISTER(watchdog_list, watchdog_config, NULL, 0);
 }
 
 /*
  * Clock handling routines.
  *
  * This code is written to operate with two timers that run independently of
  * each other.
  *
  * The main timer, running hz times per second, is used to trigger interval
  * timers, timeouts and rescheduling as needed.
  *
  * The second timer handles kernel and user profiling,
  * and does resource use estimation.  If the second timer is programmable,
  * it is randomized to avoid aliasing between the two clocks.  For example,
  * the randomization prevents an adversary from always giving up the cpu
  * just before its quantum expires.  Otherwise, it would never accumulate
  * cpu ticks.  The mean frequency of the second timer is stathz.
  *
  * If no second timer exists, stathz will be zero; in this case we drive
  * profiling and statistics off the main clock.  This WILL NOT be accurate;
  * do not do it unless absolutely necessary.
  *
  * The statistics clock may (or may not) be run at a higher rate while
  * profiling.  This profile clock runs at profhz.  We require that profhz
  * be an integral multiple of stathz.
  *
  * If the statistics clock is running fast, it must be divided by the ratio
  * profhz/stathz for statistics.  (For profiling, every tick counts.)
  *
  * Time-of-day is maintained using a "timecounter", which may or may
  * not be related to the hardware generating the above mentioned
  * interrupts.
  */
 
 int	stathz;		/* statistics clock rate; 0 if no second timer */
 int	profhz;		/* profile clock rate; defaulted in initclocks() */
 int	profprocs;	/* count kept by start/stopprofclock() */
 volatile int	ticks;	/* global tick counter advanced by hardclock*() */
 int	psratio;	/* profhz / stathz, computed in initclocks() */
 
 static DPCPU_DEFINE(int, pcputicks);	/* Per-CPU version of ticks. */
 #ifdef DEVICE_POLLING
 static int devpoll_run = 0;
 #endif
 
 /*
  * Initialize clock frequencies and start both clocks running.
  */
 /* ARGSUSED*/
 static void
 initclocks(void *dummy)
 {
 	int stat_rate;
 
 	/*
 	 * Set up time_lock, then let the machine-specific code do its bit.
 	 */
 	mtx_init(&time_lock, "time lock", NULL, MTX_DEF);
 	cpu_initclocks();
 
 	/*
 	 * Statistics run at stathz, or at hz when stathz is zero.  Default
 	 * profhz to that rate if it was left unset, and record the ratio
 	 * between the two.
 	 */
 	stat_rate = (stathz != 0) ? stathz : hz;
 	if (profhz == 0)
 		profhz = stat_rate;
 	psratio = profhz / stat_rate;
 
 #ifdef SW_WATCHDOG
 	/* Attach the hardclock watchdog now, hardware watchdog or not. */
 	watchdog_attach();
 #else
 	/* Offer watchdog_attach() as the software watchdog if none is set. */
 	if (wdog_software_attach == NULL)
 		wdog_software_attach = watchdog_attach;
 #endif
 }
 
 /*
  * Each time the real-time timer fires, this function is called on all CPUs.
  * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
  * the other CPUs in the system need to call this function.
  */
 void
 hardclock_cpu(int usermode)
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int flags;
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 * ITIMER_VIRTUAL is only charged when the tick arrived in user
 	 * mode; ITIMER_PROF is charged regardless of mode.  A timer for
 	 * which itimerdecr() returns 0 requests an AST on the thread.
 	 */
 	pstats = p->p_stats;
 	flags = 0;
 	if (usermode &&
 	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
 		PROC_ITIMLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
 			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
 		PROC_ITIMUNLOCK(p);
 	}
 	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
 		PROC_ITIMLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
 			flags |= TDF_PROFPEND | TDF_ASTPENDING;
 		PROC_ITIMUNLOCK(p);
 	}
 	/*
 	 * Only take the thread lock when there is a flag to set; ORing in
 	 * zero would change nothing.  hardclock_cnt() uses the same guard.
 	 */
 	if (flags != 0) {
 		thread_lock(td);
 		td->td_flags |= flags;
 		thread_unlock(td);
 	}
 
 #ifdef HWPMC_HOOKS
 	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
 		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
 #endif
 	callout_process(sbinuptime());
+	epoch_pcpu_poll();
 }
 
 /*
  * The real-time timer, interrupting hz times per second.
  */
 void
 hardclock(int usermode, uintfptr_t pc)
 {
 
 	/* Advance the global tick counter, then do the per-CPU work. */
 	atomic_add_int(&ticks, 1);
 	hardclock_cpu(usermode);
 	tc_ticktock(1);
 	cpu_tick_calibration();
 	/*
 	 * If no separate statistics clock is available, run it from here.
 	 *
 	 * XXX: this only works for UP
 	 */
 	if (stathz == 0) {
 		profclock(usermode, pc);
 		statclock(usermode);
 	}
 #ifdef DEVICE_POLLING
 	hardclock_device_poll();	/* this is very short and quick */
 #endif /* DEVICE_POLLING */
 	/* Count the software watchdog down; fire once it reaches zero. */
 	if (watchdog_enabled > 0 && --watchdog_ticks <= 0)
 		watchdog_fire();
 }
 
 /*
  * Per-CPU clock handler covering cnt ticks at once.  Adds cnt to this CPU's
  * pcputicks counter; if that puts it ahead of the global ticks value, the
  * global is advanced with a compare-and-set and the global duties
  * (tc_ticktock(), device polling, software watchdog) are run for the newly
  * covered ticks.  The interval timers of the current process are charged
  * tick * cnt on every call.
  */
 void
 hardclock_cnt(int cnt, int usermode)
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int *t = DPCPU_PTR(pcputicks);
 	int flags, global, newticks;
 	int i;
 
 	/*
 	 * Update per-CPU and possibly global ticks values.  newticks ends
 	 * up positive only if this CPU's compare-and-set moved the global
 	 * counter forward.  A CPU that has fallen more than one tick behind
 	 * the global value is pulled up to global - 1.
 	 */
 	*t += cnt;
 	do {
 		global = ticks;
 		newticks = *t - global;
 		if (newticks <= 0) {
 			if (newticks < -1)
 				*t = global - 1;
 			newticks = 0;
 			break;
 		}
 	} while (!atomic_cmpset_int(&ticks, global, *t));
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 */
 	pstats = p->p_stats;
 	flags = 0;
 	if (usermode &&
 	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
 		PROC_ITIMLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL],
 		    tick * cnt) == 0)
 			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
 		PROC_ITIMUNLOCK(p);
 	}
 	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
 		PROC_ITIMLOCK(p);
 		if (itimerdecr(&pstats->p_timer[ITIMER_PROF],
 		    tick * cnt) == 0)
 			flags |= TDF_PROFPEND | TDF_ASTPENDING;
 		PROC_ITIMUNLOCK(p);
 	}
 	/* The thread lock is only taken when there is a flag to set. */
 	if (flags != 0) {
 		thread_lock(td);
 		td->td_flags |= flags;
 		thread_unlock(td);
 	}
 
 #ifdef	HWPMC_HOOKS
 	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
 		PMC_CALL_HOOK_UNLOCKED(curthread, PMC_FN_DO_SAMPLES, NULL);
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, hard, td->td_intr_frame);
 #endif
 	/* This CPU advanced the global ticks, so it handles the tick duties. */
 	if (newticks > 0) {
 		tc_ticktock(newticks);
 #ifdef DEVICE_POLLING
 		/* Dangerous and no need to call these things concurrently. */
 		if (atomic_cmpset_acq_int(&devpoll_run, 0, 1)) {
 			/* This is very short and quick. */
 			hardclock_device_poll();
 			atomic_store_rel_int(&devpoll_run, 0);
 		}
 #endif /* DEVICE_POLLING */
 		if (watchdog_enabled > 0) {
 			/*
 			 * NOTE(review): relies on atomic_fetchadd_int()
 			 * returning the value before the subtraction, so the
 			 * watchdog fires on the call that takes the counter
 			 * from positive to zero or below -- confirm.
 			 */
 			i = atomic_fetchadd_int(&watchdog_ticks, -newticks);
 			if (i > 0 && i <= newticks)
 				watchdog_fire();
 		}
 	}
 	if (curcpu == CPU_FIRST())
 		cpu_tick_calibration();
+	epoch_pcpu_poll();
 }
 
 void
 hardclock_sync(int cpu)
 {
 	int *t;
 	KASSERT(!CPU_ABSENT(cpu), ("Absent CPU %d", cpu));
 	t = DPCPU_ID_PTR(cpu, pcputicks);
 
 	*t = ticks;
 }
 
 /*
  * Compute number of ticks in the specified amount of time.
  */
 int
 tvtohz(struct timeval *tv)
 {
 	unsigned long ticks;
 	long sec, usec;
 
 	/*
 	 * If the number of usecs in the whole seconds part of the time
 	 * difference fits in a long, then the total number of usecs will
 	 * fit in an unsigned long.  Compute the total and convert it to
 	 * ticks, rounding up and adding 1 to allow for the current tick
 	 * to expire.  Rounding also depends on unsigned long arithmetic
 	 * to avoid overflow.
 	 *
 	 * Otherwise, if the number of ticks in the whole seconds part of
 	 * the time difference fits in a long, then convert the parts to
 	 * ticks separately and add, using similar rounding methods and
 	 * overflow avoidance.  This method would work in the previous
 	 * case but it is slightly slower and assumes that hz is integral.
 	 *
 	 * Otherwise, round the time difference down to the maximum
 	 * representable value.
 	 *
 	 * If ints have 32 bits, then the maximum value for any timeout in
 	 * 10ms ticks is 248 days.
 	 */
 	sec = tv->tv_sec;
 	usec = tv->tv_usec;
 	if (usec < 0) {
 		sec--;
 		usec += 1000000;
 	}
 	if (sec < 0) {
 #ifdef DIAGNOSTIC
 		if (usec > 0) {
 			sec++;
 			usec -= 1000000;
 		}
 		printf("tvotohz: negative time difference %ld sec %ld usec\n",
 		       sec, usec);
 #endif
 		ticks = 1;
 	} else if (sec <= LONG_MAX / 1000000)
 		ticks = howmany(sec * 1000000 + (unsigned long)usec, tick) + 1;
 	else if (sec <= LONG_MAX / hz)
 		ticks = sec * hz
 			+ howmany((unsigned long)usec, tick) + 1;
 	else
 		ticks = LONG_MAX;
 	if (ticks > INT_MAX)
 		ticks = INT_MAX;
 	return ((int)ticks);
 }
 
 /*
  * Start profiling on a process.
  *
  * Kernel profiling passes proc0 which never exits and hence
  * keeps the profile clock running constantly.
  */
 void
 startprofclock(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* Nothing to do if a stop is pending or profiling is already on. */
 	if ((p->p_flag & (P_STOPPROF | P_PROFIL)) != 0)
 		return;
 
 	p->p_flag |= P_PROFIL;
 	/* The first profiled process starts the profile clock. */
 	mtx_lock(&time_lock);
 	profprocs++;
 	if (profprocs == 1)
 		cpu_startprofclock();
 	mtx_unlock(&time_lock);
 }
 
 /*
  * Stop profiling on a process.
  */
 void
 stopprofclock(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_PROFIL) == 0)
 		return;
 
 	/* Flag the stop request and wait until p_profthreads is zero. */
 	while (p->p_profthreads != 0) {
 		p->p_flag |= P_STOPPROF;
 		msleep(&p->p_profthreads, &p->p_mtx, PPAUSE,
 		    "stopprof", 0);
 	}
 
 	/*
 	 * P_PROFIL is tested again after the wait.
 	 * NOTE(review): presumably it can be cleared by someone else while
 	 * msleep() has the process lock released -- confirm.
 	 */
 	if ((p->p_flag & P_PROFIL) == 0)
 		return;
 	p->p_flag &= ~P_PROFIL;
 
 	/* The last profiled process stops the profile clock. */
 	mtx_lock(&time_lock);
 	profprocs--;
 	if (profprocs == 0)
 		cpu_stopprofclock();
 	mtx_unlock(&time_lock);
 }
 
 /*
  * Statistics clock.  Updates rusage information and calls the scheduler
  * to adjust priorities of the active thread.
  *
  * This should be called by all active processors.
  */
 void
 statclock(int usermode)
 {
 
 	/* A single statistics tick; statclock_cnt() does the accounting. */
 	statclock_cnt(1, usermode);
 }
 
 /*
  * Account cnt statistics ticks to the current thread and CPU: bump the
  * matching per-thread tick counter and per-CPU cp_time[] bucket, update
  * the thread's rusage memory integrals and maximum RSS, and call
  * sched_clock() once per tick with the thread lock held.
  */
 void
 statclock_cnt(int cnt, int usermode)
 {
 	struct rusage *ru;
 	struct vmspace *vm;
 	struct thread *td;
 	struct proc *p;
 	long rss;
 	long *cp_time;
 
 	td = curthread;
 	p = td->td_proc;
 
 	cp_time = (long *)PCPU_PTR(cp_time);
 	if (usermode) {
 		/*
 		 * Charge the time as appropriate.
 		 */
 		td->td_uticks += cnt;
 		if (p->p_nice > NZERO)
 			cp_time[CP_NICE] += cnt;
 		else
 			cp_time[CP_USER] += cnt;
 	} else {
 		/*
 		 * Came from kernel mode, so we were:
 		 * - handling an interrupt,
 		 * - doing syscall or trap work on behalf of the current
 		 *   user process, or
 		 * - spinning in the idle loop.
 		 * Whichever it is, charge the time as appropriate.
 		 * Note that we charge interrupts to the current process,
 		 * regardless of whether they are ``for'' that process,
 		 * so that we know how much of its real time was spent
 		 * in ``non-process'' (i.e., interrupt) work.
 		 */
 		if ((td->td_pflags & TDP_ITHREAD) ||
 		    td->td_intr_nesting_level >= 2) {
 			td->td_iticks += cnt;
 			cp_time[CP_INTR] += cnt;
 		} else {
 			td->td_pticks += cnt;
 			td->td_sticks += cnt;
 			if (!TD_IS_IDLETHREAD(td))
 				cp_time[CP_SYS] += cnt;
 			else
 				cp_time[CP_IDLE] += cnt;
 		}
 	}
 
 	/* Update resource usage integrals and maximums. */
 	MPASS(p->p_vmspace != NULL);
 	vm = p->p_vmspace;
 	ru = &td->td_ru;
 	ru->ru_ixrss += pgtok(vm->vm_tsize) * cnt;
 	ru->ru_idrss += pgtok(vm->vm_dsize) * cnt;
 	ru->ru_isrss += pgtok(vm->vm_ssize) * cnt;
 	rss = pgtok(vmspace_resident_count(vm));
 	if (ru->ru_maxrss < rss)
 		ru->ru_maxrss = rss;
 	KTR_POINT2(KTR_SCHED, "thread", sched_tdname(td), "statclock",
 	    "prio:%d", td->td_priority, "stathz:%d", (stathz)?stathz:hz);
 	SDT_PROBE2(sched, , , tick, td, td->td_proc);
 	/* One sched_clock() call per tick, all under a single thread lock. */
 	thread_lock_flags(td, MTX_QUIET);
 	for ( ; cnt > 0; cnt--)
 		sched_clock(td);
 	thread_unlock(td);
 #ifdef HWPMC_HOOKS
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, stat, td->td_intr_frame);
 #endif
 }
 
 /*
  * Profiling clock for a single tick; see profclock_cnt().
  */
 void
 profclock(int usermode, uintfptr_t pc)
 {
 
 	profclock_cnt(1, usermode, pc);
 }
 
 /*
  * Record cnt profiling ticks at program counter pc.  Ticks taken in user
  * mode are passed to addupc_intr() when the current process has P_PROFIL
  * set.  With GPROF compiled in, ticks taken in kernel mode are added to the
  * kernel profile bucket for pc while kernel profiling is on.
  */
 void
 profclock_cnt(int cnt, int usermode, uintfptr_t pc)
 {
 	struct thread *td;
 #ifdef GPROF
 	struct gmonparam *g;
 	uintfptr_t i;
 #endif
 
 	td = curthread;
 	if (usermode) {
 		/*
 		 * Came from user mode; CPU was in user state.
 		 * If this process is being profiled, record the tick.
 		 * if there is no related user location yet, don't
 		 * bother trying to count it.
 		 */
 		if (td->td_proc->p_flag & P_PROFIL)
 			addupc_intr(td, pc, cnt);
 	}
 #ifdef GPROF
 	else {
 		/*
 		 * Kernel statistics are just like addupc_intr, only easier.
 		 */
 		g = &_gmonparam;
 		/* Only count pcs that map to an index below g->textsize. */
 		if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
 			i = PC_TO_I(g, pc);
 			if (i < g->textsize) {
 				KCOUNT(g, i) += cnt;
 			}
 		}
 	}
 #endif
 #ifdef HWPMC_HOOKS
 	if (td->td_intr_frame != NULL)
 		PMC_SOFT_CALL_TF( , , clock, prof, td->td_intr_frame);
 #endif
 }
 
 /*
  * Return information about system clocks.
  */
 static int
 sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS)
 {
 	struct clockinfo clkinfo;
 	/*
 	 * Construct clockinfo structure.
 	 */
 	bzero(&clkinfo, sizeof(clkinfo));
 	clkinfo.hz = hz;
 	clkinfo.tick = tick;
 	clkinfo.profhz = profhz;
 	clkinfo.stathz = stathz ? stathz : hz;
 	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
 }
 
 SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate,
 	CTLTYPE_STRUCT|CTLFLAG_RD|CTLFLAG_MPSAFE,
 	0, 0, sysctl_kern_clockrate, "S,clockinfo",
 	"Rate and period of various kernel clocks");
 
 /*
  * watchdog_list event handler.  The interval bits of cmd select the
  * timeout: values of at least WD_TO_1SEC arm the software watchdog for
  * 2^(interval - WD_TO_1SEC) * hz ticks and clear *error; anything smaller
  * disables it and leaves *error untouched.
  */
 static void
 watchdog_config(void *unused __unused, u_int cmd, int *error)
 {
 	u_int interval;
 
 	interval = cmd & WD_INTERVAL;
 	if (interval < WD_TO_1SEC) {
 		watchdog_enabled = 0;
 		return;
 	}
 
 	watchdog_ticks = (1 << (interval - WD_TO_1SEC)) * hz;
 	watchdog_enabled = 1;
 	*error = 0;
 }
 
 /*
  * Handle a watchdog timeout by dumping interrupt information and
  * then either dropping to DDB or panicking.
  */
 static void
 watchdog_fire(void)
 {
 	uint64_t total;
 	u_long *count;
 	char *name;
 	int idx, nintr;
 
 	nintr = sintrcnt / sizeof(u_long);
 	name = intrnames;
 	total = 0;
 
 	/* Print every non-zero interrupt counter, then the grand total. */
 	printf("interrupt                   total\n");
 	for (idx = 0, count = intrcnt; idx < nintr; idx++, count++) {
 		if (*count != 0)
 			printf("%-12s %20lu\n", name, *count);
 		/* Step over this name and its terminating NUL. */
 		name += strlen(name) + 1;
 		total += *count;
 	}
 	printf("Total        %20ju\n", (uintmax_t)total);
 
 #if defined(KDB) && !defined(KDB_UNATTENDED)
 	kdb_backtrace();
 	kdb_enter(KDB_WHY_WATCHDOG, "watchdog timeout");
 #else
 	panic("watchdog timeout");
 #endif
 }
Index: head/sys/kern/subr_epoch.c
===================================================================
--- head/sys/kern/subr_epoch.c	(revision 333760)
+++ head/sys/kern/subr_epoch.c	(revision 333761)
@@ -1,576 +1,593 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/epoch.h>
 #include <sys/gtaskqueue.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 
 #include <ck_epoch.h>
 
 static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation");
 
 /* arbitrary --- needs benchmarking */
 #define MAX_ADAPTIVE_SPIN 1000
+#define MAX_EPOCHS 64
 
-#define EPOCH_EXITING 0x1
 #ifdef __amd64__
 #define EPOCH_ALIGN CACHE_LINE_SIZE*2
 #else
 #define EPOCH_ALIGN CACHE_LINE_SIZE
 #endif
 
 CTASSERT(sizeof(epoch_section_t) == sizeof(ck_epoch_section_t));
 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
 SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information");
 SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats");
 
-static int poll_intvl;
-SYSCTL_INT(_kern_epoch, OID_AUTO, poll_intvl, CTLFLAG_RWTUN,
-		   &poll_intvl, 0, "# of ticks to wait between garbage collecting deferred frees");
+
 /* Stats. */
 static counter_u64_t block_count;
 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
 				   &block_count, "# of times a thread was in an epoch when epoch_wait was called");
 static counter_u64_t migrate_count;
 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
 				   &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
 static counter_u64_t turnstile_count;
 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
 				   &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
 static counter_u64_t switch_count;
 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
 				   &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
 
 TAILQ_HEAD(threadlist, thread);
 
 CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
     ck_epoch_entry_container)
 
 /*
  * One record per CPU per epoch.  The init routines register er_record with
  * the epoch's ck_epoch state, initialize er_tdlist as an empty TAILQ and
  * store the CPU index in er_cpuid.
  */
 typedef struct epoch_record {
 	/* Record handed to ck_epoch_register(). */
 	ck_epoch_record_t er_record;
 	/* NOTE(review): presumably the threads inside the epoch -- confirm. */
 	volatile struct threadlist er_tdlist;
 	volatile uint32_t er_gen;
 	/* Index of the CPU this record belongs to. */
 	uint32_t er_cpuid;
 } *epoch_record_t;
 
 /*
  * Per-CPU epoch state, aligned to EPOCH_ALIGN.  Arrays of these are
  * allocated by epoch_init_numa() and epoch_init_legacy().
  */
 struct epoch_pcpu_state {
 	struct epoch_record eps_record;
 } __aligned(EPOCH_ALIGN);
 
 /*
  * An epoch: the ck_epoch state plus pointers to its per-CPU records.
  * e_pcpu[] is a trailing array that epoch_alloc() sizes to mp_ncpus
  * pointers.
  */
 struct epoch {
 	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
-	struct grouptask e_gtask;
-	struct callout e_timer;
-	struct mtx e_lock;
-	int e_flags;
-	/* make sure that immutable data doesn't overlap with the gtask, callout, and mutex*/
 	/* Base of each per-domain allocation, kept so epoch_free() can free it. */
 	struct epoch_pcpu_state *e_pcpu_dom[MAXMEMDOM] __aligned(EPOCH_ALIGN);
 	counter_u64_t e_frees;
 	uint64_t e_free_last;
+	int e_idx;
 	/* Per-CPU state, indexed by CPU id. */
 	struct epoch_pcpu_state *e_pcpu[0];
 };
 
+epoch_t allepochs[MAX_EPOCHS];
+
+static DPCPU_DEFINE(struct grouptask, cb_task);
+static DPCPU_DEFINE(int, cb_count);
+
 static __read_mostly int domcount[MAXMEMDOM];
 static __read_mostly int domoffsets[MAXMEMDOM];
 static __read_mostly int inited;
+static __read_mostly int epoch_count;
 __read_mostly epoch_t global_epoch;
+static __read_mostly epoch_t private_epoch;
 
-static void epoch_call_task(void *context);
+static void epoch_call_task(void *context __unused);
 
 #if defined(__powerpc64__) || defined(__powerpc__) || !defined(NUMA)
 static bool usedomains = false;
 #else
 static bool usedomains = true;
 #endif
 /*
  * SYSINIT hook for the epoch subsystem.  Allocates the statistics counters,
  * works out the per-domain CPU counts and offsets when domains are in use
  * (falling back to the non-domain layout if any domain has no CPUs), sets
  * up one "epoch call task" grouptask per CPU, and finally allocates the
  * global and private epochs.
  */
 static void
 epoch_init(void *arg __unused)
 {
-	int domain, count;
+	int domain, count, cpu;
 
-	if (poll_intvl == 0)
-		poll_intvl = hz;
-
 	block_count = counter_u64_alloc(M_WAITOK);
 	migrate_count = counter_u64_alloc(M_WAITOK);
 	turnstile_count = counter_u64_alloc(M_WAITOK);
 	switch_count = counter_u64_alloc(M_WAITOK);
 	if (usedomains == false)
 		goto done;
 	count = domain = 0;
 	domoffsets[0] = 0;
 	/* Number of CPUs in each memory domain. */
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		domcount[domain] = CPU_COUNT(&cpuset_domain[domain]);
 		if (bootverbose)
 			printf("domcount[%d] %d\n", domain, domcount[domain]);
 	}
 	/* domoffsets[] is the running sum of the preceding domain counts. */
 	for (domain = 1; domain < vm_ndomains; domain++)
 		domoffsets[domain] = domoffsets[domain-1] + domcount[domain-1];
 
 	/* A domain without CPUs disables the per-domain layout. */
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		if (domcount[domain] == 0) {
 			usedomains = false;
 			break;
 		}
 	}
  done:
+	CPU_FOREACH(cpu) {
+		GROUPTASK_INIT(DPCPU_ID_PTR(cpu, cb_task), 0, epoch_call_task, NULL);
+		taskqgroup_attach_cpu(qgroup_softirq, DPCPU_ID_PTR(cpu, cb_task), NULL, cpu, -1, "epoch call task");
+	}
 	/* epoch_alloc() panics unless inited is set, so set it first. */
 	inited = 1;
 	global_epoch = epoch_alloc();
+	private_epoch = epoch_alloc();
 }
 SYSINIT(epoch, SI_SUB_TASKQ + 1, SI_ORDER_FIRST, epoch_init, NULL);
 
 /*
  * Per-domain layout: allocate one array of per-CPU state for every memory
  * domain with malloc_domain(), and point e_pcpu[] at its elements starting
  * from the domain's offset in domoffsets[].
  */
 static void
 epoch_init_numa(epoch_t epoch)
 {
 	struct epoch_pcpu_state *dom_states, *state;
 	epoch_record_t rec;
 	int cpu, domain, i;
 
 	for (domain = 0; domain < vm_ndomains; domain++) {
 		dom_states = malloc_domain(
 		    sizeof(*dom_states) * domcount[domain], M_EPOCH,
 		    domain, M_ZERO | M_WAITOK);
 		epoch->e_pcpu_dom[domain] = dom_states;
 		for (i = 0; i < domcount[domain]; i++) {
 			cpu = domoffsets[domain] + i;
 			state = &dom_states[i];
 			epoch->e_pcpu[cpu] = state;
 			rec = &state->eps_record;
 			ck_epoch_register(&epoch->e_epoch, &rec->er_record,
 			    NULL);
 			TAILQ_INIT(
 			    (struct threadlist *)(uintptr_t)&rec->er_tdlist);
 			rec->er_cpuid = cpu;
 		}
 	}
 }
 
 /*
  * Non-domain layout: a single malloc() array with one per-CPU state entry
  * for each of the mp_ncpus CPUs, recorded as e_pcpu_dom[0].
  */
 static void
 epoch_init_legacy(epoch_t epoch)
 {
 	struct epoch_pcpu_state *states, *state;
 	epoch_record_t rec;
 	int cpu;
 
 	states = malloc(sizeof(*states) * mp_ncpus, M_EPOCH,
 	    M_ZERO | M_WAITOK);
 	epoch->e_pcpu_dom[0] = states;
 	for (cpu = 0; cpu < mp_ncpus; cpu++) {
 		state = &states[cpu];
 		epoch->e_pcpu[cpu] = state;
 		rec = &state->eps_record;
 		ck_epoch_register(&epoch->e_epoch, &rec->er_record, NULL);
 		TAILQ_INIT((struct threadlist *)(uintptr_t)&rec->er_tdlist);
 		rec->er_cpuid = cpu;
 	}
 }
 
-static void
-epoch_callout(void *arg)
-{
-	epoch_t epoch;
-	uint64_t frees;
-
-	epoch = arg;
-	frees = counter_u64_fetch(epoch->e_frees);
-	/* pick some better value */
-	if (frees - epoch->e_free_last > 10) {
-		GROUPTASK_ENQUEUE(&epoch->e_gtask);
-		epoch->e_free_last = frees;
-	}
-	if ((epoch->e_flags & EPOCH_EXITING) == 0)
-		callout_reset(&epoch->e_timer, poll_intvl, epoch_callout, epoch);
-}
-
 /*
  * Allocate and initialize a new epoch, including its per-CPU records, and
  * enter it into the allepochs[] table at the next free index.  Panics if
  * called before epoch_init() has set inited.
  * NOTE(review): the table bound is only checked by MPASS(), and no visible
  * code hands a slot back once epoch_count has moved past it -- confirm
  * that MAX_EPOCHS cannot be exceeded in kernels built without the
  * assertion.
  */
 epoch_t
 epoch_alloc(void)
 {
 	epoch_t epoch;
 
 	if (__predict_false(!inited))
 		panic("%s called too early in boot", __func__);
 	/* The structure is followed by mp_ncpus e_pcpu[] pointers. */
 	epoch = malloc(sizeof(struct epoch) + mp_ncpus*sizeof(void*),
 				   M_EPOCH, M_ZERO|M_WAITOK);
 	ck_epoch_init(&epoch->e_epoch);
 	epoch->e_frees = counter_u64_alloc(M_WAITOK);
-	mtx_init(&epoch->e_lock, "epoch callout", NULL, MTX_DEF);
-	callout_init_mtx(&epoch->e_timer, &epoch->e_lock, 0);
-	taskqgroup_config_gtask_init(epoch, &epoch->e_gtask, epoch_call_task, "epoch call task");
 	if (usedomains)
 		epoch_init_numa(epoch);
 	else
 		epoch_init_legacy(epoch);
-	callout_reset(&epoch->e_timer, poll_intvl, epoch_callout, epoch);
+	MPASS(epoch_count < MAX_EPOCHS-2);
+	epoch->e_idx = epoch_count;
+	allepochs[epoch_count++] = epoch;
 	return (epoch);
 }
 
 void
 epoch_free(epoch_t epoch)
 {
 	int domain;
 #ifdef INVARIANTS
 	struct epoch_pcpu_state *eps;
 	int cpu;
 
 	CPU_FOREACH(cpu) {
 		eps = epoch->e_pcpu[cpu];
 		MPASS(TAILQ_EMPTY(&eps->eps_record.er_tdlist));
 	}
 #endif
-	mtx_lock(&epoch->e_lock);
-	epoch->e_flags |= EPOCH_EXITING;
-	mtx_unlock(&epoch->e_lock);
 	/*
-	 * Execute any lingering callbacks
+	 * Unpublish the epoch and wait out any epoch_call_task() using it.
 	 */
-	GROUPTASK_ENQUEUE(&epoch->e_gtask);
-	gtaskqueue_drain(epoch->e_gtask.gt_taskqueue, &epoch->e_gtask.gt_task);
-	callout_drain(&epoch->e_timer);
-	mtx_destroy(&epoch->e_lock);
+	allepochs[epoch->e_idx] = NULL;
+	epoch_wait(private_epoch);
 	counter_u64_free(epoch->e_frees);
-	taskqgroup_config_gtask_deinit(&epoch->e_gtask);
 	if (usedomains)
 		for (domain = 0; domain < vm_ndomains; domain++)
 			free_domain(epoch->e_pcpu_dom[domain], M_EPOCH);
 	else
 		free(epoch->e_pcpu_dom[0], M_EPOCH);
 	free(epoch, M_EPOCH);
 }
 
 #define INIT_CHECK(epoch)								\
 	do {											\
 		if (__predict_false((epoch) == NULL))		\
 			return;									\
 	} while (0)
 
 void
 epoch_enter_internal(epoch_t epoch, struct thread *td)
 {
 	struct epoch_pcpu_state *eps;
 
 	INIT_CHECK(epoch);
 	critical_enter();
 	td->td_pre_epoch_prio = td->td_priority;
 	eps = epoch->e_pcpu[curcpu];
 #ifdef INVARIANTS
 	MPASS(td->td_epochnest < UCHAR_MAX - 2);
 	if (td->td_epochnest > 1) {
 		struct thread *curtd;
 		int found = 0;
 
 		TAILQ_FOREACH(curtd, &eps->eps_record.er_tdlist, td_epochq)
 			if (curtd == td)
 				found = 1;
 		KASSERT(found, ("recursing on a second epoch"));
 		critical_exit();
 		return;
 	}
 #endif
 	TAILQ_INSERT_TAIL(&eps->eps_record.er_tdlist, td, td_epochq);
 	sched_pin();
 	ck_epoch_begin(&eps->eps_record.er_record, (ck_epoch_section_t*)&td->td_epoch_section);
 	critical_exit();
 }
 
+/* Begin a section of private_epoch on the CPU we are running on. */
+static void
+epoch_enter_private(ck_epoch_section_t *section)
+{
+
+	/* Callers must already hold a critical section (td_critnest != 0). */
+	MPASS(curthread->td_critnest);
+	ck_epoch_begin(&private_epoch->e_pcpu[curcpu]->eps_record.er_record,
+	    section);
+}
+
 void
 epoch_exit_internal(epoch_t epoch, struct thread *td)
 {
 	struct epoch_pcpu_state *eps;
 
-	td = curthread;
 	MPASS(td->td_epochnest == 0);
 	INIT_CHECK(epoch);
 	critical_enter();
 	eps = epoch->e_pcpu[curcpu];
 
 	ck_epoch_end(&eps->eps_record.er_record, (ck_epoch_section_t*)&td->td_epoch_section);
 	TAILQ_REMOVE(&eps->eps_record.er_tdlist, td, td_epochq);
 	eps->eps_record.er_gen++;
 	sched_unpin();
 	if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) {
 		thread_lock(td);
 		sched_prio(td, td->td_pre_epoch_prio);
 		thread_unlock(td);
 	}
 	critical_exit();
 }
 
+/* End the private_epoch section begun by epoch_enter_private(). */
+static void
+epoch_exit_private(ck_epoch_section_t *section)
+{
+
+	MPASS(curthread->td_critnest);
+	ck_epoch_end(&private_epoch->e_pcpu[curcpu]->eps_record.er_record,
+	    section);
+}
+
 /*
  * epoch_block_handler is a callback from the ck code when another thread is
  * currently in an epoch section.
  */
 static void
 epoch_block_handler(struct ck_epoch *global __unused, ck_epoch_record_t *cr,
 					void *arg __unused)
 {
 	epoch_record_t record;
 	struct epoch_pcpu_state *eps;
 	struct thread *td, *tdwait, *owner;
 	struct turnstile *ts;
 	struct lock_object *lock;
 	int spincount, gen;
 
 	eps = arg;
 	record = __containerof(cr, struct epoch_record, er_record);
 	td = curthread;
 	spincount = 0;
 	counter_u64_add(block_count, 1);
 	if (record->er_cpuid != curcpu) {
 		/*
 		 * If the head of the list is running, we can wait for it
 		 * to remove itself from the list and thus save us the
 		 * overhead of a migration
 		 */
 		if ((tdwait = TAILQ_FIRST(&record->er_tdlist)) != NULL &&
 			TD_IS_RUNNING(tdwait)) {
 			gen = record->er_gen;
 			thread_unlock(td);
 			do {
 				cpu_spinwait();
 			} while (tdwait == TAILQ_FIRST(&record->er_tdlist) &&
 					 gen == record->er_gen && TD_IS_RUNNING(tdwait) &&
 					 spincount++ < MAX_ADAPTIVE_SPIN);
 			thread_lock(td);
 			return;
 		}
 
 		/*
 		 * Being on the same CPU as that of the record on which
 		 * we need to wait allows us access to the thread
 		 * list associated with that CPU. We can then examine the
 		 * oldest thread in the queue and wait on its turnstile
 		 * until it resumes and so on until a grace period
 		 * elapses.
 		 *
 		 */
 		counter_u64_add(migrate_count, 1);
 		sched_bind(td, record->er_cpuid);
 		/*
 		 * At this point we need to return to the ck code
 		 * to scan to see if a grace period has elapsed.
 		 * We can't move on to check the thread list, because
 		 * in the meantime new threads may have arrived that
 		 * in fact belong to a different epoch.
 		 */
 		return;
 	}
 	/*
 	 * Try to find a thread in an epoch section on this CPU 
 	 * waiting on a turnstile. Otherwise find the lowest
 	 * priority thread (highest prio value) and drop our priority
 	 * to match to allow it to run.
 	 */
 	TAILQ_FOREACH(tdwait, &record->er_tdlist, td_epochq) {
 		/*
 		 * Propagate our priority to any other waiters to prevent us
 		 * from starving them. They will have their original priority
-		 * restore on exit from epoch_wait().
+		 * restored on exit from epoch_wait().
 		 */
 		if (!TD_IS_INHIBITED(tdwait) && tdwait->td_priority > td->td_priority) {
 			critical_enter();
 			thread_unlock(td);
 			thread_lock(tdwait);
 			sched_prio(tdwait, td->td_priority);
 			thread_unlock(tdwait);
 			thread_lock(td);
 			critical_exit();
 		}
 		if (TD_IS_INHIBITED(tdwait) && TD_ON_LOCK(tdwait) &&
 			((ts = tdwait->td_blocked) != NULL)) {
 			/*
 			 * We unlock td to allow turnstile_wait to reacquire the
-			 * the thread lock. Before unlocking it we enter a critical
+			 * thread lock. Before unlocking it we enter a critical
 			 * section to prevent preemption after we reenable interrupts
 			 * by dropping the thread lock in order to prevent tdwait
 			 * from getting to run.
 			 */
 			critical_enter();
 			thread_unlock(td);
 			owner = turnstile_lock(ts, &lock);
 			/*
 			 * The owner pointer indicates that the lock succeeded. Only
 			 * in case we hold the lock and the turnstile we locked is still
 			 * the one that tdwait is blocked on can we continue. Otherwise
 			 * The turnstile pointer has been changed out from underneath
 			 * us, as in the case where the lock holder has signalled tdwait,
 			 * and we need to continue.
 			 */
 			if (owner != NULL && ts == tdwait->td_blocked) {
 				MPASS(TD_IS_INHIBITED(tdwait) && TD_ON_LOCK(tdwait));
 				critical_exit();
 				turnstile_wait(ts, owner, tdwait->td_tsqueue);
 				counter_u64_add(turnstile_count, 1);
 				thread_lock(td);
 				return;
 			} else if (owner != NULL)
 				turnstile_unlock(ts, lock);
 			thread_lock(td);
 			critical_exit();
 			KASSERT(td->td_locks == 0,
 					("%d locks held", td->td_locks));
 		}
 	}
 	/*
 	 * We didn't find any threads actually blocked on a lock
 	 * so we have nothing to do except context switch away.
 	 */
 	counter_u64_add(switch_count, 1);
 	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
 
 	/*
 	 * Release the thread lock while yielding to
 	 * allow other threads to acquire the lock
 	 * pointed to by TDQ_LOCKPTR(td). Else a
 	 * deadlock like situation might happen. (HPS)
 	 */
 	thread_unlock(td);
 	thread_lock(td);
 }
 
 void
 epoch_wait(epoch_t epoch)
 {
 	struct thread *td;
 	int was_bound;
 	int old_cpu;
 	int old_pinned;
 	u_char old_prio;
 #ifdef INVARIANTS
 	int locks;
 
 	locks = curthread->td_locks;
 #endif
 	INIT_CHECK(epoch);
 
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
 	    "epoch_wait() can sleep");
 
 	td = curthread;
 	KASSERT(td->td_epochnest == 0, ("epoch_wait() in the middle of an epoch section"));
 	thread_lock(td);
 
 	DROP_GIANT();
 
 	old_cpu = PCPU_GET(cpuid);
 	old_pinned = td->td_pinned;
 	old_prio = td->td_priority;
 	was_bound = sched_is_bound(td);
 	sched_unbind(td);
 	td->td_pinned = 0;
 	sched_bind(td, old_cpu);
 
 	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
 
 	/* restore CPU binding, if any */
 	if (was_bound != 0) {
 		sched_bind(td, old_cpu);
 	} else {
 		/* get thread back to initial CPU, if any */
 		if (old_pinned != 0)
 			sched_bind(td, old_cpu);
 		sched_unbind(td);
 	}
 	/* restore pinned after bind */
 	td->td_pinned = old_pinned;
 
 	/* restore thread priority */
 	sched_prio(td, old_prio);
 	thread_unlock(td);
 	PICKUP_GIANT();
 	KASSERT(td->td_locks == locks,
 			("%d residual locks held", td->td_locks - locks));
 }
 
 void
 epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t))
 {
 	struct epoch_pcpu_state *eps;
 	ck_epoch_entry_t *cb;
 
 	cb = (void *)ctx;
 
 	MPASS(callback);
 	/* too early in boot to have epoch set up */
 	if (__predict_false(epoch == NULL))
 		goto boottime;
 
 	counter_u64_add(epoch->e_frees, 1);
 
 	critical_enter();
+	*DPCPU_PTR(cb_count) += 1;
 	eps = epoch->e_pcpu[curcpu];
 	ck_epoch_call(&eps->eps_record.er_record, cb, (ck_epoch_cb_t*)callback);
 	critical_exit();
 	return;
  boottime:
 	callback(ctx);
 }
 
+/* Collect curcpu's deferred callbacks from every live epoch and run them. */
 static void
-epoch_call_task(void *context)
+epoch_call_task(void *arg __unused)
 {
-	struct epoch_pcpu_state *eps;
+	ck_stack_entry_t *cursor, *head, *next;
+	ck_epoch_record_t *record;
+	ck_epoch_section_t section;
 	epoch_t epoch;
-	struct thread *td;
-	ck_stack_entry_t *cursor;
-	ck_stack_t deferred;
-	int cpu;
+	ck_stack_t cb_stack;
+	int i, npending, total;
 
-	epoch = context;
-	td = curthread;
-	ck_stack_init(&deferred);
-	thread_lock(td);
-	CPU_FOREACH(cpu) {
-		sched_bind(td, cpu);
-		eps = epoch->e_pcpu[cpu];
-		ck_epoch_poll_deferred(&eps->eps_record.er_record, &deferred);
+	ck_stack_init(&cb_stack);
+	critical_enter();
+	epoch_enter_private(&section);
+	for (total = i = 0; i < epoch_count; i++) {
+		if (__predict_false((epoch = allepochs[i]) == NULL))
+			continue;
+		record = &epoch->e_pcpu[curcpu]->eps_record.er_record;
+		if ((npending = record->n_pending) == 0)
+			continue;
+		ck_epoch_poll_deferred(record, &cb_stack);
+		total += npending - record->n_pending;
 	}
-	sched_unbind(td);
-	thread_unlock(td);
-	while((cursor = ck_stack_pop_npsc(&deferred)) != NULL) {
+	epoch_exit_private(&section);
+	*DPCPU_PTR(cb_count) -= total;
+	critical_exit();
+
+	head = ck_stack_batch_pop_npsc(&cb_stack);
+	for (cursor = head; cursor != NULL; cursor = next) {
 		struct ck_epoch_entry *entry =
 		    ck_epoch_entry_container(cursor);
+		next = CK_STACK_NEXT(cursor);
 		entry->function(entry);
 	}
+}
+
+void
+epoch_pcpu_poll(void)
+{
+	/* Kick this CPU's callback task only when cb_count is non-zero. */
+	if (DPCPU_GET(cb_count))
+		GROUPTASK_ENQUEUE(DPCPU_PTR(cb_task));
 }
 
 int
 in_epoch(void)
 {
 	return (curthread->td_epochnest != 0);
 }
Index: head/sys/sys/epoch.h
===================================================================
--- head/sys/sys/epoch.h	(revision 333760)
+++ head/sys/sys/epoch.h	(revision 333761)
@@ -1,79 +1,80 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_EPOCH_H_
 #define _SYS_EPOCH_H_
 #include <sys/lock.h>
 #include <sys/proc.h>
 
 struct epoch;
 typedef struct epoch *epoch_t;
 
 extern epoch_t global_epoch;
 
 struct epoch_context {
 	void *data[2];
 } __aligned(sizeof(void *));
 
 typedef struct epoch_context *epoch_context_t;
 
 epoch_t epoch_alloc(void);
 void epoch_free(epoch_t epoch);
 void epoch_enter_internal(epoch_t epoch, struct thread *td);
 void epoch_exit_internal(epoch_t epoch, struct thread *td);
 void epoch_wait(epoch_t epoch);
 void epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t));
+void epoch_pcpu_poll(void);
 int in_epoch(void);
 
 static __inline void
 epoch_enter(epoch_t epoch)
 {
 	struct thread *td;
 	int nesting;
 
 	td = curthread;
 	nesting = td->td_epochnest++;
 #ifndef INVARIANTS
 	if (nesting == 0)
 #endif
 		epoch_enter_internal(epoch, td);
 }
 
 static __inline void
 epoch_exit(epoch_t epoch)
 {
 	struct thread *td;
 
 	td = curthread;
 	MPASS(td->td_epochnest);
 	if (td->td_epochnest-- == 1)
 		epoch_exit_internal(epoch, td);
 }
 
 #endif