Index: head/sys/kern/kern_tc.c
===================================================================
--- head/sys/kern/kern_tc.c	(revision 315279)
+++ head/sys/kern/kern_tc.c	(revision 315280)
@@ -1,2175 +1,2193 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  *
  * Copyright (c) 2011, 2015, 2016 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Julien Ridoux at the University
  * of Melbourne under sponsorship from the FreeBSD Foundation.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ntp.h"
 #include "opt_ffclock.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/proc.h>
 #include <sys/sbuf.h>
+#include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 #include <sys/timeffc.h>
 #include <sys/timepps.h>
 #include <sys/timetc.h>
 #include <sys/timex.h>
 #include <sys/vdso.h>
 
 /*
  * A large step happens on boot.  This constant detects such steps.
  * It is relatively small so that ntp_update_second gets called enough
  * in the typical 'missed a couple of seconds' case, but doesn't loop
  * forever when the time step is large.
  */
 #define LARGE_STEP	200
 
 /*
  * Placeholder timecounter used until a real one has been registered, so
  * that the console and other early consumers can already use time
  * services.
  */
 
 /* Each call returns the next value of a private counter; tc is unused. */
 static u_int
 dummy_get_timecount(struct timecounter *tc)
 {
 	static u_int fake_count;
 
 	fake_count += 1;
 	return (fake_count);
 }
 
 /*
  * Timecounter instance backed by dummy_get_timecount().
  * NOTE(review): the initializer is positional; after the read routine the
  * values presumably map to a null PPS poll hook, counter mask ~0u,
  * frequency 1000000, name "dummy" and quality -1000000 -- confirm against
  * struct timecounter in <sys/timetc.h>.
  */
 static struct timecounter dummy_timecounter = {
 	dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000
 };
 
 /*
  * One snapshot of the clock state that the lockless readers in this file
  * consume:
  *   th_counter/th_offset_count  counter and the count tc_delta() subtracts;
  *   th_scale                    multiplier applied to the counter delta to
  *                               obtain a bintime fraction;
  *   th_offset                   uptime at the snapshot (see binuptime());
  *   th_bintime                  wall-clock time at the snapshot (bintime());
  *   th_microtime/th_nanotime    the values handed out by getmicrotime() and
  *                               getnanotime();
  *   th_boottime                 the value handed out by getboottimebin();
  *   th_generation               readers retry while this is 0 or changes;
  *   th_next                     next element of the timehands ring.
  */
 struct timehands {
 	/* These fields must be initialized by the driver. */
 	struct timecounter	*th_counter;
 	int64_t			th_adjustment;
 	uint64_t		th_scale;
 	u_int	 		th_offset_count;
 	struct bintime		th_offset;
 	struct bintime		th_bintime;
 	struct timeval		th_microtime;
 	struct timespec		th_nanotime;
 	struct bintime		th_boottime;
 	/* Fields not to be copied in tc_windup start with th_generation. */
 	u_int			th_generation;
 	struct timehands	*th_next;
 };
 
 /*
  * The two timehands form a ring (th0 -> th1 -> th0).  th0 is the initial
  * current element: it runs on the dummy timecounter, starts one second
  * into uptime and carries a non-zero generation so readers accept it.
  * The tentative definition of th0 lets th1 take its address first.
  */
 static struct timehands th0;
 static struct timehands th1 = {
 	.th_next = &th0
 };
 static struct timehands th0 = {
 	.th_counter = &dummy_timecounter,
 	.th_scale = (uint64_t)-1 / 1000000,
 	.th_offset = { .sec = 1 },
 	.th_generation = 1,
 	.th_next = &th1
 };
 
 static struct timehands *volatile timehands = &th0;
 struct timecounter *timecounter = &dummy_timecounter;
 static struct timecounter *timecounters = &dummy_timecounter;
 
 int tc_min_ticktock_freq = 1;
 
 volatile time_t time_second = 1;
 volatile time_t time_uptime = 1;
 
 static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
 
 SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
 static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
 
 static int timestepwarnings;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
     &timestepwarnings, 0, "Log time steps");
 
 struct bintime bt_timethreshold;
 struct bintime bt_tickthreshold;
 sbintime_t sbt_timethreshold;
 sbintime_t sbt_tickthreshold;
 struct bintime tc_tick_bt;
 sbintime_t tc_tick_sbt;
 int tc_precexp;
 int tc_timepercentage = TC_DEFAULTPERC;
 static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation,
     CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
     sysctl_kern_timecounter_adjprecision, "I",
     "Allowed time interval deviation in percents");
 
+/*
+ * Generation number of the real-time clock setting.  tc_setclock() bumps
+ * it in steps of 2 from this odd start, keeping it away from 0, the value
+ * td_rtcgen treats as special.
+ */
+volatile int rtc_generation = 1;
+
 static int tc_chosen;	/* Non-zero if a specific tc was chosen via sysctl. */
 
 static void tc_windup(struct bintime *new_boottimebin);
 static void cpu_tick_calibrate(int);
 
 void dtrace_getnanotime(struct timespec *tsp);
 
 /*
  * kern.boottime handler: export the boot time as a struct timeval.  Where
  * SCTL_MASK32 exists (and not on mips), requests flagged with it receive
  * the value as two ints instead.
  */
 static int
 sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
 {
 	struct timeval boot_tv;
 
 	getboottime(&boot_tv);
 
 #if !defined(__mips__) && defined(SCTL_MASK32)
 	int tv32[2];
 
 	if ((req->flags & SCTL_MASK32) != 0) {
 		tv32[0] = boot_tv.tv_sec;
 		tv32[1] = boot_tv.tv_usec;
 		return (SYSCTL_OUT(req, tv32, sizeof(tv32)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &boot_tv, sizeof(boot_tv)));
 }
 
 /* Sysctl handler: report the current count of the timecounter in arg1. */
 static int
 sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
 {
 	struct timecounter *counter;
 	u_int count;
 
 	counter = arg1;
 	count = counter->tc_get_timecount(counter);
 	return (sysctl_handle_int(oidp, &count, 0, req));
 }
 
 static int
 sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t freq;
 	struct timecounter *tc = arg1;
 
 	freq = tc->tc_frequency;
 	return (sysctl_handle_64(oidp, &freq, 0, req));
 }
 
 /*
  * Return the difference between the timehands' counter value now and what
  * was when we copied it to the timehands' offset_count.
  */
 static __inline u_int
 tc_delta(struct timehands *th)
 {
 	struct timecounter *tc;
 
 	tc = th->th_counter;
 	return ((tc->tc_get_timecount(tc) - th->th_offset_count) &
 	    tc->tc_counter_mask);
 }
 
 /*
  * Functions for reading the time.  We have to loop until we are sure that
  * the timehands that we operated on was not updated under our feet.  See
  * the comment in <sys/time.h> for a description of these 12 functions.
  */
 
 #ifdef FFCLOCK
 /*
  * Feedback-clock uptime: th_offset plus the counter delta scaled by
  * th_scale.  Lockless reader: the snapshot is retaken while the
  * generation is 0 or has changed by the time the reads complete.
  */
 void
 fbclock_binuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		bintime_addx(bt, th->th_scale * tc_delta(th));
 		/* Keep the data reads ahead of the generation re-check. */
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 void
 fbclock_nanouptime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	fbclock_binuptime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 fbclock_microuptime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	fbclock_binuptime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Feedback-clock wall time: th_bintime plus the scaled counter delta.
  * Lockless reader; retries while the generation is 0 or has changed.
  */
 void
 fbclock_bintime(struct bintime *bt)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		bintime_addx(bt, th->th_scale * tc_delta(th));
 		/* Keep the data reads ahead of the generation re-check. */
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 void
 fbclock_nanotime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	fbclock_bintime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 fbclock_microtime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	fbclock_bintime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Feedback-clock uptime as stored in the current timehands (th_offset),
  * without reading the counter.  Lockless reader; retries while the
  * generation is 0 or has changed.
  */
 void
 fbclock_getbinuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored feedback-clock uptime (th_offset) as a timespec, without reading
  * the counter.  Lockless reader; retries on a generation change.
  */
 void
 fbclock_getnanouptime(struct timespec *tsp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timespec(&th->th_offset, tsp);
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored feedback-clock uptime (th_offset) as a timeval, without reading
  * the counter.  Lockless reader; retries on a generation change.
  */
 void
 fbclock_getmicrouptime(struct timeval *tvp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timeval(&th->th_offset, tvp);
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored feedback-clock wall time (th_bintime), without reading the
  * counter.  Lockless reader; retries on a generation change.
  */
 void
 fbclock_getbintime(struct bintime *bt)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored feedback-clock wall time as a timespec (th_nanotime).  Lockless
  * reader; retries on a generation change.
  */
 void
 fbclock_getnanotime(struct timespec *tsp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tsp = th->th_nanotime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored feedback-clock wall time as a timeval (th_microtime).  Lockless
  * reader; retries on a generation change.
  */
 void
 fbclock_getmicrotime(struct timeval *tvp)
 {
 	struct timehands *th;
 	unsigned int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tvp = th->th_microtime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 #else /* !FFCLOCK */
 /*
  * Uptime: th_offset plus the counter delta scaled by th_scale.  Lockless
  * reader: the snapshot is retaken while the generation is 0 or has
  * changed by the time the reads complete.
  */
 void
 binuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		bintime_addx(bt, th->th_scale * tc_delta(th));
 		/* Keep the data reads ahead of the generation re-check. */
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 void
 nanouptime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	binuptime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 microuptime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	binuptime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Wall-clock time: th_bintime plus the scaled counter delta.  Lockless
  * reader; retries while the generation is 0 or has changed.
  */
 void
 bintime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		bintime_addx(bt, th->th_scale * tc_delta(th));
 		/* Keep the data reads ahead of the generation re-check. */
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 void
 nanotime(struct timespec *tsp)
 {
 	struct bintime bt;
 
 	bintime(&bt);
 	bintime2timespec(&bt, tsp);
 }
 
 void
 microtime(struct timeval *tvp)
 {
 	struct bintime bt;
 
 	bintime(&bt);
 	bintime2timeval(&bt, tvp);
 }
 
 /*
  * Uptime as stored in the current timehands (th_offset), without reading
  * the counter.  Lockless reader; retries on a generation change.
  */
 void
 getbinuptime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_offset;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored uptime (th_offset) as a timespec, without reading the counter.
  * Lockless reader; retries on a generation change.
  */
 void
 getnanouptime(struct timespec *tsp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timespec(&th->th_offset, tsp);
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored uptime (th_offset) as a timeval, without reading the counter.
  * Lockless reader; retries on a generation change.
  */
 void
 getmicrouptime(struct timeval *tvp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		bintime2timeval(&th->th_offset, tvp);
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored wall-clock time (th_bintime), without reading the counter.
  * Lockless reader; retries on a generation change.
  */
 void
 getbintime(struct bintime *bt)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*bt = th->th_bintime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored wall-clock time as a timespec (th_nanotime).  Lockless reader;
  * retries on a generation change.
  */
 void
 getnanotime(struct timespec *tsp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tsp = th->th_nanotime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * Stored wall-clock time as a timeval (th_microtime).  Lockless reader;
  * retries on a generation change.
  */
 void
 getmicrotime(struct timeval *tvp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tvp = th->th_microtime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 #endif /* FFCLOCK */
 
 void
 getboottime(struct timeval *boottime)
 {
 	struct bintime boottimebin;
 
 	getboottimebin(&boottimebin);
 	bintime2timeval(&boottimebin, boottime);
 }
 
 /*
  * Copy out th_boottime from the current timehands.  Lockless reader;
  * retries while the generation is 0 or has changed during the read.
  */
 void
 getboottimebin(struct bintime *boottimebin)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*boottimebin = th->th_boottime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 #ifdef FFCLOCK
 /*
  * Support for feed-forward synchronization algorithms. This is heavily inspired
  * by the timehands mechanism but kept independent from it. *_windup() functions
  * have some connection to avoid accessing the timecounter hardware more than
  * necessary.
  */
 
 /* Feed-forward clock estimates kept updated by the synchronization daemon. */
 struct ffclock_estimate ffclock_estimate;
 struct bintime ffclock_boottime;	/* Feed-forward boot time estimate. */
 uint32_t ffclock_status;		/* Feed-forward clock status. */
 int8_t ffclock_updated;			/* New estimates are available. */
 struct mtx ffclock_mtx;			/* Mutex on ffclock_estimate. */
 
 /*
  * Feed-forward counterpart of struct timehands; elements are chained into
  * a ring through 'next'.
  *   cest            clock estimate in effect for this element;
  *   tick_time       time at the tick, from the current estimate;
  *   tick_time_lerp  linearly interpolated (monotonic) time at the tick;
  *   tick_ffcount    feed-forward counter value at the tick;
  *   period_lerp     counter period used for the interpolated time;
  *   gen             generation, set to 0 while the element is rewritten and
  *                   never left at 0 afterwards, so readers can retry.
  */
 struct fftimehands {
 	struct ffclock_estimate	cest;
 	struct bintime		tick_time;
 	struct bintime		tick_time_lerp;
 	ffcounter		tick_ffcount;
 	uint64_t		period_lerp;
 	volatile uint8_t	gen;
 	struct fftimehands	*next;
 };
 
 #define	NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x))
 
 static struct fftimehands ffth[10];
 static struct fftimehands *volatile fftimehands = ffth;
 
 /*
  * Zero the fftimehands array, chain its elements into a ring, mark the
  * feed-forward clock as unsynchronised with no pending update, and set up
  * the mutex protecting ffclock_estimate.
  */
 static void
 ffclock_init(void)
 {
 	size_t count, i;
 
 	memset(ffth, 0, sizeof(ffth));
 
 	/* Element i points at i + 1; the last one wraps to the first. */
 	count = NUM_ELEMENTS(ffth);
 	for (i = 0; i < count; i++)
 		ffth[i].next = &ffth[(i + 1) % count];
 
 	ffclock_status = FFCLOCK_STA_UNSYNC;
 	ffclock_updated = 0;
 	mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF);
 }
 
 /*
  * Reset the feed-forward clock estimates. Called from inittodr() to get things
  * kick started and uses the timecounter nominal frequency as a first period
  * estimate. Note: this function may be called several time just after boot.
  * Note: this is the only function that sets the value of boot time for the
  * monotonic (i.e. uptime) version of the feed-forward clock.
  */
 void
 ffclock_reset_clock(struct timespec *ts)
 {
 	struct timecounter *tc;
 	struct ffclock_estimate cest;
 
 	tc = timehands->th_counter;
 	memset(&cest, 0, sizeof(struct ffclock_estimate));
 
 	timespec2bintime(ts, &ffclock_boottime);
 	timespec2bintime(ts, &(cest.update_time));
 	ffclock_read_counter(&cest.update_ffcount);
 	cest.leapsec_next = 0;
 	cest.period = ((1ULL << 63) / tc->tc_frequency) << 1;
 	cest.errb_abs = 0;
 	cest.errb_rate = 0;
 	cest.status = FFCLOCK_STA_UNSYNC;
 	cest.leapsec_total = 0;
 	cest.leapsec = 0;
 
 	mtx_lock(&ffclock_mtx);
 	bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate));
 	ffclock_updated = INT8_MAX;
 	mtx_unlock(&ffclock_mtx);
 
 	printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name,
 	    (unsigned long long)tc->tc_frequency, (long)ts->tv_sec,
 	    (unsigned long)ts->tv_nsec);
 }
 
 /*
  * Convert an interval expressed in raw counter units into a bintime, given
  * the counter period as a bintime fraction.
  * bintime_mul() only accepts an unsigned int multiplier while an ffcounter
  * may exceed that range, so the interval is consumed in chunks of at most
  * UINT_MAX ticks and the partial products are summed.
  */
 static void
 ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt)
 {
 	struct bintime part;
 	ffcounter chunk, chunk_max;
 
 	chunk_max = (1ULL << (8 * sizeof(unsigned int))) - 1;
 	bintime_clear(bt);
 	do {
 		chunk = (ffdelta > chunk_max) ? chunk_max : ffdelta;
 		part.sec = 0;
 		part.frac = period;
 		bintime_mul(&part, (unsigned int)chunk);
 		bintime_add(bt, &part);
 		ffdelta -= chunk;
 	} while (ffdelta > 0);
 }
 
 /*
  * Update the fftimehands.
  * Push the tick ffcount and time(s) forward based on current clock estimate.
  * The conversion from ffcounter to bintime relies on the difference clock
  * principle, whose accuracy relies on computing small time intervals. If a new
  * clock estimate has been passed by the synchronisation daemon, make it
  * current, and compute the linear interpolation for monotonic time if needed.
  *
  * 'delta' is the number of counter ticks to advance by.  The next ring
  * element is rewritten with its generation held at 0 and only then
  * published as the current fftimehands.
  */
 static void
 ffclock_windup(unsigned int delta)
 {
 	struct ffclock_estimate *cest;
 	struct fftimehands *ffth;
 	struct bintime bt, gap_lerp;
 	ffcounter ffdelta;
 	uint64_t frac;
 	unsigned int polling;
 	uint8_t forward_jump, ogen;
 
 	/*
 	 * Pick the next timehand, copy current ffclock estimates and move tick
 	 * times and counter forward.
 	 */
 	forward_jump = 0;
 	ffth = fftimehands->next;
 	ogen = ffth->gen;
 	/* Generation 0 makes concurrent readers retry. */
 	ffth->gen = 0;
 	cest = &ffth->cest;
 	bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate));
 	ffdelta = (ffcounter)delta;
 	ffth->period_lerp = fftimehands->period_lerp;
 
 	ffth->tick_time = fftimehands->tick_time;
 	ffclock_convert_delta(ffdelta, cest->period, &bt);
 	bintime_add(&ffth->tick_time, &bt);
 
 	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
 	ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt);
 	bintime_add(&ffth->tick_time_lerp, &bt);
 
 	ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta;
 
 	/*
 	 * Assess the status of the clock, if the last update is too old, it is
 	 * likely the synchronisation daemon is dead and the clock is free
 	 * running.
 	 */
 	if (ffclock_updated == 0) {
 		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
 		ffclock_convert_delta(ffdelta, cest->period, &bt);
 		if (bt.sec > 2 * FFCLOCK_SKM_SCALE)
 			ffclock_status |= FFCLOCK_STA_UNSYNC;
 	}
 
 	/*
 	 * If available, grab updated clock estimates and make them current.
 	 * Recompute time at this tick using the updated estimates. The clock
 	 * estimates passed by the feed-forward synchronisation daemon may
 	 * result in time conversion that is not monotonically increasing (just
 	 * after the update). time_lerp is a particular linear interpolation
 	 * over the synchronisation algo polling period that ensures
 	 * monotonicity for the clock ids requesting it.
 	 */
 	if (ffclock_updated > 0) {
 		bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate));
 		ffdelta = ffth->tick_ffcount - cest->update_ffcount;
 		ffth->tick_time = cest->update_time;
 		ffclock_convert_delta(ffdelta, cest->period, &bt);
 		bintime_add(&ffth->tick_time, &bt);
 
 		/* ffclock_reset sets ffclock_updated to INT8_MAX */
 		if (ffclock_updated == INT8_MAX)
 			ffth->tick_time_lerp = ffth->tick_time;
 
 		/* Does the new estimate place us ahead of the interpolation? */
 		if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >))
 			forward_jump = 1;
 		else
 			forward_jump = 0;
 
 		/* gap_lerp = |tick_time - tick_time_lerp| */
 		bintime_clear(&gap_lerp);
 		if (forward_jump) {
 			gap_lerp = ffth->tick_time;
 			bintime_sub(&gap_lerp, &ffth->tick_time_lerp);
 		} else {
 			gap_lerp = ffth->tick_time_lerp;
 			bintime_sub(&gap_lerp, &ffth->tick_time);
 		}
 
 		/*
 		 * The reset from the RTC clock may be far from accurate, and
 		 * reducing the gap between real time and interpolated time
 		 * could take a very long time if the interpolated clock insists
 		 * on strict monotonicity. The clock is reset under very strict
 		 * conditions (kernel time is known to be wrong and the
 		 * synchronization daemon has been restarted recently).
 		 * ffclock_boottime absorbs the jump to ensure boot time is
 		 * correct and uptime functions stay consistent.
 		 */
 		if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) &&
 		    ((cest->status & FFCLOCK_STA_UNSYNC) == 0) &&
 		    ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) {
 			if (forward_jump)
 				bintime_add(&ffclock_boottime, &gap_lerp);
 			else
 				bintime_sub(&ffclock_boottime, &gap_lerp);
 			ffth->tick_time_lerp = ffth->tick_time;
 			bintime_clear(&gap_lerp);
 		}
 
 		ffclock_status = cest->status;
 		ffth->period_lerp = cest->period;
 
 		/*
 		 * Compute corrected period used for the linear interpolation of
 		 * time. The rate of linear interpolation is capped to 5000PPM
 		 * (5ms/s).
 		 *
 		 * NOTE(review): both divisions by ffdelta below assume the new
 		 * and previous estimates have different update_ffcount values,
 		 * and 'ffdelta / gap_lerp.sec' assumes ffdelta >= gap_lerp.sec;
 		 * nothing visible in this function enforces either -- confirm.
 		 */
 		if (bintime_isset(&gap_lerp)) {
 			ffdelta = cest->update_ffcount;
 			ffdelta -= fftimehands->cest.update_ffcount;
 			ffclock_convert_delta(ffdelta, cest->period, &bt);
 			polling = bt.sec;
 			bt.sec = 0;
 			bt.frac = 5000000 * (uint64_t)18446744073LL;
 			bintime_mul(&bt, polling);
 			if (bintime_cmp(&gap_lerp, &bt, >))
 				gap_lerp = bt;
 
 			/* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */
 			frac = 0;
 			if (gap_lerp.sec > 0) {
 				frac -= 1;
 				frac /= ffdelta / gap_lerp.sec;
 			}
 			frac += gap_lerp.frac / ffdelta;
 
 			if (forward_jump)
 				ffth->period_lerp += frac;
 			else
 				ffth->period_lerp -= frac;
 		}
 
 		ffclock_updated = 0;
 	}
 	/* Advance the generation, skipping the reserved value 0, and publish. */
 	if (++ogen == 0)
 		ogen = 1;
 	ffth->gen = ogen;
 	fftimehands = ffth;
 }
 
 /*
  * Adjust the fftimehands when the timecounter is changed. Stating the obvious,
  * the old and new hardware counter cannot be read simultaneously. tc_windup()
  * does read the two counters 'back to back', but a few cycles are effectively
  * lost, and not accumulated in tick_ffcount. This is a fairly radical
  * operation for a feed-forward synchronization daemon, and it is the daemon's
  * job not to push irrelevant data to the kernel. Because there is no locking
  * here, simply force the pending or next update to be ignored, to give the
  * daemon a chance to realize the counter has changed.
  */
 static void
 ffclock_change_tc(struct timehands *th)
 {
 	struct fftimehands *ffth;
 	struct ffclock_estimate *cest;
 	struct timecounter *tc;
 	uint8_t ogen;
 
 	tc = th->th_counter;
 	ffth = fftimehands->next;
 	ogen = ffth->gen;
 	/* Generation 0 makes concurrent readers retry. */
 	ffth->gen = 0;
 
 	/* Keep the old estimate but fall back to the new nominal period. */
 	cest = &ffth->cest;
 	bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate));
 	cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1;
 	cest->errb_abs = 0;
 	cest->errb_rate = 0;
 	cest->status |= FFCLOCK_STA_UNSYNC;
 
 	/* Tick count and times carry over unchanged. */
 	ffth->tick_ffcount = fftimehands->tick_ffcount;
 	ffth->tick_time_lerp = fftimehands->tick_time_lerp;
 	ffth->tick_time = fftimehands->tick_time;
 	ffth->period_lerp = cest->period;
 
 	/* Do not lock but ignore next update from synchronization daemon. */
 	ffclock_updated--;
 
 	/* Advance the generation, skipping the reserved value 0, and publish. */
 	if (++ogen == 0)
 		ogen = 1;
 	ffth->gen = ogen;
 	fftimehands = ffth;
 }
 
 /*
  * Retrieve feed-forward counter and time of last kernel tick.
  * With FFCLOCK_LERP set in flags the interpolated tick time is returned,
  * otherwise the tick time from the current estimate.
  */
 void
 ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags)
 {
 	struct fftimehands *ffth;
 	uint8_t gen;
 
 	/*
 	 * No locking, but retry if the element was being rewritten (gen == 0)
 	 * or its generation changed while it was read.
 	 */
 	do {
 		ffth = fftimehands;
 		gen = ffth->gen;
 		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP)
 			*bt = ffth->tick_time_lerp;
 		else
 			*bt = ffth->tick_time;
 		*ffcount = ffth->tick_ffcount;
 	} while (gen == 0 || gen != ffth->gen);
 }
 
 /*
  * Absolute clock conversion. Low level function to convert ffcounter to
  * bintime. The ffcounter is converted using the current ffclock period estimate
  * or the "interpolated period" to ensure monotonicity.
  * NOTE: this conversion may have been deferred, and the clock updated since the
  * hardware counter has been read.
  */
 void
 ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags)
 {
 	struct fftimehands *ffth;
 	struct bintime bt2;
 	ffcounter ffdelta;
 	uint8_t gen;
 
 	/*
 	 * No locking but check generation has not changed. Also need to make
 	 * sure ffdelta is positive, i.e. ffcount > tick_ffcount.
 	 */
 	do {
 		ffth = fftimehands;
 		gen = ffth->gen;
 		/* ffdelta = |ffcount - tick_ffcount|; the sign is applied below. */
 		if (ffcount > ffth->tick_ffcount)
 			ffdelta = ffcount - ffth->tick_ffcount;
 		else
 			ffdelta = ffth->tick_ffcount - ffcount;
 
 		if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) {
 			*bt = ffth->tick_time_lerp;
 			ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2);
 		} else {
 			*bt = ffth->tick_time;
 			ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2);
 		}
 
 		/* Counts after the tick add to its time, earlier ones subtract. */
 		if (ffcount > ffth->tick_ffcount)
 			bintime_add(bt, &bt2);
 		else
 			bintime_sub(bt, &bt2);
 	} while (gen == 0 || gen != ffth->gen);
 }
 
 /*
  * Difference clock conversion.
  * Low level function to convert a time interval measured in RAW counter units
  * into bintime. The difference clock allows measuring small intervals much more
  * reliably than the absolute clock.
  * The period of the current estimate (cest.period) is used for the
  * conversion.
  */
 void
 ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt)
 {
 	struct fftimehands *ffth;
 	uint8_t gen;
 
 	/* No locking but check generation has not changed. */
 	do {
 		ffth = fftimehands;
 		gen = ffth->gen;
 		ffclock_convert_delta(ffdelta, ffth->cest.period, bt);
 	} while (gen == 0 || gen != ffth->gen);
 }
 
 /*
  * Access to current ffcounter value: the feed-forward count at the last
  * tick plus the hardware counter ticks elapsed since then.
  */
 void
 ffclock_read_counter(ffcounter *ffcount)
 {
 	struct timehands *th;
 	struct fftimehands *ffth;
 	unsigned int gen, delta;
 
 	/*
 	 * ffclock_windup() called from tc_windup(), safe to rely on
 	 * th->th_generation only, for correct delta and ffcounter.
 	 */
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		ffth = fftimehands;
 		delta = tc_delta(th);
 		*ffcount = ffth->tick_ffcount;
 		/* Keep the data reads ahead of the generation re-check. */
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 
 	*ffcount += delta;
 }
 
 /* Uptime (bintime) from whichever system clock sysclock_active selects. */
 void
 binuptime(struct bintime *bt)
 {
 	int which;
 
 	which = sysclock_active;
 	binuptime_fromclock(bt, which);
 }
 
 /* Uptime (timespec) from whichever system clock sysclock_active selects. */
 void
 nanouptime(struct timespec *tsp)
 {
 	int which;
 
 	which = sysclock_active;
 	nanouptime_fromclock(tsp, which);
 }
 
 /* Uptime (timeval) from whichever system clock sysclock_active selects. */
 void
 microuptime(struct timeval *tvp)
 {
 	int which;
 
 	which = sysclock_active;
 	microuptime_fromclock(tvp, which);
 }
 
 /* Wall time (bintime) from whichever system clock sysclock_active selects. */
 void
 bintime(struct bintime *bt)
 {
 	int which;
 
 	which = sysclock_active;
 	bintime_fromclock(bt, which);
 }
 
 /* Wall time (timespec) from whichever system clock sysclock_active selects. */
 void
 nanotime(struct timespec *tsp)
 {
 	int which;
 
 	which = sysclock_active;
 	nanotime_fromclock(tsp, which);
 }
 
 /* Wall time (timeval) from whichever system clock sysclock_active selects. */
 void
 microtime(struct timeval *tvp)
 {
 	int which;
 
 	which = sysclock_active;
 	microtime_fromclock(tvp, which);
 }
 
 /* getbinuptime() routed to the system clock sysclock_active selects. */
 void
 getbinuptime(struct bintime *bt)
 {
 	int which;
 
 	which = sysclock_active;
 	getbinuptime_fromclock(bt, which);
 }
 
 /* getnanouptime() routed to the system clock sysclock_active selects. */
 void
 getnanouptime(struct timespec *tsp)
 {
 	int which;
 
 	which = sysclock_active;
 	getnanouptime_fromclock(tsp, which);
 }
 
 /* getmicrouptime() routed to the system clock sysclock_active selects. */
 void
 getmicrouptime(struct timeval *tvp)
 {
 	int which;
 
 	which = sysclock_active;
 	getmicrouptime_fromclock(tvp, which);
 }
 
 /* getbintime() routed to the system clock sysclock_active selects. */
 void
 getbintime(struct bintime *bt)
 {
 	int which;
 
 	which = sysclock_active;
 	getbintime_fromclock(bt, which);
 }
 
 /* getnanotime() routed to the system clock sysclock_active selects. */
 void
 getnanotime(struct timespec *tsp)
 {
 	int which;
 
 	which = sysclock_active;
 	getnanotime_fromclock(tsp, which);
 }
 
 /*
  * Wall-clock time as a timeval from the system clock selected by
  * sysclock_active.  This must dispatch to the wall-clock variant; routing
  * it through getmicrouptime_fromclock() would hand callers the uptime.
  */
 void
 getmicrotime(struct timeval *tvp)
 {
 
 	getmicrotime_fromclock(tvp, sysclock_active);
 }
 
 #endif /* FFCLOCK */
 
 /*
  * This is a clone of getnanotime and used for walltimestamps.
  * The dtrace_ prefix prevents fbt from creating probes for
  * it so walltimestamp can be safely used in all fbt probes.
  * Like the other readers it is lockless and retries while the generation
  * is 0 or has changed during the read.
  */
 void
 dtrace_getnanotime(struct timespec *tsp)
 {
 	struct timehands *th;
 	u_int gen;
 
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		*tsp = th->th_nanotime;
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 }
 
 /*
  * System clock currently providing time to the system. Modifiable via sysctl
  * when the FFCLOCK option is defined.
  */
 int sysclock_active = SYSCLOCK_FBCK;
 
 /* Internal NTP status and error estimates. */
 extern int time_status;
 extern long time_esterror;
 
 /*
  * Take a snapshot of sysclock data which can be used to compare system clocks
  * and generate timestamps after the fact.
  *
  * clock_snap receives the feedback (and, with FFCLOCK, feed-forward) tick
  * state, status and error estimates.  When 'fast' is non-zero the hardware
  * counter is not read and the recorded delta stays 0.
  */
 void
 sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast)
 {
 	struct fbclock_info *fbi;
 	struct timehands *th;
 	struct bintime bt;
 	unsigned int delta, gen;
 #ifdef FFCLOCK
 	ffcounter ffcount;
 	struct fftimehands *ffth;
 	struct ffclock_info *ffi;
 	struct ffclock_estimate cest;
 
 	ffi = &clock_snap->ff_info;
 #endif
 
 	fbi = &clock_snap->fb_info;
 	delta = 0;
 
 	/* Lockless read; retried if the timehands change underneath us. */
 	do {
 		th = timehands;
 		gen = atomic_load_acq_int(&th->th_generation);
 		fbi->th_scale = th->th_scale;
 		fbi->tick_time = th->th_offset;
 #ifdef FFCLOCK
 		ffth = fftimehands;
 		/*
 		 * Record both tick times: sysclock_snap2bintime() pairs
 		 * tick_time with 'period' and tick_time_lerp with
 		 * 'period_lerp', so they must not be the same value.
 		 */
 		ffi->tick_time = ffth->tick_time;
 		ffi->tick_time_lerp = ffth->tick_time_lerp;
 		ffi->period = ffth->cest.period;
 		ffi->period_lerp = ffth->period_lerp;
 		clock_snap->ffcount = ffth->tick_ffcount;
 		cest = ffth->cest;
 #endif
 		if (!fast)
 			delta = tc_delta(th);
 		atomic_thread_fence_acq();
 	} while (gen == 0 || gen != th->th_generation);
 
 	clock_snap->delta = delta;
 	clock_snap->sysclock_active = sysclock_active;
 
 	/* Record feedback clock status and error. */
 	clock_snap->fb_info.status = time_status;
 	/*
 	 * XXX: Very crude estimate of feedback clock error.
 	 * time_esterror is split into whole seconds and a remainder treated
 	 * as microseconds; 18446744073709 = int(2^64 / 1e6) scales that
 	 * remainder to a bintime fraction.
 	 */
 	bt.sec = time_esterror / 1000000;
 	bt.frac = (time_esterror - bt.sec * 1000000) *
 	    (uint64_t)18446744073709ULL;
 	clock_snap->fb_info.error = bt;
 
 #ifdef FFCLOCK
 	if (!fast)
 		clock_snap->ffcount += delta;
 
 	/* Record feed-forward clock leap second adjustment. */
 	ffi->leapsec_adjustment = cest.leapsec_total;
 	if (clock_snap->ffcount > cest.leapsec_next)
 		ffi->leapsec_adjustment -= cest.leapsec;
 
 	/* Record feed-forward clock status and error. */
 	clock_snap->ff_info.status = cest.status;
 	ffcount = clock_snap->ffcount - cest.update_ffcount;
 	ffclock_convert_delta(ffcount, cest.period, &bt);
 	/*
 	 * err_bound_rate in [ps/s].
 	 * NOTE(review): 18446744073709 is int(2^64 / 1e6), not 2^64 / 1e12,
 	 * and bintime_mul() takes an unsigned int multiplier, so this
 	 * product is presumably truncated -- confirm the intended scaling.
 	 */
 	bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL);
 	/* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */
 	bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL);
 	clock_snap->ff_info.error = bt;
 #endif
 }
 
 /*
  * Convert a sysclock snapshot into a struct bintime based on the specified
  * clock source and flags.
  */
 int
 sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt,
     int whichclock, uint32_t flags)
 {
 	struct bintime boottimebin;
 #ifdef FFCLOCK
 	struct bintime bt2;
 	uint64_t period;
 #endif
 
 	switch (whichclock) {
 	case SYSCLOCK_FBCK:
 		*bt = cs->fb_info.tick_time;
 
 		/* If snapshot was created with !fast, delta will be >0. */
 		if (cs->delta > 0)
 			bintime_addx(bt, cs->fb_info.th_scale * cs->delta);
 
 		if ((flags & FBCLOCK_UPTIME) == 0) {
 			getboottimebin(&boottimebin);
 			bintime_add(bt, &boottimebin);
 		}
 		break;
 #ifdef FFCLOCK
 	case SYSCLOCK_FFWD:
 		if (flags & FFCLOCK_LERP) {
 			*bt = cs->ff_info.tick_time_lerp;
 			period = cs->ff_info.period_lerp;
 		} else {
 			*bt = cs->ff_info.tick_time;
 			period = cs->ff_info.period;
 		}
 
 		/* If snapshot was created with !fast, delta will be >0. */
 		if (cs->delta > 0) {
 			ffclock_convert_delta(cs->delta, period, &bt2);
 			bintime_add(bt, &bt2);
 		}
 
 		/* Leap second adjustment. */
 		if (flags & FFCLOCK_LEAPSEC)
 			bt->sec -= cs->ff_info.leapsec_adjustment;
 
 		/* Boot time adjustment, for uptime/monotonic clocks. */
 		if (flags & FFCLOCK_UPTIME)
 			bintime_sub(bt, &ffclock_boottime);
 		break;
 #endif
 	default:
 		return (EINVAL);
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Register a new timecounter: sanity-check it against hz, link it into the
  * list of known counters, publish its sysctl nodes and, if it qualifies,
  * make it the selected timecounter.
  */
 void
 tc_init(struct timecounter *tc)
 {
 	struct sysctl_oid *node;
 	u_int min_hz;
 
 	/* XXX: We need some margin here, 10% is a guess */
 	min_hz = tc->tc_frequency / tc->tc_counter_mask;
 	min_hz *= 11;
 	min_hz /= 10;
 	if (min_hz > hz && tc->tc_quality >= 0) {
 		/* The counter would wrap between ticks; demote it. */
 		tc->tc_quality = -2000;
 		if (bootverbose) {
 			printf("Timecounter \"%s\" frequency %ju Hz",
 			    tc->tc_name, (uintmax_t)tc->tc_frequency);
 			printf(" -- Insufficient hz, needs at least %u\n",
 			    min_hz);
 		}
 	} else if (tc->tc_quality >= 0 || bootverbose) {
 		printf("Timecounter \"%s\" frequency %ju Hz quality %d\n",
 		    tc->tc_name, (uintmax_t)tc->tc_frequency,
 		    tc->tc_quality);
 	}
 
 	/* Push onto the head of the list of registered counters. */
 	tc->tc_next = timecounters;
 	timecounters = tc;
 
 	/*
 	 * Set up sysctl tree for this counter.
 	 */
 	node = SYSCTL_ADD_NODE_WITH_LABEL(NULL,
 	    SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
 	    CTLFLAG_RW, 0, "timecounter description", "timecounter");
 	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
 	    "mask for implemented bits");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
 	    sysctl_kern_timecounter_get, "IU", "current timecounter value");
 	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc),
 	    sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
 	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 	    "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
 	    "goodness of time counter");
 
 	/*
 	 * Do not automatically switch if the current tc was specifically
 	 * chosen.  Never automatically use a timecounter with negative quality.
 	 * Even though we run on the dummy counter, switching here may be
 	 * worse since this timecounter may not be monotonic.
 	 * Otherwise switch only to a counter of higher quality, or of equal
 	 * quality and at least the same frequency.
 	 */
 	if (tc_chosen || tc->tc_quality < 0 ||
 	    tc->tc_quality < timecounter->tc_quality ||
 	    (tc->tc_quality == timecounter->tc_quality &&
 	    tc->tc_frequency < timecounter->tc_frequency))
 		return;
 
 	/* Read the counter twice before putting it into service. */
 	(void)tc->tc_get_timecount(tc);
 	(void)tc->tc_get_timecount(tc);
 	timecounter = tc;
 }
 
 /* Report the frequency of the current timecounter. */
 uint64_t
 tc_getfrequency(void)
 {
 
 	return (timehands->th_counter->tc_frequency);
 }
 
+/*
+ * Predicate handed to sleepq_chains_remove_matching(): returns true, and
+ * clears td_rtcgen, for a thread whose td_rtcgen is non-zero and differs
+ * from the current rtc_generation.
+ */
+static bool
+sleeping_on_old_rtc(struct thread *td)
+{
+
+	if (td->td_rtcgen == 0 || td->td_rtcgen == rtc_generation)
+		return (false);
+	td->td_rtcgen = 0;
+	return (true);
+}
+
 /*
  * Spin mutex taken around the tc_windup() calls made from tc_setclock(),
  * tc_ticktock() and inittimecounter().
  */
 static struct mtx tc_setclock_mtx;
 MTX_SYSINIT(tc_setclock_init, &tc_setclock_mtx, "tcsetc", MTX_SPIN);
 
 /*
  * Step our concept of UTC.  This is done by modifying our estimate of
  * when we booted.
  *
  * ts is the new absolute time.  The boot time handed to tc_windup() is
  * ts minus the uptime sampled under tc_setclock_mtx.
  */
 void
 tc_setclock(struct timespec *ts)
 {
 	struct timespec tbef, taft;
 	struct bintime bt, bt2;
 
 	timespec2bintime(ts, &bt);
 	/* Time before the step; only used by the log message below. */
 	nanotime(&tbef);
 	mtx_lock_spin(&tc_setclock_mtx);
 	/* reset=1: the clock is being stepped, drop the calibration state. */
 	cpu_tick_calibrate(1);
 	/* bt = requested time - current uptime. */
 	binuptime(&bt2);
 	bintime_sub(&bt, &bt2);
 
 	/* XXX fiddle all the little crinkly bits around the fiords... */
 	tc_windup(&bt);
 	mtx_unlock_spin(&tc_setclock_mtx);
+	/* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */
+	atomic_add_rel_int(&rtc_generation, 2);
+	sleepq_chains_remove_matching(sleeping_on_old_rtc);
 	if (timestepwarnings) {
 		/* Log: time before, time after, and the requested time. */
 		nanotime(&taft);
 		log(LOG_INFO,
 		    "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
 		    (intmax_t)tbef.tv_sec, tbef.tv_nsec,
 		    (intmax_t)taft.tv_sec, taft.tv_nsec,
 		    (intmax_t)ts->tv_sec, ts->tv_nsec);
 	}
 }
 
 /*
  * Initialize the next struct timehands in the ring and make
  * it the active timehands.  Along the way we might switch to a different
  * timecounter and/or do seconds processing in NTP.  Slightly magic.
  *
  * If new_boottimebin is non-NULL it replaces th_boottime in the new
  * timehands; NULL keeps the value copied from the current one.  The
  * callers visible here (tc_setclock(), tc_ticktock() and
  * inittimecounter()) all hold tc_setclock_mtx across the call.
  */
 static void
 tc_windup(struct bintime *new_boottimebin)
 {
 	struct bintime bt;
 	struct timehands *th, *tho;
 	uint64_t scale;
 	u_int delta, ncount, ogen;
 	int i;
 	time_t t;
 
 	/*
 	 * Make the next timehands a copy of the current one, but do
 	 * not overwrite the generation or next pointer.  While we
 	 * update the contents, the generation must be zero.  We need
 	 * to ensure that the zero generation is visible before the
 	 * data updates become visible, which requires release fence.
 	 * For similar reasons, re-reading of the generation after the
 	 * data is read should use acquire fence.
 	 */
 	tho = timehands;
 	th = tho->th_next;
 	ogen = th->th_generation;
 	th->th_generation = 0;
 	atomic_thread_fence_rel();
 	/* Copy only the fields laid out before th_generation. */
 	bcopy(tho, th, offsetof(struct timehands, th_generation));
 	if (new_boottimebin != NULL)
 		th->th_boottime = *new_boottimebin;
 
 	/*
 	 * Capture a timecounter delta on the current timecounter and if
 	 * changing timecounters, a counter value from the new timecounter.
 	 * Update the offset fields accordingly.
 	 */
 	delta = tc_delta(th);
 	if (th->th_counter != timecounter)
 		ncount = timecounter->tc_get_timecount(timecounter);
 	else
 		ncount = 0;
 #ifdef FFCLOCK
 	ffclock_windup(delta);
 #endif
 	th->th_offset_count += delta;
 	th->th_offset_count &= th->th_counter->tc_counter_mask;
 	while (delta > th->th_counter->tc_frequency) {
 		/* Eat complete unadjusted seconds. */
 		delta -= th->th_counter->tc_frequency;
 		th->th_offset.sec++;
 	}
 	if ((delta > th->th_counter->tc_frequency / 2) &&
 	    (th->th_scale * delta < ((uint64_t)1 << 63))) {
 		/* The product th_scale * delta just barely overflows. */
 		th->th_offset.sec++;
 	}
 	bintime_addx(&th->th_offset, th->th_scale * delta);
 
 	/*
 	 * Hardware latching timecounters may not generate interrupts on
 	 * PPS events, so instead we poll them.  There is a finite risk that
 	 * the hardware might capture a count which is later than the one we
 	 * got above, and therefore possibly in the next NTP second which might
 	 * have a different rate than the current NTP second.  It doesn't
 	 * matter in practice.
 	 */
 	if (tho->th_counter->tc_poll_pps)
 		tho->th_counter->tc_poll_pps(tho->th_counter);
 
 	/*
 	 * Deal with NTP second processing.  The for loop normally
 	 * iterates at most once, but in extreme situations it might
 	 * keep NTP sane if timeouts are not run for several seconds.
 	 * At boot, the time step can be large when the TOD hardware
 	 * has been read, so on really large steps, we call
 	 * ntp_update_second only twice.  We need to call it twice in
 	 * case we missed a leap second.
 	 */
 	bt = th->th_offset;
 	bintime_add(&bt, &th->th_boottime);
 	/* Whole seconds since the previous timehands' UTC timestamp. */
 	i = bt.sec - tho->th_microtime.tv_sec;
 	if (i > LARGE_STEP)
 		i = 2;
 	for (; i > 0; i--) {
 		t = bt.sec;
 		ntp_update_second(&th->th_adjustment, &bt.sec);
 		/* Fold any change ntp_update_second made to bt.sec into boottime. */
 		if (bt.sec != t)
 			th->th_boottime.sec += bt.sec - t;
 	}
 	th->th_bintime = th->th_offset;
 	bintime_add(&th->th_bintime, &th->th_boottime);
 	/* Update the UTC timestamps used by the get*() functions. */
 	/* XXX shouldn't do this here.  Should force non-`get' versions. */
 	bintime2timeval(&bt, &th->th_microtime);
 	bintime2timespec(&bt, &th->th_nanotime);
 
 	/* Now is a good time to change timecounters. */
 	if (th->th_counter != timecounter) {
 #ifndef __arm__
 		/* Move the C2STOP accounting from the old counter to the new. */
 		if ((timecounter->tc_flags & TC_FLAGS_C2STOP) != 0)
 			cpu_disable_c2_sleep++;
 		if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0)
 			cpu_disable_c2_sleep--;
 #endif
 		th->th_counter = timecounter;
 		/* ncount was sampled from the new counter above. */
 		th->th_offset_count = ncount;
 		tc_min_ticktock_freq = max(1, timecounter->tc_frequency /
 		    (((uint64_t)timecounter->tc_counter_mask + 1) / 3));
 #ifdef FFCLOCK
 		ffclock_change_tc(th);
 #endif
 	}
 
 	/*-
 	 * Recalculate the scaling factor.  We want the number of 1/2^64
 	 * fractions of a second per period of the hardware counter, taking
 	 * into account the th_adjustment factor which the NTP PLL/adjtime(2)
 	 * processing provides us with.
 	 *
 	 * The th_adjustment is nanoseconds per second with 32 bit binary
 	 * fraction and we want 64 bit binary fraction of second:
 	 *
 	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
 	 *
 	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
 	 * we can only multiply by about 850 without overflowing, that
 	 * leaves no suitably precise fractions for multiply before divide.
 	 *
 	 * Divide before multiply with a fraction of 2199/512 results in a
 	 * systematic undercompensation of 10PPM of th_adjustment.  On a
 	 * 5000PPM adjustment this is a 0.05PPM error.  This is acceptable.
 	 *
 	 * We happily sacrifice the lowest of the 64 bits of our result
 	 * to the goddess of code clarity.
 	 *
 	 */
 	scale = (uint64_t)1 << 63;
 	scale += (th->th_adjustment / 1024) * 2199;
 	scale /= th->th_counter->tc_frequency;
 	th->th_scale = scale * 2;
 
 	/*
 	 * Now that the struct timehands is again consistent, set the new
 	 * generation number, making sure to not make it zero.
 	 */
 	if (++ogen == 0)
 		ogen = 1;
 	atomic_store_rel_int(&th->th_generation, ogen);
 
 	/* Go live with the new struct timehands. */
 #ifdef FFCLOCK
 	switch (sysclock_active) {
 	case SYSCLOCK_FBCK:
 #endif
 		time_second = th->th_microtime.tv_sec;
 		time_uptime = th->th_offset.sec;
 #ifdef FFCLOCK
 		break;
 	case SYSCLOCK_FFWD:
 		time_second = fftimehands->tick_time_lerp.sec;
 		time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec;
 		break;
 	}
 #endif
 
 	/* Make th the current timehands, then refresh the vdso copy. */
 	timehands = th;
 	timekeep_push_vdso();
 }
 
 /* Report or change the active timecounter hardware. */
 static int
 sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS)
 {
 	char newname[32];
 	struct timecounter *newtc, *tc;
 	int error;
 
 	tc = timecounter;
 	strlcpy(newname, tc->tc_name, sizeof(newname));
 
 	error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	/* Record that the tc in use now was specifically chosen. */
 	tc_chosen = 1;
 	if (strcmp(newname, tc->tc_name) == 0)
 		return (0);
 	for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
 		if (strcmp(newname, newtc->tc_name) != 0)
 			continue;
 
 		/* Warm up new timecounter. */
 		(void)newtc->tc_get_timecount(newtc);
 		(void)newtc->tc_get_timecount(newtc);
 
 		timecounter = newtc;
 
 		/*
 		 * The vdso timehands update is deferred until the next
 		 * 'tc_windup()'.
 		 *
 		 * This is prudent given that 'timekeep_push_vdso()' does not
 		 * use any locking and that it can be called in hard interrupt
 		 * context via 'tc_windup()'.
 		 */
 		return (0);
 	}
 	return (EINVAL);
 }
 
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW,
     0, 0, sysctl_kern_timecounter_hardware, "A",
     "Timecounter hardware selected");
 
 
 /*
  * Report the available timecounter hardware as a space-separated list of
  * "name(quality)" entries, in timecounters list order.
  */
 static int
 sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS)
 {
 	struct timecounter *tc;
 	struct sbuf sb;
 	int error;
 
 	sbuf_new_for_sysctl(&sb, NULL, 0, req);
 	tc = timecounters;
 	while (tc != NULL) {
 		/* Separator before every entry except the list head. */
 		if (tc != timecounters)
 			sbuf_putc(&sb, ' ');
 		sbuf_printf(&sb, "%s(%d)", tc->tc_name, tc->tc_quality);
 		tc = tc->tc_next;
 	}
 	error = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error);
 }
 
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD,
     0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected");
 
 /*
  * RFC 2783 PPS-API implementation.
  */
 
 /*
  *  Return true if the driver has KCMODE_ABIFLAG set in kcmode (i.e. it is
  *  aware of the abi version extensions in the pps_state structure) and
  *  its driver_abi is at least the given abi version number.
  */
 static inline int
 abi_aware(struct pps_state *pps, int vers)
 {
 
 	if ((pps->kcmode & KCMODE_ABIFLAG) == 0)
 		return (0);
 	return (pps->driver_abi >= vers);
 }
 
 static int
 pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps)
 {
 	int err, timo;
 	pps_seq_t aseq, cseq;
 	struct timeval tv;
 
 	if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC)
 		return (EINVAL);
 
 	/*
 	 * If no timeout is requested, immediately return whatever values were
 	 * most recently captured.  If timeout seconds is -1, that's a request
 	 * to block without a timeout.  WITNESS won't let us sleep forever
 	 * without a lock (we really don't need a lock), so just repeatedly
 	 * sleep a long time.
 	 */
 	if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) {
 		if (fapi->timeout.tv_sec == -1)
 			timo = 0x7fffffff;
 		else {
 			tv.tv_sec = fapi->timeout.tv_sec;
 			tv.tv_usec = fapi->timeout.tv_nsec / 1000;
 			timo = tvtohz(&tv);
 		}
 		aseq = pps->ppsinfo.assert_sequence;
 		cseq = pps->ppsinfo.clear_sequence;
 		while (aseq == pps->ppsinfo.assert_sequence &&
 		    cseq == pps->ppsinfo.clear_sequence) {
 			if (abi_aware(pps, 1) && pps->driver_mtx != NULL) {
 				if (pps->flags & PPSFLAG_MTX_SPIN) {
 					err = msleep_spin(pps, pps->driver_mtx,
 					    "ppsfch", timo);
 				} else {
 					err = msleep(pps, pps->driver_mtx, PCATCH,
 					    "ppsfch", timo);
 				}
 			} else {
 				err = tsleep(pps, PCATCH, "ppsfch", timo);
 			}
 			if (err == EWOULDBLOCK) {
 				if (fapi->timeout.tv_sec == -1) {
 					continue;
 				} else {
 					return (ETIMEDOUT);
 				}
 			} else if (err != 0) {
 				return (err);
 			}
 		}
 	}
 
 	pps->ppsinfo.current_mode = pps->ppsparam.mode;
 	fapi->pps_info_buf = pps->ppsinfo;
 
 	return (0);
 }
 
 /*
  * Handle the PPS-API ioctls on behalf of a driver.
  *
  * data points at the argument structure matching cmd.  Returns 0 on
  * success, an errno value on failure, or ENOIOCTL when cmd is not one of
  * the PPS commands handled here.
  */
 int
 pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps)
 {
 	pps_params_t *params;
 	struct pps_fetch_args *fetch;
 #ifdef FFCLOCK
 	struct pps_fetch_ffc_args *fetch_ffc;
 #endif
 #ifdef PPS_SYNC
 	struct pps_kcbind_args *bind;
 #endif
 
 	KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl"));
 	switch (cmd) {
 	case PPS_IOC_CREATE:
 	case PPS_IOC_DESTROY:
 		/* Nothing to do for either of these. */
 		return (0);
 	case PPS_IOC_SETPARAMS:
 		params = (pps_params_t *)data;
 		/* Reject mode bits the source does not advertise. */
 		if ((params->mode & ~pps->ppscap) != 0)
 			return (EINVAL);
 #ifdef FFCLOCK
 		/* Ensure only a single clock is selected for ffc timestamp. */
 		if ((params->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK)
 			return (EINVAL);
 #endif
 		pps->ppsparam = *params;
 		return (0);
 	case PPS_IOC_GETPARAMS:
 		params = (pps_params_t *)data;
 		*params = pps->ppsparam;
 		params->api_version = PPS_API_VERS_1;
 		return (0);
 	case PPS_IOC_GETCAP:
 		*(int*)data = pps->ppscap;
 		return (0);
 	case PPS_IOC_FETCH:
 		fetch = (struct pps_fetch_args *)data;
 		return (pps_fetch(fetch, pps));
 #ifdef FFCLOCK
 	case PPS_IOC_FETCH_FFCOUNTER:
 		fetch_ffc = (struct pps_fetch_ffc_args *)data;
 		if (fetch_ffc->tsformat != 0 &&
 		    fetch_ffc->tsformat != PPS_TSFMT_TSPEC)
 			return (EINVAL);
 		/* Blocking fetches are not offered on this path. */
 		if (fetch_ffc->timeout.tv_sec != 0 ||
 		    fetch_ffc->timeout.tv_nsec != 0)
 			return (EOPNOTSUPP);
 		pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode;
 		fetch_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc;
 		/* Overwrite timestamps if feedback clock selected. */
 		if ((pps->ppsparam.mode & PPS_TSCLK_MASK) == PPS_TSCLK_FBCK) {
 			fetch_ffc->pps_info_buf_ffc.assert_timestamp =
 			    pps->ppsinfo.assert_timestamp;
 			fetch_ffc->pps_info_buf_ffc.clear_timestamp =
 			    pps->ppsinfo.clear_timestamp;
 		}
 		return (0);
 #endif /* FFCLOCK */
 	case PPS_IOC_KCBIND:
 #ifdef PPS_SYNC
 		bind = (struct pps_kcbind_args *)data;
 		/* XXX Only root should be able to do this */
 		if (bind->tsformat != 0 && bind->tsformat != PPS_TSFMT_TSPEC)
 			return (EINVAL);
 		if (bind->kernel_consumer != PPS_KC_HARDPPS)
 			return (EINVAL);
 		if ((bind->edge & ~pps->ppscap) != 0)
 			return (EINVAL);
 		/* Replace the edge selection, keep the ABI flag. */
 		pps->kcmode = (bind->edge & KCMODE_EDGEMASK) |
 		    (pps->kcmode & KCMODE_ABIFLAG);
 		return (0);
 #else
 		return (EOPNOTSUPP);
 #endif
 	default:
 		return (ENOIOCTL);
 	}
 }
 
 /*
  * Add the capability bits that follow from what the driver already set
  * in ppscap, and clear KCMODE_ABIFLAG so the state is treated as not
  * ABI-aware.
  */
 void
 pps_init(struct pps_state *pps)
 {
 
 	pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT;
 	/* Each capturable edge also gets its offset capability. */
 	if ((pps->ppscap & PPS_CAPTUREASSERT) != 0)
 		pps->ppscap |= PPS_OFFSETASSERT;
 	if ((pps->ppscap & PPS_CAPTURECLEAR) != 0)
 		pps->ppscap |= PPS_OFFSETCLEAR;
 #ifdef FFCLOCK
 	pps->ppscap |= PPS_TSCLK_MASK;
 #endif
 	pps->kcmode &= ~KCMODE_ABIFLAG;
 }
 
 /*
  * pps_init() variant for drivers that fill in driver_abi: when it is
  * positive the state is flagged ABI-aware and the kernel's ABI version
  * is reported back in kernel_abi.
  */
 void
 pps_init_abi(struct pps_state *pps)
 {
 
 	pps_init(pps);
 	if (pps->driver_abi <= 0)
 		return;
 	pps->kcmode |= KCMODE_ABIFLAG;
 	pps->kernel_abi = PPS_ABI_VERSION;
 }
 
 /*
  * Latch the state needed to timestamp a PPS edge later in pps_event():
  * the current timehands, its generation and a raw counter reading.
  * capgen is left at 0 when the generation changed while the counter was
  * being read; pps_event() ignores captures whose capgen is 0.
  */
 void
 pps_capture(struct pps_state *pps)
 {
 	struct timehands *th;
 
 	KASSERT(pps != NULL, ("NULL pps pointer in pps_capture"));
 	th = timehands;
 	pps->capgen = atomic_load_acq_int(&th->th_generation);
 	pps->capth = th;
 #ifdef FFCLOCK
 	pps->capffth = fftimehands;
 #endif
 	pps->capcount = th->th_counter->tc_get_timecount(th->th_counter);
 	/* Re-check the generation only after the counter has been read. */
 	atomic_thread_fence_acq();
 	if (pps->capgen != th->th_generation)
 		pps->capgen = 0;
 }
 
 /*
  * Process a PPS edge using the state latched by pps_capture().
  *
  * event selects the edge being reported: PPS_CAPTUREASSERT is handled
  * as an assert edge and any other value as a clear edge.  The captured
  * count is converted to a timestamp, the matching sequence number is
  * incremented and sleepers on pps are woken.  The event is silently
  * dropped when that edge is not enabled in ppsparam.mode, when the
  * captured timehands generation is no longer current, or when the
  * timecounter differs from the one seen at the previous event.
  */
 void
 pps_event(struct pps_state *pps, int event)
 {
 	struct bintime bt;
 	struct timespec ts, *tsp, *osp;
 	u_int tcount, *pcount;
 	int foff;
 	pps_seq_t *pseq;
 #ifdef FFCLOCK
 	struct timespec *tsp_ffc;
 	pps_seq_t *pseq_ffc;
 	ffcounter *ffcount;
 #endif
 #ifdef PPS_SYNC
 	int fhard;
 #endif
 
 	KASSERT(pps != NULL, ("NULL pps pointer in pps_event"));
 	/* Nothing to do if not currently set to capture this event type. */
 	if ((event & pps->ppsparam.mode) == 0)
 		return;
 	/* If the timecounter was wound up underneath us, bail out. */
 	if (pps->capgen == 0 || pps->capgen !=
 	    atomic_load_acq_int(&pps->capth->th_generation))
 		return;
 
 	/* Things would be easier with arrays. */
 	/* Point the working pointers at the assert or the clear fields. */
 	if (event == PPS_CAPTUREASSERT) {
 		tsp = &pps->ppsinfo.assert_timestamp;
 		osp = &pps->ppsparam.assert_offset;
 		foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
 #ifdef PPS_SYNC
 		fhard = pps->kcmode & PPS_CAPTUREASSERT;
 #endif
 		pcount = &pps->ppscount[0];
 		pseq = &pps->ppsinfo.assert_sequence;
 #ifdef FFCLOCK
 		ffcount = &pps->ppsinfo_ffc.assert_ffcount;
 		tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp;
 		pseq_ffc = &pps->ppsinfo_ffc.assert_sequence;
 #endif
 	} else {
 		tsp = &pps->ppsinfo.clear_timestamp;
 		osp = &pps->ppsparam.clear_offset;
 		foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
 #ifdef PPS_SYNC
 		fhard = pps->kcmode & PPS_CAPTURECLEAR;
 #endif
 		pcount = &pps->ppscount[1];
 		pseq = &pps->ppsinfo.clear_sequence;
 #ifdef FFCLOCK
 		ffcount = &pps->ppsinfo_ffc.clear_ffcount;
 		tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp;
 		pseq_ffc = &pps->ppsinfo_ffc.clear_sequence;
 #endif
 	}
 
 	/*
 	 * If the timecounter changed, we cannot compare the count values, so
 	 * we have to drop the rest of the PPS-stuff until the next event.
 	 */
 	if (pps->ppstc != pps->capth->th_counter) {
 		pps->ppstc = pps->capth->th_counter;
 		*pcount = pps->capcount;
 		pps->ppscount[2] = pps->capcount;
 		return;
 	}
 
 	/* Convert the count to a timespec. */
 	tcount = pps->capcount - pps->capth->th_offset_count;
 	tcount &= pps->capth->th_counter->tc_counter_mask;
 	bt = pps->capth->th_bintime;
 	bintime_addx(&bt, pps->capth->th_scale * tcount);
 	bintime2timespec(&bt, &ts);
 
 	/* If the timecounter was wound up underneath us, bail out. */
 	atomic_thread_fence_acq();
 	if (pps->capgen != pps->capth->th_generation)
 		return;
 
 	/* Record the event: count, sequence number and timestamp. */
 	*pcount = pps->capcount;
 	(*pseq)++;
 	*tsp = ts;
 
 	/* Apply the configured offset for this edge, if enabled. */
 	if (foff) {
 		timespecadd(tsp, osp);
 		if (tsp->tv_nsec < 0) {
 			tsp->tv_nsec += 1000000000;
 			tsp->tv_sec -= 1;
 		}
 	}
 
 #ifdef FFCLOCK
 	/* Same event, expressed through the captured feed-forward timehands. */
 	*ffcount = pps->capffth->tick_ffcount + tcount;
 	bt = pps->capffth->tick_time;
 	ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt);
 	bintime_add(&bt, &pps->capffth->tick_time);
 	bintime2timespec(&bt, &ts);
 	(*pseq_ffc)++;
 	*tsp_ffc = ts;
 #endif
 
 #ifdef PPS_SYNC
 	if (fhard) {
 		uint64_t scale;
 
 		/*
 		 * Feed the NTP PLL/FLL.
 		 * The FLL wants to know how many (hardware) nanoseconds
 		 * elapsed since the previous event.
 		 */
 		/* ppscount[2] holds the count recorded at the previous event. */
 		tcount = pps->capcount - pps->ppscount[2];
 		pps->ppscount[2] = pps->capcount;
 		tcount &= pps->capth->th_counter->tc_counter_mask;
 		/* Scale from the nominal frequency, without th_adjustment. */
 		scale = (uint64_t)1 << 63;
 		scale /= pps->capth->th_counter->tc_frequency;
 		scale *= 2;
 		bt.sec = 0;
 		bt.frac = 0;
 		bintime_addx(&bt, scale * tcount);
 		bintime2timespec(&bt, &ts);
 		hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec);
 	}
 #endif
 
 	/* Wakeup anyone sleeping in pps_fetch().  */
 	wakeup(pps);
 }
 
 /*
  * Timecounters need to be updated every so often to prevent the hardware
  * counter from overflowing.  Updating also recalculates the cached values
  * used by the get*() family of functions, so their precision depends on
  * the update frequency.
  */
 
 static int tc_tick;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0,
     "Approximate number of hardclock ticks in a millisecond");
 
 /*
  * Account for cnt ticks and run tc_windup() once at least tc_tick of
  * them have accumulated.  When tc_setclock_mtx cannot be taken right
  * away the call does nothing and cnt is not accumulated.
  */
 void
 tc_ticktock(int cnt)
 {
 	static int pending;
 
 	if (!mtx_trylock_spin(&tc_setclock_mtx))
 		return;
 	pending += cnt;
 	if (pending >= tc_tick) {
 		pending = 0;
 		tc_windup(NULL);
 	}
 	mtx_unlock_spin(&tc_setclock_mtx);
 }
 
 /*
  * Recompute tc_precexp and the bintime/sbintime time and tick thresholds
  * from tc_timepercentage.  A non-positive percentage selects the
  * largest thresholds.
  */
 static void __inline
 tc_adjprecision(void)
 {
 	int ratio;
 
 	if (tc_timepercentage <= 0) {
 		tc_precexp = 31;
 		bt_timethreshold.sec = INT_MAX;
 		bt_timethreshold.frac = ~(uint64_t)0;
 		bt_tickthreshold = bt_timethreshold;
 	} else {
 		/* ratio = ceil(100 / percentage), minimum 1. */
 		ratio = (99 + tc_timepercentage) / tc_timepercentage;
 		tc_precexp = fls(ratio + (ratio >> 1)) - 1;
 		FREQ2BT(hz / tc_tick, &bt_timethreshold);
 		FREQ2BT(hz, &bt_tickthreshold);
 		bintime_shift(&bt_timethreshold, tc_precexp);
 		bintime_shift(&bt_tickthreshold, tc_precexp);
 	}
 	sbt_timethreshold = bttosbt(bt_timethreshold);
 	sbt_tickthreshold = bttosbt(bt_tickthreshold);
 }
 
 static int
 sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = tc_timepercentage;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	tc_timepercentage = val;
 	if (cold)
 		goto done;
 	tc_adjprecision();
 done:
 	return (0);
 }
 
 static void
 inittimecounter(void *dummy)
 {
 	u_int p;
 	int tick_rate;
 
 	/*
 	 * Set the initial timeout to
 	 * max(1, <approx. number of hardclock ticks in a millisecond>).
 	 * People should probably not use the sysctl to set the timeout
 	 * to smaller than its initial value, since that value is the
 	 * smallest reasonable one.  If they want better timestamps they
 	 * should use the non-"get"* functions.
 	 */
 	if (hz > 1000)
 		tc_tick = (hz + 500) / 1000;
 	else
 		tc_tick = 1;
 	tc_adjprecision();
 	FREQ2BT(hz, &tick_bt);
 	tick_sbt = bttosbt(tick_bt);
 	tick_rate = hz / tc_tick;
 	FREQ2BT(tick_rate, &tc_tick_bt);
 	tc_tick_sbt = bttosbt(tc_tick_bt);
 	p = (tc_tick * 1000000) / hz;
 	printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000);
 
 #ifdef FFCLOCK
 	ffclock_init();
 #endif
 	/* warm up new timecounter (again) and get rolling. */
 	(void)timecounter->tc_get_timecount(timecounter);
 	(void)timecounter->tc_get_timecount(timecounter);
 	mtx_lock_spin(&tc_setclock_mtx);
 	tc_windup(NULL);
 	mtx_unlock_spin(&tc_setclock_mtx);
 }
 
 SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL);
 
 /* Cpu tick handling -------------------------------------------------*/
 
 /*
  * Set by set_cputicker(): cpu_tick_variable gates the periodic
  * recalibration in cpu_tick_calibrate(), cpu_tick_frequency is what
  * cpu_tickrate() reports for a custom ticker.
  */
 static int cpu_tick_variable;
 static uint64_t	cpu_tick_frequency;
 
 /* Per-CPU state of tc_cpu_ticks(): accumulated wraps and last reading. */
 static DPCPU_DEFINE(uint64_t, tc_cpu_ticks_base);
 static DPCPU_DEFINE(unsigned, tc_cpu_ticks_last);
 
 /*
  * Default cpu_ticks implementation: widen the active timecounter to 64
  * bits by adding one counter period to a per-CPU base every time the
  * masked reading is smaller than the previous one seen on this CPU.
  */
 static uint64_t
 tc_cpu_ticks(void)
 {
 	struct timecounter *tc;
 	uint64_t *wrap_base, total;
 	unsigned *prev, now;
 
 	critical_enter();
 	wrap_base = DPCPU_PTR(tc_cpu_ticks_base);
 	prev = DPCPU_PTR(tc_cpu_ticks_last);
 	tc = timehands->th_counter;
 	now = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
 	/* A smaller reading than last time means the counter wrapped. */
 	if (now < *prev)
 		*wrap_base += (uint64_t)tc->tc_counter_mask + 1;
 	*prev = now;
 	total = now + *wrap_base;
 	critical_exit();
 	return (total);
 }
 
 /*
  * Run cpu_tick_calibrate(0) at most once per value of time_uptime, and
  * only when time_uptime is a multiple of 16.
  */
 void
 cpu_tick_calibration(void)
 {
 	static time_t last_run;
 
 	if (time_uptime == last_run || (time_uptime & 0xf) != 0)
 		return;
 	cpu_tick_calibrate(0);
 	last_run = time_uptime;
 }
 
 /*
  * This function gets called every 16 seconds on only one designated
  * CPU in the system from hardclock() via cpu_tick_calibration()().
  *
  * Whenever the real time clock is stepped we get called with reset=1
  * to make sure we handle suspend/resume and similar events correctly.
  */
 
 static void
 cpu_tick_calibrate(int reset)
 {
 	static uint64_t c_last;
 	uint64_t c_this, c_delta;
 	static struct bintime  t_last;
 	struct bintime t_this, t_delta;
 	uint32_t divi;
 
 	if (reset) {
 		/* The clock was stepped, abort & reset */
 		t_last.sec = 0;
 		return;
 	}
 
 	/* we don't calibrate fixed rate cputicks */
 	if (!cpu_tick_variable)
 		return;
 
 	getbinuptime(&t_this);
 	c_this = cpu_ticks();
 	if (t_last.sec != 0) {
 		c_delta = c_this - c_last;
 		t_delta = t_this;
 		bintime_sub(&t_delta, &t_last);
 		/*
 		 * Headroom:
 		 * 	2^(64-20) / 16[s] =
 		 * 	2^(44) / 16[s] =
 		 * 	17.592.186.044.416 / 16 =
 		 * 	1.099.511.627.776 [Hz]
 		 */
 		divi = t_delta.sec << 20;
 		divi |= t_delta.frac >> (64 - 20);
 		c_delta <<= 20;
 		c_delta /= divi;
 		if (c_delta > cpu_tick_frequency) {
 			if (0 && bootverbose)
 				printf("cpu_tick increased to %ju Hz\n",
 				    c_delta);
 			cpu_tick_frequency = c_delta;
 		}
 	}
 	c_last = c_this;
 	t_last = t_this;
 }
 
 /*
  * Install func as the cpu_ticks source, with freq as its frequency and
  * var as its "variable rate" flag.  A NULL func switches back to
  * tc_cpu_ticks and leaves freq and var unused.
  */
 void
 set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
 {
 
 	if (func != NULL) {
 		cpu_tick_frequency = freq;
 		cpu_tick_variable = var;
 		cpu_ticks = func;
 		return;
 	}
 	cpu_ticks = tc_cpu_ticks;
 }
 
 /*
  * Return the rate of the current cpu_ticks source: the active
  * timecounter's frequency for tc_cpu_ticks, cpu_tick_frequency for
  * anything else.
  */
 uint64_t
 cpu_tickrate(void)
 {
 
 	if (cpu_ticks != tc_cpu_ticks)
 		return (cpu_tick_frequency);
 	return (tc_getfrequency());
 }
 
 /*
  * We need to be slightly careful converting cputicks to microseconds.
  * There is plenty of margin in 64 bits of microseconds (half a million
  * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
  * before divide conversion (to retain precision) we find that the
  * margin shrinks to 1.5 hours (one millionth of 146y).
  *
  * So split the tick count into whole seconds and a sub-second
  * remainder and convert the two separately.  The remainder is below
  * the tick rate, so remainder * 1000000 fits in 64 bits for any rate
  * below roughly 1.8 * 10^13 Hz, and the full-precision rate is used as
  * the divisor throughout.  Dividing the rate by 1000 or 1000000 up
  * front instead would truncate it, make the result jump where the
  * method changes, and yield a zero divisor for rates below 1 kHz or
  * 1 MHz.
  *
  * Returns floor(tick * 1000000 / cpu_tickrate()).
  */
 
 uint64_t
 cputick2usec(uint64_t tick)
 {
 	uint64_t rate;
 
 	rate = cpu_tickrate();
 	return ((tick / rate) * 1000000ULL +
 	    ((tick % rate) * 1000000ULL) / rate);
 }
 
 /* Current cpu tick source; tc_cpu_ticks until set_cputicker() replaces it. */
 cpu_tick_f	*cpu_ticks = tc_cpu_ticks;
 
 static int vdso_th_enable = 1;
 static int
 sysctl_fast_gettime(SYSCTL_HANDLER_ARGS)
 {
 	int old_vdso_th_enable, error;
 
 	old_vdso_th_enable = vdso_th_enable;
 	error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req);
 	if (error != 0)
 		return (error);
 	vdso_th_enable = old_vdso_th_enable;
 	return (0);
 }
 SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day");
 
 uint32_t
 tc_fill_vdso_timehands(struct vdso_timehands *vdso_th)
 {
 	struct timehands *th;
 	uint32_t enabled;
 
 	th = timehands;
 	vdso_th->th_scale = th->th_scale;
 	vdso_th->th_offset_count = th->th_offset_count;
 	vdso_th->th_counter_mask = th->th_counter->tc_counter_mask;
 	vdso_th->th_offset = th->th_offset;
 	vdso_th->th_boottime = th->th_boottime;
 	if (th->th_counter->tc_fill_vdso_timehands != NULL) {
 		enabled = th->th_counter->tc_fill_vdso_timehands(vdso_th,
 		    th->th_counter);
 	} else
 		enabled = 0;
 	if (!vdso_th_enable)
 		enabled = 0;
 	return (enabled);
 }
 
 #ifdef COMPAT_FREEBSD32
 /*
  * 32-bit compat counterpart of tc_fill_vdso_timehands().  The 64-bit
  * scale and fraction values are stored through a uint64_t pointer to
  * element 0 of the corresponding vdso_timehands32 arrays.
  */
 uint32_t
 tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32)
 {
 	struct timehands *th;
 	struct timecounter *tc;
 	uint32_t enabled;
 
 	th = timehands;
 	tc = th->th_counter;
 	*(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale;
 	vdso_th32->th_offset_count = th->th_offset_count;
 	vdso_th32->th_counter_mask = tc->tc_counter_mask;
 	vdso_th32->th_offset.sec = th->th_offset.sec;
 	*(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac;
 	vdso_th32->th_boottime.sec = th->th_boottime.sec;
 	*(uint64_t *)&vdso_th32->th_boottime.frac[0] = th->th_boottime.frac;
 
 	/* The hook runs even when the sysctl has the fast path disabled. */
 	enabled = 0;
 	if (tc->tc_fill_vdso_timehands32 != NULL)
 		enabled = tc->tc_fill_vdso_timehands32(vdso_th32, tc);
 	return (vdso_th_enable ? enabled : 0);
 }
 #endif
Index: head/sys/kern/kern_umtx.c
===================================================================
--- head/sys/kern/kern_umtx.c	(revision 315279)
+++ head/sys/kern/kern_umtx.c	(revision 315280)
@@ -1,4545 +1,4575 @@
 /*-
  * Copyright (c) 2015, 2016 The FreeBSD Foundation
  * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
  * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_umtx_profiling.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
+#include <sys/time.h>
 #include <sys/eventhandler.h>
 #include <sys/umtx.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
+#include <machine/atomic.h>
 #include <machine/cpu.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_proto.h>
 #endif
 
 /*
  * Lock modes understood by do_lock_normal(): _UMUTEX_TRY fails with EBUSY
  * instead of sleeping; _UMUTEX_WAIT returns 0 as soon as the owner word
  * shows no owning thread and never acquires the mutex.
  */
 #define _UMUTEX_TRY		1
 #define _UMUTEX_WAIT		2
 
 #ifdef UMTX_PROFILING
 /* True when the pair (w, f) is lexicographically greater than (sw, sf). */
 #define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
 	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
 #endif
 
 /* Kernel-side state of one priority-inheritance mutex. */
 struct umtx_pi {
 	/* Thread that currently owns the mutex */
 	struct thread		*pi_owner;
 
 	/* Reference count */
 	int			pi_refcount;
 
 	/* Linkage on the list of PI mutexes held by a thread */
 	TAILQ_ENTRY(umtx_pi)	pi_link;
 
 	/* Linkage on the hash chain's list of PI mutexes */
 	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
 
 	/* Waiters blocked on this mutex */
 	TAILQ_HEAD(,umtx_q)	pi_blocked;
 
 	/* Key identifying the userland lock object */
 	struct umtx_key		pi_key;
 };
 
 /* One waiter on a userland synchronization object. */
 struct umtx_q {
 	/* Linkage on the per-key wait queue (umtxq_queue.head). */
 	TAILQ_ENTRY(umtx_q)	uq_link;
 
 	/* Key of the object being waited on. */
 	struct umtx_key		uq_key;
 
 	/* Flags; UQF_UMTXQ is set while the entry sits on a wait queue. */
 	int			uq_flags;
 #define UQF_UMTXQ	0x0001
 
 	/* The waiting thread. */
 	struct thread		*uq_thread;
 
 	/*
 	 * PI mutex this waiter is blocked on.  Reading requires either
 	 * the chain lock or umtx_lock; writing requires that both the
 	 * chain lock and umtx_lock are held.
 	 */
 	struct umtx_pi		*uq_pi_blocked;
 
 	/* Linkage on the PI mutex's blocked list */
 	TAILQ_ENTRY(umtx_q)	uq_lockq;
 
 	/* PI mutexes owned by this thread that other threads contend for */
 	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
 
 	/* Priority inherited from PP mutexes */
 	u_char			uq_inherited_pri;
 	
 	/*
 	 * Spare per-key queue donated to the chain on insertion and taken
 	 * back on removal; NULL while the entry is queued.
 	 */
 	struct umtxq_queue	*uq_spare_queue;
 
 	/* The per-key queue this entry is currently on, or NULL */
 	struct umtxq_queue	*uq_cur_queue;
 };
 
 /* Tail queue of waiters. */
 TAILQ_HEAD(umtxq_head, umtx_q);
 
 /* Per-key wait queue: all waiters on one userland object. */
 struct umtxq_queue {
 	struct umtxq_head	head;	/* waiters, in arrival order */
 	struct umtx_key		key;	/* object the waiters sleep on */
 	LIST_ENTRY(umtxq_queue)	link;	/* linkage on a chain list */
 	int			length;	/* number of entries on head */
 };
 
 /* List of per-key wait queues. */
 LIST_HEAD(umtxq_list, umtxq_queue);
 
 /* One hash bucket: the wait queues of every key hashing to this chain. */
 struct umtxq_chain {
 	/* Lock for this chain. */
 	struct mtx		uc_lock;
 
 	/* Per-key wait queues, indexed by queue kind. */
 	struct umtxq_list	uc_queue[2];
 #define UMTX_SHARED_QUEUE	0
 #define UMTX_EXCLUSIVE_QUEUE	1
 
 	/* Spare umtxq_queue structures donated by queued waiters. */
 	LIST_HEAD(, umtxq_queue) uc_spare_queue;
 
 	/* Busy flag, set by umtxq_busy() and cleared by umtxq_unbusy(). */
 	char			uc_busy;
 
 	/* Number of threads sleeping in umtxq_busy() on this chain. */
 	int			uc_waiters;
 
 	/* All PI mutexes hashed to this chain. */
 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
 
 #ifdef UMTX_PROFILING
 	/* Current and peak number of per-key queues on this chain. */
 	u_int 			length;
 	u_int			max_length;
 #endif
 };
 
 /* Assert that the current thread holds the chain lock. */
 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
 
 /*
  * Do not propagate time-sharing priorities, for a security reason: a
  * user could create a PI mutex, let thread A lock it and let thread B
  * block on it.  Because B is sleeping its priority is boosted, and
  * priority propagation would then boost A as well; A's priority would
  * never be lowered again even if it used 100% of the CPU, which is
  * unfair to other processes.
  *
  * UPRI() therefore maps every time-sharing user priority to
  * PRI_MAX_TIMESHARE and returns any other user priority unchanged.
  */
 
 #define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
 			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
 			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
 
 /*
  * Parameters of umtxq_hash(): the key is multiplied by
  * GOLDEN_RATIO_PRIME, shifted right by UMTX_SHIFTS and reduced modulo
  * UMTX_CHAINS.  UMTX_SHIFTS keeps the top 9 bits of a word, matching the
  * default of 512 chains.
  */
 #define	GOLDEN_RATIO_PRIME	2654404609U
 #ifndef	UMTX_CHAINS
 #define	UMTX_CHAINS		512
 #endif
 #define	UMTX_SHIFTS		(__WORD_BIT - 9)
 
 /* Map the USYNC_PROCESS_SHARED flag to the share mode for umtx_key_get(). */
 #define	GET_SHARE(flags)	\
     (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
 
 /* Upper bound on spin iterations in umtxq_busy() before sleeping. */
 #define BUSY_SPINS		200
 
 /* Deadline of a timed umtx sleep, kept as an absolute time on clockid. */
 struct abs_timeout {
 	int clockid;		/* clock the times below are read from */
+	bool is_abs_real;	/* TIMER_ABSTIME && CLOCK_REALTIME* */
 	struct timespec cur;	/* last sampled time on clockid */
 	struct timespec end;	/* absolute expiry time */
 };
 
 #ifdef COMPAT_FREEBSD32
 /*
  * Layout of struct umutex as used by 32-bit processes.  The static
  * assertions below check that its size and the offset of m_spare match
  * the native struct umutex.
  */
 struct umutex32 {
 	volatile __lwpid_t	m_owner;	/* Owner of the mutex */
 	__uint32_t		m_flags;	/* Flags of the mutex */
 	__uint32_t		m_ceilings[2];	/* Priority protect ceiling */
 	__uint32_t		m_rb_lnk;	/* Robust linkage */
 	__uint32_t		m_pad;
 	__uint32_t		m_spare[2];
 };
 
 _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32");
 _Static_assert(__offsetof(struct umutex, m_spare[0]) ==
     __offsetof(struct umutex32, m_spare[0]), "m_spare32");
 #endif
 
 /* kern.ipc.umtx_vnode_persistent; semantics are given by its description. */
 int umtx_shm_vnobj_persistent = 0;
 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN,
     &umtx_shm_vnobj_persistent, 0,
     "False forces destruction of umtx attached to file, on last close");
 /*
  * kern.ipc.umtx_max_robust.
  * NOTE(review): presumably a cap on robust-list entries processed; the
  * consumer is outside this view and the description string is empty --
  * confirm and document.
  */
 static int umtx_max_rb = 1000;
 SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN,
     &umtx_max_rb, 0,
     "");
 
 /* UMA zone for struct umtx_pi, created in umtxq_sysinit(). */
 static uma_zone_t		umtx_pi_zone;
 /*
  * Hash chains.  umtxq_getchain() uses set 1 for key types <= TYPE_SEM
  * and set 0 for every other type.
  */
 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
 /* Exported read-only as debug.umtx.umtx_pi_allocated. */
 static int			umtx_pi_allocated;
 
 static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
 SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
     &umtx_pi_allocated, 0, "Allocated umtx_pi");
 /*
  * debug.umtx.robust_faults_verbose.
  * NOTE(review): presumably enables diagnostics for faults while walking
  * robust lists; the consumer is outside this view -- confirm.
  */
 static int umtx_verbose_rb = 1;
 SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN,
     &umtx_verbose_rb, 0,
     "");
 
 #ifdef UMTX_PROFILING
 /* Largest per-chain max_length seen so far, see umtxq_insert_queue(). */
 static long max_length;
 SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
 static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
 #endif
 
 /* Forward declarations of file-local helpers. */
+static void abs_timeout_update(struct abs_timeout *timo);
+
 static void umtx_shm_init(void);
 static void umtxq_sysinit(void *);
 static void umtxq_hash(struct umtx_key *key);
 static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
 static void umtxq_lock(struct umtx_key *key);
 static void umtxq_unlock(struct umtx_key *key);
 static void umtxq_busy(struct umtx_key *key);
 static void umtxq_unbusy(struct umtx_key *key);
 static void umtxq_insert_queue(struct umtx_q *uq, int q);
 static void umtxq_remove_queue(struct umtx_q *uq, int q);
 static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
 static int umtxq_count(struct umtx_key *key);
 static struct umtx_pi *umtx_pi_alloc(int);
 static void umtx_pi_free(struct umtx_pi *pi);
 static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags,
     bool rb);
 static void umtx_thread_cleanup(struct thread *td);
 static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
     struct image_params *imgp __unused);
 /* Run umtxq_sysinit() once at boot. */
 SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
 
 /* Shorthands for the queue operations applied to UMTX_SHARED_QUEUE. */
 #define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
 #define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
 #define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
 
 /* Global mutex; initialised in umtxq_sysinit(). */
 static struct mtx umtx_lock;
 
 #ifdef UMTX_PROFILING
 /*
  * Create one sysctl node per chain index under debug.umtx.chains, each
  * exposing the max_length counters of both chain sets as "max_length0"
  * and "max_length1".
  */
 static void
 umtx_init_profiling(void) 
 {
 	struct sysctl_oid *node;
 	char label[10];
 	int idx;
 
 	idx = 0;
 	while (idx < UMTX_CHAINS) {
 		/* The node is named after the decimal chain index. */
 		snprintf(label, sizeof(label), "%d", idx);
 		node = SYSCTL_ADD_NODE(NULL, 
 		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, 
 		    label, CTLFLAG_RD, NULL, "umtx hash stats");
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "max_length0", CTLFLAG_RD,
 		    &umtxq_chains[0][idx].max_length, 0, NULL);
 		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(node), OID_AUTO,
 		    "max_length1", CTLFLAG_RD,
 		    &umtxq_chains[1][idx].max_length, 0, NULL);
 		++idx;
 	}
 }
 
 static int
 sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
 {
 	char buf[512];
 	struct sbuf sb;
 	struct umtxq_chain *uc;
 	u_int fract, i, j, tot, whole;
 	u_int sf0, sf1, sf2, sf3, sf4;
 	u_int si0, si1, si2, si3, si4;
 	u_int sw0, sw1, sw2, sw3, sw4;
 
 	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 	for (i = 0; i < 2; i++) {
 		tot = 0;
 		for (j = 0; j < UMTX_CHAINS; ++j) {
 			uc = &umtxq_chains[i][j];
 			mtx_lock(&uc->uc_lock);
 			tot += uc->max_length;
 			mtx_unlock(&uc->uc_lock);
 		}
 		if (tot == 0)
 			sbuf_printf(&sb, "%u) Empty ", i);
 		else {
 			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
 			si0 = si1 = si2 = si3 = si4 = 0;
 			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
 			for (j = 0; j < UMTX_CHAINS; j++) {
 				uc = &umtxq_chains[i][j];
 				mtx_lock(&uc->uc_lock);
 				whole = uc->max_length * 100;
 				mtx_unlock(&uc->uc_lock);
 				fract = (whole % tot) * 100;
 				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
 					sf0 = fract;
 					si0 = j;
 					sw0 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
 				    sf1)) {
 					sf1 = fract;
 					si1 = j;
 					sw1 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
 				    sf2)) {
 					sf2 = fract;
 					si2 = j;
 					sw2 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
 				    sf3)) {
 					sf3 = fract;
 					si3 = j;
 					sw3 = whole;
 				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
 				    sf4)) {
 					sf4 = fract;
 					si4 = j;
 					sw4 = whole;
 				}
 			}
 			sbuf_printf(&sb, "queue %u:\n", i);
 			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
 			    sf0 / tot, si0);
 			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
 			    sf1 / tot, si1);
 			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
 			    sf2 / tot, si2);
 			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
 			    sf3 / tot, si3);
 			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
 			    sf4 / tot, si4);
 		}
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (0);
 }
 
 static int
 sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
 {
 	struct umtxq_chain *uc;
 	u_int i, j;
 	int clear, error;
 
 	clear = 0;
 	error = sysctl_handle_int(oidp, &clear, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (clear != 0) {
 		for (i = 0; i < 2; ++i) {
 			for (j = 0; j < UMTX_CHAINS; ++j) {
 				uc = &umtxq_chains[i][j];
 				mtx_lock(&uc->uc_lock);
 				uc->length = 0;
 				uc->max_length = 0;	
 				mtx_unlock(&uc->uc_lock);
 			}
 		}
 	}
 	return (0);
 }
 
 /* debug.umtx.chains.clear: write non-zero to reset the chain statistics. */
 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
     sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
 /* debug.umtx.chains.peaks: read-only text report of the longest chains. */
 SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
     sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
 #endif
 
 /*
  * Boot-time initialisation, run through SYSINIT: create the umtx_pi UMA
  * zone, set up every hash chain, initialise umtx_lock, register the
  * process_exec hook and initialise the umtx shm support.
  */
 static void
 umtxq_sysinit(void *arg __unused)
 {
 	struct umtxq_chain *chain;
 	int set, slot;
 
 	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	for (set = 0; set < 2; set++) {
 		for (slot = 0; slot < UMTX_CHAINS; slot++) {
 			chain = &umtxq_chains[set][slot];
 			mtx_init(&chain->uc_lock, "umtxql", NULL,
 			    MTX_DEF | MTX_DUPOK);
 			LIST_INIT(&chain->uc_queue[0]);
 			LIST_INIT(&chain->uc_queue[1]);
 			LIST_INIT(&chain->uc_spare_queue);
 			TAILQ_INIT(&chain->uc_pi_list);
 			chain->uc_busy = 0;
 			chain->uc_waiters = 0;
 #ifdef UMTX_PROFILING
 			chain->length = 0;
 			chain->max_length = 0;
 #endif
 		}
 	}
 #ifdef UMTX_PROFILING
 	umtx_init_profiling();
 #endif
 	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	umtx_shm_init();
 }
 
 struct umtx_q *
 umtxq_alloc(void)
 {
 	struct umtx_q *uq;
 
 	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
 	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX,
 	    M_WAITOK | M_ZERO);
 	TAILQ_INIT(&uq->uq_spare_queue->head);
 	TAILQ_INIT(&uq->uq_pi_contested);
 	uq->uq_inherited_pri = PRI_MAX;
 	return (uq);
 }
 
 /*
  * Release a umtx_q and the spare queue it carries.  The entry must hold
  * a spare queue, which umtxq_remove_queue() restores on dequeue.
  */
 void
 umtxq_free(struct umtx_q *uq)
 {
 	struct umtxq_queue *spare;
 
 	spare = uq->uq_spare_queue;
 	MPASS(spare != NULL);
 	free(spare, M_UMTX);
 	free(uq, M_UMTX);
 }
 
 /*
  * Compute key->hash, the chain index for a key: the two halves of the
  * key are added, multiplied by GOLDEN_RATIO_PRIME, shifted right by
  * UMTX_SHIFTS and reduced modulo UMTX_CHAINS.
  */
 static inline void
 umtxq_hash(struct umtx_key *key)
 {
 	unsigned mixed, sum;
 
 	sum = (uintptr_t)key->info.both.a + key->info.both.b;
 	mixed = sum * GOLDEN_RATIO_PRIME;
 	key->hash = (mixed >> UMTX_SHIFTS) % UMTX_CHAINS;
 }
 
 /*
  * Return the hash chain for a key.  Key types up to and including
  * TYPE_SEM use chain set 1, every other type uses set 0.
  */
 static inline struct umtxq_chain *
 umtxq_getchain(struct umtx_key *key)
 {
 	int set;
 
 	set = (key->type <= TYPE_SEM) ? 1 : 0;
 	return (&umtxq_chains[set][key->hash]);
 }
 
 /*
  * Acquire the mutex of the chain the key hashes to.
  */
 static inline void
 umtxq_lock(struct umtx_key *key)
 {
 	struct umtxq_chain *chain = umtxq_getchain(key);
 
 	mtx_lock(&chain->uc_lock);
 }
 
 /*
  * Release the mutex of the chain the key hashes to.
  */
 static inline void
 umtxq_unlock(struct umtx_key *key)
 {
 	struct umtxq_chain *chain = umtxq_getchain(key);
 
 	mtx_unlock(&chain->uc_lock);
 }
 
 /*
  * Mark a chain busy, for use when the following operation may block
  * (so a kernel mutex cannot be held across it).
  *
  * Must be called with the chain lock held, and returns with it held and
  * uc_busy set.  While another thread has the chain busy, the caller
  * first spins for a bounded number of iterations with the chain lock
  * dropped (SMP kernels with more than one CPU only) and then sleeps on
  * the chain until umtxq_unbusy() wakes it up.
  */
 static inline void
 umtxq_busy(struct umtx_key *key)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(key);
 	mtx_assert(&uc->uc_lock, MA_OWNED);
 	if (uc->uc_busy) {
 #ifdef SMP
 		if (smp_cpus > 1) {
 			int count = BUSY_SPINS;
 			if (count > 0) {
 				/* Spin unlocked; uc_busy is re-checked below. */
 				umtxq_unlock(key);
 				while (uc->uc_busy && --count > 0)
 					cpu_spinwait();
 				umtxq_lock(key);
 			}
 		}
 #endif
 		/* msleep() drops and re-takes uc_lock around the sleep. */
 		while (uc->uc_busy) {
 			uc->uc_waiters++;
 			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
 			uc->uc_waiters--;
 		}
 	}
 	uc->uc_busy = 1;
 }
 
 /*
  * Clear the busy state of a chain and wake one thread sleeping in
  * umtxq_busy(), if any.  The chain lock must be held and the chain must
  * currently be busy.
  */
 static inline void
 umtxq_unbusy(struct umtx_key *key)
 {
 	struct umtxq_chain *chain = umtxq_getchain(key);
 
 	mtx_assert(&chain->uc_lock, MA_OWNED);
 	KASSERT(chain->uc_busy != 0, ("not busy"));
 	chain->uc_busy = 0;
 	if (chain->uc_waiters != 0)
 		wakeup_one(chain);
 }
 
 static inline void
 umtxq_unbusy_unlocked(struct umtx_key *key)
 {
 
 	umtxq_lock(key);
 	umtxq_unbusy(key);
 	umtxq_unlock(key);
 }
 
 /*
  * Find the per-key wait queue for key on queue list q of its chain.
  * The chain lock must be held.  Returns NULL when no queue matches.
  */
 static struct umtxq_queue *
 umtxq_queue_lookup(struct umtx_key *key, int q)
 {
 	struct umtxq_chain *chain;
 	struct umtxq_queue *cand;
 
 	chain = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	/* LIST_FOREACH leaves cand NULL when the list is exhausted. */
 	LIST_FOREACH(cand, &chain->uc_queue[q], link) {
 		if (umtx_key_match(&cand->key, key))
 			break;
 	}
 	return (cand);
 }
 
 /*
  * Append uq to the wait queue for its key on queue list q.  The chain
  * lock must be held and uq must not already be queued.
  *
  * The waiter always gives up its spare queue: it becomes the per-key
  * queue when none exists yet, otherwise it is parked on the chain's
  * spare list.
  */
 static inline void
 umtxq_insert_queue(struct umtx_q *uq, int q)
 {
 	struct umtxq_chain *chain;
 	struct umtxq_queue *wq;
 
 	chain = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
 	wq = umtxq_queue_lookup(&uq->uq_key, q);
 	if (wq == NULL) {
 		/* First waiter on this key: the spare becomes the queue. */
 		wq = uq->uq_spare_queue;
 		wq->key = uq->uq_key;
 		LIST_INSERT_HEAD(&chain->uc_queue[q], wq, link);
 #ifdef UMTX_PROFILING
 		chain->length++;
 		if (chain->length > chain->max_length) {
 			chain->max_length = chain->length;
 			if (chain->max_length > max_length)
 				max_length = chain->max_length;
 		}
 #endif
 	} else {
 		LIST_INSERT_HEAD(&chain->uc_spare_queue, uq->uq_spare_queue,
 		    link);
 	}
 	uq->uq_spare_queue = NULL;
 
 	TAILQ_INSERT_TAIL(&wq->head, uq, uq_link);
 	wq->length++;
 	uq->uq_flags |= UQF_UMTXQ;
 	uq->uq_cur_queue = wq;
 }
 
 /*
  * Take uq off its wait queue, if it is on one.  The chain lock must be
  * held.
  *
  * The waiter leaves with a spare queue again: the per-key queue itself
  * when uq was its last entry, otherwise one taken from the chain's
  * spare list.
  */
 static inline void
 umtxq_remove_queue(struct umtx_q *uq, int q)
 {
 	struct umtxq_chain *chain;
 	struct umtxq_queue *wq;
 
 	chain = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		return;
 
 	wq = uq->uq_cur_queue;
 	TAILQ_REMOVE(&wq->head, uq, uq_link);
 	wq->length--;
 	uq->uq_flags &= ~UQF_UMTXQ;
 	if (!TAILQ_EMPTY(&wq->head)) {
 		/* Others still wait on the key: take any spare instead. */
 		wq = LIST_FIRST(&chain->uc_spare_queue);
 		KASSERT(wq != NULL, ("uc_spare_queue is empty"));
 		LIST_REMOVE(wq, link);
 	} else {
 		KASSERT(wq->length == 0,
 		    ("inconsistent umtxq_queue length"));
 #ifdef UMTX_PROFILING
 		chain->length--;
 #endif
 		LIST_REMOVE(wq, link);
 	}
 	uq->uq_spare_queue = wq;
 	uq->uq_cur_queue = NULL;
 }
 
 /*
  * Return the number of waiters queued on the shared queue for key, or 0
  * when there is no such queue.  The chain lock must be held.
  */
 static int
 umtxq_count(struct umtx_key *key)
 {
 	struct umtxq_chain *chain;
 	struct umtxq_queue *wq;
 
 	chain = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	wq = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
 	return (wq != NULL ? wq->length : 0);
 }
 
 /*
  * Return the number of waiters queued on the shared queue for key and
  * store the first of them in *first.  When there is no queue, *first is
  * NULL and 0 is returned.  The chain lock must be held.
  */
 static int
 umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
 {
 	struct umtxq_chain *chain;
 	struct umtxq_queue *wq;
 
 	*first = NULL;
 	chain = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	wq = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
 	if (wq == NULL)
 		return (0);
 	*first = TAILQ_FIRST(&wq->head);
 	return (wq->length);
 }
 
 static int
 umtxq_check_susp(struct thread *td)
 {
 	struct proc *p;
 	int error;
 
 	/*
 	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
 	 * eventually break the lockstep loop.
 	 */
 	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
 		return (0);
 	error = 0;
 	p = td->td_proc;
 	PROC_LOCK(p);
 	if (P_SHOULDSTOP(p) ||
 	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
 		if (p->p_flag & P_SINGLE_EXIT)
 			error = EINTR;
 		else
 			error = ERESTART;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Wake up threads waiting on a userland object: waiters are dequeued
  * from the head of queue q for key and woken until n_wake of them have
  * been woken or the queue is empty.  At least one waiter is woken when
  * the queue is not empty, whatever n_wake is.
  *
  * The chain lock must be held.  Returns the number of waiters woken.
  */
 
 static int
 umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
 {
 	struct umtxq_chain *chain;
 	struct umtxq_queue *wq;
 	struct umtx_q *waiter;
 	int woken;
 
 	woken = 0;
 	chain = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	wq = umtxq_queue_lookup(key, q);
 	if (wq == NULL)
 		return (woken);
 	while ((waiter = TAILQ_FIRST(&wq->head)) != NULL) {
 		umtxq_remove_queue(waiter, q);
 		wakeup(waiter);
 		woken++;
 		if (woken >= n_wake)
 			break;
 	}
 	return (woken);
 }
 
 
 /*
  * Wake up specified thread.
  */
 static inline void
 umtxq_signal_thread(struct umtx_q *uq)
 {
 	struct umtxq_chain *uc;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	umtxq_remove(uq);
 	wakeup(uq);
 }
 
 static inline int 
 tstohz(const struct timespec *tsp)
 {
 	struct timeval tv;
 
 	TIMESPEC_TO_TIMEVAL(&tv, tsp);
 	return tvtohz(&tv);
 }
 
 /*
  * Initialise timo from a user-supplied timeout on clock clockid.
  *
  * For a relative timeout the clock is read and the expiry time is "now"
  * plus *timeout.  For an absolute timeout *timeout is the expiry time;
  * in the new revision the clock is then read here only when the clock is
  * not one of the CLOCK_REALTIME variants (is_abs_real false).
  */
 static void
 abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
 	const struct timespec *timeout)
 {
 
 	timo->clockid = clockid;
 	if (!absolute) {
-		kern_clock_gettime(curthread, clockid, &timo->end);
-		timo->cur = timo->end;
+		timo->is_abs_real = false;
+		abs_timeout_update(timo);
+		timo->end = timo->cur;
 		timespecadd(&timo->end, timeout);
 	} else {
 		timo->end = *timeout;
-		kern_clock_gettime(curthread, clockid, &timo->cur);
+		timo->is_abs_real = clockid == CLOCK_REALTIME ||
+		    clockid == CLOCK_REALTIME_FAST ||
+		    clockid == CLOCK_REALTIME_PRECISE;
+		/*
+		 * If is_abs_real, umtxq_sleep will read the clock
+		 * after setting td_rtcgen; otherwise, read it here.
+		 */
+		if (!timo->is_abs_real) {
+			abs_timeout_update(timo);
+		}
 	}
 }
 
 /*
  * Initialise timo from a struct _umtx_time: the timeout is absolute when
  * UMTX_ABSTIME is set in its flags, relative otherwise.
  */
 static void
 abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
 {
 	int absolute;
 
 	absolute = (umtxtime->_flags & UMTX_ABSTIME) != 0;
 	abs_timeout_init(timo, umtxtime->_clockid, absolute,
 	    &umtxtime->_timeout);
 }
 
 /*
  * Refresh timo->cur with the current time of the timeout's clock.
  */
 static inline void
 abs_timeout_update(struct abs_timeout *timo)
 {
 	struct timespec *now;
 
 	now = &timo->cur;
 	kern_clock_gettime(curthread, timo->clockid, now);
 }
 
 static int
 abs_timeout_gethz(struct abs_timeout *timo)
 {
 	struct timespec tts;
 
 	if (timespeccmp(&timo->end, &timo->cur, <=))
 		return (-1); 
 	tts = timo->end;
 	timespecsub(&tts, &timo->cur);
 	return (tstohz(&tts));
 }
 
 /*
  * Select the owner word to store when unlocking a mutex:
  * UMUTEX_RB_OWNERDEAD when rb is true, else UMUTEX_RB_NOTRECOV when the
  * flags carry UMUTEX_NONCONSISTENT, else UMUTEX_UNOWNED.
  */
 static uint32_t
 umtx_unlock_val(uint32_t flags, bool rb)
 {
 	uint32_t val;
 
 	val = UMUTEX_UNOWNED;
 	if ((flags & UMUTEX_NONCONSISTENT) != 0)
 		val = UMUTEX_RB_NOTRECOV;
 	if (rb)
 		val = UMUTEX_RB_OWNERDEAD;
 	return (val);
 }
 
 /*
  * Put the thread to sleep on uq until it is removed from its umtx queue,
  * the timeout expires or the sleep is interrupted.  Before every sleep
  * the function checks whether the entry was already removed.
  *
  * Must be called with the chain lock held and returns with it held;
  * msleep() drops it (PDROP) and it is re-taken after each sleep.
  * abstime may be NULL for an untimed sleep.
  *
  * Returns 0 when uq is no longer queued, ETIMEDOUT when the deadline has
  * passed, or EINTR/ERESTART from msleep().  In the new revision any other
  * msleep() result re-evaluates the queue state and the deadline.
  *
  * For absolute CLOCK_REALTIME* timeouts the new revision stores the
  * current rtc_generation in td_rtcgen before each clock read and clears
  * it on return.
  * NOTE(review): presumably this lets the clock-setting code detect and
  * wake sleepers whose deadline is affected by a clock step -- confirm
  * against the kern_tc.c part of this change.
  */
 static inline int
 umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
 {
 	struct umtxq_chain *uc;
 	int error, timo;
 
+	if (abstime != NULL && abstime->is_abs_real) {
+		curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation);
+		abs_timeout_update(abstime);
+	}
+
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	for (;;) {
-		if (!(uq->uq_flags & UQF_UMTXQ))
-			return (0);
+		if (!(uq->uq_flags & UQF_UMTXQ)) {
+			error = 0;
+			break;
+		}
 		if (abstime != NULL) {
 			timo = abs_timeout_gethz(abstime);
-			if (timo < 0)
-				return (ETIMEDOUT);
+			if (timo < 0) {
+				error = ETIMEDOUT;
+				break;
+			}
 		} else
 			timo = 0;
 		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
-		if (error != EWOULDBLOCK) {
+		if (error == EINTR || error == ERESTART) {
 			umtxq_lock(&uq->uq_key);
 			break;
 		}
-		if (abstime != NULL)
+		if (abstime != NULL) {
+			if (abstime->is_abs_real)
+				curthread->td_rtcgen =
+				    atomic_load_acq_int(&rtc_generation);
 			abs_timeout_update(abstime);
+		}
 		umtxq_lock(&uq->uq_key);
 	}
+
+	curthread->td_rtcgen = 0;
 	return (error);
 }
 
 /*
  * Convert a userspace address into a unique logical address (key).
  *
  * With THREAD_SHARE the key is private: the current process's vmspace
  * plus the address.  With PROCESS_SHARE, or with AUTO_SHARE when the map
  * entry is inherited shared, the key is the backing VM object plus the
  * offset within it, and a reference on the object is taken; it is
  * dropped by umtx_key_release().  AUTO_SHARE on any other entry yields a
  * private key.
  *
  * Returns 0 on success, or EFAULT when the address cannot be looked up
  * in the process map with write permission.
  */
 int
 umtx_key_get(const void *addr, int type, int share, struct umtx_key *key)
 {
 	struct thread *td = curthread;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 
 	key->type = type;
 	if (share == THREAD_SHARE) {
 		key->shared = 0;
 		key->info.private.vs = td->td_proc->p_vmspace;
 		key->info.private.addr = (uintptr_t)addr;
 	} else {
 		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
 		map = &td->td_proc->p_vmspace->vm_map;
 		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
 		    &entry, &key->info.shared.object, &pindex, &prot,
 		    &wired) != KERN_SUCCESS) {
 			return (EFAULT);
 		}
 
 		if ((share == PROCESS_SHARE) ||
 		    (share == AUTO_SHARE &&
 		     VM_INHERIT_SHARE == entry->inheritance)) {
 			key->shared = 1;
 			/* Offset of addr within the backing object. */
 			key->info.shared.offset = (vm_offset_t)addr -
 			    entry->start + entry->offset;
 			vm_object_reference(key->info.shared.object);
 		} else {
 			key->shared = 0;
 			key->info.private.vs = td->td_proc->p_vmspace;
 			key->info.private.addr = (uintptr_t)addr;
 		}
 		/* Pairs with the successful vm_map_lookup() above. */
 		vm_map_lookup_done(map, entry);
 	}
 
 	umtxq_hash(key);
 	return (0);
 }
 
 /*
  * Release a key obtained from umtx_key_get(): shared keys drop their
  * reference on the VM object, private keys need no cleanup.
  */
 void
 umtx_key_release(struct umtx_key *key)
 {
 	if (!key->shared)
 		return;
 	vm_object_deallocate(key->info.shared.object);
 }
 
 /*
  * Fetch and compare a value; sleep on the address if the value has not
  * changed.
  *
  * The word at addr is read (32 bits wide when compat32 is non-zero) and
  * compared with id.  The thread is put on the wait queue before the
  * fetch.  is_private selects a process-private key instead of an
  * automatically chosen share mode.  timeout may be NULL.
  *
  * Returns 0 when the value differs or the thread was taken off the queue
  * by a wakeup, EFAULT when the word cannot be read, the error from
  * umtx_key_get(), or the error from umtxq_sleep() with ERESTART turned
  * into EINTR.
  */
 static int
 do_wait(struct thread *td, void *addr, u_long id,
     struct _umtx_time *timeout, int compat32, int is_private)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	u_long tmp;
 	uint32_t tmp32;
 	int error = 0;
 
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/* Queue first; the user word is fetched with the lock dropped. */
 	umtxq_lock(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	if (compat32 == 0) {
 		error = fueword(addr, &tmp);
 		if (error != 0)
 			error = EFAULT;
 	} else {
 		error = fueword32(addr, &tmp32);
 		if (error == 0)
 			tmp = tmp32;
 		else
 			error = EFAULT;
 	}
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
 		if (tmp == id)
 			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
 			    NULL : &timo);
 		/* Being dequeued by a waker counts as success. */
 		if ((uq->uq_flags & UQF_UMTXQ) == 0)
 			error = 0;
 		else
 			umtxq_remove(uq);
 	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 		umtxq_remove(uq);
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Wake up threads sleeping on the specified address.
  */
 int
 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
 {
 	struct umtx_key key;
 	int ret;
 	
 	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
 	    is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
 		return (ret);
 	umtxq_lock(&key);
 	umtxq_signal(&key, n_wake);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (0);
 }
 
 /*
  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
  *
  * mode selects what happens when the mutex cannot be taken at once:
  * _UMUTEX_TRY returns EBUSY, _UMUTEX_WAIT only waits until the owner word
  * shows no owning thread (and never acquires the mutex), any other value
  * sleeps and retries.  timeout, when not NULL, bounds the sleep.
  *
  * Returns 0 on success, EOWNERDEAD when a robust mutex whose owner
  * terminated was acquired, ENOTRECOVERABLE, EBUSY, EFAULT when a user
  * memory access fails, or the error from umtx_key_get(), umtxq_sleep()
  * or umtxq_check_susp().
  */
 static int
 do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int mode)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t owner, old, id;
 	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	error = 0;
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		rv = fueword32(&m->m_owner, &owner);
 		if (rv == -1)
 			return (EFAULT);
 		if (mode == _UMUTEX_WAIT) {
 			if (owner == UMUTEX_UNOWNED ||
 			    owner == UMUTEX_CONTESTED ||
 			    owner == UMUTEX_RB_OWNERDEAD ||
 			    owner == UMUTEX_RB_NOTRECOV)
 				return (0);
 		} else {
 			/*
 			 * Robust mutex terminated.  Kernel duty is to
 			 * return EOWNERDEAD to the userspace.  The
 			 * umutex.m_flags UMUTEX_NONCONSISTENT is set
 			 * by the common userspace code.
 			 */
 			if (owner == UMUTEX_RB_OWNERDEAD) {
 				rv = casueword32(&m->m_owner,
 				    UMUTEX_RB_OWNERDEAD, &owner,
 				    id | UMUTEX_CONTESTED);
 				if (rv == -1)
 					return (EFAULT);
 				if (owner == UMUTEX_RB_OWNERDEAD)
 					return (EOWNERDEAD); /* success */
 				rv = umtxq_check_susp(td);
 				if (rv != 0)
 					return (rv);
 				continue;
 			}
 			if (owner == UMUTEX_RB_NOTRECOV)
 				return (ENOTRECOVERABLE);
 
 
 			/*
 			 * Try the uncontested case.  This should be
 			 * done in userland.
 			 */
 			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
 			    &owner, id);
 			/* The address was invalid. */
 			if (rv == -1)
 				return (EFAULT);
 
 			/* The acquire succeeded. */
 			if (owner == UMUTEX_UNOWNED)
 				return (0);
 
 			/*
 			 * If no one owns it but it is contested try
 			 * to acquire it.
 			 */
 			if (owner == UMUTEX_CONTESTED) {
 				rv = casueword32(&m->m_owner,
 				    UMUTEX_CONTESTED, &owner,
 				    id | UMUTEX_CONTESTED);
 				/* The address was invalid. */
 				if (rv == -1)
 					return (EFAULT);
 
 				if (owner == UMUTEX_CONTESTED)
 					return (0);
 
 				rv = umtxq_check_susp(td);
 				if (rv != 0)
 					return (rv);
 
 				/*
 				 * If this failed the lock has
 				 * changed, restart.
 				 */
 				continue;
 			}
 		}
 
 		if (mode == _UMUTEX_TRY)
 			return (EBUSY);
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			return (error);
 
 		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
 		    GET_SHARE(flags), &uq->uq_key)) != 0)
 			return (error);
 
 		/*
 		 * Queue ourselves and keep the chain busy while the
 		 * contested bit is set with the chain lock dropped.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		rv = casueword32(&m->m_owner, owner, &old,
 		    owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		if (old == owner)
 			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
 			    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 
 		if (error == 0)
 			error = umtxq_check_susp(td);
 	}
 
 	/* Not reached: the loop above only exits through return. */
 	return (0);
 }
 
 /*
  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
  *
  * The value stored in the owner word is chosen by umtx_unlock_val() from
  * flags and rb; UMUTEX_CONTESTED is kept in it when more than one thread
  * is still waiting.  One waiter is woken.
  *
  * Returns 0 on success, EPERM when the calling thread does not own the
  * mutex, EFAULT when a user memory access fails, EINVAL when the owner
  * word changed under the contested unlock, or the error from
  * umtx_key_get().
  */
 static int
 do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	uint32_t owner, old, id, newlock;
 	int error, count;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	newlock = umtx_unlock_val(flags, rb);
 	/* Uncontested: a single compare-and-set releases the mutex. */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		error = casueword32(&m->m_owner, owner, &old, newlock);
 		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 	if (count > 1)
 		newlock |= UMUTEX_CONTESTED;
 	error = casueword32(&m->m_owner, owner, &old, newlock);
 	/* Wake a waiter and unbusy the chain even if the store failed. */
 	umtxq_lock(&key);
 	umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Check if the mutex is available and wake up a waiter; only for simple
  * mutexes.
  *
  * Nothing is done while a thread owns the mutex.  With at most one
  * waiter and an owner word that is not one of the robust states, the
  * owner word is switched from UMUTEX_CONTESTED to UMUTEX_UNOWNED.  One
  * waiter is then woken if there is any and the mutex is still without
  * an owning thread.
  *
  * Returns 0, EFAULT when a user memory access fails, or the error from
  * umtx_key_get().
  */
 static int
 do_wake_umutex(struct thread *td, struct umutex *m)
 {
 	struct umtx_key key;
 	uint32_t owner;
 	uint32_t flags;
 	int error;
 	int count;
 
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	/* Owned by some thread: its unlock will do the wakeup. */
 	if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD &&
 	    owner != UMUTEX_RB_NOTRECOV)
 		return (0);
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
 	if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD &&
 	    owner != UMUTEX_RB_NOTRECOV) {
 		/* owner is updated with the value found in memory. */
 		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    UMUTEX_UNOWNED);
 		if (error == -1)
 			error = EFAULT;
 	}
 
 	umtxq_lock(&key);
 	if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Check if the mutex has waiters and tries to fix contention bit.
  *
  * 'flags' selects the key type: the combination of UMUTEX_PRIO_INHERIT,
  * UMUTEX_PRIO_PROTECT and UMUTEX_ROBUST is mapped to one of the
  * TYPE_*_UMUTEX key types; setting both priority protocols yields EINVAL.
  *
  * With more than one waiter queued, UMUTEX_CONTESTED is set in the owner
  * word if it is clear.  With exactly one waiter it is set only while the
  * owner word (ignoring the contested bit) is non-zero.  With no waiters
  * userland memory is not accessed at all.
  *
  * Afterwards all waiters are woken if the owner word faulted; otherwise
  * one waiter is woken when somebody is queued and the last owner value
  * seen is unowned or one of the robust-dead markers.
  *
  * Returns 0, EINVAL, EFAULT, the error from umtx_key_get(), or the
  * error from umtxq_check_susp().
  */
 static int
 do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
 {
 	struct umtx_key key;
 	uint32_t owner, old;
 	int type;
 	int error;
 	int count;
 
 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT |
 	    UMUTEX_ROBUST)) {
 	case 0:
 	case UMUTEX_ROBUST:
 		type = TYPE_NORMAL_UMUTEX;
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		type = TYPE_PI_UMUTEX;
 		break;
 	case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST):
 		type = TYPE_PI_ROBUST_UMUTEX;
 		break;
 	case UMUTEX_PRIO_PROTECT:
 		type = TYPE_PP_UMUTEX;
 		break;
 	case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST):
 		type = TYPE_PP_ROBUST_UMUTEX;
 		break;
 	default:
 		return (EINVAL);
 	}
 	if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0)
 		return (error);
 
 	/*
 	 * 'owner' stays 0 when no waiter is queued; the wakeup test at the
 	 * end is then skipped because of its count != 0 clause.
 	 */
 	owner = 0;
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 	/*
 	 * Only repair contention bit if there is a waiter, this means the mutex
 	 * is still being referenced by userland code, otherwise don't update
 	 * any memory.
 	 */
 	if (count > 1) {
 		/* Several waiters: the contested bit must be set. */
 		error = fueword32(&m->m_owner, &owner);
 		if (error == -1)
 			error = EFAULT;
 		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
 			error = casueword32(&m->m_owner, owner, &old,
 			    owner | UMUTEX_CONTESTED);
 			if (error == -1) {
 				error = EFAULT;
 				break;
 			}
 			/* The CAS saw the expected value: bit is now set. */
 			if (old == owner)
 				break;
 			/* The word changed under us: retry with new value. */
 			owner = old;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 	} else if (count == 1) {
 		/*
 		 * A single waiter: set the contested bit only while the
 		 * owner word is non-zero once the contested bit is masked.
 		 */
 		error = fueword32(&m->m_owner, &owner);
 		if (error == -1)
 			error = EFAULT;
 		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
 		    (owner & UMUTEX_CONTESTED) == 0) {
 			error = casueword32(&m->m_owner, owner, &old,
 			    owner | UMUTEX_CONTESTED);
 			if (error == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (old == owner)
 				break;
 			owner = old;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 	}
 	umtxq_lock(&key);
 	if (error == EFAULT) {
 		/* The lock word is inaccessible: release every waiter. */
 		umtxq_signal(&key, INT_MAX);
 	} else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 ||
 	    owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV))
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 static inline struct umtx_pi *
 umtx_pi_alloc(int flags)
 {
 	struct umtx_pi *pi;
 
 	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
 	TAILQ_INIT(&pi->pi_blocked);
 	atomic_add_int(&umtx_pi_allocated, 1);
 	return (pi);
 }
 
 /*
  * Return a umtx_pi to umtx_pi_zone and decrement the umtx_pi_allocated
  * counter that umtx_pi_alloc() increments.
  */
 static inline void
 umtx_pi_free(struct umtx_pi *pi)
 {
 	uma_zfree(umtx_pi_zone, pi);
 	atomic_add_int(&umtx_pi_allocated, -1);
 }
 
 /*
  * Adjust the thread's position on a pi_state after its priority has been
  * changed.
  */
 static int
 umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
 {
 	struct umtx_q *uq, *uq1, *uq2;
 	struct thread *td1;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi == NULL)
 		return (0);
 
 	uq = td->td_umtxq;
 
 	/*
 	 * Check if the thread needs to be moved on the blocked chain.
 	 * It needs to be moved if either its priority is lower than
 	 * the previous thread or higher than the next thread.
 	 */
 	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
 	uq2 = TAILQ_NEXT(uq, uq_lockq);
 	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
 	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
 		/*
 		 * Remove thread from blocked chain and determine where
 		 * it should be moved to.
 		 */
 		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 			td1 = uq1->uq_thread;
 			MPASS(td1->td_proc->p_magic == P_MAGIC);
 			if (UPRI(td1) > UPRI(td))
 				break;
 		}
 
 		if (uq1 == NULL)
 			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 		else
 			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	}
 	return (1);
 }
 
 /*
  * Follow one link of the PI chain: return the PI mutex that the owner
  * of 'pi' is itself blocked on, or NULL when 'pi' has no owner, the
  * owner has no umtx queue entry, or the owner is not blocked.
  */
 static struct umtx_pi *
 umtx_pi_next(struct umtx_pi *pi)
 {
 	struct umtx_q *holder_uq;
 
 	holder_uq = (pi->pi_owner != NULL) ? pi->pi_owner->td_umtxq : NULL;
 	return (holder_uq != NULL ? holder_uq->uq_pi_blocked : NULL);
 }
 
 /*
  * Detect a cycle in the chain of PI mutexes reachable from 'pi' through
  * umtx_pi_next(), using Floyd's cycle-finding algorithm: one iterator
  * advances a single link per round, the other advances two.
  *
  * Must be called with umtx_lock held.  Returns true when the two
  * iterators meet, false when either reaches the end of the chain or
  * 'pi' is NULL.
  */
 static bool
 umtx_pi_check_loop(struct umtx_pi *pi)
 {
 	struct umtx_pi *hare;	/* advances two links per round */
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi == NULL)
 		return (false);
 
 	hare = pi;
 	while ((pi = umtx_pi_next(pi)) != NULL) {
 		if ((hare = umtx_pi_next(hare)) == NULL ||
 		    (hare = umtx_pi_next(hare)) == NULL)
 			break;
 		if (hare == pi)
 			return (true);
 	}
 	return (false);
 }
 
 /*
  * Lend the priority of a thread blocked on a POSIX PI mutex to the
  * owner of that mutex, and onwards along the chain of owners that are
  * themselves blocked on PI mutexes.
  *
  * Must be called with umtx_lock held.  The walk is skipped when the
  * chain contains a cycle, and stops at an unowned mutex, at curthread,
  * or at the first owner whose lent priority is already at least as
  * strong.
  */
 static void
 umtx_propagate_priority(struct thread *td)
 {
 	struct umtx_pi *pi;
 	int pri;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	pri = UPRI(td);
 	pi = td->td_umtxq->uq_pi_blocked;
 	if (pi == NULL || umtx_pi_check_loop(pi))
 		return;
 
 	do {
 		td = pi->pi_owner;
 		if (td == NULL || td == curthread)
 			return;
 
 		MPASS(td->td_proc != NULL);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
 
 		thread_lock(td);
 		if (td->td_lend_user_pri <= pri) {
 			/* Nothing to gain for this owner: stop here. */
 			thread_unlock(td);
 			return;
 		}
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 
 		/*
 		 * Continue with the mutex the owner is blocked on, if any,
 		 * re-sorting the owner on that mutex's blocked queue.
 		 */
 		pi = td->td_umtxq->uq_pi_blocked;
 		if (pi != NULL)
 			umtx_pi_adjust_thread(pi, td);
 	} while (pi != NULL);
 }
 
 /*
  * Recompute the priority lent to the owner of a PI mutex after a thread
  * blocked on it was interrupted by a signal or resumed by others, and
  * repeat along the chain of mutexes the successive owners are blocked
  * on.
  *
  * Must be called with umtx_lock held.  Nothing is done when the chain
  * contains a cycle.
  */
 static void
 umtx_repropagate_priority(struct umtx_pi *pi)
 {
 	struct umtx_q *head_waiter, *uq_owner;
 	struct umtx_pi *held;
 	int lend;
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 
 	if (umtx_pi_check_loop(pi))
 		return;
 	while (pi != NULL && pi->pi_owner != NULL) {
 		uq_owner = pi->pi_owner->td_umtxq;
 
 		/*
 		 * The lent priority is the strongest (numerically lowest)
 		 * among the first waiter of every contested PI mutex the
 		 * owner holds, capped by the owner's inherited priority.
 		 */
 		lend = PRI_MAX;
 		TAILQ_FOREACH(held, &uq_owner->uq_pi_contested, pi_link) {
 			head_waiter = TAILQ_FIRST(&held->pi_blocked);
 			if (head_waiter != NULL &&
 			    lend > UPRI(head_waiter->uq_thread))
 				lend = UPRI(head_waiter->uq_thread);
 		}
 		if (lend > uq_owner->uq_inherited_pri)
 			lend = uq_owner->uq_inherited_pri;
 
 		thread_lock(pi->pi_owner);
 		sched_lend_user_prio(pi->pi_owner, lend);
 		thread_unlock(pi->pi_owner);
 
 		/* Move on to the mutex the owner itself is blocked on. */
 		pi = uq_owner->uq_pi_blocked;
 		if (pi != NULL)
 			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
 	}
 }
 
 /*
  * Record 'owner' as the owner of a PI mutex and append the mutex to the
  * owner's list of contested PI mutexes.
  *
  * Must be called with umtx_lock held.  Panics if the mutex already has
  * an owner.
  */
 static void
 umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *owner_uq;
 
 	owner_uq = owner->td_umtxq;
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi->pi_owner == NULL) {
 		pi->pi_owner = owner;
 		TAILQ_INSERT_TAIL(&owner_uq->uq_pi_contested, pi, pi_link);
 	} else {
 		panic("pi_owner != NULL");
 	}
 }
 
 
 /*
  * Disown a PI mutex, and remove it from the owned list.
  */
 static void
 umtx_pi_disown(struct umtx_pi *pi)
 {
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
 	pi->pi_owner = NULL;
 }
 
 /*
  * Claim ownership of a PI mutex.
  */
 static int
 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
 {
 	struct umtx_q *uq;
 	int pri;
 
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == owner) {
 		mtx_unlock(&umtx_lock);
 		return (0);
 	}
 
 	if (pi->pi_owner != NULL) {
 		/*
 		 * userland may have already messed the mutex, sigh.
 		 */
 		mtx_unlock(&umtx_lock);
 		return (EPERM);
 	}
 	umtx_pi_setowner(pi, owner);
 	uq = TAILQ_FIRST(&pi->pi_blocked);
 	if (uq != NULL) {
 		pri = UPRI(uq->uq_thread);
 		thread_lock(owner);
 		if (pri < UPRI(owner))
 			sched_lend_user_prio(owner, pri);
 		thread_unlock(owner);
 	}
 	mtx_unlock(&umtx_lock);
 	return (0);
 }
 
 /*
  * Re-sort a thread on the blocked queue of the PI mutex it is waiting
  * for and recompute the priority lent along that mutex's owner chain.
  * Does nothing when the thread is not blocked on a PI mutex.
  *
  * 'oldpri' is accepted but not used by this implementation.
  */
 void
 umtx_pi_adjust(struct thread *td, u_char oldpri)
 {
 	struct umtx_q *self_uq;
 	struct umtx_pi *blocked_on;
 
 	self_uq = td->td_umtxq;
 	mtx_lock(&umtx_lock);
 	/* The PI mutex this thread is currently blocked on, if any. */
 	blocked_on = self_uq->uq_pi_blocked;
 	if (blocked_on == NULL) {
 		mtx_unlock(&umtx_lock);
 		return;
 	}
 	umtx_pi_adjust_thread(blocked_on, td);
 	umtx_repropagate_priority(blocked_on);
 	mtx_unlock(&umtx_lock);
 }
 
 /*
  * Sleep on a PI mutex.
  *
  * uq     - queue entry of the calling thread (asserted to be curthread).
  * pi     - the PI mutex record to block on.
  * owner  - thread id taken from the userland lock word; used to look up
  *          the owning thread when 'pi' has no owner recorded yet.
  * wmesg  - wait message passed to umtxq_sleep().
  * timo   - timeout passed to umtxq_sleep(), may be NULL.
  * shared - when true the owner is searched in any process, otherwise
  *          only in the caller's process.
  *
  * Entered with the umtx chain locked and busy (both asserted).  The busy
  * state is released before sleeping and the chain lock is released
  * before returning.  Returns the result of umtxq_sleep().
  */
 static int
 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner,
     const char *wmesg, struct abs_timeout *timo, bool shared)
 {
 	struct umtxq_chain *uc;
 	struct thread *td, *td1;
 	struct umtx_q *uq1;
 	int error, pri;
 
 	error = 0;
 	td = uq->uq_thread;
 	KASSERT(td == curthread, ("inconsistent uq_thread"));
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
 	umtxq_insert(uq);
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == NULL) {
 		/*
 		 * No owner recorded yet: look the thread up by id.
 		 * umtx_lock is dropped around tdfind() and the owner field
 		 * is re-checked afterwards.  tdfind() apparently returns
 		 * with the process locked, hence the PROC_UNLOCK() below.
 		 */
 		mtx_unlock(&umtx_lock);
 		td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid);
 		mtx_lock(&umtx_lock);
 		if (td1 != NULL) {
 			if (pi->pi_owner == NULL)
 				umtx_pi_setowner(pi, td1);
 			PROC_UNLOCK(td1->td_proc);
 		}
 	}
 
 	/*
 	 * Insert ourselves into the blocked queue in front of the first
 	 * entry with a numerically greater UPRI() value, or at the tail.
 	 */
 	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
 		pri = UPRI(uq1->uq_thread);
 		if (pri > UPRI(td))
 			break;
 	}
 
 	if (uq1 != NULL)
 		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
 	else
 		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
 
 	uq->uq_pi_blocked = pi;
 	thread_lock(td);
 	td->td_flags |= TDF_UPIBLOCKED;
 	thread_unlock(td);
 	umtx_propagate_priority(td);
 	mtx_unlock(&umtx_lock);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, wmesg, timo);
 	umtxq_remove(uq);
 
 	/*
 	 * Undo the blocking state and let the owner chain drop any
 	 * priority it was lent on our behalf.
 	 */
 	mtx_lock(&umtx_lock);
 	uq->uq_pi_blocked = NULL;
 	thread_lock(td);
 	td->td_flags &= ~TDF_UPIBLOCKED;
 	thread_unlock(td);
 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 	umtx_repropagate_priority(pi);
 	mtx_unlock(&umtx_lock);
 	umtxq_unlock(&uq->uq_key);
 
 	return (error);
 }
 
 /*
  * Take one more reference on a PI mutex record.  The umtx chain that
  * the record's key hashes to must be locked by the caller.
  */
 static void
 umtx_pi_ref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *chain;
 
 	chain = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	pi->pi_refcount += 1;
 }
 
 /*
  * Drop one reference on a PI mutex record.  When the last reference
  * goes away the record is disowned if it still has an owner, unlinked
  * from the chain's list of PI records, and freed.
  *
  * The umtx chain that the record's key hashes to must be locked by the
  * caller.
  */
 static void
 umtx_pi_unref(struct umtx_pi *pi)
 {
 	struct umtxq_chain *chain;
 
 	chain = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
 
 	pi->pi_refcount--;
 	if (pi->pi_refcount != 0)
 		return;
 
 	/* Last reference: tear the record down. */
 	mtx_lock(&umtx_lock);
 	if (pi->pi_owner != NULL)
 		umtx_pi_disown(pi);
 	KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
 		("blocked queue not empty"));
 	mtx_unlock(&umtx_lock);
 	TAILQ_REMOVE(&chain->uc_pi_list, pi, pi_hashlink);
 	umtx_pi_free(pi);
 }
 
 /*
  * Look up the PI mutex record for 'key' on the chain the key hashes to.
  * Returns the matching record, or NULL when there is none.  The chain
  * must be locked by the caller.
  */
 static struct umtx_pi *
 umtx_pi_lookup(struct umtx_key *key)
 {
 	struct umtxq_chain *chain;
 	struct umtx_pi *cand;
 
 	chain = umtxq_getchain(key);
 	UMTXQ_LOCKED_ASSERT(chain);
 
 	/* TAILQ_FOREACH leaves 'cand' NULL when the list is exhausted. */
 	TAILQ_FOREACH(cand, &chain->uc_pi_list, pi_hashlink) {
 		if (umtx_key_match(&cand->pi_key, key))
 			break;
 	}
 	return (cand);
 }
 
 /*
  * Append a PI mutex record to the list of PI records kept on the umtx
  * chain its key hashes to.  The chain must be locked by the caller.
  */
 static inline void
 umtx_pi_insert(struct umtx_pi *pi)
 {
 	struct umtxq_chain *chain;
 
 	chain = umtxq_getchain(&pi->pi_key);
 	UMTXQ_LOCKED_ASSERT(chain);
 	TAILQ_INSERT_TAIL(&chain->uc_pi_list, pi, pi_hashlink);
 }
 
 /*
  * Lock a PI mutex.
  *
  * td      - the calling thread; its tid is the value stored in the lock
  *           word on success.
  * m       - userland mutex.
  * flags   - mutex flags; UMUTEX_ROBUST selects the robust key type and
  *           USYNC_PROCESS_SHARED controls how the owner is looked up
  *           when sleeping.
  * timeout - optional timeout for the sleep, may be NULL.
  * try     - when non-zero, return EBUSY instead of sleeping.
  *
  * A umtx_pi record for the key is looked up or created and referenced
  * for the duration of the call.  Returns 0 on success, EOWNERDEAD when
  * the lock was taken over from UMUTEX_RB_OWNERDEAD, ENOTRECOVERABLE,
  * EDEADLK when the caller already owns the mutex, EBUSY, EFAULT, EPERM
  * from umtx_pi_claim(), or an error from umtx_key_get(), the sleep, or
  * umtxq_check_susp().
  */
 static int
 do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int try)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	struct umtx_pi *pi, *new_pi;
 	uint32_t id, old_owner, owner, old;
 	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Find the umtx_pi record for this key, or create one.  A
 	 * non-sleeping allocation is tried first with the chain locked;
 	 * if that fails the chain lock is dropped for a sleeping
 	 * allocation and the lookup is repeated, since another thread may
 	 * have inserted a record in the meantime.
 	 */
 	umtxq_lock(&uq->uq_key);
 	pi = umtx_pi_lookup(&uq->uq_key);
 	if (pi == NULL) {
 		new_pi = umtx_pi_alloc(M_NOWAIT);
 		if (new_pi == NULL) {
 			umtxq_unlock(&uq->uq_key);
 			new_pi = umtx_pi_alloc(M_WAITOK);
 			umtxq_lock(&uq->uq_key);
 			pi = umtx_pi_lookup(&uq->uq_key);
 			if (pi != NULL) {
 				umtx_pi_free(new_pi);
 				new_pi = NULL;
 			}
 		}
 		if (new_pi != NULL) {
 			new_pi->pi_key = uq->uq_key;
 			umtx_pi_insert(new_pi);
 			pi = new_pi;
 		}
 	}
 	umtx_pi_ref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure.  It
 	 * can fault on any access.
 	 */
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
 		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
 		/* The address was invalid. */
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED) {
 			error = 0;
 			break;
 		}
 
 		if (owner == UMUTEX_RB_NOTRECOV) {
 			error = ENOTRECOVERABLE;
 			break;
 		}
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) {
 			old_owner = owner;
 			rv = casueword32(&m->m_owner, owner, &owner,
 			    id | UMUTEX_CONTESTED);
 			/* The address was invalid. */
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 
 			if (owner == old_owner) {
 				/*
 				 * The lock word is ours; record the
 				 * ownership in the umtx_pi as well.
 				 */
 				umtxq_lock(&uq->uq_key);
 				umtxq_busy(&uq->uq_key);
 				error = umtx_pi_claim(pi, td);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
 				if (error != 0) {
 					/*
 					 * Since we're going to return an
 					 * error, restore the m_owner to its
 					 * previous, unowned state to avoid
 					 * compounding the problem.
 					 */
 					(void)casuword32(&m->m_owner,
 					    id | UMUTEX_CONTESTED,
 					    old_owner);
 				}
 				if (error == 0 &&
 				    old_owner == UMUTEX_RB_OWNERDEAD)
 					error = EOWNERDEAD;
 				break;
 			}
 
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
 		/* The lock word already carries our own thread id. */
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 			
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
 		rv = casueword32(&m->m_owner, owner, &old, owner |
 		    UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 
 		umtxq_lock(&uq->uq_key);
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.  Note that the UMUTEX_RB_OWNERDEAD
 		 * value for owner is impossible there.
 		 */
 		if (old == owner) {
 			/*
 			 * umtxq_sleep_pi() releases both the busy state and
 			 * the chain lock.  A failed sleep goes around the
 			 * loop once more with 'error' still set, so the
 			 * lock is retried once before the error is returned.
 			 */
 			error = umtxq_sleep_pi(uq, pi,
 			    owner & ~UMUTEX_CONTESTED,
 			    "umtxpi", timeout == NULL ? NULL : &timo,
 			    (flags & USYNC_PROCESS_SHARED) != 0);
 			if (error != 0)
 				continue;
 		} else {
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 		}
 
 		error = umtxq_check_susp(td);
 		if (error != 0)
 			break;
 	}
 
 	/* Drop the reference taken on the umtx_pi record above. */
 	umtxq_lock(&uq->uq_key);
 	umtx_pi_unref(pi);
 	umtxq_unlock(&uq->uq_key);
 
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PI mutex.
  *
  * td    - the calling thread, which must be the owner named in the lock
  *         word.
  * m     - userland mutex.
  * flags - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * rb    - passed to umtx_unlock_val() to choose the value stored in the
  *         lock word; it also permits unlocking when the umtx_pi has no
  *         owner recorded.
  *
  * Returns 0 on success, EPERM when the caller does not own the mutex,
  * EFAULT when the lock word cannot be accessed, EINVAL when the lock
  * word changed before the final store, or the error from
  * umtx_key_get().
  */
 static int
 do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	struct umtx_q *uq_first, *uq_first2, *uq_me;
 	struct umtx_pi *pi, *pi2;
 	uint32_t id, new_owner, old, owner;
 	int count, error, pri;
 
 	id = td->td_tid;
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	new_owner = umtx_unlock_val(flags, rb);
 
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
 		error = casueword32(&m->m_owner, owner, &old, new_owner);
 		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
 		/* The word changed under us; continue on the slow path. */
 		owner = old;
 	}
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	count = umtxq_count_pi(&key, &uq_first);
 	if (uq_first != NULL) {
 		mtx_lock(&umtx_lock);
 		pi = uq_first->uq_pi_blocked;
 		KASSERT(pi != NULL, ("pi == NULL?"));
 		if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) {
 			mtx_unlock(&umtx_lock);
 			umtxq_unbusy(&key);
 			umtxq_unlock(&key);
 			umtx_key_release(&key);
 			/* userland messed the mutex */
 			return (EPERM);
 		}
 		uq_me = td->td_umtxq;
 		if (pi->pi_owner == td)
 			umtx_pi_disown(pi);
 		/* get highest priority thread which is still sleeping. */
 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
 		while (uq_first != NULL && 
 		    (uq_first->uq_flags & UQF_UMTXQ) == 0) {
 			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
 		}
 		/*
 		 * Recompute the priority lent to us from the first waiter
 		 * of each PI mutex we still own.
 		 */
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
 			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
 			if (uq_first2 != NULL) {
 				if (pri > UPRI(uq_first2->uq_thread))
 					pri = UPRI(uq_first2->uq_thread);
 			}
 		}
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 		if (uq_first)
 			umtxq_signal_thread(uq_first);
 	} else {
 		pi = umtx_pi_lookup(&key);
 		/*
 		 * A umtx_pi can exist if a signal or timeout removed the
 		 * last waiter from the umtxq, but there is still
 		 * a thread in do_lock_pi() holding the umtx_pi.
 		 */
 		if (pi != NULL) {
 			/*
 			 * The umtx_pi can be unowned, such as when a thread
 			 * has just entered do_lock_pi(), allocated the
 			 * umtx_pi, and unlocked the umtxq.
 			 * If the current thread owns it, it must disown it.
 			 */
 			mtx_lock(&umtx_lock);
 			if (pi->pi_owner == td)
 				umtx_pi_disown(pi);
 			mtx_unlock(&umtx_lock);
 		}
 	}
 	umtxq_unlock(&key);
 
 	/*
 	 * When unlocking the umtx, it must be marked as unowned if
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
 
 	if (count > 1)
 		new_owner |= UMUTEX_CONTESTED;
 	error = casueword32(&m->m_owner, owner, &old, new_owner);
 
 	umtxq_unbusy_unlocked(&key);
 	umtx_key_release(&key);
 	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Lock a PP mutex.
  *
  * td      - the calling thread.
  * m       - userland mutex; m_ceilings[0] holds the priority ceiling.
  * flags   - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * timeout - optional timeout for the sleep, may be NULL.
  * try     - when non-zero, return EBUSY instead of sleeping.
  *
  * The unlocked state of such a mutex is UMUTEX_CONTESTED, which is the
  * value the acquiring compare-and-set expects.  Before each attempt the
  * caller's inherited priority is raised to the ceiling if the caller
  * holds PRIV_SCHED_RTPRIO; it is restored after a failed attempt.
  *
  * Returns 0 on success, EOWNERDEAD when the lock was taken over from a
  * dead owner, ENOTRECOVERABLE, EBUSY, EFAULT, EINVAL for a bad ceiling
  * or when the caller's priority is stronger than the ceiling, or an
  * error from umtx_key_get() or the sleep.
  */
 static int
 do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
     struct _umtx_time *timeout, int try)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t ceiling;
 	uint32_t owner, id;
 	int error, pri, old_inherited_pri, su, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/* Non-zero when the caller may use realtime priorities. */
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 	for (;;) {
 		old_inherited_pri = uq->uq_inherited_pri;
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		rv = fueword32(&m->m_ceilings[0], &ceiling);
 		if (rv == -1) {
 			error = EFAULT;
 			goto out;
 		}
 		/*
 		 * Invert the userland ceiling.  The arithmetic is unsigned,
 		 * so a userland value above RTP_PRIO_MAX wraps around and
 		 * is rejected by the range check.
 		 */
 		ceiling = RTP_PRIO_MAX - ceiling;
 		if (ceiling > RTP_PRIO_MAX) {
 			error = EINVAL;
 			goto out;
 		}
 
 		mtx_lock(&umtx_lock);
 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
 			mtx_unlock(&umtx_lock);
 			error = EINVAL;
 			goto out;
 		}
 		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
 			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
 			thread_lock(td);
 			if (uq->uq_inherited_pri < UPRI(td))
 				sched_lend_user_prio(td, uq->uq_inherited_pri);
 			thread_unlock(td);
 		}
 		mtx_unlock(&umtx_lock);
 
 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    id | UMUTEX_CONTESTED);
 		/* The address was invalid. */
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			/* The mutex was free and is now ours. */
 			error = 0;
 			break;
 		} else if (owner == UMUTEX_RB_OWNERDEAD) {
 			rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD,
 			    &owner, id | UMUTEX_CONTESTED);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (owner == UMUTEX_RB_OWNERDEAD) {
 				error = EOWNERDEAD; /* success */
 				break;
 			}
 			error = 0;
 		} else if (owner == UMUTEX_RB_NOTRECOV) {
 			error = ENOTRECOVERABLE;
 			break;
 		}
 
 		if (try != 0) {
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
 		    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * The attempt failed: undo the ceiling boost and fall back
 		 * to the priority owed to waiters on PI mutexes we hold.
 		 */
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 
 	/*
 	 * Leaving without the lock: restore the inherited priority saved
 	 * at the start of the last iteration.  EOWNERDEAD counts as a
 	 * successful acquisition and keeps the boost.
 	 */
 	if (error != 0 && error != EOWNERDEAD) {
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 
 out:
 	umtxq_unbusy_unlocked(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Unlock a PP mutex.
  *
  * td    - the calling thread, which must be the owner named in the lock
  *         word.
  * m     - userland mutex; m_ceilings[1] holds the ceiling to fall back
  *         to after the unlock, with (uint32_t)-1 meaning PRI_MAX.
  * flags - mutex flags; UMUTEX_ROBUST selects the robust key type.
  * rb    - passed to umtx_unlock_val() to choose the value stored in the
  *         lock word.
  *
  * Returns 0 on success, EPERM when the caller does not own the mutex,
  * EINVAL for an out-of-range ceiling, EFAULT or a copyin() error when
  * userland memory cannot be accessed, or the error from umtx_key_get().
  */
 static int
 do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb)
 {
 	struct umtx_key key;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t id, owner, rceiling;
 	int error, pri, new_inherited_pri, su;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	/* Non-zero when the caller may use realtime priorities. */
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 
 	/*
 	 * Make sure we own this mtx.
 	 */
 	error = fueword32(&m->m_owner, &owner);
 	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
 		return (EPERM);
 
 	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
 	if (error != 0)
 		return (error);
 
 	if (rceiling == -1)
 		new_inherited_pri = PRI_MAX;
 	else {
 		/*
 		 * Unsigned arithmetic: a userland value above RTP_PRIO_MAX
 		 * wraps around and is rejected by the range check.
 		 */
 		rceiling = RTP_PRIO_MAX - rceiling;
 		if (rceiling > RTP_PRIO_MAX)
 			return (EINVAL);
 		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
 	}
 
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &key)) != 0)
 		return (error);
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_unlock(&key);
 	/*
 	 * For priority protected mutex, always set unlocked state
 	 * to UMUTEX_CONTESTED, so that userland always enters kernel
 	 * to lock the mutex, it is necessary because thread priority
 	 * has to be adjusted for such mutex.
 	 */
 	error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) |
 	    UMUTEX_CONTESTED);
 
 	umtxq_lock(&key);
 	if (error == 0)
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 
 	if (error == -1)
 		error = EFAULT;
 	else {
 		/*
 		 * The store succeeded: adopt the new inherited priority
 		 * (privileged callers only) and recompute the priority
 		 * owed to waiters on PI mutexes we still hold.
 		 */
 		mtx_lock(&umtx_lock);
 		if (su != 0)
 			uq->uq_inherited_pri = new_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
 			uq2 = TAILQ_FIRST(&pi->pi_blocked);
 			if (uq2 != NULL) {
 				if (pri > UPRI(uq2->uq_thread))
 					pri = UPRI(uq2->uq_thread);
 			}
 		}
 		if (pri > uq->uq_inherited_pri)
 			pri = uq->uq_inherited_pri;
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
 		mtx_unlock(&umtx_lock);
 	}
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Change the priority ceiling of a PP mutex.
  *
  * td          - the calling thread.
  * m           - userland mutex; must have UMUTEX_PRIO_PROTECT set.
  * ceiling     - new ceiling, at most RTP_PRIO_MAX.
  * old_ceiling - optional userland address receiving the previous
  *               ceiling, may be NULL.
  *
  * The ceiling is written either while the mutex is briefly acquired on
  * the caller's behalf or, if the caller already owns it, directly.
  * Otherwise the caller sleeps until the mutex state changes and tries
  * again.  On success all waiters are woken.
  *
  * Returns 0 on success, EINVAL, EFAULT, EOWNERDEAD, ENOTRECOVERABLE, or
  * an error from umtx_key_get() or the sleep.
  */
 static int
 do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
     uint32_t *old_ceiling)
 {
 	struct umtx_q *uq;
 	uint32_t flags, id, owner, save_ceiling;
 	int error, rv, rv1;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 	if (ceiling > RTP_PRIO_MAX)
 		return (EINVAL);
 	id = td->td_tid;
 	uq = td->td_umtxq;
 	if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ?
 	    TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
 	for (;;) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/* Remember the current ceiling for old_ceiling. */
 		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
 		    id | UMUTEX_CONTESTED);
 		if (rv == -1) {
 			error = EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			/*
 			 * The mutex was free and is now held by us: store
 			 * the new ceiling and release it again.
 			 */
 			rv = suword32(&m->m_ceilings[0], ceiling);
 			rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED);
 			error = (rv == 0 && rv1 == 0) ? 0: EFAULT;
 			break;
 		}
 
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			/* We already own the mutex: just store the ceiling. */
 			rv = suword32(&m->m_ceilings[0], ceiling);
 			error = rv == 0 ? 0 : EFAULT;
 			break;
 		}
 
 		if (owner == UMUTEX_RB_OWNERDEAD) {
 			error = EOWNERDEAD;
 			break;
 		} else if (owner == UMUTEX_RB_NOTRECOV) {
 			error = ENOTRECOVERABLE;
 			break;
 		}
 
 		/*
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
 		if (error != 0)
 			break;
 
 		/*
 		 * We set the contested bit, sleep. Otherwise the lock changed
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
 		error = umtxq_sleep(uq, "umtxpp", NULL);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
 	/*
 	 * Every path out of the loop leaves the chain busy.  On success
 	 * wake all waiters before releasing it.
 	 */
 	umtxq_lock(&uq->uq_key);
 	if (error == 0)
 		umtxq_signal(&uq->uq_key, INT_MAX);
 	umtxq_unbusy(&uq->uq_key);
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == 0 && old_ceiling != NULL) {
 		rv = suword32(old_ceiling, save_ceiling);
 		error = rv == 0 ? 0 : EFAULT;
 	}
 	return (error);
 }
 
 /*
  * Lock a userland POSIX mutex.
  */
 static int
 do_lock_umutex(struct thread *td, struct umutex *m,
     struct _umtx_time *timeout, int mode)
 {
 	uint32_t flags;
 	int error;
 
 	error = fueword32(&m->m_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 
 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		error = do_lock_normal(td, m, flags, timeout, mode);
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		error = do_lock_pi(td, m, flags, timeout, mode);
 		break;
 	case UMUTEX_PRIO_PROTECT:
 		error = do_lock_pp(td, m, flags, timeout, mode);
 		break;
 	default:
 		return (EINVAL);
 	}
 	if (timeout == NULL) {
 		if (error == EINTR && mode != _UMUTEX_WAIT)
 			error = ERESTART;
 	} else {
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Unlock a userland POSIX mutex.
  *
  * The mutex flags are read from userland and the request is dispatched
  * on the priority protocol.  Returns EFAULT when the flags cannot be
  * read, EINVAL when both protocol bits are set, and otherwise the
  * result of the protocol-specific unlock routine.
  */
 static int
 do_unlock_umutex(struct thread *td, struct umutex *m, bool rb)
 {
 	uint32_t flags;
 	int error;
 
 	if (fueword32(&m->m_flags, &flags) == -1)
 		return (EFAULT);
 
 	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
 	case 0:
 		error = do_unlock_normal(td, m, flags, rb);
 		break;
 	case UMUTEX_PRIO_INHERIT:
 		error = do_unlock_pi(td, m, flags, rb);
 		break;
 	case UMUTEX_PRIO_PROTECT:
 		error = do_unlock_pp(td, m, flags, rb);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Wait on a userland condition variable.
  *
  * td      - the calling thread.
  * cv      - userland condition variable.
  * m       - userland mutex to release before sleeping.
  * timeout - optional timeout, may be NULL.
  * wflags  - CVWAIT_CLOCKID makes the clock id come from cv->c_clockid
  *           (CLOCK_REALTIME is used otherwise); CVWAIT_ABSTIME marks
  *           the timeout as absolute.
  *
  * The thread is queued on the condition variable before the mutex is
  * released, so a wakeup issued after the release is not lost.  The
  * mutex is not re-acquired here.
  *
  * Returns 0 when the thread was removed from the queue by a wakeup,
  * otherwise the error from the unlock or the sleep (with ERESTART
  * mapped to EINTR), or EFAULT/EINVAL for bad arguments.
  */
 static int
 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
     struct timespec *timeout, u_long wflags)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, clockid, hasw;
 	int error;
 
 	uq = td->td_umtxq;
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if ((wflags & CVWAIT_CLOCKID) != 0) {
 		error = fueword32(&cv->c_clockid, &clockid);
 		if (error == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		if (clockid < CLOCK_REALTIME ||
 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
 			/* hmm, only HW clock id will work. */
 			umtx_key_release(&uq->uq_key);
 			return (EINVAL);
 		}
 	} else {
 		clockid = CLOCK_REALTIME;
 	}
 
 	/* Queue ourselves before the mutex is released. */
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 
 	/*
 	 * Set c_has_waiters to 1 before releasing user mutex, also
 	 * don't modify cache line when unnecessary.
 	 */
 	error = fueword32(&cv->c_has_waiters, &hasw);
 	if (error == 0 && hasw == 0)
 		suword32(&cv->c_has_waiters, 1);
 
 	umtxq_unbusy_unlocked(&uq->uq_key);
 
 	error = do_unlock_umutex(td, m, false);
 
 	if (timeout != NULL)
 		abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0,
 		    timeout);
 	
 	/* Sleep only if the mutex was released successfully. */
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
 		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
 		    NULL : &timo);
 	}
 
 	/*
 	 * No longer on the queue means a waker removed us: report
 	 * success regardless of what the sleep returned.
 	 */
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		/*
 		 * This must be a timeout, an interruption by a signal or
 		 * a spurious wakeup; clear the c_has_waiters flag when
 		 * necessary.
 		 */
 		umtxq_busy(&uq->uq_key);
 		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 			int oldlen = uq->uq_cur_queue->length;
 			umtxq_remove(uq);
 			/* We were the last waiter on this queue. */
 			if (oldlen == 1) {
 				umtxq_unlock(&uq->uq_key);
 				suword32(&cv->c_has_waiters, 0);
 				umtxq_lock(&uq->uq_key);
 			}
 		}
 		umtxq_unbusy(&uq->uq_key);
 		if (error == ERESTART)
 			error = EINTR;
 	}
 
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland condition variable.
  */
 static int
 do_cv_signal(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error, cnt, nwake;
 	uint32_t flags;
 
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	nwake = umtxq_signal(&key, 1);
 	if (cnt <= nwake) {
 		umtxq_unlock(&key);
 		error = suword32(&cv->c_has_waiters, 0);
 		if (error == -1)
 			error = EFAULT;
 		umtxq_lock(&key);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 static int
 do_cv_broadcast(struct thread *td, struct ucond *cv)
 {
 	struct umtx_key key;
 	int error;
 	uint32_t flags;
 
 	error = fueword32(&cv->c_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	umtxq_signal(&key, INT_MAX);
 	umtxq_unlock(&key);
 
 	error = suword32(&cv->c_has_waiters, 0);
 	if (error == -1)
 		error = EFAULT;
 
 	umtxq_unbusy_unlocked(&key);
 
 	umtx_key_release(&key);
 	return (error);
 }
 
 /*
  * Acquire a userland rwlock for reading.
  *
  * While none of the bits in 'wrflags' are set in rw_state the reader count
  * is incremented with a compare-and-swap.  Otherwise the thread sets
  * URWLOCK_READ_WAITERS, bumps rw_blocked_readers and sleeps on the key's
  * queue until the blocking bits clear, an error occurs or the timeout
  * expires, and then retries.
  *
  * fflag may carry URWLOCK_PREFER_READER; when neither it nor rw_flags asks
  * for reader preference, pending writers (URWLOCK_WRITE_WAITERS) also block
  * new readers.  timeout may be NULL to wait without a time limit.
  *
  * Returns 0 on success, EAGAIN when the reader count already equals
  * URWLOCK_MAX_READERS, EFAULT on a failed user-memory access, or the error
  * from umtx_key_get(), umtxq_check_susp() or umtxq_sleep(), with ERESTART
  * converted to EINTR.
  */
 static int
 do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, wrflags;
 	int32_t state, oldstate;
 	int32_t blocked_readers;
 	int error, error1, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/* Bits in rw_state that prevent a reader from taking the lock. */
 	wrflags = URWLOCK_WRITE_OWNER;
 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
 		wrflags |= URWLOCK_WRITE_WAITERS;
 
 	for (;;) {
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 
 		/* try to lock it */
 		while (!(state & wrflags)) {
 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
 				umtx_key_release(&uq->uq_key);
 				return (EAGAIN);
 			}
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state + 1);
 			if (rv == -1) {
 				umtx_key_release(&uq->uq_key);
 				return (EFAULT);
 			}
 			if (oldstate == state) {
 				/* The CAS installed state + 1: read lock taken. */
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			state = oldstate;
 		}
 
 		if (error)
 			break;
 
 		/* grab monitor lock */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1)
 			error = EFAULT;
 
 		/* set read contention bit */
 		while (error == 0 && (state & wrflags) &&
 		    !(state & URWLOCK_READ_WAITERS)) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_READ_WAITERS);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 		if (error != 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			break;
 		}
 
 		/* state is changed while setting flags, restart */
 		if (!(state & wrflags)) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			continue;
 		}
 
 sleep:
 		/* contention bit is set, before sleeping, increase read waiter count */
 		rv = fueword32(&rwlock->rw_blocked_readers,
 		    &blocked_readers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		/* The result of this store is not checked. */
 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
 
 		while (state & wrflags) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_insert(uq);
 			umtxq_unbusy(&uq->uq_key);
 
 			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
 			    NULL : &timo);
 
 			/* Re-take the busy mark; it is still held when this loop exits. */
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 		}
 
 		/* decrease read waiter count, and may clear read contention bit */
 		rv = fueword32(&rwlock->rw_blocked_readers,
 		    &blocked_readers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
 		if (blocked_readers == 1) {
 			/* This was the last blocked reader: clear URWLOCK_READ_WAITERS. */
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 			for (;;) {
 				rv = casueword32(&rwlock->rw_state, state,
 				    &oldstate, state & ~URWLOCK_READ_WAITERS);
 				if (rv == -1) {
 					error = EFAULT;
 					break;
 				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
 				error1 = umtxq_check_susp(td);
 				if (error1 != 0) {
 					/* Keep an earlier error from the sleep, if any. */
 					if (error == 0)
 						error = error1;
 					break;
 				}
 			}
 		}
 
 		umtxq_unbusy_unlocked(&uq->uq_key);
 		if (error != 0)
 			break;
 	}
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Acquire a userland rwlock for writing.
  *
  * While rw_state shows neither a write owner nor any readers, the thread
  * tries to set URWLOCK_WRITE_OWNER with a compare-and-swap.  Otherwise it
  * sets URWLOCK_WRITE_WAITERS, bumps rw_blocked_writers and sleeps on the
  * exclusive queue until the lock looks free, an error occurs or the timeout
  * expires.  After a sleep the loop always returns to the top, so one more
  * lock attempt is made before a pending error is acted upon.
  *
  * timeout may be NULL to wait without a time limit.
  *
  * Returns 0 on success, EFAULT on a failed user-memory access, or the error
  * from umtx_key_get(), umtxq_check_susp() or umtxq_sleep(), with ERESTART
  * converted to EINTR.
  */
 static int
 do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int32_t blocked_writers;
 	int32_t blocked_readers;
 	int error, error1, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	blocked_readers = 0;
 	for (;;) {
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1) {
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		/* Try to take the lock while it has no owner and no readers. */
 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_WRITE_OWNER);
 			if (rv == -1) {
 				umtx_key_release(&uq->uq_key);
 				return (EFAULT);
 			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 
 		if (error) {
 			/*
 			 * Giving up.  If no writer owns or waits for the lock
 			 * and blocked readers were observed when this thread
 			 * stopped being the last blocked writer, wake everything
 			 * on the shared queue.
 			 */
 			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
 			    blocked_readers != 0) {
 				umtxq_lock(&uq->uq_key);
 				umtxq_busy(&uq->uq_key);
 				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
 			}
 
 			break;
 		}
 
 		/* grab monitor lock */
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
 		/*
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
 		rv = fueword32(&rwlock->rw_state, &state);
 		if (rv == -1)
 			error = EFAULT;
 
 		/* set write contention bit */
 		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
 		    URWLOCK_READER_COUNT(state) != 0) &&
 		    (state & URWLOCK_WRITE_WAITERS) == 0) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state | URWLOCK_WRITE_WAITERS);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 		}
 		if (error != 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			break;
 		}
 
 		/* The lock became free while setting the flag: restart. */
 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = umtxq_check_susp(td);
 			if (error != 0)
 				break;
 			continue;
 		}
 sleep:
 		/* Contention bit is set; count this thread as a blocked writer. */
 		rv = fueword32(&rwlock->rw_blocked_writers,
 		    &blocked_writers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		/* The result of this store is not checked. */
 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
 
 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unbusy(&uq->uq_key);
 
 			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
 			    NULL : &timo);
 
 			/* Re-take the busy mark; it is still held when this loop exits. */
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				error = EFAULT;
 				break;
 			}
 		}
 
 		/* Drop the blocked-writer count taken before sleeping. */
 		rv = fueword32(&rwlock->rw_blocked_writers,
 		    &blocked_writers);
 		if (rv == -1) {
 			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
 		if (blocked_writers == 1) {
 			/* This was the last blocked writer: clear URWLOCK_WRITE_WAITERS. */
 			rv = fueword32(&rwlock->rw_state, &state);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 			for (;;) {
 				rv = casueword32(&rwlock->rw_state, state,
 				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
 				if (rv == -1) {
 					error = EFAULT;
 					break;
 				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
 				error1 = umtxq_check_susp(td);
 				/*
 				 * We are leaving the URWLOCK_WRITE_WAITERS
 				 * behind, but this should not harm the
 				 * correctness.
 				 */
 				if (error1 != 0) {
 					if (error == 0)
 						error = error1;
 					break;
 				}
 			}
 			/*
 			 * Sample the blocked-reader count; it is consulted at
 			 * the top of the loop if this thread gives up.
 			 */
 			rv = fueword32(&rwlock->rw_blocked_readers,
 			    &blocked_readers);
 			if (rv == -1) {
 				umtxq_unbusy_unlocked(&uq->uq_key);
 				error = EFAULT;
 				break;
 			}
 		} else
 			blocked_readers = 0;
 
 		/*
 		 * No error check here: a pending error is handled after the
 		 * lock attempt at the top of the loop.
 		 */
 		umtxq_unbusy_unlocked(&uq->uq_key);
 	}
 
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
 /*
  * Release a userland rwlock.
  *
  * If URWLOCK_WRITE_OWNER is set in rw_state it is cleared; otherwise, if the
  * reader count is non-zero, it is decremented.  Afterwards waiters are woken
  * according to the waiter bits in the state and URWLOCK_PREFER_READER in
  * rw_flags: either one thread from the exclusive queue or every thread on
  * the shared queue.
  *
  * Returns 0 on success, EPERM when the lock is not held in the mode being
  * released, EFAULT on a failed user-memory access, or the error from
  * umtx_key_get() or umtxq_check_susp().
  */
 static int
 do_rw_unlock(struct thread *td, struct urwlock *rwlock)
 {
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int error, rv, q, count;
 
 	uq = td->td_umtxq;
 	error = fueword32(&rwlock->rw_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	error = fueword32(&rwlock->rw_state, &state);
 	if (error == -1) {
 		error = EFAULT;
 		goto out;
 	}
 	if (state & URWLOCK_WRITE_OWNER) {
 		/* Write-locked: clear the owner bit, retrying on CAS failure. */
 		for (;;) {
 			rv = casueword32(&rwlock->rw_state, state, 
 			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
 			if (rv == -1) {
 				error = EFAULT;
 				goto out;
 			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
 					error = EPERM;
 					goto out;
 				}
 				error = umtxq_check_susp(td);
 				if (error != 0)
 					goto out;
 			} else
 				break;
 		}
 	} else if (URWLOCK_READER_COUNT(state) != 0) {
 		/* Read-locked: drop one reader, retrying on CAS failure. */
 		for (;;) {
 			rv = casueword32(&rwlock->rw_state, state,
 			    &oldstate, state - 1);
 			if (rv == -1) {
 				error = EFAULT;
 				goto out;
 			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
 					error = EPERM;
 					goto out;
 				}
 				error = umtxq_check_susp(td);
 				if (error != 0)
 					goto out;
 			} else
 				break;
 		}
 	} else {
 		error = EPERM;
 		goto out;
 	}
 
 	/*
 	 * 'state' still holds the value rw_state had just before the
 	 * successful CAS; its waiter bits select whom to wake.  'q' is only
 	 * read when 'count' is non-zero.
 	 */
 	count = 0;
 
 	if (!(flags & URWLOCK_PREFER_READER)) {
 		/* Writers first: wake one writer, else all readers. */
 		if (state & URWLOCK_WRITE_WAITERS) {
 			count = 1;
 			q = UMTX_EXCLUSIVE_QUEUE;
 		} else if (state & URWLOCK_READ_WAITERS) {
 			count = INT_MAX;
 			q = UMTX_SHARED_QUEUE;
 		}
 	} else {
 		/* Readers first: wake all readers, else one writer. */
 		if (state & URWLOCK_READ_WAITERS) {
 			count = INT_MAX;
 			q = UMTX_SHARED_QUEUE;
 		} else if (state & URWLOCK_WRITE_WAITERS) {
 			count = 1;
 			q = UMTX_EXCLUSIVE_QUEUE;
 		}
 	}
 
 	if (count) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_busy(&uq->uq_key);
 		umtxq_signal_queue(&uq->uq_key, count, q);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 	}
 out:
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 /*
  * Wait on a userland semaphore (struct _usem; built only with
  * COMPAT_FREEBSD9 or COMPAT_FREEBSD10).
  *
  * The thread is put on the wait queue first, then _has_waiters is set with
  * a compare-and-swap and _count is read.  A non-zero count or a fault
  * dequeues the thread and returns at once; otherwise the thread sleeps.
  * timeout may be NULL to wait without a time limit.
  *
  * Returns 0 when the count was non-zero or the thread was taken off the
  * queue while sleeping, EFAULT on a failed user-memory access, or the error
  * from umtx_key_get() or umtxq_sleep().  ERESTART is turned into EINTR when
  * a relative timeout was supplied.
  */
 static int
 do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, count, count1;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	error = fueword32(&sem->_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/* Queue the thread before publishing _has_waiters. */
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
 	if (rv == 0)
 		rv = fueword32(&sem->_count, &count);
 	/* 'count' is only examined when both user accesses succeeded. */
 	if (rv == -1 || count != 0) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 		return (rv == -1 ? EFAULT : 0);
 	}
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	/*
 	 * No longer on the queue: someone else removed this thread
 	 * (presumably a waker), so report success whatever the sleep returned.
 	 */
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		umtxq_remove(uq);
 		/* A relative timeout cannot be restarted. */
 		if (error == ERESTART && timeout != NULL &&
 		    (timeout->_flags & UMTX_ABSTIME) == 0)
 			error = EINTR;
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland semaphore.
  */
 static int
 do_sem_wake(struct thread *td, struct _usem *sem)
 {
 	struct umtx_key key;
 	int error, cnt;
 	uint32_t flags;
 
 	error = fueword32(&sem->_flags, &flags);
 	if (error == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	if (cnt > 0) {
 		/*
 		 * Check if count is greater than 0, this means the memory is
 		 * still being referenced by user code, so we can safely
 		 * update _has_waiters flag.
 		 */
 		if (cnt == 1) {
 			umtxq_unlock(&key);
 			error = suword32(&sem->_has_waiters, 0);
 			umtxq_lock(&key);
 			if (error == -1)
 				error = EFAULT;
 		}
 		umtxq_signal(&key, 1);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 #endif
 
 /*
  * Wait on a userland semaphore (struct _usem2).
  *
  * The thread is put on the wait queue first and _count is then examined:
  * a non-zero USEM_COUNT() dequeues the thread and returns 0; otherwise
  * USEM_HAS_WAITERS is installed with a compare-and-swap (unless _count
  * already equals it) and the thread sleeps.  timeout may be NULL to wait
  * without a time limit.
  *
  * For a relative timeout, ERESTART is turned into EINTR and, on EINTR, the
  * remaining time is written back into timeout->_timeout.
  *
  * Returns 0 when the count was non-zero or the thread was taken off the
  * queue while sleeping, EFAULT on a failed user-memory access, or the error
  * from umtx_key_get() or umtxq_sleep().
  */
 static int
 do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout)
 {
 	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t count, flags;
 	int error, rv;
 
 	uq = td->td_umtxq;
 	/*
 	 * Use fueword32() so that a fault is reported as EFAULT instead of
 	 * being mistaken for a flags word of all ones.
 	 */
 	rv = fueword32(&sem->_flags, &flags);
 	if (rv == -1)
 		return (EFAULT);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if (timeout != NULL)
 		abs_timeout_init2(&timo, timeout);
 
 	/* Queue the thread before inspecting _count. */
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
 	rv = fueword32(&sem->_count, &count);
 	if (rv == -1) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 		return (EFAULT);
 	}
 	for (;;) {
 		if (USEM_COUNT(count) != 0) {
 			/* The semaphore is available: no need to sleep. */
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (0);
 		}
 		if (count == USEM_HAS_WAITERS)
 			break;
 		/* Try to move _count from 0 to USEM_HAS_WAITERS. */
 		rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS);
 		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
 		}
 		if (count == 0)
 			break;
 	}
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
 
 	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	/*
 	 * No longer on the queue: someone else removed this thread
 	 * (presumably a waker), so report success whatever the sleep returned.
 	 */
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
 	else {
 		umtxq_remove(uq);
 		if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) {
 			/* A relative timeout cannot be restarted. */
 			if (error == ERESTART)
 				error = EINTR;
 			if (error == EINTR) {
 				/* Hand the unslept remainder back to the caller. */
 				abs_timeout_update(&timo);
 				timeout->_timeout = timo.end;
 				timespecsub(&timeout->_timeout, &timo.cur);
 			}
 		}
 	}
 	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
 
 /*
  * Signal a userland semaphore.
  */
 static int
 do_sem2_wake(struct thread *td, struct _usem2 *sem)
 {
 	struct umtx_key key;
 	int error, cnt, rv;
 	uint32_t count, flags;
 
 	rv = fueword32(&sem->_flags, &flags);
 	if (rv == -1)
 		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
 	if (cnt > 0) {
 		/*
 		 * If this was the last sleeping thread, clear the waiters
 		 * flag in _count.
 		 */
 		if (cnt == 1) {
 			umtxq_unlock(&key);
 			rv = fueword32(&sem->_count, &count);
 			while (rv != -1 && count & USEM_HAS_WAITERS)
 				rv = casueword32(&sem->_count, count, &count,
 				    count & ~USEM_HAS_WAITERS);
 			if (rv == -1)
 				error = EFAULT;
 			umtxq_lock(&key);
 		}
 
 		umtxq_signal(&key, 1);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (error);
 }
 
 inline int
 umtx_copyin_timeout(const void *addr, struct timespec *tsp)
 {
 	int error;
 
 	error = copyin(addr, tsp, sizeof(struct timespec));
 	if (error == 0) {
 		if (tsp->tv_sec < 0 ||
 		    tsp->tv_nsec >= 1000000000 ||
 		    tsp->tv_nsec < 0)
 			error = EINVAL;
 	}
 	return (error);
 }
 
 /*
  * Copy a timeout in from user space into *tp and validate it.
  *
  * When size does not exceed sizeof(struct timespec) only a timespec is
  * copied, and the clock id defaults to CLOCK_REALTIME with no flags;
  * otherwise a full struct _umtx_time is copied.
  *
  * Returns the copyin() error if the copy fails, EINVAL when tv_sec is
  * negative or tv_nsec lies outside [0, 1000000000), and 0 otherwise.
  */
 static inline int
 umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
 {
 	int error;
 
 	if (size > sizeof(struct timespec)) {
 		error = copyin(addr, tp, sizeof(struct _umtx_time));
 	} else {
 		tp->_clockid = CLOCK_REALTIME;
 		tp->_flags = 0;
 		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
 	}
 	if (error != 0)
 		return (error);
 	if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec < 0 ||
 	    tp->_timeout.tv_nsec >= 1000000000)
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * op_table handler for operation codes that are not implemented (reserved
  * slots and, without the compat options, the old semaphore ops): always
  * fails with EOPNOTSUPP.
  */
 static int
 __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (EOPNOTSUPP);
 }
 
 static int
 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0));
 }
 
 static int
 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
 }
 
 static int
 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
 }
 
 /*
  * UMTX_OP_WAKE handler: forward obj and val to kern_umtx_wake() with 0 as
  * the last argument (the private variant below passes 1).
  */
 static int
 __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
 }
 
 #define BATCH_SIZE	128
 static int
 __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
 {
 	char *uaddrs[BATCH_SIZE], **upp;
 	int count, error, i, pos, tocopy;
 
 	upp = (char **)uap->obj;
 	error = 0;
 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
 	    pos += tocopy) {
 		tocopy = MIN(count, BATCH_SIZE);
 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *));
 		if (error != 0)
 			break;
 		for (i = 0; i < tocopy; ++i)
 			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
 		maybe_yield();
 	}
 	return (error);
 }
 
 /*
  * UMTX_OP_WAKE_PRIVATE handler: forward obj and val to kern_umtx_wake()
  * with 1 as the last argument.
  */
 static int
 __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
 }
 
 static int
 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
 }
 
 /*
  * UMTX_OP_MUTEX_TRYLOCK handler: call do_lock_umutex() without a timeout
  * and with mode _UMUTEX_TRY.
  */
 static int
 __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
 }
 
 static int
 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
 }
 
 /* UMTX_OP_MUTEX_WAKE handler: forward obj to do_wake_umutex(). */
 static int
 __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_wake_umutex(td, uap->obj));
 }
 
 /*
  * UMTX_OP_MUTEX_UNLOCK handler: forward obj to do_unlock_umutex() with
  * false as the last argument.
  */
 static int
 __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_unlock_umutex(td, uap->obj, false));
 }
 
 /*
  * UMTX_OP_SET_CEILING handler: forward obj, val and uaddr1 to
  * do_set_ceiling().
  */
 static int
 __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
 }
 
 /*
  * UMTX_OP_CV_WAIT handler: copy in the optional timespec from uaddr2 and
  * call do_cv_wait() with obj, uaddr1, the timeout and val.
  */
 static int
 __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec timeout, *ts;
 	int error;
 
 	/* A null timespec is allowed and means wait forever. */
 	ts = NULL;
 	if (uap->uaddr2 != NULL) {
 		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 /* UMTX_OP_CV_SIGNAL handler: forward obj to do_cv_signal(). */
 static int
 __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_cv_signal(td, uap->obj));
 }
 
 /* UMTX_OP_CV_BROADCAST handler: forward obj to do_cv_broadcast(). */
 static int
 __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_cv_broadcast(td, uap->obj));
 }
 
 /*
  * UMTX_OP_RW_RDLOCK handler: copy in the optional timeout (uaddr2, with
  * its size passed in uaddr1) and call do_rw_rdlock() with obj and val.
  */
 static int
 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	/* A null timespec is allowed and means wait forever. */
 	tm_p = NULL;
 	if (uap->uaddr2 != NULL) {
 		error = umtx_copyin_umtx_time(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_rw_rdlock(td, uap->obj, uap->val, tm_p));
 }
 
 /*
  * UMTX_OP_RW_WRLOCK handler: copy in the optional timeout (uaddr2, with
  * its size passed in uaddr1) and call do_rw_wrlock() with obj.
  */
 static int
 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	/* A null timespec is allowed and means wait forever. */
 	tm_p = NULL;
 	if (uap->uaddr2 != NULL) {
 		error = umtx_copyin_umtx_time(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_rw_wrlock(td, uap->obj, tm_p));
 }
 
 /* UMTX_OP_RW_UNLOCK handler: forward obj to do_rw_unlock(). */
 static int
 __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_rw_unlock(td, uap->obj));
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(
 		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem_wait(td, uap->obj, tm_p));
 }
 
 /* UMTX_OP_SEM_WAKE handler (compat builds only): forward obj to do_sem_wake(). */
 static int
 __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_sem_wake(td, uap->obj));
 }
 #endif
 
 /* UMTX_OP_MUTEX_WAKE2 handler: forward obj and val to do_wake2_umutex(). */
 static int
 __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_wake2_umutex(td, uap->obj, uap->val));
 }
 
 static int
 __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	size_t uasize;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		uasize = 0;
 		tm_p = NULL;
 	} else {
 		uasize = (size_t)uap->uaddr1;
 		error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	error = do_sem2_wait(td, uap->obj, tm_p);
 	if (error == EINTR && uap->uaddr2 != NULL &&
 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
 	    uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) {
 		error = copyout(&timeout._timeout,
 		    (struct _umtx_time *)uap->uaddr2 + 1,
 		    sizeof(struct timespec));
 		if (error == 0) {
 			error = EINTR;
 		}
 	}
 
 	return (error);
 }
 
 /* UMTX_OP_SEM2_WAKE handler: forward obj to do_sem2_wake(). */
 static int
 __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (do_sem2_wake(td, uap->obj));
 }
 
 /* View a VM object's umtx_data field as its list of shm registrations. */
 #define	USHM_OBJ_UMTX(o)						\
     ((struct umtx_shm_obj_list *)(&(o)->umtx_data))
 
 /* ushm_flags: the reg is on its umtx_shm_registry[] hash chain. */
 #define	USHMF_REG_LINKED	0x0001
 /* ushm_flags: the reg is on the list hanging off the keyed VM object. */
 #define	USHMF_OBJ_LINKED	0x0002
 /*
  * One registration created by UMTX_SHM_CREAT: it ties a shared umtx key
  * to the shared-memory object handed out for it.
  */
 struct umtx_shm_reg {
 	/* Linkage on a registry chain, or on the deferred-free list. */
 	TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link;
 	/* Linkage on the per-VM-object list (see USHM_OBJ_UMTX). */
 	LIST_ENTRY(umtx_shm_reg) ushm_obj_link;
 	/* Key (object, offset) that lookups compare against. */
 	struct umtx_key		ushm_key;
 	/* Credential whose ruidinfo is charged through chgumtxcnt(). */
 	struct ucred		*ushm_cred;
 	/* Shared-memory object allocated for this registration. */
 	struct shmfd		*ushm_obj;
 	/* Reference count; the reg is freed when it drops to zero. */
 	u_int			ushm_refcnt;
 	/* USHMF_* flags. */
 	u_int			ushm_flags;
 };
 
 LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg);
 TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg);
 
 /* UMA zone from which struct umtx_shm_reg is allocated. */
 static uma_zone_t umtx_shm_reg_zone;
 /* Hash chains of linked registrations, indexed by key->hash. */
 static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS];
 /*
  * Mutex held by the code below while it touches the registry chains, the
  * per-object lists, the deferred-free list and ushm_refcnt/ushm_flags.
  */
 static struct mtx umtx_shm_lock;
 /* Registrations waiting to be freed by umtx_shm_reg_delfree_tq(). */
 static struct umtx_shm_reg_head umtx_shm_reg_delfree =
     TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree);
 
 static void umtx_shm_free_reg(struct umtx_shm_reg *reg);
 
 /*
  * Taskqueue handler: take everything off umtx_shm_reg_delfree under
  * umtx_shm_lock, then free each registration with the lock released.
  */
 static void
 umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused)
 {
 	struct umtx_shm_reg_head batch;
 	struct umtx_shm_reg *reg;
 
 	TAILQ_INIT(&batch);
 	mtx_lock(&umtx_shm_lock);
 	TAILQ_CONCAT(&batch, &umtx_shm_reg_delfree, ushm_reg_link);
 	mtx_unlock(&umtx_shm_lock);
 
 	while ((reg = TAILQ_FIRST(&batch)) != NULL) {
 		TAILQ_REMOVE(&batch, reg, ushm_reg_link);
 		umtx_shm_free_reg(reg);
 	}
 }
 
 /* Task that runs umtx_shm_reg_delfree_tq(). */
 static struct task umtx_shm_reg_delfree_task =
     TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL);
 
 static struct umtx_shm_reg *
 umtx_shm_find_reg_locked(const struct umtx_key *key)
 {
 	struct umtx_shm_reg *reg;
 	struct umtx_shm_reg_head *reg_head;
 
 	KASSERT(key->shared, ("umtx_p_find_rg: private key"));
 	mtx_assert(&umtx_shm_lock, MA_OWNED);
 	reg_head = &umtx_shm_registry[key->hash];
 	TAILQ_FOREACH(reg, reg_head, ushm_reg_link) {
 		KASSERT(reg->ushm_key.shared,
 		    ("non-shared key on reg %p %d", reg, reg->ushm_key.shared));
 		if (reg->ushm_key.info.shared.object ==
 		    key->info.shared.object &&
 		    reg->ushm_key.info.shared.offset ==
 		    key->info.shared.offset) {
 			KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM"));
 			KASSERT(reg->ushm_refcnt > 0,
 			    ("reg %p refcnt 0 onlist", reg));
 			KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0,
 			    ("reg %p not linked", reg));
 			reg->ushm_refcnt++;
 			return (reg);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Locking wrapper around umtx_shm_find_reg_locked(): performs the lookup
  * with umtx_shm_lock held and returns its result (a referenced
  * registration, or NULL).
  */
 static struct umtx_shm_reg *
 umtx_shm_find_reg(const struct umtx_key *key)
 {
 	struct umtx_shm_reg *found;
 
 	mtx_lock(&umtx_shm_lock);
 	found = umtx_shm_find_reg_locked(key);
 	mtx_unlock(&umtx_shm_lock);
 
 	return (found);
 }
 
 /*
  * Release everything a registration owns and return it to its zone: undo
  * the chgumtxcnt() charge against the creator's ruidinfo, drop the
  * credential and shm object references, then free the structure.
  */
 static void
 umtx_shm_free_reg(struct umtx_shm_reg *reg)
 {
 
 	chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0);
 	crfree(reg->ushm_cred);
 	shm_drop(reg->ushm_obj);
 	uma_zfree(umtx_shm_reg_zone, reg);
 }
 
 /*
  * Drop one reference on a registration.  umtx_shm_lock must be held.
  *
  * With 'force' set the caller is releasing the reference that belongs to
  * the registration's linkage (registry chain and per-object list): the
  * registration is unlinked even if other references remain.  If it has
  * already been unlinked, that reference was dropped by an earlier forced
  * unref (e.g. a concurrent UMTX_SHM_DESTROY), so nothing is decremented
  * here; doing so again would steal a reference from another holder.
  *
  * Returns true when the last reference was dropped and the caller must
  * free the registration, false otherwise.
  */
 static bool
 umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force)
 {
 	bool res;
 
 	mtx_assert(&umtx_shm_lock, MA_OWNED);
 	KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg));
 	if (force && (reg->ushm_flags &
 	    (USHMF_REG_LINKED | USHMF_OBJ_LINKED)) == 0) {
 		/* The linkage reference is already gone. */
 		return (false);
 	}
 	reg->ushm_refcnt--;
 	res = reg->ushm_refcnt == 0;
 	if (res || force) {
 		if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) {
 			TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash],
 			    reg, ushm_reg_link);
 			reg->ushm_flags &= ~USHMF_REG_LINKED;
 		}
 		if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) {
 			LIST_REMOVE(reg, ushm_obj_link);
 			reg->ushm_flags &= ~USHMF_OBJ_LINKED;
 		}
 	}
 	return (res);
 }
 
 /*
  * Drop one reference on a registration, taking umtx_shm_lock around the
  * update, and free the registration if that was the last reference.
  * With 'force' set the backing VM object is first marked OBJ_UMTXDEAD
  * under its write lock.
  */
 static void
 umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force)
 {
 	vm_object_t object;
 	bool last;
 
 	if (force) {
 		object = reg->ushm_obj->shm_object;
 		VM_OBJECT_WLOCK(object);
 		object->flags |= OBJ_UMTXDEAD;
 		VM_OBJECT_WUNLOCK(object);
 	}
 
 	mtx_lock(&umtx_shm_lock);
 	last = umtx_shm_unref_reg_locked(reg, force);
 	mtx_unlock(&umtx_shm_lock);
 
 	if (last)
 		umtx_shm_free_reg(reg);
 }
 
 /* Initialize the (empty) list of shm registrations kept in a VM object. */
 void
 umtx_shm_object_init(vm_object_t object)
 {
 
 	LIST_INIT(USHM_OBJ_UMTX(object));
 }
 
 /*
  * Force-unreference every registration hanging off 'object'.  Those whose
  * last reference is dropped are moved to umtx_shm_reg_delfree and the
  * deferred-free task is enqueued on taskqueue_thread, rather than being
  * freed here.
  */
 void
 umtx_shm_object_terminated(vm_object_t object)
 {
 	struct umtx_shm_reg *reg, *next;
 	bool queued;
 
 	queued = false;
 	mtx_lock(&umtx_shm_lock);
 	LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, next) {
 		if (!umtx_shm_unref_reg_locked(reg, true))
 			continue;
 		TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link);
 		queued = true;
 	}
 	mtx_unlock(&umtx_shm_lock);
 
 	if (queued)
 		taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task);
 }
 
 /*
  * Find or create the registration for 'key'.
  *
  * An existing registration is returned with an extra reference.  Otherwise
  * a new one is charged against the caller's RLIMIT_UMTXP, given a
  * PAGE_SIZE shm object, and linked into the registry and the keyed VM
  * object's list; it then holds two references, one for the linkage and one
  * for the caller.
  *
  * On success *res is set and 0 is returned.  Returns ENOMEM when the
  * resource limit is exceeded, or the error from shm_dotruncate().
  */
 static int
 umtx_shm_create_reg(struct thread *td, const struct umtx_key *key,
     struct umtx_shm_reg **res)
 {
 	struct umtx_shm_reg *reg, *reg1;
 	struct ucred *cred;
 	int error;
 
 	reg = umtx_shm_find_reg(key);
 	if (reg != NULL) {
 		*res = reg;
 		return (0);
 	}
 	cred = td->td_ucred;
 	if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP)))
 		return (ENOMEM);
 	reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO);
 	reg->ushm_refcnt = 1;
 	bcopy(key, &reg->ushm_key, sizeof(*key));
 	reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR);
 	reg->ushm_cred = crhold(cred);
 	error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE);
 	if (error != 0) {
 		/* umtx_shm_free_reg() also undoes the chgumtxcnt() charge. */
 		umtx_shm_free_reg(reg);
 		return (error);
 	}
 	mtx_lock(&umtx_shm_lock);
 	/*
 	 * The lock was not held during allocation, so look again: another
 	 * thread may have registered the same key in the meantime.
 	 */
 	reg1 = umtx_shm_find_reg_locked(key);
 	if (reg1 != NULL) {
 		mtx_unlock(&umtx_shm_lock);
 		umtx_shm_free_reg(reg);
 		*res = reg1;
 		return (0);
 	}
 	/* Second reference: one for the linkage, one for the caller. */
 	reg->ushm_refcnt++;
 	TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link);
 	LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg,
 	    ushm_obj_link);
 	reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED;
 	mtx_unlock(&umtx_shm_lock);
 	*res = reg;
 	return (0);
 }
 
 /*
  * UMTX_SHM_ALIVE: look up the mapping at 'addr' in the calling process
  * and report on the VM object behind it.
  *
  * Returns EFAULT if the lookup fails, EINVAL if the mapping has no
  * object, ENOTTY if the object is flagged OBJ_UMTXDEAD, and 0 otherwise.
  */
 static int
 umtx_shm_alive(struct thread *td, void *addr)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	int ret;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 	if (vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry,
 	    &object, &pindex, &prot, &wired) != KERN_SUCCESS)
 		return (EFAULT);
 
 	if (object == NULL)
 		ret = EINVAL;
 	else if ((object->flags & OBJ_UMTXDEAD) != 0)
 		ret = ENOTTY;
 	else
 		ret = 0;
 	vm_map_lookup_done(map, entry);
 	return (ret);
 }
 
 /*
  * Set up the shm registration machinery: the UMA zone for struct
  * umtx_shm_reg, the umtx_shm_lock mutex and every registry hash chain.
  */
 static void
 umtx_shm_init(void)
 {
 	int chain;
 
 	umtx_shm_reg_zone = uma_zcreate("umtx_shm",
 	    sizeof(struct umtx_shm_reg), NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 	mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF);
 	for (chain = 0; chain < nitems(umtx_shm_registry); chain++) {
 		TAILQ_INIT(&umtx_shm_registry[chain]);
 	}
 }
 
 /*
  * Implementation of UMTX_OP_SHM.  Exactly one of UMTX_SHM_CREAT,
  * UMTX_SHM_LOOKUP, UMTX_SHM_DESTROY and UMTX_SHM_ALIVE must be set in
  * 'flags', otherwise EINVAL is returned.
  *
  *  - ALIVE is answered by umtx_shm_alive().
  *  - CREAT finds or creates the registration keyed by 'addr'.
  *  - LOOKUP and DESTROY require an existing registration (ESRCH if none).
  *  - DESTROY force-unreferences the registration.
  *  - CREAT and LOOKUP allocate a close-on-exec file descriptor for the
  *    registration's shm object and return it in td_retval[0].
  *
  * The reference obtained by the lookup/creation is dropped before
  * returning.
  */
 static int
 umtx_shm(struct thread *td, void *addr, u_int flags)
 {
 	struct umtx_key key;
 	struct umtx_shm_reg *reg;
 	struct file *fp;
 	int error, fd;
 
 	if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP |
 	    UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1)
 		return (EINVAL);
 	if ((flags & UMTX_SHM_ALIVE) != 0)
 		return (umtx_shm_alive(td, addr));
 	error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key);
 	if (error != 0)
 		return (error);
 	KASSERT(key.shared == 1, ("non-shared key"));
 	if ((flags & UMTX_SHM_CREAT) != 0) {
 		error = umtx_shm_create_reg(td, &key, &reg);
 	} else {
 		reg = umtx_shm_find_reg(&key);
 		if (reg == NULL)
 			error = ESRCH;
 	}
 	umtx_key_release(&key);
 	if (error != 0)
 		return (error);
 	KASSERT(reg != NULL, ("no reg"));
 	if ((flags & UMTX_SHM_DESTROY) != 0) {
 		/* Drops the reference held by the registration's linkage. */
 		umtx_shm_unref_reg(reg, true);
 	} else {
 		/* The MAC and access checks below are compiled out. */
 #if 0
 #ifdef MAC
 		error = mac_posixshm_check_open(td->td_ucred,
 		    reg->ushm_obj, FFLAGS(O_RDWR));
 		if (error == 0)
 #endif
 			error = shm_access(reg->ushm_obj, td->td_ucred,
 			    FFLAGS(O_RDWR));
 		if (error == 0)
 #endif
 			error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL);
 		if (error == 0) {
 			/* The new file takes its own hold on the shm object. */
 			shm_hold(reg->ushm_obj);
 			finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj,
 			    &shm_ops);
 			td->td_retval[0] = fd;
 			fdrop(fp, td);
 		}
 	}
 	/* Drop the reference taken by the lookup/creation above. */
 	umtx_shm_unref_reg(reg, false);
 	return (error);
 }
 
 /*
  * UMTX_OP_SHM handler: call umtx_shm() with uaddr1 as the address and val
  * as the flags.
  */
 static int
 __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap)
 {
 
 	return (umtx_shm(td, uap->uaddr1, uap->val));
 }
 
 /*
  * Record the three robust-list values supplied in *rbp in the thread
  * structure (td_rb_list, td_rbp_list and td_rb_inact).  Always returns 0.
  */
 static int
 umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp)
 {
 
 	td->td_rb_list = rbp->robust_list_offset;
 	td->td_rbp_list = rbp->robust_priv_list_offset;
 	td->td_rb_inact = rbp->robust_inact_offset;
 	return (0);
 }
 
 static int
 __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct umtx_robust_lists_params rb;
 	int error;
 
 	if (uap->val > sizeof(rb))
 		return (EINVAL);
 	bzero(&rb, sizeof(rb));
 	error = copyin(uap->uaddr1, &rb, uap->val);
 	if (error != 0)
 		return (error);
 	return (umtx_robust_lists(td, &rb));
 }
 
 /* Signature shared by every _umtx_op operation handler. */
 typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
 
 /*
  * Native dispatch table, indexed by the UMTX_OP_* operation number.
  * The old semaphore operations are only implemented when a
  * COMPAT_FREEBSD9/COMPAT_FREEBSD10 kernel option is configured.
  */
 static const _umtx_op_func op_table[] = {
 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
 	[UMTX_OP_WAIT]		= __umtx_op_wait,
 	[UMTX_OP_WAKE]		= __umtx_op_wake,
 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex,
 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait,
 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_uint,
 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock,
 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock,
 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private,
 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex,
 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
 #else
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
 #endif
 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private,
 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait,
 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
 	[UMTX_OP_SHM]		= __umtx_op_shm,
 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists,
 };
 
 /*
  * _umtx_op system call entry: dispatch through op_table, rejecting
  * negative or out-of-range operation numbers with EINVAL.
  */
 int
 sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
 {
 	unsigned int op;

 	/* The unsigned conversion folds negative values into "too large". */
 	op = (unsigned)uap->op;
 	if (op >= nitems(op_table))
 		return (EINVAL);
 	return (op_table[op](td, uap));
 }
 
 #ifdef COMPAT_FREEBSD32
 
 /* struct timespec as laid out for a 32-bit process. */
 struct timespec32 {
 	int32_t tv_sec;
 	int32_t tv_nsec;
 };
 
 /*
  * 32-bit layout of the timeout argument; converted to struct _umtx_time
  * by umtx_copyin_umtx_time32().
  */
 struct umtx_time32 {
 	struct	timespec32	timeout;
 	uint32_t		flags;
 	uint32_t		clockid;
 };
 
 static inline int
 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
 {
 	struct timespec32 ts32;
 	int error;
 
 	error = copyin(addr, &ts32, sizeof(struct timespec32));
 	if (error == 0) {
 		if (ts32.tv_sec < 0 ||
 		    ts32.tv_nsec >= 1000000000 ||
 		    ts32.tv_nsec < 0)
 			error = EINVAL;
 		else {
 			tsp->tv_sec = ts32.tv_sec;
 			tsp->tv_nsec = ts32.tv_nsec;
 		}
 	}
 	return (error);
 }
 
 /*
  * Copy a 32-bit timeout argument of 'size' bytes in from 'addr' and
  * convert it to the native struct _umtx_time.  A size no larger than a
  * timespec32 means only the timespec was supplied; flags then default
  * to 0 and the clock to CLOCK_REALTIME.  Returns the copyin() error, or
  * EINVAL for an out-of-range timespec.
  */
 static inline int
 umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
 {
 	struct umtx_time32 ut32;
 	size_t len;
 	int error;

 	/* Defaults used when the caller supplies only a bare timespec. */
 	ut32.flags = 0;
 	ut32.clockid = CLOCK_REALTIME;

 	/*
 	 * 'timeout' is the first member, so both layouts are copied to
 	 * the start of ut32.
 	 */
 	if (size <= sizeof(struct timespec32))
 		len = sizeof(struct timespec32);
 	else
 		len = sizeof(struct umtx_time32);
 	error = copyin(addr, &ut32, len);
 	if (error != 0)
 		return (error);

 	if (ut32.timeout.tv_sec < 0 || ut32.timeout.tv_nsec < 0 ||
 	    ut32.timeout.tv_nsec >= 1000000000)
 		return (EINVAL);

 	tp->_clockid = ut32.clockid;
 	tp->_flags = ut32.flags;
 	tp->_timeout.tv_nsec = ut32.timeout.tv_nsec;
 	tp->_timeout.tv_sec = ut32.timeout.tv_sec;
 	return (0);
 }
 
 static int
 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 			(size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0));
 }
 
 static int
 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time(uap->uaddr2,
 			    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, 0));
 }
 
 static int
 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2, 
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT));
 }
 
 static int
 __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct timespec *ts, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		ts = NULL;
 	else {
 		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
 		if (error != 0)
 			return (error);
 		ts = &timeout;
 	}
 	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
 }
 
 static int
 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
 
 static int
 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(
 		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1));
 }
 
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 static int
 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
 		tm_p = NULL;
 	else {
 		error = umtx_copyin_umtx_time32(uap->uaddr2,
 		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	return (do_sem_wait(td, uap->obj, tm_p));
 }
 #endif
 
 /*
  * 32-bit UMTX_OP_SEM2_WAIT handler.  uaddr2 optionally points at the
  * 32-bit timeout whose size is passed in uaddr1; a NULL uaddr2 means
  * wait forever.
  *
  * When a relative (non-UMTX_ABSTIME) wait is interrupted with EINTR and
  * the caller-supplied size leaves room for a timespec32 after the
  * umtx_time32, the contents of timeout._timeout are copied out to that
  * trailing slot.  A copyout failure replaces EINTR as the return value.
  */
 static int
 __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct _umtx_time *tm_p, timeout;
 	size_t uasize;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL) {
 		uasize = 0;
 		tm_p = NULL;
 	} else {
 		uasize = (size_t)uap->uaddr1;
 		error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout);
 		if (error != 0)
 			return (error);
 		tm_p = &timeout;
 	}
 	error = do_sem2_wait(td, uap->obj, tm_p);
 	/*
 	 * 'timeout' is only initialized when uaddr2 != NULL, which the
 	 * condition checks before reading it.
 	 * NOTE(review): presumably do_sem2_wait() leaves the remaining
 	 * time in timeout._timeout -- confirm.
 	 */
 	if (error == EINTR && uap->uaddr2 != NULL &&
 	    (timeout._flags & UMTX_ABSTIME) == 0 &&
 	    uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) {
 		struct timespec32 remain32 = {
 			.tv_sec = timeout._timeout.tv_sec,
 			.tv_nsec = timeout._timeout.tv_nsec
 		};
 		error = copyout(&remain32,
 		    (struct umtx_time32 *)uap->uaddr2 + 1,
 		    sizeof(struct timespec32));
 		if (error == 0) {
 			error = EINTR;
 		}
 	}
 
 	return (error);
 }
 
 /*
  * 32-bit UMTX_OP_NWAKE_PRIVATE handler.  uap->obj points at a user
  * array of uap->val 32-bit addresses; every waiter on each address is
  * woken.  The array is processed in batches of at most BATCH_SIZE
  * entries, yielding between batches.  Returns 0 or the first copyin()
  * error.
  */
 static int
 __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
 {
 	uint32_t uaddrs[BATCH_SIZE], *upp;
 	int count, error, i, pos, tocopy;

 	/*
 	 * The user array holds 32-bit pointers, so it must be walked
 	 * with a 4-byte stride regardless of the kernel's pointer size.
 	 */
 	upp = (uint32_t *)uap->obj;
 	error = 0;
 	for (count = uap->val, pos = 0; count > 0; count -= tocopy,
 	    pos += tocopy) {
 		tocopy = MIN(count, BATCH_SIZE);
 		error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t));
 		if (error != 0)
 			break;
 		for (i = 0; i < tocopy; ++i)
 			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
 			    INT_MAX, 1);
 		maybe_yield();
 	}
 	return (error);
 }
 
 /*
  * 32-bit layout of the UMTX_OP_ROBUST_LISTS argument; each field is
  * widened into the same-named field of struct umtx_robust_lists_params.
  */
 struct umtx_robust_lists_params_compat32 {
 	uint32_t	robust_list_offset;
 	uint32_t	robust_priv_list_offset;
 	uint32_t	robust_inact_offset;
 };
 
 static int
 __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
 	struct umtx_robust_lists_params rb;
 	struct umtx_robust_lists_params_compat32 rb32;
 	int error;
 
 	if (uap->val > sizeof(rb32))
 		return (EINVAL);
 	bzero(&rb, sizeof(rb));
 	bzero(&rb32, sizeof(rb32));
 	error = copyin(uap->uaddr1, &rb32, uap->val);
 	if (error != 0)
 		return (error);
 	rb.robust_list_offset = rb32.robust_list_offset;
 	rb.robust_priv_list_offset = rb32.robust_priv_list_offset;
 	rb.robust_inact_offset = rb32.robust_inact_offset;
 	return (umtx_robust_lists(td, &rb));
 }
 
 /*
  * Dispatch table for 32-bit processes, indexed by the UMTX_OP_*
  * operation number.  Operations whose arguments contain a timeout or
  * pointers read from user memory get a *_compat32 handler; the rest
  * share the native handlers.
  */
 static const _umtx_op_func op_table_compat32[] = {
 	[UMTX_OP_RESERVED0]	= __umtx_op_unimpl,
 	[UMTX_OP_RESERVED1]	= __umtx_op_unimpl,
 	[UMTX_OP_WAIT]		= __umtx_op_wait_compat32,
 	[UMTX_OP_WAKE]		= __umtx_op_wake,
 	[UMTX_OP_MUTEX_TRYLOCK]	= __umtx_op_trylock_umutex,
 	[UMTX_OP_MUTEX_LOCK]	= __umtx_op_lock_umutex_compat32,
 	[UMTX_OP_MUTEX_UNLOCK]	= __umtx_op_unlock_umutex,
 	[UMTX_OP_SET_CEILING]	= __umtx_op_set_ceiling,
 	[UMTX_OP_CV_WAIT]	= __umtx_op_cv_wait_compat32,
 	[UMTX_OP_CV_SIGNAL]	= __umtx_op_cv_signal,
 	[UMTX_OP_CV_BROADCAST]	= __umtx_op_cv_broadcast,
 	[UMTX_OP_WAIT_UINT]	= __umtx_op_wait_compat32,
 	[UMTX_OP_RW_RDLOCK]	= __umtx_op_rw_rdlock_compat32,
 	[UMTX_OP_RW_WRLOCK]	= __umtx_op_rw_wrlock_compat32,
 	[UMTX_OP_RW_UNLOCK]	= __umtx_op_rw_unlock,
 	[UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32,
 	[UMTX_OP_WAKE_PRIVATE]	= __umtx_op_wake_private,
 	[UMTX_OP_MUTEX_WAIT]	= __umtx_op_wait_umutex_compat32,
 	[UMTX_OP_MUTEX_WAKE]	= __umtx_op_wake_umutex,
 #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10)
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_sem_wait_compat32,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_sem_wake,
 #else
 	[UMTX_OP_SEM_WAIT]	= __umtx_op_unimpl,
 	[UMTX_OP_SEM_WAKE]	= __umtx_op_unimpl,
 #endif
 	[UMTX_OP_NWAKE_PRIVATE]	= __umtx_op_nwake_private32,
 	[UMTX_OP_MUTEX_WAKE2]	= __umtx_op_wake2_umutex,
 	[UMTX_OP_SEM2_WAIT]	= __umtx_op_sem2_wait_compat32,
 	[UMTX_OP_SEM2_WAKE]	= __umtx_op_sem2_wake,
 	[UMTX_OP_SHM]		= __umtx_op_shm,
 	[UMTX_OP_ROBUST_LISTS]	= __umtx_op_robust_lists_compat32,
 };
 
 /*
  * 32-bit _umtx_op system call entry: dispatch through
  * op_table_compat32, rejecting negative or out-of-range operation
  * numbers with EINVAL.  The argument structure is passed to the
  * handlers as a struct _umtx_op_args.
  */
 int
 freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
 {
 	unsigned int op;

 	/* The unsigned conversion folds negative values into "too large". */
 	op = (unsigned)uap->op;
 	if (op >= nitems(op_table_compat32))
 		return (EINVAL);
 	return (op_table_compat32[op](td, (struct _umtx_op_args *)uap));
 }
 #endif
 
 void
 umtx_thread_init(struct thread *td)
 {
 
 	td->td_umtxq = umtxq_alloc();
 	td->td_umtxq->uq_thread = td;
 }
 
 /*
  * Release the umtx queue entry attached to the thread by
  * umtx_thread_init().
  */
 void
 umtx_thread_fini(struct thread *td)
 {
 
 	umtxq_free(td->td_umtxq);
 }
 
 /*
  * Called when a new thread is created, e.g. by fork().  Resets the
  * inherited priority of the thread's umtx queue entry and asserts that
  * the entry carries no state left over from a previous use.
  */
 void
 umtx_thread_alloc(struct thread *td)
 {
 	struct umtx_q *uq;
 
 	uq = td->td_umtxq;
 	uq->uq_inherited_pri = PRI_MAX;
 
 	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
 	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
 	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
 	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
 }
 
 /*
  * exec() hook.
  *
  * Clear robust lists for all process' threads, not delaying the
  * cleanup to thread_exit hook, since the relevant address space is
  * destroyed right now.
  *
  * Must be called for curproc, which must either never have had other
  * threads or be stopped for single-threading; both are asserted.
  */
 static void
 umtx_exec_hook(void *arg __unused, struct proc *p,
     struct image_params *imgp __unused)
 {
 	struct thread *td;
 
 	KASSERT(p == curproc, ("need curproc"));
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_HADTHREADS) == 0 ||
 	    (p->p_flag & P_STOPPED_SINGLE) != 0,
 	    ("curproc must be single-threaded"));
 	FOREACH_THREAD_IN_PROC(p, td) {
 		KASSERT(td == curthread ||
 		    ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)),
 		    ("running thread %p %p", p, td));
 		/*
 		 * The process lock is dropped around the cleanup, which
 		 * reads user memory.
 		 * NOTE(review): presumably because those accesses may
 		 * fault and sleep -- confirm.
 		 */
 		PROC_UNLOCK(p);
 		umtx_thread_cleanup(td);
 		PROC_LOCK(p);
 		/* Forget the robust list pointers of the old image. */
 		td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0;
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * thread_exit() hook: run the umtx cleanup (PI mutex disowning and
  * robust list processing) for the exiting thread.
  */
 void
 umtx_thread_exit(struct thread *td)
 {
 
 	umtx_thread_cleanup(td);
 }
 
 /*
  * Fetch a pointer-sized word from user address 'ptr' into *res, using
  * a 32-bit fetch when the thread's process is flagged SV_ILP32 (kernels
  * built with COMPAT_FREEBSD32 only).  Returns 0 on success or EFAULT if
  * the fetch fails; *res is left untouched on failure.
  */
 static int
 umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res)
 {
 	u_long res1;
 #ifdef COMPAT_FREEBSD32
 	uint32_t res32;
 #endif
 	int error;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		error = fueword32((void *)ptr, &res32);
 		if (error == 0)
 			res1 = res32;
 	} else
 #endif
 	{
 		error = fueword((void *)ptr, &res1);
 	}
 	/* Any fetch failure is reported as EFAULT. */
 	if (error == 0)
 		*res = res1;
 	else
 		error = EFAULT;
 	return (error);
 }
 
 /*
  * Extract the robust list link (m_rb_lnk) from a umutex image already
  * copied into kernel memory.  For an SV_ILP32 process the bytes are
  * reinterpreted through the 32-bit struct umutex32 layout first.
  */
 static void
 umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list)
 {
 #ifdef COMPAT_FREEBSD32
 	struct umutex32 m32;
 
 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 		memcpy(&m32, m, sizeof(m32));
 		*rb_list = m32.m_rb_lnk;
 	} else
 #endif
 		*rb_list = m->m_rb_lnk;
 }
 
 /*
  * Process one robust mutex at user address 'rbp' on behalf of 'td'.
  *
  * If 'rb_list' is not NULL it receives the mutex's link to the next
  * list entry, even when the mutex is subsequently rejected.  'inact'
  * marks the mutex as the one recorded in the "in action" slot.
  *
  * Returns the copyin() error, EINVAL if the mutex is not marked
  * UMUTEX_ROBUST or is not owned by 'td' (ownership mismatch is
  * tolerated and yields 0 when 'inact' is set), otherwise the result of
  * do_unlock_umutex().
  */
 static int
 umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact)
 {
 	struct umutex m;
 	int error;
 
 	KASSERT(td->td_proc == curproc, ("need current vmspace"));
 	error = copyin((void *)rbp, &m, sizeof(m));
 	if (error != 0)
 		return (error);
 	/* Hand back the next link before any validation can bail out. */
 	if (rb_list != NULL)
 		umtx_read_rb_list(td, &m, rb_list);
 	if ((m.m_flags & UMUTEX_ROBUST) == 0)
 		return (EINVAL);
 	/* Compare the owner tid with the contested bit masked off. */
 	if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid)
 		/* inact is cleared after unlock, allow the inconsistency */
 		return (inact ? 0 : EINVAL);
 	return (do_unlock_umutex(td, (struct umutex *)rbp, true));
 }
 
 /*
  * Walk the robust mutex list whose head pointer lives at user address
  * 'rb_list', handling each entry with umtx_handle_rb().  The walk stops
  * at a null link, on the first error, or after umtx_max_rb entries.
  * When an entry matches *rb_inact it is handled as the in-action mutex
  * and *rb_inact is cleared.  'name' only decorates the diagnostics,
  * which are printed when umtx_verbose_rb is set.
  */
 static void
 umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact,
     const char *name)
 {
 	uintptr_t cur;
 	int error, n;
 	bool inact;

 	if (rb_list == 0)
 		return;

 	n = 0;
 	error = umtx_read_uptr(td, rb_list, &cur);
 	while (error == 0 && cur != 0 && n < umtx_max_rb) {
 		inact = (cur == *rb_inact);
 		if (inact)
 			*rb_inact = 0;
 		/* The handler advances 'cur' to the next list entry. */
 		error = umtx_handle_rb(td, cur, &cur, inact);
 		n++;
 	}
 	if (umtx_verbose_rb && n == umtx_max_rb) {
 		uprintf("comm %s pid %d: reached umtx %smax rb %d\n",
 		    td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb);
 	}
 	if (umtx_verbose_rb && error != 0) {
 		uprintf("comm %s pid %d: handling %srb error %d\n",
 		    td->td_proc->p_comm, td->td_proc->p_pid, name, error);
 	}
 }
 
 /*
  * Clean up umtx data.
  *
  * Disowns every PI mutex on the thread's contested list and resets its
  * lent user priority, then processes the shared and private robust
  * mutex lists and finally the "in action" mutex if neither list
  * contained it.
  */
 static void
 umtx_thread_cleanup(struct thread *td)
 {
 	struct umtx_q *uq;
 	struct umtx_pi *pi;
 	uintptr_t rb_inact;
 
 	/*
 	 * Disown pi mutexes.
 	 */
 	uq = td->td_umtxq;
 	if (uq != NULL) {
 		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = PRI_MAX;
 		while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
 			pi->pi_owner = NULL;
 			TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
 		}
 		mtx_unlock(&umtx_lock);
 		thread_lock(td);
 		sched_lend_user_prio(td, PRI_MAX);
 		thread_unlock(td);
 	}
 
 	/*
 	 * Handle terminated robust mutexes.  Must be done after
 	 * robust pi disown, otherwise unlock could see unowned
 	 * entries.
 	 */
 	rb_inact = td->td_rb_inact;
 	/*
 	 * td_rb_inact is the user address of the slot; replace it with
 	 * the mutex address stored there.  The read error is ignored.
 	 */
 	if (rb_inact != 0)
 		(void)umtx_read_uptr(td, rb_inact, &rb_inact);
 	umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, "");
 	umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv ");
 	/* Not found on either list: handle the in-action mutex directly. */
 	if (rb_inact != 0)
 		(void)umtx_handle_rb(td, rb_inact, NULL, true);
 }
Index: head/sys/kern/subr_sleepqueue.c
===================================================================
--- head/sys/kern/subr_sleepqueue.c	(revision 315279)
+++ head/sys/kern/subr_sleepqueue.c	(revision 315280)
@@ -1,1382 +1,1437 @@
 /*-
  * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Implementation of sleep queues used to hold queue of threads blocked on
  * a wait channel.  Sleep queues are different from turnstiles in that wait
  * channels are not owned by anyone, so there is no priority propagation.
  * Sleep queues can also provide a timeout and can also be interrupted by
  * signals.  That said, there are several similarities between the turnstile
  * and sleep queue implementations.  (Note: turnstiles were implemented
  * first.)  For example, both use a hash table of the same size where each
  * bucket is referred to as a "chain" that contains both a spin lock and
  * a linked list of queues.  An individual queue is located by using a hash
  * to pick a chain, locking the chain, and then walking the chain searching
  * for the queue.  This means that a wait channel object does not need to
  * embed its queue head just as locks do not embed their turnstile queue
  * head.  Threads also carry around a sleep queue that they lend to the
  * wait channel when blocking.  Just as in turnstiles, the queue includes
  * a free list of the sleep queues of other threads blocked on the same
  * wait channel in the case of multiple waiters.
  *
  * Some additional functionality provided by sleep queues include the
  * ability to set a timeout.  The timeout is managed using a per-thread
  * callout that resumes a thread if it is asleep.  A thread may also
  * catch signals while it is asleep (aka an interruptible sleep).  The
  * signal code uses sleepq_abort() to interrupt a sleeping thread.  Finally,
  * sleep queues also provide some extra assertions.  One is not allowed to
  * mix the sleep/wakeup and cv APIs for a given wait channel.  Also, one
  * must consistently use the same lock to synchronize with a wait channel,
  * though this check is currently only a warning for sleep/wakeup due to
  * pre-existing abuse of that API.  The same lock must also be held when
  * awakening threads, though that is currently only enforced for condition
  * variables.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_sleepqueue_profiling.h"
 #include "opt_ddb.h"
 #include "opt_sched.h"
 #include "opt_stack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
+#include <sys/time.h>
 
+#include <machine/atomic.h>
+
 #include <vm/uma.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 
 /*
  * Constants for the hash table of sleep queue chains.
  * SC_TABLESIZE must be a power of two for SC_MASK to work properly.
  */
 #define	SC_TABLESIZE	256			/* Must be power of 2. */
 #define	SC_MASK		(SC_TABLESIZE - 1)
 #define	SC_SHIFT	8
 /* Fold the wait channel address into a chain index. */
 #define	SC_HASH(wc)	((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
 			    SC_MASK)
 /*
  * Chain for a wait channel.  The expansion is parenthesized so that the
  * result can be used in any expression, e.g. SC_LOOKUP(wc)->sc_lock.
  */
 #define	SC_LOOKUP(wc)	(&sleepq_chains[SC_HASH(wc)])
 /* Number of blocked-thread queues kept per sleep queue. */
 #define	NR_SLEEPQS	2
 /*
  * There are two different lists of sleep queues.  Both lists are connected
  * via the sq_hash entries.  The first list is the sleep queue chain list
  * that a sleep queue is on when it is attached to a wait channel.  The
  * second list is the free list hung off of a sleep queue that is attached
  * to a wait channel.
  *
  * Each sleep queue also contains the wait channel it is attached to, the
  * list of threads blocked on that wait channel, flags specific to the
  * wait channel, and the lock used to synchronize with a wait channel.
  * The flags are used to catch mismatches between the various consumers
  * of the sleep queue API (e.g. sleep/wakeup and condition variables).
  * The lock pointer is only used when invariants are enabled for various
  * debugging checks.
  *
  * Locking key:
  *  c - sleep queue chain lock
  */
 /* Sleep queue attached to one wait channel; see the description above. */
 struct sleepqueue {
 	TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS];	/* (c) Blocked threads. */
 	u_int sq_blockedcnt[NR_SLEEPQS];	/* (c) N. of blocked threads. */
 	LIST_ENTRY(sleepqueue) sq_hash;		/* (c) Chain and free list. */
 	LIST_HEAD(, sleepqueue) sq_free;	/* (c) Free queues. */
 	void	*sq_wchan;			/* (c) Wait channel. */
 	int	sq_type;			/* (c) Queue type. */
 #ifdef INVARIANTS
 	struct lock_object *sq_lock;		/* (c) Associated lock. */
 #endif
 };
 
 /*
  * One bucket of the sleep queue hash table: the queues hashing to this
  * chain and the spin lock protecting them.  The depth counters exist
  * only in SLEEPQUEUE_PROFILING kernels.
  */
 struct sleepqueue_chain {
 	LIST_HEAD(, sleepqueue) sc_queues;	/* List of sleep queues. */
 	struct mtx sc_lock;			/* Spin lock for this chain. */
 #ifdef SLEEPQUEUE_PROFILING
 	u_int	sc_depth;			/* Length of sc_queues. */
 	u_int	sc_max_depth;			/* Max length of sc_queues. */
 #endif
 };
 
 #ifdef SLEEPQUEUE_PROFILING
 /* Largest chain depth observed across all chains (debug.sleepq.max_depth). */
 u_int sleepq_max_depth;
 static SYSCTL_NODE(_debug, OID_AUTO, sleepq, CTLFLAG_RD, 0, "sleepq profiling");
 static SYSCTL_NODE(_debug_sleepq, OID_AUTO, chains, CTLFLAG_RD, 0,
     "sleepq chain stats");
 SYSCTL_UINT(_debug_sleepq, OID_AUTO, max_depth, CTLFLAG_RD, &sleepq_max_depth,
     0, "maxmimum depth achieved of a single chain");
 
 static void	sleepq_profile(const char *wmesg);
 static int	prof_enabled;
 #endif
 /* Hash table of sleep queue chains, indexed by SC_HASH(wchan). */
 static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
 /* UMA zone that backs sleepq_alloc() and sleepq_free(). */
 static uma_zone_t sleepq_zone;
 
 /*
  * Prototypes for non-exported routines.
  */
 static int	sleepq_catch_signals(void *wchan, int pri);
 static int	sleepq_check_signals(void);
 static int	sleepq_check_timeout(void);
 #ifdef INVARIANTS
 static void	sleepq_dtor(void *mem, int size, void *arg);
 #endif
 static int	sleepq_init(void *mem, int size, int flags);
 static int	sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
 		    int pri);
 static void	sleepq_switch(void *wchan, int pri);
 static void	sleepq_timeout(void *arg);
 
 SDT_PROBE_DECLARE(sched, , , sleep);
 SDT_PROBE_DECLARE(sched, , , wakeup);
 
 /*
  * Initialize SLEEPQUEUE_PROFILING specific sysctl nodes.
  * Note that it must happen after sleepinit() has been fully executed, so
  * it must happen after SI_SUB_KMEM SYSINIT() subsystem setup.
  */
 #ifdef SLEEPQUEUE_PROFILING
 static void
 init_sleepqueue_profiling(void)
 {
 	char chain_name[10];
 	struct sysctl_oid *chain_oid;
 	u_int i;
 
 	/*
 	 * Create one node per chain, named by its index, under the
 	 * chains node, with read-only "depth" and "max_depth" leaves.
 	 */
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		snprintf(chain_name, sizeof(chain_name), "%u", i);
 		chain_oid = SYSCTL_ADD_NODE(NULL,
 		    SYSCTL_STATIC_CHILDREN(_debug_sleepq_chains), OID_AUTO,
 		    chain_name, CTLFLAG_RD, NULL, "sleepq chain stats");
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "depth", CTLFLAG_RD, &sleepq_chains[i].sc_depth, 0, NULL);
 		SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
 		    "max_depth", CTLFLAG_RD, &sleepq_chains[i].sc_max_depth, 0,
 		    NULL);
 	}
 }
 
 SYSINIT(sleepqueue_profiling, SI_SUB_LOCK, SI_ORDER_ANY,
     init_sleepqueue_profiling, NULL);
 #endif
 
 /*
  * Early initialization of sleep queues that is called from the sleepinit()
  * SYSINIT.  Sets up every hash chain and its recursive spin lock,
  * creates the sleep queue UMA zone and gives thread0 its sleep queue.
  */
 void
 init_sleepqueues(void)
 {
 	int i;
 
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		LIST_INIT(&sleepq_chains[i].sc_queues);
 		mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
 		    MTX_SPIN | MTX_RECURSE);
 	}
 	/* The sleepq_dtor hook is only registered in INVARIANTS kernels. */
 	sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
 #ifdef INVARIANTS
 	    NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
 #else
 	    NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
 #endif
 
 	thread0.td_sleepqueue = sleepq_alloc();
 }
 
 /*
  * Get a sleep queue for a new thread.  The queue comes from sleepq_zone
  * and is requested with M_WAITOK.
  */
 struct sleepqueue *
 sleepq_alloc(void)
 {
 
 	return (uma_zalloc(sleepq_zone, M_WAITOK));
 }
 
 /*
  * Free a sleep queue when a thread is destroyed, returning it to
  * sleepq_zone.
  */
 void
 sleepq_free(struct sleepqueue *sq)
 {
 
 	uma_zfree(sleepq_zone, sq);
 }
 
 /*
  * Acquire the spin lock of the chain that the given wait channel
  * hashes to.
  */
 void
 sleepq_lock(void *wchan)
 {
 	struct sleepqueue_chain *chain;

 	chain = SC_LOOKUP(wchan);
 	mtx_lock_spin(&chain->sc_lock);
 }
 
 /*
  * Look up the sleep queue associated with a given wait channel in the hash
  * table locking the associated sleep queue chain.  If no queue is found in
  * the table, NULL is returned.
  */
 struct sleepqueue *
 sleepq_lookup(void *wchan)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
 		if (sq->sq_wchan == wchan)
 			return (sq);
 	return (NULL);
 }
 
 /*
  * Drop the spin lock of the chain that the given wait channel hashes
  * to.
  */
 void
 sleepq_release(void *wchan)
 {
 	struct sleepqueue_chain *chain;

 	chain = SC_LOOKUP(wchan);
 	mtx_unlock_spin(&chain->sc_lock);
 }
 
 /*
  * Places the current thread on the sleep queue for the specified wait
  * channel.  If INVARIANTS is enabled, then it associates the passed in
  * lock with the sleepq to make sure it is held when that sleep queue is
  * woken up.
  *
  * The chain lock for 'wchan' must be held.  'wmesg' is recorded as the
  * thread's wait message, 'flags' supplies the queue type (SLEEPQ_TYPE
  * bits) and SLEEPQ_INTERRUPTIBLE, and 'queue' selects which of the
  * NR_SLEEPQS blocked lists the thread is appended to.
  */
 void
 sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
     int queue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(td->td_sleepqueue != NULL);
 	MPASS(wchan != NULL);
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 
 	/* If this thread is not allowed to sleep, die a horrible death. */
 	KASSERT(td->td_no_sleeping == 0,
 	    ("%s: td %p to sleep on wchan %p with sleeping prohibited",
 	    __func__, td, wchan));
 
 	/* Look up the sleep queue associated with the wait channel 'wchan'. */
 	sq = sleepq_lookup(wchan);
 
 	/*
 	 * If the wait channel does not already have a sleep queue, use
 	 * this thread's sleep queue.  Otherwise, insert the current thread
 	 * into the sleep queue already in use by this wait channel.
 	 */
 	if (sq == NULL) {
 #ifdef INVARIANTS
 		int i;
 
 		/* The donated queue must be completely idle. */
 		sq = td->td_sleepqueue;
 		for (i = 0; i < NR_SLEEPQS; i++) {
 			KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
 			    ("thread's sleep queue %d is not empty", i));
 			KASSERT(sq->sq_blockedcnt[i] == 0,
 			    ("thread's sleep queue %d count mismatches", i));
 		}
 		KASSERT(LIST_EMPTY(&sq->sq_free),
 		    ("thread's sleep queue has a non-empty free list"));
 		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
 		sq->sq_lock = lock;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		/* Track the per-chain and global maximum chain depth. */
 		sc->sc_depth++;
 		if (sc->sc_depth > sc->sc_max_depth) {
 			sc->sc_max_depth = sc->sc_depth;
 			if (sc->sc_max_depth > sleepq_max_depth)
 				sleepq_max_depth = sc->sc_max_depth;
 		}
 #endif
 		sq = td->td_sleepqueue;
 		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
 		sq->sq_wchan = wchan;
 		sq->sq_type = flags & SLEEPQ_TYPE;
 	} else {
 		MPASS(wchan == sq->sq_wchan);
 		MPASS(lock == sq->sq_lock);
 		MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
 		/* Park this thread's own queue on the free list. */
 		LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
 	}
 	thread_lock(td);
 	TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
 	sq->sq_blockedcnt[queue]++;
 	/* The thread's queue is now lent to the wait channel. */
 	td->td_sleepqueue = NULL;
 	td->td_sqqueue = queue;
 	td->td_wchan = wchan;
 	td->td_wmesg = wmesg;
 	if (flags & SLEEPQ_INTERRUPTIBLE) {
 		td->td_flags |= TDF_SINTR;
 		td->td_flags &= ~TDF_SLEEPABORT;
 	}
 	thread_unlock(td);
 }
 
 /*
  * Sets a timeout that will remove the current thread from the specified
  * sleep queue if the thread has not already been awakened.  The timeout
  * is given as the sbintime_t 'sbt' with precision 'pr'; 'flags' are
  * passed on to the callout routines.
  *
  * The chain lock for 'wchan' must be held and the thread must already
  * be on the sleep queue (both asserted).
  */
 void
 sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt, sbintime_t pr,
     int flags)
 {
 	struct sleepqueue_chain *sc;
 	struct thread *td;
 	sbintime_t pr1;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(td));
 	MPASS(td->td_sleepqueue == NULL);
 	MPASS(wchan != NULL);
 	if (cold && td == &thread0)
 		panic("timed sleep before timers are working");
 	KASSERT(td->td_sleeptimo == 0, ("td %d %p td_sleeptimo %jx",
 	    td->td_tid, td, (uintmax_t)td->td_sleeptimo));
 	/*
 	 * td_sleeptimo is written under the thread lock.
 	 * NOTE(review): presumably callout_when() stores the precomputed
 	 * expiry and precision that C_PRECALC then consumes -- confirm.
 	 */
 	thread_lock(td);
 	callout_when(sbt, pr, flags, &td->td_sleeptimo, &pr1);
 	thread_unlock(td);
 	callout_reset_sbt_on(&td->td_slpcallout, td->td_sleeptimo, pr1,
 	    sleepq_timeout, td, PCPU_GET(cpuid), flags | C_PRECALC |
 	    C_DIRECT_EXEC);
 }
 
 /*
  * Report how many threads are blocked on the given sub-queue of a wait
  * channel; 0 if no sleep queue is attached to the channel.
  */
 u_int
 sleepq_sleepcnt(void *wchan, int queue)
 {
 	struct sleepqueue *found;

 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	found = sleepq_lookup(wchan);
 	return (found != NULL ? found->sq_blockedcnt[queue] : 0);
 }
 
 /*
  * Marks the pending sleep of the current thread as interruptible and
  * makes an initial check for pending signals before putting a thread
  * to sleep. Enters and exits with the thread lock held.  Thread lock
  * may have transitioned from the sleepq lock to a run lock.
  *
  * Returns 0 after going through sleepq_switch() when nothing was
  * pending.  Otherwise the thread is removed from the sleep queue if it
  * is still on it, the chain lock is released, and the error is
  * returned: EINTR for a pending TDP_WAKEUP, the non-zero result of
  * thread_suspend_check(), TD_SBDRY_ERRNO() when cursig() returns -1, or
  * EINTR/ERESTART for a pending signal depending on ps_sigintr.
  */
 static int
 sleepq_catch_signals(void *wchan, int pri)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	struct proc *p;
 	struct sigacts *ps;
 	int sig, ret;
 
 	ret = 0;
 	td = curthread;
 	p = curproc;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(wchan != NULL);
 	/* A pending TDP_WAKEUP is consumed and reported as EINTR. */
 	if ((td->td_pflags & TDP_WAKEUP) != 0) {
 		td->td_pflags &= ~TDP_WAKEUP;
 		ret = EINTR;
 		thread_lock(td);
 		goto out;
 	}
 
 	/*
 	 * See if there are any pending signals or suspension requests for this
 	 * thread.  If not, we can switch immediately.
 	 */
 	thread_lock(td);
 	if ((td->td_flags & (TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK)) != 0) {
 		/* Both spin locks are dropped before taking the proc lock. */
 		thread_unlock(td);
 		mtx_unlock_spin(&sc->sc_lock);
 		CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
 			(void *)td, (long)p->p_pid, td->td_name);
 		PROC_LOCK(p);
 		/*
 		 * Check for suspension first. Checking for signals and then
 		 * suspending could result in a missed signal, since a signal
 		 * can be delivered while this thread is suspended.
 		 */
 		if ((td->td_flags & TDF_NEEDSUSPCHK) != 0) {
 			ret = thread_suspend_check(1);
 			MPASS(ret == 0 || ret == EINTR || ret == ERESTART);
 			if (ret != 0) {
 				PROC_UNLOCK(p);
 				mtx_lock_spin(&sc->sc_lock);
 				thread_lock(td);
 				goto out;
 			}
 		}
 		if ((td->td_flags & TDF_NEEDSIGCHK) != 0) {
 			ps = p->p_sigacts;
 			mtx_lock(&ps->ps_mtx);
 			sig = cursig(td);
 			if (sig == -1) {
 				mtx_unlock(&ps->ps_mtx);
 				KASSERT((td->td_flags & TDF_SBDRY) != 0,
 				    ("lost TDF_SBDRY"));
 				KASSERT(TD_SBDRY_INTR(td),
 				    ("lost TDF_SERESTART of TDF_SEINTR"));
 				KASSERT((td->td_flags &
 				    (TDF_SEINTR | TDF_SERESTART)) !=
 				    (TDF_SEINTR | TDF_SERESTART),
 				    ("both TDF_SEINTR and TDF_SERESTART"));
 				ret = TD_SBDRY_ERRNO(td);
 			} else if (sig != 0) {
 				/* ps_sigintr selects EINTR over ERESTART. */
 				ret = SIGISMEMBER(ps->ps_sigintr, sig) ?
 				    EINTR : ERESTART;
 				mtx_unlock(&ps->ps_mtx);
 			} else {
 				mtx_unlock(&ps->ps_mtx);
 			}
 		}
 		/*
 		 * Lock the per-process spinlock prior to dropping the PROC_LOCK
 		 * to avoid a signal delivery race.  PROC_LOCK, PROC_SLOCK, and
 		 * thread_lock() are currently held in tdsendsignal().
 		 */
 		PROC_SLOCK(p);
 		mtx_lock_spin(&sc->sc_lock);
 		PROC_UNLOCK(p);
 		thread_lock(td);
 		PROC_SUNLOCK(p);
 	}
 	if (ret == 0) {
 		sleepq_switch(wchan, pri);
 		return (0);
 	}
 out:
 	/*
 	 * There were pending signals and this thread is still
 	 * on the sleep queue, remove it from the sleep queue.
 	 */
 	if (TD_ON_SLEEPQ(td)) {
 		sq = sleepq_lookup(wchan);
 		if (sleepq_resume_thread(sq, td, 0)) {
 #ifdef INVARIANTS
 			/*
 			 * This thread hasn't gone to sleep yet, so it
 			 * should not be swapped out.
 			 */
 			panic("not waking up swapper");
 #endif
 		}
 	}
 	mtx_unlock_spin(&sc->sc_lock);
 	MPASS(td->td_lock != &sc->sc_lock);
 	return (ret);
 }
 
 /*
  * Switches to another thread if we are still asleep on a sleep queue.
  * Must be entered with both the sleep queue chain lock for wchan and the
  * thread lock held (both are asserted).  The chain lock is released on
  * the early-return paths: already woken up, already timed out, or
  * real-time clock adjusted.
  * Returns with thread lock.
  */
 static void
 sleepq_switch(void *wchan, int pri)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
+	bool rtc_changed;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If we have a sleep queue, then we've already been woken up, so
 	 * just return.
 	 */
 	if (td->td_sleepqueue != NULL) {
 		mtx_unlock_spin(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * If TDF_TIMEOUT is set, then our sleep has been timed out
 	 * already but we are still on the sleep queue, so dequeue the
-	 * thread and return.
+	 * thread and return.  Do the same if the real-time clock has
+	 * been adjusted since this thread calculated its timeout
+	 * based on that clock.
+	 * A td_rtcgen of zero never counts as a change, and td_rtcgen
+	 * is reset to zero once a change has been acted upon.
 	 */
-	if (td->td_flags & TDF_TIMEOUT) {
+	rtc_changed = td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation;
+	if ((td->td_flags & TDF_TIMEOUT) || rtc_changed) {
+		if (rtc_changed) {
+			td->td_rtcgen = 0;
+		}
 		MPASS(TD_ON_SLEEPQ(td));
 		sq = sleepq_lookup(wchan);
 		if (sleepq_resume_thread(sq, td, 0)) {
 #ifdef INVARIANTS
 			/*
 			 * This thread hasn't gone to sleep yet, so it
 			 * should not be swapped out.
 			 */
 			panic("not waking up swapper");
 #endif
 		}
 		mtx_unlock_spin(&sc->sc_lock);
 		return;
 	}
 #ifdef SLEEPQUEUE_PROFILING
 	if (prof_enabled)
 		sleepq_profile(td->td_wmesg);
 #endif
 	/*
 	 * Go to sleep for real: the thread's lock is switched over to the
 	 * chain lock, the thread is marked sleeping and the CPU is given
 	 * up voluntarily.
 	 */
 	MPASS(td->td_sleepqueue == NULL);
 	sched_sleep(td, pri);
 	thread_lock_set(td, &sc->sc_lock);
 	SDT_PROBE0(sched, , , sleep);
 	TD_SET_SLEEPING(td);
 	mi_switch(SW_VOL | SWT_SLEEPQ, NULL);
 	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
 	CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 }
 
 /*
  * Check to see if we timed out.
  */
 static int
 sleepq_check_timeout(void)
 {
 	struct thread *td;
 	int res;
 
 	td = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If TDF_TIMEOUT is set, we timed out.  But recheck
 	 * td_sleeptimo anyway.
 	 */
 	res = 0;
 	if (td->td_sleeptimo != 0) {
 		if (td->td_sleeptimo <= sbinuptime())
 			res = EWOULDBLOCK;
 		td->td_sleeptimo = 0;
 	}
 	if (td->td_flags & TDF_TIMEOUT)
 		td->td_flags &= ~TDF_TIMEOUT;
 	else
 		/*
 		 * We ignore the situation where timeout subsystem was
 		 * unable to stop our callout.  The struct thread is
 		 * type-stable, the callout will use the correct
 		 * memory when running.  The checks of the
 		 * td_sleeptimo value in this function and in
 		 * sleepq_timeout() ensure that the thread does not
 		 * get spurious wakeups, even if the callout was reset
 		 * or thread reused.
 		 */
 		callout_stop(&td->td_slpcallout);
 	return (res);
 }
 
 /*
  * Report whether the current thread's sleep was aborted by a signal.
  *
  * The thread lock must be held.  Clears TDF_SINTR, and if the sleep was
  * aborted (TDF_SLEEPABORT) clears that flag too and returns the stored
  * td_intrval; otherwise returns 0.
  */
 static int
 sleepq_check_signals(void)
 {
 	struct thread *self;
 
 	self = curthread;
 	THREAD_LOCK_ASSERT(self, MA_OWNED);
 
 	/* The interruptible sleep, if any, is over. */
 	if ((self->td_flags & TDF_SINTR) != 0)
 		self->td_flags &= ~TDF_SINTR;
 
 	if ((self->td_flags & TDF_SLEEPABORT) == 0)
 		return (0);
 
 	self->td_flags &= ~TDF_SLEEPABORT;
 	return (self->td_intrval);
 }
 
 /*
  * Uninterruptible, untimed sleep: block the calling thread on wchan until
  * something wakes it up.  The thread must not be marked TDF_SINTR.
  */
 void
 sleepq_wait(void *wchan, int pri)
 {
 	struct thread *td = curthread;
 
 	MPASS(!(td->td_flags & TDF_SINTR));
 
 	/* sleepq_switch() wants the thread lock held on entry. */
 	thread_lock(td);
 	sleepq_switch(wchan, pri);
 	thread_unlock(td);
 }
 
 /*
  * Interruptible, untimed sleep: block the calling thread on wchan until
  * it is woken up or a signal interrupts it.
  *
  * A non-zero result from sleepq_catch_signals() takes precedence over
  * the value reported by sleepq_check_signals().
  */
 int
 sleepq_wait_sig(void *wchan, int pri)
 {
 	int caught, aborted;
 
 	caught = sleepq_catch_signals(wchan, pri);
 	aborted = sleepq_check_signals();
 	thread_unlock(curthread);
 
 	return (caught != 0 ? caught : aborted);
 }
 
 /*
  * Uninterruptible, timed sleep: block the calling thread on wchan until
  * it is woken up or its timeout expires.  Returns whatever
  * sleepq_check_timeout() reports.
  */
 int
 sleepq_timedwait(void *wchan, int pri)
 {
 	struct thread *td = curthread;
 	int timed_out;
 
 	MPASS(!(td->td_flags & TDF_SINTR));
 
 	thread_lock(td);
 	sleepq_switch(wchan, pri);
 	timed_out = sleepq_check_timeout();
 	thread_unlock(td);
 
 	return (timed_out);
 }
 
 /*
  * Interruptible, timed sleep: block the calling thread on wchan until it
  * is woken up, a signal interrupts it, or its timeout expires.
  *
  * Result precedence: sleepq_catch_signals(), then sleepq_check_signals(),
  * then sleepq_check_timeout().
  */
 int
 sleepq_timedwait_sig(void *wchan, int pri)
 {
 	int caught, timed_out, aborted;
 
 	caught = sleepq_catch_signals(wchan, pri);
 	timed_out = sleepq_check_timeout();
 	aborted = sleepq_check_signals();
 	thread_unlock(curthread);
 
 	if (caught != 0)
 		return (caught);
 	return (aborted != 0 ? aborted : timed_out);
 }
 
 /*
  * Look up the sleep queue for wchan and return its sq_type, or -1 if no
  * sleep queue currently exists for that wait channel.
  */
 int
 sleepq_type(void *wchan)
 {
 	struct sleepqueue *sq;
 	int type;
 
 	MPASS(wchan != NULL);
 
 	type = -1;
 	sleepq_lock(wchan);
 	sq = sleepq_lookup(wchan);
 	if (sq != NULL)
 		type = sq->sq_type;
 	sleepq_release(wchan);
 
 	return (type);
 }
 
 /*
  * Removes a thread from a sleep queue and makes it
  * runnable.
  *
  * The thread lock and the chain lock of sq's wait channel must both be
  * held (asserted below).  If pri is non-zero the thread's priority may
  * be raised to it.  Returns the result of setrunnable() when the thread
  * was actually sleeping, 0 otherwise; callers in this file treat a
  * non-zero return as a request to call kick_proc0().
  */
 static int
 sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri)
 {
 	struct sleepqueue_chain *sc;
 
 	MPASS(td != NULL);
 	MPASS(sq->sq_wchan != NULL);
 	MPASS(td->td_wchan == sq->sq_wchan);
 	MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sc = SC_LOOKUP(sq->sq_wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 
 	SDT_PROBE2(sched, , , wakeup, td, td->td_proc);
 
 	/* Remove the thread from the queue. */
 	sq->sq_blockedcnt[td->td_sqqueue]--;
 	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
 
 	/*
 	 * Get a sleep queue for this thread.  If this is the last waiter,
 	 * use the queue itself and take it out of the chain, otherwise,
 	 * remove a queue from the free list.
 	 */
 	if (LIST_EMPTY(&sq->sq_free)) {
 		td->td_sleepqueue = sq;
 #ifdef INVARIANTS
 		sq->sq_wchan = NULL;
 #endif
 #ifdef SLEEPQUEUE_PROFILING
 		sc->sc_depth--;
 #endif
 	} else
 		td->td_sleepqueue = LIST_FIRST(&sq->sq_free);
 	/*
 	 * Unlink whichever queue was picked above through its sq_hash
 	 * linkage, which is used both for the chain and for sq_free.
 	 */
 	LIST_REMOVE(td->td_sleepqueue, sq_hash);
 
 	td->td_wmesg = NULL;
 	td->td_wchan = NULL;
 	td->td_flags &= ~TDF_SINTR;
 
 	CTR3(KTR_PROC, "sleepq_wakeup: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, td->td_name);
 
 	/*
 	 * Adjust priority if requested.  Only timeshare threads whose
 	 * current td_priority value is numerically greater than pri are
 	 * changed.
 	 */
 	MPASS(pri == 0 || (pri >= PRI_MIN && pri <= PRI_MAX));
 	if (pri != 0 && td->td_priority > pri &&
 	    PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
 		sched_prio(td, pri);
 
 	/*
 	 * Note that thread td might not be sleeping if it is running
 	 * sleepq_catch_signals() on another CPU or is blocked on its
 	 * proc lock to check signals.  There's no need to mark the
 	 * thread runnable in that case.
 	 */
 	if (TD_IS_SLEEPING(td)) {
 		TD_CLR_SLEEPING(td);
 		return (setrunnable(td));
 	}
 	return (0);
 }
 
 #ifdef INVARIANTS
 /*
  * UMA zone item destructor (INVARIANTS only): assert that a sleep queue
  * being released has no blocked threads left on any of its queues.
  */
 static void
 sleepq_dtor(void *mem, int size, void *arg)
 {
 	struct sleepqueue *sq = mem;
 	int i;
 
 	i = 0;
 	while (i < NR_SLEEPQS) {
 		MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
 		MPASS(sq->sq_blockedcnt[i] == 0);
 		i++;
 	}
 }
 #endif
 
 /*
  * UMA zone item initializer: zero the item and set up the empty blocked
  * queues, their counters and the free list.  Always returns 0.
  */
 static int
 sleepq_init(void *mem, int size, int flags)
 {
 	struct sleepqueue *sq = mem;
 	int i;
 
 	bzero(mem, size);
 	LIST_INIT(&sq->sq_free);
 	i = 0;
 	while (i < NR_SLEEPQS) {
 		TAILQ_INIT(&sq->sq_blocked[i]);
 		sq->sq_blockedcnt[i] = 0;
 		i++;
 	}
 	return (0);
 }
 
 /*
  * Wake up a single thread sleeping on wchan/queue: the one with the
  * numerically lowest td_priority.  Returns 0 if wchan has no sleep queue,
  * otherwise the result of sleepq_resume_thread().
  */
 int
 sleepq_signal(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
 	struct thread *besttd, *cand;
 	int wakeup_swapper;
 
 	CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL)
 		return (0);
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/*
 	 * Scan for the best candidate.  The comparison is strict, so on
 	 * a tie the earlier queue entry wins; threads are appended at the
 	 * tail, which makes that the longest sleeper.
 	 */
 	besttd = TAILQ_FIRST(&sq->sq_blocked[queue]);
 	TAILQ_FOREACH(cand, &sq->sq_blocked[queue], td_slpq) {
 		if (besttd->td_priority > cand->td_priority)
 			besttd = cand;
 	}
 	MPASS(besttd != NULL);
 
 	thread_lock(besttd);
 	wakeup_swapper = sleepq_resume_thread(sq, besttd, pri);
 	thread_unlock(besttd);
 
 	return (wakeup_swapper);
 }
 
 /*
  * Predicate that accepts every thread.  sleepq_broadcast() passes it to
  * sleepq_remove_matching() so that all waiters are resumed.
  */
+static bool
+match_any(struct thread *td __unused)
+{
+	return (true);
+}
+
 /*
  * Resume all threads sleeping on a specified wait channel.
  *
  * Returns 0 if wchan has no sleep queue; otherwise the value returned by
  * sleepq_remove_matching() for the given queue.  No locking is done
  * here -- presumably the caller already holds the chain lock for wchan
  * (TODO confirm against sleepq_lookup()).
  */
 int
 sleepq_broadcast(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
-	struct thread *td, *tdn;
-	int wakeup_swapper;
 
 	CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL)
 		return (0);
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/* match_any accepts every thread, so the whole queue is resumed. */
+	return (sleepq_remove_matching(sq, queue, match_any, pri));
+}
+
+/*
+ * Resume threads on the sleep queue that match the given predicate.
+ *
+ * Every thread blocked on sq->sq_blocked[queue] is visited; matches() is
+ * called with that thread's lock held, and matching threads are resumed
+ * with sleepq_resume_thread(sq, td, pri).  Returns the OR of the
+ * sleepq_resume_thread() results.
+ */
+int
+sleepq_remove_matching(struct sleepqueue *sq, int queue,
+    bool (*matches)(struct thread *), int pri)
+{
+	struct thread *td, *tdn;
+	int wakeup_swapper;
+
 	/*
-	 * Resume all blocked threads on the sleep queue.  The last thread will
-	 * be given ownership of sq and may re-enqueue itself before
-	 * sleepq_resume_thread() returns, so we must cache the "next" queue
-	 * item at the beginning of the final iteration.
+	 * The last thread will be given ownership of sq and may
+	 * re-enqueue itself before sleepq_resume_thread() returns,
+	 * so we must cache the "next" queue item at the beginning
+	 * of the final iteration.
 	 */
 	wakeup_swapper = 0;
 	TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq, tdn) {
 		thread_lock(td);
-		wakeup_swapper |= sleepq_resume_thread(sq, td, pri);
+		if (matches(td))
+			wakeup_swapper |= sleepq_resume_thread(sq, td, pri);
 		thread_unlock(td);
 	}
+
 	return (wakeup_swapper);
 }
 
 /*
  * Time sleeping threads out.  When the timeout expires, the thread is
  * removed from the sleep queue and made runnable if it is still asleep.
  *
  * arg is the struct thread whose sleep is being timed out.  The thread
  * lock is taken for the duration of the checks; kick_proc0() is called
  * only after it has been dropped.
  */
 static void
 sleepq_timeout(void *arg)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	void *wchan;
 	int wakeup_swapper;
 
 	td = arg;
 	wakeup_swapper = 0;
 	CTR3(KTR_PROC, "sleepq_timeout: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 
 	thread_lock(td);
 
 	if (td->td_sleeptimo > sbinuptime() || td->td_sleeptimo == 0) {
 		/*
 		 * The thread does not want a timeout (yet).
 		 */
 	} else if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
 		/*
 		 * See if the thread is asleep and get the wait
 		 * channel if it is.
 		 */
 		wchan = td->td_wchan;
 		sc = SC_LOOKUP(wchan);
 		/* The thread lock is expected to be the chain lock here. */
 		THREAD_LOCKPTR_ASSERT(td, &sc->sc_lock);
 		sq = sleepq_lookup(wchan);
 		MPASS(sq != NULL);
 		td->td_flags |= TDF_TIMEOUT;
 		wakeup_swapper = sleepq_resume_thread(sq, td, 0);
 	} else if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If the thread is on the SLEEPQ but isn't sleeping
 		 * yet, it can either be on another CPU in between
 		 * sleepq_add() and one of the sleepq_*wait*()
 		 * routines or it can be in sleepq_catch_signals().
 		 * Only the flag is set; sleepq_switch() dequeues the
 		 * thread when it sees TDF_TIMEOUT.
 		 */
 		td->td_flags |= TDF_TIMEOUT;
 	}
 
 	thread_unlock(td);
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Resumes a specific thread from the sleep queue associated with a specific
  * wait channel if it is on that queue.
  *
  * Takes and releases the sleep queue lock for wchan itself.  Does nothing
  * if td is not on a sleep queue or is waiting on a different channel.
  */
 void
 sleepq_remove(struct thread *td, void *wchan)
 {
 	struct sleepqueue *sq;
 	int wakeup_swapper;
 
 	/*
 	 * Look up the sleep queue for this wait channel, then re-check
 	 * that the thread is asleep on that channel, if it is not, then
 	 * bail.
 	 */
 	MPASS(wchan != NULL);
 	sleepq_lock(wchan);
 	sq = sleepq_lookup(wchan);
 	/*
 	 * We can not lock the thread here as it may be sleeping on a
 	 * different sleepq.  However, holding the sleepq lock for this
 	 * wchan can guarantee that we do not miss a wakeup for this
 	 * channel.  The asserts below will catch any false positives.
 	 */
 	if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
 		sleepq_release(wchan);
 		return;
 	}
 	/* Thread is asleep on sleep queue sq, so wake it up. */
 	thread_lock(td);
 	MPASS(sq != NULL);
 	MPASS(td->td_wchan == wchan);
 	wakeup_swapper = sleepq_resume_thread(sq, td, 0);
 	thread_unlock(td);
 	sleepq_release(wchan);
 	/* Kick proc0 only after both locks have been dropped. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Abort a thread as if an interrupt had occurred.  Only abort
  * interruptible waits (unfortunately it isn't safe to abort others).
  *
  * The caller must hold the thread lock; td must be on a sleep queue with
  * TDF_SINTR set, and intrval must be EINTR or ERESTART (all asserted).
  * Returns the sleepq_resume_thread() result when the thread was resumed
  * here, 0 otherwise.
  */
 int
 sleepq_abort(struct thread *td, int intrval)
 {
 	struct sleepqueue *sq;
 	void *wchan;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(td));
 	MPASS(td->td_flags & TDF_SINTR);
 	MPASS(intrval == EINTR || intrval == ERESTART);
 
 	/*
 	 * If the TDF_TIMEOUT flag is set, just leave. A
 	 * timeout is scheduled anyhow.
 	 */
 	if (td->td_flags & TDF_TIMEOUT)
 		return (0);
 
 	CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_name);
 	/*
 	 * Record the abort; sleepq_check_signals() later returns
 	 * td_intrval when it finds TDF_SLEEPABORT set.
 	 */
 	td->td_intrval = intrval;
 	td->td_flags |= TDF_SLEEPABORT;
 	/*
 	 * If the thread has not slept yet it will find the signal in
 	 * sleepq_catch_signals() and call sleepq_resume_thread.  Otherwise
 	 * we have to do it here.
 	 */
 	if (!TD_IS_SLEEPING(td))
 		return (0);
 	wchan = td->td_wchan;
 	MPASS(wchan != NULL);
 	sq = sleepq_lookup(wchan);
 	MPASS(sq != NULL);
 
 	/* Thread is asleep on sleep queue sq, so wake it up. */
 	return (sleepq_resume_thread(sq, td, 0));
+}
+
+/*
+ * Walk every sleep queue chain and resume each blocked thread for which
+ * matches() returns true.
+ *
+ * Each non-empty chain is processed under its own spin lock, and
+ * matches() is invoked by sleepq_remove_matching() with the candidate
+ * thread's lock held.  kick_proc0() is called once at the end if any
+ * resume asked for it.
+ */
+void
+sleepq_chains_remove_matching(bool (*matches)(struct thread *))
+{
+	struct sleepqueue_chain *sc;
+	struct sleepqueue *sq, *sq1;
+	int i, wakeup_swapper;
+
+	wakeup_swapper = 0;
+	for (sc = &sleepq_chains[0]; sc < sleepq_chains + SC_TABLESIZE; ++sc) {
+		/* Unlocked peek: skip empty chains without taking the lock. */
+		if (LIST_EMPTY(&sc->sc_queues)) {
+			continue;
+		}
+		mtx_lock_spin(&sc->sc_lock);
+		/*
+		 * Resuming the last waiter unlinks sq from this chain
+		 * and hands it to the woken thread, which may link it
+		 * into another chain right away.  The successor must
+		 * therefore be fetched before sq is processed, hence
+		 * the _SAFE iterator.
+		 */
+		LIST_FOREACH_SAFE(sq, &sc->sc_queues, sq_hash, sq1) {
+			for (i = 0; i < NR_SLEEPQS; ++i) {
+				wakeup_swapper |= sleepq_remove_matching(sq, i,
+				    matches, 0);
+			}
+		}
+		mtx_unlock_spin(&sc->sc_lock);
+	}
+	if (wakeup_swapper) {
+		kick_proc0();
+	}
 }
 
 /*
  * Prints the stacks of all threads presently sleeping on wchan/queue to
  * the sbuf sb.  Sets count_stacks_printed to the number of stacks actually
  * printed.  Typically, this will equal the number of threads sleeping on the
  * queue, but may be less if sb overflowed before all stacks were printed.
  * count_stacks_printed is always written and is 0 when nothing was printed.
  *
  * Returns 0 on success, ENOENT if no sleep queue exists for wchan, ENOMEM
  * if the queue held more threads than the largest scratch allocation
  * attempted (10, 100, then 1000 entries), or the sbuf_error() value of sb
  * if printing to it failed.
  */
 #ifdef STACK
 int
 sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue,
     int *count_stacks_printed)
 {
 	struct thread *td, *td_next;
 	struct sleepqueue *sq;
 	struct stack **st;
 	struct sbuf **td_infos;
 	int i, print_idx, stack_idx, error, stacks_to_allocate;
 	bool finished;
 
 	error = 0;
 	finished = false;
 	/* Never leave the caller's counter unset, whatever path is taken. */
 	*count_stacks_printed = 0;
 
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
 	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 
 	stacks_to_allocate = 10;
 	for (i = 0; i < 3 && !finished ; i++) {
 		/* We cannot malloc while holding the queue's spinlock, so
 		 * we do our mallocs now, and hope it is enough.  If it
 		 * isn't, we will free these, drop the lock, malloc more,
 		 * and try again, up to a point.  After that point we will
 		 * give up and report ENOMEM. We also cannot write to sb
 		 * during this time since the client may have set the
 		 * SBUF_AUTOEXTEND flag on their sbuf, which could cause a
 		 * malloc as we print to it.  So we defer actually printing
 		 * to sb until after we drop the spinlock.
 		 */
 
 		/* Where we will store the stacks. */
 		st = malloc(sizeof(struct stack *) * stacks_to_allocate,
 		    M_TEMP, M_WAITOK);
 		for (stack_idx = 0; stack_idx < stacks_to_allocate;
 		    stack_idx++)
 			st[stack_idx] = stack_create();
 
 		/* Where we will store the td name, tid, etc. */
 		td_infos = malloc(sizeof(struct sbuf *) * stacks_to_allocate,
 		    M_TEMP, M_WAITOK);
 		for (stack_idx = 0; stack_idx < stacks_to_allocate;
 		    stack_idx++)
 			td_infos[stack_idx] = sbuf_new(NULL, NULL,
 			    MAXCOMLEN + sizeof(struct thread *) * 2 + 40,
 			    SBUF_FIXEDLEN);
 
 		sleepq_lock(wchan);
 		sq = sleepq_lookup(wchan);
 		if (sq == NULL) {
 			/* This sleepq does not exist; exit and return ENOENT. */
 			error = ENOENT;
 			finished = true;
 			sleepq_release(wchan);
 			goto loop_end;
 		}
 
 		stack_idx = 0;
 		/* Save thread info */
 		TAILQ_FOREACH_SAFE(td, &sq->sq_blocked[queue], td_slpq,
 		    td_next) {
 			/* Out of scratch space: retry with a larger batch. */
 			if (stack_idx >= stacks_to_allocate)
 				goto loop_end;
 
 			/* Note the td_lock is equal to the sleepq_lock here. */
 			stack_save_td(st[stack_idx], td);
 
 			sbuf_printf(td_infos[stack_idx], "%d: %s %p",
 			    td->td_tid, td->td_name, td);
 
 			++stack_idx;
 		}
 
 		finished = true;
 		sleepq_release(wchan);
 
 		/*
 		 * Print the stacks.  A dedicated index is used so that the
 		 * retry counter of the enclosing loop is left alone, and
 		 * the caller's count only advances for stacks that made it
 		 * into sb without an sbuf error.
 		 */
 		for (print_idx = 0; print_idx < stack_idx; print_idx++) {
 			sbuf_finish(td_infos[print_idx]);
 			sbuf_printf(sb, "--- thread %s: ---\n",
 			    sbuf_data(td_infos[print_idx]));
 			stack_sbuf_print(sb, st[print_idx]);
 			sbuf_printf(sb, "\n");
 
 			error = sbuf_error(sb);
 			if (error != 0)
 				break;
 			*count_stacks_printed = print_idx + 1;
 		}
 
 loop_end:
 		/* Still locked only when the scratch arrays were too small. */
 		if (!finished)
 			sleepq_release(wchan);
 		for (stack_idx = 0; stack_idx < stacks_to_allocate;
 		    stack_idx++)
 			stack_destroy(st[stack_idx]);
 		for (stack_idx = 0; stack_idx < stacks_to_allocate;
 		    stack_idx++)
 			sbuf_delete(td_infos[stack_idx]);
 		free(st, M_TEMP);
 		free(td_infos, M_TEMP);
 		stacks_to_allocate *= 10;
 	}
 
 	if (!finished && error == 0)
 		error = ENOMEM;
 
 	return (error);
 }
 #endif
 
 #ifdef SLEEPQUEUE_PROFILING
 /* Number of statically allocated profiling records (sleepq_profent[]). */
 #define	SLEEPQ_PROF_LOCATIONS	1024
 /* Length passed to sbuf_new_for_sysctl() when dumping the statistics. */
 #define	SLEEPQ_SBUFSIZE		512
 /* One profiling record: a wait message pointer and its counter. */
 struct sleepq_prof {
 	LIST_ENTRY(sleepq_prof) sp_link;
 	const char	*sp_wmesg;
 	long		sp_count;
 };
 
 LIST_HEAD(sqphead, sleepq_prof);
 
 /* Records not yet assigned to a wait message. */
 struct sqphead sleepq_prof_free;
 /* Records in use, bucketed by SC_HASH() of the wmesg pointer. */
 struct sqphead sleepq_hash[SC_TABLESIZE];
 static struct sleepq_prof sleepq_profent[SLEEPQ_PROF_LOCATIONS];
 /* Spin lock taken around updates to the lists above and prof_enabled. */
 static struct mtx sleepq_prof_lock;
 MTX_SYSINIT(sleepq_prof_lock, &sleepq_prof_lock, "sleepq_prof", MTX_SPIN);
 
 static void
 sleepq_profile(const char *wmesg)
 {
 	struct sleepq_prof *sp;
 
 	mtx_lock_spin(&sleepq_prof_lock);
 	if (prof_enabled == 0)
 		goto unlock;
 	LIST_FOREACH(sp, &sleepq_hash[SC_HASH(wmesg)], sp_link)
 		if (sp->sp_wmesg == wmesg)
 			goto done;
 	sp = LIST_FIRST(&sleepq_prof_free);
 	if (sp == NULL)
 		goto unlock;
 	sp->sp_wmesg = wmesg;
 	LIST_REMOVE(sp, sp_link);
 	LIST_INSERT_HEAD(&sleepq_hash[SC_HASH(wmesg)], sp, sp_link);
 done:
 	sp->sp_count++;
 unlock:
 	mtx_unlock_spin(&sleepq_prof_lock);
 	return;
 }
 
 static void
 sleepq_prof_reset(void)
 {
 	struct sleepq_prof *sp;
 	int enabled;
 	int i;
 
 	mtx_lock_spin(&sleepq_prof_lock);
 	enabled = prof_enabled;
 	prof_enabled = 0;
 	for (i = 0; i < SC_TABLESIZE; i++)
 		LIST_INIT(&sleepq_hash[i]);
 	LIST_INIT(&sleepq_prof_free);
 	for (i = 0; i < SLEEPQ_PROF_LOCATIONS; i++) {
 		sp = &sleepq_profent[i];
 		sp->sp_wmesg = NULL;
 		sp->sp_count = 0;
 		LIST_INSERT_HEAD(&sleepq_prof_free, sp, sp_link);
 	}
 	prof_enabled = enabled;
 	mtx_unlock_spin(&sleepq_prof_lock);
 }
 
 static int
 enable_sleepq_prof(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	v = prof_enabled;
 	error = sysctl_handle_int(oidp, &v, v, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == prof_enabled)
 		return (0);
 	if (v == 1)
 		sleepq_prof_reset();
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = !!v;
 	mtx_unlock_spin(&sleepq_prof_lock);
 
 	return (0);
 }
 
 static int
 reset_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
 {
 	int error, v;
 
 	v = 0;
 	error = sysctl_handle_int(oidp, &v, 0, req);
 	if (error)
 		return (error);
 	if (req->newptr == NULL)
 		return (error);
 	if (v == 0)
 		return (0);
 	sleepq_prof_reset();
 
 	return (0);
 }
 
 /*
  * Sysctl handler behind the "stats" knob: writes one "wmesg<TAB>count"
  * line per profiling record into an sbuf bound to the request.
  * Returns the sysctl_wire_old_buffer() error if wiring fails, otherwise
  * the result of sbuf_finish().
  */
 static int
 dump_sleepq_prof_stats(SYSCTL_HANDLER_ARGS)
 {
 	struct sleepq_prof *sp;
 	struct sbuf *sb;
 	int enabled;
 	int error;
 	int i;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = sbuf_new_for_sysctl(NULL, NULL, SLEEPQ_SBUFSIZE, req);
 	sbuf_printf(sb, "\nwmesg\tcount\n");
 	/*
 	 * Profiling is switched off while the hash is walked.  The walk
 	 * itself runs without sleepq_prof_lock held.
 	 * NOTE(review): prof_enabled is sampled before the lock is taken.
 	 */
 	enabled = prof_enabled;
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = 0;
 	mtx_unlock_spin(&sleepq_prof_lock);
 	for (i = 0; i < SC_TABLESIZE; i++) {
 		LIST_FOREACH(sp, &sleepq_hash[i], sp_link) {
 			sbuf_printf(sb, "%s\t%ld\n",
 			    sp->sp_wmesg, sp->sp_count);
 		}
 	}
 	/* Restore the profiling state sampled above. */
 	mtx_lock_spin(&sleepq_prof_lock);
 	prof_enabled = enabled;
 	mtx_unlock_spin(&sleepq_prof_lock);
 
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /* Read-only string node "stats": dumps the per-wmesg sleep counters. */
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, dump_sleepq_prof_stats, "A", "Sleepqueue profiling statistics");
 /* Read-write integer node "reset": a non-zero write clears the counters. */
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, reset_sleepq_prof_stats, "I",
     "Reset sleepqueue profiling statistics");
 /* Read-write integer node "enable": turns profiling on or off. */
 SYSCTL_PROC(_debug_sleepq, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW,
     NULL, 0, enable_sleepq_prof, "I", "Enable sleepqueue profiling");
 #endif
 
 #ifdef DDB
 /*
  * DDB "show sleepq <addr>": print a sleep queue and the threads blocked
  * on each of its NR_SLEEPQS queues.  addr is tried first as a wait
  * channel and then as the address of a struct sleepqueue.  Nothing is
  * printed when no address is given.
  */
 DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 #ifdef INVARIANTS
 	struct lock_object *lock;
 #endif
 	struct thread *td;
 	void *wchan;
 	int i;
 
 	if (!have_addr)
 		return;
 
 	/*
 	 * First, see if there is an active sleep queue for the wait channel
 	 * indicated by the address.
 	 */
 	wchan = (void *)addr;
 	sc = SC_LOOKUP(wchan);
 	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
 		if (sq->sq_wchan == wchan)
 			goto found;
 
 	/*
 	 * Second, see if there is an active sleep queue at the address
 	 * indicated.
 	 */
 	for (i = 0; i < SC_TABLESIZE; i++)
 		LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
 			if (sq == (struct sleepqueue *)addr)
 				goto found;
 		}
 
 	db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
 	return;
 found:
 	db_printf("Wait channel: %p\n", sq->sq_wchan);
 	db_printf("Queue type: %d\n", sq->sq_type);
 #ifdef INVARIANTS
 	if (sq->sq_lock) {
 		lock = sq->sq_lock;
 		db_printf("Associated Interlock: %p - (%s) %s\n", lock,
 		    LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	}
 #endif
 	db_printf("Blocked threads:\n");
 	for (i = 0; i < NR_SLEEPQS; i++) {
 		db_printf("\nQueue[%d]:\n", i);
 		if (TAILQ_EMPTY(&sq->sq_blocked[i]))
 			db_printf("\tempty\n");
 		else
 			/* List queue i itself, not always queue 0. */
 			TAILQ_FOREACH(td, &sq->sq_blocked[i],
 				      td_slpq) {
 				db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
 					  td->td_tid, td->td_proc->p_pid,
 					  td->td_name);
 			}
 		db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]);
 	}
 }
 
 /* Alias 'show sleepqueue' to 'show sleepq'. */
 DB_SHOW_ALIAS(sleepqueue, db_show_sleepqueue);
 #endif
Index: head/sys/sys/proc.h
===================================================================
--- head/sys/sys/proc.h	(revision 315279)
+++ head/sys/sys/proc.h	(revision 315280)
@@ -1,1128 +1,1130 @@
 /*-
  * Copyright (c) 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)proc.h	8.15 (Berkeley) 5/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PROC_H_
 #define	_SYS_PROC_H_
 
 #include <sys/callout.h>		/* For struct callout. */
 #include <sys/event.h>			/* For struct klist. */
 #include <sys/condvar.h>
 #ifndef _KERNEL
 #include <sys/filedesc.h>
 #endif
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/lock_profile.h>
 #include <sys/_mutex.h>
 #include <sys/osd.h>
 #include <sys/priority.h>
 #include <sys/rtprio.h>			/* XXX. */
 #include <sys/runq.h>
 #include <sys/resource.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #ifndef _KERNEL
 #include <sys/time.h>			/* For structs itimerval, timeval. */
 #else
 #include <sys/pcpu.h>
 #endif
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <sys/_vm_domain.h>
 #include <machine/proc.h>		/* Machine-dependent proc substruct. */
 
 /*
  * One structure allocated per session.
  *
  * s_login is sized as MAXLOGNAME rounded up to a multiple of sizeof(long).
  * s_mtx is the mutex referred to by the (m) key below.
  *
  * List of locks
  * (m)		locked by s_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct session {
 	u_int		s_count;	/* Ref cnt; pgrps in session - atomic. */
 	struct proc	*s_leader;	/* (m + e) Session leader. */
 	struct vnode	*s_ttyvp;	/* (m) Vnode of controlling tty. */
 	struct cdev_priv *s_ttydp;	/* (m) Device of controlling tty.  */
 	struct tty	*s_ttyp;	/* (e) Controlling tty. */
 	pid_t		s_sid;		/* (c) Session ID. */
 					/* (m) Setlogin() name: */
 	char		s_login[roundup(MAXLOGNAME, sizeof(long))];
 	struct mtx	s_mtx;		/* Mutex to protect members. */
 };
 
 /*
  * One structure allocated per process group.
  *
  * pg_session points at the session the group belongs to and pg_members
  * heads the list of member processes.  pg_mtx is the mutex referred to by
  * the (m) key below.
  *
  * List of locks
  * (m)		locked by pg_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  */
 struct pgrp {
 	LIST_ENTRY(pgrp) pg_hash;	/* (e) Hash chain. */
 	LIST_HEAD(, proc) pg_members;	/* (m + e) Pointer to pgrp members. */
 	struct session	*pg_session;	/* (c) Pointer to session. */
 	struct sigiolst	pg_sigiolst;	/* (m) List of sigio sources. */
 	pid_t		pg_id;		/* (c) Process group id. */
 	int		pg_jobc;	/* (m) Job control process count. */
 	struct mtx	pg_mtx;		/* Mutex to protect members */
 };
 
 /*
  * pargs, used to hold a copy of the command line, if it had a sane length.
  *
  * NOTE(review): ar_args is declared with a single element; presumably the
  * structure is over-allocated so that ar_args holds ar_length bytes --
  * confirm against the allocation code before relying on sizeof(struct pargs).
  */
 struct pargs {
 	u_int	ar_ref;		/* Reference count. */
 	u_int	ar_length;	/* Length. */
 	u_char	ar_args[1];	/* Arguments. */
 };
 
 /*-
  * Description of a process.
  *
  * This structure contains the information needed to manage a thread of
  * control, known in UN*X as a process; it has references to substructures
  * containing descriptions of things that the process uses, but may share
  * with related processes.  The process structure and the substructures
  * are always addressable except for those marked "(CPU)" below,
  * which might be addressable only on a processor on which the process
  * is running.
  *
  * Below is a key of locks used to protect each member of struct proc.  The
  * lock is indicated by a reference to a specific character in parens in the
  * associated comment.
  *      * - not yet protected
  *      a - only touched by curproc or parent during fork/wait
  *      b - created at fork, never changes
  *		(exception aiods switch vmspaces, but they are also
  *		marked 'P_SYSTEM' so hopefully it will be left alone)
  *      c - locked by proc mtx
  *      d - locked by allproc_lock lock
  *      e - locked by proctree_lock lock
  *      f - session mtx
  *      g - process group mtx
  *      h - callout_lock mtx
  *      i - by curproc or the master session mtx
  *      j - locked by proc slock
  *      k - only accessed by curthread
  *	k*- only accessed by curthread and from an interrupt
  *      l - the attaching proc or attaching proc parent
  *      m - Giant
  *      n - not locked, lazy
  *      o - ktrace lock
  *      q - td_contested lock
  *      r - p_peers lock
+ *      s - by curthread, or by others when curthread is on sleepqueue
  *      t - thread lock
  *	u - process stat lock
  *	w - process timer lock
  *      x - created at fork, only changes during single threading in exec
  *      y - created at first aio, doesn't change until exit or exec at which
  *          point we are single-threaded and only curthread changes it
  *      z - zombie threads lock
  *
  * If the locking key specifies two identifiers (for example, p_pptr) then
  * either lock is sufficient for read access, but both locks must be held
  * for write access.
  */
 /*
  * Forward declarations of structure tags, so that the declarations which
  * follow can refer to these types through pointers without needing their
  * full definitions.
  */
 struct cpuset;
 struct filecaps;
 struct filemon;
 struct kaioinfo;
 struct kaudit_record;
 struct kdtrace_proc;
 struct kdtrace_thread;
 struct mqueue_notifier;
 struct nlminfo;
 struct p_sched;
 struct proc;
 struct procdesc;
 struct racct;
 struct sbuf;
 struct sleepqueue;
 struct syscall_args;
 struct td_sched;
 struct thread;
 struct trapframe;
 struct turnstile;
 
 /*
  * XXX: Does this belong in resource.h or resourcevar.h instead?
  * Resource usage extension.  The times in rusage structs in the kernel are
  * never up to date.  The actual times are kept as runtimes and tick counts
  * (with control info in the "previous" times), and are converted when
  * userland asks for rusage info.  Backwards compatibility prevents putting
  * this directly in the user-visible rusage struct.
  *
  * rux_runtime and the rux_*ticks members are the runtime and tick counts
  * mentioned above; rux_uu, rux_su and rux_tu are the "previous" times, kept
  * in microseconds.
  *
  * Locking for p_rux: (cu) means (u) for p_rux and (c) for p_crux.
  * Locking for td_rux: (t) for all fields.
  */
 struct rusage_ext {
 	uint64_t	rux_runtime;    /* (cu) Real time. */
 	uint64_t	rux_uticks;     /* (cu) Statclock hits in user mode. */
 	uint64_t	rux_sticks;     /* (cu) Statclock hits in sys mode. */
 	uint64_t	rux_iticks;     /* (cu) Statclock hits in intr mode. */
 	uint64_t	rux_uu;         /* (c) Previous user time in usec. */
 	uint64_t	rux_su;         /* (c) Previous sys time in usec. */
 	uint64_t	rux_tu;         /* (c) Previous total time in usec. */
 };
 
 /*
  * Kernel runnable context (thread).
  * This is what is put to sleep and reactivated.
  * Thread context.  Processes may have multiple threads.
  *
  * The parenthesised letters in the member comments are lock keys taken
  * from the locking key given earlier in this header.
  *
  * Member order matters: td_startzero/td_endzero and td_startcopy/td_endcopy
  * name the members that bound the regions described as "cleared" and
  * "copied" during fork1()/create_thread(), so a new member has to be placed
  * in the region whose treatment it needs.
  */
 struct thread {
 	struct mtx	*volatile td_lock; /* replaces sched lock */
 	struct proc	*td_proc;	/* (*) Associated process. */
 	TAILQ_ENTRY(thread) td_plist;	/* (*) All threads in this proc. */
 	TAILQ_ENTRY(thread) td_runq;	/* (t) Run queue. */
 	TAILQ_ENTRY(thread) td_slpq;	/* (t) Sleep queue. */
 	TAILQ_ENTRY(thread) td_lockq;	/* (t) Lock queue. */
 	LIST_ENTRY(thread) td_hash;	/* (d) Hash chain. */
 	struct cpuset	*td_cpuset;	/* (t) CPU affinity mask. */
 	struct seltd	*td_sel;	/* Select queue/channel. */
 	struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
 	struct turnstile *td_turnstile;	/* (k) Associated turnstile. */
 	struct rl_q_entry *td_rlqe;	/* (k) Associated range lock entry. */
 	struct umtx_q   *td_umtxq;	/* (c?) Link for when we're blocked. */
 	struct vm_domain_policy td_vm_dom_policy;	/* (c) current numa domain policy */
 	lwpid_t		td_tid;		/* (b) Thread ID. */
 	sigqueue_t	td_sigqueue;	/* (c) Sigs arrived, not delivered. */
 #define	td_siglist	td_sigqueue.sq_signals
 	u_char		td_lend_user_pri; /* (t) Lend user pri. */
 
 /* Cleared during fork1() */
 /*
  * NOTE(review): the cleared region presumably runs from td_startzero up to,
  * but not including, td_endzero (td_sigmask, which starts the copied
  * region) -- confirm against the bzero() in the thread creation code.
  */
 #define	td_startzero td_flags
 	int		td_flags;	/* (t) TDF_* flags. */
 	int		td_inhibitors;	/* (t) Why can not run. */
 	int		td_pflags;	/* (k) Private thread (TDP_*) flags. */
 	int		td_dupfd;	/* (k) Ret value from fdopen. XXX */
 	int		td_sqqueue;	/* (t) Sleepqueue queue blocked on. */
 	void		*td_wchan;	/* (t) Sleep address. */
 	const char	*td_wmesg;	/* (t) Reason for sleep. */
 	volatile u_char td_owepreempt;  /* (k*) Preempt on last critical_exit */
 	u_char		td_tsqueue;	/* (t) Turnstile queue blocked on. */
 	short		td_locks;	/* (k) Debug: count of non-spin locks */
 	short		td_rw_rlocks;	/* (k) Count of rwlock read locks. */
 	short		td_lk_slocks;	/* (k) Count of lockmgr shared locks. */
 	short		td_stopsched;	/* (k) Scheduler stopped. */
 	struct turnstile *td_blocked;	/* (t) Lock thread is blocked on. */
 	const char	*td_lockname;	/* (t) Name of lock blocked on. */
 	LIST_HEAD(, turnstile) td_contested;	/* (q) Contested locks. */
 	struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
 	int		td_intr_nesting_level; /* (k) Interrupt recursion. */
 	int		td_pinned;	/* (k) Temporary cpu pin count. */
 	struct ucred	*td_ucred;	/* (k) Reference to credentials. */
 	struct plimit	*td_limit;	/* (k) Resource limits. */
 	int		td_slptick;	/* (t) Time at sleep. */
 	int		td_blktick;	/* (t) Time spent blocked. */
 	int		td_swvoltick;	/* (t) Time at last SW_VOL switch. */
 	int		td_swinvoltick;	/* (t) Time at last SW_INVOL switch. */
 	u_int		td_cow;		/* (*) Number of copy-on-write faults */
 	struct rusage	td_ru;		/* (t) rusage information. */
 	struct rusage_ext td_rux;	/* (t) Internal rusage information. */
 	uint64_t	td_incruntime;	/* (t) Cpu ticks to transfer to proc. */
 	uint64_t	td_runtime;	/* (t) How many cpu ticks we've run. */
 	u_int 		td_pticks;	/* (t) Statclock hits for profiling */
 	u_int		td_sticks;	/* (t) Statclock hits in system mode. */
 	u_int		td_iticks;	/* (t) Statclock hits in intr mode. */
 	u_int		td_uticks;	/* (t) Statclock hits in user mode. */
 	int		td_intrval;	/* (t) Return value for sleepq. */
 	sigset_t	td_oldsigmask;	/* (k) Saved mask from pre sigpause. */
 	volatile u_int	td_generation;	/* (k) For detection of preemption */
 	stack_t		td_sigstk;	/* (k) Stack ptr and on-stack flag. */
 	int		td_xsig;	/* (c) Signal for ptrace */
 	u_long		td_profil_addr;	/* (k) Temporary addr until AST. */
 	u_int		td_profil_ticks; /* (k) Temporary ticks until AST. */
 	char		td_name[MAXCOMLEN + 1];	/* (*) Thread name. */
 	struct file	*td_fpop;	/* (k) file referencing cdev under op */
 	int		td_dbgflags;	/* (c) Userland debugger flags */
 	struct ksiginfo td_dbgksi;	/* (c) ksi reflected to debugger. */
 	int		td_ng_outbound;	/* (k) Thread entered ng from above. */
 	struct osd	td_osd;		/* (k) Object specific data. */
 	struct vm_map_entry *td_map_def_user; /* (k) Deferred entries. */
 	pid_t		td_dbg_forked;	/* (c) Child pid for debugger. */
 	u_int		td_vp_reserv;	/* (k) Count of reserved vnodes. */
 	int		td_no_sleeping;	/* (k) Sleeping disabled count. */
 	int		td_dom_rr_idx;	/* (k) RR Numa domain selection. */
 	void		*td_su;		/* (k) FFS SU private */
 	sbintime_t	td_sleeptimo;	/* (t) Sleep timeout. */
+	int		td_rtcgen;	/* (s) rtc_generation of abs. sleep */
 #define	td_endzero td_sigmask
 
 /* Copied during fork1() or create_thread(). */
 #define	td_startcopy td_endzero
 	sigset_t	td_sigmask;	/* (c) Current signal mask. */
 	u_char		td_rqindex;	/* (t) Run queue index. */
 	u_char		td_base_pri;	/* (t) Thread base kernel priority. */
 	u_char		td_priority;	/* (t) Thread active priority. */
 	u_char		td_pri_class;	/* (t) Scheduling class. */
 	u_char		td_user_pri;	/* (t) User pri from estcpu and nice. */
 	u_char		td_base_user_pri; /* (t) Base user pri */
 	u_int		td_dbg_sc_code;	/* (c) Syscall code to debugger. */
 	u_int		td_dbg_sc_narg;	/* (c) Syscall arg count to debugger.*/
 	uintptr_t	td_rb_list;	/* (k) Robust list head. */
 	uintptr_t	td_rbp_list;	/* (k) Robust priv list head. */
 	uintptr_t	td_rb_inact;	/* (k) Current in-action mutex loc. */
 #define	td_endcopy td_pcb
 
 /*
  * Fields that must be manually set in fork1() or create_thread()
  * or already have been set in the allocator, constructor, etc.
  */
 	struct pcb	*td_pcb;	/* (k) Kernel VA of pcb and kstack. */
 	enum {
 		TDS_INACTIVE = 0x0,
 		TDS_INHIBITED,
 		TDS_CAN_RUN,
 		TDS_RUNQ,
 		TDS_RUNNING
 	} td_state;			/* (t) thread state */
 	union {
 		register_t	tdu_retval[2];
 		off_t		tdu_off;
 	} td_uretoff;			/* (k) Syscall aux returns. */
 #define td_retval	td_uretoff.tdu_retval
 	u_int		td_cowgen;	/* (k) Generation of COW pointers. */
 	struct callout	td_slpcallout;	/* (h) Callout for sleep. */
 	struct trapframe *td_frame;	/* (k) */
 	struct vm_object *td_kstack_obj;/* (a) Kstack object. */
 	vm_offset_t	td_kstack;	/* (a) Kernel VA of kstack. */
 	int		td_kstack_pages; /* (a) Size of the kstack. */
 	volatile u_int	td_critnest;	/* (k*) Critical section nest level. */
 	struct mdthread td_md;		/* (k) Any machine-dependent fields. */
 	struct kaudit_record	*td_ar;	/* (k) Active audit record, if any. */
 	struct lpohead	td_lprof[2];	/* (a) lock profiling objects. */
 	struct kdtrace_thread	*td_dtrace; /* (*) DTrace-specific data. */
 	int		td_errno;	/* Error returned by last syscall. */
 	struct vnet	*td_vnet;	/* (k) Effective vnet. */
 	const char	*td_vnet_lpush;	/* (k) Debugging vnet push / pop. */
 	struct trapframe *td_intr_frame;/* (k) Frame of the current irq */
 	struct proc	*td_rfppwait_p;	/* (k) The vforked child */
 	struct vm_page	**td_ma;	/* (k) uio pages held */
 	int		td_ma_cnt;	/* (k) size of *td_ma */
 	void		*td_emuldata;	/* Emulator state data */
 	int		td_lastcpu;	/* (t) Last cpu we were on. */
 	int		td_oncpu;	/* (t) Which cpu we are on. */
 	void		*td_lkpi_task;	/* LinuxKPI task struct pointer */
 };
 
 /*
  * A struct thread immediately followed by ten uint64_t words.
  * NOTE(review): the names suggest this is the backing storage for thread0
  * together with its scheduler-private data -- confirm against the
  * scheduler code before changing the size of t0st_sched.
  */
 struct thread0_storage {
 	struct thread t0st_thread;
 	uint64_t t0st_sched[10];
 };
 
 /* Thread lock manipulation routines; declared here, defined elsewhere. */
 struct mtx *thread_lock_block(struct thread *);
 void thread_lock_unblock(struct thread *, struct mtx *);
 void thread_lock_set(struct thread *, struct mtx *);
 /*
  * Assert, through mtx_assert(), that the mutex td_lock currently points at
  * satisfies "type".  td_lock is read once into a local; the check is
  * skipped when it points at blocked_lock.
  */
 #define	THREAD_LOCK_ASSERT(td, type)					\
 do {									\
 	struct mtx *__m = (td)->td_lock;				\
 	if (__m != &blocked_lock)					\
 		mtx_assert(__m, (type));				\
 } while (0)
 
 /*
  * Debugging helpers.  With INVARIANTS, THREAD_LOCKPTR_ASSERT checks that
  * td_lock is either blocked_lock or the given lock, and TD_LOCKS_INC /
  * TD_LOCKS_DEC adjust td_locks.  Without INVARIANTS all three expand to
  * nothing.
  */
 #ifdef INVARIANTS
 #define	THREAD_LOCKPTR_ASSERT(td, lock)					\
 do {									\
 	struct mtx *__m = (td)->td_lock;				\
 	KASSERT((__m == &blocked_lock || __m == (lock)),		\
 	    ("Thread %p lock %p does not match %p", td, __m, (lock)));	\
 } while (0)
 
 #define	TD_LOCKS_INC(td)	((td)->td_locks++)
 #define	TD_LOCKS_DEC(td)	((td)->td_locks--)
 #else
 #define	THREAD_LOCKPTR_ASSERT(td, lock)
 
 #define	TD_LOCKS_INC(td)
 #define	TD_LOCKS_DEC(td)
 #endif
 
 /*
  * Flags kept in td_flags:
  * To change these you MUST have the scheduler lock.
  */
 #define	TDF_BORROWING	0x00000001 /* Thread is borrowing pri from another. */
 #define	TDF_INPANIC	0x00000002 /* Caused a panic, let it drive crashdump. */
 #define	TDF_INMEM	0x00000004 /* Thread's stack is in memory. */
 #define	TDF_SINTR	0x00000008 /* Sleep is interruptible. */
 #define	TDF_TIMEOUT	0x00000010 /* Timing out during sleep. */
 #define	TDF_IDLETD	0x00000020 /* This is a per-CPU idle thread. */
 #define	TDF_CANSWAP	0x00000040 /* Thread can be swapped. */
 #define	TDF_SLEEPABORT	0x00000080 /* sleepq_abort was called. */
 #define	TDF_KTH_SUSP	0x00000100 /* kthread is suspended */
 #define	TDF_ALLPROCSUSP	0x00000200 /* suspended by SINGLE_ALLPROC */
 #define	TDF_BOUNDARY	0x00000400 /* Thread suspended at user boundary */
 #define	TDF_ASTPENDING	0x00000800 /* Thread has some asynchronous events. */
 #define	TDF_UNUSED12	0x00001000 /* --available-- */
 #define	TDF_SBDRY	0x00002000 /* Stop only on usermode boundary. */
 #define	TDF_UPIBLOCKED	0x00004000 /* Thread blocked on user PI mutex. */
 #define	TDF_NEEDSUSPCHK	0x00008000 /* Thread may need to suspend. */
 #define	TDF_NEEDRESCHED	0x00010000 /* Thread needs to yield. */
 #define	TDF_NEEDSIGCHK	0x00020000 /* Thread may need signal delivery. */
 #define	TDF_NOLOAD	0x00040000 /* Ignore during load avg calculations. */
 #define	TDF_SERESTART	0x00080000 /* ERESTART on stop attempts. */
 #define	TDF_THRWAKEUP	0x00100000 /* Libthr thread must not suspend itself. */
 #define	TDF_SEINTR	0x00200000 /* EINTR on stop attempts. */
 #define	TDF_SWAPINREQ	0x00400000 /* Swapin request due to wakeup. */
 #define	TDF_UNUSED23	0x00800000 /* --available-- */
 #define	TDF_SCHED0	0x01000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED1	0x02000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED2	0x04000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED3	0x08000000 /* Reserved for scheduler private use */
 #define	TDF_ALRMPEND	0x10000000 /* Pending SIGVTALRM needs to be posted. */
 #define	TDF_PROFPEND	0x20000000 /* Pending SIGPROF needs to be posted. */
 #define	TDF_MACPEND	0x40000000 /* AST-based MAC event pending. */
 
 /* Userland debug flags */
 #define	TDB_SUSPEND	0x00000001 /* Thread is suspended by debugger */
 #define	TDB_XSIG	0x00000002 /* Thread is exchanging signal under trace */
 #define	TDB_USERWR	0x00000004 /* Debugger modified memory or registers */
 #define	TDB_SCE		0x00000008 /* Thread performs syscall enter */
 #define	TDB_SCX		0x00000010 /* Thread performs syscall exit */
 #define	TDB_EXEC	0x00000020 /* TDB_SCX from exec(2) family */
 #define	TDB_FORK	0x00000040 /* TDB_SCX from fork(2) that created new
 				      process */
 #define	TDB_STOPATFORK	0x00000080 /* Stop at the return from fork (child
 				      only) */
 #define	TDB_CHILD	0x00000100 /* New child indicator for ptrace() */
 #define	TDB_BORN	0x00000200 /* New LWP indicator for ptrace() */
 #define	TDB_EXIT	0x00000400 /* Exiting LWP indicator for ptrace() */
 #define	TDB_VFORK	0x00000800 /* vfork indicator for ptrace() */
 #define	TDB_FSTP	0x00001000 /* The thread is PT_ATTACH leader */
 
 /*
  * "Private" flags kept in td_pflags:
  * These are only written by curthread and thus need no locking.
  */
 #define	TDP_OLDMASK	0x00000001 /* Need to restore mask after suspend. */
 #define	TDP_INKTR	0x00000002 /* Thread is currently in KTR code. */
 #define	TDP_INKTRACE	0x00000004 /* Thread is currently in KTRACE code. */
 #define	TDP_BUFNEED	0x00000008 /* Do not recurse into the buf flush */
 #define	TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x00000020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x00000040 /* Lock acquisition - deadlock treatment. */
 #define	TDP_NOFAULTING	0x00000080 /* Do not handle page faults. */
 #define	TDP_UNUSED9	0x00000100 /* --available-- */
 #define	TDP_OWEUPC	0x00000200 /* Call addupc() at next AST. */
 #define	TDP_ITHREAD	0x00000400 /* Thread is an interrupt thread. */
 #define	TDP_SYNCIO	0x00000800 /* Local override, disable async i/o. */
 #define	TDP_SCHED1	0x00001000 /* Reserved for scheduler private use */
 #define	TDP_SCHED2	0x00002000 /* Reserved for scheduler private use */
 #define	TDP_SCHED3	0x00004000 /* Reserved for scheduler private use */
 #define	TDP_SCHED4	0x00008000 /* Reserved for scheduler private use */
 #define	TDP_GEOM	0x00010000 /* Settle GEOM before finishing syscall */
 #define	TDP_SOFTDEP	0x00020000 /* Stuck processing softdep worklist */
 #define	TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
 #define	TDP_WAKEUP	0x00080000 /* Don't sleep in umtx cond_wait */
 #define	TDP_INBDFLUSH	0x00100000 /* Already in BO_BDFLUSH, do not recurse */
 #define	TDP_KTHREAD	0x00200000 /* This is an official kernel thread */
 #define	TDP_CALLCHAIN	0x00400000 /* Capture thread's callchain */
 #define	TDP_IGNSUSP	0x00800000 /* Permission to ignore the MNTK_SUSPEND* */
 #define	TDP_AUDITREC	0x01000000 /* Audit record pending on thread */
 #define	TDP_RFPPWAIT	0x02000000 /* Handle RFPPWAIT on syscall exit */
 #define	TDP_RESETSPUR	0x04000000 /* Reset spurious page fault history. */
 #define	TDP_NERRNO	0x08000000 /* Last errno is already in td_errno */
 #define	TDP_UIOHELD	0x10000000 /* Current uio has pages held in td_ma */
 #define	TDP_FORKING	0x20000000 /* Thread is being created through fork() */
 #define	TDP_EXECVMSPC	0x40000000 /* Execve destroyed old vmspace */
 
 /*
  * Reasons that the current thread can not be run yet.
  * More than one may apply.
  */
 #define	TDI_SUSPENDED	0x0001	/* On suspension queue. */
 #define	TDI_SLEEPING	0x0002	/* Actually asleep! (tricky). */
 #define	TDI_SWAPPED	0x0004	/* Stack not in mem.  Bad juju if run. */
 #define	TDI_LOCK	0x0008	/* Stopped on a lock. */
 #define	TDI_IWAIT	0x0010	/* Awaiting interrupt. */
 
 /*
  * Predicates on a thread's inhibitor bits (td_inhibitors), sleep channel
  * (td_wchan), run state (td_state) and flags (td_flags).  The bit tests
  * yield the masked value, not a normalised 0/1.
  */
 #define	TD_IS_SLEEPING(td)	((td)->td_inhibitors & TDI_SLEEPING)
 #define	TD_ON_SLEEPQ(td)	((td)->td_wchan != NULL)
 #define	TD_IS_SUSPENDED(td)	((td)->td_inhibitors & TDI_SUSPENDED)
 #define	TD_IS_SWAPPED(td)	((td)->td_inhibitors & TDI_SWAPPED)
 #define	TD_ON_LOCK(td)		((td)->td_inhibitors & TDI_LOCK)
 #define	TD_AWAITING_INTR(td)	((td)->td_inhibitors & TDI_IWAIT)
 #define	TD_IS_RUNNING(td)	((td)->td_state == TDS_RUNNING)
 #define	TD_ON_RUNQ(td)		((td)->td_state == TDS_RUNQ)
 #define	TD_CAN_RUN(td)		((td)->td_state == TDS_CAN_RUN)
 #define	TD_IS_INHIBITED(td)	((td)->td_state == TDS_INHIBITED)
 #define	TD_ON_UPILOCK(td)	((td)->td_flags & TDF_UPIBLOCKED)
 #define TD_IS_IDLETHREAD(td)	((td)->td_flags & TDF_IDLETD)
 
 
 /* Mark the thread inhibited and record the reason bit(s) in td_inhibitors. */
 #define	TD_SET_INHIB(td, inhib) do {			\
 	(td)->td_state = TDS_INHIBITED;			\
 	(td)->td_inhibitors |= (inhib);			\
 } while (0)
 
 /*
  * If any of the given reason bits is set, clear them; when that leaves no
  * inhibitors at all the thread's state becomes TDS_CAN_RUN.  If none of
  * the bits was set, nothing is changed.
  */
 #define	TD_CLR_INHIB(td, inhib) do {			\
 	if (((td)->td_inhibitors & (inhib)) &&		\
 	    (((td)->td_inhibitors &= ~(inhib)) == 0))	\
 		(td)->td_state = TDS_CAN_RUN;		\
 } while (0)
 
 /*
  * Per-reason wrappers around TD_SET_INHIB.
  * NOTE(review): TDI_EXITING is not among the TDI_* inhibitor bits defined
  * in this header, so TD_SET_EXITING cannot expand to valid code unless the
  * bit is defined elsewhere -- confirm, and drop the macro if it is unused.
  */
 #define	TD_SET_SLEEPING(td)	TD_SET_INHIB((td), TDI_SLEEPING)
 #define	TD_SET_SWAPPED(td)	TD_SET_INHIB((td), TDI_SWAPPED)
 #define	TD_SET_LOCK(td)		TD_SET_INHIB((td), TDI_LOCK)
 #define	TD_SET_SUSPENDED(td)	TD_SET_INHIB((td), TDI_SUSPENDED)
 #define	TD_SET_IWAIT(td)	TD_SET_INHIB((td), TDI_IWAIT)
 #define	TD_SET_EXITING(td)	TD_SET_INHIB((td), TDI_EXITING)
 
 /* Per-reason wrappers around TD_CLR_INHIB. */
 #define	TD_CLR_SLEEPING(td)	TD_CLR_INHIB((td), TDI_SLEEPING)
 #define	TD_CLR_SWAPPED(td)	TD_CLR_INHIB((td), TDI_SWAPPED)
 #define	TD_CLR_LOCK(td)		TD_CLR_INHIB((td), TDI_LOCK)
 #define	TD_CLR_SUSPENDED(td)	TD_CLR_INHIB((td), TDI_SUSPENDED)
 #define	TD_CLR_IWAIT(td)	TD_CLR_INHIB((td), TDI_IWAIT)
 
 /* Direct td_state assignments; these do not touch td_inhibitors. */
 #define	TD_SET_RUNNING(td)	(td)->td_state = TDS_RUNNING
 #define	TD_SET_RUNQ(td)		(td)->td_state = TDS_RUNQ
 #define	TD_SET_CAN_RUN(td)	(td)->td_state = TDS_CAN_RUN
 
 /*
  * TD_SBDRY_INTR is true when either TDF_SEINTR or TDF_SERESTART is set;
  * TD_SBDRY_ERRNO yields EINTR when TDF_SEINTR is set and ERESTART otherwise.
  */
 #define	TD_SBDRY_INTR(td) \
     (((td)->td_flags & (TDF_SEINTR | TDF_SERESTART)) != 0)
 #define	TD_SBDRY_ERRNO(td) \
     (((td)->td_flags & TDF_SEINTR) != 0 ? EINTR : ERESTART)
 
 /*
  * Process structure.
  *
  * The parenthesised letters in the member comments are lock keys taken
  * from the locking key given earlier in this header.
  *
  * Member order matters: p_startzero/p_endzero and p_startcopy/p_endcopy
  * name the members that bound the regions described below as zeroed and
  * copied on creation in fork, so a new member has to be placed in the
  * region whose treatment it needs.
  */
 struct proc {
 	LIST_ENTRY(proc) p_list;	/* (d) List of all processes. */
 	TAILQ_HEAD(, thread) p_threads;	/* (c) all threads. */
 	struct mtx	p_slock;	/* process spin lock */
 	struct ucred	*p_ucred;	/* (c) Process owner's identity. */
 	struct filedesc	*p_fd;		/* (b) Open files. */
 	struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */
 	struct pstats	*p_stats;	/* (b) Accounting/statistics (CPU). */
 	struct plimit	*p_limit;	/* (c) Resource limits. */
 	struct callout	p_limco;	/* (c) Limit callout handle */
 	struct sigacts	*p_sigacts;	/* (x) Signal actions, state (CPU). */
 
 	int		p_flag;		/* (c) P_* flags. */
 	int		p_flag2;	/* (c) P2_* flags. */
 	enum {
 		PRS_NEW = 0,		/* In creation */
 		PRS_NORMAL,		/* threads can be run. */
 		PRS_ZOMBIE
 	} p_state;			/* (j/c) Process status. */
 	pid_t		p_pid;		/* (b) Process identifier. */
 	LIST_ENTRY(proc) p_hash;	/* (d) Hash chain. */
 	LIST_ENTRY(proc) p_pglist;	/* (g + e) List of processes in pgrp. */
 	struct proc	*p_pptr;	/* (c + e) Pointer to parent process. */
 	LIST_ENTRY(proc) p_sibling;	/* (e) List of sibling processes. */
 	LIST_HEAD(, proc) p_children;	/* (e) Pointer to list of children. */
 	struct proc	*p_reaper;	/* (e) My reaper. */
 	LIST_HEAD(, proc) p_reaplist;	/* (e) List of my descendants
 					       (if I am reaper). */
 	LIST_ENTRY(proc) p_reapsibling;	/* (e) List of siblings - descendants of
 					       the same reaper. */
 	struct mtx	p_mtx;		/* (n) Lock for this struct. */
 	struct mtx	p_statmtx;	/* Lock for the stats */
 	struct mtx	p_itimmtx;	/* Lock for the virt/prof timers */
 	struct mtx	p_profmtx;	/* Lock for the profiling */
 	struct ksiginfo *p_ksi;	/* Locked by parent proc lock */
 	sigqueue_t	p_sigqueue;	/* (c) Sigs not delivered to a td. */
 #define p_siglist	p_sigqueue.sq_signals
 
 /* The following fields are all zeroed upon creation in fork. */
 #define	p_startzero	p_oppid
 	pid_t		p_oppid;	/* (c + e) Save ppid in ptrace. XXX */
 	struct vmspace	*p_vmspace;	/* (b) Address space. */
 	u_int		p_swtick;	/* (c) Tick when swapped in or out. */
 	u_int		p_cowgen;	/* (c) Generation of COW pointers. */
 	struct itimerval p_realtimer;	/* (c) Alarm timer. */
 	struct rusage	p_ru;		/* (a) Exit information. */
 	struct rusage_ext p_rux;	/* (cu) Internal resource usage. */
 	struct rusage_ext p_crux;	/* (c) Internal child resource usage. */
 	int		p_profthreads;	/* (c) Num threads in addupc_task. */
 	volatile int	p_exitthreads;	/* (j) Number of threads exiting */
 	int		p_traceflag;	/* (o) Kernel trace points. */
 	struct vnode	*p_tracevp;	/* (c + o) Trace to vnode. */
 	struct ucred	*p_tracecred;	/* (o) Credentials to trace with. */
 	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
 	u_int		p_lock;		/* (c) Proclock (prevent swap) count. */
 	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
 	int		p_sigparent;	/* (c) Signal to parent on exit. */
 	int		p_sig;		/* (n) For core dump/debugger XXX. */
 	u_long		p_code;		/* (n) For core dump/debugger XXX. */
 	u_int		p_stops;	/* (c) Stop event bitmask. */
 	u_int		p_stype;	/* (c) Stop event type. */
 	char		p_step;		/* (c) Process is stopped. */
 	u_char		p_pfsflags;	/* (c) Procfs flags. */
 	u_int		p_ptevents;	/* (c) ptrace() event mask. */
 	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
 	struct kaioinfo	*p_aioinfo;	/* (y) ASYNC I/O info. */
 	struct thread	*p_singlethread;/* (c + j) If single threading this is it */
 	int		p_suspcount;	/* (j) Num threads in suspended mode. */
 	struct thread	*p_xthread;	/* (c) Trap thread */
 	int		p_boundary_count;/* (j) Num threads at user boundary */
 	int		p_pendingcnt;	/* how many signals are pending */
 	struct itimers	*p_itimers;	/* (c) POSIX interval timers. */
 	struct procdesc	*p_procdesc;	/* (e) Process descriptor, if any. */
 	u_int		p_treeflag;	/* (e) P_TREE flags */
 	int		p_pendingexits; /* (c) Count of pending thread exits. */
 	struct filemon	*p_filemon;	/* (c) filemon-specific data. */
 /* End area that is zeroed on creation. */
 #define	p_endzero	p_magic
 
 /* The following fields are all copied upon creation in fork. */
 #define	p_startcopy	p_endzero
 	u_int		p_magic;	/* (b) Magic number. */
 	int		p_osrel;	/* (x) osreldate for the
 					       binary (from ELF note, if any) */
 	char		p_comm[MAXCOMLEN + 1];	/* (x) Process name. */
 	struct sysentvec *p_sysent;	/* (b) Syscall dispatch info. */
 	struct pargs	*p_args;	/* (c) Process arguments. */
 	rlim_t		p_cpulimit;	/* (c) Current CPU limit in seconds. */
 	signed char	p_nice;		/* (c) Process "nice" value. */
 	int		p_fibnum;	/* in this routing domain XXX MRT */
 	pid_t		p_reapsubtree;	/* (e) Pid of the direct child of the
 					       reaper which spawned
 					       our subtree. */
 	u_int		p_xexit;	/* (c) Exit code. */
 	u_int		p_xsig;		/* (c) Stop/kill sig. */
 	uint16_t	p_elf_machine;	/* (x) ELF machine type */
 	uint64_t	p_elf_flags;	/* (x) ELF flags */
 
 /* End area that is copied on creation. */
 #define	p_endcopy	p_elf_flags
 	struct pgrp	*p_pgrp;	/* (c + e) Pointer to process group. */
 	struct knlist	*p_klist;	/* (c) Knotes attached to this proc. */
 	int		p_numthreads;	/* (c) Number of threads. */
 	struct mdproc	p_md;		/* Any machine-dependent fields. */
 	struct callout	p_itcallout;	/* (h + c) Interval timer callout. */
 	u_short		p_acflag;	/* (c) Accounting flags. */
 	struct proc	*p_peers;	/* (r) */
 	struct proc	*p_leader;	/* (b) */
 	void		*p_emuldata;	/* (c) Emulator state data. */
 	struct label	*p_label;	/* (*) Proc (not subject) MAC label. */
 	STAILQ_HEAD(, ktr_request)	p_ktr;	/* (o) KTR event queue. */
 	LIST_HEAD(, mqueue_notifier)	p_mqnotifier; /* (c) mqueue notifiers.*/
 	struct kdtrace_proc	*p_dtrace; /* (*) DTrace-specific data. */
 	struct cv	p_pwait;	/* (*) wait cv for exit/exec. */
 	struct cv	p_dbgwait;	/* (*) wait cv for debugger attach
 					   after fork. */
 	uint64_t	p_prev_runtime;	/* (c) Resource usage accounting. */
 	struct racct	*p_racct;	/* (b) Resource accounting. */
 	int		p_throttled;	/* (c) Flag for racct pcpu throttling */
 	struct vm_domain_policy p_vm_dom_policy;	/* (c) process default VM domain, or -1 */
 	/*
 	 * An orphan is the child that has been re-parented to the
 	 * debugger as a result of attaching to it.  Need to keep
 	 * track of them for parent to be able to collect the exit
 	 * status of what used to be children.
 	 */
 	LIST_ENTRY(proc) p_orphan;	/* (e) List of orphan processes. */
 	LIST_HEAD(, proc) p_orphans;	/* (e) Pointer to list of orphans. */
 };
 
 /*
  * Shorthand member names: both go through p_pgrp, so they are only usable
  * on a struct proc expression whose p_pgrp pointer may be dereferenced.
  */
 #define	p_session	p_pgrp->pg_session
 #define	p_pgid		p_pgrp->pg_id
 
 #define	NOCPU		(-1)	/* For when we aren't on a CPU. */
 #define	NOCPU_OLD	(255)
 #define	MAXCPU_OLD	(254)
 
 /*
  * Lock, unlock and assert wrappers for the four per-process spin mutexes:
  * p_slock, p_statmtx, p_itimmtx and p_profmtx.  Each lock/unlock pair maps
  * onto mtx_lock_spin()/mtx_unlock_spin() and each assert onto mtx_assert().
  */
 #define	PROC_SLOCK(p)	mtx_lock_spin(&(p)->p_slock)
 #define	PROC_SUNLOCK(p)	mtx_unlock_spin(&(p)->p_slock)
 #define	PROC_SLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_slock, (type))
 
 #define	PROC_STATLOCK(p)	mtx_lock_spin(&(p)->p_statmtx)
 #define	PROC_STATUNLOCK(p)	mtx_unlock_spin(&(p)->p_statmtx)
 #define	PROC_STATLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_statmtx, (type))
 
 #define	PROC_ITIMLOCK(p)	mtx_lock_spin(&(p)->p_itimmtx)
 #define	PROC_ITIMUNLOCK(p)	mtx_unlock_spin(&(p)->p_itimmtx)
 #define	PROC_ITIMLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_itimmtx, (type))
 
 #define	PROC_PROFLOCK(p)	mtx_lock_spin(&(p)->p_profmtx)
 #define	PROC_PROFUNLOCK(p)	mtx_unlock_spin(&(p)->p_profmtx)
 #define	PROC_PROFLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_profmtx, (type))
 
 /* These flags are kept in p_flag. */
 #define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
 #define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
 #define	P_KPROC		0x00004	/* Kernel process. */
 #define	P_UNUSED3	0x00008	/* --available-- */
 #define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
 #define	P_PROFIL	0x00020	/* Has started profiling. */
 #define	P_STOPPROF	0x00040	/* Has thread requesting to stop profiling. */
 #define	P_HADTHREADS	0x00080	/* Has had threads (no cleanup shortcuts) */
 #define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
 #define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
 #define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
 #define	P_TRACED	0x00800	/* Debugged process being traced. */
 #define	P_WAITED	0x01000	/* Someone is waiting for us. */
 #define	P_WEXIT		0x02000	/* Working on exiting. */
 #define	P_EXEC		0x04000	/* Process called exec. */
 #define	P_WKILLED	0x08000	/* Killed, go to kernel/user boundary ASAP. */
 #define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
 #define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
 #define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
 #define	P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */
 #define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
 #define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
 #define	P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */
 #define	P_HWPMC		0x800000 /* Process is using HWPMCs */
 #define	P_JAILED	0x1000000 /* Process is in jail. */
 #define	P_TOTAL_STOP	0x2000000 /* Stopped in stop_all_proc. */
 #define	P_INEXEC	0x4000000 /* Process is in execve(). */
 #define	P_STATCHILD	0x8000000 /* Child process stopped or exited. */
 #define	P_INMEM		0x10000000 /* Loaded into memory. */
 #define	P_SWAPPINGOUT	0x20000000 /* Process is being swapped out. */
 #define	P_SWAPPINGIN	0x40000000 /* Process is being swapped in. */
 #define	P_PPTRACE	0x80000000 /* PT_TRACEME by vforked child. */
 
 #define	P_STOPPED	(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
 #define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)
 #define	P_KILLED(p)	((p)->p_flag & P_WKILLED)
 
 /* These flags are kept in p_flag2. */
 #define	P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */
 #define	P2_NOTRACE	0x00000002	/* No ptrace(2) attach or coredumps. */
 #define	P2_NOTRACE_EXEC 0x00000004	/* Keep P2_NOTRACE on exec(2). */
 #define	P2_AST_SU	0x00000008	/* Handles SU ast for kthreads. */
 #define	P2_PTRACE_FSTP	0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */
 #define	P2_TRAPCAP	0x00000020	/* SIGTRAP on ENOTCAPABLE */
 
 /* Flags protected by proctree_lock, kept in p_treeflags. */
 #define	P_TREE_ORPHANED		0x00000001	/* Reparented, on orphan list */
 #define	P_TREE_FIRST_ORPHAN	0x00000002	/* First element of orphan
 						   list */
 #define	P_TREE_REAPER		0x00000004	/* Reaper of subtree */
 
 /*
  * These were process status values (p_stat), now they are only used in
  * legacy conversion code.
  */
 #define	SIDL	1		/* Process being created by fork. */
 #define	SRUN	2		/* Currently runnable. */
 #define	SSLEEP	3		/* Sleeping on an address. */
 #define	SSTOP	4		/* Process debugging or suspension. */
 #define	SZOMB	5		/* Awaiting collection by parent. */
 #define	SWAIT	6		/* Waiting for interrupt. */
 #define	SLOCK	7		/* Blocked on a lock. */
 
 #define	P_MAGIC		0xbeefface
 
 #ifdef _KERNEL
 
 /* Types and flags for mi_switch(). */
 #define	SW_TYPE_MASK		0xff	/* First 8 bits are switch type */
 #define	SWT_NONE		0	/* Unspecified switch. */
 #define	SWT_PREEMPT		1	/* Switching due to preemption. */
 #define	SWT_OWEPREEMPT		2	/* Switching due to owepreempt. */
 #define	SWT_TURNSTILE		3	/* Turnstile contention. */
 #define	SWT_SLEEPQ		4	/* Sleepq wait. */
 #define	SWT_SLEEPQTIMO		5	/* Sleepq timeout wait. */
 #define	SWT_RELINQUISH		6	/* yield call. */
 #define	SWT_NEEDRESCHED		7	/* NEEDRESCHED was set. */
 #define	SWT_IDLE		8	/* Switching from the idle thread. */
 #define	SWT_IWAIT		9	/* Waiting for interrupts. */
 #define	SWT_SUSPEND		10	/* Thread suspended. */
 #define	SWT_REMOTEPREEMPT	11	/* Remote processor preempted. */
 #define	SWT_REMOTEWAKEIDLE	12	/* Remote processor preempted idle. */
 #define	SWT_COUNT		13	/* Number of switch types. */
 /* Flags */
 #define	SW_VOL		0x0100		/* Voluntary switch. */
 #define	SW_INVOL	0x0200		/* Involuntary switch. */
 #define SW_PREEMPT	0x0400		/* The invol switch is a preemption */
 
 /* How values for thread_single(). */
 #define	SINGLE_NO_EXIT	0
 #define	SINGLE_EXIT	1
 #define	SINGLE_BOUNDARY	2
 #define	SINGLE_ALLPROC	3
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_PARGS);
 MALLOC_DECLARE(M_PGRP);
 MALLOC_DECLARE(M_SESSION);
 MALLOC_DECLARE(M_SUBPROC);
 #endif
 
 #define	FOREACH_PROC_IN_SYSTEM(p)					\
 	LIST_FOREACH((p), &allproc, p_list)
 #define	FOREACH_THREAD_IN_PROC(p, td)					\
 	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
 
 #define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&(p)->p_threads)
 
 /*
  * We use process IDs <= pid_max <= PID_MAX; PID_MAX + 1 must also fit
  * in a pid_t, as it is used to represent "no process group".
  */
 #define	PID_MAX		99999
 #define	NO_PID		100000
 extern pid_t pid_max;
 
 #define	SESS_LEADER(p)	((p)->p_session->s_leader == (p))
 
 
 /*
  * If any of the event bits in (e) are set in (p)->p_stops, call
  * stopevent((p), (e), (v)) with the process lock taken around the call.
  * The p_stops test itself is made before the lock is acquired.
  */
 #define	STOPEVENT(p, e, v) do {						\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,			\
  	    "checking stopevent %d", (e));				\
 	if ((p)->p_stops & (e))	{					\
 		PROC_LOCK(p);						\
 		stopevent((p), (e), (v));				\
 		PROC_UNLOCK(p);						\
 	}								\
 } while (0)
 /*
  * Variant of STOPEVENT() for callers that already hold the process lock.
  * Asserts that the lock is owned, then calls stopevent((p), (e), (v)) when
  * any of the event bits in (e) are set in (p)->p_stops.  The process
  * argument is parenthesized at every use so that an arbitrary pointer
  * expression can be passed without changing how the macro expands.
  */
 #define	_STOPEVENT(p, e, v) do {					\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &(p)->p_mtx.lock_object, \
 	    "checking stopevent %d", (e));				\
 	if ((p)->p_stops & (e))						\
 		stopevent((p), (e), (v));				\
 } while (0)
 
 /* Lock and unlock a process. */
 #define	PROC_LOCK(p)	mtx_lock(&(p)->p_mtx)
 #define	PROC_TRYLOCK(p)	mtx_trylock(&(p)->p_mtx)
 #define	PROC_UNLOCK(p)	mtx_unlock(&(p)->p_mtx)
 #define	PROC_LOCKED(p)	mtx_owned(&(p)->p_mtx)
 #define	PROC_LOCK_ASSERT(p, type)	mtx_assert(&(p)->p_mtx, (type))
 
 /* Lock and unlock a process group. */
 #define	PGRP_LOCK(pg)	mtx_lock(&(pg)->pg_mtx)
 #define	PGRP_UNLOCK(pg)	mtx_unlock(&(pg)->pg_mtx)
 #define	PGRP_LOCKED(pg)	mtx_owned(&(pg)->pg_mtx)
 #define	PGRP_LOCK_ASSERT(pg, type)	mtx_assert(&(pg)->pg_mtx, (type))
 
 /*
  * NULL-tolerant variants of PGRP_LOCK()/PGRP_UNLOCK(): they do nothing
  * when (pg) is NULL.
  */
 #define	PGRP_LOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_LOCK(pg);						\
 } while (0)
 #define	PGRP_UNLOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_UNLOCK(pg);					\
 } while (0)
 
 /* Lock and unlock a session. */
 #define	SESS_LOCK(s)	mtx_lock(&(s)->s_mtx)
 #define	SESS_UNLOCK(s)	mtx_unlock(&(s)->s_mtx)
 #define	SESS_LOCKED(s)	mtx_owned(&(s)->s_mtx)
 #define	SESS_LOCK_ASSERT(s, type)	mtx_assert(&(s)->s_mtx, (type))
 
 /*
  * Non-zero p_lock ensures that:
  * - exit1() is not performed until p_lock reaches zero;
  * - the process' threads stack are not swapped out if they are currently
  *   not (P_INMEM).
  *
  * PHOLD() asserts that the process (except the current process) is
  * not exiting, increments p_lock and swaps threads stacks into memory,
  * if needed.
  * _PHOLD() is same as PHOLD(), it takes the process locked.
  * _PHOLD_LITE() also takes the process locked, but comparing with
  * _PHOLD(), it only guarantees that exit1() is not executed,
  * faultin() is not called.
  */
 #define	PHOLD(p) do {							\
 	PROC_LOCK(p);							\
 	_PHOLD(p);							\
 	PROC_UNLOCK(p);							\
 } while (0)
 #define	_PHOLD(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
 	    ("PHOLD of exiting process %p", p));			\
 	(p)->p_lock++;							\
 	if (((p)->p_flag & P_INMEM) == 0)				\
 		faultin((p));						\
 } while (0)
 #define	_PHOLD_LITE(p) do {						\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
 	    ("PHOLD of exiting process %p", p));			\
 	(p)->p_lock++;							\
 } while (0)
 /* KASSERT that the process hold count (p_lock) is positive. */
 #define	PROC_ASSERT_HELD(p) do {					\
 	KASSERT((p)->p_lock > 0, ("process %p not held", p));		\
 } while (0)
 
 /*
  * PRELE() drops one hold on the process: it takes the process lock,
  * runs _PRELE() and releases the lock again.
  */
 #define	PRELE(p) do {							\
 	PROC_LOCK((p));							\
 	_PRELE((p));							\
 	PROC_UNLOCK((p));						\
 } while (0)
 /*
  * _PRELE() requires the process lock to be owned and the process to be
  * held.  It decrements p_lock and, when the count reaches zero on a
  * process that has P_WEXIT set, calls wakeup() on &p_lock.
  */
 #define	_PRELE(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	PROC_ASSERT_HELD(p);						\
 	(--(p)->p_lock);						\
 	if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0)		\
 		wakeup(&(p)->p_lock);					\
 } while (0)
 /* KASSERT that the process hold count (p_lock) is zero. */
 #define	PROC_ASSERT_NOT_HELD(p) do {					\
 	KASSERT((p)->p_lock == 0, ("process %p held", p));		\
 } while (0)
 
 /* Increment the p_cowgen counter; the process lock must be owned. */
 #define	PROC_UPDATE_COW(p) do {						\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	(p)->p_cowgen++;						\
 } while (0)
 
 /* Check whether a thread is safe to be swapped out. */
 #define	thread_safetoswapout(td)	((td)->td_flags & TDF_CANSWAP)
 
 /* Control whether or not it is safe for curthread to sleep. */
 #define	THREAD_NO_SLEEPING()		((curthread)->td_no_sleeping++)
 
 #define	THREAD_SLEEPING_OK()		((curthread)->td_no_sleeping--)
 
 #define	THREAD_CAN_SLEEP()		((curthread)->td_no_sleeping == 0)
 
 #define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
 extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
 extern u_long pidhash;
 #define	TIDHASH(tid)	(&tidhashtbl[(tid) & tidhash])
 extern LIST_HEAD(tidhashhead, thread) *tidhashtbl;
 extern u_long tidhash;
 extern struct rwlock tidhash_lock;
 
 #define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
 extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
 extern u_long pgrphash;
 
 extern struct sx allproc_lock;
 extern int allproc_gen;
 extern struct sx proctree_lock;
 extern struct mtx ppeers_lock;
 extern struct proc proc0;		/* Process slot for swapper. */
 extern struct thread0_storage thread0_st;	/* Primary thread in proc0. */
 #define	thread0 (thread0_st.t0st_thread)
 extern struct vmspace vmspace0;		/* VM space for proc0. */
 extern int hogticks;			/* Limit on kernel cpu hogs. */
 extern int lastpid;
 extern int nprocs, maxproc;		/* Current and max number of procs. */
 extern int maxprocperuid;		/* Max procs per uid. */
 extern u_long ps_arg_cache_limit;
 
 LIST_HEAD(proclist, proc);
 TAILQ_HEAD(procqueue, proc);
 TAILQ_HEAD(threadqueue, thread);
 extern struct proclist allproc;		/* List of all processes. */
 extern struct proclist zombproc;	/* List of zombie processes. */
 extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
 
 extern struct uma_zone *proc_zone;
 
 struct	proc *pfind(pid_t);		/* Find process by id. */
 struct	proc *pfind_locked(pid_t pid);
 struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
 struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
 
 struct	fork_req {
 	int		fr_flags;
 	int		fr_pages;
 	int 		*fr_pidp;
 	struct proc 	**fr_procp;
 	int 		*fr_pd_fd;
 	int 		fr_pd_flags;
 	struct filecaps	*fr_pd_fcaps;
 };
 
 /*
  * pget() flags.
  */
 #define	PGET_HOLD	0x00001	/* Hold the process. */
 #define	PGET_CANSEE	0x00002	/* Check against p_cansee(). */
 #define	PGET_CANDEBUG	0x00004	/* Check against p_candebug(). */
 #define	PGET_ISCURRENT	0x00008	/* Check that the found process is current. */
 #define	PGET_NOTWEXIT	0x00010	/* Check that the process is not in P_WEXIT. */
 #define	PGET_NOTINEXEC	0x00020	/* Check that the process is not in P_INEXEC. */
 #define	PGET_NOTID	0x00040	/* Do not assume tid if pid > PID_MAX. */
 
 #define	PGET_WANTREAD	(PGET_HOLD | PGET_CANDEBUG | PGET_NOTWEXIT)
 
 int	pget(pid_t pid, int flags, struct proc **pp);
 
 void	ast(struct trapframe *framep);
 struct	thread *choosethread(void);
 int	cr_cansee(struct ucred *u1, struct ucred *u2);
 int	cr_canseesocket(struct ucred *cred, struct socket *so);
 int	cr_canseeothergids(struct ucred *u1, struct ucred *u2);
 int	cr_canseeotheruids(struct ucred *u1, struct ucred *u2);
 int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
 int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
 	    struct session *sess);
 int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
 void	faultin(struct proc *p);
 void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
 int	fork1(struct thread *, struct fork_req *);
 void	fork_exit(void (*)(void *, struct trapframe *), void *,
 	    struct trapframe *);
 void	fork_return(struct thread *, struct trapframe *);
 int	inferior(struct proc *p);
 void	kern_yield(int);
 void 	kick_proc0(void);
 void	killjobc(void);
 int	leavepgrp(struct proc *p);
 int	maybe_preempt(struct thread *td);
 void	maybe_yield(void);
 void	mi_switch(int flags, struct thread *newtd);
 int	p_candebug(struct thread *td, struct proc *p);
 int	p_cansee(struct thread *td, struct proc *p);
 int	p_cansched(struct thread *td, struct proc *p);
 int	p_cansignal(struct thread *td, struct proc *p, int signum);
 int	p_canwait(struct thread *td, struct proc *p);
 struct	pargs *pargs_alloc(int len);
 void	pargs_drop(struct pargs *pa);
 void	pargs_hold(struct pargs *pa);
 int	proc_getargv(struct thread *td, struct proc *p, struct sbuf *sb);
 int	proc_getauxv(struct thread *td, struct proc *p, struct sbuf *sb);
 int	proc_getenvv(struct thread *td, struct proc *p, struct sbuf *sb);
 void	procinit(void);
 void	proc_linkup0(struct proc *p, struct thread *td);
 void	proc_linkup(struct proc *p, struct thread *td);
 struct proc *proc_realparent(struct proc *child);
 void	proc_reap(struct thread *td, struct proc *p, int *status, int options);
 void	proc_reparent(struct proc *child, struct proc *newparent);
 void	proc_set_traced(struct proc *p, bool stop);
 struct	pstats *pstats_alloc(void);
 void	pstats_fork(struct pstats *src, struct pstats *dst);
 void	pstats_free(struct pstats *ps);
 void	reaper_abandon_children(struct proc *p, bool exiting);
 int	securelevel_ge(struct ucred *cr, int level);
 int	securelevel_gt(struct ucred *cr, int level);
 void	sess_hold(struct session *);
 void	sess_release(struct session *);
 int	setrunnable(struct thread *);
 void	setsugid(struct proc *p);
 int	should_yield(void);
 int	sigonstack(size_t sp);
 void	stopevent(struct proc *, u_int, u_int);
 struct	thread *tdfind(lwpid_t, pid_t);
 void	threadinit(void);
 void	tidhash_add(struct thread *);
 void	tidhash_remove(struct thread *);
 void	cpu_idle(int);
 int	cpu_idle_wakeup(int);
 extern	void (*cpu_idle_hook)(sbintime_t);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *, struct thread *, struct mtx *);
 void	cpu_throw(struct thread *, struct thread *) __dead2;
 void	unsleep(struct thread *);
 void	userret(struct thread *, struct trapframe *);
 
 void	cpu_exit(struct thread *);
 void	exit1(struct thread *, int, int) __dead2;
 void	cpu_copy_thread(struct thread *td, struct thread *td0);
 int	cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa);
 void	cpu_fork(struct thread *, struct proc *, struct thread *, int);
 void	cpu_fork_kthread_handler(struct thread *, void (*)(void *), void *);
 void	cpu_set_syscall_retval(struct thread *, int);
 void	cpu_set_upcall(struct thread *, void (*)(void *), void *,
 	    stack_t *);
 int	cpu_set_user_tls(struct thread *, void *tls_base);
 void	cpu_thread_alloc(struct thread *);
 void	cpu_thread_clean(struct thread *);
 void	cpu_thread_exit(struct thread *);
 void	cpu_thread_free(struct thread *);
 void	cpu_thread_swapin(struct thread *);
 void	cpu_thread_swapout(struct thread *);
 struct	thread *thread_alloc(int pages);
 int	thread_alloc_stack(struct thread *, int pages);
 void	thread_cow_get_proc(struct thread *newtd, struct proc *p);
 void	thread_cow_get(struct thread *newtd, struct thread *td);
 void	thread_cow_free(struct thread *td);
 void	thread_cow_update(struct thread *td);
 int	thread_create(struct thread *td, struct rtprio *rtp,
 	    int (*initialize_thread)(struct thread *, void *), void *thunk);
 void	thread_exit(void) __dead2;
 void	thread_free(struct thread *td);
 void	thread_link(struct thread *td, struct proc *p);
 void	thread_reap(void);
 int	thread_single(struct proc *p, int how);
 void	thread_single_end(struct proc *p, int how);
 void	thread_stash(struct thread *td);
 void	thread_stopped(struct proc *p);
 void	childproc_stopped(struct proc *child, int reason);
 void	childproc_continued(struct proc *child);
 void	childproc_exited(struct proc *child);
 int	thread_suspend_check(int how);
 bool	thread_suspend_check_needed(void);
 void	thread_suspend_switch(struct thread *, struct proc *p);
 void	thread_suspend_one(struct thread *td);
 void	thread_unlink(struct thread *td);
 void	thread_unsuspend(struct proc *p);
 void	thread_wait(struct proc *p);
 struct thread	*thread_find(struct proc *p, lwpid_t tid);
 
 void	stop_all_proc(void);
 void	resume_all_proc(void);
 
 /*
  * Set the bits in 'flags' in curthread's td_pflags.
  *
  * The return value is a mask meant for curthread_pflags_restore(): every
  * bit outside 'flags' is one, and every bit inside 'flags' carries the
  * value it had in td_pflags before this call.
  */
 static __inline int
 curthread_pflags_set(int flags)
 {
 	struct thread *self;
 	int prev;
 
 	self = curthread;
 	prev = (self->td_pflags & flags) | ~flags;
 	self->td_pflags |= flags;
 	return (prev);
 }
 
 static __inline void
 curthread_pflags_restore(int save)
 {
 
 	curthread->td_pflags &= save;
 }
 
 /*
  * Return the address immediately following the thread structure,
  * typed as a pointer to struct td_sched.
  */
 static __inline __pure2 struct td_sched *
 td_get_sched(struct thread *td)
 {
 
 	return ((struct td_sched *)(td + 1));
 }
 
 extern void (*softdep_ast_cleanup)(struct thread *);
 /*
  * Invoke the softdep_ast_cleanup hook for the thread, but only when the
  * thread's td_su pointer is set and a hook has been installed.
  */
 static __inline void
 td_softdep_cleanup(struct thread *td)
 {
 
 	if (td->td_su == NULL || softdep_ast_cleanup == NULL)
 		return;
 	softdep_ast_cleanup(td);
 }
 
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_PROC_H_ */
Index: head/sys/sys/sleepqueue.h
===================================================================
--- head/sys/sys/sleepqueue.h	(revision 315279)
+++ head/sys/sys/sleepqueue.h	(revision 315280)
@@ -1,117 +1,120 @@
 /*-
  * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_SLEEPQUEUE_H_
 #define _SYS_SLEEPQUEUE_H_
 
 /*
  * Sleep queue interface.  Sleep/wakeup, condition variables, and sx
  * locks use a sleep queue for the queue of threads blocked on a sleep
  * channel.
  *
  * A thread calls sleepq_lock() to lock the sleep queue chain associated
  * with a given wait channel.  A thread can then call sleepq_add() to
  * add itself onto a sleep queue and call one of the sleepq_wait()
  * functions to actually go to sleep.  If a thread needs to abort a sleep
  * operation it should call sleepq_release() to unlock the associated sleep
  * queue chain lock.  If the thread also needs to remove itself from a queue
  * it just enqueued itself on, it can use sleepq_remove() instead.
  *
  * If the thread only wishes to sleep for a limited amount of time, it can
  * call sleepq_set_timeout() after sleepq_add() to setup a timeout.  It
  * should then use one of the sleepq_timedwait() functions to block.
  *
  * A thread is normally resumed from a sleep queue by either the
  * sleepq_signal() or sleepq_broadcast() functions.  Sleepq_signal() wakes
  * the thread with the highest priority that is sleeping on the specified
  * wait channel.  Sleepq_broadcast() wakes all threads that are sleeping
  * on the specified wait channel.  A thread sleeping in an interruptible
  * sleep can be interrupted by calling sleepq_abort().  A thread can also
  * be removed from a specified sleep queue using the sleepq_remove()
  * function.  Note that the sleep queue chain must first be locked via
  * sleepq_lock() before calling sleepq_abort(), sleepq_broadcast(), or
  * sleepq_signal().  These routines each return a boolean that will be true
  * if at least one swapped-out thread was resumed.  In that case, the caller
  * is responsible for waking up the swapper by calling kick_proc0() after
  * releasing the sleep queue chain lock.
  *
  * Each thread allocates a sleep queue at thread creation via sleepq_alloc()
  * and releases it at thread destruction via sleepq_free().  Note that
  * a sleep queue is not tied to a specific thread and that the sleep queue
  * released at thread destruction may not be the same sleep queue that the
  * thread allocated when it was created.
  *
  * XXX: Some other parts of the kernel such as ithread sleeping may end up
  * using this interface as well (death to TDI_IWAIT!)
  */
 
 struct lock_object;
 struct sleepqueue;
 struct thread;
 
 #ifdef _KERNEL
 
 #define	SLEEPQ_TYPE		0x0ff		/* Mask of sleep queue types. */
 #define	SLEEPQ_SLEEP		0x00		/* Used by sleep/wakeup. */
 #define	SLEEPQ_CONDVAR		0x01		/* Used for a cv. */
 #define	SLEEPQ_PAUSE		0x02		/* Used by pause. */
 #define	SLEEPQ_SX		0x03		/* Used by an sx lock. */
 #define	SLEEPQ_LK		0x04		/* Used by a lockmgr. */
 #define	SLEEPQ_INTERRUPTIBLE	0x100		/* Sleep is interruptible. */
 
 void	init_sleepqueues(void);
 int	sleepq_abort(struct thread *td, int intrval);
 void	sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg,
 	    int flags, int queue);
 struct sleepqueue *sleepq_alloc(void);
 int	sleepq_broadcast(void *wchan, int flags, int pri, int queue);
+void	sleepq_chains_remove_matching(bool (*matches)(struct thread *));
 void	sleepq_free(struct sleepqueue *sq);
 void	sleepq_lock(void *wchan);
 struct sleepqueue *sleepq_lookup(void *wchan);
 void	sleepq_release(void *wchan);
 void	sleepq_remove(struct thread *td, void *wchan);
+int	sleepq_remove_matching(struct sleepqueue *sq, int queue,
+	    bool (*matches)(struct thread *), int pri);
 int	sleepq_signal(void *wchan, int flags, int pri, int queue);
 void	sleepq_set_timeout_sbt(void *wchan, sbintime_t sbt,
 	    sbintime_t pr, int flags);
 #define	sleepq_set_timeout(wchan, timo)					\
     sleepq_set_timeout_sbt((wchan), tick_sbt * (timo), 0, C_HARDCLOCK)
 u_int	sleepq_sleepcnt(void *wchan, int queue);
 int	sleepq_timedwait(void *wchan, int pri);
 int	sleepq_timedwait_sig(void *wchan, int pri);
 int	sleepq_type(void *wchan);
 void	sleepq_wait(void *wchan, int pri);
 int	sleepq_wait_sig(void *wchan, int pri);
 
 #ifdef STACK
 struct sbuf;
 int sleepq_sbuf_print_stacks(struct sbuf *sb, void *wchan, int queue,
     int *count_stacks_printed);
 #endif
 
 #endif	/* _KERNEL */
 #endif	/* !_SYS_SLEEPQUEUE_H_ */
Index: head/sys/sys/time.h
===================================================================
--- head/sys/sys/time.h	(revision 315279)
+++ head/sys/sys/time.h	(revision 315280)
@@ -1,499 +1,501 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)time.h	8.5 (Berkeley) 5/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_TIME_H_
 #define	_SYS_TIME_H_
 
 #include <sys/_timeval.h>
 #include <sys/types.h>
 #include <sys/timespec.h>
 
 struct timezone {
 	int	tz_minuteswest;	/* minutes west of Greenwich */
 	int	tz_dsttime;	/* type of dst correction */
 };
 #define	DST_NONE	0	/* not on dst */
 #define	DST_USA		1	/* USA style dst */
 #define	DST_AUST	2	/* Australian style dst */
 #define	DST_WET		3	/* Western European dst */
 #define	DST_MET		4	/* Middle European dst */
 #define	DST_EET		5	/* Eastern European dst */
 #define	DST_CAN		6	/* Canada */
 
 #if __BSD_VISIBLE
 /* Binary time: whole seconds plus a 64-bit binary fraction of a second. */
 struct bintime {
 	time_t	sec;
 	uint64_t frac;
 };
 
 /*
  * Add the 64-bit fraction _x to *_bt, carrying into the seconds when the
  * fraction wraps around.
  */
 static __inline void
 bintime_addx(struct bintime *_bt, uint64_t _x)
 {
 	uint64_t _old;
 
 	_old = _bt->frac;
 	_bt->frac = _old + _x;
 	/* An unsigned sum smaller than an operand means it wrapped. */
 	if (_bt->frac < _old)
 		_bt->sec++;
 }
 
 /*
  * *_bt += *_bt2.  The fractions are added first and a wrap-around is
  * carried into the seconds before the seconds themselves are added.
  */
 static __inline void
 bintime_add(struct bintime *_bt, const struct bintime *_bt2)
 {
 	uint64_t _old;
 
 	_old = _bt->frac;
 	_bt->frac += _bt2->frac;
 	if (_bt->frac < _old)
 		_bt->sec++;
 	_bt->sec += _bt2->sec;
 }
 
 /*
  * *_bt -= *_bt2.  The fractions are subtracted first and a wrap-around is
  * borrowed from the seconds before the seconds themselves are subtracted.
  */
 static __inline void
 bintime_sub(struct bintime *_bt, const struct bintime *_bt2)
 {
 	uint64_t _old;
 
 	_old = _bt->frac;
 	_bt->frac -= _bt2->frac;
 	/* An unsigned difference larger than the minuend means it wrapped. */
 	if (_bt->frac > _old)
 		_bt->sec--;
 	_bt->sec -= _bt2->sec;
 }
 
 /*
  * Multiply *_bt by _x.  The fraction is multiplied as two 32-bit halves
  * so that the overflow out of the 64-bit fraction can be added to the
  * seconds.
  */
 static __inline void
 bintime_mul(struct bintime *_bt, u_int _x)
 {
 	uint64_t _lo, _hi;
 
 	_lo = (_bt->frac & 0xffffffffull) * _x;
 	/* The upper-half product also absorbs the carry out of _lo. */
 	_hi = (_bt->frac >> 32) * _x + (_lo >> 32);
 	_bt->sec *= _x;
 	_bt->sec += (_hi >> 32);
 	_bt->frac = (_lo & 0xffffffffull) | (_hi << 32);
 }
 
 /*
  * Shift *_bt as one wide value: left by _exp bits when _exp is positive,
  * right by -_exp bits when it is negative.  Bits that leave one field are
  * moved into the other.  A zero _exp leaves *_bt untouched.
  */
 static __inline void
 bintime_shift(struct bintime *_bt, int _exp)
 {
 	int _n;
 
 	if (_exp == 0)
 		return;
 	if (_exp > 0) {
 		_bt->sec <<= _exp;
 		_bt->sec |= _bt->frac >> (64 - _exp);
 		_bt->frac <<= _exp;
 		return;
 	}
 	_n = -_exp;
 	_bt->frac >>= _n;
 	_bt->frac |= (uint64_t)_bt->sec << (64 + _exp);
 	_bt->sec >>= _n;
 }
 
 /* Set both fields of the bintime to zero. */
 #define	bintime_clear(a)	((a)->sec = (a)->frac = 0)
 /* Non-zero when either field of the bintime is non-zero. */
 #define	bintime_isset(a)	((a)->sec || (a)->frac)
 /*
  * Compare two bintimes with the operator 'cmp': the fractions are
  * compared when the seconds are equal, the seconds otherwise.
  */
 #define	bintime_cmp(a, b, cmp)						\
 	(((a)->sec == (b)->sec) ?					\
 	    ((a)->frac cmp (b)->frac) :					\
 	    ((a)->sec cmp (b)->sec))
 
 /*
  * sbintime_t constants.  One second is 1 << 32, so the low 32 bits of an
  * sbintime_t hold the fraction of a second.  The sub-second units are
  * computed by integer division of SBT_1S and are therefore truncated.
  */
 #define	SBT_1S	((sbintime_t)1 << 32)
 #define	SBT_1M	(SBT_1S * 60)
 #define	SBT_1MS	(SBT_1S / 1000)
 #define	SBT_1US	(SBT_1S / 1000000)
 #define	SBT_1NS	(SBT_1S / 1000000000)
 #define	SBT_MAX	0x7fffffffffffffffLL
 
 /* Return the seconds part of _sbt (the value shifted right by 32 bits). */
 static __inline int
 sbintime_getsec(sbintime_t _sbt)
 {
 	int _sec;
 
 	_sec = _sbt >> 32;
 	return (_sec);
 }
 
 static __inline sbintime_t
 bttosbt(const struct bintime _bt)
 {
 
 	return (((sbintime_t)_bt.sec << 32) + (_bt.frac >> 32));
 }
 
 /*
  * Convert an sbintime_t to a bintime: the upper 32 bits become the
  * seconds and the lower 32 bits become the top half of the fraction.
  */
 static __inline struct bintime
 sbttobt(sbintime_t _sbt)
 {
 	struct bintime _res;
 
 	_res.frac = _sbt << 32;
 	_res.sec = _sbt >> 32;
 	return (_res);
 }
 
 /*-
  * Background information:
  *
  * When converting between timestamps on parallel timescales of differing
  * resolutions it is historical and scientific practice to round down rather
  * than doing 4/5 rounding.
  *
  *   The date changes at midnight, not at noon.
  *
  *   Even at 15:59:59.999999999 it's not four'o'clock.
  *
  *   time_second ticks after N.999999999 not after N.4999999999
  */
 
 /*
  * Convert a bintime to a timespec.  Only the top 32 bits of the fraction
  * are used, and the nanosecond result is truncated (rounded down).
  */
 static __inline void
 bintime2timespec(const struct bintime *_bt, struct timespec *_ts)
 {
 	uint32_t _hifrac;
 
 	_ts->tv_sec = _bt->sec;
 	_hifrac = (uint32_t)(_bt->frac >> 32);
 	_ts->tv_nsec = ((uint64_t)1000000000 * _hifrac) >> 32;
 }
 
 /*
  * Convert a timespec to a bintime by scaling the nanoseconds to a 64-bit
  * binary fraction.
  */
 static __inline void
 timespec2bintime(const struct timespec *_ts, struct bintime *_bt)
 {
 	/* 18446744073 = int(2^64 / 1000000000) */
 	const uint64_t _per_nsec = (uint64_t)18446744073LL;
 
 	_bt->sec = _ts->tv_sec;
 	_bt->frac = _ts->tv_nsec * _per_nsec;
 }
 
 /*
  * Convert a bintime to a timeval.  Only the top 32 bits of the fraction
  * are used, and the microsecond result is truncated (rounded down).
  */
 static __inline void
 bintime2timeval(const struct bintime *_bt, struct timeval *_tv)
 {
 	uint32_t _hifrac;
 
 	_tv->tv_sec = _bt->sec;
 	_hifrac = (uint32_t)(_bt->frac >> 32);
 	_tv->tv_usec = ((uint64_t)1000000 * _hifrac) >> 32;
 }
 
 /*
  * Convert a timeval to a bintime by scaling the microseconds to a 64-bit
  * binary fraction.
  */
 static __inline void
 timeval2bintime(const struct timeval *_tv, struct bintime *_bt)
 {
 	/* 18446744073709 = int(2^64 / 1000000) */
 	const uint64_t _per_usec = (uint64_t)18446744073709LL;
 
 	_bt->sec = _tv->tv_sec;
 	_bt->frac = _tv->tv_usec * _per_usec;
 }
 
 /*
  * Convert an sbintime_t to a timespec.  The upper 32 bits become the
  * seconds; the lower 32 bits are scaled to nanoseconds and truncated.
  */
 static __inline struct timespec
 sbttots(sbintime_t _sbt)
 {
 	struct timespec _res;
 	uint32_t _frac;
 
 	_frac = (uint32_t)_sbt;
 	_res.tv_sec = _sbt >> 32;
 	_res.tv_nsec = ((uint64_t)1000000000 * _frac) >> 32;
 	return (_res);
 }
 
 /*
  * Convert a timespec to an sbintime_t.  The nanoseconds are scaled to a
  * 64-bit binary fraction, of which the top 32 bits are kept.
  */
 static __inline sbintime_t
 tstosbt(struct timespec _ts)
 {
 	uint64_t _frac;
 
 	_frac = (_ts.tv_nsec * (((uint64_t)1 << 63) / 500000000)) >> 32;
 	return (((sbintime_t)_ts.tv_sec << 32) + _frac);
 }
 
 /*
  * Convert an sbintime_t to a timeval.  The upper 32 bits become the
  * seconds; the lower 32 bits are scaled to microseconds and truncated.
  */
 static __inline struct timeval
 sbttotv(sbintime_t _sbt)
 {
 	struct timeval _res;
 	uint32_t _frac;
 
 	_frac = (uint32_t)_sbt;
 	_res.tv_sec = _sbt >> 32;
 	_res.tv_usec = ((uint64_t)1000000 * _frac) >> 32;
 	return (_res);
 }
 
 /*
  * Convert a timeval to an sbintime_t.  The microseconds are scaled to a
  * 64-bit binary fraction, of which the top 32 bits are kept.
  */
 static __inline sbintime_t
 tvtosbt(struct timeval _tv)
 {
 	uint64_t _frac;
 
 	_frac = (_tv.tv_usec * (((uint64_t)1 << 63) / 500000)) >> 32;
 	return (((sbintime_t)_tv.tv_sec << 32) + _frac);
 }
 #endif /* __BSD_VISIBLE */
 
 #ifdef _KERNEL
 
 /* Operations on timespecs */
 #define	timespecclear(tvp)	((tvp)->tv_sec = (tvp)->tv_nsec = 0)
 #define	timespecisset(tvp)	((tvp)->tv_sec || (tvp)->tv_nsec)
 #define	timespeccmp(tvp, uvp, cmp)					\
 	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
 	    ((tvp)->tv_nsec cmp (uvp)->tv_nsec) :			\
 	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
 /*
  * *vvp += *uvp.  At most one carry from tv_nsec into tv_sec is applied,
  * so the result is normalized only when both operands were.
  */
 #define	timespecadd(vvp, uvp)						\
 	do {								\
 		(vvp)->tv_sec += (uvp)->tv_sec;				\
 		(vvp)->tv_nsec += (uvp)->tv_nsec;			\
 		if ((vvp)->tv_nsec >= 1000000000) {			\
 			(vvp)->tv_sec++;				\
 			(vvp)->tv_nsec -= 1000000000;			\
 		}							\
 	} while (0)
 /*
  * *vvp -= *uvp.  At most one borrow from tv_sec into tv_nsec is applied,
  * so the result is normalized only when both operands were.
  */
 #define	timespecsub(vvp, uvp)						\
 	do {								\
 		(vvp)->tv_sec -= (uvp)->tv_sec;				\
 		(vvp)->tv_nsec -= (uvp)->tv_nsec;			\
 		if ((vvp)->tv_nsec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_nsec += 1000000000;			\
 		}							\
 	} while (0)
 
 /* Operations on timevals. */
 
 #define	timevalclear(tvp)		((tvp)->tv_sec = (tvp)->tv_usec = 0)
 #define	timevalisset(tvp)		((tvp)->tv_sec || (tvp)->tv_usec)
 #define	timevalcmp(tvp, uvp, cmp)					\
 	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
 	    ((tvp)->tv_usec cmp (uvp)->tv_usec) :			\
 	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
 
 /* timevaladd and timevalsub are not inlined */
 
 #endif /* _KERNEL */
 
 #ifndef _KERNEL			/* NetBSD/OpenBSD compatible interfaces */
 
 #define	timerclear(tvp)		((tvp)->tv_sec = (tvp)->tv_usec = 0)
 #define	timerisset(tvp)		((tvp)->tv_sec || (tvp)->tv_usec)
 #define	timercmp(tvp, uvp, cmp)					\
 	(((tvp)->tv_sec == (uvp)->tv_sec) ?				\
 	    ((tvp)->tv_usec cmp (uvp)->tv_usec) :			\
 	    ((tvp)->tv_sec cmp (uvp)->tv_sec))
 /*
  * *vvp = *tvp + *uvp.  At most one carry from tv_usec into tv_sec is
  * applied, so the result is normalized only when both operands were.
  */
 #define	timeradd(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec >= 1000000) {			\
 			(vvp)->tv_sec++;				\
 			(vvp)->tv_usec -= 1000000;			\
 		}							\
 	} while (0)
 /*
  * *vvp = *tvp - *uvp.  At most one borrow from tv_sec into tv_usec is
  * applied, so the result is normalized only when both operands were.
  */
 #define	timersub(tvp, uvp, vvp)						\
 	do {								\
 		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
 		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
 		if ((vvp)->tv_usec < 0) {				\
 			(vvp)->tv_sec--;				\
 			(vvp)->tv_usec += 1000000;			\
 		}							\
 	} while (0)
 #endif
 
 /*
  * Names of the interval timers, and structure
  * defining a timer setting.
  */
 #define	ITIMER_REAL	0
 #define	ITIMER_VIRTUAL	1
 #define	ITIMER_PROF	2
 
 struct itimerval {
 	struct	timeval it_interval;	/* timer interval */
 	struct	timeval it_value;	/* current value */
 };
 
 /*
  * Getkerninfo clock information structure
  */
 struct clockinfo {
 	int	hz;		/* clock frequency */
 	int	tick;		/* micro-seconds per hz tick */
 	int	spare;
 	int	stathz;		/* statistics clock frequency */
 	int	profhz;		/* profiling clock frequency */
 };
 
 /*
  * Clock identifiers.
  *
  * These macros are also in time.h.  The whole list is skipped when
  * CLOCK_REALTIME is already defined, so whichever header is seen first
  * provides the definitions.  The values 3 and 6 are not assigned in
  * this list.
  */
 #ifndef CLOCK_REALTIME
 #define	CLOCK_REALTIME	0
 #define	CLOCK_VIRTUAL	1
 #define	CLOCK_PROF	2
 #define	CLOCK_MONOTONIC	4
 #define	CLOCK_UPTIME	5		/* FreeBSD-specific. */
 #define	CLOCK_UPTIME_PRECISE	7	/* FreeBSD-specific. */
 #define	CLOCK_UPTIME_FAST	8	/* FreeBSD-specific. */
 #define	CLOCK_REALTIME_PRECISE	9	/* FreeBSD-specific. */
 #define	CLOCK_REALTIME_FAST	10	/* FreeBSD-specific. */
 #define	CLOCK_MONOTONIC_PRECISE	11	/* FreeBSD-specific. */
 #define	CLOCK_MONOTONIC_FAST	12	/* FreeBSD-specific. */
 #define	CLOCK_SECOND	13		/* FreeBSD-specific. */
 #define	CLOCK_THREAD_CPUTIME_ID	14
 #define	CLOCK_PROCESS_CPUTIME_ID	15
 #endif
 
 /*
  * Timer flag values.  Skipped when TIMER_ABSTIME is already defined,
  * so an earlier definition from another header is kept.
  */
 #ifndef TIMER_ABSTIME
 #define	TIMER_RELTIME	0x0	/* relative timer */
 #define	TIMER_ABSTIME	0x1	/* absolute timer */
 #endif
 
 /*
  * Visible only when __BSD_VISIBLE is non-zero.
  * NOTE(review): presumably the int selector passed to
  * clock_getcpuclockid2() (declared below), choosing whether its id
  * argument names a process or a thread -- confirm.
  */
 #if __BSD_VISIBLE
 #define	CPUCLOCK_WHICH_PID	0
 #define	CPUCLOCK_WHICH_TID	1
 #endif
 
 #ifdef _KERNEL
 
 /*
  * Kernel to clock driver interface.
  */
 void	inittodr(time_t base);
 void	resettodr(void);
 
 /*
  * Timekeeping variables defined elsewhere in the kernel.  Several come
  * in pairs holding a struct bintime form (*_bt, bt_*) and an sbintime_t
  * form (*_sbt, sbt_*).  sbt_timethreshold is the cut-off consulted by
  * the TIMESEL() macro below.
  * NOTE(review): time_second / time_uptime are presumably coarse UTC
  * seconds and seconds since boot -- confirm against their definition.
  */
 extern volatile time_t	time_second;
 extern volatile time_t	time_uptime;
 extern struct bintime tc_tick_bt;
 extern sbintime_t tc_tick_sbt;
 extern struct bintime tick_bt;
 extern sbintime_t tick_sbt;
 extern int tc_precexp;
 extern int tc_timepercentage;
 extern struct bintime bt_timethreshold;
 extern struct bintime bt_tickthreshold;
 extern sbintime_t sbt_timethreshold;
 extern sbintime_t sbt_tickthreshold;
 
+extern volatile int rtc_generation;	/* NOTE(review): presumably changes when the RTC is stepped; confirm */
+
 /*
  * Functions for looking at our clock: [get]{bin,nano,micro}[up]time()
  *
  * Functions without the "get" prefix return the best timestamp
  * we can produce in the given format.
  *
  * "bin"   == struct bintime  == seconds + 64 bit fraction of seconds.
  * "nano"  == struct timespec == seconds + nanoseconds.
  * "micro" == struct timeval  == seconds + microseconds.
  *
  * Functions containing "up" return time relative to boot and
  * should be used for calculating time intervals.
  *
  * Functions without "up" return UTC time.
  *
  * Functions with the "get" prefix return a less precise result
  * much faster than the functions without the "get" prefix and should
  * be used where a precision of 1/hz seconds is acceptable or where
  * performance is a priority. (NB: "precision", _not_ "resolution" !)
  */
 
 /* Best available timestamp, relative to boot; result stored via the pointer. */
 void	binuptime(struct bintime *bt);
 void	nanouptime(struct timespec *tsp);
 void	microuptime(struct timeval *tvp);
 
 static __inline sbintime_t
 sbinuptime(void)
 {
 	struct bintime _bt;
 
 	binuptime(&_bt);
 	return (bttosbt(_bt));
 }
 
 void	bintime(struct bintime *bt);
 void	nanotime(struct timespec *tsp);
 void	microtime(struct timeval *tvp);
 
 void	getbinuptime(struct bintime *bt);
 void	getnanouptime(struct timespec *tsp);
 void	getmicrouptime(struct timeval *tvp);
 
 static __inline sbintime_t
 getsbinuptime(void)
 {
 	struct bintime _bt;
 
 	getbinuptime(&_bt);
 	return (bttosbt(_bt));
 }
 
 void	getbintime(struct bintime *bt);
 void	getnanotime(struct timespec *tsp);
 void	getmicrotime(struct timeval *tvp);
 
 void	getboottime(struct timeval *boottime);
 void	getboottimebin(struct bintime *boottimebin);
 
 /*
  * Other functions
  *
  * timevaladd() and timevalsub() are the out-of-line companions of the
  * timeval macros defined earlier in this header; t2 is const, so only
  * *t1 can be modified (presumably it receives the result -- confirm).
  */
 int	itimerdecr(struct itimerval *itp, int usec);
 int	itimerfix(struct timeval *tv);
 int	ppsratecheck(struct timeval *, int *, int);
 int	ratecheck(struct timeval *, const struct timeval *);
 void	timevaladd(struct timeval *t1, const struct timeval *t2);
 void	timevalsub(struct timeval *t1, const struct timeval *t2);
 int	tvtohz(struct timeval *tv);
 
 /* NOTE(review): presumably the default for tc_timepercentage; confirm. */
 #define	TC_DEFAULTPERC		5
 
 /*
  * BT2FREQ: 2^64 / bt->frac rounded to nearest, i.e. the frequency whose
  * period is the fractional-second value bt->frac.  Numerator and
  * denominator are both halved (2^63 + frac/4 over frac/2) so that the
  * numerator fits in 64 bits.  bt->sec is not looked at; bt->frac must
  * be at least 2 or the divisor is zero.
  */
 #define	BT2FREQ(bt)							\
 	((((uint64_t)1 << 63) + ((bt)->frac >> 2)) / ((bt)->frac >> 1))
 
 /*
  * SBT2FREQ: SBT_1S / sbt with half of sbt added to the numerator first,
  * which rounds to nearest for positive sbt.  sbt must be non-zero.
  */
 #define	SBT2FREQ(sbt)							\
 	((SBT_1S + ((sbt) >> 1)) / (sbt))
 
 /*
  * FREQ2BT: store the period of a `freq' Hz signal in *bt, which must
  * have `sec' and `frac' members (struct bintime: seconds plus a 64 bit
  * fraction of a second).  sec is set to 0 and frac to approximately
  * 2^64 / freq, computed as (2^63 / freq) << 1 to stay within 64 bits.
  *
  * freq must be non-zero.  A freq of 1 is not representable: the shift
  * wraps and frac becomes 0.
  *
  * The body is wrapped in do { } while (0) so the macro behaves as a
  * single statement, e.g. as the unbraced branch of an if/else; callers
  * supply the terminating semicolon.
  */
 #define	FREQ2BT(freq, bt)						\
 	do {								\
 		(bt)->sec = 0;						\
 		(bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \
 	} while (0)
 
 /*
  * TIMESEL: store a time-since-boot stamp in *sbt, choosing the reader
  * from sbt2.  When sbt2 is below sbt_timethreshold the stamp comes
  * from sbinuptime() and the macro evaluates to 0; otherwise it comes
  * from the faster getsbinuptime() and the macro evaluates to 1.
  */
 #define	TIMESEL(sbt, sbt2)						\
 	(((sbt2) < sbt_timethreshold) ?					\
 	    ((*(sbt) = sbinuptime()), 0) : ((*(sbt) = getsbinuptime()), 1))
 
 #else /* !_KERNEL */
 #include <time.h>
 
 #include <sys/cdefs.h>
 #include <sys/select.h>
 
 /* Userland (!_KERNEL) prototypes, grouped by visibility macro. */
 __BEGIN_DECLS
 /* Declared unconditionally. */
 int	setitimer(int, const struct itimerval *, struct itimerval *);
 int	utimes(const char *, const struct timeval *);
 
 /* Declared only when __BSD_VISIBLE is non-zero. */
 #if __BSD_VISIBLE
 int	adjtime(const struct timeval *, struct timeval *);
 int	clock_getcpuclockid2(id_t, int, clockid_t *);
 int	futimes(int, const struct timeval *);
 int	futimesat(int, const char *, const struct timeval [2]);
 int	lutimes(const char *, const struct timeval *);
 int	settimeofday(const struct timeval *, const struct timezone *);
 #endif
 
 /* Declared only when __XSI_VISIBLE is non-zero. */
 #if __XSI_VISIBLE
 int	getitimer(int, struct itimerval *);
 int	gettimeofday(struct timeval *, struct timezone *);
 #endif
 
 __END_DECLS
 
 #endif /* !_KERNEL */
 
 #endif /* !_SYS_TIME_H_ */