Index: stable/10/sys/kern/kern_lock.c
===================================================================
--- stable/10/sys/kern/kern_lock.c	(revision 278649)
+++ stable/10/sys/kern/kern_lock.c	(revision 278650)
@@ -1,1532 +1,1552 @@
 /*-
  * Copyright (c) 2008 Attilio Rao <attilio@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include "opt_adaptive_lockmgrs.h"
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/lock_profile.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sleepqueue.h>
 #ifdef DEBUG_LOCKS
 #include <sys/stack.h>
 #endif
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <machine/cpu.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DECLARE( , , lock, failed);
 #endif
 
 /*
  * lockinit() copies LK_ADAPTIVE and LK_NOSHARE straight into lo_flags, so
  * both bits must be covered by the LO_CLASSFLAGS mask.
  */
 CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) ==
     (LK_ADAPTIVE | LK_NOSHARE));
 /* The unlocked encoding must not carry any waiters or spinners bit. */
 CTASSERT(LK_UNLOCKED == (LK_UNLOCKED &
     ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS)));
 
 /* Sleepqueue indexes used for the exclusive and the shared waiters. */
 #define	SQ_EXCLUSIVE_QUEUE	0
 #define	SQ_SHARED_QUEUE		1
 
 /*
  * Per-thread lock accounting.
  * Without INVARIANTS the assertion hook and the td_locks counter updates
  * compile away.  The td_lk_slocks counter is always maintained;
  * LK_CAN_SHARE() reads it.
  */
 #ifndef INVARIANTS
 #define	_lockmgr_assert(lk, what, file, line)
 #define	TD_LOCKS_INC(td)
 #define	TD_LOCKS_DEC(td)
 #else
 #define	TD_LOCKS_INC(td)	((td)->td_locks++)
 #define	TD_LOCKS_DEC(td)	((td)->td_locks--)
 #endif
 #define	TD_SLOCKS_INC(td)	((td)->td_lk_slocks++)
 #define	TD_SLOCKS_DEC(td)	((td)->td_lk_slocks--)
 
 /*
  * With DEBUG_LOCKS the lk_stack member of the lock is printed, saved or
  * zeroed through the stack(9) routines; otherwise these expand to nothing.
  */
 #ifndef DEBUG_LOCKS
 #define	STACK_PRINT(lk)
 #define	STACK_SAVE(lk)
 #define	STACK_ZERO(lk)
 #else
 #define	STACK_PRINT(lk)	stack_print_ddb(&(lk)->lk_stack)
 #define	STACK_SAVE(lk)	stack_save(&(lk)->lk_stack)
 #define	STACK_ZERO(lk)	stack_zero(&(lk)->lk_stack)
 #endif
 
 /*
  * KTR logging helpers: emit a KTR_LOCK record with two or three arguments
  * only when LOCK_LOG_TEST() accepts the lock.
  * The bodies are wrapped in do { } while (0) so that each macro behaves as
  * a single statement; a bare "if" would capture the "else" of an enclosing
  * unbraced if/else.
  */
 #define	LOCK_LOG2(lk, string, arg1, arg2) do {				\
 	if (LOCK_LOG_TEST(&(lk)->lock_object, 0))			\
 		CTR2(KTR_LOCK, (string), (arg1), (arg2));		\
 } while (0)
 #define	LOCK_LOG3(lk, string, arg1, arg2, arg3) do {			\
 	if (LOCK_LOG_TEST(&(lk)->lock_object, 0))			\
 		CTR3(KTR_LOCK, (string), (arg1), (arg2), (arg3));	\
 } while (0)
 
 /*
  * Helpers to drop Giant around a sleep or a spin and to take it back.
  *
  * GIANT_DECLARE   declares the per-function state: _i, the number of
  *                 times Giant has been released, and the WITNESS save
  *                 area.
  * GIANT_SAVE()    releases Giant as many times as curthread holds it,
  *                 counting the releases in _i.
  * GIANT_RESTORE() reacquires Giant _i times and leaves _i at zero, so a
  *                 SAVE/RESTORE pair can be repeated inside the same
  *                 function (retry loops) without skewing the count.
  */
 #define	GIANT_DECLARE							\
 	int _i = 0;							\
 	WITNESS_SAVE_DECL(Giant)
 #define	GIANT_RESTORE() do {						\
 	if (_i > 0) {							\
 		while (_i > 0) {					\
 			mtx_lock(&Giant);				\
 			_i--;						\
 		}							\
 		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
 	}								\
 } while (0)
 #define	GIANT_SAVE() do {						\
 	if (mtx_owned(&Giant)) {					\
 		WITNESS_SAVE(&Giant.lock_object, Giant);		\
 		while (mtx_owned(&Giant)) {				\
 			_i++;						\
 			mtx_unlock(&Giant);				\
 		}							\
 	}								\
 } while (0)
 
 /*
  * True when a shared request can be granted on the lock word x: the lock
  * must be in shared mode and either no exclusive waiter/spinner is
  * present, or curthread already holds shared lockmgr locks (unless the
  * request carries LK_NODDLKTREAT), or curthread has TDP_DEADLKTREAT set.
  * Both arguments are fully parenthesized so any expression can be passed.
  */
 #define	LK_CAN_SHARE(x, flags)						\
 	(((x) & LK_SHARE) &&						\
 	(((x) & (LK_EXCLUSIVE_WAITERS | LK_EXCLUSIVE_SPINNERS)) == 0 ||	\
 	(curthread->td_lk_slocks != 0 && !((flags) & LK_NODDLKTREAT)) ||	\
 	(curthread->td_pflags & TDP_DEADLKTREAT)))
 /* Non-zero when the request must not sleep (LK_NOWAIT). */
 #define	LK_TRYOP(x)							\
 	((x) & LK_NOWAIT)
 
 /* WITNESS order checks are skipped for LK_NOWITNESS and try requests. */
 #define	LK_CAN_WITNESS(x)						\
 	(((x) & LK_NOWITNESS) == 0 && !LK_TRYOP(x))
 /* LOP_TRYLOCK for try requests, 0 otherwise. */
 #define	LK_TRYWIT(x)							\
 	(LK_TRYOP(x) ? LOP_TRYLOCK : 0)
 
 /* Adaptive spinning needs an LK_ADAPTIVE lock and no LK_SLEEPFAIL. */
 #define	LK_CAN_ADAPT(lk, f)						\
 	(((lk)->lock_object.lo_flags & LK_ADAPTIVE) != 0 &&		\
 	((f) & LK_SLEEPFAIL) == 0)
 
 /*
  * Ownership tests on the lock word with the waiters/spinners flags masked
  * out (LK_SHARE is kept, so a share-held lock never matches): owned by
  * LK_KERNPROC, respectively owned by curthread.
  */
 #define	lockmgr_disowned(lk)						\
 	(((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == LK_KERNPROC)
 
 #define	lockmgr_xlocked(lk)						\
 	(((lk)->lk_lock & ~(LK_FLAGMASK & ~LK_SHARE)) == (uintptr_t)curthread)
 
 /*
  * Forward declarations of the hooks plugged into lock_class_lockmgr.
  * Parameter names match the definitions.
  */
 static void	assert_lockmgr(const struct lock_object *lock, int what);
 #ifdef DDB
 static void	db_show_lockmgr(const struct lock_object *lock);
 #endif
 static void	lock_lockmgr(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_lockmgr(const struct lock_object *lock,
 		    struct thread **owner);
 #endif
 static uintptr_t unlock_lockmgr(struct lock_object *lock);
 
 /*
  * Lock class descriptor for lockmgr locks.
  * The assert, lock, unlock and owner hooks installed here are stubs that
  * panic when invoked; the DDB show hook and the owner hook are only
  * present when DDB, respectively KDTRACE_HOOKS, is configured.
  */
 struct lock_class lock_class_lockmgr = {
 	.lc_name = "lockmgr",
 	.lc_flags = LC_RECURSABLE | LC_SLEEPABLE | LC_SLEEPLOCK | LC_UPGRADABLE,
 	.lc_assert = assert_lockmgr,
 #ifdef DDB
 	.lc_ddb_show = db_show_lockmgr,
 #endif
 	.lc_lock = lock_lockmgr,
 	.lc_unlock = unlock_lockmgr,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_lockmgr,
 #endif
 };
 
 #ifdef ADAPTIVE_LOCKMGRS
 /*
  * Limits for adaptive spinning on a share-held lock, exported read/write
  * under the debug.lockmgr sysctl node: alk_retries bounds the number of
  * spinning rounds a single __lockmgr_args() call attempts, alk_loops
  * bounds the iterations of each round.
  */
 static u_int alk_retries = 10;
 static u_int alk_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, lockmgr, CTLFLAG_RD, NULL,
     "lockmgr debugging");
 SYSCTL_UINT(_debug_lockmgr, OID_AUTO, retries, CTLFLAG_RW, &alk_retries, 0, "");
 SYSCTL_UINT(_debug_lockmgr, OID_AUTO, loops, CTLFLAG_RW, &alk_loops, 0, "");
 #endif
 
 /*
  * Return the thread encoded as holder in the lock word, or NULL when the
  * lock word has LK_SHARE set.  The lock word is read once.
  */
 static __inline struct thread *
 lockmgr_xholder(const struct lock *lk)
 {
 	uintptr_t state;
 
 	state = lk->lk_lock;
 	if ((state & LK_SHARE) != 0)
 		return (NULL);
 	return ((struct thread *)LK_HOLDER(state));
 }
 
 /*
  * Put curthread to sleep on one of the sleepqueues of a lockmgr lock.
  *
  * It assumes sleepq_lock held and returns with this one unheld.
  * It also assumes the generic interlock is sane and previously checked.
  * If LK_INTERLOCK is specified the interlock is released before sleeping
  * and is not reacquired after the sleep.
  *
  * lk    - lock being waited for
  * flags - LK_* request flags; LK_INTERLOCK, LK_SLEEPFAIL and LK_TIMELOCK
  *         are the ones examined here
  * ilk   - interlock, only used when LK_INTERLOCK is set
  * wmesg - wait message handed to sleepq_add()
  * pri   - sleep priority; the PCATCH bit selects an interruptible sleep
  * timo  - timeout, honoured only together with LK_TIMELOCK and if non-zero
  * queue - SQ_EXCLUSIVE_QUEUE or SQ_SHARED_QUEUE
  *
  * Returns the error reported by the sleepqueue wait primitive (0 for the
  * plain sleepq_wait() case).  When that is 0 and LK_SLEEPFAIL was
  * requested, ENOLCK is returned instead.
  */
 static __inline int
 sleeplk(struct lock *lk, u_int flags, struct lock_object *ilk,
     const char *wmesg, int pri, int timo, int queue)
 {
 	GIANT_DECLARE;
 	struct lock_class *class;
 	int catch, error;
 
 	/* The interlock class is only needed to release the interlock. */
 	class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
 	/* Split the PCATCH request from the priority proper. */
 	catch = pri & PCATCH;
 	pri &= PRIMASK;
 	error = 0;
 
 	LOCK_LOG3(lk, "%s: %p blocking on the %s sleepqueue", __func__, lk,
 	    (queue == SQ_EXCLUSIVE_QUEUE) ? "exclusive" : "shared");
 
 	if (flags & LK_INTERLOCK)
 		class->lc_unlock(ilk);
 	/*
 	 * Account for exclusive sleepers using LK_SLEEPFAIL; the wakeup
 	 * paths compare this counter with the real number of sleepers.
 	 */
 	if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0)
 		lk->lk_exslpfail++;
 	/* Giant is dropped for the duration of the sleep. */
 	GIANT_SAVE();
 	sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ?
 	    SLEEPQ_INTERRUPTIBLE : 0), queue);
 	if ((flags & LK_TIMELOCK) && timo)
 		sleepq_set_timeout(&lk->lock_object, timo);
 
 	/*
 	 * Decisional switch for real sleeping.
 	 */
 	if ((flags & LK_TIMELOCK) && timo && catch)
 		error = sleepq_timedwait_sig(&lk->lock_object, pri);
 	else if ((flags & LK_TIMELOCK) && timo)
 		error = sleepq_timedwait(&lk->lock_object, pri);
 	else if (catch)
 		error = sleepq_wait_sig(&lk->lock_object, pri);
 	else
 		sleepq_wait(&lk->lock_object, pri);
 	GIANT_RESTORE();
 	/* With LK_SLEEPFAIL a successful wakeup is still reported as failure. */
 	if ((flags & LK_SLEEPFAIL) && error == 0)
 		error = ENOLCK;
 
 	return (error);
 }
 
 /*
  * Release one shared reference held by curthread on lk and, when it is
  * the last one and waiters are flagged, wake up the sleepers of the
  * selected queue.  file and line identify the caller for WITNESS and for
  * lock logging.
  *
  * Returns the accumulated result of the sleepq_broadcast() calls, 0 when
  * no broadcast was needed.
  */
 static __inline int
 wakeupshlk(struct lock *lk, const char *file, int line)
 {
 	uintptr_t v, x;
 	u_int realexslp;
 	int queue, wakeup_swapper;
 
 	WITNESS_UNLOCK(&lk->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, file, line);
 
 	wakeup_swapper = 0;
 	/* Retry until one of the compare-and-set transitions succeeds. */
 	for (;;) {
 		x = lk->lk_lock;
 
 		/*
 		 * If there is more than one shared lock held, just drop one
 		 * and return.
 		 */
 		if (LK_SHARERS(x) > 1) {
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, x,
 			    x - LK_ONE_SHARER))
 				break;
 			continue;
 		}
 
 		/*
 		 * If there are no waiters at all, drop the lock quickly.
 		 */
 		if ((x & LK_ALL_WAITERS) == 0) {
 			MPASS((x & ~LK_EXCLUSIVE_SPINNERS) ==
 			    LK_SHARERS_LOCK(1));
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, x, LK_UNLOCKED))
 				break;
 			continue;
 		}
 
 		/*
 		 * We should have a sharer with waiters, so enter the hard
 		 * path in order to handle wakeups correctly.
 		 * From here on x only holds the waiters and spinners bits
 		 * and v is the lock word to be installed.
 		 */
 		sleepq_lock(&lk->lock_object);
 		x = lk->lk_lock & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 		v = LK_UNLOCKED;
 
 		/*
 		 * If the lock has exclusive waiters, give them preference in
 		 * order to avoid deadlock with shared runners up.
 		 * If interruptible sleeps left the exclusive queue empty
 		 * avoid a starvation for the threads sleeping on the shared
 		 * queue by giving them precedence and cleaning up the
 		 * exclusive waiters bit anyway.
 		 * Please note that lk_exslpfail count may be lying about
 		 * the real number of waiters with the LK_SLEEPFAIL flag on
 		 * because they may be used in conjunction with interruptible
 		 * sleeps so lk_exslpfail might be considered an 'upper limit'
 		 * bound, including the edge cases.
 		 */
 		realexslp = sleepq_sleepcnt(&lk->lock_object,
 		    SQ_EXCLUSIVE_QUEUE);
 		if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
 			if (lk->lk_exslpfail < realexslp) {
 				/*
 				 * At least one exclusive sleeper did not use
 				 * LK_SLEEPFAIL: wake the exclusive queue and
 				 * keep the shared waiters bit.
 				 */
 				lk->lk_exslpfail = 0;
 				queue = SQ_EXCLUSIVE_QUEUE;
 				v |= (x & LK_SHARED_WAITERS);
 			} else {
 				/*
 				 * Every exclusive sleeper used LK_SLEEPFAIL:
 				 * wake them up right away and hand the lock
 				 * to the shared queue.
 				 * NOTE(review): this is a plain assignment; if
 				 * the cmpset below fails and this branch runs
 				 * again, an earlier non-zero result would be
 				 * overwritten -- confirm whether |= is meant.
 				 */
 				lk->lk_exslpfail = 0;
 				LOCK_LOG2(lk,
 				    "%s: %p has only LK_SLEEPFAIL sleepers",
 				    __func__, lk);
 				LOCK_LOG2(lk,
 			    "%s: %p waking up threads on the exclusive queue",
 				    __func__, lk);
 				wakeup_swapper =
 				    sleepq_broadcast(&lk->lock_object,
 				    SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
 				queue = SQ_SHARED_QUEUE;
 			}
 				
 		} else {
 
 			/*
 			 * Exclusive waiters sleeping with LK_SLEEPFAIL on
 			 * and using interruptible sleeps/timeout may have
 			 * left spurious lk_exslpfail counts on, so clean
 			 * it up anyway.
 			 */
 			lk->lk_exslpfail = 0;
 			queue = SQ_SHARED_QUEUE;
 		}
 
 		/*
 		 * Install the new lock word; on failure release the
 		 * sleepqueue chain lock and start over.
 		 */
 		if (!atomic_cmpset_rel_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x,
 		    v)) {
 			sleepq_release(&lk->lock_object);
 			continue;
 		}
 		LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue",
 		    __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
 		    "exclusive");
 		wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK,
 		    0, queue);
 		sleepq_release(&lk->lock_object);
 		break;
 	}
 
 	lock_profile_release_lock(&lk->lock_object);
 	TD_LOCKS_DEC(curthread);
 	TD_SLOCKS_DEC(curthread);
 	return (wakeup_swapper);
 }
 
 /*
  * lc_assert hook of the lockmgr lock class: not supported, always panics.
  * Both arguments are unused.
  */
 static void
 assert_lockmgr(const struct lock_object *lock, int what)
 {
 
 	panic("lockmgr locks do not support assertions");
 }
 
 /*
  * lc_lock hook of the lockmgr lock class: not supported, always panics.
  * Both arguments are unused.
  */
 static void
 lock_lockmgr(struct lock_object *lock, uintptr_t how)
 {
 
 	panic("lockmgr locks do not support sleep interlocking");
 }
 
 /*
  * lc_unlock hook of the lockmgr lock class: not supported, always panics,
  * so no value is ever returned.  The argument is unused.
  */
 static uintptr_t
 unlock_lockmgr(struct lock_object *lock)
 {
 
 	panic("lockmgr locks do not support sleep interlocking");
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lc_owner hook of the lockmgr lock class, built only with KDTRACE_HOOKS:
  * not supported, always panics.  Both arguments are unused.
  */
 static int
 owner_lockmgr(const struct lock_object *lock, struct thread **owner)
 {
 
 	panic("lockmgr locks do not support owner inquiring");
 }
 #endif
 
 /*
  * Initialize a lockmgr lock.
  *
  * lk    - lock to set up; it is left unlocked and not recursed
  * pri   - default sleep priority, saved in lk_pri
  * wmesg - name handed to lock_init()
  * timo  - default sleep timeout, saved in lk_timo
  * flags - LK_* initialization flags, asserted to be within LK_INIT_MASK
  */
 void
 lockinit(struct lock *lk, int pri, const char *wmesg, int timo, int flags)
 {
 	int lo_flags;
 
 	MPASS((flags & ~LK_INIT_MASK) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(lk->lk_lock,
 	    ("%s: lockmgr not aligned for %s: %p", __func__, wmesg,
 	    &lk->lk_lock));
 
 	/* The class flags are stored verbatim next to the generic ones. */
 	lo_flags = LO_SLEEPABLE | LO_UPGRADABLE |
 	    (flags & (LK_ADAPTIVE | LK_NOSHARE));
 
 	/* Translate the remaining LK_* requests into LO_* flags. */
 	lo_flags |= (flags & LK_CANRECURSE) != 0 ? LO_RECURSABLE : 0;
 	lo_flags |= (flags & LK_NODUP) != 0 ? 0 : LO_DUPOK;
 	lo_flags |= (flags & LK_NOPROFILE) != 0 ? LO_NOPROFILE : 0;
 	lo_flags |= (flags & LK_NOWITNESS) != 0 ? 0 : LO_WITNESS;
 	lo_flags |= (flags & LK_QUIET) != 0 ? LO_QUIET : 0;
 	lo_flags |= (flags & LK_IS_VNODE) != 0 ? LO_IS_VNODE : 0;
 
 	lock_init(&lk->lock_object, &lock_class_lockmgr, wmesg, NULL,
 	    lo_flags);
 
 	/* Lockmgr private state. */
 	lk->lk_lock = LK_UNLOCKED;
 	lk->lk_recurse = 0;
 	lk->lk_exslpfail = 0;
 	lk->lk_timo = timo;
 	lk->lk_pri = pri;
 	STACK_ZERO(lk);
 }
 
 /*
  * XXX: Gross hacks to manipulate external lock flags after
  * initialization.  Used for certain vnode and buf locks.
  */
 void
 lockallowshare(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags &= ~LK_NOSHARE;
 }
 
 void
 lockdisableshare(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags |= LK_NOSHARE;
 }
 
 void
 lockallowrecurse(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags |= LO_RECURSABLE;
 }
 
 void
 lockdisablerecurse(struct lock *lk)
 {
 
 	lockmgr_assert(lk, KA_XLOCKED);
 	lk->lock_object.lo_flags &= ~LO_RECURSABLE;
 }
 
 /*
  * Tear down a lockmgr lock.  The lock is asserted to be unlocked, not
  * recursed and without a pending lk_exslpfail count before its
  * lock_object is destroyed.
  */
 void
 lockdestroy(struct lock *lk)
 {
 
 	KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held"));
 	KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed"));
 	KASSERT(lk->lk_exslpfail == 0, ("lockmgr still exclusive waiters"));
 	lock_destroy(&lk->lock_object);
 }
 
 int
 __lockmgr_args(struct lock *lk, u_int flags, struct lock_object *ilk,
     const char *wmesg, int pri, int timo, const char *file, int line)
 {
 	GIANT_DECLARE;
 	struct lock_class *class;
 	const char *iwmesg;
 	uintptr_t tid, v, x;
 	u_int op, realexslp;
 	int error, ipri, itimo, queue, wakeup_swapper;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 #ifdef ADAPTIVE_LOCKMGRS
 	volatile struct thread *owner;
 	u_int i, spintries = 0;
 #endif
 
 	error = 0;
 	tid = (uintptr_t)curthread;
 	op = (flags & LK_TYPE_MASK);
 	iwmesg = (wmesg == LK_WMESG_DEFAULT) ? lk->lock_object.lo_name : wmesg;
 	ipri = (pri == LK_PRIO_DEFAULT) ? lk->lk_pri : pri;
 	itimo = (timo == LK_TIMO_DEFAULT) ? lk->lk_timo : timo;
 
 	MPASS((flags & ~LK_TOTAL_MASK) == 0);
 	KASSERT((op & (op - 1)) == 0,
 	    ("%s: Invalid requested operation @ %s:%d", __func__, file, line));
 	KASSERT((flags & (LK_NOWAIT | LK_SLEEPFAIL)) == 0 ||
 	    (op != LK_DOWNGRADE && op != LK_RELEASE),
 	    ("%s: Invalid flags in regard of the operation desired @ %s:%d",
 	    __func__, file, line));
 	KASSERT((flags & LK_INTERLOCK) == 0 || ilk != NULL,
 	    ("%s: LK_INTERLOCK passed without valid interlock @ %s:%d",
 	    __func__, file, line));
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("%s: idle thread %p on lockmgr %s @ %s:%d", __func__, curthread,
 	    lk->lock_object.lo_name, file, line));
 
 	class = (flags & LK_INTERLOCK) ? LOCK_CLASS(ilk) : NULL;
 	if (panicstr != NULL) {
 		if (flags & LK_INTERLOCK)
 			class->lc_unlock(ilk);
 		return (0);
 	}
 
 	if (lk->lock_object.lo_flags & LK_NOSHARE) {
 		switch (op) {
 		case LK_SHARED:
 			op = LK_EXCLUSIVE;
 			break;
 		case LK_UPGRADE:
 		case LK_TRYUPGRADE:
 		case LK_DOWNGRADE:
 			_lockmgr_assert(lk, KA_XLOCKED | KA_NOTRECURSED,
 			    file, line);
 			if (flags & LK_INTERLOCK)
 				class->lc_unlock(ilk);
 			return (0);
 		}
 	}
 
 	wakeup_swapper = 0;
 	switch (op) {
 	case LK_SHARED:
 		if (LK_CAN_WITNESS(flags))
 			WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER,
 			    file, line, flags & LK_INTERLOCK ? ilk : NULL);
 		for (;;) {
 			x = lk->lk_lock;
 
 			/*
 			 * If no other thread has an exclusive lock, or
 			 * no exclusive waiter is present, bump the count of
 			 * sharers.  Since we have to preserve the state of
 			 * waiters, if we fail to acquire the shared lock
 			 * loop back and retry.
 			 */
 			if (LK_CAN_SHARE(x, flags)) {
 				if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
 				    x + LK_ONE_SHARER))
 					break;
 				continue;
 			}
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&lk->lock_object,
 			    &contested, &waittime);
 
 			/*
 			 * If the lock is already held by curthread in
 			 * exclusive way avoid a deadlock.
 			 */
 			if (LK_HOLDER(x) == tid) {
 				LOCK_LOG2(lk,
 				    "%s: %p already held in exclusive mode",
 				    __func__, lk);
 				error = EDEADLK;
 				break;
 			}
 
 			/*
 			 * If the lock is expected to not sleep just give up
 			 * and return.
 			 */
 			if (LK_TRYOP(flags)) {
 				LOCK_LOG2(lk, "%s: %p fails the try operation",
 				    __func__, lk);
 				error = EBUSY;
 				break;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * If the owner is running on another CPU, spin until
 			 * the owner stops running or the state of the lock
 			 * changes.  We need a double-state handle here
 			 * because for a failed acquisition the lock can be
 			 * either held in exclusive mode or shared mode
 			 * (for the writer starvation avoidance technique).
 			 */
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (LOCK_LOG_TEST(&lk->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, lk, owner);
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(td), "spinning",
+				    "lockname:\"%s\"", lk->lock_object.lo_name);
 
 				/*
 				 * If we are holding also an interlock drop it
 				 * in order to avoid a deadlock if the lockmgr
 				 * owner is adaptively spinning on the
 				 * interlock itself.
 				 */
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				while (LK_HOLDER(lk->lk_lock) ==
 				    (uintptr_t)owner && TD_IS_RUNNING(owner))
 					cpu_spinwait();
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(td), "running");
 				GIANT_RESTORE();
 				continue;
 			} else if (LK_CAN_ADAPT(lk, flags) &&
 			    (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
 			    spintries < alk_retries) {
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(td), "spinning",
+				    "lockname:\"%s\"", lk->lock_object.lo_name);
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				spintries++;
 				for (i = 0; i < alk_loops; i++) {
 					if (LOCK_LOG_TEST(&lk->lock_object, 0))
 						CTR4(KTR_LOCK,
 				    "%s: shared spinning on %p with %u and %u",
 						    __func__, lk, spintries, i);
 					x = lk->lk_lock;
 					if ((x & LK_SHARE) == 0 ||
 					    LK_CAN_SHARE(x, flags) != 0)
 						break;
 					cpu_spinwait();
 				}
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(td), "running");
 				GIANT_RESTORE();
 				if (i != alk_loops)
 					continue;
 			}
 #endif
 
 			/*
 			 * Acquire the sleepqueue chain lock because we
 			 * will probably need to manipulate waiters flags.
 			 */
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 
 			/*
 			 * if the lock can be acquired in shared mode, try
 			 * again.
 			 */
 			if (LK_CAN_SHARE(x, flags)) {
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * The current lock owner might have started executing
 			 * on another CPU (or the lock could have changed
 			 * owner) while we were waiting on the turnstile
 			 * chain lock.  If so, drop the turnstile lock and try
 			 * again.
 			 */
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (TD_IS_RUNNING(owner)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 			}
 #endif
 
 			/*
 			 * Try to set the LK_SHARED_WAITERS flag.  If we fail,
 			 * loop back and retry.
 			 */
 			if ((x & LK_SHARED_WAITERS) == 0) {
 				if (!atomic_cmpset_acq_ptr(&lk->lk_lock, x,
 				    x | LK_SHARED_WAITERS)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG2(lk, "%s: %p set shared waiters flag",
 				    __func__, lk);
 			}
 
 			/*
 			 * As far as we have been unable to acquire the
 			 * shared lock and the shared waiters flag is set,
 			 * we will sleep.
 			 */
 			error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
 			    SQ_SHARED_QUEUE);
 			flags &= ~LK_INTERLOCK;
 			if (error) {
 				LOCK_LOG3(lk,
 				    "%s: interrupted sleep for %p with %d",
 				    __func__, lk, error);
 				break;
 			}
 			LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
 			    __func__, lk);
 		}
 		if (error == 0) {
 			lock_profile_obtain_lock_success(&lk->lock_object,
 			    contested, waittime, file, line);
 			LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, file,
 			    line);
 			WITNESS_LOCK(&lk->lock_object, LK_TRYWIT(flags), file,
 			    line);
 			TD_LOCKS_INC(curthread);
 			TD_SLOCKS_INC(curthread);
 			STACK_SAVE(lk);
 		}
 		break;
 	case LK_UPGRADE:
 	case LK_TRYUPGRADE:
 		_lockmgr_assert(lk, KA_SLOCKED, file, line);
 		v = lk->lk_lock;
 		x = v & LK_ALL_WAITERS;
 		v &= LK_EXCLUSIVE_SPINNERS;
 
 		/*
 		 * Try to switch from one shared lock to an exclusive one.
 		 * We need to preserve waiters flags during the operation.
 		 */
 		if (atomic_cmpset_ptr(&lk->lk_lock, LK_SHARERS_LOCK(1) | x | v,
 		    tid | x)) {
 			LOCK_LOG_LOCK("XUPGRADE", &lk->lock_object, 0, 0, file,
 			    line);
 			WITNESS_UPGRADE(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_SLOCKS_DEC(curthread);
 			break;
 		}
 
 		/*
 		 * In LK_TRYUPGRADE mode, do not drop the lock,
 		 * returning EBUSY instead.
 		 */
 		if (op == LK_TRYUPGRADE) {
 			LOCK_LOG2(lk, "%s: %p failed the nowait upgrade",
 			    __func__, lk);
 			error = EBUSY;
 			break;
 		}
 
 		/*
 		 * We have been unable to succeed in upgrading, so just
 		 * give up the shared lock.
 		 */
 		wakeup_swapper |= wakeupshlk(lk, file, line);
 
 		/* FALLTHROUGH */
 	case LK_EXCLUSIVE:
 		if (LK_CAN_WITNESS(flags))
 			WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
 			    LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
 			    ilk : NULL);
 
 		/*
 		 * If curthread already holds the lock and this one is
 		 * allowed to recurse, simply recurse on it.
 		 */
 		if (lockmgr_xlocked(lk)) {
 			if ((flags & LK_CANRECURSE) == 0 &&
 			    (lk->lock_object.lo_flags & LO_RECURSABLE) == 0) {
 
 				/*
 				 * If the lock is expected to not panic just
 				 * give up and return.
 				 */
 				if (LK_TRYOP(flags)) {
 					LOCK_LOG2(lk,
 					    "%s: %p fails the try operation",
 					    __func__, lk);
 					error = EBUSY;
 					break;
 				}
 				if (flags & LK_INTERLOCK)
 					class->lc_unlock(ilk);
 		panic("%s: recursing on non recursive lockmgr %s @ %s:%d\n",
 				    __func__, iwmesg, file, line);
 			}
 			lk->lk_recurse++;
 			LOCK_LOG2(lk, "%s: %p recursing", __func__, lk);
 			LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 			WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_LOCKS_INC(curthread);
 			break;
 		}
 
 		while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED,
 		    tid)) {
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&lk->lock_object,
 			    &contested, &waittime);
 
 			/*
 			 * If the lock is expected to not sleep just give up
 			 * and return.
 			 */
 			if (LK_TRYOP(flags)) {
 				LOCK_LOG2(lk, "%s: %p fails the try operation",
 				    __func__, lk);
 				error = EBUSY;
 				break;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * If the owner is running on another CPU, spin until
 			 * the owner stops running or the state of the lock
 			 * changes.
 			 */
 			x = lk->lk_lock;
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (LOCK_LOG_TEST(&lk->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, lk, owner);
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(td), "spinning",
+				    "lockname:\"%s\"", lk->lock_object.lo_name);
 
 				/*
 				 * If we are holding also an interlock drop it
 				 * in order to avoid a deadlock if the lockmgr
 				 * owner is adaptively spinning on the
 				 * interlock itself.
 				 */
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				while (LK_HOLDER(lk->lk_lock) ==
 				    (uintptr_t)owner && TD_IS_RUNNING(owner))
 					cpu_spinwait();
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(td), "running");
 				GIANT_RESTORE();
 				continue;
 			} else if (LK_CAN_ADAPT(lk, flags) &&
 			    (x & LK_SHARE) != 0 && LK_SHARERS(x) &&
 			    spintries < alk_retries) {
 				if ((x & LK_EXCLUSIVE_SPINNERS) == 0 &&
 				    !atomic_cmpset_ptr(&lk->lk_lock, x,
 				    x | LK_EXCLUSIVE_SPINNERS))
 					continue;
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(td), "spinning",
+				    "lockname:\"%s\"", lk->lock_object.lo_name);
 				if (flags & LK_INTERLOCK) {
 					class->lc_unlock(ilk);
 					flags &= ~LK_INTERLOCK;
 				}
 				GIANT_SAVE();
 				spintries++;
 				for (i = 0; i < alk_loops; i++) {
 					if (LOCK_LOG_TEST(&lk->lock_object, 0))
 						CTR4(KTR_LOCK,
 				    "%s: shared spinning on %p with %u and %u",
 						    __func__, lk, spintries, i);
 					if ((lk->lk_lock &
 					    LK_EXCLUSIVE_SPINNERS) == 0)
 						break;
 					cpu_spinwait();
 				}
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(td), "running");
 				GIANT_RESTORE();
 				if (i != alk_loops)
 					continue;
 			}
 #endif
 
 			/*
 			 * Acquire the sleepqueue chain lock because we
 			 * will probably need to manipulate waiters flags.
 			 */
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 
 			/*
 			 * if the lock has been released while we spun on
 			 * the sleepqueue chain lock just try again.
 			 */
 			if (x == LK_UNLOCKED) {
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 #ifdef ADAPTIVE_LOCKMGRS
 			/*
 			 * The current lock owner might have started executing
 			 * on another CPU (or the lock could have changed
 			 * owner) while we were waiting on the sleepqueue
 			 * chain lock.  If so, drop the sleepqueue lock and try
 			 * again.
 			 */
 			if (LK_CAN_ADAPT(lk, flags) && (x & LK_SHARE) == 0 &&
 			    LK_HOLDER(x) != LK_KERNPROC) {
 				owner = (struct thread *)LK_HOLDER(x);
 				if (TD_IS_RUNNING(owner)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 			}
 #endif
 
 			/*
 			 * The lock can be in the state where there is a
 			 * pending queue of waiters, but still no owner.
 			 * This happens when the lock is contested and an
 			 * owner is going to claim the lock.
 			 * If curthread is the one successfully acquiring it
 			 * claim lock ownership and return, preserving waiters
 			 * flags.
 			 */
 			v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 			if ((x & ~v) == LK_UNLOCKED) {
 				v &= ~LK_EXCLUSIVE_SPINNERS;
 				if (atomic_cmpset_acq_ptr(&lk->lk_lock, x,
 				    tid | v)) {
 					sleepq_release(&lk->lock_object);
 					LOCK_LOG2(lk,
 					    "%s: %p claimed by a new writer",
 					    __func__, lk);
 					break;
 				}
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 			/*
 			 * Try to set the LK_EXCLUSIVE_WAITERS flag.  If we
 			 * fail, loop back and retry.
 			 */
 			if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x,
 				    x | LK_EXCLUSIVE_WAITERS)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG2(lk, "%s: %p set excl waiters flag",
 				    __func__, lk);
 			}
 
 			/*
 			 * As far as we have been unable to acquire the
 			 * exclusive lock and the exclusive waiters flag
 			 * is set, we will sleep.
 			 */
 			error = sleeplk(lk, flags, ilk, iwmesg, ipri, itimo,
 			    SQ_EXCLUSIVE_QUEUE);
 			flags &= ~LK_INTERLOCK;
 			if (error) {
 				LOCK_LOG3(lk,
 				    "%s: interrupted sleep for %p with %d",
 				    __func__, lk, error);
 				break;
 			}
 			LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
 			    __func__, lk);
 		}
 		if (error == 0) {
 			lock_profile_obtain_lock_success(&lk->lock_object,
 			    contested, waittime, file, line);
 			LOCK_LOG_LOCK("XLOCK", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 			WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_LOCKS_INC(curthread);
 			STACK_SAVE(lk);
 		}
 		break;
 	case LK_DOWNGRADE:
 		_lockmgr_assert(lk, KA_XLOCKED, file, line);
 		LOCK_LOG_LOCK("XDOWNGRADE", &lk->lock_object, 0, 0, file, line);
 		WITNESS_DOWNGRADE(&lk->lock_object, 0, file, line);
 
 		/*
 		 * Panic if the lock is recursed.
 		 */
 		if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
 			if (flags & LK_INTERLOCK)
 				class->lc_unlock(ilk);
 			panic("%s: downgrade a recursed lockmgr %s @ %s:%d\n",
 			    __func__, iwmesg, file, line);
 		}
 		TD_SLOCKS_INC(curthread);
 
 		/*
 		 * In order to preserve waiters flags, just spin.
 		 */
 		for (;;) {
 			x = lk->lk_lock;
 			MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
 			x &= LK_ALL_WAITERS;
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
 			    LK_SHARERS_LOCK(1) | x))
 				break;
 			cpu_spinwait();
 		}
 		break;
 	case LK_RELEASE:
 		_lockmgr_assert(lk, KA_LOCKED, file, line);
 		x = lk->lk_lock;
 
 		if ((x & LK_SHARE) == 0) {
 
 			/*
 			 * As a first option, treat the lock as if it had no
 			 * waiters.
 			 * Fix-up the tid var if the lock has been disowned.
 			 */
 			if (LK_HOLDER(x) == LK_KERNPROC)
 				tid = LK_KERNPROC;
 			else {
 				WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE,
 				    file, line);
 				TD_LOCKS_DEC(curthread);
 			}
 			LOCK_LOG_LOCK("XUNLOCK", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 
 			/*
 			 * The lock is held in exclusive mode.
 			 * If the lock is recursed also, then unrecurse it.
 			 */
 			if (lockmgr_xlocked(lk) && lockmgr_recursed(lk)) {
 				LOCK_LOG2(lk, "%s: %p unrecursing", __func__,
 				    lk);
 				lk->lk_recurse--;
 				break;
 			}
 			if (tid != LK_KERNPROC)
 				lock_profile_release_lock(&lk->lock_object);
 
 			if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid,
 			    LK_UNLOCKED))
 				break;
 
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 			v = LK_UNLOCKED;
 
 			/*
 		 	 * If the lock has exclusive waiters, give them
 			 * preference in order to avoid deadlock with
 			 * shared runners up.
 			 * If interruptible sleeps left the exclusive queue
 			 * empty avoid a starvation for the threads sleeping
 			 * on the shared queue by giving them precedence
 			 * and cleaning up the exclusive waiters bit anyway.
 			 * Please note that lk_exslpfail count may be lying
 			 * about the real number of waiters with the
 			 * LK_SLEEPFAIL flag on because they may be used in
 			 * conjunction with interruptible sleeps so
 			 * lk_exslpfail might be considered an 'upper limit'
 			 * bound, including the edge cases.
 			 */
 			MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
 			realexslp = sleepq_sleepcnt(&lk->lock_object,
 			    SQ_EXCLUSIVE_QUEUE);
 			if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) {
 				if (lk->lk_exslpfail < realexslp) {
 					lk->lk_exslpfail = 0;
 					queue = SQ_EXCLUSIVE_QUEUE;
 					v |= (x & LK_SHARED_WAITERS);
 				} else {
 					lk->lk_exslpfail = 0;
 					LOCK_LOG2(lk,
 					"%s: %p has only LK_SLEEPFAIL sleepers",
 					    __func__, lk);
 					LOCK_LOG2(lk,
 			"%s: %p waking up threads on the exclusive queue",
 					    __func__, lk);
 					wakeup_swapper =
 					    sleepq_broadcast(&lk->lock_object,
 					    SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE);
 					queue = SQ_SHARED_QUEUE;
 				}
 			} else {
 
 				/*
 				 * Exclusive waiters sleeping with LK_SLEEPFAIL
 				 * on and using interruptible sleeps/timeout
 				 * may have left spurious lk_exslpfail counts
 				 * on, so clean it up anyway.
 				 */
 				lk->lk_exslpfail = 0;
 				queue = SQ_SHARED_QUEUE;
 			}
 
 			LOCK_LOG3(lk,
 			    "%s: %p waking up threads on the %s queue",
 			    __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" :
 			    "exclusive");
 			atomic_store_rel_ptr(&lk->lk_lock, v);
 			wakeup_swapper |= sleepq_broadcast(&lk->lock_object,
 			    SLEEPQ_LK, 0, queue);
 			sleepq_release(&lk->lock_object);
 			break;
 		} else
 			wakeup_swapper = wakeupshlk(lk, file, line);
 		break;
 	case LK_DRAIN:
 		if (LK_CAN_WITNESS(flags))
 			WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER |
 			    LOP_EXCLUSIVE, file, line, flags & LK_INTERLOCK ?
 			    ilk : NULL);
 
 		/*
 		 * Trying to drain a lock we already own will result in a
 		 * deadlock.
 		 */
 		if (lockmgr_xlocked(lk)) {
 			if (flags & LK_INTERLOCK)
 				class->lc_unlock(ilk);
 			panic("%s: draining %s with the lock held @ %s:%d\n",
 			    __func__, iwmesg, file, line);
 		}
 
 		while (!atomic_cmpset_acq_ptr(&lk->lk_lock, LK_UNLOCKED, tid)) {
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&lk->lock_object,
 			    &contested, &waittime);
 
 			/*
 			 * If the lock is expected to not sleep just give up
 			 * and return.
 			 */
 			if (LK_TRYOP(flags)) {
 				LOCK_LOG2(lk, "%s: %p fails the try operation",
 				    __func__, lk);
 				error = EBUSY;
 				break;
 			}
 
 			/*
 			 * Acquire the sleepqueue chain lock because we
 			 * probably will need to manipulate waiters flags.
 			 */
 			sleepq_lock(&lk->lock_object);
 			x = lk->lk_lock;
 
 			/*
 			 * if the lock has been released while we spun on
 			 * the sleepqueue chain lock just try again.
 			 */
 			if (x == LK_UNLOCKED) {
 				sleepq_release(&lk->lock_object);
 				continue;
 			}
 
 			v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS);
 			if ((x & ~v) == LK_UNLOCKED) {
 				v = (x & ~LK_EXCLUSIVE_SPINNERS);
 
 				/*
 				 * If interruptible sleeps left the exclusive
 				 * queue empty avoid a starvation for the
 				 * threads sleeping on the shared queue by
 				 * giving them precedence and cleaning up the
 				 * exclusive waiters bit anyway.
 				 * Please note that lk_exslpfail count may be
 				 * lying about the real number of waiters with
 				 * the LK_SLEEPFAIL flag on because they may
 				 * be used in conjunction with interruptible
 				 * sleeps so lk_exslpfail might be considered
 				 * an 'upper limit' bound, including the edge
 				 * cases.
 				 */
 				if (v & LK_EXCLUSIVE_WAITERS) {
 					queue = SQ_EXCLUSIVE_QUEUE;
 					v &= ~LK_EXCLUSIVE_WAITERS;
 				} else {
 
 					/*
 					 * Exclusive waiters sleeping with
 					 * LK_SLEEPFAIL on and using
 					 * interruptible sleeps/timeout may
 					 * have left spurious lk_exslpfail
 					 * counts on, so clean it up anyway.
 					 */
 					MPASS(v & LK_SHARED_WAITERS);
 					lk->lk_exslpfail = 0;
 					queue = SQ_SHARED_QUEUE;
 					v &= ~LK_SHARED_WAITERS;
 				}
 				if (queue == SQ_EXCLUSIVE_QUEUE) {
 					realexslp =
 					    sleepq_sleepcnt(&lk->lock_object,
 					    SQ_EXCLUSIVE_QUEUE);
 					if (lk->lk_exslpfail >= realexslp) {
 						lk->lk_exslpfail = 0;
 						queue = SQ_SHARED_QUEUE;
 						v &= ~LK_SHARED_WAITERS;
 						if (realexslp != 0) {
 							LOCK_LOG2(lk,
 					"%s: %p has only LK_SLEEPFAIL sleepers",
 							    __func__, lk);
 							LOCK_LOG2(lk,
 			"%s: %p waking up threads on the exclusive queue",
 							    __func__, lk);
 							wakeup_swapper =
 							    sleepq_broadcast(
 							    &lk->lock_object,
 							    SLEEPQ_LK, 0,
 							    SQ_EXCLUSIVE_QUEUE);
 						}
 					} else
 						lk->lk_exslpfail = 0;
 				}
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG3(lk,
 				"%s: %p waking up all threads on the %s queue",
 				    __func__, lk, queue == SQ_SHARED_QUEUE ?
 				    "shared" : "exclusive");
 				wakeup_swapper |= sleepq_broadcast(
 				    &lk->lock_object, SLEEPQ_LK, 0, queue);
 
 				/*
 				 * If shared waiters have been woken up we need
 				 * to wait for one of them to acquire the lock
 				 * before to set the exclusive waiters in
 				 * order to avoid a deadlock.
 				 */
 				if (queue == SQ_SHARED_QUEUE) {
 					for (v = lk->lk_lock;
 					    (v & LK_SHARE) && !LK_SHARERS(v);
 					    v = lk->lk_lock)
 						cpu_spinwait();
 				}
 			}
 
 			/*
 			 * Try to set the LK_EXCLUSIVE_WAITERS flag.  If we
 			 * fail, loop back and retry.
 			 */
 			if ((x & LK_EXCLUSIVE_WAITERS) == 0) {
 				if (!atomic_cmpset_ptr(&lk->lk_lock, x,
 				    x | LK_EXCLUSIVE_WAITERS)) {
 					sleepq_release(&lk->lock_object);
 					continue;
 				}
 				LOCK_LOG2(lk, "%s: %p set drain waiters flag",
 				    __func__, lk);
 			}
 
 			/*
 			 * As far as we have been unable to acquire the
 			 * exclusive lock and the exclusive waiters flag
 			 * is set, we will sleep.
 			 */
 			if (flags & LK_INTERLOCK) {
 				class->lc_unlock(ilk);
 				flags &= ~LK_INTERLOCK;
 			}
 			GIANT_SAVE();
 			sleepq_add(&lk->lock_object, NULL, iwmesg, SLEEPQ_LK,
 			    SQ_EXCLUSIVE_QUEUE);
 			sleepq_wait(&lk->lock_object, ipri & PRIMASK);
 			GIANT_RESTORE();
 			LOCK_LOG2(lk, "%s: %p resuming from the sleep queue",
 			    __func__, lk);
 		}
 
 		if (error == 0) {
 			lock_profile_obtain_lock_success(&lk->lock_object,
 			    contested, waittime, file, line);
 			LOCK_LOG_LOCK("DRAIN", &lk->lock_object, 0,
 			    lk->lk_recurse, file, line);
 			WITNESS_LOCK(&lk->lock_object, LOP_EXCLUSIVE |
 			    LK_TRYWIT(flags), file, line);
 			TD_LOCKS_INC(curthread);
 			STACK_SAVE(lk);
 		}
 		break;
 	default:
 		if (flags & LK_INTERLOCK)
 			class->lc_unlock(ilk);
 		panic("%s: unknown lockmgr request 0x%x\n", __func__, op);
 	}
 
 	if (flags & LK_INTERLOCK)
 		class->lc_unlock(ilk);
 	if (wakeup_swapper)
 		kick_proc0();
 
 	return (error);
 }
 
 /*
  * Hand an exclusively held lockmgr lock over from the current thread to
  * the LK_KERNPROC pseudo-owner, keeping the waiters bits intact.
  *
  * Does nothing when the scheduler is stopped.  Asserts KA_XLOCKED, panics
  * if the lock is recursed, and returns early when the recorded holder is
  * not the current thread.  `file' and `line' identify the caller for the
  * assertion, logging and WITNESS hooks.
  */
 void
 _lockmgr_disown(struct lock *lk, const char *file, int line)
 {
 	uintptr_t tid, x;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	tid = (uintptr_t)curthread;
 	_lockmgr_assert(lk, KA_XLOCKED, file, line);
 
 	/*
 	 * Panic if the lock is recursed.
 	 */
 	if (lockmgr_xlocked(lk) && lockmgr_recursed(lk))
 		panic("%s: disown a recursed lockmgr @ %s:%d\n",
 		    __func__,  file, line);
 
 	/*
 	 * If the owner is already LK_KERNPROC just skip the whole operation.
 	 */
 	if (LK_HOLDER(lk->lk_lock) != tid)
 		return;
 
 	/*
 	 * From here on the current thread no longer accounts for the lock:
 	 * drop the profiling, WITNESS and per-thread lock count state.
 	 */
 	lock_profile_release_lock(&lk->lock_object);
 	LOCK_LOG_LOCK("XDISOWN", &lk->lock_object, 0, 0, file, line);
 	WITNESS_UNLOCK(&lk->lock_object, LOP_EXCLUSIVE, file, line);
 	TD_LOCKS_DEC(curthread);
 	STACK_SAVE(lk);
 
 	/*
 	 * In order to preserve waiters flags, just spin.
 	 * Each pass re-reads the lock word, keeps only the waiters bits and
 	 * tries to swap "tid | waiters" for "LK_KERNPROC | waiters"; the
 	 * swap fails and is retried if the waiters bits changed meanwhile.
 	 */
 	for (;;) {
 		x = lk->lk_lock;
 		MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0);
 		x &= LK_ALL_WAITERS;
 		if (atomic_cmpset_rel_ptr(&lk->lk_lock, tid | x,
 		    LK_KERNPROC | x))
 			return;
 		cpu_spinwait();
 	}
 }
 
 /*
  * Print a human readable description of a lockmgr lock: its state
  * (unlocked, shared with sharer count, or exclusive with the owner),
  * followed by any pending waiters/spinners flags and the saved stack.
  */
 void
 lockmgr_printinfo(const struct lock *lk)
 {
 	const char *lkname;
 	struct thread *holder;
 	uintptr_t bits;
 
 	lkname = lk->lock_object.lo_name;
 	if (lk->lk_lock == LK_UNLOCKED) {
 		printf("lock type %s: UNLOCKED\n", lkname);
 	} else if ((lk->lk_lock & LK_SHARE) != 0) {
 		printf("lock type %s: SHARED (count %ju)\n", lkname,
 		    (uintmax_t)LK_SHARERS(lk->lk_lock));
 	} else {
 		holder = lockmgr_xholder(lk);
 		if (holder != (struct thread *)LK_KERNPROC)
 			printf("lock type %s: EXCL by thread %p "
 			    "(pid %d, %s, tid %d)\n", lkname, holder,
 			    holder->td_proc->p_pid, holder->td_proc->p_comm,
 			    holder->td_tid);
 		else
 			printf("lock type %s: EXCL by KERNPROC\n", lkname);
 	}
 
 	/* Report the flag bits from a single snapshot of the lock word. */
 	bits = lk->lk_lock;
 	if ((bits & LK_EXCLUSIVE_WAITERS) != 0)
 		printf(" with exclusive waiters pending\n");
 	if ((bits & LK_SHARED_WAITERS) != 0)
 		printf(" with shared waiters pending\n");
 	if ((bits & LK_EXCLUSIVE_SPINNERS) != 0)
 		printf(" with exclusive spinners pending\n");
 
 	STACK_PRINT(lk);
 }
 
 /*
  * Classify the current state of a lockmgr lock from one snapshot of the
  * lock word.
  *
  * Returns 0 when unlocked, LK_SHARED when the share bit is set,
  * LK_EXCLUSIVE when held exclusively by the current thread or by
  * LK_KERNPROC, and LK_EXCLOTHER when held exclusively by anyone else.
  */
 int
 lockstatus(const struct lock *lk)
 {
 	uintptr_t holder, word;
 
 	word = lk->lk_lock;
 
 	/* Share bit set: either truly unlocked or held in shared mode. */
 	if ((word & LK_SHARE) != 0)
 		return (word == LK_UNLOCKED ? 0 : LK_SHARED);
 
 	holder = LK_HOLDER(word);
 	if (holder == (uintptr_t)curthread || holder == LK_KERNPROC)
 		return (LK_EXCLUSIVE);
 	return (LK_EXCLOTHER);
 }
 
 #ifdef INVARIANT_SUPPORT
 
 FEATURE(invariant_support,
     "Support for modules compiled with INVARIANTS option");
 
 #ifndef INVARIANTS
 #undef	_lockmgr_assert
 #endif
 
 /*
  * Backing function for lockmgr assertions: panic when the state of `lk'
  * does not match the KA_* request in `what'.
  *
  * Nothing is checked once a panic is in progress (panicstr != NULL).
  * `file' and `line' identify the caller and are embedded in the panic
  * message.  An unrecognized `what' value panics as well.
  */
 void
 _lockmgr_assert(const struct lock *lk, int what, const char *file, int line)
 {
 	int slocked = 0;
 
 	if (panicstr != NULL)
 		return;
 	switch (what) {
 	case KA_SLOCKED:
 	case KA_SLOCKED | KA_NOTRECURSED:
 	case KA_SLOCKED | KA_RECURSED:
 		slocked = 1;
 		/* FALLTHROUGH: shared assertions reuse the KA_LOCKED checks. */
 	case KA_LOCKED:
 	case KA_LOCKED | KA_NOTRECURSED:
 	case KA_LOCKED | KA_RECURSED:
 #ifdef WITNESS
 
 		/*
 		 * We cannot trust WITNESS if the lock is held in exclusive
 		 * mode and a call to lockmgr_disown() happened.
 		 * Workaround this skipping the check if the lock is held in
 		 * exclusive mode even for the KA_LOCKED case.
 		 */
 		if (slocked || (lk->lk_lock & LK_SHARE)) {
 			witness_assert(&lk->lock_object, what, file, line);
 			break;
 		}
 #endif
 		/*
 		 * Fail if the lock is unlocked, or if it is held exclusively
 		 * and either a shared hold was requested or the exclusive
 		 * hold is neither ours nor a disowned one.
 		 */
 		if (lk->lk_lock == LK_UNLOCKED ||
 		    ((lk->lk_lock & LK_SHARE) == 0 && (slocked ||
 		    (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk)))))
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    lk->lock_object.lo_name, slocked ? "share" : "",
 			    file, line);
 
 		/* Recursion state is only checked for exclusive holds. */
 		if ((lk->lk_lock & LK_SHARE) == 0) {
 			if (lockmgr_recursed(lk)) {
 				if (what & KA_NOTRECURSED)
 					panic("Lock %s recursed @ %s:%d\n",
 					    lk->lock_object.lo_name, file,
 					    line);
 			} else if (what & KA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    lk->lock_object.lo_name, file, line);
 		}
 		break;
 	case KA_XLOCKED:
 	case KA_XLOCKED | KA_NOTRECURSED:
 	case KA_XLOCKED | KA_RECURSED:
 		if (!lockmgr_xlocked(lk) && !lockmgr_disowned(lk))
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    lk->lock_object.lo_name, file, line);
 		if (lockmgr_recursed(lk)) {
 			if (what & KA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    lk->lock_object.lo_name, file, line);
 		} else if (what & KA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    lk->lock_object.lo_name, file, line);
 		break;
 	case KA_UNLOCKED:
 		/*
 		 * Only exclusive ownership (ours or disowned) is rejected
 		 * here; shared holds are not examined by this check.
 		 */
 		if (lockmgr_xlocked(lk) || lockmgr_disowned(lk))
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    lk->lock_object.lo_name, file, line);
 		break;
 	default:
 		panic("Unknown lockmgr assertion: %d @ %s:%d\n", what, file,
 		    line);
 	}
 }
 #endif
 
 #ifdef DDB
 /*
  * DDB helper: if `td' is sleeping on a lockmgr lock, describe that lock
  * and report its exclusive holder.
  *
  * Returns 0 without touching *ownerp when the wait channel is not a
  * lockmgr lock.  Otherwise prints the lock name and mode, stores the
  * result of lockmgr_xholder() in *ownerp and returns 1.
  * NOTE(review): td->td_wchan is dereferenced unconditionally; callers
  * are presumably expected to pass only threads that are asleep.
  */
 int
 lockmgr_chain(struct thread *td, struct thread **ownerp)
 {
 	struct lock *lk;
 
 	lk = td->td_wchan;
 
 	if (LOCK_CLASS(&lk->lock_object) != &lock_class_lockmgr)
 		return (0);
 
 	/*
 	 * The trailing space keeps the lock name separated from the
 	 * "SHARED"/"EXCL" state printed right after it.
 	 */
 	db_printf("blocked on lockmgr %s ", lk->lock_object.lo_name);
 	if (lk->lk_lock & LK_SHARE)
 		db_printf("SHARED (count %ju)\n",
 		    (uintmax_t)LK_SHARERS(lk->lk_lock));
 	else
 		db_printf("EXCL\n");
 	*ownerp = lockmgr_xholder(lk);
 
 	return (1);
 }
 
 static void
 db_show_lockmgr(const struct lock_object *lock)
 {
 	struct thread *td;
 	const struct lock *lk;
 
 	lk = (const struct lock *)lock;
 
 	db_printf(" state: ");
 	if (lk->lk_lock == LK_UNLOCKED)
 		db_printf("UNLOCKED\n");
 	else if (lk->lk_lock & LK_SHARE)
 		db_printf("SLOCK: %ju\n", (uintmax_t)LK_SHARERS(lk->lk_lock));
 	else {
 		td = lockmgr_xholder(lk);
 		if (td == (struct thread *)LK_KERNPROC)
 			db_printf("XLOCK: LK_KERNPROC\n");
 		else
 			db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 			    td->td_tid, td->td_proc->p_pid,
 			    td->td_proc->p_comm);
 		if (lockmgr_recursed(lk))
 			db_printf(" recursed: %d\n", lk->lk_recurse);
 	}
 	db_printf(" waiters: ");
 	switch (lk->lk_lock & LK_ALL_WAITERS) {
 	case LK_SHARED_WAITERS:
 		db_printf("shared\n");
 		break;
 	case LK_EXCLUSIVE_WAITERS:
 		db_printf("exclusive\n");
 		break;
 	case LK_ALL_WAITERS:
 		db_printf("shared and exclusive\n");
 		break;
 	default:
 		db_printf("none\n");
 	}
 	db_printf(" spinners: ");
 	if (lk->lk_lock & LK_EXCLUSIVE_SPINNERS)
 		db_printf("exclusive\n");
 	else
 		db_printf("none\n");
 }
 #endif
Index: stable/10/sys/kern/kern_mutex.c
===================================================================
--- stable/10/sys/kern/kern_mutex.c	(revision 278649)
+++ stable/10/sys/kern/kern_mutex.c	(revision 278650)
@@ -1,1009 +1,1020 @@
 /*-
  * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Machine independent bits of mutex implementation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_adaptive_mutexes.h"
 #include "opt_ddb.h"
 #include "opt_global.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 #include <sys/vmmeter.h>
 #include <sys/lock_profile.h>
 
 #include <machine/atomic.h>
 #include <machine/bus.h>
 #include <machine/cpu.h>
 
 #include <ddb/ddb.h>
 
 #include <fs/devfs/devfs_int.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
 #define	ADAPTIVE_MUTEXES
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DEFINE( , , lock, failed);
 #endif
 
 /*
  * Return the mutex address when the lock cookie address is provided.
  * This relies on struct mtx having a member named mtx_lock, since the
  * cookie is the address of that member.
  */
 #define	mtxlock2mtx(c)	(__containerof(c, struct mtx, mtx_lock))
 
 /*
  * Internal utility macros.
  */
 /* True when the lock word equals MTX_UNOWNED. */
 #define mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
 
 /* True when the lock word equals MTX_DESTROYED. */
 #define	mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)
 
 /* Owning thread pointer: the lock word with the MTX_FLAGMASK bits cleared. */
 #define	mtx_owner(m)	((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))
 
 static void	assert_mtx(const struct lock_object *lock, int what);
 #ifdef DDB
 static void	db_show_mtx(const struct lock_object *lock);
 #endif
 static void	lock_mtx(struct lock_object *lock, uintptr_t how);
 static void	lock_spin(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_mtx(const struct lock_object *lock,
 		    struct thread **owner);
 #endif
 static uintptr_t unlock_mtx(struct lock_object *lock);
 static uintptr_t unlock_spin(struct lock_object *lock);
 
 /*
  * Lock classes for sleep and spin mutexes.
  *
  * Both classes share the assertion, DDB and owner hooks; they differ in
  * their class flags and in the lc_lock/lc_unlock callbacks.
  */
 struct lock_class lock_class_mtx_sleep = {
 	.lc_name = "sleep mutex",
 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
 	.lc_assert = assert_mtx,
 #ifdef DDB
 	.lc_ddb_show = db_show_mtx,
 #endif
 	.lc_lock = lock_mtx,
 	.lc_unlock = unlock_mtx,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_mtx,
 #endif
 };
 /* The spin class callbacks lock_spin()/unlock_spin() only panic. */
 struct lock_class lock_class_mtx_spin = {
 	.lc_name = "spin mutex",
 	.lc_flags = LC_SPINLOCK | LC_RECURSABLE,
 	.lc_assert = assert_mtx,
 #ifdef DDB
 	.lc_ddb_show = db_show_mtx,
 #endif
 	.lc_lock = lock_spin,
 	.lc_unlock = unlock_spin,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_mtx,
 #endif
 };
 
 /*
  * System-wide mutexes
  */
 struct mtx blocked_lock;
 struct mtx Giant;
 
 /*
  * lc_assert hook shared by both mutex classes: forwards the generic
  * lock object to mtx_assert() with the requested assertion `what'.
  */
 void
 assert_mtx(const struct lock_object *lock, int what)
 {
 
 	mtx_assert((const struct mtx *)lock, what);
 }
 
 /*
  * lc_lock hook for sleep mutexes: acquire the mutex behind the generic
  * lock object.  The `how' argument is not used.
  */
 void
 lock_mtx(struct lock_object *lock, uintptr_t how)
 {
 	struct mtx *m = (struct mtx *)lock;
 
 	mtx_lock(m);
 }
 
 /*
  * lc_lock hook for spin mutexes: always panics, as spin mutexes must not
  * be acquired through the generic lock class interface.
  */
 void
 lock_spin(struct lock_object *lock, uintptr_t how)
 {
 
 	panic("spin locks can only use msleep_spin");
 }
 
 /*
  * lc_unlock hook for sleep mutexes: asserts that the mutex is owned and
  * not recursed, releases it and returns 0.
  */
 uintptr_t
 unlock_mtx(struct lock_object *lock)
 {
 	struct mtx *mp = (struct mtx *)lock;
 
 	mtx_assert(mp, MA_OWNED | MA_NOTRECURSED);
 	mtx_unlock(mp);
 	return (0);
 }
 
 /*
  * lc_unlock hook for spin mutexes: always panics, as spin mutexes must
  * not be released through the generic lock class interface.
  */
 uintptr_t
 unlock_spin(struct lock_object *lock)
 {
 
 	panic("spin locks can only use msleep_spin");
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lc_owner hook: store the owning thread (lock word with the flag bits
  * masked off) in *owner and return non-zero when the mutex is owned.
  */
 int
 owner_mtx(const struct lock_object *lock, struct thread **owner)
 {
 	const struct mtx *mp;
 
 	mp = (const struct mtx *)lock;
 	*owner = mtx_owner(mp);
 	return (!mtx_unowned(mp));
 }
 #endif
 
 /*
  * Function versions of the inlined __mtx_* macros.  These are used by
  * modules and can also be called from assembly language if needed.
  */
 /*
  * Acquire a sleep mutex given the address of its lock word (`c').
  *
  * Returns without doing anything when the scheduler is stopped.
  * Otherwise sanity-checks the caller and the mutex, runs the WITNESS
  * order check, takes the lock via __mtx_lock() and then records the
  * acquisition (KTR log, WITNESS, per-thread lock count).  MTX_RECURSE is
  * stripped from `opts' before it is handed to WITNESS.
  */
 void
 __mtx_lock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("mtx_lock() by idle thread %p on sleep mutex %s @ %s:%d",
 	    curthread, m->lock_object.lo_name, file, line));
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
 	    ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 	WITNESS_CHECKORDER(&m->lock_object, (opts & ~MTX_RECURSE) |
 	    LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
 
 	__mtx_lock(m, curthread, opts, file, line);
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, (opts & ~MTX_RECURSE) | LOP_EXCLUSIVE,
 	    file, line);
 	curthread->td_locks++;
 }
 
 /*
  * Release a sleep mutex given the address of its lock word (`c').
  *
  * Returns without doing anything when the scheduler is stopped.  The
  * WITNESS and KTR bookkeeping and the ownership assertion run before the
  * lock is actually dropped by __mtx_unlock().
  */
 void
 __mtx_unlock_flags(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
 	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	mtx_assert(m, MA_OWNED);
 
 	/* The release event is only profiled for the outermost unlock. */
 	if (m->mtx_recurse == 0)
 		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_MTX_UNLOCK_RELEASE, m);
 	__mtx_unlock(m, curthread, opts, file, line);
 	curthread->td_locks--;
 }
 
 /*
  * Acquire a spin mutex given the address of its lock word (`c').
  *
  * Returns without doing anything when the scheduler is stopped.  When
  * the caller already owns the mutex, asserts that recursion is allowed
  * either by the lock (LO_RECURSABLE) or by the request (MTX_RECURSE).
  * MTX_RECURSE is then cleared from `opts' before the WITNESS check, the
  * acquisition itself and the logging hooks.
  */
 void
 __mtx_lock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
     int line)
 {
 	struct mtx *m;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 	    ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
 	    m->lock_object.lo_name, file, line));
 	if (mtx_owned(m))
 		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
 		    (opts & MTX_RECURSE) != 0,
 	    ("mtx_lock_spin: recursed on non-recursive mutex %s @ %s:%d\n",
 		    m->lock_object.lo_name, file, line));
 	opts &= ~MTX_RECURSE;
 	WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
 	    file, line, NULL);
 	__mtx_lock_spin(m, curthread, opts, file, line);
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 }
 
 /*
  * Release a spin mutex given the address of its lock word (`c').
  *
  * Returns without doing anything when the scheduler is stopped.  The
  * WITNESS and KTR bookkeeping and the ownership assertion run before the
  * lock is dropped by __mtx_unlock_spin().
  */
 void
 __mtx_unlock_spin_flags(volatile uintptr_t *c, int opts, const char *file,
     int line)
 {
 	struct mtx *m;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 	    ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
 	    m->lock_object.lo_name, file, line));
 	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	mtx_assert(m, MA_OWNED);
 
 	__mtx_unlock_spin(m);
 }
 
 /*
  * The important part of mtx_trylock{,_flags}()
  * Tries to acquire lock `m.'  If this function is called on a mutex that
  * is already owned by the caller, and recursion is permitted by the lock
  * (LO_RECURSABLE) or the request (MTX_RECURSE), it will recursively
  * acquire the lock.
  *
  * Returns non-zero on success and the result of _mtx_obtain_lock()
  * otherwise.  When the scheduler is stopped it reports success (1)
  * without touching the mutex.
  */
 int
 _mtx_trylock_flags_(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	int rval;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	m = mtxlock2mtx(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("mtx_trylock() by idle thread %p on sleep mutex %s @ %s:%d",
 	    curthread, m->lock_object.lo_name, file, line));
 	KASSERT(m->mtx_lock != MTX_DESTROYED,
 	    ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
 	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
 	    ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 
 	/* Recurse if we already own it and may do so; else try once. */
 	if (mtx_owned(m) && ((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
 	    (opts & MTX_RECURSE) != 0)) {
 		m->mtx_recurse++;
 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
 		rval = 1;
 	} else
 		rval = _mtx_obtain_lock(m, (uintptr_t)curthread);
 	opts &= ~MTX_RECURSE;
 
 	LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
 	if (rval) {
 		WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		curthread->td_locks++;
 		/* Only the first (non-recursive) acquisition is profiled. */
 		if (m->mtx_recurse == 0)
 			LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE,
 			    m, contested, waittime, file, line);
 
 	}
 
 	return (rval);
 }
 
 /*
  * __mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
  *
  * We call this if the lock is either contested (i.e. we need to go to
  * sleep waiting for it), or if we need to recurse on it.
  *
  * `c' is the address of the mutex lock word and `tid' the value stored
  * in it on acquisition (the acquiring thread pointer).  Returns without
  * doing anything when the scheduler is stopped.  If the caller already
  * owns the mutex the recursion count is bumped and MTX_RECURSED is set.
  * Otherwise the function loops until _mtx_obtain_lock() succeeds: with
  * ADAPTIVE_MUTEXES it spins while the owner is running on a CPU
  * (bracketing the spin with KTR_SCHED "spinning"/"running" state
  * records), and in all other cases it marks the mutex contested and
  * blocks on its turnstile.
  */
 void
 __mtx_lock_sleep(volatile uintptr_t *c, uintptr_t tid, int opts,
     const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
 	uintptr_t v;
 #ifdef ADAPTIVE_MUTEXES
 	volatile struct thread *owner;
 #endif
 #ifdef KTR
 	int cont_logged = 0;
 #endif
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	/* Recursive acquisition: no waiting is needed, just account for it. */
 	if (mtx_owned(m)) {
 		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0 ||
 		    (opts & MTX_RECURSE) != 0,
 	    ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
 		    m->lock_object.lo_name, file, line));
 		opts &= ~MTX_RECURSE;
 		m->mtx_recurse++;
 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
 		if (LOCK_LOG_TEST(&m->lock_object, opts))
 			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
 		return;
 	}
 	opts &= ~MTX_RECURSE;
 
 #ifdef HWPMC_HOOKS
 	PMC_SOFT_CALL( , , lock, failed);
 #endif
 	lock_profile_obtain_lock_failed(&m->lock_object,
 		    &contested, &waittime);
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR4(KTR_LOCK,
 		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
 		    m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
 
 	while (!_mtx_obtain_lock(m, tid)) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 #ifdef ADAPTIVE_MUTEXES
 		/*
 		 * If the owner is running on another CPU, spin until the
 		 * owner stops running or the state of the lock changes.
 		 */
 		v = m->mtx_lock;
 		if (v != MTX_UNOWNED) {
 			owner = (struct thread *)(v & ~MTX_FLAGMASK);
 			if (TD_IS_RUNNING(owner)) {
 				if (LOCK_LOG_TEST(&m->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, m, owner);
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname((struct thread *)tid),
+				    "spinning", "lockname:\"%s\"",
+				    m->lock_object.lo_name);
 				while (mtx_owner(m) == owner &&
 				    TD_IS_RUNNING(owner)) {
 					cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 				}
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname((struct thread *)tid),
+				    "running");
 				continue;
 			}
 		}
 #endif
 
 		ts = turnstile_trywait(&m->lock_object);
 		v = m->mtx_lock;
 
 		/*
 		 * Check if the lock has been released while spinning for
 		 * the turnstile chain lock.
 		 */
 		if (v == MTX_UNOWNED) {
 			turnstile_cancel(ts);
 			continue;
 		}
 
 #ifdef ADAPTIVE_MUTEXES
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the turnstile
 		 * chain lock.  If so, drop the turnstile lock and try
 		 * again.
 		 */
 		owner = (struct thread *)(v & ~MTX_FLAGMASK);
 		if (TD_IS_RUNNING(owner)) {
 			turnstile_cancel(ts);
 			continue;
 		}
 #endif
 
 		/*
 		 * If the mutex isn't already contested and a failure occurs
 		 * setting the contested bit, the mutex was either released
 		 * or the state of the MTX_RECURSED bit changed.
 		 */
 		if ((v & MTX_CONTESTED) == 0 &&
 		    !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
 			turnstile_cancel(ts);
 			continue;
 		}
 
 		/*
 		 * We definitely must sleep for this lock.
 		 */
 		mtx_assert(m, MA_NOTOWNED);
 
 #ifdef KTR
 		/* Log the start of contention only once per acquisition. */
 		if (!cont_logged) {
 			CTR6(KTR_CONTENTION,
 			    "contention: %p at %s:%d wants %s, taken by %s:%d",
 			    (void *)tid, file, line, m->lock_object.lo_name,
 			    WITNESS_FILE(&m->lock_object),
 			    WITNESS_LINE(&m->lock_object));
 			cont_logged = 1;
 		}
 #endif
 
 		/*
 		 * Block on the turnstile.
 		 */
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 	}
 #ifdef KTR
 	if (cont_logged) {
 		CTR4(KTR_CONTENTION,
 		    "contention end: %s acquired by %p at %s:%d",
 		    m->lock_object.lo_name, (void *)tid, file, line);
 	}
 #endif
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_LOCK_ACQUIRE, m, contested,
 	    waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_MTX_LOCK_BLOCK, m, sleep_time);
 
 	/*
 	 * Only record the loops spinning and not sleeping.
 	 */
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_MTX_LOCK_SPIN, m, (spin_cnt - sleep_cnt));
 #endif
 }
 
 /*
  * Called when a spin mutex could not be acquired for too long.  If the
  * mutex still has an owner, report it (plus WITNESS details when
  * available) and panic; if it has been released in the meantime, return
  * so that the caller can retry.
  */
 static void
 _mtx_lock_spin_failed(struct mtx *m)
 {
 	struct thread *holder = mtx_owner(m);
 
 	/* Only an owned mutex is a failure; otherwise let the caller retry. */
 	if (holder != NULL) {
 		printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
 		    m, m->lock_object.lo_name, holder, holder->td_tid);
 #ifdef WITNESS
 		witness_display_spinlock(&m->lock_object, holder, printf);
 #endif
 		panic("spin lock held too long");
 	}
 }
 
 #ifdef SMP
 /*
  * _mtx_lock_spin_cookie: the tougher part of acquiring an MTX_SPIN lock.
  *
  * This is only called if we need to actually spin for the lock. Recursion
  * is handled inline.
  *
  * `c' is the address of the mutex lock word and `tid' the value to store
  * in it (the acquiring thread pointer).  Returns without doing anything
  * when the scheduler is stopped.  The spin is bracketed by KTR_SCHED
  * "spinning"/"running" state records for the acquiring thread.
  */
 void
 _mtx_lock_spin_cookie(volatile uintptr_t *c, uintptr_t tid, int opts,
     const char *file, int line)
 {
 	struct mtx *m;
 	int i = 0;
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
+	KTR_STATE1(KTR_SCHED, "thread", sched_tdname((struct thread *)tid),
+	    "spinning", "lockname:\"%s\"", m->lock_object.lo_name);
 
 #ifdef HWPMC_HOOKS
 	PMC_SOFT_CALL( , , lock, failed);
 #endif
 	lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
 	while (!_mtx_obtain_lock(m, tid)) {
 
 		/* Give interrupts a chance while we spin. */
 		spinlock_exit();
 		/*
 		 * Busy-wait for the first 10000000 iterations, then add a
 		 * DELAY(1) per iteration; once the counter reaches 60000000
 		 * (and neither the debugger nor a panic is active) report
 		 * the lock as held too long.
 		 */
 		while (m->mtx_lock != MTX_UNOWNED) {
 			if (i++ < 10000000) {
 				cpu_spinwait();
 				continue;
 			}
 			if (i < 60000000 || kdb_active || panicstr != NULL)
 				DELAY(1);
 			else
 				_mtx_lock_spin_failed(m);
 			cpu_spinwait();
 		}
 		spinlock_enter();
 	}
 
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
+	KTR_STATE0(KTR_SCHED, "thread", sched_tdname((struct thread *)tid),
+	    "running");
 
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE, m,
 	    contested, waittime, (file), (line));
 	LOCKSTAT_RECORD1(LS_MTX_SPIN_LOCK_SPIN, m, i);
 }
 #endif /* SMP */
 
 /*
  * Acquire the spin mutex currently referenced by td->td_lock.
  *
  * Because td->td_lock may be switched to a different mutex while we are
  * spinning, the pointer is re-read after every wait and after a
  * successful acquisition; if it changed, the mutex just taken is dropped
  * and the whole attempt restarts.  Returns without doing anything when
  * the scheduler is stopped.
  */
 void
 thread_lock_flags_(struct thread *td, int opts, const char *file, int line)
 {
 	struct mtx *m;
 	uintptr_t tid;
 	int i;
 #ifdef LOCK_PROFILING
 	int contested = 0;
 	uint64_t waittime = 0;
 #endif
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 #endif
 
 	i = 0;
 	tid = (uintptr_t)curthread;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	for (;;) {
 retry:
 		spinlock_enter();
 		m = td->td_lock;
 		KASSERT(m->mtx_lock != MTX_DESTROYED,
 		    ("thread_lock() of destroyed mutex @ %s:%d", file, line));
 		KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 		    ("thread_lock() of sleep mutex %s @ %s:%d",
 		    m->lock_object.lo_name, file, line));
 		if (mtx_owned(m))
 			KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0,
 	    ("thread_lock: recursed on non-recursive mutex %s @ %s:%d\n",
 			    m->lock_object.lo_name, file, line));
 		WITNESS_CHECKORDER(&m->lock_object,
 		    opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line, NULL);
 		while (!_mtx_obtain_lock(m, tid)) {
 #ifdef KDTRACE_HOOKS
 			spin_cnt++;
 #endif
 			/* Already ours: treat this as a recursive acquisition. */
 			if (m->mtx_lock == tid) {
 				m->mtx_recurse++;
 				break;
 			}
 #ifdef HWPMC_HOOKS
 			PMC_SOFT_CALL( , , lock, failed);
 #endif
 			lock_profile_obtain_lock_failed(&m->lock_object,
 			    &contested, &waittime);
 			/* Give interrupts a chance while we spin. */
 			spinlock_exit();
 			while (m->mtx_lock != MTX_UNOWNED) {
 				if (i++ < 10000000)
 					cpu_spinwait();
 				else if (i < 60000000 ||
 				    kdb_active || panicstr != NULL)
 					DELAY(1);
 				else
 					_mtx_lock_spin_failed(m);
 				cpu_spinwait();
 				/*
 				 * The thread switched to another lock while
 				 * we waited; start over on the new one.
 				 */
 				if (m != td->td_lock)
 					goto retry;
 			}
 			spinlock_enter();
 		}
 		/* Done only if the mutex we hold is still the thread's lock. */
 		if (m == td->td_lock)
 			break;
 		__mtx_unlock_spin(m);	/* does spinlock_exit() */
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 	}
 	if (m->mtx_recurse == 0)
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_MTX_SPIN_LOCK_ACQUIRE,
 		    m, contested, waittime, (file), (line));
 	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 	LOCKSTAT_RECORD1(LS_THREAD_LOCK_SPIN, m, spin_cnt);
 }
 
 /*
  * Point td->td_lock at blocked_lock and release the mutex that was the
  * thread lock until now.  The caller must hold the thread lock; the
  * previous lock is returned.
  */
 struct mtx *
 thread_lock_block(struct thread *td)
 {
 	struct mtx *prev;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* Swap in blocked_lock before letting go of the old lock. */
 	prev = td->td_lock;
 	td->td_lock = &blocked_lock;
 	mtx_unlock_spin(prev);
 	return (prev);
 }
 
 /*
  * Install `new' as the lock of a thread whose td_lock currently points
  * at blocked_lock.  The caller must own `new'; the pointer is published
  * with a release store.
  */
 void
 thread_lock_unblock(struct thread *td, struct mtx *new)
 {
 	mtx_assert(new, MA_OWNED);
 	MPASS(td->td_lock == &blocked_lock);
 	atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
 }
 
 /*
  * Switch the thread's lock from the one currently held to `new'.  Both the
  * current thread lock and `new' must be owned on entry; the old lock is
  * released after td_lock has been updated.
  */
 void
 thread_lock_set(struct thread *td, struct mtx *new)
 {
 	struct mtx *lock;
 
 	mtx_assert(new, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	lock = td->td_lock;
 	td->td_lock = new;
 	mtx_unlock_spin(lock);
 }
 
 /*
  * __mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
  *
  * We are only called here if the lock is recursed or contested (i.e. we
  * need to wake up a blocked thread).
  *
  * `c' is the address of the mutex's mtx_lock word.  `opts' is only
  * consulted for lock logging; `file' and `line' are not used here.
  */
 void
 __mtx_unlock_sleep(volatile uintptr_t *c, int opts, const char *file, int line)
 {
 	struct mtx *m;
 	struct turnstile *ts;
 
 	/* Do nothing at all when SCHEDULER_STOPPED() reports true. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	m = mtxlock2mtx(c);
 
 	/*
 	 * Recursed case: drop one level of recursion and clear the
 	 * MTX_RECURSED bit once the count reaches zero.  The mutex stays
 	 * owned.
 	 */
 	if (mtx_recursed(m)) {
 		if (--(m->mtx_recurse) == 0)
 			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
 		if (LOCK_LOG_TEST(&m->lock_object, opts))
 			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
 		return;
 	}
 
 	/*
 	 * We have to lock the chain before the turnstile so this turnstile
 	 * can be removed from the hash list if it is empty.
 	 */
 	turnstile_chain_lock(&m->lock_object);
 	ts = turnstile_lookup(&m->lock_object);
 	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
 	MPASS(ts != NULL);
 	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
 	_mtx_release_lock_quick(m);
 
 	/*
 	 * This turnstile is now no longer associated with the mutex.  We can
 	 * unlock the chain lock so a new turnstile may take its place.
 	 */
 	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 	turnstile_chain_unlock(&m->lock_object);
 }
 
 /*
  * All the unlocking of MTX_SPIN locks is done inline.
  * See the __mtx_unlock_spin() macro for the details.
  */
 
 /*
  * The backing function for the INVARIANTS-enabled mtx_assert().
  *
  * Compares the ownership/recursion state of the mutex whose mtx_lock word
  * is at `c' against `what' and panics, naming `file' and `line', when the
  * state does not match.  `what' must be MA_NOTOWNED, or MA_OWNED optionally
  * or'ed with MA_RECURSED or MA_NOTRECURSED; anything else panics as well.
  */
 #ifdef INVARIANT_SUPPORT
 void
 __mtx_assert(const volatile uintptr_t *c, int what, const char *file, int line)
 {
 	const struct mtx *m;
 
 	/* No checks are made once panicstr is set or while dumping. */
 	if (panicstr != NULL || dumping)
 		return;
 
 	m = mtxlock2mtx(c);
 
 	if (what == MA_NOTOWNED) {
 		if (mtx_owned(m))
 			panic("mutex %s owned at %s:%d",
 			    m->lock_object.lo_name, file, line);
 		return;
 	}
 
 	/* Everything that is left must be one of the MA_OWNED variants. */
 	if (what != MA_OWNED && what != (MA_OWNED | MA_RECURSED) &&
 	    what != (MA_OWNED | MA_NOTRECURSED))
 		panic("unknown mtx_assert at %s:%d", file, line);
 
 	if (!mtx_owned(m))
 		panic("mutex %s not owned at %s:%d",
 		    m->lock_object.lo_name, file, line);
 
 	if (mtx_recursed(m)) {
 		if ((what & MA_NOTRECURSED) != 0)
 			panic("mutex %s recursed at %s:%d",
 			    m->lock_object.lo_name, file, line);
 	} else if ((what & MA_RECURSED) != 0) {
 		panic("mutex %s unrecursed at %s:%d",
 		    m->lock_object.lo_name, file, line);
 	}
 }
 #endif
 
 /*
  * The MUTEX_DEBUG-enabled mtx_validate()
  *
  * Most of these checks have been moved off into the LO_INITIALIZED flag
  * maintained by the witness code.  The one remaining check, currently
  * compiled out under `notyet', panics unless the whole mutex structure
  * pointed to by `m' is readable and writable according to kernacc().
  */
 #ifdef MUTEX_DEBUG
 
 void	mtx_validate(struct mtx *);
 
 void
 mtx_validate(struct mtx *m)
 {
 
 /*
  * XXX: When kernacc() does not require Giant we can reenable this check
  */
 #ifdef notyet
 	/*
 	 * Can't call kernacc() from early init386(), especially when
 	 * initializing Giant mutex, because some stuff in kernacc()
 	 * requires Giant itself.
 	 */
 	if (!cold)
 		/* Check the full structure, not just pointer-size bytes. */
 		if (!kernacc((caddr_t)m, sizeof(*m),
 		    VM_PROT_READ | VM_PROT_WRITE))
 			panic("Can't read and write to mutex %p", m);
 #endif
 }
 #endif
 
 /*
  * General init routine used by the MTX_SYSINIT() macro.  `arg' points to a
  * struct mtx_args naming the mutex, its description and its options; the
  * mutex is initialized with a NULL type.
  */
 void
 mtx_sysinit(void *arg)
 {
 	struct mtx_args *margs;
 	struct mtx *m;
 
 	margs = arg;
 	m = (struct mtx *)margs->ma_mtx;
 	mtx_init(m, margs->ma_desc, NULL, margs->ma_opts);
 }
 
 /*
  * Mutex initialization routine.  `c' is the address of the mutex's
  * mtx_lock word, `name' is the lock name, and `opts' selects both the
  * mutex class (MTX_SPIN or sleep) and its MTX_* options.  The optional
  * `type' is used as a general lock category name for use with witness.
  * On return the mutex is unowned with a recursion count of zero.
  */
 void
 _mtx_init(volatile uintptr_t *c, const char *name, const char *type, int opts)
 {
 	struct lock_class *class;
 	struct mtx *m;
 	int flags;
 
 	m = mtxlock2mtx(c);
 
 	MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
 		MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE)) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(m->mtx_lock,
 	    ("%s: mtx_lock not aligned for %s: %p", __func__, name,
 	    &m->mtx_lock));
 
 #ifdef MUTEX_DEBUG
 	/* Diagnostic and error correction */
 	mtx_validate(m);
 #endif
 
 	/* Spin or sleep mutex class. */
 	class = (opts & MTX_SPIN) != 0 ? &lock_class_mtx_spin :
 	    &lock_class_mtx_sleep;
 
 	/* Translate MTX_* options into LO_* flags; witness is on by default. */
 	flags = (opts & MTX_NOWITNESS) != 0 ? 0 : LO_WITNESS;
 	if ((opts & MTX_QUIET) != 0)
 		flags |= LO_QUIET;
 	if ((opts & MTX_RECURSE) != 0)
 		flags |= LO_RECURSABLE;
 	if ((opts & MTX_DUPOK) != 0)
 		flags |= LO_DUPOK;
 	if ((opts & MTX_NOPROFILE) != 0)
 		flags |= LO_NOPROFILE;
 
 	/* Register the lock object, then put the mutex in its initial state. */
 	lock_init(&m->lock_object, class, name, type, flags);
 
 	m->mtx_lock = MTX_UNOWNED;
 	m->mtx_recurse = 0;
 }
 
 /*
  * Remove lock `m' from all_mtx queue.  We don't allow MTX_QUIET to be
  * passed in as a flag here because if the corresponding mtx_init() was
  * called with MTX_QUIET set, then it will already be set in the mutex's
  * flags.
  *
  * The mutex may be destroyed while held by the caller as long as it is
  * neither recursed nor contested; in that case the non-mutex bookkeeping
  * of an unlock is performed here.  Otherwise it must be unowned.
  * NOTE(review): nothing in this function touches an all_mtx queue; the
  * first sentence may be historical -- confirm.
  */
 void
 _mtx_destroy(volatile uintptr_t *c)
 {
 	struct mtx *m;
 
 	m = mtxlock2mtx(c);
 
 	if (!mtx_owned(m))
 		MPASS(mtx_unowned(m));
 	else {
 		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
 
 		/* Perform the non-mtx related part of mtx_unlock_spin(). */
 		if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin)
 			spinlock_exit();
 		else
 			curthread->td_locks--;
 
 		lock_profile_release_lock(&m->lock_object);
 		/* Tell witness this isn't locked to make it happy. */
 		WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__,
 		    __LINE__);
 	}
 
 	/* Mark the lock word destroyed before tearing down the lock object. */
 	m->mtx_lock = MTX_DESTROYED;
 	lock_destroy(&m->lock_object);
 }
 
 /*
  * Initialize the mutex code and system mutexes.  This is called from the MD
  * startup code prior to mi_startup().  The per-CPU data space needs to be
  * setup before this is called.  Giant is left locked on return.
  */
 void
 mutex_init(void)
 {
 
 	/* Setup turnstiles so that sleep mutexes work. */
 	init_turnstiles();
 
 	/*
 	 * Initialize mutexes.
 	 */
 	mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
 	mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
 	/* Overwrite the lock word so blocked_lock never reads as unowned. */
 	blocked_lock.mtx_lock = 0xdeadc0de;	/* Always blocked. */
 	mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
 	mtx_init(&devmtx, "cdev", NULL, MTX_DEF);
 	mtx_lock(&Giant);
 }
 
 #ifdef DDB
 /*
  * DDB helper: print the flags, state and, when the mutex is held, the
  * owner and recursion count of the mutex embedding `lock'.
  */
 void
 db_show_mtx(const struct lock_object *lock)
 {
 	struct thread *td;
 	const struct mtx *m;
 
 	m = (const struct mtx *)lock;
 
 	/* Class and the LO_* flags that are reported. */
 	db_printf(" flags: {");
 	if (LOCK_CLASS(lock) == &lock_class_mtx_spin)
 		db_printf("SPIN");
 	else
 		db_printf("DEF");
 	if (m->lock_object.lo_flags & LO_RECURSABLE)
 		db_printf(", RECURSE");
 	if (m->lock_object.lo_flags & LO_DUPOK)
 		db_printf(", DUPOK");
 	db_printf("}\n");
 	/* Ownership state decoded from the lock word. */
 	db_printf(" state: {");
 	if (mtx_unowned(m))
 		db_printf("UNOWNED");
 	else if (mtx_destroyed(m))
 		db_printf("DESTROYED");
 	else {
 		db_printf("OWNED");
 		if (m->mtx_lock & MTX_CONTESTED)
 			db_printf(", CONTESTED");
 		if (m->mtx_lock & MTX_RECURSED)
 			db_printf(", RECURSED");
 	}
 	db_printf("}\n");
 	/* Owner details are only printed for a held mutex. */
 	if (!mtx_unowned(m) && !mtx_destroyed(m)) {
 		td = mtx_owner(m);
 		db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_name);
 		if (mtx_recursed(m))
 			db_printf(" recursed: %d\n", m->mtx_recurse);
 	}
 }
 #endif
Index: stable/10/sys/kern/kern_rwlock.c
===================================================================
--- stable/10/sys/kern/kern_rwlock.c	(revision 278649)
+++ stable/10/sys/kern/kern_rwlock.c	(revision 278650)
@@ -1,1229 +1,1250 @@
 /*-
  * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Machine independent bits of reader/writer lock implementation.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
 #include "opt_no_adaptive_rwlocks.h"
 
 #include <sys/param.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
+#include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/turnstile.h>
 
 #include <machine/cpu.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
 #define	ADAPTIVE_RWLOCKS
 #endif
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DECLARE( , , lock, failed);
 #endif
 
 /*
  * Return the rwlock address when the lock cookie address is provided.
  * This functionality assumes that struct rwlock* have a member named rw_lock.
  */
 #define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))
 
 #ifdef ADAPTIVE_RWLOCKS
 static int rowner_retries = 10;
 static int rowner_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, rwlock, CTLFLAG_RD, NULL,
     "rwlock debugging");
 SYSCTL_INT(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
 SYSCTL_INT(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 
 static void	db_show_rwlock(const struct lock_object *lock);
 #endif
 static void	assert_rw(const struct lock_object *lock, int what);
 static void	lock_rw(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_rw(const struct lock_object *lock, struct thread **owner);
 #endif
 static uintptr_t unlock_rw(struct lock_object *lock);
 
 /*
  * Lock class descriptor for rwlocks: a recursable, upgradable sleep lock
  * whose generic operations are routed to the static helpers in this file.
  * The DDB and KDTRACE hooks are only present when those options are
  * configured.
  */
 struct lock_class lock_class_rw = {
 	.lc_name = "rw",
 	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
 	.lc_assert = assert_rw,
 #ifdef DDB
 	.lc_ddb_show = db_show_rwlock,
 #endif
 	.lc_lock = lock_rw,
 	.lc_unlock = unlock_rw,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_rw,
 #endif
 };
 
 /*
  * Return a pointer to the owning thread if the lock is write-locked or
  * NULL if the lock is unlocked or read-locked.
  */
 #define	rw_wowner(rw)							\
 	((rw)->rw_lock & RW_LOCK_READ ? NULL :				\
 	    (struct thread *)RW_OWNER((rw)->rw_lock))
 
 /*
  * Returns if a write owner is recursed.  Write ownership is not assured
  * here and should be previously checked.
  */
 #define	rw_recursed(rw)		((rw)->rw_recurse != 0)
 
 /*
  * Return true if curthread holds the lock in write mode.
  */
 #define	rw_wlocked(rw)		(rw_wowner((rw)) == curthread)
 
 /*
  * Return a pointer to the owning thread for this lock who should receive
  * any priority lent by threads that block on this lock.  Currently this
  * is identical to rw_wowner().
  */
 #define	rw_owner(rw)		rw_wowner(rw)
 
 #ifndef INVARIANTS
 #define	__rw_assert(c, what, file, line)
 #endif
 
 /*
  * lock_class assert hook: forward the assertion `what' to rw_assert() for
  * the rwlock that embeds `lock'.
  */
 void
 assert_rw(const struct lock_object *lock, int what)
 {
 
 	rw_assert((const struct rwlock *)lock, what);
 }
 
 /*
  * lock_class lock hook: take the rwlock embedding `lock' in read mode when
  * `how' is non-zero and in write mode when it is zero.
  */
 void
 lock_rw(struct lock_object *lock, uintptr_t how)
 {
 	struct rwlock *rw = (struct rwlock *)lock;
 
 	if (how != 0) {
 		rw_rlock(rw);
 		return;
 	}
 	rw_wlock(rw);
 }
 
 /*
  * lock_class unlock hook: release the rwlock embedding `lock', which must
  * be held and not recursed.  Returns 1 if a read lock was dropped and 0 if
  * a write lock was dropped.
  */
 uintptr_t
 unlock_rw(struct lock_object *lock)
 {
 	struct rwlock *rw = (struct rwlock *)lock;
 
 	rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
 
 	/* Write-locked: RW_LOCK_READ is clear in the lock word. */
 	if ((rw->rw_lock & RW_LOCK_READ) == 0) {
 		rw_wunlock(rw);
 		return (0);
 	}
 
 	rw_runlock(rw);
 	return (1);
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * lock_class owner hook: store the write owner (or NULL) in `*owner' and
  * return non-zero when the lock is held, either by at least one reader or
  * by a writer.
  */
 int
 owner_rw(const struct lock_object *lock, struct thread **owner)
 {
 	const struct rwlock *rw;
 	uintptr_t x;
 
 	rw = (const struct rwlock *)lock;
 	x = rw->rw_lock;
 
 	*owner = rw_wowner(rw);
 	if ((x & RW_LOCK_READ) != 0)
 		return (RW_READERS(x) != 0);
 	return (*owner != NULL);
 }
 #endif
 
 /*
  * Initialize the rwlock whose rw_lock word is at `c'.  `name' is the lock
  * name and `opts' is a mask of RW_* options, which are translated into the
  * corresponding LO_* flags.  On return the lock is unlocked and not
  * recursed.
  */
 void
 _rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
 {
 	struct rwlock *rw;
 	int flags;
 
 	rw = rwlock2rw(c);
 
 	MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
 	    RW_RECURSE)) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
 	    ("%s: rw_lock not aligned for %s: %p", __func__, name,
 	    &rw->rw_lock));
 
 	/* Always upgradable; witness is enabled unless RW_NOWITNESS is set. */
 	flags = LO_UPGRADABLE;
 	if ((opts & RW_NOWITNESS) == 0)
 		flags |= LO_WITNESS;
 	if ((opts & RW_DUPOK) != 0)
 		flags |= LO_DUPOK;
 	if ((opts & RW_NOPROFILE) != 0)
 		flags |= LO_NOPROFILE;
 	if ((opts & RW_RECURSE) != 0)
 		flags |= LO_RECURSABLE;
 	if ((opts & RW_QUIET) != 0)
 		flags |= LO_QUIET;
 
 	lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
 	rw->rw_lock = RW_UNLOCKED;
 	rw->rw_recurse = 0;
 }
 
 /*
  * Destroy the rwlock whose rw_lock word is at `c'.  The lock must be
  * unlocked and not recursed; its lock word is set to RW_DESTROYED before
  * the lock object is torn down.
  */
 void
 _rw_destroy(volatile uintptr_t *c)
 {
 	struct rwlock *rw;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
 	KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
 	rw->rw_lock = RW_DESTROYED;
 	lock_destroy(&rw->lock_object);
 }
 
 /*
  * Init routine taking a struct rw_args through `arg': initialize the
  * rwlock it names with the supplied description.
  */
 void
 rw_sysinit(void *arg)
 {
 	struct rw_args *args;
 	struct rwlock *rw;
 
 	args = arg;
 	rw = (struct rwlock *)args->ra_rw;
 	rw_init(rw, args->ra_desc);
 }
 
 /*
  * Init routine taking a struct rw_args_flags through `arg': initialize
  * the rwlock it names with the supplied description and flags.
  */
 void
 rw_sysinit_flags(void *arg)
 {
 	struct rw_args_flags *args;
 	struct rwlock *rw;
 
 	args = arg;
 	rw = (struct rwlock *)args->ra_rw;
 	rw_init_flags(rw, args->ra_desc, args->ra_flags);
 }
 
 /*
  * Return non-zero if curthread is the write owner of the rwlock whose
  * rw_lock word is at `c'.
  */
 int
 _rw_wowned(const volatile uintptr_t *c)
 {
 	const struct rwlock *rw;
 
 	rw = rwlock2rw(c);
 	return (rw_wlocked(rw));
 }
 
 /*
  * Acquire the rwlock whose rw_lock word is at `c' in write mode on behalf
  * of curthread.  `file' and `line' identify the caller for assertions,
  * witness and lock logging.  Returns without doing anything when
  * SCHEDULER_STOPPED() is true.
  */
 void
 _rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
 	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
 	    line, NULL);
 	__rw_wlock(rw, curthread, file, line);
 	LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
 	WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
 	/* Account for the newly held lock in the thread's lock count. */
 	curthread->td_locks++;
 }
 
 /*
  * Try to acquire the rwlock whose rw_lock word is at `c' in write mode
  * without blocking.  Returns non-zero on success and 0 on failure; also
  * returns 1 without touching the lock when SCHEDULER_STOPPED() is true.
  */
 int
 __rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	int rval;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));
 
 	/*
 	 * Already write-owned by curthread on a recursable lock: just bump
 	 * the recursion count.  Otherwise attempt a single compare-and-set
 	 * from the unlocked state.
 	 */
 	if (rw_wlocked(rw) &&
 	    (rw->lock_object.lo_flags & LO_RECURSABLE) != 0) {
 		rw->rw_recurse++;
 		rval = 1;
 	} else
 		rval = atomic_cmpset_acq_ptr(&rw->rw_lock, RW_UNLOCKED,
 		    (uintptr_t)curthread);
 
 	LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
 	if (rval) {
 		WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		curthread->td_locks++;
 	}
 	return (rval);
 }
 
 /*
  * Release a write lock held by curthread on the rwlock whose rw_lock word
  * is at `c'.  The release is only reported to lockstat profiling when the
  * lock is not recursed.  Returns without doing anything when
  * SCHEDULER_STOPPED() is true.
  */
 void
 _rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_WLOCKED, file, line);
 	WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
 	    line);
 	if (!rw_recursed(rw))
 		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_WUNLOCK_RELEASE, rw);
 	__rw_wunlock(rw, curthread, file, line);
 	curthread->td_locks--;
 }
 /*
  * Determines whether a new reader can acquire a lock.  Succeeds if the
  * reader already owns a read lock and the lock is locked for read to
  * prevent deadlock from reader recursion.  Also succeeds if the lock
  * is unlocked and has no writer waiters or spinners.  Failing otherwise
  * prioritizes writers before readers.
  */
 #define	RW_CAN_READ(_rw)						\
     ((curthread->td_rw_rlocks && (_rw) & RW_LOCK_READ) || ((_rw) &	\
     (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER)) ==	\
     RW_LOCK_READ)
 
 /*
  * Acquire the rwlock whose rw_lock word is at `c' in read mode.  The
  * function loops until RW_CAN_READ() allows the reader count to be bumped
  * with a compare-and-set; in between it may spin (ADAPTIVE_RWLOCKS) or
  * block on the lock's turnstile shared queue.  `file' and `line' identify
  * the caller for assertions, witness, logging and profiling.  Returns
  * without doing anything when SCHEDULER_STOPPED() is true.
  */
 void
 __rw_rlock(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 #ifdef ADAPTIVE_RWLOCKS
 	volatile struct thread *owner;
 	int spintries = 0;
 	int i;
 #endif
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	uintptr_t v;
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
 	KASSERT(rw_wowner(rw) != curthread,
 	    ("rw_rlock: wlock already held for %s @ %s:%d",
 	    rw->lock_object.lo_name, file, line));
 	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);
 
 	for (;;) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 		/*
 		 * Handle the easy case.  If no other thread has a write
 		 * lock, then try to bump up the count of read locks.  Note
 		 * that we have to preserve the current state of the
 		 * RW_LOCK_WRITE_WAITERS flag.  If we fail to acquire a
 		 * read lock, then rw_lock must have changed, so restart
 		 * the loop.  Note that this handles the case of a
 		 * completely unlocked rwlock since such a lock is encoded
 		 * as a read lock with no waiters.
 		 */
 		v = rw->rw_lock;
 		if (RW_CAN_READ(v)) {
 			/*
 			 * The RW_LOCK_READ_WAITERS flag should only be set
 			 * if the lock has been unlocked and write waiters
 			 * were present.
 			 */
 			if (atomic_cmpset_acq_ptr(&rw->rw_lock, v,
 			    v + RW_ONE_READER)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeed %p -> %p", __func__,
 					    rw, (void *)v,
 					    (void *)(v + RW_ONE_READER));
 				break;
 			}
 			continue;
 		}
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&rw->lock_object,
 		    &contested, &waittime);
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * If the owner is running on another CPU, spin until
 		 * the owner stops running or the state of the lock
 		 * changes.
 		 */
 		if ((v & RW_LOCK_READ) == 0) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, rw, owner);
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(curthread), "spinning",
+				    "lockname:\"%s\"", rw->lock_object.lo_name);
 				while ((struct thread*)RW_OWNER(rw->rw_lock) ==
 				    owner && TD_IS_RUNNING(owner)) {
 					cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 				}
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(curthread), "running");
 				continue;
 			}
 		} else if (spintries < rowner_retries) {
 			/*
 			 * Read-locked but RW_CAN_READ() failed: spin for a
 			 * bounded number of loops waiting for the lock to
 			 * become readable or write-locked.
 			 */
 			spintries++;
+			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
+			    "spinning", "lockname:\"%s\"",
+			    rw->lock_object.lo_name);
 			for (i = 0; i < rowner_loops; i++) {
 				v = rw->rw_lock;
 				if ((v & RW_LOCK_READ) == 0 || RW_CAN_READ(v))
 					break;
 				cpu_spinwait();
 			}
+			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
+			    "running");
 			if (i != rowner_loops)
 				continue;
 		}
 #endif
 
 		/*
 		 * Okay, now it's the hard case.  Some other thread already
 		 * has a write lock or there are write waiters present,
 		 * acquire the turnstile lock so we can begin the process
 		 * of blocking.
 		 */
 		ts = turnstile_trywait(&rw->lock_object);
 
 		/*
 		 * The lock might have been released while we spun, so
 		 * recheck its state and restart the loop if needed.
 		 */
 		v = rw->rw_lock;
 		if (RW_CAN_READ(v)) {
 			turnstile_cancel(ts);
 			continue;
 		}
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the turnstile
 		 * chain lock.  If so, drop the turnstile lock and try
 		 * again.
 		 */
 		if ((v & RW_LOCK_READ) == 0) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * The lock is held in write mode or it already has waiters.
 		 */
 		MPASS(!RW_CAN_READ(v));
 
 		/*
 		 * If the RW_LOCK_READ_WAITERS flag is already set, then
 		 * we can go ahead and block.  If it is not set then try
 		 * to set it.  If we fail to set it drop the turnstile
 		 * lock and restart the loop.
 		 */
 		if (!(v & RW_LOCK_READ_WAITERS)) {
 			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 			    v | RW_LOCK_READ_WAITERS)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set read waiters flag",
 				    __func__, rw);
 		}
 
 		/*
 		 * We were unable to acquire the lock and the read waiters
 		 * flag is set, so we must block on the turnstile.
 		 */
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
 			    rw);
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
 			    __func__, rw);
 	}
 
 	/*
 	 * TODO: acquire "owner of record" here.  Here be turnstile dragons
 	 * however.  turnstiles don't like owners changing between calls to
 	 * turnstile_wait() currently.
 	 */
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_RLOCK_ACQUIRE, rw, contested,
 	    waittime, file, line);
 	LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
 	WITNESS_LOCK(&rw->lock_object, 0, file, line);
 	/* Per-thread accounting: total locks held and read locks held. */
 	curthread->td_locks++;
 	curthread->td_rw_rlocks++;
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_RW_RLOCK_BLOCK, rw, sleep_time);
 
 	/*
 	 * Record only the loops spinning and not sleeping.
 	 */
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_RW_RLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
 #endif
 }
 
 /*
  * Try to acquire the rwlock whose rw_lock word is at `c' in read mode
  * without spinning on a writer or blocking.
  *
  * The attempt is retried for as long as the lock word has RW_LOCK_READ set
  * and the compare-and-set loses a race; only RW_LOCK_READ is tested here,
  * not RW_CAN_READ().  Returns 1 when a read lock was obtained (or when
  * SCHEDULER_STOPPED() is true) and 0 when RW_LOCK_READ is clear in the
  * lock word.  `file' and `line' identify the caller for assertions,
  * witness and lock logging.
  */
 int
 __rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	uintptr_t x;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	rw = rwlock2rw(c);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
 	    curthread, rw->lock_object.lo_name, file, line));
 
 	for (;;) {
 		x = rw->rw_lock;
 		/*
 		 * Assert on the snapshot this iteration acts on rather than
 		 * on a second read of the lock word.
 		 */
 		KASSERT(x != RW_DESTROYED,
 		    ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
 		if (!(x & RW_LOCK_READ))
 			break;
 		if (atomic_cmpset_acq_ptr(&rw->rw_lock, x, x + RW_ONE_READER)) {
 			LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
 			    line);
 			WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
 			curthread->td_locks++;
 			curthread->td_rw_rlocks++;
 			return (1);
 		}
 	}
 
 	LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
 	return (0);
 }
 
 /*
  * Release one read lock on the rwlock whose rw_lock word is at `c'.  If
  * other readers remain, or no waiter flags are set, the lock word is
  * updated with a single compare-and-set.  Otherwise the turnstile chain is
  * locked, the lock is released and the selected waiter queue is woken.
  * Returns without doing anything when SCHEDULER_STOPPED() is true.
  */
 void
 _rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 	uintptr_t x, v, queue;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_RLOCKED, file, line);
 	WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
 
 	/* TODO: drop "owner of record" here. */
 
 	for (;;) {
 		/*
 		 * See if there is more than one read lock held.  If so,
 		 * just drop one and return.
 		 */
 		x = rw->rw_lock;
 		if (RW_READERS(x) > 1) {
 			if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
 			    x - RW_ONE_READER)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeeded %p -> %p",
 					    __func__, rw, (void *)x,
 					    (void *)(x - RW_ONE_READER));
 				break;
 			}
 			continue;
 		}
 		/*
 		 * If there aren't any waiters for a write lock, then try
 		 * to drop it quickly.
 		 */
 		if (!(x & RW_LOCK_WAITERS)) {
 			MPASS((x & ~RW_LOCK_WRITE_SPINNER) ==
 			    RW_READERS_LOCK(1));
 			if (atomic_cmpset_rel_ptr(&rw->rw_lock, x,
 			    RW_UNLOCKED)) {
 				if (LOCK_LOG_TEST(&rw->lock_object, 0))
 					CTR2(KTR_LOCK, "%s: %p last succeeded",
 					    __func__, rw);
 				break;
 			}
 			continue;
 		}
 		/*
 		 * Ok, we know we have waiters and we think we are the
 		 * last reader, so grab the turnstile lock.
 		 */
 		turnstile_chain_lock(&rw->lock_object);
 		v = rw->rw_lock & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
 		MPASS(v & RW_LOCK_WAITERS);
 
 		/*
 		 * Try to drop our lock leaving the lock in a unlocked
 		 * state.
 		 *
 		 * If you wanted to do explicit lock handoff you'd have to
 		 * do it here.  You'd also want to use turnstile_signal()
 		 * and you'd have to handle the race where a higher
 		 * priority thread blocks on the write lock before the
 		 * thread you wakeup actually runs and have the new thread
 		 * "steal" the lock.  For now it's a lot simpler to just
 		 * wakeup all of the waiters.
 		 *
 		 * As above, if we fail, then another thread might have
 		 * acquired a read lock, so drop the turnstile lock and
 		 * restart.
 		 */
 		x = RW_UNLOCKED;
 		/*
 		 * Writers are woken in preference to readers; when writers
 		 * are woken the read-waiters flag is carried over into the
 		 * new lock word.
 		 */
 		if (v & RW_LOCK_WRITE_WAITERS) {
 			queue = TS_EXCLUSIVE_QUEUE;
 			x |= (v & RW_LOCK_READ_WAITERS);
 		} else
 			queue = TS_SHARED_QUEUE;
 		if (!atomic_cmpset_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
 		    x)) {
 			turnstile_chain_unlock(&rw->lock_object);
 			continue;
 		}
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
 			    __func__, rw);
 
 		/*
 		 * Ok.  The lock is released and all that's left is to
 		 * wake up the waiters.  Note that the lock might not be
 		 * free anymore, but in that case the writers will just
 		 * block again if they run before the new lock holder(s)
 		 * release the lock.
 		 */
 		ts = turnstile_lookup(&rw->lock_object);
 		MPASS(ts != NULL);
 		turnstile_broadcast(ts, queue);
 		turnstile_unpend(ts, TS_SHARED_LOCK);
 		turnstile_chain_unlock(&rw->lock_object);
 		break;
 	}
 	LOCKSTAT_PROFILE_RELEASE_LOCK(LS_RW_RUNLOCK_RELEASE, rw);
 	curthread->td_locks--;
 	curthread->td_rw_rlocks--;
 }
 
 /*
  * This function is called when we are unable to obtain a write lock on the
  * first try.  This means that at least one other thread holds either a
  * read or write lock.
  *
  * `c' is the address of the rw_lock word and `tid' is the value stored in
  * it on success.  If curthread already write-owns the lock the recursion
  * count is bumped and the function returns.  Otherwise it loops, spinning
  * (ADAPTIVE_RWLOCKS) or blocking on the turnstile exclusive queue, until
  * the lock word has been claimed.  Returns without doing anything when
  * SCHEDULER_STOPPED() is true.
  */
 void
 __rw_wlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
     int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 #ifdef ADAPTIVE_RWLOCKS
 	volatile struct thread *owner;
 	int spintries = 0;
 	int i;
 #endif
 	uintptr_t v, x;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	if (rw_wlocked(rw)) {
 		KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
 		    ("%s: recursing but non-recursive rw %s @ %s:%d\n",
 		    __func__, rw->lock_object.lo_name, file, line));
 		rw->rw_recurse++;
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
 		return;
 	}
 
 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
 		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
 		    rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
 
 	while (!_rw_write_lock(rw, tid)) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&rw->lock_object,
 		    &contested, &waittime);
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * If the lock is write locked and the owner is
 		 * running on another CPU, spin until the owner stops
 		 * running or the state of the lock changes.
 		 */
 		v = rw->rw_lock;
 		owner = (struct thread *)RW_OWNER(v);
 		if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
 				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
 				    __func__, rw, owner);
+			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
+			    "spinning", "lockname:\"%s\"",
+			    rw->lock_object.lo_name);
 			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
 			    TD_IS_RUNNING(owner)) {
 				cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 				spin_cnt++;
 #endif
 			}
+			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
+			    "running");
 			continue;
 		}
 		/*
 		 * Read-locked with readers present: set the write-spinner
 		 * flag (if not already set) and spin for a bounded number
 		 * of loops waiting for that flag to be cleared.
 		 */
 		if ((v & RW_LOCK_READ) && RW_READERS(v) &&
 		    spintries < rowner_retries) {
 			if (!(v & RW_LOCK_WRITE_SPINNER)) {
 				if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 				    v | RW_LOCK_WRITE_SPINNER)) {
 					continue;
 				}
 			}
 			spintries++;
+			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
+			    "spinning", "lockname:\"%s\"",
+			    rw->lock_object.lo_name);
 			for (i = 0; i < rowner_loops; i++) {
 				if ((rw->rw_lock & RW_LOCK_WRITE_SPINNER) == 0)
 					break;
 				cpu_spinwait();
 			}
+			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
+			    "running");
 #ifdef KDTRACE_HOOKS
 			/*
 			 * NOTE(review): this adds the number of iterations
 			 * that were NOT executed (rowner_loops - i), not the
 			 * number spun -- confirm this is intended.
 			 */
 			spin_cnt += rowner_loops - i;
 #endif
 			if (i != rowner_loops)
 				continue;
 		}
 #endif
 		ts = turnstile_trywait(&rw->lock_object);
 		v = rw->rw_lock;
 
 #ifdef ADAPTIVE_RWLOCKS
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the turnstile
 		 * chain lock.  If so, drop the turnstile lock and try
 		 * again.
 		 */
 		if (!(v & RW_LOCK_READ)) {
 			owner = (struct thread *)RW_OWNER(v);
 			if (TD_IS_RUNNING(owner)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 		}
 #endif
 		/*
 		 * Check for the waiters flags about this rwlock.
 		 * If the lock was released, without maintain any pending
 		 * waiters queue, simply try to acquire it.
 		 * If a pending waiters queue is present, claim the lock
 		 * ownership and maintain the pending queue.
 		 */
 		x = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
 		if ((v & ~x) == RW_UNLOCKED) {
 			x &= ~RW_LOCK_WRITE_SPINNER;
 			if (atomic_cmpset_acq_ptr(&rw->rw_lock, v, tid | x)) {
 				if (x)
 					turnstile_claim(ts);
 				else
 					turnstile_cancel(ts);
 				break;
 			}
 			turnstile_cancel(ts);
 			continue;
 		}
 		/*
 		 * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
 		 * set it.  If we fail to set it, then loop back and try
 		 * again.
 		 */
 		if (!(v & RW_LOCK_WRITE_WAITERS)) {
 			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
 			    v | RW_LOCK_WRITE_WAITERS)) {
 				turnstile_cancel(ts);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&rw->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set write waiters flag",
 				    __func__, rw);
 		}
 		/*
 		 * We were unable to acquire the lock and the write waiters
 		 * flag is set, so we must block on the turnstile.
 		 */
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
 			    rw);
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
 			    __func__, rw);
 #ifdef ADAPTIVE_RWLOCKS
 		/* Allow a fresh round of bounded spinning after sleeping. */
 		spintries = 0;
 #endif
 	}
 	LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_RW_WLOCK_ACQUIRE, rw, contested,
 	    waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_RW_WLOCK_BLOCK, rw, sleep_time);
 
 	/*
 	 * Record only the loops spinning and not sleeping.
 	 */
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_RW_WLOCK_SPIN, rw, (spin_cnt - sleep_cnt));
 #endif
 }
 
 /*
  * This function is called if the first try at releasing a write lock failed.
  * This means that one of the 2 waiter bits must be set indicating that at
  * least one thread is waiting on this lock.
  *
  * 'c' is the lock cookie that rwlock2rw() maps back to the rwlock.  The
  * 'tid', 'file' and 'line' arguments are not referenced by this body.
  * Nothing is done while SCHEDULER_STOPPED() is true.
  */
 void
 __rw_wunlock_hard(volatile uintptr_t *c, uintptr_t tid, const char *file,
     int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 	uintptr_t v;
 	int queue;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	/* A recursed write lock only drops one recursion level here. */
 	if (rw_wlocked(rw) && rw_recursed(rw)) {
 		rw->rw_recurse--;
 		if (LOCK_LOG_TEST(&rw->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
 		return;
 	}
 
 	KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
 	    ("%s: neither of the waiter flags are set", __func__));
 
 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
 		CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
 
 	turnstile_chain_lock(&rw->lock_object);
 	ts = turnstile_lookup(&rw->lock_object);
 	MPASS(ts != NULL);
 
 	/*
 	 * Pick the single turnstile queue to wake.  The exclusive (write)
 	 * queue is chosen whenever RW_LOCK_WRITE_WAITERS is set; the shared
 	 * queue is only woken when no writer is waiting.  This is probably
 	 * not ideal.
 	 *
 	 * 'v' is the value we are going to write back to rw_lock.  If we
 	 * have waiters on both queues, we need to preserve the state of
 	 * the waiter flag for the queue we don't wake up.  For now this is
 	 * hardcoded for the algorithm mentioned above.
 	 *
 	 * In the case of both readers and writers waiting we wake up the
 	 * writers but leave the RW_LOCK_READ_WAITERS flag set in 'v'.
 	 * There is probably a potential priority inversion in
 	 * there that could be worked around either by waking both queues
 	 * of waiters or doing some complicated lock handoff gymnastics.
 	 */
 	v = RW_UNLOCKED;
 	if (rw->rw_lock & RW_LOCK_WRITE_WAITERS) {
 		queue = TS_EXCLUSIVE_QUEUE;
 		v |= (rw->rw_lock & RW_LOCK_READ_WAITERS);
 	} else
 		queue = TS_SHARED_QUEUE;
 
 	/* Wake up all waiters for the specific queue. */
 	if (LOCK_LOG_TEST(&rw->lock_object, 0))
 		CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
 		    queue == TS_SHARED_QUEUE ? "read" : "write");
 	/*
 	 * Order as written: broadcast on the chosen queue, publish the new
 	 * lock word with a release store, then unpend and drop the chain
 	 * lock.
 	 */
 	turnstile_broadcast(ts, queue);
 	atomic_store_rel_ptr(&rw->rw_lock, v);
 	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 	turnstile_chain_unlock(&rw->lock_object);
 }
 
 /*
  * Attempt to do a non-blocking upgrade from a read lock to a write
  * lock.  This will only succeed if this thread holds a single read
  * lock.  Returns true if the upgrade succeeded and false otherwise.
  *
  * When SCHEDULER_STOPPED() is true the function reports success (1)
  * without touching the lock.
  */
 int
 __rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	uintptr_t v, x, tid;
 	struct turnstile *ts;
 	int success;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_RLOCKED, file, line);
 
 	/*
 	 * Attempt to switch from one reader to a writer.  If there
 	 * are any write waiters, then we will have to lock the
 	 * turnstile first to prevent races with another writer
 	 * calling turnstile_wait() before we have claimed this
 	 * turnstile.  So, do the simple case of no waiters first.
 	 */
 	tid = (uintptr_t)curthread;
 	success = 0;
 	for (;;) {
 		v = rw->rw_lock;
 		/* More than one reader: give up with success == 0. */
 		if (RW_READERS(v) > 1)
 			break;
 		if (!(v & RW_LOCK_WAITERS)) {
 			/* No waiter flags: try to swap in our tid directly. */
 			success = atomic_cmpset_ptr(&rw->rw_lock, v, tid);
 			if (!success)
 				continue;
 			break;
 		}
 
 		/*
 		 * Ok, we think we have waiters, so lock the turnstile.
 		 */
 		ts = turnstile_trywait(&rw->lock_object);
 		v = rw->rw_lock;
 		if (RW_READERS(v) > 1) {
 			turnstile_cancel(ts);
 			break;
 		}
 		/*
 		 * Try to switch from one reader to a writer again.  This time
 		 * we honor the current state of the waiters flags.
 		 * If we obtain the lock with the flags set, then claim
 		 * ownership of the turnstile.
 		 *
 		 * NOTE(review): 'x' is taken from a fresh read of rw_lock
 		 * rather than from the 'v' snapshot used as the compare
 		 * value; presumably the waiter bits cannot change while the
 		 * turnstile is locked -- confirm.
 		 */
 		x = rw->rw_lock & RW_LOCK_WAITERS;
 		success = atomic_cmpset_ptr(&rw->rw_lock, v, tid | x);
 		if (success) {
 			if (x)
 				turnstile_claim(ts);
 			else
 				turnstile_cancel(ts);
 			break;
 		}
 		/* Lost the race: drop the turnstile and retry from the top. */
 		turnstile_cancel(ts);
 	}
 	LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
 	if (success) {
 		/* The read lock we held no longer counts for this thread. */
 		curthread->td_rw_rlocks--;
 		WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
 		LOCKSTAT_RECORD0(LS_RW_TRYUPGRADE_UPGRADE, rw);
 	}
 	return (success);
 }
 
 /*
  * Downgrade a write lock into a single read lock.
  *
  * The caller must hold the write lock unrecursed.  Nothing is done while
  * SCHEDULER_STOPPED() is true.  Both the uncontended and the contended
  * path finish at the 'out' label, where the per-thread read-lock count is
  * bumped and the downgrade is logged and recorded.
  */
 void
 __rw_downgrade(volatile uintptr_t *c, const char *file, int line)
 {
 	struct rwlock *rw;
 	struct turnstile *ts;
 	uintptr_t tid, v;
 	int rwait, wwait;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	rw = rwlock2rw(c);
 
 	KASSERT(rw->rw_lock != RW_DESTROYED,
 	    ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
 	__rw_assert(c, RA_WLOCKED | RA_NOTRECURSED, file, line);
 	/*
 	 * NOTE(review): presumably the assertion above is compiled out
 	 * without INVARIANTS, hence this explicit check -- confirm.
 	 */
 #ifndef INVARIANTS
 	if (rw_recursed(rw))
 		panic("downgrade of a recursed lock");
 #endif
 
 	WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
 
 	/*
 	 * Convert from a writer to a single reader.  First we handle
 	 * the easy case with no waiters.  If there are any waiters, we
 	 * lock the turnstile and "disown" the lock.
 	 */
 	tid = (uintptr_t)curthread;
 	if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
 		goto out;
 
 	/*
 	 * Ok, we think we have waiters, so lock the turnstile so we can
 	 * read the waiter flags without any races.
 	 */
 	turnstile_chain_lock(&rw->lock_object);
 	v = rw->rw_lock & RW_LOCK_WAITERS;
 	rwait = v & RW_LOCK_READ_WAITERS;
 	wwait = v & RW_LOCK_WRITE_WAITERS;
 	MPASS(rwait | wwait);
 
 	/*
 	 * Downgrade from a write lock while preserving waiters flag
 	 * and give up ownership of the turnstile.
 	 */
 	ts = turnstile_lookup(&rw->lock_object);
 	MPASS(ts != NULL);
 	/*
 	 * With no writer waiting the readers are woken below, so their
 	 * flag is not carried over into the new lock word.
 	 */
 	if (!wwait)
 		v &= ~RW_LOCK_READ_WAITERS;
 	atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
 	/*
 	 * Wake other readers if there are no writers pending.  Otherwise they
 	 * won't be able to acquire the lock anyway.
 	 */
 	if (rwait && !wwait) {
 		turnstile_broadcast(ts, TS_SHARED_QUEUE);
 		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
 	} else
 		turnstile_disown(ts);
 	turnstile_chain_unlock(&rw->lock_object);
 out:
 	/* The thread now holds one more read lock. */
 	curthread->td_rw_rlocks++;
 	LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
 	LOCKSTAT_RECORD0(LS_RW_DOWNGRADE_DOWNGRADE, rw);
 }
 
 #ifdef INVARIANT_SUPPORT
 #ifndef INVARIANTS
 #undef __rw_assert
 #endif
 
 /*
  * In the non-WITNESS case, rw_assert() can only detect that at least
  * *some* thread owns an rlock, but it cannot guarantee that *this*
  * thread owns an rlock.
  *
  * 'what' is one of the RA_* assertion combinations handled by the switch
  * below; any other value panics.  A failed assertion panics with the lock
  * name and the caller's file/line.  Nothing is checked once panicstr is
  * set.
  */
 void
 __rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
 {
 	const struct rwlock *rw;
 
 	if (panicstr != NULL)
 		return;
 
 	rw = rwlock2rw(c);
 
 	switch (what) {
 	/* "Locked in any mode" and "read locked" share one implementation. */
 	case RA_LOCKED:
 	case RA_LOCKED | RA_RECURSED:
 	case RA_LOCKED | RA_NOTRECURSED:
 	case RA_RLOCKED:
 	case RA_RLOCKED | RA_RECURSED:
 	case RA_RLOCKED | RA_NOTRECURSED:
 #ifdef WITNESS
 		witness_assert(&rw->lock_object, what, file, line);
 #else
 		/*
 		 * If some other thread has a write lock or we have one
 		 * and are asserting a read lock, fail.  Also, if no one
 		 * has a lock at all, fail.
 		 */
 		if (rw->rw_lock == RW_UNLOCKED ||
 		    (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
 		    rw_wowner(rw) != curthread)))
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    rw->lock_object.lo_name, (what & RA_RLOCKED) ?
 			    "read " : "", file, line);
 
 		/*
 		 * The recursion checks only apply when the lock is write
 		 * locked and a read lock was not specifically asserted.
 		 */
 		if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
 			if (rw_recursed(rw)) {
 				if (what & RA_NOTRECURSED)
 					panic("Lock %s recursed @ %s:%d\n",
 					    rw->lock_object.lo_name, file,
 					    line);
 			} else if (what & RA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    rw->lock_object.lo_name, file, line);
 		}
 #endif
 		break;
 	/* Write-lock assertions are checked the same way with or without WITNESS. */
 	case RA_WLOCKED:
 	case RA_WLOCKED | RA_RECURSED:
 	case RA_WLOCKED | RA_NOTRECURSED:
 		if (rw_wowner(rw) != curthread)
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    rw->lock_object.lo_name, file, line);
 		if (rw_recursed(rw)) {
 			if (what & RA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    rw->lock_object.lo_name, file, line);
 		} else if (what & RA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    rw->lock_object.lo_name, file, line);
 		break;
 	case RA_UNLOCKED:
 #ifdef WITNESS
 		witness_assert(&rw->lock_object, what, file, line);
 #else
 		/*
 		 * If we hold a write lock fail.  We can't reliably check
 		 * to see if we hold a read lock or not.
 		 */
 		if (rw_wowner(rw) == curthread)
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    rw->lock_object.lo_name, file, line);
 #endif
 		break;
 	default:
 		panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
 		    line);
 	}
 }
 #endif /* INVARIANT_SUPPORT */
 
 #ifdef DDB
 void
 db_show_rwlock(const struct lock_object *lock)
 {
 	const struct rwlock *rw;
 	struct thread *td;
 
 	rw = (const struct rwlock *)lock;
 
 	db_printf(" state: ");
 	if (rw->rw_lock == RW_UNLOCKED)
 		db_printf("UNLOCKED\n");
 	else if (rw->rw_lock == RW_DESTROYED) {
 		db_printf("DESTROYED\n");
 		return;
 	} else if (rw->rw_lock & RW_LOCK_READ)
 		db_printf("RLOCK: %ju locks\n",
 		    (uintmax_t)(RW_READERS(rw->rw_lock)));
 	else {
 		td = rw_wowner(rw);
 		db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_name);
 		if (rw_recursed(rw))
 			db_printf(" recursed: %u\n", rw->rw_recurse);
 	}
 	db_printf(" waiters: ");
 	switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
 	case RW_LOCK_READ_WAITERS:
 		db_printf("readers\n");
 		break;
 	case RW_LOCK_WRITE_WAITERS:
 		db_printf("writers\n");
 		break;
 	case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
 		db_printf("readers and writers\n");
 		break;
 	default:
 		db_printf("none\n");
 		break;
 	}
 }
 
 #endif
Index: stable/10/sys/kern/kern_sx.c
===================================================================
--- stable/10/sys/kern/kern_sx.c	(revision 278649)
+++ stable/10/sys/kern/kern_sx.c	(revision 278650)
@@ -1,1214 +1,1231 @@
 /*-
  * Copyright (c) 2007 Attilio Rao <attilio@freebsd.org>
  * Copyright (c) 2001 Jason Evans <jasone@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 /*
  * Shared/exclusive locks.  This implementation attempts to ensure
  * deterministic lock granting behavior, so that slocks and xlocks are
  * interleaved.
  *
  * Priority propagation will not generally raise the priority of lock holders,
  * so should not be relied upon in combination with sx locks.
  */
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
 #include "opt_no_adaptive_sx.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
 #include <machine/cpu.h>
 #endif
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 /*
  * Adaptive spinning is compiled in on SMP kernels unless NO_ADAPTIVE_SX
  * is defined.
  */
 #if defined(SMP) && !defined(NO_ADAPTIVE_SX)
 #define	ADAPTIVE_SX
 #endif
 
 /*
  * Compile-time check that SX_NOADAPTIVE lies entirely within
  * LO_CLASSFLAGS; sx_init_flags() ORs that option straight into the
  * lock_object flags.
  */
 CTASSERT((SX_NOADAPTIVE & LO_CLASSFLAGS) == SX_NOADAPTIVE);
 
 /* Soft PMC event fired from the contended exclusive-lock path. */
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 PMC_SOFT_DECLARE( , , lock, failed);
 #endif
 
 /* Handy macros for sleep queues. */
 #define	SQ_EXCLUSIVE_QUEUE	0
 #define	SQ_SHARED_QUEUE		1
 
 /*
  * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file.  We
  * drop Giant anytime we have to sleep or if we adaptively spin.
  *
  * GIANT_DECLARE introduces the '_giantcnt' counter used by the other two
  * macros.  GIANT_SAVE() unlocks Giant until it is no longer owned,
  * counting each unlock; GIANT_RESTORE() re-locks it that many times.
  * GIANT_SAVE() may be invoked more than once: later invocations are
  * no-ops because Giant is no longer owned.
  */
 #define	GIANT_DECLARE							\
 	int _giantcnt = 0;						\
 	WITNESS_SAVE_DECL(Giant)					\
 
 #define	GIANT_SAVE() do {						\
 	if (mtx_owned(&Giant)) {					\
 		WITNESS_SAVE(&Giant.lock_object, Giant);		\
 		while (mtx_owned(&Giant)) {				\
 			_giantcnt++;					\
 			mtx_unlock(&Giant);				\
 		}							\
 	}								\
 } while (0)
 
 #define GIANT_RESTORE() do {						\
 	if (_giantcnt > 0) {						\
 		mtx_assert(&Giant, MA_NOTOWNED);			\
 		while (_giantcnt--)					\
 			mtx_lock(&Giant);				\
 		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
 	}								\
 } while (0)
 
 /*
  * Returns true if an exclusive lock is recursed.  It assumes
  * curthread currently has an exclusive lock.
  *
  * The recursion depth is kept in the lock_object's lo_data field.
  */
 #define	sx_recurse		lock_object.lo_data
 #define	sx_recursed(sx)		((sx)->sx_recurse != 0)
 
 /* Lock class callbacks; each one casts the lock_object to a struct sx. */
 static void	assert_sx(const struct lock_object *lock, int what);
 #ifdef DDB
 static void	db_show_sx(const struct lock_object *lock);
 #endif
 static void	lock_sx(struct lock_object *lock, uintptr_t how);
 #ifdef KDTRACE_HOOKS
 static int	owner_sx(const struct lock_object *lock, struct thread **owner);
 #endif
 static uintptr_t unlock_sx(struct lock_object *lock);
 
 /*
  * Lock class descriptor for sx locks.  The DDB and KDTRACE callbacks are
  * only wired up when the corresponding kernel option is compiled in.
  */
 struct lock_class lock_class_sx = {
 	.lc_name = "sx",
 	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
 	.lc_assert = assert_sx,
 #ifdef DDB
 	.lc_ddb_show = db_show_sx,
 #endif
 	.lc_lock = lock_sx,
 	.lc_unlock = unlock_sx,
 #ifdef KDTRACE_HOOKS
 	.lc_owner = owner_sx,
 #endif
 };
 
 /* Without INVARIANTS the internal assertion expands to nothing. */
 #ifndef INVARIANTS
 #define	_sx_assert(sx, what, file, line)
 #endif
 
 #ifdef ADAPTIVE_SX
 /*
  * Adaptive spinning limits used by _sx_xlock_hard() while the lock is
  * held shared: 'asx_retries' bounds the number of spin rounds and
  * 'asx_loops' bounds the iterations of each round.  Both are exported as
  * read-write sysctls under the debug "sx" node.
  */
 static u_int asx_retries = 10;
 static u_int asx_loops = 10000;
 static SYSCTL_NODE(_debug, OID_AUTO, sx, CTLFLAG_RD, NULL, "sxlock debugging");
 SYSCTL_UINT(_debug_sx, OID_AUTO, retries, CTLFLAG_RW, &asx_retries, 0, "");
 SYSCTL_UINT(_debug_sx, OID_AUTO, loops, CTLFLAG_RW, &asx_loops, 0, "");
 #endif
 
 /*
  * Lock class 'lc_assert' callback: forward the assertion 'what' to
  * sx_assert() on the sx lock that 'lock' points to.
  */
 void
 assert_sx(const struct lock_object *lock, int what)
 {
 
 	sx_assert((const struct sx *)lock, what);
 }
 
 /*
  * Lock class 'lc_lock' callback: take the sx lock shared when 'how' is
  * non-zero and exclusive when it is zero.
  */
 void
 lock_sx(struct lock_object *lock, uintptr_t how)
 {
 	struct sx *sx = (struct sx *)lock;
 
 	if (how == 0) {
 		sx_xlock(sx);
 		return;
 	}
 	sx_slock(sx);
 }
 
 /*
  * Lock class 'lc_unlock' callback: release the (unrecursed) sx lock and
  * report how it was held, 0 for exclusive and 1 for shared.
  */
 uintptr_t
 unlock_sx(struct lock_object *lock)
 {
 	struct sx *sx = (struct sx *)lock;
 
 	sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
 	if (!sx_xlocked(sx)) {
 		sx_sunlock(sx);
 		return (1);
 	}
 	sx_xunlock(sx);
 	return (0);
 }
 
 #ifdef KDTRACE_HOOKS
 /*
  * Lock class 'lc_owner' callback.  Stores SX_OWNER() of the lock word in
  * '*owner' and returns non-zero when the lock is held: for a shared lock
  * that means a non-zero sharer count, otherwise a non-NULL owner.
  */
 int
 owner_sx(const struct lock_object *lock, struct thread **owner)
 {
 	const struct sx *sx;
 	uintptr_t word;
 
 	sx = (const struct sx *)lock;
 	word = sx->sx_lock;
 
 	*owner = (struct thread *)SX_OWNER(word);
 	if ((word & SX_LOCK_SHARED) != 0)
 		return (SX_SHARERS(word) != 0);
 	return (*owner != NULL);
 }
 #endif
 
 /*
  * Initialize an sx lock from a 'struct sx_args' bundle passed as an
  * opaque pointer.
  */
 void
 sx_sysinit(void *arg)
 {
 	struct sx_args *sa;
 
 	sa = arg;
 	sx_init_flags(sa->sa_sx, sa->sa_desc, sa->sa_flags);
 }
 
 /*
  * Initialize 'sx' as an unlocked, unrecursed sx lock named 'description'.
  * 'opts' is a mask of SX_* options; each one is translated into the
  * matching LO_* lock_object flag, except SX_NOADAPTIVE which is passed
  * through unchanged.
  */
 void
 sx_init_flags(struct sx *sx, const char *description, int opts)
 {
 	int lo_flags;
 
 	MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
 	    SX_NOPROFILE | SX_NOADAPTIVE)) == 0);
 	ASSERT_ATOMIC_LOAD_PTR(sx->sx_lock,
 	    ("%s: sx_lock not aligned for %s: %p", __func__, description,
 	    &sx->sx_lock));
 
 	/* Flags every sx lock gets, plus the pass-through class flag. */
 	lo_flags = LO_SLEEPABLE | LO_UPGRADABLE | (opts & SX_NOADAPTIVE);
 
 	/* Witness is on unless the caller opted out. */
 	if ((opts & SX_NOWITNESS) == 0)
 		lo_flags |= LO_WITNESS;
 	if ((opts & SX_DUPOK) != 0)
 		lo_flags |= LO_DUPOK;
 	if ((opts & SX_NOPROFILE) != 0)
 		lo_flags |= LO_NOPROFILE;
 	if ((opts & SX_RECURSE) != 0)
 		lo_flags |= LO_RECURSABLE;
 	if ((opts & SX_QUIET) != 0)
 		lo_flags |= LO_QUIET;
 
 	lock_init(&sx->lock_object, &lock_class_sx, description, NULL,
 	    lo_flags);
 	sx->sx_lock = SX_LOCK_UNLOCKED;
 	sx->sx_recurse = 0;
 }
 
 /*
  * Tear down an sx lock.  The lock must be unlocked and unrecursed; the
  * lock word is then marked SX_LOCK_DESTROYED.
  */
 void
 sx_destroy(struct sx *sx)
 {
 
 	KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
 	KASSERT(!sx_recursed(sx), ("sx lock still recursed"));
 
 	/* Mark the lock word first, then release the lock_object. */
 	sx->sx_lock = SX_LOCK_DESTROYED;
 	lock_destroy(&sx->lock_object);
 }
 
 /*
  * Acquire a shared lock on 'sx'.  Returns 0 on success, or the non-zero
  * value handed back by __sx_slock(), in which case no lock is recorded
  * for the thread.  Returns 0 immediately while SCHEDULER_STOPPED() is
  * true.
  */
 int
 _sx_slock(struct sx *sx, int opts, const char *file, int line)
 {
 	int rc;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_slock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_slock() of destroyed sx @ %s:%d", file, line));
 	WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line, NULL);
 
 	rc = __sx_slock(sx, opts, file, line);
 	if (rc != 0)
 		return (rc);
 
 	/* Acquired: log it, tell witness and account the lock. */
 	LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
 	WITNESS_LOCK(&sx->lock_object, 0, file, line);
 	curthread->td_locks++;
 	return (0);
 }
 
 /*
  * Try to acquire a shared lock on 'sx' without sleeping.  Returns 1 when
  * a sharer reference was added and 0 when the lock word does not have
  * SX_LOCK_SHARED set.  Returns 1 immediately while SCHEDULER_STOPPED() is
  * true.
  */
 int
 sx_try_slock_(struct sx *sx, const char *file, int line)
 {
 	uintptr_t word;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_try_slock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 
 	/* Re-read the lock word and retry until the sharer bump sticks. */
 	do {
 		word = sx->sx_lock;
 		KASSERT(word != SX_LOCK_DESTROYED,
 		    ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
 		if ((word & SX_LOCK_SHARED) == 0) {
 			LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file,
 			    line);
 			return (0);
 		}
 	} while (!atomic_cmpset_acq_ptr(&sx->sx_lock, word,
 	    word + SX_ONE_SHARER));
 
 	LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
 	WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
 	curthread->td_locks++;
 	return (1);
 }
 
 /*
  * Acquire an exclusive lock on 'sx' for curthread.  Returns 0 on success,
  * or the non-zero value handed back by __sx_xlock(), in which case no
  * lock is recorded for the thread.  Returns 0 immediately while
  * SCHEDULER_STOPPED() is true.
  */
 int
 _sx_xlock(struct sx *sx, int opts, const char *file, int line)
 {
 	int rc;
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_xlock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_xlock() of destroyed sx @ %s:%d", file, line));
 	WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
 	    line, NULL);
 
 	rc = __sx_xlock(sx, curthread, opts, file, line);
 	if (rc != 0)
 		return (rc);
 
 	/* Acquired: log it, tell witness and account the lock. */
 	LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
 	    file, line);
 	WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
 	curthread->td_locks++;
 	return (0);
 }
 
 /*
  * Try to acquire an exclusive lock on 'sx' without sleeping.  If the
  * thread already holds it exclusively and the lock is recursable, the
  * recursion count is bumped instead.  Returns non-zero on success and 0
  * on failure; returns 1 immediately while SCHEDULER_STOPPED() is true.
  */
 int
 sx_try_xlock_(struct sx *sx, const char *file, int line)
 {
 	int acquired;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
 	    ("sx_try_xlock() by idle thread %p on sx %s @ %s:%d",
 	    curthread, sx->lock_object.lo_name, file, line));
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
 
 	if (!sx_xlocked(sx) ||
 	    (sx->lock_object.lo_flags & LO_RECURSABLE) == 0) {
 		/* Plain attempt: only an unlocked lock word can be taken. */
 		acquired = atomic_cmpset_acq_ptr(&sx->sx_lock,
 		    SX_LOCK_UNLOCKED, (uintptr_t)curthread);
 	} else {
 		/* Already ours and recursable: go one level deeper. */
 		sx->sx_recurse++;
 		atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
 		acquired = 1;
 	}
 
 	LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, acquired, file, line);
 	if (!acquired)
 		return (0);
 
 	WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 	    file, line);
 	curthread->td_locks++;
 	return (acquired);
 }
 
 /*
  * Release a shared lock on 'sx' held by curthread.  Nothing is done while
  * SCHEDULER_STOPPED() is true.
  */
 void
 _sx_sunlock(struct sx *sx, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_SLOCKED, file, line);
 	/* Witness and the lock log are updated before the actual release. */
 	WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
 	LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
 	__sx_sunlock(sx, file, line);
 	LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_SUNLOCK_RELEASE, sx);
 	curthread->td_locks--;
 }
 
 /*
  * Release an exclusive lock on 'sx' held by curthread.  Nothing is done
  * while SCHEDULER_STOPPED() is true.
  */
 void
 _sx_xunlock(struct sx *sx, const char *file, int line)
 {
 
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_XLOCKED, file, line);
 	WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
 	LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
 	    line);
 	/* The release is only profiled when the lock is not recursed. */
 	if (!sx_recursed(sx))
 		LOCKSTAT_PROFILE_RELEASE_LOCK(LS_SX_XUNLOCK_RELEASE, sx);
 	__sx_xunlock(sx, curthread, file, line);
 	curthread->td_locks--;
 }
 
 /*
  * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
  * This will only succeed if this thread holds the only shared lock.
  * Returns non-zero if the upgrade succeeded, 0 otherwise; returns 1
  * immediately while SCHEDULER_STOPPED() is true.
  */
 int
 sx_try_upgrade_(struct sx *sx, const char *file, int line)
 {
 	uintptr_t excl_waiters;
 	int upgraded;
 
 	if (SCHEDULER_STOPPED())
 		return (1);
 
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_SLOCKED, file, line);
 
 	/*
 	 * Swap "one sharer" for "owned by curthread" in a single step.  The
 	 * SX_LOCK_EXCLUSIVE_WAITERS flag is carried across if set so that
 	 * the exclusive waiters are woken when the lock is dropped.
 	 */
 	excl_waiters = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS;
 	upgraded = atomic_cmpset_ptr(&sx->sx_lock,
 	    SX_SHARERS_LOCK(1) | excl_waiters,
 	    (uintptr_t)curthread | excl_waiters);
 	LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, upgraded, file, line);
 	if (!upgraded)
 		return (0);
 
 	WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
 	    file, line);
 	LOCKSTAT_RECORD0(LS_SX_TRYUPGRADE_UPGRADE, sx);
 	return (upgraded);
 }
 
 /*
  * Downgrade an unrecursed exclusive lock into a single shared lock.
  *
  * Nothing is done while SCHEDULER_STOPPED() is true.  Both the quick path
  * and the sleep-queue path finish at the 'out' label so that every
  * successful downgrade is logged and reported to lockstat.
  */
 void
 sx_downgrade_(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x;
 	int wakeup_swapper;
 
 	if (SCHEDULER_STOPPED())
 		return;
 
 	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
 	    ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
 	_sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
 #ifndef INVARIANTS
 	if (sx_recursed(sx))
 		panic("downgrade of a recursed lock");
 #endif
 
 	WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
 
 	/* Only the sleep-queue path can ask for the swapper to be kicked. */
 	wakeup_swapper = 0;
 
 	/*
 	 * Try to switch from an exclusive lock with no shared waiters
 	 * to one sharer with no shared waiters.  If there are
 	 * exclusive waiters, we don't need to lock the sleep queue so
 	 * long as we preserve the flag.  We do one quick try and if
 	 * that fails we grab the sleepq lock to keep the flags from
 	 * changing and do it the slow way.
 	 *
 	 * We have to lock the sleep queue if there are shared waiters
 	 * so we can wake them up.
 	 */
 	x = sx->sx_lock;
 	if (!(x & SX_LOCK_SHARED_WAITERS) &&
 	    atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
 	    (x & SX_LOCK_EXCLUSIVE_WAITERS)))
 		goto out;
 
 	/*
 	 * Lock the sleep queue so we can read the waiters bits
 	 * without any races and wakeup any shared waiters.
 	 */
 	sleepq_lock(&sx->lock_object);
 
 	/*
 	 * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
 	 * shared lock.  If there are any shared waiters, wake them up.
 	 */
 	x = sx->sx_lock;
 	atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
 	    (x & SX_LOCK_EXCLUSIVE_WAITERS));
 	if (x & SX_LOCK_SHARED_WAITERS)
 		wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
 		    0, SQ_SHARED_QUEUE);
 	sleepq_release(&sx->lock_object);
 
 out:
 	LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
 	LOCKSTAT_RECORD0(LS_SX_DOWNGRADE_DOWNGRADE, sx);
 
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_xlock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  *
  * 'tid' is the value stored in the lock word on success.  If 'opts'
  * contains SX_INTERRUPTIBLE the sleep may be interrupted, in which case
  * the non-zero value from sleepq_wait_sig() is returned and the lock is
  * not held.  Otherwise 0 is returned with the lock held exclusively
  * (or recursed one level deeper).  Returns 0 immediately while
  * SCHEDULER_STOPPED() is true.
  */
 int
 _sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
     int line)
 {
 	GIANT_DECLARE;
 #ifdef ADAPTIVE_SX
 	volatile struct thread *owner;
 	u_int i, spintries = 0;
 #endif
 	uintptr_t x;
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	int error = 0;
 #ifdef	KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	/* If we already hold an exclusive lock, then recurse. */
 	if (sx_xlocked(sx)) {
 		KASSERT((sx->lock_object.lo_flags & LO_RECURSABLE) != 0,
 	    ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
 		    sx->lock_object.lo_name, file, line));
 		sx->sx_recurse++;
 		atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
 		return (0);
 	}
 
 	if (LOCK_LOG_TEST(&sx->lock_object, 0))
 		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
 		    sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
 
 	/* Each pass retries the plain acquire, then spins or sleeps. */
 	while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
 		    &waittime);
 #ifdef ADAPTIVE_SX
 		/*
 		 * If the lock is write locked and the owner is
 		 * running on another CPU, spin until the owner stops
 		 * running or the state of the lock changes.
 		 */
 		x = sx->sx_lock;
 		if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			if ((x & SX_LOCK_SHARED) == 0) {
 				x = SX_OWNER(x);
 				owner = (struct thread *)x;
 				if (TD_IS_RUNNING(owner)) {
 					if (LOCK_LOG_TEST(&sx->lock_object, 0))
 						CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 						    __func__, sx, owner);
+					KTR_STATE1(KTR_SCHED, "thread",
+					    sched_tdname(curthread), "spinning",
+					    "lockname:\"%s\"",
+					    sx->lock_object.lo_name);
 					GIANT_SAVE();
 					while (SX_OWNER(sx->sx_lock) == x &&
 					    TD_IS_RUNNING(owner)) {
 						cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 						spin_cnt++;
 #endif
 					}
+					KTR_STATE0(KTR_SCHED, "thread",
+					    sched_tdname(curthread), "running");
 					continue;
 				}
 			} else if (SX_SHARERS(x) && spintries < asx_retries) {
 				/*
 				 * Held shared: spin for a bounded number of
 				 * rounds (asx_retries) of asx_loops
 				 * iterations each, waiting for the sharers
 				 * to drain.
 				 */
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(curthread), "spinning",
+				    "lockname:\"%s\"", sx->lock_object.lo_name);
 				GIANT_SAVE();
 				spintries++;
 				for (i = 0; i < asx_loops; i++) {
 					if (LOCK_LOG_TEST(&sx->lock_object, 0))
 						CTR4(KTR_LOCK,
 				    "%s: shared spinning on %p with %u and %u",
 						    __func__, sx, spintries, i);
 					x = sx->sx_lock;
 					if ((x & SX_LOCK_SHARED) == 0 ||
 					    SX_SHARERS(x) == 0)
 						break;
 					cpu_spinwait();
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 				}
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(curthread), "running");
 				if (i != asx_loops)
 					continue;
 			}
 		}
 #endif
 
 		sleepq_lock(&sx->lock_object);
 		x = sx->sx_lock;
 
 		/*
 		 * If the lock was released while spinning on the
 		 * sleep queue chain lock, try again.
 		 */
 		if (x == SX_LOCK_UNLOCKED) {
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 
 #ifdef ADAPTIVE_SX
 		/*
 		 * The current lock owner might have started executing
 		 * on another CPU (or the lock could have changed
 		 * owners) while we were waiting on the sleep queue
 		 * chain lock.  If so, drop the sleep queue lock and try
 		 * again.
 		 */
 		if (!(x & SX_LOCK_SHARED) &&
 		    (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			owner = (struct thread *)SX_OWNER(x);
 			if (TD_IS_RUNNING(owner)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * If an exclusive lock was released with both shared
 		 * and exclusive waiters and a shared waiter hasn't
 		 * woken up and acquired the lock yet, sx_lock will be
 		 * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
 		 * If we see that value, try to acquire it once.  Note
 		 * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
 		 * as there are other exclusive waiters still.  If we
 		 * fail, restart the loop.
 		 */
 		if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) {
 			if (atomic_cmpset_acq_ptr(&sx->sx_lock,
 			    SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS,
 			    tid | SX_LOCK_EXCLUSIVE_WAITERS)) {
 				sleepq_release(&sx->lock_object);
 				/*
 				 * Guarded like every other KTR_LOCK trace
 				 * in this function.
 				 */
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR2(KTR_LOCK,
 					    "%s: %p claimed by new writer",
 					    __func__, sx);
 				break;
 			}
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 
 		/*
 		 * Try to set the SX_LOCK_EXCLUSIVE_WAITERS.  If we fail,
 		 * then loop back and retry.
 		 */
 		if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
 			if (!atomic_cmpset_ptr(&sx->sx_lock, x,
 			    x | SX_LOCK_EXCLUSIVE_WAITERS)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
 				    __func__, sx);
 		}
 
 		/*
 		 * Since we have been unable to acquire the exclusive
 		 * lock and the exclusive waiters flag is set, we have
 		 * to sleep.
 		 */
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
 			    __func__, sx);
 
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		GIANT_SAVE();
 		sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
 		    SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
 		    SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
 		if (!(opts & SX_INTERRUPTIBLE))
 			sleepq_wait(&sx->lock_object, 0);
 		else
 			error = sleepq_wait_sig(&sx->lock_object, 0);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		/* An interrupted sleep abandons the acquisition. */
 		if (error) {
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK,
 			"%s: interruptible sleep by %p suspended by signal",
 				    __func__, sx);
 			break;
 		}
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
 			    __func__, sx);
 	}
 
 	GIANT_RESTORE();
 	if (!error)
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_XLOCK_ACQUIRE, sx,
 		    contested, waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_SX_XLOCK_BLOCK, sx, sleep_time);
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_SX_XLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
 #endif
 	return (error);
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_xunlock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  */
 void
 _sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
 {
 	uintptr_t x;
 	int queue, wakeup_swapper;
 
 	/* Do nothing once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/* An exclusive unlock must never find the lock in shared mode. */
 	MPASS(!(sx->sx_lock & SX_LOCK_SHARED));
 
 	/* If the lock is recursed, then unrecurse one level. */
 	if (sx_xlocked(sx) && sx_recursed(sx)) {
 		if ((--sx->sx_recurse) == 0)
 			atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
 		return;
 	}
 	/* Not recursed: this path expects at least one waiters flag set. */
 	MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS |
 	    SX_LOCK_EXCLUSIVE_WAITERS));
 	if (LOCK_LOG_TEST(&sx->lock_object, 0))
 		CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
 
 	sleepq_lock(&sx->lock_object);
 	/* New lock word: unlocked, plus any waiters flag preserved below. */
 	x = SX_LOCK_UNLOCKED;
 
 	/*
 	 * The wake up algorithm here is quite simple and probably not
 	 * ideal.  It gives precedence to shared waiters if they are
 	 * present.  For this condition, we have to preserve the
 	 * state of the exclusive waiters flag.
 	 * If interruptible sleeps left the shared queue empty avoid a
 	 * starvation for the threads sleeping on the exclusive queue by giving
 	 * them precedence and cleaning up the shared waiters bit anyway.
 	 */
 	if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 &&
 	    sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) {
 		queue = SQ_SHARED_QUEUE;
 		x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
 	} else
 		queue = SQ_EXCLUSIVE_QUEUE;
 
 	/* Wake up all the waiters for the specific queue. */
 	if (LOCK_LOG_TEST(&sx->lock_object, 0))
 		CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
 		    __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
 		    "exclusive");
 	/*
 	 * Store the new lock word with release semantics, then broadcast
 	 * on the chosen queue while the sleepqueue lock is still held.
 	 */
 	atomic_store_rel_ptr(&sx->sx_lock, x);
 	wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, 0,
 	    queue);
 	sleepq_release(&sx->lock_object);
 	/* Acted on only after the sleepqueue lock has been dropped. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_slock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  */
 int
 _sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 {
 	GIANT_DECLARE;
 #ifdef ADAPTIVE_SX
 	volatile struct thread *owner;
 #endif
 #ifdef LOCK_PROFILING
 	uint64_t waittime = 0;
 	int contested = 0;
 #endif
 	uintptr_t x;
 	int error = 0;
 #ifdef KDTRACE_HOOKS
 	uint64_t spin_cnt = 0;
 	uint64_t sleep_cnt = 0;
 	int64_t sleep_time = 0;
 #endif
 
 	/*
 	 * Returns 0 once the shared lock has been obtained (or immediately
 	 * when the scheduler is stopped).  A nonzero value is only produced
 	 * by sleepq_wait_sig(), i.e. when SX_INTERRUPTIBLE is set in opts
 	 * and the sleep was interrupted; the lock is not held in that case.
 	 */
 	if (SCHEDULER_STOPPED())
 		return (0);
 
 	/*
 	 * As with rwlocks, we don't make any attempt to try to block
 	 * shared locks once there is an exclusive waiter.
 	 */
 	for (;;) {
 #ifdef KDTRACE_HOOKS
 		spin_cnt++;
 #endif
 		x = sx->sx_lock;
 
 		/*
 		 * If no other thread has an exclusive lock then try to bump up
 		 * the count of sharers.  Since we have to preserve the state
 		 * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
 		 * shared lock loop back and retry.
 		 */
 		if (x & SX_LOCK_SHARED) {
 			MPASS(!(x & SX_LOCK_SHARED_WAITERS));
 			if (atomic_cmpset_acq_ptr(&sx->sx_lock, x,
 			    x + SX_ONE_SHARER)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeed %p -> %p", __func__,
 					    sx, (void *)x,
 					    (void *)(x + SX_ONE_SHARER));
 				break;
 			}
 			continue;
 		}
 #ifdef HWPMC_HOOKS
 		PMC_SOFT_CALL( , , lock, failed);
 #endif
 		lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
 		    &waittime);
 
 #ifdef ADAPTIVE_SX
 		/*
 		 * If the owner is running on another CPU, spin until
 		 * the owner stops running or the state of the lock
 		 * changes.
 		 */
 		if ((sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			x = SX_OWNER(x);
 			owner = (struct thread *)x;
 			if (TD_IS_RUNNING(owner)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR3(KTR_LOCK,
 					    "%s: spinning on %p held by %p",
 					    __func__, sx, owner);
+				KTR_STATE1(KTR_SCHED, "thread",
+				    sched_tdname(curthread), "spinning",
+				    "lockname:\"%s\"", sx->lock_object.lo_name);
 				GIANT_SAVE();
 				while (SX_OWNER(sx->sx_lock) == x &&
 				    TD_IS_RUNNING(owner)) {
 #ifdef KDTRACE_HOOKS
 					spin_cnt++;
 #endif
 					cpu_spinwait();
 				}
+				KTR_STATE0(KTR_SCHED, "thread",
+				    sched_tdname(curthread), "running");
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * Some other thread already has an exclusive lock, so
 		 * start the process of blocking.
 		 */
 		sleepq_lock(&sx->lock_object);
 		x = sx->sx_lock;
 
 		/*
 		 * The lock could have been released while we spun.
 		 * In this case loop back and retry.
 		 */
 		if (x & SX_LOCK_SHARED) {
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 
 #ifdef ADAPTIVE_SX
 		/*
 		 * If the owner is running on another CPU, spin until
 		 * the owner stops running or the state of the lock
 		 * changes.
 		 */
 		if (!(x & SX_LOCK_SHARED) &&
 		    (sx->lock_object.lo_flags & SX_NOADAPTIVE) == 0) {
 			owner = (struct thread *)SX_OWNER(x);
 			if (TD_IS_RUNNING(owner)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 		}
 #endif
 
 		/*
 		 * Try to set the SX_LOCK_SHARED_WAITERS flag.  If we
 		 * fail to set it drop the sleep queue lock and loop
 		 * back.
 		 */
 		if (!(x & SX_LOCK_SHARED_WAITERS)) {
 			if (!atomic_cmpset_ptr(&sx->sx_lock, x,
 			    x | SX_LOCK_SHARED_WAITERS)) {
 				sleepq_release(&sx->lock_object);
 				continue;
 			}
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
 				    __func__, sx);
 		}
 
 		/*
 		 * Since we have been unable to acquire the shared lock,
 		 * we have to sleep.
 		 */
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
 			    __func__, sx);
 
 #ifdef KDTRACE_HOOKS
 		sleep_time -= lockstat_nsecs();
 #endif
 		GIANT_SAVE();
 		sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
 		    SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
 		    SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
 		if (!(opts & SX_INTERRUPTIBLE))
 			sleepq_wait(&sx->lock_object, 0);
 		else
 			error = sleepq_wait_sig(&sx->lock_object, 0);
 #ifdef KDTRACE_HOOKS
 		sleep_time += lockstat_nsecs();
 		sleep_cnt++;
 #endif
 		if (error) {
 			if (LOCK_LOG_TEST(&sx->lock_object, 0))
 				CTR2(KTR_LOCK,
 			"%s: interruptible sleep by %p suspended by signal",
 				    __func__, sx);
 			break;
 		}
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
 			    __func__, sx);
 	}
 	if (error == 0)
 		LOCKSTAT_PROFILE_OBTAIN_LOCK_SUCCESS(LS_SX_SLOCK_ACQUIRE, sx,
 		    contested, waittime, file, line);
 #ifdef KDTRACE_HOOKS
 	/*
 	 * This is the shared-lock path, so account the time slept and the
 	 * loop iterations not spent sleeping against the shared (SLOCK)
 	 * probes rather than the exclusive ones.
 	 */
 	if (sleep_time)
 		LOCKSTAT_RECORD1(LS_SX_SLOCK_BLOCK, sx, sleep_time);
 	if (spin_cnt > sleep_cnt)
 		LOCKSTAT_RECORD1(LS_SX_SLOCK_SPIN, sx, (spin_cnt - sleep_cnt));
 #endif
 	GIANT_RESTORE();
 	return (error);
 }
 
 /*
  * This function represents the so-called 'hard case' for sx_sunlock
  * operation.  All 'easy case' failures are redirected to this.  Note
  * that ideally this would be a static function, but it needs to be
  * accessible from at least sx.h.
  */
 void
 _sx_sunlock_hard(struct sx *sx, const char *file, int line)
 {
 	uintptr_t x;
 	int wakeup_swapper;
 
 	/* Do nothing once the scheduler has been stopped. */
 	if (SCHEDULER_STOPPED())
 		return;
 
 	/*
 	 * Each pass re-reads the lock word and attempts exactly one
 	 * compare-and-set; a failed compare-and-set restarts the loop.
 	 */
 	for (;;) {
 		x = sx->sx_lock;
 
 		/*
 		 * We should never have waiting sharers while at least one
 		 * thread holds a shared lock.
 		 */
 		KASSERT(!(x & SX_LOCK_SHARED_WAITERS),
 		    ("%s: waiting sharers", __func__));
 
 		/*
 		 * See if there is more than one shared lock held.  If
 		 * so, just drop one and return.
 		 */
 		if (SX_SHARERS(x) > 1) {
 			if (atomic_cmpset_rel_ptr(&sx->sx_lock, x,
 			    x - SX_ONE_SHARER)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR4(KTR_LOCK,
 					    "%s: %p succeeded %p -> %p",
 					    __func__, sx, (void *)x,
 					    (void *)(x - SX_ONE_SHARER));
 				break;
 			}
 			continue;
 		}
 
 		/*
 		 * If there aren't any waiters for an exclusive lock,
 		 * then try to drop it quickly.
 		 */
 		if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
 			MPASS(x == SX_SHARERS_LOCK(1));
 			if (atomic_cmpset_rel_ptr(&sx->sx_lock,
 			    SX_SHARERS_LOCK(1), SX_LOCK_UNLOCKED)) {
 				if (LOCK_LOG_TEST(&sx->lock_object, 0))
 					CTR2(KTR_LOCK, "%s: %p last succeeded",
 					    __func__, sx);
 				break;
 			}
 			continue;
 		}
 
 		/*
 		 * At this point, there should just be one sharer with
 		 * exclusive waiters.
 		 */
 		MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS));
 
 		sleepq_lock(&sx->lock_object);
 
 		/*
 		 * Wake up semantic here is quite simple:
 		 * Just wake up all the exclusive waiters.
 		 * Note that the state of the lock could have changed,
 		 * so if it fails loop back and retry.
 		 */
 		if (!atomic_cmpset_rel_ptr(&sx->sx_lock,
 		    SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS,
 		    SX_LOCK_UNLOCKED)) {
 			sleepq_release(&sx->lock_object);
 			continue;
 		}
 		/*
 		 * The two literals are concatenated by the compiler, so the
 		 * first one must end with a space.
 		 */
 		if (LOCK_LOG_TEST(&sx->lock_object, 0))
 			CTR2(KTR_LOCK, "%s: %p waking up all thread on "
 			    "exclusive queue", __func__, sx);
 		wakeup_swapper = sleepq_broadcast(&sx->lock_object, SLEEPQ_SX,
 		    0, SQ_EXCLUSIVE_QUEUE);
 		sleepq_release(&sx->lock_object);
 		/* Acted on only after the sleepqueue lock has been dropped. */
 		if (wakeup_swapper)
 			kick_proc0();
 		break;
 	}
 }
 
 #ifdef INVARIANT_SUPPORT
 #ifndef INVARIANTS
 #undef	_sx_assert
 #endif
 
 /*
  * In the non-WITNESS case, sx_assert() can only detect that at least
  * *some* thread owns an slock, but it cannot guarantee that *this*
  * thread owns an slock.
  */
 void
 _sx_assert(const struct sx *sx, int what, const char *file, int line)
 {
 #ifndef WITNESS
 	/* Set when the caller asserted one of the SA_SLOCKED variants. */
 	int slocked = 0;
 #endif
 
 	/* Assertions are skipped once a panic is already in progress. */
 	if (panicstr != NULL)
 		return;
 	switch (what) {
 	case SA_SLOCKED:
 	case SA_SLOCKED | SA_NOTRECURSED:
 	case SA_SLOCKED | SA_RECURSED:
 #ifndef WITNESS
 		slocked = 1;
 		/* FALLTHROUGH */
 #endif
 	case SA_LOCKED:
 	case SA_LOCKED | SA_NOTRECURSED:
 	case SA_LOCKED | SA_RECURSED:
 #ifdef WITNESS
 		witness_assert(&sx->lock_object, what, file, line);
 #else
 		/*
 		 * If some other thread has an exclusive lock or we
 		 * have one and are asserting a shared lock, fail.
 		 * Also, if no one has a lock at all, fail.
 		 */
 		if (sx->sx_lock == SX_LOCK_UNLOCKED ||
 		    (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
 		    sx_xholder(sx) != curthread)))
 			panic("Lock %s not %slocked @ %s:%d\n",
 			    sx->lock_object.lo_name, slocked ? "share " : "",
 			    file, line);
 
 		/*
 		 * The recursion flags are only checked when the lock is
 		 * not in shared mode.
 		 */
 		if (!(sx->sx_lock & SX_LOCK_SHARED)) {
 			if (sx_recursed(sx)) {
 				if (what & SA_NOTRECURSED)
 					panic("Lock %s recursed @ %s:%d\n",
 					    sx->lock_object.lo_name, file,
 					    line);
 			} else if (what & SA_RECURSED)
 				panic("Lock %s not recursed @ %s:%d\n",
 				    sx->lock_object.lo_name, file, line);
 		}
 #endif
 		break;
 	case SA_XLOCKED:
 	case SA_XLOCKED | SA_NOTRECURSED:
 	case SA_XLOCKED | SA_RECURSED:
 		/* The exclusive holder must be the current thread. */
 		if (sx_xholder(sx) != curthread)
 			panic("Lock %s not exclusively locked @ %s:%d\n",
 			    sx->lock_object.lo_name, file, line);
 		if (sx_recursed(sx)) {
 			if (what & SA_NOTRECURSED)
 				panic("Lock %s recursed @ %s:%d\n",
 				    sx->lock_object.lo_name, file, line);
 		} else if (what & SA_RECURSED)
 			panic("Lock %s not recursed @ %s:%d\n",
 			    sx->lock_object.lo_name, file, line);
 		break;
 	case SA_UNLOCKED:
 #ifdef WITNESS
 		witness_assert(&sx->lock_object, what, file, line);
 #else
 		/*
 		 * If we hold an exclusive lock fail.  We can't
 		 * reliably check to see if we hold a shared lock or
 		 * not.
 		 */
 		if (sx_xholder(sx) == curthread)
 			panic("Lock %s exclusively locked @ %s:%d\n",
 			    sx->lock_object.lo_name, file, line);
 #endif
 		break;
 	default:
 		panic("Unknown sx lock assertion: %d @ %s:%d", what, file,
 		    line);
 	}
 }
 #endif	/* INVARIANT_SUPPORT */
 
 #ifdef DDB
 static void
 db_show_sx(const struct lock_object *lock)
 {
 	struct thread *td;
 	const struct sx *sx;
 
 	sx = (const struct sx *)lock;
 
 	db_printf(" state: ");
 	if (sx->sx_lock == SX_LOCK_UNLOCKED)
 		db_printf("UNLOCKED\n");
 	else if (sx->sx_lock == SX_LOCK_DESTROYED) {
 		db_printf("DESTROYED\n");
 		return;
 	} else if (sx->sx_lock & SX_LOCK_SHARED)
 		db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
 	else {
 		td = sx_xholder(sx);
 		db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_name);
 		if (sx_recursed(sx))
 			db_printf(" recursed: %d\n", sx->sx_recurse);
 	}
 
 	db_printf(" waiters: ");
 	switch(sx->sx_lock &
 	    (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
 	case SX_LOCK_SHARED_WAITERS:
 		db_printf("shared\n");
 		break;
 	case SX_LOCK_EXCLUSIVE_WAITERS:
 		db_printf("exclusive\n");
 		break;
 	case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
 		db_printf("exclusive and shared\n");
 		break;
 	default:
 		db_printf("none\n");
 	}
 }
 
 /*
  * Check to see if a thread that is blocked on a sleep queue is actually
  * blocked on an sx lock.  If so, output some details and return true.
  * If the lock has an exclusive owner, return that in *ownerp.
  */
 int
 sx_chain(struct thread *td, struct thread **ownerp)
 {
 	struct sx *lk;
 
 	/*
 	 * Decide whether the wait channel is an sx lock: the lock class
 	 * must be the sx class, and the lock's name pointer must be the
 	 * very pointer stored as the thread's wait message.
 	 */
 	lk = td->td_wchan;
 	if (LOCK_CLASS(&lk->lock_object) != &lock_class_sx)
 		return (0);
 	if (lk->lock_object.lo_name != td->td_wmesg)
 		return (0);
 
 	/* It looks like an sx lock; report it and hand back the owner. */
 	db_printf("blocked on sx \"%s\" ", td->td_wmesg);
 	*ownerp = sx_xholder(lk);
 	if ((lk->sx_lock & SX_LOCK_SHARED) == 0)
 		db_printf("XLOCK\n");
 	else
 		db_printf("SLOCK (count %ju)\n",
 		    (uintmax_t)SX_SHARERS(lk->sx_lock));
 	return (1);
 }
 #endif
Index: stable/10/sys/kern/kern_timeout.c
===================================================================
--- stable/10/sys/kern/kern_timeout.c	(revision 278649)
+++ stable/10/sys/kern/kern_timeout.c	(revision 278650)
@@ -1,1446 +1,1452 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)kern_clock.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_callout_profiling.h"
 #include "opt_kdtrace.h"
 #if defined(__arm__)
 #include "opt_timer.h"
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/file.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #ifdef SMP
 #include <machine/cpu.h>
 #endif
 
 #ifndef NO_EVENTTIMERS
 /* Per-CPU variable declared here; its definition is not in this file. */
 DPCPU_DECLARE(sbintime_t, hardclocktime);
 #endif
 
 /*
  * SDT provider "callout_execute" with two probes, callout__start and
  * callout__end, each taking a single "struct callout *" argument.
  */
 SDT_PROVIDER_DEFINE(callout_execute);
 SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__start,
     "struct callout *");
 SDT_PROBE_DEFINE1(callout_execute, kernel, , callout__end,
     "struct callout *");
 
 #ifdef CALLOUT_PROFILING
 /*
  * Profiling averages, exported read-only under the debug sysctl tree and
  * expressed in units of 1/1000.  The *_dir variants are updated at the
  * end of callout_process().
  */
 static int avg_depth;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
     "Average number of items examined per softclock call. Units = 1/1000");
 static int avg_gcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_gcalls, CTLFLAG_RD, &avg_gcalls, 0,
     "Average number of Giant callouts made per softclock call. Units = 1/1000");
 static int avg_lockcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls, CTLFLAG_RD, &avg_lockcalls, 0,
     "Average number of lock callouts made per softclock call. Units = 1/1000");
 static int avg_mpcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
     "Average number of MP callouts made per softclock call. Units = 1/1000");
 static int avg_depth_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
     "Average number of direct callouts examined per callout_process call. "
     "Units = 1/1000");
 static int avg_lockcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
     &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
     "callout_process call. Units = 1/1000");
 static int avg_mpcalls_dir;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
     0, "Average number of MP direct callouts made per callout_process call. "
     "Units = 1/1000");
 #endif
 
 /*
  * Computed in callout_callwheel_init(), where the kern.ncallout tunable
  * is also fetched into it.
  */
 static int ncallout;
 SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN, &ncallout, 0,
     "Number of entries in callwheel and size of timeout() preallocation");
 
 /*
  * TODO:
  *	allocate more timeout table slots when table overflows.
  */
 /*
  * Both are set in callout_callwheel_init(): callwheelsize is
  * 1 << fls(ncallout) and callwheelmask is callwheelsize - 1.
  */
 u_int callwheelsize, callwheelmask;
 
 /*
  * The callout cpu exec entities represent the information necessary for
  * describing the state of callouts currently running on the CPU and the
  * information necessary for migrating callouts to the new callout cpu. In
  * particular, the first entry of the array cc_exec_entity holds information
  * for the callout running in SWI thread context, while the second one holds
  * information for the callout running directly from hardware interrupt
  * context. The cached information is very important for deferring migration
  * when the migrating callout is already running.
  */
 struct cc_exec {
 	/* Callout to visit after the current one; see callout_process(). */
 	struct callout		*cc_next;
 	/* Callout currently in progress, or NULL. */
 	struct callout		*cc_curr;
 #ifdef SMP
 	/*
 	 * Migration request fields.  cc_cce_cleanup() resets them, and a
 	 * ce_migration_cpu other than CPUBLOCK is what cc_cce_migrating()
 	 * reports as a requested migration.
 	 */
 	void			(*ce_migration_func)(void *);
 	void			*ce_migration_arg;
 	int			ce_migration_cpu;
 	sbintime_t		ce_migration_time;
 	sbintime_t		ce_migration_prec;
 #endif
 	/* Both flags are cleared by cc_cce_cleanup(). */
 	bool			cc_cancel;
 	bool			cc_waiting;
 };
 
 /*
  * There is one struct callout_cpu per cpu, holding all relevant
  * state for the callout processing thread on the individual CPU.
  */
 struct callout_cpu {
 	/* Spin mutex taken through CC_LOCK()/CC_UNLOCK(). */
 	struct mtx_padalign	cc_lock;
 	/* [0] is used for SWI context, [1] for direct execution. */
 	struct cc_exec 		cc_exec_entity[2];
 	/* timeout(9) preallocation; NULL except on timeout_cpu. */
 	struct callout		*cc_callout;
 	/* Array of callwheelsize buckets, indexed by callout_get_bucket(). */
 	struct callout_list	*cc_callwheel;
 	/* Expired non-direct callouts queued by callout_process(). */
 	struct callout_tailq	cc_expireq;
 	/* Free list of CALLOUT_LOCAL_ALLOC callouts. */
 	struct callout_slist	cc_callfree;
 	/* Earliest event time currently recorded for this CPU. */
 	sbintime_t		cc_firstevent;
 	/* Time passed to the most recent callout_process() call. */
 	sbintime_t		cc_lastscan;
 	/* Cookie filled in by swi_add() and passed to swi_sched(). */
 	void			*cc_cookie;
 	/* Bucket recorded by callout_process() before a direct call. */
 	u_int			cc_bucket;
 	/* Set to "callwheel cpu <n>" by callout_cpu_init(). */
+	char			cc_ktr_event_name[20];
 };
 
 /*
  * Shorthand for the two execution entities: the plain names select
  * cc_exec_entity[0] and the *_dir names select cc_exec_entity[1].
  */
 #define	cc_exec_curr		cc_exec_entity[0].cc_curr
 #define	cc_exec_next		cc_exec_entity[0].cc_next
 #define	cc_exec_cancel		cc_exec_entity[0].cc_cancel
 #define	cc_exec_waiting		cc_exec_entity[0].cc_waiting
 #define	cc_exec_curr_dir	cc_exec_entity[1].cc_curr
 #define	cc_exec_next_dir	cc_exec_entity[1].cc_next
 #define	cc_exec_cancel_dir	cc_exec_entity[1].cc_cancel
 #define	cc_exec_waiting_dir	cc_exec_entity[1].cc_waiting
 
 #ifdef SMP
 #define	cc_migration_func	cc_exec_entity[0].ce_migration_func
 #define	cc_migration_arg	cc_exec_entity[0].ce_migration_arg
 #define	cc_migration_cpu	cc_exec_entity[0].ce_migration_cpu
 #define	cc_migration_time	cc_exec_entity[0].ce_migration_time
 #define	cc_migration_prec	cc_exec_entity[0].ce_migration_prec
 #define	cc_migration_func_dir	cc_exec_entity[1].ce_migration_func
 #define	cc_migration_arg_dir	cc_exec_entity[1].ce_migration_arg
 #define	cc_migration_cpu_dir	cc_exec_entity[1].ce_migration_cpu
 #define	cc_migration_time_dir	cc_exec_entity[1].ce_migration_time
 #define	cc_migration_prec_dir	cc_exec_entity[1].ce_migration_prec
 
 /* One callout_cpu per possible CPU; CPUBLOCK is an out-of-range CPU id. */
 struct callout_cpu cc_cpu[MAXCPU];
 #define	CPUBLOCK	MAXCPU
 #define	CC_CPU(cpu)	(&cc_cpu[(cpu)])
 #define	CC_SELF()	CC_CPU(PCPU_GET(cpuid))
 #else
 /*
  * Without SMP there is a single callout_cpu.  The expansions are fully
  * parenthesized, like the SMP ones, so they behave as one expression
  * wherever they are used.
  */
 struct callout_cpu cc_cpu;
 #define	CC_CPU(cpu)	(&cc_cpu)
 #define	CC_SELF()	(&cc_cpu)
 #endif
 #define	CC_LOCK(cc)	mtx_lock_spin(&(cc)->cc_lock)
 #define	CC_UNLOCK(cc)	mtx_unlock_spin(&(cc)->cc_lock)
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 
 /*
  * CPU whose callout_cpu receives the timeout(9) preallocation; assigned
  * in callout_callwheel_init().
  */
 static int timeout_cpu;
 
-static void	callout_cpu_init(struct callout_cpu *cc);
+static void	callout_cpu_init(struct callout_cpu *cc, int cpu);
 static void	softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
 		    int *mpcalls, int *lockcalls, int *gcalls,
 #endif
 		    int direct);
 
 /* malloc(9) type used for the callwheel and the preallocated callouts. */
 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
 
 /**
  * Locked by cc_lock:
  *   cc_curr         - If a callout is in progress, it is cc_curr.
  *                     If cc_curr is non-NULL, threads waiting in
  *                     callout_drain() will be woken up as soon as the
  *                     relevant callout completes.
  *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
  *                     guarantees that the current callout will not run.
  *                     The softclock() function sets this to 0 before it
  *                     drops callout_lock to acquire c_lock, and it calls
  *                     the handler only if cc_cancel is still 0 after
  *                     cc_lock is successfully acquired.
  *   cc_waiting      - If a thread is waiting in callout_drain(), then
  *                     cc_waiting is nonzero.  Set only when
  *                     cc_curr is non-NULL.
  */
 
 /*
  * Resets the execution entity tied to a specific callout cpu.
  */
 static void
 cc_cce_cleanup(struct callout_cpu *cc, int direct)
 {
 
 	cc->cc_exec_entity[direct].cc_curr = NULL;
 	cc->cc_exec_entity[direct].cc_next = NULL;
 	cc->cc_exec_entity[direct].cc_cancel = false;
 	cc->cc_exec_entity[direct].cc_waiting = false;
 #ifdef SMP
 	cc->cc_exec_entity[direct].ce_migration_cpu = CPUBLOCK;
 	cc->cc_exec_entity[direct].ce_migration_time = 0;
 	cc->cc_exec_entity[direct].ce_migration_prec = 0;
 	cc->cc_exec_entity[direct].ce_migration_func = NULL;
 	cc->cc_exec_entity[direct].ce_migration_arg = NULL;
 #endif
 }
 
 /*
  * Checks if migration is requested by a specific callout cpu.
  */
 static int
 cc_cce_migrating(struct callout_cpu *cc, int direct)
 {
 #ifdef SMP
 	int target;
 
 	/* Any target other than CPUBLOCK is a pending migration. */
 	target = cc->cc_exec_entity[direct].ce_migration_cpu;
 	return (target != CPUBLOCK);
 #else
 	/* Without SMP a migration is never pending. */
 	return (0);
 #endif
 }
 
 /*
  * Kernel low level callwheel initialization
  * called on cpu0 during kernel startup.
  */
 static void
 callout_callwheel_init(void *dummy)
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Calculate the size of the callout wheel and the preallocated
 	 * timeout() structures.
 	 * XXX: Clip callout to result of previous function of maxusers
 	 * maximum 384.  This is still huge, but acceptable.
 	 */
 	ncallout = imin(16 + maxproc + maxfiles, 18508);
 	/* Fetch the kern.ncallout tunable into ncallout. */
 	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
 
 	/*
 	 * Calculate callout wheel size, should be next power of two higher
 	 * than 'ncallout'.
 	 */
 	callwheelsize = 1 << fls(ncallout);
 	callwheelmask = callwheelsize - 1;
 
 	/*
 	 * Only cpu0 handles timeout(9) and receives a preallocation.
 	 *
 	 * XXX: Once all timeout(9) consumers are converted this can
 	 * be removed.
 	 */
 	/* The CPU running this SYSINIT becomes timeout_cpu. */
 	timeout_cpu = PCPU_GET(cpuid);
 	cc = CC_CPU(timeout_cpu);
 	cc->cc_callout = malloc(ncallout * sizeof(struct callout),
 	    M_CALLOUT, M_WAITOK);
-	callout_cpu_init(cc);
+	callout_cpu_init(cc, timeout_cpu);
 }
 SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
 
 /*
  * Initialize the per-cpu callout structures.
  */
 static void
-callout_cpu_init(struct callout_cpu *cc)
+callout_cpu_init(struct callout_cpu *cc, int cpu)
 {
 	struct callout *c;
 	int i;
 
 	/* cc_lock is a recursive spin mutex. */
 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
 	SLIST_INIT(&cc->cc_callfree);
 	/* Allocate the wheel and start with every bucket empty. */
 	cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
 	    M_CALLOUT, M_WAITOK);
 	for (i = 0; i < callwheelsize; i++)
 		LIST_INIT(&cc->cc_callwheel[i]);
 	TAILQ_INIT(&cc->cc_expireq);
 	/* No event recorded yet. */
 	cc->cc_firstevent = INT64_MAX;
 	/* Reset both execution entities. */
 	for (i = 0; i < 2; i++)
 		cc_cce_cleanup(cc, i);
 	/* 'cpu' is only used here, to build the per-CPU KTR event name. */
+	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
+	    "callwheel cpu %d", cpu);
 	if (cc->cc_callout == NULL)	/* Only cpu0 handles timeout(9) */
 		return;
 	/* Put every preallocated callout on the free list. */
 	for (i = 0; i < ncallout; i++) {
 		c = &cc->cc_callout[i];
 		callout_init(c, 0);
 		c->c_flags = CALLOUT_LOCAL_ALLOC;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
 
 #ifdef SMP
 /*
  * Switches the cpu tied to a specific callout.
  * The function expects a locked incoming callout cpu and returns with
  * locked outcoming callout cpu.
  */
 static struct callout_cpu *
 callout_cpu_switch(struct callout *c, struct callout_cpu *cc, int new_cpu)
 {
 	struct callout_cpu *new_cc;
 
 	MPASS(c != NULL && cc != NULL);
 	CC_LOCK_ASSERT(cc);
 
 	/*
 	 * Avoid interrupts and preemption firing after the callout cpu
 	 * is blocked in order to avoid deadlocks as the new thread
 	 * may be willing to acquire the callout cpu lock.
 	 */
 	/*
 	 * While c_cpu is CPUBLOCK, callout_lock() spins instead of
 	 * taking any callout cpu lock for this callout.
 	 */
 	c->c_cpu = CPUBLOCK;
 	spinlock_enter();
 	CC_UNLOCK(cc);
 	new_cc = CC_CPU(new_cpu);
 	CC_LOCK(new_cc);
 	spinlock_exit();
 	/* Unblock the callout only once the new lock is held. */
 	c->c_cpu = new_cpu;
 	return (new_cc);
 }
 #endif
 
 /*
  * Start standard softclock thread.
  */
 static void
 start_softclock(void *dummy)
 {
 	struct callout_cpu *cc;
 #ifdef SMP
 	int cpu;
 #endif
 
 	/*
 	 * timeout_cpu was already initialized by callout_callwheel_init();
 	 * only its software interrupt handler is registered here.
 	 */
 	cc = CC_CPU(timeout_cpu);
 	if (swi_add(&clk_intr_event, "clock", softclock, cc, SWI_CLOCK,
 	    INTR_MPSAFE, &cc->cc_cookie))
 		panic("died while creating standard software ithreads");
 #ifdef SMP
 	/* Every other CPU gets its structures initialized and its own swi. */
 	CPU_FOREACH(cpu) {
 		if (cpu == timeout_cpu)
 			continue;
 		cc = CC_CPU(cpu);
 		cc->cc_callout = NULL;	/* Only cpu0 handles timeout(9). */
-		callout_cpu_init(cc);
+		callout_cpu_init(cc, cpu);
 		if (swi_add(NULL, "clock", softclock, cc, SWI_CLOCK,
 		    INTR_MPSAFE, &cc->cc_cookie))
 			panic("died while creating standard software ithreads");
 	}
 #endif
 }
 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
 
 #define	CC_HASH_SHIFT	8
 
 static inline u_int
 callout_hash(sbintime_t sbt)
 {
 	sbintime_t shifted;
 
 	/* Drop the low (32 - CC_HASH_SHIFT) bits of the time value. */
 	shifted = sbt >> (32 - CC_HASH_SHIFT);
 	return (shifted);
 }
 
 static inline u_int
 callout_get_bucket(sbintime_t sbt)
 {
 	u_int hash;
 
 	/* Reduce the hash to a wheel index with the wheel mask. */
 	hash = callout_hash(sbt);
 	return (hash & callwheelmask);
 }
 
 /*
  * Scan the current CPU's callwheel for the time 'now'.  Callouts whose
  * c_time has been reached are either run right here (CALLOUT_DIRECT) or
  * moved to cc_expireq; the remaining ones examined are used to compute
  * the next event window handed to cpu_new_callout().
  */
 void
 callout_process(sbintime_t now)
 {
 	struct callout *tmp, *tmpn;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t first, last, max, tmp_max;
 	uint32_t lookahead;
 	u_int firstb, lastb, nowb;
 #ifdef CALLOUT_PROFILING
 	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
 #endif
 
 	cc = CC_SELF();
 	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
 
 	/* Compute the buckets of the last scan and present times. */
 	firstb = callout_hash(cc->cc_lastscan);
 	cc->cc_lastscan = now;
 	nowb = callout_hash(now);
 
 	/* Compute the last bucket and minimum time of the bucket after it. */
 	if (nowb == firstb)
 		lookahead = (SBT_1S / 16);
 	else if (nowb - firstb == 1)
 		lookahead = (SBT_1S / 8);
 	else
 		lookahead = (SBT_1S / 2);
 	first = last = now;
 	first += (lookahead / 2);
 	last += lookahead;
 	/* Clear the bits of 'last' that callout_hash() shifts out. */
 	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
 	lastb = callout_hash(last) - 1;
 	max = last;
 
 	/*
 	 * Check if we wrapped around the entire wheel from the last scan.
 	 * In case, we need to scan entirely the wheel for pending callouts.
 	 */
 	if (lastb - firstb >= callwheelsize) {
 		lastb = firstb + callwheelsize - 1;
 		if (nowb - firstb >= callwheelsize)
 			nowb = lastb;
 	}
 
 	/* Iterate callwheel from firstb to nowb and then up to lastb. */
 	do {
 		sc = &cc->cc_callwheel[firstb & callwheelmask];
 		tmp = LIST_FIRST(sc);
 		while (tmp != NULL) {
 			/* Run the callout if present time within allowed. */
 			if (tmp->c_time <= now) {
 				/*
 				 * Consumer told us the callout may be run
 				 * directly from hardware interrupt context.
 				 */
 				if (tmp->c_flags & CALLOUT_DIRECT) {
 #ifdef CALLOUT_PROFILING
 					++depth_dir;
 #endif
 					/*
 					 * Record the successor and the bucket
 					 * before the call and re-read the
 					 * successor afterwards;
 					 * callout_cc_add() may update it.
 					 */
 					cc->cc_exec_next_dir =
 					    LIST_NEXT(tmp, c_links.le);
 					cc->cc_bucket = firstb & callwheelmask;
 					LIST_REMOVE(tmp, c_links.le);
 					softclock_call_cc(tmp, cc,
 #ifdef CALLOUT_PROFILING
 					    &mpcalls_dir, &lockcalls_dir, NULL,
 #endif
 					    1);
 					tmp = cc->cc_exec_next_dir;
 				} else {
 					/* Defer to the softclock SWI. */
 					tmpn = LIST_NEXT(tmp, c_links.le);
 					LIST_REMOVE(tmp, c_links.le);
 					TAILQ_INSERT_TAIL(&cc->cc_expireq,
 					    tmp, c_links.tqe);
 					tmp->c_flags |= CALLOUT_PROCESSED;
 					tmp = tmpn;
 				}
 				continue;
 			}
 			/* Skip events from distant future. */
 			if (tmp->c_time >= max)
 				goto next;
 			/*
 			 * Event minimal time is bigger than present maximal
 			 * time, so it cannot be aggregated.
 			 */
 			if (tmp->c_time > last) {
 				lastb = nowb;
 				goto next;
 			}
 			/* Update first and last time, respecting this event. */
 			if (tmp->c_time < first)
 				first = tmp->c_time;
 			tmp_max = tmp->c_time + tmp->c_precision;
 			if (tmp_max < last)
 				last = tmp_max;
 next:
 			tmp = LIST_NEXT(tmp, c_links.le);
 		}
 		/* Proceed with the next bucket. */
 		firstb++;
 		/*
 		 * Stop if we looked after present time and found
 		 * some event we can't execute at now.
 		 * Stop if we looked far enough into the future.
 		 */
 	} while (((int)(firstb - lastb)) <= 0);
 	cc->cc_firstevent = last;
 #ifndef NO_EVENTTIMERS
 	cpu_new_callout(curcpu, last, first);
 #endif
 #ifdef CALLOUT_PROFILING
 	/* Move each average 1/256 of the way toward this call's sample. */
 	avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
 	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
 	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
 #endif
 	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 	/*
 	 * swi_sched acquires the thread lock, so we don't want to call it
 	 * with cc_lock held; incorrect locking order.
 	 */
 	if (!TAILQ_EMPTY(&cc->cc_expireq))
 		swi_sched(cc->cc_cookie, 0);
 }
 
 /*
  * Lock and return the callout_cpu that callout 'c' is currently bound
  * to.  The binding is re-checked after the lock is taken and the
  * attempt is retried if it changed in the meantime.
  */
 static struct callout_cpu *
 callout_lock(struct callout *c)
 {
 	struct callout_cpu *cc;
 	int cpu;
 
 	for (;;) {
 		cpu = c->c_cpu;
 #ifdef SMP
 		/*
 		 * CPUBLOCK is set by callout_cpu_switch() while the callout
 		 * changes CPU; spin until a real CPU id is visible again.
 		 */
 		if (cpu == CPUBLOCK) {
 			while (c->c_cpu == CPUBLOCK)
 				cpu_spinwait();
 			continue;
 		}
 #endif
 		cc = CC_CPU(cpu);
 		CC_LOCK(cc);
 		/* Done only if the binding did not change under us. */
 		if (cpu == c->c_cpu)
 			break;
 		CC_UNLOCK(cc);
 	}
 	return (cc);
 }
 
 /*
  * Insert callout 'c' into the callwheel of 'cc' so that 'func(arg)' fires
  * at time 'sbt' with the given 'precision'.
  *
  * The cc lock must be held (asserted).  An expiry time earlier than
  * cc_lastscan is raised to cc_lastscan.  The callout is marked active and
  * pending, and direct when C_DIRECT_EXEC is set in 'flags'.  Unless
  * NO_EVENTTIMERS is defined, cpu_new_callout() is invoked for 'cpu' when
  * the new event is earlier than the one recorded in cc_firstevent.
  */
 static void
 callout_cc_add(struct callout *c, struct callout_cpu *cc,
     sbintime_t sbt, sbintime_t precision, void (*func)(void *),
     void *arg, int cpu, int flags)
 {
 	int slot;
 #ifndef NO_EVENTTIMERS
 	sbintime_t latest;
 #endif
 
 	CC_LOCK_ASSERT(cc);
 	/* Never schedule earlier than the last scan time. */
 	c->c_time = (sbt < cc->cc_lastscan) ? cc->cc_lastscan : sbt;
 	c->c_precision = precision;
 	c->c_func = func;
 	c->c_arg = arg;
 	c->c_flags |= CALLOUT_ACTIVE | CALLOUT_PENDING;
 	if ((flags & C_DIRECT_EXEC) != 0)
 		c->c_flags |= CALLOUT_DIRECT;
 	c->c_flags &= ~CALLOUT_PROCESSED;
 	slot = callout_get_bucket(c->c_time);
 	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
 	    c, (int)(c->c_precision >> 32),
 	    (u_int)(c->c_precision & 0xffffffff));
 	LIST_INSERT_HEAD(&cc->cc_callwheel[slot], c, c_links.le);
 	if (slot == cc->cc_bucket)
 		cc->cc_exec_next_dir = c;
 #ifndef NO_EVENTTIMERS
 	/*
 	 * Tell the eventtimers(4) subsystem about the new callout, but
 	 * only when it moves the first event earlier.  The precision is
 	 * trimmed first so that time + precision cannot exceed INT64_MAX.
 	 */
 	if (c->c_precision > INT64_MAX - c->c_time)
 		c->c_precision = INT64_MAX - c->c_time;
 	latest = c->c_time + c->c_precision;
 	if (latest < cc->cc_firstevent) {
 		cc->cc_firstevent = latest;
 		cpu_new_callout(cpu, latest, c->c_time);
 	}
 #endif
 }
 
 /*
  * Release callout 'c' after it has been taken off its queue.
  *
  * Only callouts flagged CALLOUT_LOCAL_ALLOC are touched: their function
  * pointer is cleared and they are pushed onto the cc_callfree list of
  * 'cc'.  Any other callout is left alone.
  */
 static void
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
 	if ((c->c_flags & CALLOUT_LOCAL_ALLOC) != 0) {
 		c->c_func = NULL;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
 
 /*
  * Execute a single callout 'c' on behalf of 'cc'.
  *
  * 'direct' selects which cc_exec_entity[] slot tracks the running callout.
  * The function is entered with cc locked, drops the lock around the
  * handler call and returns with cc locked again.  When the callout has an
  * associated lock it is acquired before the handler runs, and the handler
  * is skipped if cc_cancel was set while that lock was being acquired.
  * After the handler returns, a waiter (if any) is woken up, and a
  * migration recorded in the exec entity is either cancelled or carried
  * out.  With CALLOUT_PROFILING the counters behind 'mpcalls', 'lockcalls'
  * and 'gcalls' are bumped according to the kind of lock used.
  */
 static void
 softclock_call_cc(struct callout *c, struct callout_cpu *cc,
 #ifdef CALLOUT_PROFILING
     int *mpcalls, int *lockcalls, int *gcalls,
 #endif
     int direct)
 {
 	struct rm_priotracker tracker;
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
 	uintptr_t lock_status;
 	int c_flags;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
 	void *new_arg;
 	int flags, new_cpu;
 	sbintime_t new_prec, new_time;
 #endif
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) 
 	sbintime_t sbt1, sbt2;
 	struct timespec ts2;
 	static sbintime_t maxdt = 2 * SBT_1MS;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
 
 	KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
 	    (CALLOUT_PENDING | CALLOUT_ACTIVE),
 	    ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
 	/*
 	 * lock_status is the second argument handed to lc_lock() below:
 	 * 0 unless CALLOUT_SHAREDLOCK is set, in which case it is the
 	 * address of the local tracker for rm locks and 1 otherwise.
 	 */
 	lock_status = 0;
 	if (c->c_flags & CALLOUT_SHAREDLOCK) {
 		if (class == &lock_class_rm)
 			lock_status = (uintptr_t)&tracker;
 		else
 			lock_status = 1;
 	}
 	/* Snapshot the fields that are used after cc is unlocked. */
 	c_lock = c->c_lock;
 	c_func = c->c_func;
 	c_arg = c->c_arg;
 	c_flags = c->c_flags;
 	/*
 	 * Locally allocated callouts keep only CALLOUT_LOCAL_ALLOC;
 	 * all others merely lose CALLOUT_PENDING.
 	 */
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
 		c->c_flags = CALLOUT_LOCAL_ALLOC;
 	else
 		c->c_flags &= ~CALLOUT_PENDING;
 	cc->cc_exec_entity[direct].cc_curr = c;
 	cc->cc_exec_entity[direct].cc_cancel = false;
 	CC_UNLOCK(cc);
 	if (c_lock != NULL) {
 		class->lc_lock(c_lock, lock_status);
 		/*
 		 * The callout may have been cancelled
 		 * while we switched locks.
 		 */
 		if (cc->cc_exec_entity[direct].cc_cancel) {
 			class->lc_unlock(c_lock);
 			goto skip;
 		}
 		/* The callout cannot be stopped now. */
 		cc->cc_exec_entity[direct].cc_cancel = true;
 		if (c_lock == &Giant.lock_object) {
 #ifdef CALLOUT_PROFILING
 			(*gcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
 			    c, c_func, c_arg);
 		} else {
 #ifdef CALLOUT_PROFILING
 			(*lockcalls)++;
 #endif
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
 	} else {
 #ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
 #endif
 		CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
 		    c, c_func, c_arg);
 	}
+	KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
+	    "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	sbt1 = sbinuptime();
 #endif
 	THREAD_NO_SLEEPING();
 	SDT_PROBE(callout_execute, kernel, , callout__start, c, 0, 0, 0, 0);
 	c_func(c_arg);
 	SDT_PROBE(callout_execute, kernel, , callout__end, c, 0, 0, 0, 0);
 	THREAD_SLEEPING_OK();
 #if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
 	/*
 	 * Report handlers that ran longer than the current maxdt; maxdt
 	 * is raised to the new duration each time it is exceeded.
 	 */
 	sbt2 = sbinuptime();
 	sbt2 -= sbt1;
 	if (sbt2 > maxdt) {
 		if (lastfunc != c_func || sbt2 > maxdt * 2) {
 			ts2 = sbttots(sbt2);
 			printf(
 		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
 			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
 		}
 		maxdt = sbt2;
 		lastfunc = c_func;
 	}
 #endif
+	KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
 	CTR1(KTR_CALLOUT, "callout %p finished", c);
 	/* Uses the c_flags snapshot taken before the handler ran. */
 	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
 		class->lc_unlock(c_lock);
 skip:
 	CC_LOCK(cc);
 	KASSERT(cc->cc_exec_entity[direct].cc_curr == c, ("mishandled cc_curr"));
 	cc->cc_exec_entity[direct].cc_curr = NULL;
 	if (cc->cc_exec_entity[direct].cc_waiting) {
 		/*
 		 * There is someone waiting for the
 		 * callout to complete.
 		 * If the callout was scheduled for
 		 * migration just cancel it.
 		 */
 		if (cc_cce_migrating(cc, direct)) {
 			cc_cce_cleanup(cc, direct);
 
 			/*
 			 * It should be assert here that the callout is not
 			 * destroyed but that is not easy.
 			 */
 			c->c_flags &= ~CALLOUT_DFRMIGRATION;
 		}
 		cc->cc_exec_entity[direct].cc_waiting = false;
 		/* cc is unlocked around the wakeup() call. */
 		CC_UNLOCK(cc);
 		wakeup(&cc->cc_exec_entity[direct].cc_waiting);
 		CC_LOCK(cc);
 	} else if (cc_cce_migrating(cc, direct)) {
 		KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
 		    ("Migrating legacy callout %p", c));
 #ifdef SMP
 		/*
 		 * If the callout was scheduled for
 		 * migration just perform it now.
 		 */
 		new_cpu = cc->cc_exec_entity[direct].ce_migration_cpu;
 		new_time = cc->cc_exec_entity[direct].ce_migration_time;
 		new_prec = cc->cc_exec_entity[direct].ce_migration_prec;
 		new_func = cc->cc_exec_entity[direct].ce_migration_func;
 		new_arg = cc->cc_exec_entity[direct].ce_migration_arg;
 		cc_cce_cleanup(cc, direct);
 
 		/*
 		 * It should be assert here that the callout is not destroyed
 		 * but that is not easy.
 		 *
 		 * As first thing, handle deferred callout stops.
 		 */
 		if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
 			CTR3(KTR_CALLOUT,
 			     "deferred cancelled %p func %p arg %p",
 			     c, new_func, new_arg);
 			callout_cc_del(c, cc);
 			return;
 		}
 		c->c_flags &= ~CALLOUT_DFRMIGRATION;
 
 		/* Re-queue on the target CPU, then take cc back. */
 		new_cc = callout_cpu_switch(c, cc, new_cpu);
 		flags = (direct) ? C_DIRECT_EXEC : 0;
 		callout_cc_add(c, new_cc, new_time, new_prec, new_func,
 		    new_arg, new_cpu, flags);
 		CC_UNLOCK(new_cc);
 		CC_LOCK(cc);
 #else
 		panic("migration should not happen");
 #endif
 	}
 	/*
 	 * If the current callout is locally allocated (from
 	 * timeout(9)) then put it on the freelist.
 	 *
 	 * Note: we need to check the cached copy of c_flags because
 	 * if it was not local, then it's not safe to deref the
 	 * callout pointer.
 	 */
 	KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
 	    c->c_flags == CALLOUT_LOCAL_ALLOC,
 	    ("corrupted callout"));
 	if (c_flags & CALLOUT_LOCAL_ALLOC)
 		callout_cc_del(c, cc);
 }
 
 /*
  * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
  * used in this implementation was published by G. Varghese and T. Lauck in
  * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for
  * the Efficient Implementation of a Timer Facility" in the Proceedings of
  * the 11th ACM Annual Symposium on Operating Systems Principles,
  * Austin, Texas Nov 1987.
  */
 
 /*
  * Software (low priority) clock interrupt.
  * Run periodic events from timeout queue.
  */
 /*
  * Handler argument 'arg' is the callout_cpu to service.  With its lock
  * held, callouts are taken one at a time from the head of cc_expireq and
  * run through softclock_call_cc() in non-direct mode until the queue is
  * empty.  With CALLOUT_PROFILING the running averages are updated.
  */
 void
 softclock(void *arg)
 {
 	struct callout_cpu *cc;
 	struct callout *head;
 #ifdef CALLOUT_PROFILING
 	int depth, gcalls, lockcalls, mpcalls;
 #endif
 
 	cc = arg;
 #ifdef CALLOUT_PROFILING
 	depth = gcalls = lockcalls = mpcalls = 0;
 #endif
 	CC_LOCK(cc);
 	for (;;) {
 		head = TAILQ_FIRST(&cc->cc_expireq);
 		if (head == NULL)
 			break;
 		TAILQ_REMOVE(&cc->cc_expireq, head, c_links.tqe);
 #ifdef CALLOUT_PROFILING
 		softclock_call_cc(head, cc, &mpcalls, &lockcalls, &gcalls, 0);
 		depth++;
 #else
 		softclock_call_cc(head, cc, 0);
 #endif
 	}
 #ifdef CALLOUT_PROFILING
 	avg_depth += (depth * 1000 - avg_depth) >> 8;
 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
 #endif
 	CC_UNLOCK(cc);
 }
 
 /*
  * timeout --
  *	Execute a function after a specified length of time.
  *
  * untimeout --
  *	Cancel previous timeout function call.
  *
  * callout_handle_init --
  *	Initialize a handle so that using it with untimeout is benign.
  *
  *	See AT&T BCI Driver Reference Manual for specification.  This
  *	implementation differs from that one in that although an
  *	identification value is returned from timeout, the original
  *	arguments to timeout as well as the identifier are used to
  *	identify entries for untimeout.
  */
 /*
  * Take a callout from the free list of the timeout_cpu callout_cpu, arm
  * it with callout_reset() to call ftn(arg) after to_ticks, and return a
  * handle referring to it.  Panics when the free list is empty.
  */
 struct callout_handle
 timeout(ftn, arg, to_ticks)
 	timeout_t *ftn;
 	void *arg;
 	int to_ticks;
 {
 	struct callout_handle handle;
 	struct callout_cpu *tcc;
 	struct callout *slot;
 
 	tcc = CC_CPU(timeout_cpu);
 	CC_LOCK(tcc);
 	/* Grab the first free callout structure. */
 	slot = SLIST_FIRST(&tcc->cc_callfree);
 	if (slot == NULL) {
 		/* XXX Attempt to malloc first */
 		panic("timeout table full");
 	}
 	SLIST_REMOVE_HEAD(&tcc->cc_callfree, c_links.sle);
 	handle.callout = slot;
 	callout_reset(slot, to_ticks, ftn, arg);
 	CC_UNLOCK(tcc);
 
 	return (handle);
 }
 
 void
 untimeout(ftn, arg, handle)
 	timeout_t *ftn;
 	void *arg;
 	struct callout_handle handle;
 {
 	struct callout_cpu *cc;
 
 	/*
 	 * Check for a handle that was initialized
 	 * by callout_handle_init, but never used
 	 * for a real timeout.
 	 */
 	if (handle.callout == NULL)
 		return;
 
 	cc = callout_lock(handle.callout);
 	if (handle.callout->c_func == ftn && handle.callout->c_arg == arg)
 		callout_stop(handle.callout);
 	CC_UNLOCK(cc);
 }
 
 /*
  * Reset 'handle' so that it refers to no callout; untimeout() returns
  * immediately for such a handle.
  */
 void
 callout_handle_init(struct callout_handle *handle)
 {
 
 	handle->callout = NULL;
 }
 
 /*
  * New interface; clients allocate their own callout structures.
  *
  * callout_reset() - establish or change a timeout
  * callout_stop() - disestablish a timeout
  * callout_init() - initialize a callout structure so that it can
  *	safely be passed to callout_reset() and callout_stop()
  *
  * <sys/callout.h> defines three convenience macros:
  *
  * callout_active() - returns truth if callout has not been stopped,
  *	drained, or deactivated since the last time the callout was
  *	reset.
  * callout_pending() - returns truth if callout is still waiting for timeout
  * callout_deactivate() - marks the callout as having been serviced
  */
 /*
  * Schedule (or reschedule) callout 'c' to run 'ftn(arg)' on 'cpu'.
  *
  * With C_ABSOLUTE in 'flags', 'sbt' is used as the expiry time as is;
  * otherwise it is an interval added to a base time chosen below,
  * saturating at INT64_MAX, and 'precision' is raised to at least the
  * value derived from C_PRELGET(flags) or tc_precexp.
  * A callout that is still pending is removed from its queue first.  If
  * the callout is currently executing and has to move to another CPU, the
  * request is recorded in the exec entity and CALLOUT_DFRMIGRATION is set
  * instead of queueing it here.
  *
  * Returns non-zero if a pending or not-yet-started invocation was
  * cancelled by this call, zero otherwise.
  */
 int
 callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t precision,
     void (*ftn)(void *), void *arg, int cpu, int flags)
 {
 	sbintime_t to_sbt, pr;
 	struct callout_cpu *cc;
 	int cancelled, direct;
 
 	cancelled = 0;
 	if (flags & C_ABSOLUTE) {
 		to_sbt = sbt;
 	} else {
 		/* C_HARDCLOCK requests are at least one tick long. */
 		if ((flags & C_HARDCLOCK) && (sbt < tick_sbt))
 			sbt = tick_sbt;
 		/*
 		 * Note that the condition below is completed by one of
 		 * the two preprocessor branches that follow it.
 		 */
 		if ((flags & C_HARDCLOCK) ||
 #ifdef NO_EVENTTIMERS
 		    sbt >= sbt_timethreshold) {
 			to_sbt = getsbinuptime();
 
 			/* Add safety belt for the case of hz > 1000. */
 			to_sbt += tc_tick_sbt - tick_sbt;
 #else
 		    sbt >= sbt_tickthreshold) {
 			/*
 			 * Obtain the time of the last hardclock() call on
 			 * this CPU directly from the kern_clocksource.c.
 			 * This value is per-CPU, but it is equal for all
 			 * active ones.
 			 */
 #ifdef __LP64__
 			to_sbt = DPCPU_GET(hardclocktime);
 #else
 			spinlock_enter();
 			to_sbt = DPCPU_GET(hardclocktime);
 			spinlock_exit();
 #endif
 #endif
 			if ((flags & C_HARDCLOCK) == 0)
 				to_sbt += tick_sbt;
 		} else
 			to_sbt = sbinuptime();
 		/* Add the interval, saturating instead of overflowing. */
 		if (INT64_MAX - to_sbt < sbt)
 			to_sbt = INT64_MAX;
 		else
 			to_sbt += sbt;
 		pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
 		    sbt >> C_PRELGET(flags));
 		if (pr > precision)
 			precision = pr;
 	}
 	/*
 	 * Don't allow migration of pre-allocated callouts lest they
 	 * become unbalanced.
 	 */
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
 		cpu = c->c_cpu;
 	direct = (c->c_flags & CALLOUT_DIRECT) != 0;
 	KASSERT(!direct || c->c_lock == NULL,
 	    ("%s: direct callout %p has lock", __func__, c));
 	cc = callout_lock(c);
 	if (cc->cc_exec_entity[direct].cc_curr == c) {
 		/*
 		 * We're being asked to reschedule a callout which is
 		 * currently in progress.  If there is a lock then we
 		 * can cancel the callout if it has not really started.
 		 */
 		if (c->c_lock != NULL && !cc->cc_exec_entity[direct].cc_cancel)
 			cancelled = cc->cc_exec_entity[direct].cc_cancel = true;
 		if (cc->cc_exec_entity[direct].cc_waiting) {
 			/*
 			 * Someone has called callout_drain to kill this
 			 * callout.  Don't reschedule.
 			 */
 			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
 			    cancelled ? "cancelled" : "failed to cancel",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 	}
 	/*
 	 * A pending callout sits either on a callwheel list or, once
 	 * CALLOUT_PROCESSED is set, on cc_expireq; unlink it from
 	 * whichever one it is on.
 	 */
 	if (c->c_flags & CALLOUT_PENDING) {
 		if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
 			if (cc->cc_exec_next_dir == c)
 				cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
 			LIST_REMOVE(c, c_links.le);
 		} else
 			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		cancelled = 1;
 		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 	}
 
 #ifdef SMP
 	/*
 	 * If the callout must migrate try to perform it immediately.
 	 * If the callout is currently running, just defer the migration
 	 * to a more appropriate moment.
 	 */
 	if (c->c_cpu != cpu) {
 		if (cc->cc_exec_entity[direct].cc_curr == c) {
 			cc->cc_exec_entity[direct].ce_migration_cpu = cpu;
 			cc->cc_exec_entity[direct].ce_migration_time
 			    = to_sbt;
 			cc->cc_exec_entity[direct].ce_migration_prec 
 			    = precision;
 			cc->cc_exec_entity[direct].ce_migration_func = ftn;
 			cc->cc_exec_entity[direct].ce_migration_arg = arg;
 			c->c_flags |= CALLOUT_DFRMIGRATION;
 			CTR6(KTR_CALLOUT,
 		    "migration of %p func %p arg %p in %d.%08x to %u deferred",
 			    c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 			    (u_int)(to_sbt & 0xffffffff), cpu);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
 		cc = callout_cpu_switch(c, cc, cpu);
 	}
 #endif
 
 	callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
 	CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
 	    cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
 	    (u_int)(to_sbt & 0xffffffff));
 	CC_UNLOCK(cc);
 
 	return (cancelled);
 }
 
 /*
  * Common idioms that can be optimized in the future.
  */
 /*
  * Re-arm 'c' on 'cpu' after 'to_ticks', reusing the function and argument
  * already stored in the callout.  Returns what callout_reset_on() returns.
  */
 int
 callout_schedule_on(struct callout *c, int to_ticks, int cpu)
 {
 	void (*func)(void *);
 	void *arg;
 
 	func = c->c_func;
 	arg = c->c_arg;
 	return (callout_reset_on(c, to_ticks, func, arg, cpu));
 }
 
 /*
  * Re-arm 'c' after 'to_ticks' on the CPU recorded in the callout, reusing
  * its stored function and argument.  Returns what callout_reset_on()
  * returns.
  */
 int
 callout_schedule(struct callout *c, int to_ticks)
 {
 	void (*func)(void *);
 	void *arg;
 
 	func = c->c_func;
 	arg = c->c_arg;
 	return (callout_reset_on(c, to_ticks, func, arg, c->c_cpu));
 }
 
 /*
  * Stop callout 'c'.
  *
  * If 'safe' is non-zero and the callout is currently executing, sleep on
  * the exec entity's cc_waiting until it is no longer the current callout.
  * If 'safe' is zero and the callout has a lock that the caller holds, a
  * running callout whose cc_cancel is not yet set is cancelled by setting
  * it.
  *
  * Returns 1 if the callout was removed from its queue, was cancelled
  * before its handler started, or had a deferred migration dropped;
  * returns 0 otherwise.
  */
 int
 _callout_stop_safe(c, safe)
 	struct	callout *c;
 	int	safe;
 {
 	struct callout_cpu *cc, *old_cc;
 	struct lock_class *class;
 	int direct, sq_locked, use_lock;
 
 	/*
 	 * Some old subsystems don't hold Giant while running a callout_stop(),
 	 * so just discard this check for the moment.
 	 */
 	if (!safe && c->c_lock != NULL) {
 		if (c->c_lock == &Giant.lock_object)
 			use_lock = mtx_owned(&Giant);
 		else {
 			use_lock = 1;
 			class = LOCK_CLASS(c->c_lock);
 			class->lc_assert(c->c_lock, LA_XLOCKED);
 		}
 	} else
 		use_lock = 0;
 	direct = (c->c_flags & CALLOUT_DIRECT) != 0;
 	/*
 	 * sq_locked records that the sleepqueue chain lock is held, and
 	 * old_cc which callout_cpu's wait channel it was taken for.
 	 */
 	sq_locked = 0;
 	old_cc = NULL;
 again:
 	cc = callout_lock(c);
 
 	/*
 	 * If the callout was migrating while the callout cpu lock was
 	 * dropped,  just drop the sleepqueue lock and check the states
 	 * again.
 	 */
 	if (sq_locked != 0 && cc != old_cc) {
 #ifdef SMP
 		CC_UNLOCK(cc);
 		sleepq_release(&old_cc->cc_exec_entity[direct].cc_waiting);
 		sq_locked = 0;
 		old_cc = NULL;
 		goto again;
 #else
 		panic("migration should not happen");
 #endif
 	}
 
 	/*
 	 * If the callout isn't pending, it's not on the queue, so
 	 * don't attempt to remove it from the queue.  We can try to
 	 * stop it by other means however.
 	 */
 	if (!(c->c_flags & CALLOUT_PENDING)) {
 		c->c_flags &= ~CALLOUT_ACTIVE;
 
 		/*
 		 * If it wasn't on the queue and it isn't the current
 		 * callout, then we can't stop it, so just bail.
 		 */
 		if (cc->cc_exec_entity[direct].cc_curr != c) {
 			CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			if (sq_locked)
 				sleepq_release(
 				    &cc->cc_exec_entity[direct].cc_waiting);
 			return (0);
 		}
 
 		if (safe) {
 			/*
 			 * The current callout is running (or just
 			 * about to run) and blocking is allowed, so
 			 * just wait for the current invocation to
 			 * finish.
 			 */
 			while (cc->cc_exec_entity[direct].cc_curr == c) {
 				/*
 				 * Use direct calls to sleepqueue interface
 				 * instead of cv/msleep in order to avoid
 				 * a LOR between cc_lock and sleepqueue
 				 * chain spinlocks.  This piece of code
 				 * emulates a msleep_spin() call actually.
 				 *
 				 * If we already have the sleepqueue chain
 				 * locked, then we can safely block.  If we
 				 * don't already have it locked, however,
 				 * we have to drop the cc_lock to lock
 				 * it.  This opens several races, so we
 				 * restart at the beginning once we have
 				 * both locks.  If nothing has changed, then
 				 * we will end up back here with sq_locked
 				 * set.
 				 */
 				if (!sq_locked) {
 					CC_UNLOCK(cc);
 					sleepq_lock(
 					&cc->cc_exec_entity[direct].cc_waiting);
 					sq_locked = 1;
 					old_cc = cc;
 					goto again;
 				}
 
 				/*
 				 * Migration could be cancelled here, but
 				 * as long as it is still not sure when it
 				 * will be packed up, just let softclock()
 				 * take care of it.
 				 */
 				cc->cc_exec_entity[direct].cc_waiting = true;
 				DROP_GIANT();
 				CC_UNLOCK(cc);
 				sleepq_add(
 				    &cc->cc_exec_entity[direct].cc_waiting,
 				    &cc->cc_lock.lock_object, "codrain",
 				    SLEEPQ_SLEEP, 0);
 				sleepq_wait(
 				    &cc->cc_exec_entity[direct].cc_waiting,
 					     0);
 				sq_locked = 0;
 				old_cc = NULL;
 
 				/* Reacquire locks previously released. */
 				PICKUP_GIANT();
 				CC_LOCK(cc);
 			}
 		} else if (use_lock &&
 			    !cc->cc_exec_entity[direct].cc_cancel) {
 			/*
 			 * The current callout is waiting for its
 			 * lock which we hold.  Cancel the callout
 			 * and return.  After our caller drops the
 			 * lock, the callout will be skipped in
 			 * softclock().
 			 */
 			cc->cc_exec_entity[direct].cc_cancel = true;
 			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			KASSERT(!cc_cce_migrating(cc, direct),
 			    ("callout wrongly scheduled for migration"));
 			CC_UNLOCK(cc);
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
 		} else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
 			/*
 			 * Clearing CALLOUT_DFRMIGRATION drops the pending
 			 * deferred migration; this is reported as a
 			 * successful stop.
 			 */
 			c->c_flags &= ~CALLOUT_DFRMIGRATION;
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			return (1);
 		}
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
 		CC_UNLOCK(cc);
 		KASSERT(!sq_locked, ("sleepqueue chain still locked"));
 		return (0);
 	}
 	if (sq_locked)
 		sleepq_release(&cc->cc_exec_entity[direct].cc_waiting);
 
 	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
 	/*
 	 * Unlink from the callwheel list, or from cc_expireq when
 	 * CALLOUT_PROCESSED is set.
 	 */
 	if ((c->c_flags & CALLOUT_PROCESSED) == 0) {
 		if (cc->cc_exec_next_dir == c)
 			cc->cc_exec_next_dir = LIST_NEXT(c, c_links.le);
 		LIST_REMOVE(c, c_links.le);
 	} else
 		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 	callout_cc_del(c, cc);
 
 	CC_UNLOCK(cc);
 	return (1);
 }
 
 /*
  * Initialize callout 'c'.  The structure is zeroed; an mpsafe callout
  * gets no lock and CALLOUT_RETURNUNLOCKED, any other callout is tied to
  * Giant with no flags.  The callout is assigned to timeout_cpu.
  */
 void
 callout_init(c, mpsafe)
 	struct	callout *c;
 	int mpsafe;
 {
 
 	bzero(c, sizeof(*c));
 	c->c_lock = mpsafe ? NULL : &Giant.lock_object;
 	c->c_flags = mpsafe ? CALLOUT_RETURNUNLOCKED : 0;
 	c->c_cpu = timeout_cpu;
 }
 
 /*
  * Initialize callout 'c' to use 'lock' (which may be NULL).  Only
  * CALLOUT_RETURNUNLOCKED and CALLOUT_SHAREDLOCK are accepted in 'flags';
  * CALLOUT_RETURNUNLOCKED needs a lock, and the lock class may be neither
  * a spin lock nor sleepable.  The callout is assigned to timeout_cpu.
  */
 void
 _callout_init_lock(c, lock, flags)
 	struct	callout *c;
 	struct	lock_object *lock;
 	int flags;
 {
 	const int allowed = CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK;
 
 	bzero(c, sizeof(*c));
 	c->c_lock = lock;
 	KASSERT((flags & ~allowed) == 0,
 	    ("callout_init_lock: bad flags %d", flags));
 	KASSERT((flags & CALLOUT_RETURNUNLOCKED) == 0 || lock != NULL,
 	    ("callout_init_lock: CALLOUT_RETURNUNLOCKED with no lock"));
 	KASSERT(lock == NULL || (LOCK_CLASS(lock)->lc_flags &
 	    (LC_SPINLOCK | LC_SLEEPABLE)) == 0, ("%s: invalid lock class",
 	    __func__));
 	c->c_flags = flags & allowed;
 	c->c_cpu = timeout_cpu;
 }
 
 #ifdef APM_FIXUP_CALLTODO
 /* 
  * Adjust the kernel calltodo timeout list.  This routine is used after 
  * an APM resume to recalculate the calltodo timer list values with the 
  * number of hz's we have been sleeping.  The next hardclock() will detect 
  * that there are fired timers and run softclock() to execute them.
  *
  * Please note, I have not done an exhaustive analysis of what code this
  * might break.  I am motivated to have my select()'s and alarm()'s that
  * have expired during suspend firing upon resume so that the applications
  * which set the timer can do the maintenance the timer was for as close
  * as possible to the originally intended time.  Testing this code for a
  * week showed that resuming from a suspend resulted in 22 to 25 timers
  * firing, which seemed independent of whether the suspend was 2 hours or
  * 2 days.  Your mileage may vary.   - Ken Key <key@cs.utk.edu>
  */
 /*
  * Convert 'time_change' into a tick count (capped at INT_MAX) and subtract
  * it from the c_time of entries on the calltodo list, stopping at the
  * first entry that still has time left.  A negative tv_sec is ignored.
  *
  * NOTE(review): 'cc' and 'calltodo' are not declared in this function;
  * this block is only compiled under APM_FIXUP_CALLTODO -- confirm that
  * it still builds against the current callout structures.
  */
 void
 adjust_timeout_calltodo(time_change)
     struct timeval *time_change;
 {
 	register struct callout *p;
 	unsigned long delta_ticks;
 
 	/*
 	 * How many ticks were we asleep?
 	 * (stolen from tvtohz()).
 	 */
 
 	/* Don't do anything */
 	if (time_change->tv_sec < 0)
 		return;
 	else if (time_change->tv_sec <= LONG_MAX / 1000000)
 		delta_ticks = (time_change->tv_sec * 1000000 +
 			       time_change->tv_usec + (tick - 1)) / tick + 1;
 	else if (time_change->tv_sec <= LONG_MAX / hz)
 		delta_ticks = time_change->tv_sec * hz +
 			      (time_change->tv_usec + (tick - 1)) / tick + 1;
 	else
 		delta_ticks = LONG_MAX;
 
 	/* Cap the result at INT_MAX. */
 	if (delta_ticks > INT_MAX)
 		delta_ticks = INT_MAX;
 
 	/*
 	 * Now rip through the timer calltodo list looking for timers
 	 * to expire.
 	 */
 
 	/* don't collide with softclock() */
 	CC_LOCK(cc);
 	for (p = calltodo.c_next; p != NULL; p = p->c_next) {
 		p->c_time -= delta_ticks;
 
 		/* Break if the timer had more time on it than delta_ticks */
 		if (p->c_time > 0)
 			break;
 
 		/* take back the ticks the timer didn't use (p->c_time <= 0) */
 		delta_ticks = -p->c_time;
 	}
 	CC_UNLOCK(cc);
 
 	return;
 }
 #endif /* APM_FIXUP_CALLTODO */
 
 /*
  * Return the histogram bucket for 'sbt': the index of the most significant
  * set bit of 1.5 * sbt, as computed by flsl(), limited to 63.
  *
  * The scaling is done in unsigned 64-bit arithmetic.  For values close to
  * INT64_MAX the scaled result has bit 63 set, which would otherwise yield
  * 64 and index one element past the 64-entry histograms filled in by
  * sysctl_kern_callout_stat(); such values are accounted in the last
  * bucket instead.
  */
 static int
 flssbt(sbintime_t sbt)
 {
 	uint64_t scaled;
 	int bit;
 
 	scaled = (uint64_t)sbt + ((uint64_t)sbt >> 1);
 	if (sizeof(long) >= sizeof(sbintime_t))
 		bit = flsl(scaled);
 	else if ((scaled >> 32) != 0) {
 		/* long is narrower than sbintime_t: look at the upper half. */
 		bit = flsl(scaled >> 32) + 32;
 	} else
 		bit = flsl(scaled);
 	return (bit > 63 ? 63 : bit);
 }
 
 /*
  * Dump an immediate statistics snapshot of the scheduled callouts.
  *
  * Sysctl handler: nothing happens on reads.  On a write, every callwheel
  * bucket is walked with the owning callout_cpu locked, and totals, maxima
  * and power-of-two histograms of time-to-expiry, precision and per-bucket
  * depth are accumulated and then printed with printf().
  * Returns the error from sysctl_handle_int().
  */
 static int
 sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
 {
 	struct callout *tmp;
 	struct callout_cpu *cc;
 	struct callout_list *sc;
 	sbintime_t avgpr, avgt, maxpr, maxt, medpr, medt, now, spr, st, t;
 	int ct[64], cpr[64], ccpbk[32];
 	int error, val, i, count, tcum, pcum, maxc, c, medc;
 #ifdef SMP
 	int cpu;
 #endif
 
 	val = 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	count = maxc = 0;
 	st = spr = maxt = maxpr = 0;
 	bzero(ccpbk, sizeof(ccpbk));
 	bzero(ct, sizeof(ct));
 	bzero(cpr, sizeof(cpr));
 	now = sbinuptime();
 #ifdef SMP
 	CPU_FOREACH(cpu) {
 		cc = CC_CPU(cpu);
 #else
 		cc = CC_CPU(timeout_cpu);
 #endif
 		CC_LOCK(cc);
 		for (i = 0; i < callwheelsize; i++) {
 			sc = &cc->cc_callwheel[i];
 			c = 0;
 			LIST_FOREACH(tmp, sc, c_links.le) {
 				c++;
 				/* Already expired callouts count as 0. */
 				t = tmp->c_time - now;
 				if (t < 0)
 					t = 0;
 				/* Sums are kept in microseconds. */
 				st += t / SBT_1US;
 				spr += tmp->c_precision / SBT_1US;
 				if (t > maxt)
 					maxt = t;
 				if (tmp->c_precision > maxpr)
 					maxpr = tmp->c_precision;
 				ct[flssbt(t)]++;
 				cpr[flssbt(tmp->c_precision)]++;
 			}
 			if (c > maxc)
 				maxc = c;
 			ccpbk[fls(c + c / 2)]++;
 			count += c;
 		}
 		CC_UNLOCK(cc);
 #ifdef SMP
 	}
 #endif
 
 	/*
 	 * Medians are estimated from the histograms: find the bucket in
 	 * which the cumulative count reaches half of the total.
 	 */
 	for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
 		tcum += ct[i];
 	medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
 		pcum += cpr[i];
 	medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
 	for (i = 0, c = 0; i < 32 && c < count / 2; i++)
 		c += ccpbk[i];
 	medc = (i >= 2) ? (1 << (i - 2)) : 0;
 
 	/* With no callouts scheduled the averages must not divide by 0. */
 	avgt = (count != 0) ? st / count : 0;
 	avgpr = (count != 0) ? spr / count : 0;
 
 	printf("Scheduled callouts statistic snapshot:\n");
 	printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
 	    count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
 	printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
 	    medc,
 	    count / callwheelsize / mp_ncpus,
 	    (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
 	    maxc);
 	printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
 	    avgt / 1000000, avgt % 1000000,
 	    maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
 	printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
 	    medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
 	    avgpr / 1000000, avgpr % 1000000,
 	    maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
 	printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
 	    "   prec\t   pcum\n");
 	for (i = 0, tcum = pcum = 0; i < 64; i++) {
 		if (ct[i] == 0 && cpr[i] == 0)
 			continue;
 		t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
 		tcum += ct[i];
 		pcum += cpr[i];
 		printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
 		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
 		    i - 1 - (32 - CC_HASH_SHIFT),
 		    ct[i], tcum, cpr[i], pcum);
 	}
 	return (error);
 }
 /*
  * Register sysctl_kern_callout_stat() as the read/write integer sysctl
  * "callout_stat" under the _kern node; the handler only acts on writes.
  */
 SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_callout_stat, "I",
     "Dump immediate statistic snapshot of the scheduled callouts");
Index: stable/10/tools/sched/schedgraph.py
===================================================================
--- stable/10/tools/sched/schedgraph.py	(revision 278649)
+++ stable/10/tools/sched/schedgraph.py	(revision 278650)
@@ -1,1625 +1,1623 @@
 #!/usr/local/bin/python
 
 # Copyright (c) 2002-2003, 2009, Jeffrey Roberson <jeff@freebsd.org>
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice unmodified, this list of conditions, and the following
 #    disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 # IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 # THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 # $FreeBSD$
 
 import sys
 import re
 import random
 from Tkinter import *
 
 # To use:
 # - Install the ports/x11-toolkits/py-tkinter package; e.g.
 #	portinstall x11-toolkits/py-tkinter package
 # - Add KTR_SCHED to KTR_COMPILE and KTR_MASK in your KERNCONF; e.g.
 #	options 	KTR
 #	options 	KTR_ENTRIES=32768
 #	options 	KTR_COMPILE=(KTR_SCHED)
 #	options 	KTR_MASK=(KTR_SCHED)
 # - It is encouraged to increase KTR_ENTRIES size to gather enough
 #    information for analysis; e.g.
 #	options 	KTR_ENTRIES=262144
 #   as 32768 entries may only correspond to a second or two of profiling
 #   data depending on your workload.
 # - Rebuild kernel with proper changes to KERNCONF and boot new kernel.
 # - Run your workload to be profiled.
 # - While the workload is continuing (i.e. before it finishes), disable
 #   KTR tracing by setting 'sysctl debug.ktr.mask=0'.  This is necessary
 #   to avoid a race condition while running ktrdump, i.e. the KTR ring buffer
 #   will cycle a bit while ktrdump runs, and this confuses schedgraph because
 #   the timestamps appear to go backwards at some point.  Stopping KTR logging
 #   while the workload is still running is to avoid wasting log entries on
 #   "idle" time at the end.
 # - Dump the trace to a file: 'ktrdump -ct > ktr.out'
 # - Run the python script: 'python schedgraph.py ktr.out' optionally provide
 #   your cpu frequency in ghz: 'python schedgraph.py ktr.out 2.4'
 #
 # To do:
 # Add a per-source summary display
 # "Vertical rule" to help relate data in different rows
 # Mouse-over popup of full thread/event/row label (currently truncated)
 # More visible anchors for popup event windows
 #
 # BUGS: 1) Only 8 CPUs are supported, more CPUs require more choices of
 #          colours to represent them ;-)
 
 # Default colour for each known event name, as (event name, colour) pairs.
 eventcolors = [
 	("count",	"red"),
 	("running",	"green"),
 	("idle",	"grey"),
+	("spinning",	"red"),
 	("yielding",	"yellow"),
 	("swapped",	"violet"),
 	("suspended",	"purple"),
 	("iwait",	"grey"),
 	("sleep",	"blue"),
 	("blocked",	"dark red"),
 	("runq add",	"yellow"),
 	("runq rem",	"yellow"),
 	("thread exit",	"grey"),
 	("proc exit",	"grey"),
-	("callwheel idle", "grey"),
-	("callout running", "green"),
 	("lock acquire", "blue"),
 	("lock contest", "purple"),
 	("failed lock try", "red"),
 	("lock release", "grey"),
 	("statclock",	"black"),
 	("prio",	"black"),
 	("lend prio",	"black"),
 	("wokeup",	"black")
 ]
 
 # Background colour per CPU, as (label, colour) pairs.  The labels have
 # the same "CPU <n>" form that EventSource.drawcpu() builds for lookups.
 cpucolors = [
 	("CPU 0",	"light grey"),
 	("CPU 1",	"dark grey"),
 	("CPU 2",	"light blue"),
 	("CPU 3",	"light pink"),
 	("CPU 4",	"blanched almond"),
 	("CPU 5",	"slate grey"),
 	("CPU 6",	"tan"),
 	("CPU 7",	"thistle"),
 	("CPU 8",	"white")
 ]
 
 # Palette of colour names: offered in the ColorConf option menu and drawn
 # from at random by Colormap.lookup() for names without a default.
 colors = [
 	"white", "thistle", "blanched almond", "tan", "chartreuse",
 	"dark red", "red", "pale violet red", "pink", "light pink",
 	"dark orange", "orange", "coral", "light coral",
 	"goldenrod", "gold", "yellow", "light yellow",
 	"dark green", "green", "light green", "light sea green",
 	"dark blue", "blue", "light blue", "steel blue", "light slate blue",
 	"dark violet", "violet", "purple", "blue violet",
 	"dark grey", "slate grey", "light grey",
 	"black",
 ]
 # Keep the palette in alphabetical order.
 colors.sort()
 
 # Module-wide state.  ticksps is set by KTRFile.__init__() and read by
 # ticks2sec(); lineno is advanced by KTRFile.parse() and recorded in each
 # Event; sources is the list of event sources filled in by KTRFile.makeid().
 ticksps = None
 status = None
 colormap = None
 ktrfile = None
 clockfreq = None
 sources = []
 lineno = -1
 
 # Layout constants.  X_BORDER is the x position of source names,
 # Y_COUNTER and Y_EVENTSOURCE are the row heights returned by
 # Counter.ysize() and EventSource.ysize(), and XY_POINT is the radius
 # used when drawing point events.
 # NOTE(review): no use of Y_BORDER is visible in this part of the file.
 Y_BORDER = 10
 X_BORDER = 10
 Y_COUNTER = 80
 Y_EVENTSOURCE = 10
 XY_POINT = 4
 
 class Colormap:
 	def __init__(self, table):
 		self.table = table
 		self.map = {}
 		for entry in table:
 			self.map[entry[0]] = entry[1]
 
 	def lookup(self, name):
 		try:
 			color = self.map[name]
 		except:
 			color = colors[random.randrange(0, len(colors))]
 			print "Picking random color", color, "for", name
 			self.map[name] = color
 			self.table.append((name, color))
 		return (color)
 
 # Format a tick count as a human readable duration.  Ticks are converted
 # to nanoseconds using the global ticksps (ticks per second) and then
 # scaled to the first of ns/us/ms whose magnitude is below 1000, falling
 # back to seconds.  The unit is chosen on the absolute value so that a
 # negative span (out of order timestamps) is not always reported in ns.
 def ticks2sec(ticks):
 	value = float(ticks)
 	# Number of ticks per nanosecond.
 	ns = float(ticksps) / 1000000000
 	value /= ns
 	for unit in ("ns", "us", "ms"):
 		if (abs(value) < 1000):
 			return ("%.2f%s" % (value, unit))
 		value /= 1000
 	return ("%.2fs" % value)
 
 # Frame holding a "Ticks per pixel" label and a horizontal Scale widget.
 # Slider movements are forwarded to target.scaleset() as integers.
 class Scaler(Frame):
 	def __init__(self, master, target):
 		Frame.__init__(self, master)
 		# No Scale widget exists yet; setmax() below creates the first one.
 		self.scale = None
 		self.target = target
 		self.label = Label(self, text="Ticks per pixel")
 		self.label.pack(side=LEFT)
 		self.resolution = 100
 		self.setmax(10000)
 
 	# Scale command callback: hand the new value to the target as an int.
 	def scaleset(self, value):
 		self.target.scaleset(int(value))
 
 	# Move the slider to value.
 	def set(self, value):
 		self.scale.set(value)
 
 	# Rebuild the slider so that it runs from 100 to value, then set its
 	# position from target.scaleget().
 	def setmax(self, value):
 		#
 		# We can't reconfigure the to_ value so we delete the old
 		# window and make a new one when we resize.
 		#
 		if (self.scale != None):
 			self.scale.pack_forget()
 			self.scale.destroy()
 		self.scale = Scale(self, command=self.scaleset,
 		    from_=100, to_=value, orient=HORIZONTAL,
 		    resolution=self.resolution)
 		self.scale.pack(fill="both", expand=1)
 		self.scale.set(self.target.scaleget())
 
 # One line status bar: a sunken, left aligned label that fills its frame.
 class Status(Frame):
 	def __init__(self, master):
 		Frame.__init__(self, master)
 		text_label = Label(self, bd=1, relief=SUNKEN, anchor=W)
 		text_label.pack(fill="both", expand=1)
 		self.label = text_label
 		self.clear()
 
 	# Show str in the status bar.
 	def set(self, str):
 		self.label.configure(text=str)
 
 	# Blank the status bar.
 	def clear(self):
 		self.set("")
 
 	# Show str and force a redraw through the global root window, for
 	# progress messages issued while the GUI is still being built.
 	def startup(self, str):
 		self.label.configure(text=str)
 		root.update()
 
 # One row of the colour configuration dialog: name label, colour sample,
 # colour option menu and an "enabled" checkbox for a single item name.
 class ColorConf(Frame):
 	def __init__(self, master, name, color):
 		Frame.__init__(self, master)
 		# An item the graph reports as "hidden" starts out unchecked.
 		if (graph.getstate(name) == "hidden"):
 			enabled = 0
 		else:
 			enabled = 1
 		self.name = name
 		# *_default is what revert() restores; *_current is the value
 		# last pushed to the graph by apply().
 		self.color = StringVar()
 		self.color_default = color
 		self.color_current = color
 		self.color.set(color)
 		self.enabled = IntVar()
 		self.enabled_default = enabled
 		self.enabled_current = enabled
 		self.enabled.set(enabled)
 		self.draw()
 
 	# Create the row's widgets and lay them out in a four column grid.
 	def draw(self):
 		self.label = Label(self, text=self.name, anchor=W)
 		self.sample = Canvas(self, width=24, height=24,
 		    bg='grey')
 		self.rect = self.sample.create_rectangle(0, 0, 24, 24,
 		    fill=self.color.get())
 		self.list = OptionMenu(self, self.color, command=self.setcolor,
 		    *colors)
 		self.checkbox = Checkbutton(self, text="enabled",
 		    variable=self.enabled)
 		self.label.grid(row=0, column=0, sticky=E+W)
 		self.sample.grid(row=0, column=1)
 		self.list.grid(row=0, column=2, sticky=E+W)
 		self.checkbox.grid(row=0, column=3)
 		self.columnconfigure(0, weight=1)
 		self.columnconfigure(2, minsize=150)
 
 	# Option menu callback: select color and repaint the sample square.
 	def setcolor(self, color):
 		self.color.set(color)
 		self.sample.itemconfigure(self.rect, fill=color)
 
 	# Push pending changes to the graph.  A change of the enabled state
 	# takes precedence: the item is either recoloured (shown) or hidden
 	# and the method returns; otherwise a colour change is applied.
 	def apply(self):
 		cchange = 0
 		echange = 0
 		if (self.color_current != self.color.get()):
 			cchange = 1
 		if (self.enabled_current != self.enabled.get()):
 			echange = 1
 		self.color_current = self.color.get()
 		self.enabled_current = self.enabled.get()
 		if (echange != 0):
 			if (self.enabled_current):
 				graph.setcolor(self.name, self.color_current)
 			else:
 				graph.hide(self.name)
 			return
 		if (cchange != 0):
 			graph.setcolor(self.name, self.color_current)
 
 	# Put the widgets back to the values given at construction time;
 	# nothing is sent to the graph until apply() is called.
 	def revert(self):
 		self.setcolor(self.color_default)
 		self.enabled.set(self.enabled_default)
 
 # Fixed size dialog listing one ColorConf row per entry of table for
 # which the graph reports a non-empty colour, with Apply/Revert buttons.
 class ColorConfigure(Toplevel):
 	def __init__(self, table, name):
 		Toplevel.__init__(self)
 		self.resizable(0, 0)
 		self.title(name)
 		self.items = LabelFrame(self, text="Item Type")
 		self.buttons = Frame(self)
 		self.drawbuttons()
 		self.items.grid(row=0, column=0, sticky=E+W)
 		self.columnconfigure(0, weight=1)
 		self.buttons.grid(row=1, column=0, sticky=E+W)
 		self.types = []
 		self.irow = 0
 		for type in table:
 			# type is a (name, colour) pair; only the name is used,
 			# the colour shown is the one the graph currently has.
 			color = graph.getcolor(type[0])
 			if (color != ""):
 				self.additem(type[0], color)
 
 	# Append a ColorConf row for name at the next free grid row.
 	def additem(self, name, color):
 		item = ColorConf(self.items, name, color)
 		self.types.append(item)
 		item.grid(row=self.irow, column=0, sticky=E+W)
 		self.irow += 1
 
 	# Create the Apply and Revert buttons in equally weighted columns.
 	def drawbuttons(self):
 		self.apply = Button(self.buttons, text="Apply",
 		    command=self.apress)
 		self.default = Button(self.buttons, text="Revert",
 		    command=self.rpress)
 		self.apply.grid(row=0, column=0, sticky=E+W)
 		self.default.grid(row=0, column=1, sticky=E+W)
 		self.buttons.columnconfigure(0, weight=1)
 		self.buttons.columnconfigure(1, weight=1)
 
 	# Apply button: apply every row.
 	def apress(self):
 		for item in self.types:
 			item.apply()
 
 	# Revert button: revert every row.
 	def rpress(self):
 		for item in self.types:
 			item.revert()
 
 # One row of the source configuration dialog: the source's name and an
 # "enabled" checkbox.  The row only tracks state; the dialog acts on it.
 class SourceConf(Frame):
 	def __init__(self, master, source):
 		Frame.__init__(self, master)
 		# A source flagged hidden starts out unchecked.
 		enabled = 0 if (source.hidden == 1) else 1
 		self.source = source
 		self.name = source.name
 		self.enabled = IntVar()
 		self.enabled_default = enabled
 		self.enabled_current = enabled
 		self.enabled.set(enabled)
 		self.draw()
 
 	# Build the label and checkbox; the label column takes spare width.
 	def draw(self):
 		self.label = Label(self, text=self.name, anchor=W)
 		self.checkbox = Checkbutton(self, text="enabled",
 		    variable=self.enabled)
 		self.label.grid(row=0, column=0, sticky=E+W)
 		self.checkbox.grid(row=0, column=1)
 		self.columnconfigure(0, weight=1)
 
 	# Return 1 if the checkbox differs from the last applied state, else 0.
 	def changed(self):
 		return 1 if (self.enabled_current != self.enabled.get()) else 0
 
 	# Remember the checkbox state as the applied state.
 	def apply(self):
 		self.enabled_current = self.enabled.get()
 
 	# Restore the checkbox to its state at construction time.
 	def revert(self):
 		self.enabled.set(self.enabled_default)
 
 	# Tick the checkbox.
 	def check(self):
 		self.enabled.set(1)
 
 	# Clear the checkbox.
 	def uncheck(self):
 		self.enabled.set(0)
 
 # Fixed size dialog with one SourceConf row per entry of the global
 # sources list, arranged in "Sources" columns, plus the Check All,
 # Uncheck All, Apply and Revert buttons.
 class SourceConfigure(Toplevel):
 	def __init__(self):
 		Toplevel.__init__(self)
 		self.resizable(0, 0)
 		self.title("Source Configuration")
 		# items holds one LabelFrame per column; the first is made here.
 		self.items = []
 		self.iframe = Frame(self)
 		self.iframe.grid(row=0, column=0, sticky=E+W)
 		f = LabelFrame(self.iframe, bd=4, text="Sources")
 		self.items.append(f)
 		self.buttons = Frame(self)
 		self.items[0].grid(row=0, column=0, sticky=E+W)
 		self.columnconfigure(0, weight=1)
 		self.sconfig = []
 		# Next free row within the current column, and current column.
 		self.irow = 0
 		self.icol = 0
 		for source in sources:
 			self.addsource(source)
 		self.drawbuttons()
 		self.buttons.grid(row=1, column=0, sticky=W)
 
 	# Add a row for source, starting a new column frame once the
 	# current column's row counter has gone past 30.
 	def addsource(self, source):
 		if (self.irow > 30):
 			self.icol += 1
 			self.irow = 0
 			c = self.icol
 			f = LabelFrame(self.iframe, bd=4, text="Sources")
 			f.grid(row=0, column=c, sticky=N+E+W)
 			self.items.append(f)
 		item = SourceConf(self.items[self.icol], source)
 		self.sconfig.append(item)
 		item.grid(row=self.irow, column=0, sticky=E+W)
 		self.irow += 1
 
 	# Create the four buttons in equally weighted columns.
 	def drawbuttons(self):
 		self.apply = Button(self.buttons, text="Apply",
 		    command=self.apress)
 		self.default = Button(self.buttons, text="Revert",
 		    command=self.rpress)
 		self.checkall = Button(self.buttons, text="Check All",
 		    command=self.cpress)
 		self.uncheckall = Button(self.buttons, text="Uncheck All",
 		    command=self.upress)
 		self.checkall.grid(row=0, column=0, sticky=W)
 		self.uncheckall.grid(row=0, column=1, sticky=W)
 		self.apply.grid(row=0, column=2, sticky=W)
 		self.default.grid(row=0, column=3, sticky=W)
 		self.buttons.columnconfigure(0, weight=1)
 		self.buttons.columnconfigure(1, weight=1)
 		self.buttons.columnconfigure(2, weight=1)
 		self.buttons.columnconfigure(3, weight=1)
 
 	# Apply button: collect the sources whose checkbox changed and pass
 	# them to the graph in two batches (hide first, then show), then
 	# record the new state in every row.
 	def apress(self):
 		disable_sources = []
 		enable_sources = []
 		for item in self.sconfig:
 			if (item.changed() == 0):
 				continue
 			if (item.enabled.get() == 1):
 				enable_sources.append(item.source)
 			else:
 				disable_sources.append(item.source)
 
 		if (len(disable_sources)):
 			graph.sourcehidelist(disable_sources)
 		if (len(enable_sources)):
 			graph.sourceshowlist(enable_sources)
 
 		for item in self.sconfig:
 			item.apply()
 
 	# Revert button: restore every checkbox to its initial state.
 	def rpress(self):
 		for item in self.sconfig:
 			item.revert()
 
 	# Check All button.
 	def cpress(self):
 		for item in self.sconfig:
 			item.check()
 
 	# Uncheck All button.
 	def upress(self):
 		for item in self.sconfig:
 			item.uncheck()
 
 # cmp-style comparison on element [1] of two sequences, reversed so
 # that the larger value sorts first.
 def cmp_counts(x, y):
 	(xcount, ycount) = (x[1], y[1])
 	return ycount - xcount
 
 # Fixed size window showing, for one source, a table with one row per
 # event name: name, number of events, total duration, average duration.
 # Rows are ordered by descending event count.
 class SourceStats(Toplevel):
 	def __init__(self, source):
 		self.source = source
 		Toplevel.__init__(self)
 		self.resizable(0, 0)
 		self.title(source.name + " statistics")
 		self.evframe = LabelFrame(self,
 		    text="Event Count, Duration, Avg Duration")
 		self.evframe.grid(row=0, column=0, sticky=E+W)
 		# eventtypes maps event name -> (count, summed duration);
 		# pad events are left out.
 		eventtypes={}
 		for event in self.source.events:
 			if (event.type == "pad"):
 				continue
 			duration = event.duration
 			if (eventtypes.has_key(event.name)):
 				(c, d) = eventtypes[event.name]
 				c += 1
 				d += duration
 				eventtypes[event.name] = (c, d)
 			else:
 				eventtypes[event.name] = (1, duration)
 		# Flatten to (name, count, duration) so cmp_counts can sort on
 		# the count in element [1].
 		events = []
 		for k, v in eventtypes.iteritems():
 			(c, d) = v
 			events.append((k, c, d))
 		events.sort(cmp=cmp_counts)
 
 		ypos = 0
 		for event in events:
 			(name, c, d) = event
 			Label(self.evframe, text=name, bd=1, 
 			    relief=SUNKEN, anchor=W, width=30).grid(
 			    row=ypos, column=0, sticky=W+E)
 			Label(self.evframe, text=str(c), bd=1,
 			    relief=SUNKEN, anchor=W, width=10).grid(
 			    row=ypos, column=1, sticky=W+E)
 			Label(self.evframe, text=ticks2sec(d),
 			    bd=1, relief=SUNKEN, width=10).grid(
 			    row=ypos, column=2, sticky=W+E)
 			# Average duration per event; 0 when either the total
 			# or the count is zero.
 			if (d and c):
 				d /= c
 			else:
 				d = 0
 			Label(self.evframe, text=ticks2sec(d),
 			    bd=1, relief=SUNKEN, width=10).grid(
 			    row=ypos, column=3, sticky=W+E)
 			ypos += 1
 
 
 # Context menu for one source, offering "hide", "hide group" and
 # "stats".  It is popped up near the pointer position of the mouse
 # event as soon as it is constructed.
 class SourceContext(Menu):
 	def __init__(self, event, source):
 		self.source = source
 		Menu.__init__(self, tearoff=0, takefocus=0)
 		self.add_command(label="hide", command=self.hide)
 		self.add_command(label="hide group", command=self.hidegroup)
 		self.add_command(label="stats", command=self.stats)
 		popup_x = event.x_root-3
 		popup_y = event.y_root+3
 		self.tk_popup(popup_x, popup_y)
 
 	# Hide only this source.
 	def hide(self):
 		graph.sourcehide(self.source)
 
 	# Hide every source that is in the same group as this one.
 	def hidegroup(self):
 		grouplist = [member for member in sources
 		    if (member.group == self.source.group)]
 		graph.sourcehidelist(grouplist)
 
 	# Show this source again.
 	def show(self):
 		graph.sourceshow(self.source)
 
 	# Open a statistics window for this source.
 	def stats(self):
 		SourceStats(self.source)
 
 # Fixed size window showing the labels of one event, with buttons to
 # step to the previous/next event of the same source or to open a second
 # window.  While an event is shown the window holds a display reference
 # on it (see Event.displayref/displayunref).
 class EventView(Toplevel):
 	def __init__(self, event, canvas):
 		Toplevel.__init__(self)
 		self.resizable(0, 0)
 		self.title("Event")
 		self.event = event
 		self.buttons = Frame(self)
 		self.buttons.grid(row=0, column=0, sticky=E+W)
 		self.frame = Frame(self)
 		self.frame.grid(row=1, column=0, sticky=N+S+E+W)
 		self.canvas = canvas
 		self.drawlabels()
 		self.drawbuttons()
 		event.displayref(canvas)
 		self.bind("<Destroy>", self.destroycb)
 
 	# <Destroy> handler: unbind itself, release the display reference
 	# on the current event (once only) and destroy the window.
 	def destroycb(self, event):
 		self.unbind("<Destroy>")
 		if (self.event != None):
 			self.event.displayunref(self.canvas)
 			self.event = None
 		self.destroy()
 
 	# Remove every label currently gridded in the label frame.
 	def clearlabels(self):
 		for label in self.frame.grid_slaves():
 			label.grid_remove()
 
 	# Draw one name/value row per label of the current event, padded
 	# with empty rows to at least 7.  The value of a "linkedto" label is
 	# drawn in blue and jumps to the linked event when clicked.
 	def drawlabels(self):
 		ypos = 0
 		labels = self.event.labels()
 		while (len(labels) < 7):
 			labels.append(("", ""))
 		for label in labels:
 			name, value = label
 			linked = 0
 			if (name == "linkedto"):
 				linked = 1
 			l = Label(self.frame, text=name, bd=1, width=15,
 			    relief=SUNKEN, anchor=W)
 			if (linked):
 				fgcolor = "blue"
 			else:
 				fgcolor = "black"
 			r = Label(self.frame, text=value, bd=1,
 			    relief=SUNKEN, anchor=W, fg=fgcolor)
 			l.grid(row=ypos, column=0, sticky=E+W)
 			r.grid(row=ypos, column=1, sticky=E+W)
 			if (linked):
 				r.bind("<Button-1>", self.linkpress)
 			ypos += 1
 		self.frame.columnconfigure(1, minsize=80)
 
 	# Create the "<", ">" and "new" buttons.
 	def drawbuttons(self):
 		self.back = Button(self.buttons, text="<", command=self.bpress)
 		self.forw = Button(self.buttons, text=">", command=self.fpress)
 		self.new = Button(self.buttons, text="new", command=self.npress)
 		self.back.grid(row=0, column=0, sticky=E+W)
 		self.forw.grid(row=0, column=1, sticky=E+W)
 		self.new.grid(row=0, column=2, sticky=E+W)
 		self.buttons.columnconfigure(2, weight=1)
 
 	# Switch the window to another event: move the display reference
 	# from the old event to the new one and redraw the labels.
 	def newevent(self, event):
 		self.event.displayunref(self.canvas)
 		self.clearlabels()
 		self.event = event
 		self.event.displayref(self.canvas)
 		self.drawlabels()
 
 	# "new" button: open another window on the same event.
 	def npress(self):
 		EventView(self.event, self.canvas)
 
 	# "<" button: go to the closest earlier event that is not a pad
 	# event; do nothing if there is none.
 	def bpress(self):
 		prev = self.event.prev()
 		if (prev == None):
 			return
 		while (prev.type == "pad"):
 			prev = prev.prev()
 			if (prev == None):
 				return
 		self.newevent(prev)
 
 	# ">" button: go to the closest later event that is not a pad
 	# event; do nothing if there is none.
 	def fpress(self):
 		next = self.event.next()
 		if (next == None):
 			return
 		while (next.type == "pad"):
 			next = next.next()
 			if (next == None):
 				return
 		self.newevent(next)
 
 	# Click on a "linkedto" value: show the linked event if one exists.
 	def linkpress(self, wevent):
 		event = self.event.getlinked()
 		if (event != None):
 			self.newevent(event)
 
 # Base class for everything drawn on a source's row.  An event knows the
 # source it belongs to, its name, cpu, integer timestamp and a list of
 # (name, value) attributes.  idx is the event's position in the source's
 # event list (assigned by EventSource.draw) and item is the canvas item
 # that represents it, or None when nothing is drawn.
 class Event:
 	def __init__(self, source, name, cpu, timestamp, attrs):
 		self.source = source
 		self.name = name
 		self.cpu = cpu
 		self.timestamp = int(timestamp)
 		self.attrs = attrs
 		self.idx = None
 		self.item = None
 		# Number of outstanding displayref() calls.
 		self.dispcnt = 0
 		self.duration = 0
 		# Value of the global lineno at construction time.
 		self.recno = lineno
 
 	# Write a one line description of the event to the status bar.
 	def status(self):
 		statstr = self.name + " " + self.source.name
 		statstr += " on: cpu" + str(self.cpu)
 		statstr += " at: " + str(self.timestamp)
 		statstr += " attributes: "
 		for i in range(0, len(self.attrs)):
 			attr = self.attrs[i]
 			statstr += attr[0] + ": " + str(attr[1])
 			if (i != len(self.attrs) - 1):
 				statstr += ", "
 		status.set(statstr)
 
 	# Return the (name, value) rows shown by EventView: five fixed rows
 	# followed by the event's own attributes.
 	def labels(self):
 		return [("Source", self.source.name),
 			("Event", self.name),
 			("CPU", self.cpu),
 			("Timestamp", self.timestamp),
 			("KTR Line ", self.recno)
 		] + self.attrs
 
 	def mouseenter(self, canvas):
 		self.displayref(canvas)
 		self.status()
 
 	def mouseexit(self, canvas):
 		self.displayunref(canvas)
 		status.clear()
 
 	def mousepress(self, canvas):
 		EventView(self, canvas)
 
 	# Remember the canvas item drawn by a subclass and register this
 	# event in canvas.items so the item can be mapped back to it.
 	def draw(self, canvas, xpos, ypos, item):
 		self.item = item
 		if (item != None):
 			canvas.items[item] = self
 
 	# Shift the event's canvas item, if it has one, by (x, y).
 	def move(self, canvas, x, y):
 		if (self.item == None):
 			return
 		canvas.move(self.item, x, y)
 
 	# Following event of the same source, or None.
 	def next(self):
 		return self.source.eventat(self.idx + 1)
 
 	# Closest following event whose type equals type, or None.
 	def nexttype(self, type):
 		next = self.next()
 		while (next != None and next.type != type):
 			next = next.next()
 		return (next)
 
 	# Preceding event of the same source, or None.
 	def prev(self):
 		return self.source.eventat(self.idx - 1)
 
 	# Take a display reference; the first one gives the item an outline
 	# of width 2.
 	def displayref(self, canvas):
 		if (self.dispcnt == 0):
 			canvas.itemconfigure(self.item, width=2)
 		self.dispcnt += 1
 
 	# Drop a display reference; the last one removes the outline and
 	# raises "point" items above "state" items again.
 	def displayunref(self, canvas):
 		self.dispcnt -= 1
 		if (self.dispcnt == 0):
 			canvas.itemconfigure(self.item, width=0)
 			canvas.tag_raise("point", "state")
 
 	# Follow the first "linkedto" attribute: look its value up as a
 	# source id and return that source's first non-pad event at or
 	# after this event's timestamp.  Returns None when the event has no
 	# "linkedto" attribute, when the id names no known source, or when
 	# the source has no such event.
 	def getlinked(self):
 		for attr in self.attrs:
 			if (attr[0] != "linkedto"):
 				continue
 			source = ktrfile.findid(attr[1])
 			# findid() returns None for an unknown id.
 			if (source is None):
 				return None
 			return source.findevent(self.timestamp)
 		return None
 
 # Instantaneous event, drawn as a filled oval centred on its x position.
 class PointEvent(Event):
 	type = "point"
 	def __init__(self, source, name, cpu, timestamp, attrs):
 		Event.__init__(self, source, name, cpu, timestamp, attrs)
 
 	# Draw the oval, XY_POINT wide on either side of xpos and
 	# 2 * XY_POINT tall above ypos.  A point takes no horizontal space,
 	# so xpos is returned unchanged.
 	def draw(self, canvas, xpos, ypos):
 		fill = colormap.lookup(self.name)
 		tags = ("event", self.type, self.name, self.source.tag)
 		left = xpos - XY_POINT
 		right = xpos + XY_POINT
 		top = ypos - (XY_POINT * 2)
 		oval = canvas.create_oval(left, ypos, right, top,
 		    fill=fill, width=0, tags=tags)
 		Event.draw(self, canvas, xpos, ypos, oval)
 		return xpos
 
 class StateEvent(Event):
 	type = "state"
 	def __init__(self, source, name, cpu, timestamp, attrs):
 		Event.__init__(self, source, name, cpu, timestamp, attrs)
 
 	def draw(self, canvas, xpos, ypos):
 		next = self.nexttype("state")
 		if (next == None):
 			return (xpos)
 		self.duration = duration = next.timestamp - self.timestamp
 		self.attrs.insert(0, ("duration", ticks2sec(duration)))
 		color = colormap.lookup(self.name)
 		if (duration < 0):
 			duration = 0
 			print "Unsynchronized timestamp"
 			print self.cpu, self.timestamp
 			print next.cpu, next.timestamp
 		delta = duration / canvas.ratio
 		l = canvas.create_rectangle(xpos, ypos,
 		    xpos + delta, ypos - 10, fill=color, width=0,
 		    tags=("event", self.type, self.name, self.source.tag))
 		Event.draw(self, canvas, xpos, ypos, l)
 
 		return (xpos + delta)
 
 class CountEvent(Event):
 	type = "count"
 	def __init__(self, source, count, cpu, timestamp, attrs):
 		count = int(count)
 		self.count = count
 		Event.__init__(self, source, "count", cpu, timestamp, attrs)
 
 	def draw(self, canvas, xpos, ypos):
 		next = self.nexttype("count")
 		if (next == None):
 			return (xpos)
 		color = colormap.lookup("count")
 		self.duration = duration = next.timestamp - self.timestamp
 		if (duration < 0):
 			duration = 0
 			print "Unsynchronized timestamp"
 			print self.cpu, self.timestamp
 			print next.cpu, next.timestamp
 		self.attrs.insert(0, ("count", self.count))
 		self.attrs.insert(1, ("duration", ticks2sec(duration)))
 		delta = duration / canvas.ratio
 		yhight = self.source.yscale() * self.count
 		l = canvas.create_rectangle(xpos, ypos - yhight,
 		    xpos + delta, ypos, fill=color, width=0,
 		    tags=("event", self.type, self.name, self.source.tag))
 		Event.draw(self, canvas, xpos, ypos, l)
 		return (xpos + delta)
 
 # Invisible filler event.  It takes up horizontal space up to the next
 # event but creates no canvas item.
 class PadEvent(StateEvent):
 	type = "pad"
 	# The cpu argument is ignored: the pad copies the cpu of the
 	# source's last event when last is set, of its first event otherwise.
 	def __init__(self, source, cpu, timestamp, last=0):
 		neighbour = source.events[-1] if (last) else source.events[0]
 		StateEvent.__init__(self, source, "pad", neighbour.cpu,
 		    timestamp, [])
 
 	# Advance xpos by the distance to the next event of any type.
 	def draw(self, canvas, xpos, ypos):
 		succ = self.next()
 		if (succ == None):
 			return (xpos)
 		span = (succ.timestamp - self.timestamp) / canvas.ratio
 		Event.draw(self, canvas, xpos, ypos, None)
 		return (xpos + span)
 
 # cmp-style comparison ordering two objects by ascending .y attribute.
 def source_cmp_start(x, y):
 	delta = x.y - y.y
 	return delta
 
 # One row of the display: a named source inside a group, holding its
 # events and the canvas items drawn for it.  All of the source's canvas
 # items carry the tag group + id.
 class EventSource:
 	def __init__(self, group, id):
 		self.name = id
 		self.events = []
 		# Canvas items of the per-cpu background rectangles.
 		self.cpuitems = []
 		self.group = group
 		# y position recorded by drawname().
 		self.y = 0
 		# Canvas text item created by drawname().
 		self.item = None
 		self.hidden = 0
 		self.tag = group + id
 
 	# Order sources by group first and by name within a group.
 	def __cmp__(self, other):
 		if (other == None):
 			return -1
 		if (self.group == other.group):
 			return cmp(self.name, other.name)
 		return cmp(self.group, other.group)
 
 	# It is much faster to append items to a list than to insert them
 	# at the beginning.  As a result, we add events in reverse order
 	# and then swap the list during fixup.
 	def fixup(self):
 		self.events.reverse()
 
 	# Append event to the (still reversed) event list.
 	def addevent(self, event):
 		self.events.append(event)
 
 	# Insert event at index 0 of the (still reversed) event list.
 	def addlastevent(self, event):
 		self.events.insert(0, event)
 
 	# Draw every event left to right starting at x = 10, and emit one
 	# cpu background rectangle for each run of events on the same cpu.
 	def draw(self, canvas, ypos):
 		xpos = 10
 		cpux = 10
 		# Start from the cpu of the second event in the list.
 		cpu = self.events[1].cpu
 		for i in range(0, len(self.events)):
 			self.events[i].idx = i
 		for event in self.events:
 			# A cpu of -1 does not start a new background run.
 			if (event.cpu != cpu and event.cpu != -1):
 				self.drawcpu(canvas, cpu, cpux, xpos, ypos)
 				cpux = xpos
 				cpu = event.cpu
 			xpos = event.draw(canvas, xpos, ypos)
 		self.drawcpu(canvas, cpu, cpux, xpos, ypos)
 
 	# Draw the source's name, vertically centred in its row, record the
 	# row's y position and return the created text item.
 	def drawname(self, canvas, ypos):
 		self.y = ypos
 		ypos = ypos - (self.ysize() / 2)
 		self.item = canvas.create_text(X_BORDER, ypos, anchor="w",
 		    text=self.name)
 		return (self.item)
 
 	# Create the background rectangle for one run of events on cpu,
 	# spanning fromx..tox and the full height of the row and borders.
 	def drawcpu(self, canvas, cpu, fromx, tox, ypos):
 		cpu = "CPU " + str(cpu)
 		color = cpucolormap.lookup(cpu)
 		# Create the cpu background colors default to hidden
 		l = canvas.create_rectangle(fromx,
 		    ypos - self.ysize() - canvas.bdheight,
 		    tox, ypos + canvas.bdheight, fill=color, width=0,
 		    tags=("cpubg", cpu, self.tag), state="hidden")
 		self.cpuitems.append(l)
 
 	# Move every canvas item tagged with this source by (xpos, ypos).
 	def move(self, canvas, xpos, ypos):
 		canvas.move(self.tag, xpos, ypos)
 
 	# Move the name item and update the recorded y position.
 	def movename(self, canvas, xpos, ypos):
 		self.y += ypos
 		canvas.move(self.item, xpos, ypos)
 
 	# Height of the row.
 	def ysize(self):
 		return (Y_EVENTSOURCE)
 
 	# Return the event at index i, or None when i is out of range.
 	def eventat(self, i):
-		if (i >= len(self.events)):
+		if (i >= len(self.events) or i < 0):
 			return (None)
 		event = self.events[i]
 		return (event)
 
 	# Return the first non-pad event in list order whose timestamp is
 	# at or after timestamp, or None.
 	def findevent(self, timestamp):
 		for event in self.events:
 			if (event.timestamp >= timestamp and event.type != "pad"):
 				return (event)
 		return (None)
 
 # Event source whose events are counter samples.  Rows are Y_COUNTER
 # high and bars are scaled against the largest count seen in the
 # counter's group.
 class Counter(EventSource):
 	#
 	# Store a hash of counter groups that keeps the max value
 	# for a counter in this group for scaling purposes.
 	#
 	groups = {}
 	def __init__(self, group, id):
 		# Register the group with a maximum of 0 the first time it is
 		# seen.  For a known group the current maximum is copied to
 		# Counter.cnt, as before.
 		if (group in Counter.groups):
 			Counter.cnt = Counter.groups[group]
 		else:
 			Counter.groups[group] = 0
 		EventSource.__init__(self, group, id)
 
 	# Raise the group's maximum to the largest count among this
 	# counter's "count" events, then do the generic fixup.
 	def fixup(self):
 		for event in self.events:
 			if (event.type != "count"):
 				continue
 			count = int(event.count)
 			if (count > Counter.groups[self.group]):
 				Counter.groups[self.group] = count
 		EventSource.fixup(self)
 
 	# Largest count recorded for this counter's group.
 	def ymax(self):
 		return (Counter.groups[self.group])
 
 	# Height of the row.
 	def ysize(self):
 		return (Y_COUNTER)
 
 	# Row height per unit of count.  Computed in floating point so the
 	# scale does not truncate to 0 when the maximum exceeds the row
 	# height, and 0 is returned for a group whose maximum is still 0
 	# instead of dividing by zero.
 	def yscale(self):
 		peak = self.ymax()
 		if (peak == 0):
 			return (0)
 		return (float(self.ysize()) / peak)
 
 class KTRFile:
 	def __init__(self, file):
 		self.timestamp_f = None
 		self.timestamp_l = None
 		self.locks = {}
-		self.callwheels = {}
 		self.ticks = {}
 		self.load = {}
 		self.crit = {}
 		self.stathz = 0
 		self.eventcnt = 0
 		self.taghash = {}
 
 		self.parse(file)
 		self.fixup()
 		global ticksps
 		ticksps = self.ticksps()
 		span = self.timespan()
 		ghz = float(ticksps) / 1000000000.0
 		#
 		# Update the title with some stats from the file
 		#
 		titlestr = "SchedGraph: "
 		titlestr += ticks2sec(span) + " at %.3f ghz, " % ghz
 		titlestr += str(len(sources)) + " event sources, "
 		titlestr += str(self.eventcnt) + " events"
 		root.title(titlestr)
 
 	def parse(self, file):
 		try:
 			ifp = open(file)
 		except:
 			print "Can't open", file
 			sys.exit(1)
 
 		# quoteexp matches a quoted string, no escaping
 		quoteexp = "\"([^\"]*)\""
 
 		#
 		# commaexp matches a quoted string OR the string up
 		# to the first ','
 		#
 		commaexp = "(?:" + quoteexp + "|([^,]+))"
 
 		#
 		# colonstr matches a quoted string OR the string up
 		# to the first ':'
 		#
 		colonexp = "(?:" + quoteexp + "|([^:]+))"
 
 		#
 		# Match various manditory parts of the KTR string this is
 		# fairly inflexible until you get to attributes to make
 		# parsing faster.
 		#
 		hdrexp = "\s*(\d+)\s+(\d+)\s+(\d+)\s+"
 		groupexp = "KTRGRAPH group:" + quoteexp + ", "
 		idexp = "id:" + quoteexp + ", "
 		typeexp = "([^:]+):" + commaexp + ", "
 		attribexp = "attributes: (.*)"
 
 		#
 		# Matches optional attributes in the KTR string.  This
 		# tolerates more variance as the users supply these values.
 		#
 		attrexp = colonexp + "\s*:\s*(?:" + commaexp + ", (.*)|"
 		attrexp += quoteexp +"|(.*))"
 
 		# Precompile regexp
 		ktrre = re.compile(hdrexp + groupexp + idexp + typeexp + attribexp)
 		attrre = re.compile(attrexp)
 
 		global lineno
 		lineno = 0
 		for line in ifp.readlines():
 			lineno += 1
 			if ((lineno % 2048) == 0):
 				status.startup("Parsing line " + str(lineno))
 			m = ktrre.match(line);
 			if (m == None):
 				print "Can't parse", lineno, line,
 				continue;
 			(index, cpu, timestamp, group, id, type, dat, dat1, attrstring) = m.groups();
 			if (dat == None):
 				dat = dat1
 			if (self.checkstamp(timestamp) == 0):
 				print "Bad timestamp at", lineno, ":",
 				print cpu, timestamp 
 				continue
 			#
 			# Build the table of optional attributes
 			#
 			attrs = []
 			while (attrstring != None):
 				m = attrre.match(attrstring.strip())
 				if (m == None):
 					break;
 				#
 				# Name may or may not be quoted.
 				#
 				# For val we have four cases:
 				# 1) quotes followed by comma and more
 				#    attributes.
 				# 2) no quotes followed by comma and more
 				#    attributes.
 				# 3) no more attributes or comma with quotes.
 				# 4) no more attributes or comma without quotes.
 				#
 				(name, name1, val, val1, attrstring, end, end1) = m.groups();
 				if (name == None):
 					name = name1
 				if (end == None):
 					end = end1
 				if (val == None):
 					val = val1
 				if (val == None):
 					val = end
 				if (name == "stathz"):
 					self.setstathz(val, cpu)
 				attrs.append((name, val))
 			args = (dat, cpu, timestamp, attrs)
 			e = self.makeevent(group, id, type, args)
 			if (e == None):
 				print "Unknown type", type, lineno, line,
 
 	def makeevent(self, group, id, type, args):
 		e = None
 		source = self.makeid(group, id, type)
 		if (type == "state"):
 			e = StateEvent(source, *args)
 		elif (type == "counter"):
 			e = CountEvent(source, *args)
 		elif (type == "point"):
 			e = PointEvent(source, *args)
 		if (e != None):
 			self.eventcnt += 1
 			source.addevent(e);
 		return e
 
 	def setstathz(self, val, cpu):
 		self.stathz = int(val)
 		cpu = int(cpu)
 		try:
 			ticks = self.ticks[cpu]
 		except:
 			self.ticks[cpu] = 0
 		self.ticks[cpu] += 1
 
 	def checkstamp(self, timestamp):
 		timestamp = int(timestamp)
 		if (self.timestamp_f == None):
 			self.timestamp_f = timestamp;
 		if (self.timestamp_l != None and
 		    timestamp -2048> self.timestamp_l):
 			return (0)
 		self.timestamp_l = timestamp;
 		return (1)
 
 	def makeid(self, group, id, type):
 		tag = group + id
 		if (self.taghash.has_key(tag)):
 			return self.taghash[tag]
 		if (type == "counter"):
 			source = Counter(group, id)
 		else:
 			source = EventSource(group, id)
 		sources.append(source)
 		self.taghash[tag] = source
 		return (source)
 
 	def findid(self, id):
 		for source in sources:
 			if (source.name == id):
 				return source
 		return (None)
 
 	def timespan(self):
 		return (self.timestamp_f - self.timestamp_l);
 
 	def ticksps(self):
 		oneghz = 1000000000
 		# Use user supplied clock first
 		if (clockfreq != None):
 			return int(clockfreq * oneghz)
 
 		# Check for a discovered clock
 		if (self.stathz != 0):
 			return (self.timespan() / self.ticks[0]) * int(self.stathz)
 		# Pretend we have a 1ns clock
 		print "WARNING: No clock discovered and no frequency ",
 		print "specified via the command line."
 		print "Using fake 1ghz clock"
 		return (oneghz);
 
 	def fixup(self):
 		for source in sources:
 			e = PadEvent(source, -1, self.timestamp_l)
 			source.addevent(e)
 			e = PadEvent(source, -1, self.timestamp_f, last=1)
 			source.addlastevent(e)
 			source.fixup()
 		sources.sort()
 
 # Canvas, 120 units wide, that lists the name of every source next to
 # the main display, with a separator line above each row and one below
 # the last.  Mouse events are forwarded to handlers on the master widget.
 class SchedNames(Canvas):
 	def __init__(self, master, display):
 		self.display = display
 		self.parent = master
 		self.bdheight = master.bdheight
 		# Maps the text item of each name to its source.
 		self.items = {}
 		self.ysize = 0
 		# Separator line items, one per source.
 		self.lines = []
 		Canvas.__init__(self, master, width=120,
 		    height=display["height"], bg='grey',
 		    scrollregion=(0, 0, 50, 100))
 
 	# Move the first separator line whose starting y coordinate equals
 	# cur_y by y units vertically.
 	def moveline(self, cur_y, y):
 		for line in self.lines:
 			(x0, y0, x1, y1) = self.coords(line)
 			if (cur_y != y0):
 				continue
 			self.move(line, 0, y)
 			return
 
 	# Draw the separator and name of every source in the global sources
 	# list, top to bottom, and install the mouse bindings.
 	def draw(self):
 		status.startup("Drawing names")
 		ypos = 0
 		self.configure(scrollregion=(0, 0,
 		    self["width"], self.display.ysize()))
 		for source in sources:
 			l = self.create_line(0, ypos, self["width"], ypos,
 			    width=1, fill="black", tags=("all","sources"))
 			self.lines.append(l)
 			# Each row is the source's height with a border of
 			# bdheight above and below it.
 			ypos += self.bdheight
 			ypos += source.ysize()
 			t = source.drawname(self, ypos)
 			self.items[t] = source
 			ypos += self.bdheight
 		self.ysize = ypos
 		self.create_line(0, ypos, self["width"], ypos,
 		    width=1, fill="black", tags=("all",))
 		self.bind("<Button-1>", self.master.mousepress);
 		self.bind("<Button-3>", self.master.mousepressright);
 		self.bind("<ButtonRelease-1>", self.master.mouserelease);
 		self.bind("<B1-Motion>", self.master.mousemotion);
 
 	# Resize the scroll region to the display's current height.
 	def updatescroll(self):
 		self.configure(scrollregion=(0, 0,
 		    self["width"], self.display.ysize()))
 
 
 class SchedDisplay(Canvas):
 	# Canvas on which the sources themselves are drawn.  self.ratio is
 	# the divisor applied to the file's timespan when computing the
 	# canvas width.
 	def __init__(self, master):
 		self.parent = master
 		self.bdheight = master.bdheight
 		self.ratio = 1
 		self.lines = []
 		self.items = {}
 		Canvas.__init__(self, master, width=800, height=500, bg='grey',
 		     scrollregion=(0, 0, 800, 500))
 
 	def prepare(self):
 		#
 		# Compute a ratio to ensure that the file's timespan fits into
 		# 2^31.  Although python may handle larger values for X
 		# values, the Tk internals do not.
 		#
 		span = ktrfile.timespan()
 		self.ratio = (span - 1) / 2**31 + 1
 
 	def draw(self):
 		# Draw each source below a divider line, fix up the stacking
 		# order and install the mouse bindings.
 		xmax = self.xsize()
 		ypos = 0
 		for source in sources:
 			status.startup("Drawing " + source.name)
 			divider = self.create_line(0, ypos, xmax, ypos,
 			    width=1, fill="black", tags=("all",))
 			self.lines.append(divider)
 			ypos += self.bdheight
 			ypos += source.ysize()
 			source.draw(self, ypos)
 			ypos += self.bdheight
 		self.tag_raise("point", "state")
 		self.tag_lower("cpubg", ALL)
 		self.create_line(0, ypos, xmax, ypos,
 		    width=1, fill="black", tags=("lines",))
 		self.tag_bind("event", "<Enter>", self.mouseenter)
 		self.tag_bind("event", "<Leave>", self.mouseexit)
 		for (sequence, handler) in (
 		    ("<Button-1>", self.mousepress),
 		    ("<Button-3>", self.master.mousepressright),
 		    ("<Button-4>", self.wheelup),
 		    ("<Button-5>", self.wheeldown),
 		    ("<ButtonRelease-1>", self.master.mouserelease),
 		    ("<B1-Motion>", self.master.mousemotion)):
 			self.bind(sequence, handler)
 
 	def moveline(self, cur_y, y):
 		# Move the first divider whose starting y coordinate equals
 		# cur_y by y; do nothing when no divider matches.
 		for divider in self.lines:
 			(x0, y0, x1, y1) = self.coords(divider)
 			if (cur_y == y0):
 				self.move(divider, 0, y)
 				return
 
 	def mouseenter(self, event):
 		# Forward to the object registered for the current item.
 		(item,) = self.find_withtag(CURRENT)
 		self.items[item].mouseenter(self)
 
 	def mouseexit(self, event):
 		# Forward to the object registered for the current item.
 		(item,) = self.find_withtag(CURRENT)
 		self.items[item].mouseexit(self)
 
 	def mousepress(self, event):
 		# Only items tagged "event" handle the press themselves;
 		# everything else is left to the master window.
 		under = self.find_withtag(CURRENT)
 		if (len(under) != 0):
 			item = under[0]
 			if ("event" in self.gettags(item)):
 				self.items[item].mousepress(self)
 				return
 		self.master.mousepress(event)
 
 	def wheeldown(self, event):
 		self.parent.display_yview("scroll", 1, "units")
 
 	def wheelup(self, event):
 		self.parent.display_yview("scroll", -1, "units")
 
 	def xsize(self):
 		# Scaled timespan plus a border on either side.
 		scaled = ktrfile.timespan() / self.ratio
 		return (scaled + (X_BORDER * 2))
 
 	def ysize(self):
 		# Total height of the sources that are not hidden.
 		total = 0
 		for source in sources:
 			if (source.hidden != 1):
 				total += self.parent.sourcesize(source)
 		return total
 
 	def scaleset(self, ratio):
 		# Switch to a new ratio, rescaling every item horizontally
 		# and keeping the previous view midpoint centred.
 		if (ktrfile == None):
 			return
 		previous = self.ratio
 		left, right = self.xview()
 		midpoint = left + ((right - left) / 2)
 
 		self.ratio = ratio
 		self.updatescroll()
 		self.scale(ALL, 0, 0, float(previous) / ratio, 1)
 
 		left, right = self.xview()
 		half = (right - left) / 2
 		self.xview_moveto(midpoint - half)
 
 	def updatescroll(self):
 		region = (0, 0, self.xsize(), self.ysize())
 		self.configure(scrollregion=region)
 
 	def scaleget(self):
 		return self.ratio
 
 	def getcolor(self, tag):
 		return self.itemcget(tag, "fill")
 
 	def getstate(self, tag):
 		return self.itemcget(tag, "state")
 
 	def setcolor(self, tag, color):
 		# Recolouring also makes the tagged items visible again.
 		self.itemconfigure(tag, state="normal", fill=color)
 
 	def hide(self, tag):
 		self.itemconfigure(tag, state="hidden")
 
 
 class GraphMenu(Frame):
 	# Menu bar with a single "Configure" menu whose entries open the
 	# colour and source configuration dialogs.
 	def __init__(self, master):
 		Frame.__init__(self, master, bd=2, relief=RAISED)
 		self.conf = Menubutton(self, text="Configure")
 		self.confmenu = Menu(self.conf, tearoff=0)
 		for (label, action) in (
 		    ("Event Colors", self.econf),
 		    ("CPU Colors", self.cconf),
 		    ("Source Configure", self.sconf)):
 			self.confmenu.add_command(label=label, command=action)
 		self.conf["menu"] = self.confmenu
 		self.conf.pack(side=LEFT)
 
 	def econf(self):
 		# Open the event colour dialog.
 		ColorConfigure(eventcolors, "Event Display Configuration")
 
 	def cconf(self):
 		# Open the CPU background colour dialog.
 		ColorConfigure(cpucolors, "CPU Background Colors")
 
 	def sconf(self):
 		# Open the source configuration dialog.
 		SourceConfigure()
 
 
 class SchedGraph(Frame):
 	# Top-level frame.  It owns the menu, the names canvas, the display
 	# canvas, the scaler and the status bar, and it controls the
 	# vertical layout (ordering, hiding and showing) of the sources.
 	def __init__(self, master):
 		Frame.__init__(self, master)
 		self.menu = None
 		self.names = None
 		self.display = None
 		self.scale = None
 		self.status = None
 		self.bdheight = Y_BORDER
 		# Source under the pointer when button 1 went down, and the
 		# last source the pointer was dragged over.
 		self.clicksource = None
 		self.lastsource = None
 		self.pack(expand=1, fill="both")
 		self.buildwidgets()
 		self.layout()
 
 	def buildwidgets(self):
 		# Create the child widgets and wire up the scrollbars.  The
 		# status bar is also published through the global 'status'.
 		global status
 		self.menu = GraphMenu(self)
 		self.display = SchedDisplay(self)
 		self.names = SchedNames(self, self.display)
 		self.scale = Scaler(self, self.display)
 		status = self.status = Status(self)
 		self.scrollY = Scrollbar(self, orient="vertical",
 		    command=self.display_yview)
 		self.display.scrollX = Scrollbar(self, orient="horizontal",
 		    command=self.display.xview)
 		self.display["xscrollcommand"] = self.display.scrollX.set
 		self.display["yscrollcommand"] = self.scrollY.set
 		self.names["yscrollcommand"] = self.scrollY.set
 
 	def layout(self):
 		# Grid the widgets; only the display cell (row 1, column 1)
 		# absorbs extra space.
 		self.columnconfigure(1, weight=1)
 		self.rowconfigure(1, weight=1)
 		self.menu.grid(row=0, column=0, columnspan=3, sticky=E+W)
 		self.names.grid(row=1, column=0, sticky=N+S)
 		self.display.grid(row=1, column=1, sticky=W+E+N+S)
 		self.scrollY.grid(row=1, column=2, sticky=N+S)
 		self.display.scrollX.grid(row=2, column=0, columnspan=2,
 		    sticky=E+W)
 		self.scale.grid(row=3, column=0, columnspan=3, sticky=E+W)
 		self.status.grid(row=4, column=0, columnspan=3, sticky=E+W)
 
 	def draw(self):
 		# Draw names and display, then initialise the scaler.
 		self.master.update()
 		self.display.prepare()
 		self.names.draw()
 		self.display.draw()
 		self.status.startup("")
 		#
 		# Configure scale related values
 		#
 		scalemax = ktrfile.timespan() / int(self.display["width"])
 		width = int(root.geometry().split('x')[0])
 		# Width of the window that is not taken by the display.
 		self.constwidth = width - int(self.display["width"])
 		self.scale.setmax(scalemax)
 		self.scale.set(scalemax)
 		self.display.xview_moveto(0)
 		self.bind("<Configure>", self.resize)
 
 	def mousepress(self, event):
 		# Remember which source (if any) the drag started on.
 		self.clicksource = self.sourceat(event.y)
 
 	def mousepressright(self, event):
 		# Open the context menu for the source under the pointer.
 		source = self.sourceat(event.y)
 		if (source == None):
 			return
 		SourceContext(event, source)
 
 	def mouserelease(self, event):
 		# Finish a drag: swap the clicked source with the one the
 		# button was released on.
 		if (self.clicksource == None):
 			return
 		newsource = self.sourceat(event.y)
 		#
 		# sourceat() returns None when the button is released off
 		# of every source; sourceswap() cannot handle that, so only
 		# swap when there really is a different source to swap with.
 		#
 		if (newsource != None and self.clicksource != newsource):
 			self.sourceswap(self.clicksource, newsource)
 		self.clicksource = None
 		self.lastsource = None
 
 	def mousemotion(self, event):
 		if (self.clicksource == None):
 			return
 		newsource = self.sourceat(event.y)
 		#
 		# If we get a None source they moved off the page.
 		# swapsource() can't handle moving multiple items so just
 		# pretend we never clicked on anything to begin with so the
 		# user can't mouseover a non-contiguous area.
 		#
 		if (newsource == None):
 			self.clicksource = None
 			self.lastsource = None
 			return
 		if (newsource == self.lastsource):
 			return;
 		self.lastsource = newsource
 		if (newsource != self.clicksource):
 			self.sourceswap(self.clicksource, newsource)
 
 	# These are here because this object controls layout
 	def sourcestart(self, source):
 		# Y coordinate where the source's slot begins.
 		return source.y - self.bdheight - source.ysize()
 
 	def sourceend(self, source):
 		# Y coordinate where the source's slot ends.
 		return source.y + self.bdheight
 
 	def sourcesize(self, source):
 		# Height of the slot: the source plus a border on each side.
 		return (self.bdheight * 2) + source.ysize()
 
 	def sourceswap(self, source1, source2):
 		# Exchange the positions of two vertically adjacent sources.
 		# Sort so we always know which one is on top.
 		if (source2.y < source1.y):
 			swap = source1
 			source1 = source2
 			source2 = swap
 		# Only swap adjacent sources
 		if (self.sourceend(source1) != self.sourcestart(source2)):
 			return
 		# Compute start coordinates and target coordinates
 		y1 = self.sourcestart(source1)
 		y2 = self.sourcestart(source2)
 		y1targ = y1 + self.sourcesize(source2)
 		y2targ = y1
 		#
 		# If the sizes are not equal, adjust the start of the lower
 		# source to account for the lost/gained space.
 		#
 		if (source1.ysize() != source2.ysize()):
 			diff = source2.ysize() - source1.ysize()
 			self.names.moveline(y2, diff);
 			self.display.moveline(y2, diff)
 		source1.move(self.display, 0, y1targ - y1)
 		source2.move(self.display, 0, y2targ - y2)
 		source1.movename(self.names, 0, y1targ - y1)
 		source2.movename(self.names, 0, y2targ - y2)
 
 	def sourcepicky(self, source):
 		# Choose the y position at which a source should appear: its
 		# current start if it is visible, otherwise just below the
 		# nearest visible source that sorts before it.
 		if (source.hidden == 0):
 			return self.sourcestart(source)
 		# Revert to group based sort
 		sources.sort()
 		prev = None
 		for s in sources:
 			if (s == source):
 				break
 			if (s.hidden == 0):
 				prev = s
 		if (prev == None):
 			newy = 0
 		else:
 			newy = self.sourcestart(prev) + self.sourcesize(prev)
 		return newy
 
 	def sourceshow(self, source):
 		# Make one hidden source visible again, shifting the sources
 		# at or below its new position down to make room.
 		if (source.hidden == 0):
 			return;
 		newy = self.sourcepicky(source)
 		off = newy - self.sourcestart(source)
 		self.sourceshiftall(newy-1, self.sourcesize(source))
 		self.sourceshift(source, off)
 		source.hidden = 0
 
 	#
 	# Optimized source show of multiple entries that only moves each
 	# existing entry once.  Doing sourceshow() iteratively is too
 	# expensive due to python's canvas.move().
 	#
 	def sourceshowlist(self, srclist):
 		srclist.sort(cmp=source_cmp_start)
 		#
 		# Drop the sources that are already visible.  The list is
 		# rebuilt in place instead of calling remove() inside the
 		# loop: removing while iterating skips the following element
 		# and would still record a start/size pair for the removed
 		# source, leaving startsize out of step with srclist.
 		#
 		srclist[:] = [s for s in srclist if s.hidden != 0]
 		startsize = []
 		for source in srclist:
 			startsize.append((self.sourcepicky(source),
 			    self.sourcesize(source)))
 
 		sources.sort(cmp=source_cmp_start, reverse=True)
 		self.status.startup("Updating display...");
 		# Push each visible source down by the total size of the
 		# entries that will be inserted at or above it.
 		for source in sources:
 			if (source.hidden == 1):
 				continue
 			nstart = self.sourcestart(source)
 			size = 0
 			for hidden in startsize:
 				(start, sz) = hidden
 				if (start <= nstart or start+sz <= nstart):
 					size += sz
 			self.sourceshift(source, size)
 		# Now drop the newly shown sources into the gaps.
 		idx = 0
 		size = 0
 		for source in srclist:
 			(newy, sz) = startsize[idx]
 			off = (newy + size) - self.sourcestart(source)
 			self.sourceshift(source, off)
 			source.hidden = 0
 			size += sz
 			idx += 1
 		self.updatescroll()
 		self.status.set("")
 
 	#
 	# Optimized source hide of multiple entries that only moves each
 	# remaining entry once.  Doing sourcehide() iteratively is too
 	# expensive due to python's canvas.move().
 	#
 	def sourcehidelist(self, srclist):
 		srclist.sort(cmp=source_cmp_start)
 		sources.sort(cmp=source_cmp_start)
 		startsize = []
 		off = len(sources) * 100
 		self.status.startup("Updating display...");
 		#
 		# Drop the sources that are already hidden.  The list is
 		# rebuilt in place instead of calling remove() inside the
 		# loop: removing while iterating skips the following element
 		# and would shift the already hidden source a second time.
 		#
 		srclist[:] = [s for s in srclist if s.hidden != 1]
 		for source in srclist:
 			#
 			# Remember our old position so we can sort things
 			# below us when we're done.
 			#
 			startsize.append((self.sourcestart(source),
 			    self.sourcesize(source)))
 			self.sourceshift(source, off)
 			source.hidden = 1
 
 		# Close the gaps: each remaining source between one hidden
 		# entry and the next moves up by the size hidden so far.
 		idx = 0
 		size = 0
 		for hidden in startsize:
 			(start, sz) = hidden
 			size += sz
 			if (idx + 1 < len(startsize)):
 				(stop, sz) = startsize[idx+1]
 			else:
 				stop = self.display.ysize()
 			idx += 1
 			for source in sources:
 				nstart = self.sourcestart(source)
 				if (nstart < start or source.hidden == 1):
 					continue
 				if (nstart >= stop):
 					break;
 				self.sourceshift(source, -size)
 		self.updatescroll()
 		self.status.set("")
 
 	def sourcehide(self, source):
 		# Hide one source and move the ones below it up.
 		if (source.hidden == 1):
 			return;
 		# Move it out of the visible area
 		off = len(sources) * 100
 		start = self.sourcestart(source)
 		self.sourceshift(source, off)
 		self.sourceshiftall(start, -self.sourcesize(source))
 		source.hidden = 1
 
 	def sourceshift(self, source, off):
 		# Move a source, its name and its divider lines by off.
 		start = self.sourcestart(source)
 		source.move(self.display, 0, off)
 		source.movename(self.names, 0, off)
 		self.names.moveline(start, off);
 		self.display.moveline(start, off)
 		#
 		# We update the idle tasks to shrink the dirtied area so
 		# it does not always include the entire screen.
 		#
 		self.names.update_idletasks()
 		self.display.update_idletasks()
 
 	def sourceshiftall(self, start, off):
 		# Shift every source that starts at or below 'start' by off.
 		self.status.startup("Updating display...");
 		for source in sources:
 			nstart = self.sourcestart(source)
 			if (nstart < start):
 				continue;
 			self.sourceshift(source, off)
 		self.updatescroll()
 		self.status.set("")
 
 	def sourceat(self, ypos):
 		# Return the visible source whose slot contains the window y
 		# coordinate ypos, or None.  The coordinate is first offset
 		# by the current scroll position of the names canvas.
 		(start, end) = self.names.yview()
 		starty = start * float(self.names.ysize)
 		ypos += starty
 		for source in sources:
 			if (source.hidden == 1):
 				continue;
 			yend = self.sourceend(source)
 			ystart = self.sourcestart(source)
 			if (ypos >= ystart and ypos <= yend):
 				return source
 		return None
 
 	def display_yview(self, *args):
 		# Scroll the names and the display together.
 		self.names.yview(*args)
 		self.display.yview(*args)
 
 	def resize(self, *args):
 		# Recompute the scaler maximum for the new window width.
 		width = int(root.geometry().split('x')[0])
 		scalemax = ktrfile.timespan() / (width - self.constwidth)
 		self.scale.setmax(scalemax)
 
 	def updatescroll(self):
 		self.names.updatescroll()
 		self.display.updatescroll()
 
 	# The remaining methods simply forward to the display canvas.
 	def setcolor(self, tag, color):
 		self.display.setcolor(tag, color)
 
 	def hide(self, tag):
 		self.display.hide(tag)
 
 	def getcolor(self, tag):
 		return self.display.getcolor(tag)
 
 	def getstate(self, tag):
 		return self.display.getstate(tag)
 
 # Script entry point: exactly one or two arguments are accepted, the KTR
 # file and an optional clock frequency in GHz.
 if (len(sys.argv) != 2 and len(sys.argv) != 3):
 	print "usage:", sys.argv[0], "<ktr file> [clock freq in ghz]"
 	sys.exit(1)
 
 # An explicit clock frequency is stored in the global clockfreq.
 if (len(sys.argv) > 2):
 	clockfreq = float(sys.argv[2])
 
 # Build the window and the colormaps, then create the graph widgets
 # before the KTR file object; graph.draw() runs once both exist.
 root = Tk()
 root.title("SchedGraph")
 colormap = Colormap(eventcolors)
 cpucolormap = Colormap(cpucolors)
 graph = SchedGraph(root)
 ktrfile = KTRFile(sys.argv[1])
 graph.draw()
 root.mainloop()
Index: stable/10/usr.bin/man/man.sh
===================================================================
--- stable/10/usr.bin/man/man.sh	(revision 278649)
+++ stable/10/usr.bin/man/man.sh	(revision 278650)
@@ -1,965 +1,1005 @@
 #! /bin/sh
 #
 #  Copyright (c) 2010 Gordon Tetlow
 #  All rights reserved.
 #
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions
 #  are met:
 #  1. Redistributions of source code must retain the above copyright
 #     notice, this list of conditions and the following disclaimer.
 #  2. Redistributions in binary form must reproduce the above copyright
 #     notice, this list of conditions and the following disclaimer in the
 #     documentation and/or other materials provided with the distribution.
 #
 #  THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 #  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 #  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 #  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 #  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 #  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 #  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 #  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 #  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 #  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 #  SUCH DAMAGE.
 #
 # $FreeBSD$
 
 # Usage: add_to_manpath path
 # Appends path to $manpath unless it is already listed there or is not a
 # directory.
 # Returns true only if the path was actually appended.
 add_to_manpath() {
 	# Already present at the end, at the start, or in the middle?
 	case "$manpath" in
 	*:$1|$1:*|*:$1:*)
 		decho "  Skipping duplicate manpath entry $1" 2
 		return 1
 		;;
 	esac
 
 	if [ ! -d "$1" ]; then
 		return 1
 	fi
 
 	decho "  Adding $1 to manpath"
 	manpath="$manpath:$1"
 	return 0
 }
 
 # Usage: build_manlocales
 # Fills in MANLOCALES from the config files unless the user already
 # provided a value.
 build_manlocales() {
 	# If the user has set manlocales, who are we to argue.
 	if [ -z "$MANLOCALES" ]; then
 		parse_configs
 
 		# $manlocales is accumulated with a leading colon; drop it.
 		MANLOCALES=${manlocales#:}
 
 		decho "Available manual locales: $MANLOCALES"
 	fi
 }
 
 # Usage: build_manpath
 # Fills in MANPATH from $PATH, the default manpath and the config files
 # unless the user already provided a value.
 build_manpath() {
 	local IFS
 
 	# If the user has set a manpath, who are we to argue.
 	if [ -z "$MANPATH" ]; then
 		search_path
 
 		decho "Adding default manpath entries"
 		IFS=:
 		for path in $man_default_path; do
 			add_to_manpath "$path"
 		done
 		unset IFS
 
 		parse_configs
 
 		# $manpath is accumulated with a leading colon; drop it.
 		MANPATH=${manpath#:}
 
 		decho "Using manual path: $MANPATH"
 	fi
 }
 
 # Usage: check_cat catglob
 # Returns true and selects the catpage (use_cat, catpage) when the glob
 # resolves to a readable file.
 check_cat() {
 	exists "$1" || return 1
 
 	use_cat=yes
 	catpage=$found
 	setup_cattool $catpage
 	decho "    Found catpage $catpage"
 	return 0
 }
 
 # Usage: check_man manglob catglob
 # Returns true when manglob resolves to a file, recording it in $manpage.
 # In that case catglob is also examined: the catpage is used only when
 # the page width is standard and the catpage is not older than the
 # manpage.
 check_man() {
 	if ! exists "$1"; then
 		return 1
 	fi
 
 	# We have a match, check for a cat page
 	manpage=$found
 	setup_cattool $manpage
 	decho "    Found manpage $manpage"
 
 	if [ -n "${use_width}" ]; then
 		# non-standard width
 		unset use_cat
 		decho "    Skipping catpage: non-standard page width"
 		return 0
 	fi
 
 	if exists "$2" && is_newer $found $manpage; then
 		# cat page found and is newer, use that
 		use_cat=yes
 		catpage=$found
 		setup_cattool $catpage
 		decho "    Using catpage $catpage"
 	else
 		# no cat page or is older
 		unset use_cat
 		decho "    Skipping catpage: not found or old"
 	fi
 
 	return 0
 }
 
 # Usage: decho "string" [debuglevel]
 # Writes string, prefixed with "-- ", to stderr when $debug is at least
 # debuglevel (which defaults to 1).
 decho() {
 	local level
 
 	level=${2:-1}
 	if [ $debug -ge $level ]; then
 		echo "-- $1" >&2
 	fi
 }
 
 # Usage: exists glob
 # Returns true if glob resolves to a real file; the first match is left
 # in $found.
 exists() {
 	local IFS
 
 	# Don't accidentally inherit callers IFS (breaks perl manpages)
 	unset IFS
 
 	# Expand the glob with pathname expansion briefly enabled: $1 keeps
 	# the literal pattern while $2 becomes the first expansion.
 	set +f
 	set -- "$1" $1
 	set -f
 
 	# An unexpanded pattern or an unreadable match means "not found".
 	if [ "$1" = "$2" ] || [ ! -r "$2" ]; then
 		return 1
 	fi
 
 	found="$2"
 	return 0
 }
 
 # Usage: find_file path section subdir pagename
 # Returns: true if something is matched and found.
 # Looks for pagename in path/man<section>[/subdir], falling back to the
 # matching cat<section> directory.
 find_file() {
 	local manroot catroot mann man0 catn cat0
 
 	# The subdir component is only appended when one was given.
 	manroot="$1/man$2${3:+/$3}"
 	catroot="$1/cat$2${3:+/$3}"
 
 	[ -d "$manroot" ] || return 1
 	decho "  Searching directory $manroot" 2
 
 	mann="$manroot/$4.$2*"
 	man0="$manroot/$4.0*"
 	catn="$catroot/$4.$2*"
 	cat0="$catroot/$4.0*"
 
 	# This is the behavior as seen by the original man utility.
 	# Let's not change that which doesn't seem broken.
 	check_man "$mann" "$catn" && return 0
 	check_man "$man0" "$cat0" && return 0
 	check_cat "$catn" && return 0
 	check_cat "$cat0" && return 0
 
 	return 1
 }
 
 # Usage: is_newer file1 file2
 # Returns true unless file1 is older than file2 as calculated by mtime.
 is_newer() {
 	if [ "$1" -ot "$2" ]; then
 		decho "    mtime: $1 older than $2" 3
 		return 1
 	fi
 
 	decho "    mtime: $1 not older than $2" 3
 	return 0
 }
 
 # Usage: manpath_parse_args "$@"
 # Parses the manpath options: -L and -q set Lflag and qflag, each -d
 # raises the debug level, anything else prints the usage.
 manpath_parse_args() {
 	local opt
 
 	while getopts 'Ldq' opt; do
 		case "$opt" in
 		L)	Lflag=Lflag ;;
 		d)	debug=$(( $debug + 1 )) ;;
 		q)	qflag=qflag ;;
 		*)	manpath_usage ;;
 		esac
 	done >&2
 }
 
 # Usage: manpath_usage
 # Print the manpath(1) synopsis on stderr and exit with status 1.
 manpath_usage() {
 	echo 'usage: manpath [-Ldq]' 1>&2
 	exit 1
 }
 
 # Usage: manpath_warnings
 # Warn on stderr when the environment variable relevant to the current
 # mode is set: MANLOCALES with -L, MANPATH without it.
 manpath_warnings() {
 	if [ -n "$MANPATH" ] && [ -z "$Lflag" ]; then
 		echo "(Warning: MANPATH environment variable set)" >&2
 	fi
 
 	if [ -n "$MANLOCALES" ] && [ -n "$Lflag" ]; then
 		echo "(Warning: MANLOCALES environment variable set)" >&2
 	fi
 }
 
 # Usage: man_check_for_so page path
 # Returns: True if able to resolve the file, false if it ended in tears.
 # Detects the presence of the .so directive and causes the file to be
 # redirected to another source file.
 # The page examined is always the current $manpage, which check_man
 # updates each time a .so target is resolved; that is what lets chained
 # .so directives be followed.  Targets are looked up relative to the
 # path argument.
 man_check_for_so() {
 	local IFS line tstr
 
 	unset IFS
 
 	# We need to loop to accommodate multiple .so directives.
 	while true
 	do
 		# Only the first line of the page is inspected.
 		line=$($cattool $manpage | head -1)
 		case "$line" in
 		.so*)	trim "${line#.so}"
 			decho "$manpage includes $tstr"
 			# Glob and check for the file.  Use the path we were
 			# given rather than whatever $path happens to hold in
 			# the caller.
 			if ! check_man "$2/$tstr*" ""; then
 				decho "  Unable to find $tstr"
 				return 1
 			fi
 			;;
 		*)	break ;;
 		esac
 	done
 
 	return 0
 }
 
-# Usage: man_display_page
-# Display either the manpage or catpage depending on the use_cat variable
 man_display_page() {
-	local EQN NROFF PIC TBL TROFF REFER VGRIND
-	local IFS l nroff_dev pipeline preproc_arg tool
+	local IFS pipeline preconv_enc testline
 
 	# We are called with IFS set to colon. This causes really weird
 	# things to happen for the variables that have spaces in them.
 	unset IFS
 
 	# If we are supposed to use a catpage and we aren't using troff(1)
 	# just zcat the catpage and we are done.
 	if [ -z "$tflag" -a -n "$use_cat" ]; then
 		if [ -n "$wflag" ]; then
 			echo "$catpage (source: $manpage)"
 			ret=0
 		else
 			if [ $debug -gt 0 ]; then
 				decho "Command: $cattool $catpage | $MANPAGER"
 				ret=0
 			else
 				eval "$cattool $catpage | $MANPAGER"
 				ret=$?
 			fi
 		fi
 		return
 	fi
 
 	# Okay, we are using the manpage, do we just need to output the
 	# name of the manpage?
 	if [ -n "$wflag" ]; then
 		echo "$manpage"
 		ret=0
 		return
 	fi
+
+	# Pages stored under a charset-specific directory are run through
+	# preconv first.  Charsets not listed here leave preconv_enc empty,
+	# so no preconv stage is added.
+	case "${manpage}" in
+	*.${man_charset}/*)
+		case "$man_charset" in
+		ISO8859-1) preconv_enc="latin-1" ;;
+		ISO8859-15) preconv_enc="latin-1" ;;
+		UTF-8) preconv_enc="utf-8" ;;
+		esac
+		;;
+	esac
+
+	if [ -n "$preconv_enc" ]; then
+		pipeline="preconv -e $preconv_enc |"
+	fi
+	# testline lints the page with mandoc(1); pipeline renders it.
+	testline="$pipeline mandoc -Tlint -Werror 2>/dev/null"
+	pipeline="$pipeline mandoc -Tlocale | $MANPAGER"
+
+	# If the lint fails, hand the page to the groff based renderer,
+	# provided groff(1) is installed.
+	if ! eval "$cattool $manpage | $testline" ;then
+		if which -s groff; then
+			man_display_page_groff
+		else
+			echo "This manpage needs groff(1) to be rendered" >&2
+			echo "First install groff(1): " >&2
+			echo "pkg install groff " >&2
+			ret=1
+		fi
+		return
+	fi
+
+	if [ $debug -gt 0 ]; then
+		decho "Command: $cattool $manpage | $pipeline"
+		ret=0
+	else
+		eval "$cattool $manpage | $pipeline"
+		ret=$?
+	fi
+}
+
+# Usage: man_display_page
+# Display either the manpage or catpage depending on the use_cat variable
+man_display_page_groff() {
+	local EQN NROFF PIC TBL TROFF REFER VGRIND
+	local IFS l nroff_dev pipeline preproc_arg tool
 
 	# So, we really do need to parse the manpage. First, figure out the
 	# device flag (-T) we have to pass to eqn(1) and groff(1). Then,
 	# setup the pipeline of commands based on the user's request.
 
 	# If the manpage is from a particular charset, we need to setup nroff
 	# to properly output for the correct device.
 	case "${manpage}" in
 	*.${man_charset}/*)
 		# I don't pretend to know this; I'm just copying from the
 		# previous version of man(1).
 		case "$man_charset" in
 		KOI8-R)		nroff_dev="koi8-r" ;;
 		ISO8859-1)	nroff_dev="latin1" ;;
 		ISO8859-15)	nroff_dev="latin1" ;;
 		UTF-8)		nroff_dev="utf8" ;;
 		*)		nroff_dev="ascii" ;;
 		esac
 
 		NROFF="$NROFF -T$nroff_dev"
 		EQN="$EQN -T$nroff_dev"
 
 		# Iff the manpage is from the locale and not just the charset,
 		# then we need to define the locale string.
 		case "${manpage}" in
 		*/${man_lang}_${man_country}.${man_charset}/*)
 			NROFF="$NROFF -dlocale=$man_lang.$man_charset"
 			;;
 		*/${man_lang}.${man_charset}/*)
 			NROFF="$NROFF -dlocale=$man_lang.$man_charset"
 			;;
 		esac
 
 		# Allow language specific calls to override the default
 		# set of utilities.
 		l=$(echo $man_lang | tr [:lower:] [:upper:])
 		for tool in EQN NROFF PIC TBL TROFF REFER VGRIND; do
 			eval "$tool=\${${tool}_$l:-\$$tool}"
 		done
 		;;
 	*)	NROFF="$NROFF -Tascii"
 		EQN="$EQN -Tascii"
 		;;
 	esac
 
 	if [ -z "$MANCOLOR" ]; then
 		NROFF="$NROFF -P-c"
 	fi
 
 	if [ -n "${use_width}" ]; then
 		NROFF="$NROFF -rLL=${use_width}n -rLT=${use_width}n"
 	fi
 
 	if [ -n "$MANROFFSEQ" ]; then
 		set -- -$MANROFFSEQ
 		while getopts 'egprtv' preproc_arg; do
 			case "${preproc_arg}" in
 			e)	pipeline="$pipeline | $EQN" ;;
 			g)	;; # Ignore for compatibility.
 			p)	pipeline="$pipeline | $PIC" ;;
 			r)	pipeline="$pipeline | $REFER" ;;
 			t)	pipeline="$pipeline | $TBL" ;;
 			v)	pipeline="$pipeline | $VGRIND" ;;
 			*)	usage ;;
 			esac
 		done
 		# Strip the leading " | " from the resulting pipeline.
 		pipeline="${pipeline#" | "}"
 	else
 		pipeline="$TBL"
 	fi
 
 	if [ -n "$tflag" ]; then
 		pipeline="$pipeline | $TROFF"
 	else
 		pipeline="$pipeline | $NROFF | $MANPAGER"
 	fi
 
 	if [ $debug -gt 0 ]; then
 		decho "Command: $cattool $manpage | $pipeline"
 		ret=0
 	else
 		eval "$cattool $manpage | $pipeline"
 		ret=$?
 	fi
 }
 
 # Usage: man_find_and_display page
 # Search through the manpaths looking for the given page.
 # An argument containing a '/' that names a readable file is displayed
 # directly.  Otherwise every section in $MANSECT, path in $MANPATH and
 # locale directory in $locpaths is searched.  Without -a the first page
 # found ends the search.  Sets ret=1 when nothing was found.
 man_find_and_display() {
 	local found_page locpath p path sect
 
 	# Check to see if it's a file. But only if it has a '/' in
 	# the filename.
 	case "$1" in
 	*/*)	if [ -f "$1" -a -r "$1" ]; then
 			decho "Found a usable page, displaying that"
 			unset use_cat
 			manpage="$1"
 			setup_cattool $manpage
 			if man_check_for_so $manpage $(dirname $manpage); then
 				found_page=yes
 				man_display_page
 			fi
 			return
 		fi
 		;;
 	esac
 
 	# MANSECT, MANPATH and locpaths are all colon separated lists.
 	IFS=:
 	for sect in $MANSECT; do
 		decho "Searching section $sect" 2
 		for path in $MANPATH; do
 			for locpath in $locpaths; do
 				p=$path/$locpath
 				p=${p%/.} # Rid ourselves of the trailing /.
 
 				# Check if there is a MACHINE specific manpath.
 				if find_file $p $sect $MACHINE "$1"; then
 					if man_check_for_so $manpage $p; then
 						found_page=yes
 						man_display_page
 						# With -a, move on to the next
 						# path ("continue 2" leaves the
 						# locpath loop as well).
 						if [ -n "$aflag" ]; then
 							continue 2
 						else
 							return
 						fi
 					fi
 				fi
 
 				# Check if there is a MACHINE_ARCH
 				# specific manpath.
 				if find_file $p $sect $MACHINE_ARCH "$1"; then
 					if man_check_for_so $manpage $p; then
 						found_page=yes
 						man_display_page
 						if [ -n "$aflag" ]; then
 							continue 2
 						else
 							return
 						fi
 					fi
 				fi
 
 				# Check plain old manpath.
 				if find_file $p $sect '' "$1"; then
 					if man_check_for_so $manpage $p; then
 						found_page=yes
 						man_display_page
 						if [ -n "$aflag" ]; then
 							continue 2
 						else
 							return
 						fi
 					fi
 				fi
 			done
 		done
 	done
 	unset IFS
 
 	# Nothing? Well, we are done then.
 	if [ -z "$found_page" ]; then
 		echo "No manual entry for $1" >&2
 		ret=1
 		return
 	fi
 }
 
 # Usage: man_parse_args "$@"
 # Parses commandline options for man.
 # Records each option in its flag or variable, rejects incompatible
 # combinations, hands -f and -k over to do_whatis and do_apropos, and
 # leaves the remaining page names in $pages.
 man_parse_args() {
 	local IFS cmd_arg
 
 	while getopts 'M:P:S:adfhkm:op:tw' cmd_arg; do
 		case "${cmd_arg}" in
 		M)	MANPATH=$OPTARG ;;
 		P)	MANPAGER=$OPTARG ;;
 		S)	MANSECT=$OPTARG ;;
 		a)	aflag=aflag ;;
 		d)	debug=$(( $debug + 1 )) ;;
 		f)	fflag=fflag ;;
 		h)	man_usage 0 ;;
 		k)	kflag=kflag ;;
 		m)	mflag=$OPTARG ;;
 		o)	oflag=oflag ;;
 		p)	MANROFFSEQ=$OPTARG ;;
 		t)	tflag=tflag ;;
 		w)	wflag=wflag ;;
 		*)	man_usage ;;
 		esac
 	done >&2
 
 	# Drop the options that getopts consumed.
 	shift $(( $OPTIND - 1 ))
 
 	# Check the args for incompatible options.
 	# Each flag variable holds its own name when set, so the
 	# concatenation can be matched against pairs of flag names.
 	case "${fflag}${kflag}${tflag}${wflag}" in
 	fflagkflag*)	echo "Incompatible options: -f and -k"; man_usage ;;
 	fflag*tflag*)	echo "Incompatible options: -f and -t"; man_usage ;;
 	fflag*wflag)	echo "Incompatible options: -f and -w"; man_usage ;;
 	*kflagtflag*)	echo "Incompatible options: -k and -t"; man_usage ;;
 	*kflag*wflag)	echo "Incompatible options: -k and -w"; man_usage ;;
 	*tflagwflag)	echo "Incompatible options: -t and -w"; man_usage ;;
 	esac
 
 	# Short circuit for whatis(1) and apropos(1)
 	if [ -n "$fflag" ]; then
 		do_whatis "$@"
 		exit
 	fi
 
 	if [ -n "$kflag" ]; then
 		do_apropos "$@"
 		exit
 	fi
 
 	# A first argument equal to one of the default sections is taken
 	# as the section to search rather than as a page name.
 	IFS=:
 	for sect in $man_default_sections; do
 		if [ "$sect" = "$1" ]; then
 			decho "Detected manual section as first arg: $1"
 			MANSECT="$1"
 			shift
 			break
 		fi
 	done
 	unset IFS
 
 	pages="$*"
 }
 
 # Usage: man_setup
 # Establish the machine/architecture pair, the pager, the sections to
 # search, the manpath, the locale and the page width.
 man_setup() {
 	# -m arch[:machine] takes precedence; anything still empty after
 	# that is asked of sysctl.
 	if [ -n "$mflag" ]; then
 		MACHINE_ARCH=${mflag%%:*}
 		MACHINE=${mflag##*:}
 	fi
 	: "${MACHINE_ARCH:=$($SYSCTL -n hw.machine_arch)}"
 	: "${MACHINE:=$($SYSCTL -n hw.machine)}"
 	decho "Using architecture: $MACHINE_ARCH:$MACHINE"
 
 	setup_pager
 
 	# Setup manual sections to search.
 	: "${MANSECT:=$man_default_sections}"
 	decho "Using manual sections: $MANSECT"
 
 	build_manpath
 	man_setup_locale
 	man_setup_width
 }
 
 # Usage: man_setup_width
 # Set up page width.
 # Leaves use_width unset for the standard width.  A positive numeric
 # MANWIDTH is used as is; MANWIDTH=tty uses the terminal width minus two
 # columns, but only when the terminal is wider than 80 columns.
 man_setup_width() {
 	local sizes
 
 	unset use_width
 	case "$MANWIDTH" in
 	[0-9]*)
 		# Errors from test (e.g. a value that is not purely
 		# numeric) are discarded and treated as "not set".
 		if [ "$MANWIDTH" -gt 0 2>/dev/null ]; then
 			use_width=$MANWIDTH
 		fi
 		;;
 	[Tt][Tt][Yy])
 		# stty reads its stdin; fd 3 is a copy of our stdout, so
 		# the size reported is that of the output terminal.
 		if { sizes=$($STTY size 0>&3 2>/dev/null); } 3>&1; then
 			# "stty size" prints rows then columns.
 			set -- $sizes
 			if [ $2 -gt 80 ]; then
 				use_width=$(($2-2))
 			fi
 		fi
 		;;
 	esac
 	if [ -n "$use_width" ]; then
 		decho "Using non-standard page width: ${use_width}"
 	else
 		decho 'Using standard page width'
 	fi
 }
 
 # Usage: man_setup_locale
 # Derive man_lang, man_country, man_charset and the colon separated
 # search list $locpaths from LC_CTYPE.  With -o, or for the C and POSIX
 # locales, only the non-localized directory (.) is searched.
 man_setup_locale() {
 	local lang_cc
 
 	locpaths='.'
 	man_charset='US-ASCII'
 
 	# Setup locale information.
 	if [ -n "$oflag" ]; then
 		decho 'Using non-localized manpages'
 	else
 		# Use the locale tool to give us the proper LC_CTYPE
 		eval $( $LOCALE )
 
 		case "$LC_CTYPE" in
 		C|POSIX)
 			;;
 		[a-z][a-z]_[A-Z][A-Z]\.*)
 			# ll_CC.charset
 			lang_cc="${LC_CTYPE%.*}"
 			man_lang="${LC_CTYPE%_*}"
 			man_country="${lang_cc#*_}"
 			man_charset="${LC_CTYPE#*.}"
 
 			# Most specific first: the full locale, then the
 			# language alone, then English, then unlocalized.
 			locpaths="$LC_CTYPE:$man_lang.$man_charset"
 			if [ "$man_lang" != "en" ]; then
 				locpaths="$locpaths:en.$man_charset"
 			fi
 			locpaths="$locpaths:."
 			;;
 		*)
 			echo 'Unknown locale, assuming C' >&2
 			;;
 		esac
 	fi
 
 	decho "Using locale paths: $locpaths"
 }
 
 # Usage: man_usage [exitcode]
 # Print the man synopsis and exit with exitcode (1 when omitted).
 man_usage() {
 	local rc
 
 	# When exit'ing with -h, it's not an error.
 	rc=${1:-1}
 
 	echo 'Usage:'
 	echo ' man [-adho] [-t | -w] [-M manpath] [-P pager] [-S mansect]'
 	echo '     [-m arch[:machine]] [-p [eprtv]] [mansect] page [...]'
 	echo ' man -f page [...] -- Emulates whatis(1)'
 	echo ' man -k page [...] -- Emulates apropos(1)'
 
 	exit $rc
 }
 
 # Usage: parse_configs
 # Reads the global config file and then every local config file.  The
 # work is done only once; later calls return immediately.
 parse_configs() {
 	local IFS file files
 
 	if [ -z "$parsed_configs" ]; then
 		unset IFS
 
 		# Read the global config first in case the user wants
 		# to override config_local.
 		if [ -r "$config_global" ]; then
 			parse_file "$config_global"
 		fi
 
 		# Glob the list of files to parse.
 		set +f
 		files=$(echo $config_local)
 		set -f
 
 		for file in $files; do
 			if [ -r "$file" ]; then
 				parse_file "$file"
 			fi
 		done
 
 		parsed_configs='yes'
 	fi
 }
 
 # Usage: parse_file file
 # Reads the specified config files.
 # Each line is dispatched on its first word: comments are skipped,
 # MANPATH adds a manpath entry, MANLOCALE appends to $manlocales,
 # MANCONFIG replaces $config_local, and a line of the form FOO_BAR value
 # assigns value to the shell variable FOO_BAR.
 parse_file() {
 	local file line tstr var
 
 	file="$1"
 	decho "Parsing config file: $file"
 	while read line; do
 		decho "  $line" 2
 		case "$line" in
 		\#*)		decho "    Comment" 3
 				;;
 		MANPATH*)	decho "    MANPATH" 3
 				trim "${line#MANPATH}"
 				add_to_manpath "$tstr"
 				;;
 		MANLOCALE*)	decho "    MANLOCALE" 3
 				trim "${line#MANLOCALE}"
 				manlocales="$manlocales:$tstr"
 				;;
 		MANCONFIG*)	decho "    MANCONFIG" 3
 				trim "${line#MANCONFIG}"
 				config_local="$tstr"
 				;;
 		# Set variables in the form of FOO_BAR
 		# The name is everything before the first space or tab; the
 		# value is evaluated by the shell inside double quotes.
 		*_*[\ \	]*)	var="${line%%[\ \	]*}"
 				trim "${line#$var}"
 				eval "$var=\"$tstr\""
 				decho "    Parsed $var" 3
 				;;
 		esac
 	done < "$file"
 }
 
 # Usage: search_path
 # Walks the components of $PATH and offers a candidate man directory for each
 # one to add_to_manpath.  If $manpath is still empty afterwards it is set to
 # $man_default_path.
 search_path() {
 	local IFS p path
 
 	decho "Searching PATH for man directories"
 
 	IFS=:
 	for path in $PATH; do
 		case "$path" in
 		/bin|/usr/bin)
 			# The base manpages live in /usr/share/man rather
 			# than /usr/man or /man.
 			add_to_manpath "/usr/share/man"
 			;;
 		*)
 			# Try $path/man, then $path/MAN; if both are refused
 			# and $path ends in /bin, try the man directory that
 			# sits beside that bin directory.
 			if ! add_to_manpath "$path/man" &&
 			    ! add_to_manpath "$path/MAN"; then
 				case "$path" in
 				*/bin)
 					p="${path%/bin}/man"
 					add_to_manpath "$p"
 					;;
 				esac
 			fi
 			;;
 		esac
 	done
 	unset IFS
 
 	if [ -z "$manpath" ]; then
 		decho '  Unable to find any manpaths, using default'
 		manpath=$man_default_path
 	fi
 }
 
 # Usage: search_whatis cmd [arglist]
 # Do the heavy lifting for apropos/whatis: collect the readable whatis
 # databases found under $MANPATH, grep them for every keyword, page the
 # matches and report keywords without a match on stderr.
 # cmd is "whatis" or "apropos"; arglist is the command line to parse.
 # Does not return: exits 0 when every keyword matched, 1 otherwise (or
 # when no database was found).
 search_whatis() {
 	local IFS bad cmd f good key keywords loc opt out path rval wlist
 
 	cmd="$1"
 	shift
 
 	# Fills in $keywords (declared local above).
 	whatis_parse_args "$@"
 
 	build_manpath
 	build_manlocales
 	setup_pager
 
 	# whatis passes -w to grep; apropos passes no extra option.
 	if [ "$cmd" = "whatis" ]; then
 		opt="-w"
 	fi
 
 	# Name of the database file looked for in each directory.
 	f='whatis'
 
 	# Both $MANPATH and $MANLOCALES are split on ':' below.
 	IFS=:
 	for path in $MANPATH; do
 		if [ \! -d "$path" ]; then
 			decho "Skipping non-existent path: $path" 2
 			continue
 		fi
 
 		if [ -f "$path/$f" -a -r "$path/$f" ]; then
 			decho "Found whatis: $path/$f"
 			wlist="$wlist $path/$f"
 		fi
 
 		# Also pick up per-locale databases under this directory.
 		for loc in $MANLOCALES; do
 			if [ -f "$path/$loc/$f" -a -r "$path/$loc/$f" ]; then
 				decho "Found whatis: $path/$loc/$f"
 				wlist="$wlist $path/$loc/$f"
 			fi
 		done
 	done
 	unset IFS
 
 	if [ -z "$wlist" ]; then
 		echo "$cmd: no whatis databases in $MANPATH" >&2
 		exit 1
 	fi
 
 	# Results are accumulated with a literal "\n" between entries; the
 	# sequence is only turned into a newline by echo -e further down.
 	rval=0
 	for key in $keywords; do
 		out=$(grep -Ehi $opt -- "$key" $wlist)
 		if [ -n "$out" ]; then
 			good="$good\\n$out"
 		else
 			bad="$bad\\n$key: nothing appropriate"
 			rval=1
 		fi
 	done
 
 	# Strip the leading "\n" separator added before the first entry.
 	good=${good#\\n}
 	bad=${bad#\\n}
 
 	if [ -n "$good" ]; then
 		echo -e "$good" | $MANPAGER
 	fi
 
 	if [ -n "$bad" ]; then
 		echo -e "$bad" >&2
 	fi
 
 	exit $rval
 }
 
 # Usage: setup_cattool page
 # Sets $cattool to the decompression command that matches the suffix of
 # the given page file name.
 setup_cattool() {
 	# Anything without a recognized suffix goes through zcat -f.
 	cattool='/usr/bin/zcat -f'
 
 	case "$1" in
 	*.bz|*.bz2)	cattool='/usr/bin/bzcat' ;;
 	*.gz)		cattool='/usr/bin/zcat' ;;
 	*.lzma)		cattool='/usr/bin/lzcat' ;;
 	*.xz)		cattool='/usr/bin/xzcat' ;;
 	esac
 }
 
 # Usage: setup_pager
 # Makes sure $MANPAGER holds the pager command to use.
 setup_pager() {
 	# A MANPAGER that is already set is left alone.  Otherwise use
 	# "less -sR" when MANCOLOR is set, else $PAGER, else "more -s".
 	if [ -z "$MANPAGER" ]; then
 		if [ -n "$MANCOLOR" ]; then
 			MANPAGER="less -sR"
 		else
 			MANPAGER="${PAGER:-more -s}"
 		fi
 	fi
 	decho "Using pager: $MANPAGER"
 }
 
 # Usage: trim string
 # Stores the string in $tstr with leading and trailing spaces and tabs
 # removed.
 trim() {
 	tstr=$1
 
 	# Peel blanks off the front, one character per pass.
 	while :; do
 		case "$tstr" in
 		[\ \	]*)	tstr="${tstr#?}" ;;
 		*)		break ;;
 		esac
 	done
 
 	# Then do the same from the back.
 	while :; do
 		case "$tstr" in
 		*[\ \	])	tstr="${tstr%?}" ;;
 		*)		break ;;
 		esac
 	done
 }
 
 # Usage: whatis_parse_args "$@"
 # Handles the options accepted by whatis and apropos and leaves the
 # remaining arguments, joined into one string, in $keywords.
 whatis_parse_args() {
 	local flag
 
 	# Everything printed while parsing options is sent to stderr.
 	while getopts 'd' flag; do
 		case "$flag" in
 		d)	debug=$(( debug + 1 )) ;;
 		*)	whatis_usage ;;
 		esac
 	done >&2
 
 	# Drop the options that getopts consumed.
 	shift $(( OPTIND - 1 ))
 
 	keywords="$*"
 }
 
 # Usage: whatis_usage
 # Prints the one-line synopsis for whatis/apropos, using $cmd as the
 # program name, and terminates the script with exit code 1.
 whatis_usage() {
 	printf '%s\n' "usage: $cmd [-d] keyword [...]"
 	exit 1
 }
 
 
 
 # Supported commands
 # Entry point used when the script is invoked as apropos: hands the command
 # line to search_whatis in "apropos" mode.
 do_apropos() {
 	search_whatis apropos "$@"
 }
 
 # Entry point used when the script is invoked as man (or under any name not
 # matched by the dispatch at the bottom of the file): parses the command
 # line, then looks up and displays each requested page.  Does not return.
 do_man() {
 	# NOTE(review): $pages is presumably filled in by man_parse_args --
 	# confirm against that function.
 	man_parse_args "$@"
 	if [ -z "$pages" ]; then
 		echo 'What manual page do you want?' >&2
 		exit 1
 	fi
 	man_setup
 
 	# page is not declared local, so it stays visible to the helpers
 	# called from inside the loop.
 	for page in $pages; do
 		decho "Searching for $page"
 		man_find_and_display "$page"
 	done
 
 	# Exit with $ret when it has been set, otherwise with 0.
 	# NOTE(review): ret is presumably set by man_find_and_display on a
 	# failed lookup -- confirm.
 	exit ${ret:-0}
 }
 
 # Entry point used when the script is invoked as manpath: prints either the
 # locale list (when $Lflag is set) or the manual search path, then exits 0.
 do_manpath() {
 	manpath_parse_args "$@"
 
 	# Warnings are only issued when $qflag is empty.
 	case "$qflag" in
 	'')	manpath_warnings ;;
 	esac
 
 	case "$Lflag" in
 	'')
 		build_manpath
 		echo $MANPATH
 		;;
 	*)
 		build_manlocales
 		echo $MANLOCALES
 		;;
 	esac
 
 	exit 0
 }
 
 # Entry point used when the script is invoked as whatis: hands the command
 # line to search_whatis in "whatis" mode.
 do_whatis() {
 	search_whatis whatis "$@"
 }
 
 # User's PATH setting decides on the groff-suite to pick up.
 EQN=eqn
 NROFF='groff -S -P-h -Wall -mtty-char -man'
 PIC=pic
 REFER=refer
 TBL=tbl
 TROFF='groff -S -man'
 VGRIND=vgrind
 
 # Helper utilities referenced by absolute path.
 LOCALE=/usr/bin/locale
 STTY=/bin/stty
 SYSCTL=/sbin/sysctl
 
 # Debug level; whatis_parse_args raises it by one for each -d.
 debug=0
 man_default_sections='1:8:2:3:n:4:5:6:7:9:l'
 # Used by search_path when no manpath could be derived from PATH.
 man_default_path='/usr/share/man:/usr/share/openssl/man:/usr/local/man'
 # Default decompressor; setup_cattool picks one based on the file suffix.
 cattool='/usr/bin/zcat -f'
 
 # Global config file, read by parse_configs before the local ones.
 config_global='/etc/man.conf'
 
 # This can be overridden via a setting in /etc/man.conf.
 config_local='/usr/local/etc/man.d/*.conf'
 
 # Set noglobbing for now. I don't want spurious globbing.
 set -f
 
 # Dispatch on the name the script was invoked as; anything that does not end
 # in apropos, manpath or whatis is treated as man.
 case "$0" in
 *apropos)	do_apropos "$@" ;;
 *manpath)	do_manpath "$@" ;;
 *whatis)	do_whatis "$@" ;;
 *)		do_man "$@" ;;
 esac
Index: stable/10
===================================================================
--- stable/10	(revision 278649)
+++ stable/10	(revision 278650)

Property changes on: stable/10
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r272315,272757,274092,274901