diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 22dd49b65ede..1b05640df93f 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -1,673 +1,662 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) int hogticks; int lbolt; static struct callout loadav_callout; static struct callout lbolt_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. 
*/ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); static void endtsleep(void *); static void loadav(void *arg); static void lboltcb(void *arg); /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. */ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; hogticks = (hz / 10) * 2; /* Default only. */ for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. */ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct thread *td = curthread; struct proc *p = td->td_proc; int sig, catch = priority & PCATCH; int rval = 0; WITNESS_SAVE_DECL(mtx); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif /* XXX: mtx == NULL ?? */ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mtx->mtx_object, "Sleeping on \"%s\"", wmesg); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, ("sleeping without a mutex")); /* * If we are capable of async syscalls and there isn't already * another one ready to return, start a new thread * and queue it as ready to run. Note that there is danger here * because we need to make sure that we don't sleep allocating * the thread (recursion here might be bad). */ mtx_lock_spin(&sched_lock); if (p->p_flag & P_SA || p->p_numthreads > 1) { /* * Just don't bother if we are exiting * and not the exiting thread or thread was marked as * interrupted. */ if (catch) { if ((p->p_flag & P_WEXIT) && p->p_singlethread != td) { mtx_unlock_spin(&sched_lock); return (EINTR); } if (td->td_flags & TDF_INTERRUPT) { mtx_unlock_spin(&sched_lock); return (td->td_intrval); } } } if (cold ) { /* * During autoconfiguration, just return; * don't run any other procs or panic below, * in case this is the idle process and already asleep. * XXX: this used to do "s = splhigh(); splx(safepri); * splx(s);" to give interrupts a chance, but there is * no way to give interrupts a chance now. 
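The TABLESIZE/LOOKUP() pair above hashes a wait channel (an arbitrary kernel address) into one of 128 sleep-queue buckets: the low 8 bits are mostly determined by alignment, so they are shifted away before masking with TABLESIZE - 1. A minimal userland sketch of the same hash; the variables standing in for wait channels are invented and nothing here is kernel API:

#include <stdint.h>
#include <stdio.h>

#define TABLESIZE 128	/* must remain a power of two for the mask to work */
#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1))

int
main(void)
{
	/*
	 * Stand-ins for the kernel objects whose addresses would be
	 * passed to msleep() and wakeup() as wait channels.
	 */
	static int chan_a, chan_b;
	static char buf[4096];

	printf("&chan_a   %p -> slpque[%ld]\n", (void *)&chan_a,
	    (long)LOOKUP(&chan_a));
	printf("&chan_b   %p -> slpque[%ld]\n", (void *)&chan_b,
	    (long)LOOKUP(&chan_b));
	printf("&buf[0]   %p -> slpque[%ld]\n", (void *)buf,
	    (long)LOOKUP(buf));
	printf("&buf[256] %p -> slpque[%ld]\n", (void *)&buf[256],
	    (long)LOOKUP(&buf[256]));
	return (0);
}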
*/ if (mtx != NULL && priority & PDROP) mtx_unlock(mtx); mtx_unlock_spin(&sched_lock); return (0); } DROP_GIANT(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->mtx_object, mtx); mtx_unlock(mtx); if (priority & PDROP) mtx = NULL; } KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)", td, p->p_pid, p->p_comm, wmesg, ident); td->td_wchan = ident; td->td_wmesg = wmesg; TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq); TD_SET_ON_SLEEPQ(td); if (timo) callout_reset(&td->td_slpcallout, timo, endtsleep, td); /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, and * a wakeup or a SIGCONT (or both) could occur while we were stopped. * without resuming us, thus we must be ready for sleep * when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. */ if (catch) { CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); td->td_flags |= TDF_SINTR; mtx_unlock_spin(&sched_lock); PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); sig = cursig(td); mtx_unlock(&p->p_sigacts->ps_mtx); if (sig == 0 && thread_suspend_check(1)) sig = SIGSTOP; mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { if (TD_ON_SLEEPQ(td)) unsleep(td); } else if (!TD_ON_SLEEPQ(td)) catch = 0; } else sig = 0; /* * Let the scheduler know we're about to voluntarily go to sleep. */ sched_sleep(td, priority & PRIMASK); if (TD_ON_SLEEPQ(td)) { p->p_stats->p_ru.ru_nvcsw++; TD_SET_SLEEPING(td); mi_switch(); } /* * We're awake from voluntary sleep. */ CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_TIMEOUT) { td->td_flags &= ~TDF_TIMEOUT; if (sig == 0) rval = EWOULDBLOCK; } else if (td->td_flags & TDF_TIMOFAIL) { td->td_flags &= ~TDF_TIMOFAIL; } else if (timo && callout_stop(&td->td_slpcallout) == 0) { /* * This isn't supposed to be pretty. If we are here, then * the endtsleep() callout is currently executing on another * CPU and is either spinning on the sched_lock or will be * soon. If we don't synchronize here, there is a chance * that this process may msleep() again before the callout * has a chance to run and the callout may end up waking up * the wrong msleep(). Yuck. */ TD_SET_SLEEPING(td); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); td->td_flags &= ~TDF_TIMOFAIL; } if ((td->td_flags & TDF_INTERRUPT) && (priority & PCATCH) && (rval == 0)) { rval = td->td_intrval; } mtx_unlock_spin(&sched_lock); if (rval == 0 && catch) { PROC_LOCK(p); /* XXX: shouldn't we always be calling cursig()? */ mtx_lock(&p->p_sigacts->ps_mtx); if (sig != 0 || (sig = cursig(td))) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(&mtx->mtx_object, mtx); } return (rval); } /* * Implement timeout for msleep(). * * If process hasn't been awakened (wchan non-zero), * set timeout flag and undo the sleep. If proc * is stopped, just unsleep so it will remain stopped. * MP-safe, called without the Giant mutex. 
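The return-value contract described above (0 on wakeup, EWOULDBLOCK on timeout, EINTR or ERESTART when PCATCH is set and a signal arrives) is what callers loop on. A hedged sketch of a typical consumer/producer pair; struct softc, its fields, the "datwt" wmesg and the 5 second timeout are invented for illustration, only the msleep()/wakeup() usage follows the interface above:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct softc {
	struct mtx	sc_mtx;		/* protects sc_ready */
	int		sc_ready;
};

/* Sleep until another thread sets sc_ready and calls wakeup(). */
static int
wait_for_data(struct softc *sc)
{
	int error;

	mtx_lock(&sc->sc_mtx);
	while (sc->sc_ready == 0) {
		/* Interruptible sleep, at most 5 seconds per attempt. */
		error = msleep(&sc->sc_ready, &sc->sc_mtx, PZERO | PCATCH,
		    "datwt", 5 * hz);
		if (error != 0) {
			/* EWOULDBLOCK, EINTR or ERESTART. */
			mtx_unlock(&sc->sc_mtx);
			return (error);
		}
	}
	sc->sc_ready = 0;
	mtx_unlock(&sc->sc_mtx);
	return (0);
}

/* Producer side: publish the data and wake any sleepers. */
static void
data_arrived(struct softc *sc)
{

	mtx_lock(&sc->sc_mtx);
	sc->sc_ready = 1;
	wakeup(&sc->sc_ready);
	mtx_unlock(&sc->sc_mtx);
}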
*/ static void endtsleep(arg) void *arg; { register struct thread *td; td = (struct thread *)arg; CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); /* * This is the other half of the synchronization with msleep() * described above. If the TDS_TIMEOUT flag is set, we lost the * race and just need to put the process back on the runqueue. */ if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_flags |= TDF_TIMEOUT; td->td_wmesg = NULL; } else td->td_flags |= TDF_TIMOFAIL; TD_CLR_SLEEPING(td); setrunnable(td); mtx_unlock_spin(&sched_lock); } /* * Abort a thread, as if an interrupt had occured. Only abort * interruptable waits (unfortunatly it isn't only safe to abort others). * This is about identical to cv_abort(). * Think about merging them? * Also, whatever the signal code does... */ void abortsleep(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. */ if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) { if (TD_ON_SLEEPQ(td)) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); } } } /* * Remove a process from its wait queue */ void unsleep(struct thread *td) { mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_wmesg = NULL; } mtx_unlock_spin(&sched_lock); } /* * Make all processes sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; struct thread *ntd; struct proc *p; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); goto restart; } } mtx_unlock_spin(&sched_lock); } /* * Make a process sleeping on the specified identifier runnable. * May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct proc *p; register struct slpquehead *qp; register struct thread *td; struct thread *ntd; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); break; } } mtx_unlock_spin(&sched_lock); } /* * The machine independent parts of mi_switch(). */ void mi_switch(void) { struct bintime new_switchtime; struct thread *td; - struct thread *newtd; struct proc *p; - u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1, ("mi_switch: switch in a critical section")); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. 
*/ binuptime(&new_switchtime); bintime_add(&p->p_runtime, &new_switchtime); bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); td->td_generation++; /* bump preempt-detect counter */ #ifdef DDB /* * Don't perform context switches from the debugger. */ if (db_active) { mtx_unlock_spin(&sched_lock); db_print_backtrace(); db_error("Context switches not allowed in the debugger"); } #endif /* * Check if the process exceeds its cpu resource allocation. If * over max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_runtime.sec > p->p_cpulimit) { p->p_sflag |= PS_XCPU; td->td_flags |= TDF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); - sched_nest = sched_lock.mtx_recurse; if (td->td_proc->p_flag & P_SA) thread_switchout(td); - sched_switchout(td); - - newtd = choosethread(); - if (td != newtd) - cpu_switch(td, newtd); /* SHAZAM!! */ - - sched_lock.mtx_recurse = sched_nest; - sched_lock.mtx_lock = (uintptr_t)td; - sched_switchin(td); + sched_switch(td); /* * Start setting up stats etc. for the incoming thread. * Similar code in fork_exit() is returned to by cpu_switch() * in the case of a new thread/process. */ CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. */ void setrunnable(struct thread *td) { struct proc *p; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; /* XXX: intentional fall-through ? */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; struct proc *p; struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { switch (td->td_state) { case TDS_RUNQ: case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; nrun++; /* XXXKSE */ default: break; } nextproc: continue; } } sx_sunlock(&allproc_lock); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. 
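The update in loadav() above is a fixed-point exponentially weighted moving average: every 5-second sample mixes the current run-queue length into the old average with weight (FSCALE - cexp[i]) / FSCALE. A small userland sketch of the same arithmetic; FSHIFT is assumed to be 11 as in the kernel's param.h, and the constant run-queue length is made up (build with -lm):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t fixpt_t;

#define FSHIFT	11			/* assumed to match <sys/param.h> */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	/* Same values as cexp[] above: exp(-5/60), exp(-5/300), exp(-5/900). */
	const fixpt_t cexp[3] = {
		(fixpt_t)(exp(-1.0 / 12.0) * FSCALE),
		(fixpt_t)(exp(-1.0 / 60.0) * FSCALE),
		(fixpt_t)(exp(-1.0 / 180.0) * FSCALE),
	};
	const int minutes[3] = { 1, 5, 15 };
	fixpt_t ldavg[3] = { 0, 0, 0 };
	int nrun = 2;			/* pretend two runnable threads */
	int i, sample;

	/* One pass per 5-second sample; simulate 10 minutes of steady load. */
	for (sample = 0; sample < 120; sample++)
		for (i = 0; i < 3; i++)
			ldavg[i] = (cexp[i] * ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	for (i = 0; i < 3; i++)
		printf("%2d-minute average: %.2f\n", minutes[i],
		    (double)ldavg[i] / FSCALE);
	return (0);
}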
*/ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } static void lboltcb(void *arg) { wakeup(&lbolt); callout_reset(&lbolt_callout, hz, lboltcb, NULL); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&loadav_callout, 0); callout_init(&lbolt_callout, CALLOUT_MPSAFE); /* Kick off timeout driven events by calling first time. */ loadav(NULL); lboltcb(NULL); } /* * General purpose yield system call */ int yield(struct thread *td, struct yield_args *uap) { struct ksegrp *kg; kg = td->td_ksegrp; mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(); mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; return (0); } diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c index 7f11c4bca066..542dce4dcd14 100644 --- a/sys/kern/sched_4bsd.c +++ b/sys/kern/sched_4bsd.c @@ -1,726 +1,727 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). 
*/ #define NICE_WEIGHT 1 /* Priorities per nice level. */ struct ke_sched { int ske_cpticks; /* (j) Ticks of cpu time. */ }; static struct ke_sched ke_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = NULL; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = NULL; static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ static struct callout schedcpu_callout; static struct callout roundrobin_callout; static void roundrobin(void *arg); static void schedcpu(void *arg); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct ksegrp *kg); static void resetpriority(struct ksegrp *kg); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) /* * Global run queue. */ static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "Roundrobin scheduling quantum in microseconds"); /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); if (td->td_priority < curthread->td_priority && curthread->td_kse) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Force switch among equal priority processes every 100ms. * We don't actually need to force a context switch of the current process. * The act of firing the event triggers a context switch to softclock() and * then switching back out again which is equivalent to a preemption, thus * no further work is needed on the local CPU. */ /* ARGSUSED */ static void roundrobin(void *arg) { #ifdef SMP mtx_lock_spin(&sched_lock); forward_roundrobin(); mtx_unlock_spin(&sched_lock); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (kg_estcpu) usage in 5 * loadav time * 95% of (ke_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * kg_estcpu *= decay; * will compute * kg_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . 
* therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void *arg) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; int awake, realstathz; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { /* * Prevent state changes and protect run queue. */ mtx_lock_spin(&sched_lock); /* * Increment time in/out of memory. We ignore overflow; with * 16-bit int's (remember them?) overflow takes 45 days. */ p->p_swtime++; FOREACH_KSEGRP_IN_PROC(p, kg) { awake = 0; FOREACH_KSE_IN_GROUP(kg, ke) { /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* * The kse slptimes are not touched in wakeup * because the thread may not HAVE a KSE. */ if (ke->ke_state == KES_ONRUNQ) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } else if ((ke->ke_state == KES_THREAD) && (TD_IS_RUNNING(ke->ke_thread))) { awake = 1; /* Do not clear KEF_DIDRUN */ } else if (ke->ke_flags & KEF_DIDRUN) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } /* * ke_pctcpu is only for ps and ttyinfo(). * Do it per kse, and add them up at the end? * XXXKSE */ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; /* * If the kse has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ke->ke_sched->ske_cpticks == 0) continue; #if (FSHIFT >= CCPU_SHIFT) ke->ke_pctcpu += (realstathz == 100) ? 
((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ke->ke_pctcpu += ((FSCALE - ccpu) * (ke->ke_sched->ske_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ke->ke_sched->ske_cpticks = 0; } /* end of kse loop */ /* * If there are ANY running threads in this KSEGRP, * then don't count it as sleeping. */ if (awake) { if (kg->kg_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(kg); } kg->kg_slptime = 0; } else kg->kg_slptime++; if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { if (td->td_priority >= PUSER) { sched_prio(td, kg->kg_user_pri); } } } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ sx_sunlock(&allproc_lock); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at * least six times the loadfactor will decay kg_estcpu to zero. */ static void updatepri(struct ksegrp *kg) { register fixpt_t loadfac; register unsigned int newcpu; loadfac = loadfactor(averunnable.ldavg[0]); if (kg->kg_slptime > 5 * loadfac) kg->kg_estcpu = 0; else { newcpu = kg->kg_estcpu; kg->kg_slptime--; /* was incremented in schedcpu() */ while (newcpu && --kg->kg_slptime) newcpu = decay_cpu(loadfac, newcpu); kg->kg_estcpu = newcpu; } resetpriority(kg); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; struct thread *td; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { maybe_resched(td); /* XXXKSE silly */ } } /* ARGSUSED */ static void sched_setup(void *dummy) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; callout_init(&schedcpu_callout, CALLOUT_MPSAFE); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* External interfaces start here */ int sched_runnable(void) { return runq_check(&runq); } int sched_rr_interval(void) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; return (sched_quantum); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (kg_estcpu) is increased here. resetpriority() will * compute a different priority each time kg_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. 
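The derivation above claims that decay = (2 * loadav) / (2 * loadav + 1) makes kg_estcpu lose 90% of its value after roughly 5 * loadav applications of decay_cpu(), i.e. 5 * loadav seconds worth of schedcpu() runs. A quick floating-point check of that claim; illustrative only, not kernel code (build with -lm):

#include <math.h>
#include <stdio.h>

int
main(void)
{
	double decay, est;
	int loadav, steps;

	for (loadav = 1; loadav <= 4; loadav++) {
		decay = (2.0 * loadav) / (2.0 * loadav + 1.0);

		/* decay ** (5 * loadav) should land close to 0.1 ... */
		printf("loadav %d: decay = %.4f, decay^(5*loadav) = %.4f, ",
		    loadav, decay, pow(decay, 5.0 * loadav));

		/*
		 * ... and counting whole schedcpu() rounds until only 10%
		 * is left gives the ceiling of the "power" values listed
		 * above (5.68, 10.32, 14.94, 19.55).
		 */
		est = 1.0;
		for (steps = 0; est > 0.1; steps++)
			est *= decay;
		printf("90%% forgotten after %d rounds\n", steps);
	}
	return (0);
}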
This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void sched_clock(struct thread *td) { struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; ke = td->td_kse; ke->ke_sched->ske_cpticks++; kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(kg); if (td->td_priority >= PUSER) td->td_priority = kg->kg_user_pri; } } /* * charge childs scheduling cpu usage to parent. * * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp. * Charge it to the ksegrp that did the wait since process estcpu is sum of * all ksegrps, this is strictly as expected. Assume that the child process * aggregated all the estcpu into the 'built-in' ksegrp. */ void sched_exit(struct proc *p, struct proc *p1) { sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_exit_kse(struct kse *ke, struct kse *child) { } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_fork(struct proc *p, struct proc *p1) { sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_sched->ske_cpticks = 0; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); child->kg_estcpu = kg->kg_estcpu; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_nice(struct ksegrp *kg, int nice) { PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); kg->kg_nice = nice; resetpriority(kg); } void sched_class(struct ksegrp *kg, int class) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_pri_class = class; } /* * Adjust the priority of a thread. * This may include moving the thread within the KSEGRP, * changing the assignment of a kse to the thread, * and moving a KSE in the system run queue. */ void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_ksegrp->kg_slptime = 0; td->td_priority = prio; } void -sched_switchin(struct thread *td) -{ - - mtx_assert(&sched_lock, MA_OWNED); - td->td_oncpu = PCPU_GET(cpuid); -} - -void -sched_switchout(struct thread *td) +sched_switch(struct thread *td) { + struct thread *newtd; + u_long sched_nest; struct kse *ke; struct proc *p; ke = td->td_kse; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_THREAD), ("mi_switch: kse state?")); td->td_lastcpu = td->td_oncpu; td->td_last_kse = ke; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. */ if (TD_IS_RUNNING(td)) { /* Put us back on the run queue (kse and all). */ setrunqueue(td); } else if (p->p_flag & P_SA) { /* * We will not be on the run queue. 
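To make the estcpu-to-priority relationship above concrete, resetpriority() computes PUSER + kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg_nice - PRIO_MIN) and clamps the result to the timeshare range. A userland restatement; the numeric values of PUSER and the timeshare bounds are assumptions standing in for <sys/priority.h>:

#include <stdio.h>

/* Assumed stand-ins for the kernel's <sys/priority.h> / <sys/resource.h>. */
#define PUSER			160
#define PRI_MIN_TIMESHARE	160
#define PRI_MAX_TIMESHARE	223
#define PRIO_MIN		(-20)
#define INVERSE_ESTCPU_WEIGHT	8	/* priorities per estcpu level */
#define NICE_WEIGHT		1	/* priorities per nice level */

static unsigned int
resetpriority_demo(unsigned int estcpu, int nice)
{
	unsigned int newpriority;

	newpriority = PUSER + estcpu / INVERSE_ESTCPU_WEIGHT +
	    NICE_WEIGHT * (nice - PRIO_MIN);
	if (newpriority < PRI_MIN_TIMESHARE)
		newpriority = PRI_MIN_TIMESHARE;
	if (newpriority > PRI_MAX_TIMESHARE)
		newpriority = PRI_MAX_TIMESHARE;
	return (newpriority);
}

int
main(void)
{
	unsigned int estcpu;

	/* A CPU hog's priority number climbs (gets worse) as estcpu grows. */
	for (estcpu = 0; estcpu <= 256; estcpu += 64)
		printf("estcpu %3u, nice   0 -> user priority %u\n",
		    estcpu, resetpriority_demo(estcpu, 0));
	printf("estcpu   0, nice -20 -> user priority %u\n",
	    resetpriority_demo(0, -20));
	printf("estcpu   0, nice +20 -> user priority %u\n",
	    resetpriority_demo(0, 20));
	return (0);
}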
So we must be * sleeping or similar. As it's available, * someone else can use the KSE if they need it. */ kse_reassign(ke); } + sched_nest = sched_lock.mtx_recurse; + newtd = choosethread(); + if (td != newtd) + cpu_switch(td, newtd); + sched_lock.mtx_recurse = sched_nest; + sched_lock.mtx_lock = (uintptr_t)td; + td->td_oncpu = PCPU_GET(cpuid); } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; if (kg->kg_slptime > 1) updatepri(kg); kg->kg_slptime = 0; setrunqueue(td); maybe_resched(td); } void sched_add(struct thread *td) { struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("runq_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_add: process swapped out")); ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(&runq, ke); } void sched_rem(struct thread *td) { struct kse *ke; ke = td->td_kse; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); runq_remove(&runq, ke); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; } struct kse * sched_choose(void) { struct kse *ke; ke = runq_choose(&runq); if (ke != NULL) { runq_remove(&runq, ke); ke->ke_state = KES_THREAD; KASSERT((ke->ke_thread != NULL), ("runq_choose: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_choose: No KSE on thread")); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_choose: process swapped out")); } return (ke); } void sched_userret(struct thread *td) { struct ksegrp *kg; /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. */ kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread)); } fixpt_t sched_pctcpu(struct thread *td) { return (td->td_kse->ke_pctcpu); } diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c index 71646b6a22de..1045122bcd67 100644 --- a/sys/kern/sched_ule.c +++ b/sys/kern/sched_ule.c @@ -1,1358 +1,1359 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #define KTR_ULE KTR_NFS /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatability crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); static int sched_strict; SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); static int slice_min = 1; SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); static int slice_max = 10; SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); int realstathz; int tickincr = 1; #ifdef SMP /* Callout to handle load balancing SMP systems. */ static struct callout kseq_lb_callout; #endif /* * These datastructures are allocated within their parent datastructure but * are scheduler specific. */ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ /* CPU that we have affinity for. */ u_char ske_cpu; }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks #define ke_cpu ke_sched->ske_cpu struct kg_sched { int skg_slptime; /* Number of ticks we vol. slept */ int skg_runtime; /* Number of ticks we were running */ }; #define kg_slptime kg_sched->skg_slptime #define kg_runtime kg_sched->skg_runtime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct td_sched td_sched; struct ke_sched ke_sched; struct kg_sched kg_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * The priority is primarily determined by the interactivity score. Thus, we * give lower(better) priorities to kse groups that use less CPU. The nice * value is then directly added to this to allow nice to have some effect * on latency. * * PRI_RANGE: Total priority range for timeshare threads. * PRI_NRESV: Number of nice values. * PRI_BASE: The start of the dynamic range. 
*/ #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define SCHED_PRI_NRESV PRIO_TOTAL #define SCHED_PRI_NHALF (PRIO_TOTAL / 2) #define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) #define SCHED_PRI_INTERACT(score) \ ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) /* * These determine the interactivity of a process. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_THROTTLE: Divisor for reducing slp/run time at fork time. * INTERACT_MAX: Maximum interactivity value. Smaller is better. * INTERACT_THRESH: Threshhold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz * 5) << 10) #define SCHED_SLP_RUN_THROTTLE (100) #define SCHED_INTERACT_MAX (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) #define SCHED_INTERACT_THRESH (30) /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. * SLICE_SCALE: The number slices granted per val in the range of [0, max]. * SLICE_NICE: Determine the amount of slice granted to a scaled nice. */ #define SCHED_SLICE_MIN (slice_min) #define SCHED_SLICE_MAX (slice_max) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_SLICE_NICE(nice) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) /* * This macro determines whether or not the kse belongs on the current or * next run queue. * * XXX nice value should effect how interactive a kg is. */ #define SCHED_INTERACTIVE(kg) \ (sched_interact_score(kg) < SCHED_INTERACT_THRESH) #define SCHED_CURR(kg, ke) \ (ke->ke_thread->td_priority != kg->kg_user_pri || \ SCHED_INTERACTIVE(kg)) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 10 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - per processor runqs and statistics. */ #define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ struct kseq { struct runq ksq_idle; /* Queue of IDLE threads. */ struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ struct runq *ksq_next; /* Next timeshare queue. */ struct runq *ksq_curr; /* Current queue. */ int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ int ksq_load; /* Aggregate load. */ short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ short ksq_nicemin; /* Least nice. */ #ifdef SMP int ksq_cpus; /* Count of CPUs in this kseq. */ unsigned int ksq_rslices; /* Slices on run queue */ #endif }; /* * One kse queue per processor. 
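To see how the macros above turn an interactivity score and a nice value into a user priority, here is a userland restatement; sched_priority() later in this file performs the same computation and clamping. The timeshare bounds are assumed values standing in for <sys/priority.h>:

#include <stdio.h>

#define PRI_MIN_TIMESHARE	160	/* assumed for illustration */
#define PRI_MAX_TIMESHARE	223	/* assumed for illustration */
#define SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define SCHED_INTERACT_MAX	100
#define SCHED_PRI_INTERACT(score)					\
	((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

/* Mirror of sched_priority(): score and nice offset the timeshare base. */
static int
ule_user_pri(int score, int nice)
{
	int pri;

	pri = SCHED_PRI_INTERACT(score) + SCHED_PRI_BASE + nice;
	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;
	return (pri);
}

int
main(void)
{
	int score;

	/* Lower (better) scores and negative nice give better priorities. */
	for (score = 0; score <= 100; score += 20)
		printf("score %3d: nice -20 -> %d, nice 0 -> %d, "
		    "nice +20 -> %d\n", score, ule_user_pri(score, -20),
		    ule_user_pri(score, 0), ule_user_pri(score, 20));
	return (0);
}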
*/ #ifdef SMP struct kseq kseq_cpu[MAXCPU]; struct kseq *kseq_idmap[MAXCPU]; #define KSEQ_SELF() (kseq_idmap[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (kseq_idmap[(x)]) #else struct kseq kseq_cpu; #define KSEQ_SELF() (&kseq_cpu) #define KSEQ_CPU(x) (&kseq_cpu) #endif static void sched_slice(struct kse *ke); static void sched_priority(struct ksegrp *kg); static int sched_interact_score(struct ksegrp *kg); static void sched_interact_update(struct ksegrp *kg); void sched_pctcpu_update(struct kse *ke); int sched_pickcpu(void); /* Operations on per processor queues */ static struct kse * kseq_choose(struct kseq *kseq, int steal); static void kseq_setup(struct kseq *kseq); static void kseq_add(struct kseq *kseq, struct kse *ke); static void kseq_rem(struct kseq *kseq, struct kse *ke); static void kseq_nice_add(struct kseq *kseq, int nice); static void kseq_nice_rem(struct kseq *kseq, int nice); void kseq_print(int cpu); #ifdef SMP struct kseq * kseq_load_highest(void); void kseq_balance(void *arg); void kseq_move(struct kseq *from, int cpu); #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < PRIO_TOTAL + 1; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static void kseq_add(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; kseq->ksq_load++; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices += ke->ke_slice; #endif } static void kseq_rem(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices -= ke->ke_slice; #endif } static void kseq_nice_add(struct kseq *kseq, int nice) { mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_loads[PRI_TIMESHARE] == 0) return; for (; n < SCHED_PRI_NRESV + 1; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * kseq_balance is a simple CPU load balancing algorithm. 
It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ void kseq_balance(void *arg) { struct kseq *kseq; int high_load; int low_load; int high_cpu; int low_cpu; int move; int diff; int i; high_cpu = 0; low_cpu = 0; high_load = 0; low_load = -1; mtx_lock_spin(&sched_lock); if (smp_started == 0) goto out; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > high_load) { high_load = kseq->ksq_load; high_cpu = i; } if (low_load == -1 || kseq->ksq_load < low_load) { low_load = kseq->ksq_load; low_cpu = i; } } kseq = KSEQ_CPU(high_cpu); /* * Nothing to do. */ if (high_load < kseq->ksq_cpus + 1) goto out; high_load -= kseq->ksq_cpus; if (low_load >= high_load) goto out; diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; for (i = 0; i < move; i++) kseq_move(kseq, low_cpu); out: mtx_unlock_spin(&sched_lock); callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); return; } struct kseq * kseq_load_highest(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); cpu = 0; load = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > load) { load = kseq->ksq_load; cpu = i; } } kseq = KSEQ_CPU(cpu); if (load > kseq->ksq_cpus) return (kseq); return (NULL); } void kseq_move(struct kseq *from, int cpu) { struct kse *ke; ke = kseq_choose(from, 1); runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; kseq_rem(from, ke); ke->ke_cpu = cpu; sched_add(ke); } #endif /* * Pick the highest priority task we have and return it. If steal is 1 we * will return kses that have been denied slices due to their nice being too * low. In the future we should prohibit stealing interrupt threads as well. */ struct kse * kseq_choose(struct kseq *kseq, int steal) { struct kse *ke; struct runq *swap; mtx_assert(&sched_lock, MA_OWNED); swap = NULL; for (;;) { ke = runq_choose(kseq->ksq_curr); if (ke == NULL) { /* * We already swaped once and didn't get anywhere. */ if (swap) break; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; continue; } /* * If we encounter a slice of 0 the kse is in a * TIMESHARE kse group and its nice was too far out * of the range that receives slices. 
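kseq_balance() above boils down to a small piece of arithmetic once the busiest and least busy kseqs are found: discount the busiest queue by one running thread per CPU, and if it is still ahead, migrate half of the difference, rounded up. A toy restatement with invented load figures:

#include <stdio.h>

int
main(void)
{
	/* Invented sample values for the two kseqs picked by the balancer. */
	int high_load = 7;	/* load of the busiest kseq */
	int high_cpus = 1;	/* CPUs sharing that kseq (ksq_cpus) */
	int low_load = 2;	/* load of the least busy kseq */
	int diff, move;

	high_load -= high_cpus;
	if (low_load >= high_load) {
		printf("already balanced, nothing to move\n");
		return (0);
	}
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	printf("migrating %d kse(s) from the busy cpu to the idle one\n",
	    move);
	return (0);
}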
*/ if (ke->ke_slice == 0 && steal == 0) { runq_remove(ke->ke_runq, ke); sched_slice(ke); ke->ke_runq = kseq->ksq_next; runq_add(ke->ke_runq, ke); continue; } return (ke); } return (runq_choose(&kseq->ksq_idle)); } static void kseq_setup(struct kseq *kseq) { runq_init(&kseq->ksq_timeshare[0]); runq_init(&kseq->ksq_timeshare[1]); runq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; kseq->ksq_loads[PRI_ITHD] = 0; kseq->ksq_loads[PRI_REALTIME] = 0; kseq->ksq_loads[PRI_TIMESHARE] = 0; kseq->ksq_loads[PRI_IDLE] = 0; kseq->ksq_load = 0; #ifdef SMP kseq->ksq_rslices = 0; #endif } static void sched_setup(void *dummy) { #ifdef SMP int i; #endif slice_min = (hz/100); /* 10ms */ slice_max = (hz/7); /* ~140ms */ #ifdef SMP /* init kseqs */ /* Create the idmap. */ #ifdef ULE_HTT_EXPERIMENTAL if (smp_topology == NULL) { #else if (1) { #endif for (i = 0; i < MAXCPU; i++) { kseq_setup(&kseq_cpu[i]); kseq_idmap[i] = &kseq_cpu[i]; kseq_cpu[i].ksq_cpus = 1; } } else { int j; for (i = 0; i < smp_topology->ct_count; i++) { struct cpu_group *cg; cg = &smp_topology->ct_group[i]; kseq_setup(&kseq_cpu[i]); for (j = 0; j < MAXCPU; j++) if ((cg->cg_mask & (1 << j)) != 0) kseq_idmap[j] = &kseq_cpu[i]; kseq_cpu[i].ksq_cpus = cg->cg_count; } } callout_init(&kseq_lb_callout, CALLOUT_MPSAFE); kseq_balance(NULL); #else kseq_setup(KSEQ_SELF()); #endif mtx_lock_spin(&sched_lock); kseq_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return; pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); pri += SCHED_PRI_BASE; pri += kg->kg_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return; } /* * Calculate a time slice based on the properties of the kseg and the runq * that we're on. This is only for PRI_TIMESHARE ksegrps. */ static void sched_slice(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; kg = ke->ke_ksegrp; kseq = KSEQ_CPU(ke->ke_cpu); /* * Rationale: * KSEs in interactive ksegs get the minimum slice so that we * quickly notice if it abuses its advantage. * * KSEs in non-interactive ksegs are assigned a slice that is * based on the ksegs nice value relative to the least nice kseg * on the run queue for this cpu. * * If the KSE is less nice than all others it gets the maximum * slice and other KSEs will adjust their slice relative to * this when they first expire. * * There is 20 point window that starts relative to the least * nice kse on the run queue. Slice size is determined by * the kse distance from the last nice ksegrp. * * If you are outside of the window you will get no slice and * you will be reevaluated each time you are selected on the * run queue. * */ if (!SCHED_INTERACTIVE(kg)) { int nice; nice = kg->kg_nice + (0 - kseq->ksq_nicemin); if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || kg->kg_nice < kseq->ksq_nicemin) ke->ke_slice = SCHED_SLICE_MAX; else if (nice <= SCHED_PRI_NTHRESH) ke->ke_slice = SCHED_SLICE_NICE(nice); else ke->ke_slice = 0; } else ke->ke_slice = SCHED_SLICE_MIN; CTR6(KTR_ULE, "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); /* * Check to see if we need to scale back the slp and run time * in the kg. 
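The slice policy spelled out above can be restated numerically: interactive ksegrps always get SCHED_SLICE_MIN, the ksegrp that is least nice on its run queue gets SCHED_SLICE_MAX, and everyone else shrinks toward a zero slice as their nice value moves away from ksq_nicemin. A userland sketch, assuming hz = 1000 so that slice_min = 10 and slice_max = 142 as set in sched_setup() above:

#include <stdio.h>

#define PRIO_TOTAL		41	/* assumed: nice -20 .. +20 */
#define SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)

static const int slice_min = 10;	/* hz/100, assuming hz = 1000 */
static const int slice_max = 142;	/* hz/7,   assuming hz = 1000 */

#define SCHED_SLICE_RANGE	(slice_max - slice_min + 1)
#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
#define SCHED_SLICE_NICE(nice)						\
	(slice_max - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

int
main(void)
{
	int nice;

	/*
	 * "nice" is the distance from the least nice ksegrp on the run
	 * queue (kg_nice - ksq_nicemin), as used by sched_slice() above.
	 * Offsets beyond SCHED_PRI_NTHRESH receive no slice at all and
	 * are re-evaluated whenever they are selected.
	 */
	for (nice = 0; nice <= SCHED_PRI_NTHRESH; nice += 4)
		printf("nice offset %2d -> slice of %3d ticks\n",
		    nice, SCHED_SLICE_NICE(nice));
	return (0);
}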
This will cause us to forget old interactivity * while maintaining the current ratio. */ sched_interact_update(kg); return; } static void sched_interact_update(struct ksegrp *kg) { /* XXX Fixme, use a linear algorithm and not a while loop. */ while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { kg->kg_runtime = (kg->kg_runtime / 5) * 4; kg->kg_slptime = (kg->kg_slptime / 5) * 4; } } static int sched_interact_score(struct ksegrp *kg) { int div; if (kg->kg_runtime > kg->kg_slptime) { div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); return (SCHED_INTERACT_HALF + (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); } if (kg->kg_slptime > kg->kg_runtime) { div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); return (kg->kg_runtime / div); } /* * This can happen if slptime and runtime are 0. */ return (0); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most SCHED_SLICE_MAX. */ int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. */ if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { /* * Shift the tick count out so that the divide doesn't * round away our results. */ ke->ke_ticks <<= 10; ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ticks >>= 10; } else ke->ke_ticks = 0; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } #ifdef SMP /* XXX Should be changed to kseq_load_lowest() */ int sched_pickcpu(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); if (!smp_started) return (0); load = 0; cpu = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load < load) { cpu = i; load = kseq->ksq_load; } } CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); return (cpu); } #else int sched_pickcpu(void) { return (0); } #endif void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void -sched_switchout(struct thread *td) +sched_switch(struct thread *td) { + struct thread *newtd; + u_int sched_nest; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; if (TD_IS_RUNNING(td)) { if (td->td_proc->p_flag & P_SA) { kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); setrunqueue(td); } else { /* * This queue is always correct except for idle threads which * have a higher priority due to priority propagation. */ if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE && ke->ke_thread->td_priority > PRI_MIN_IDLE) ke->ke_runq = KSEQ_SELF()->ksq_curr; runq_add(ke->ke_runq, ke); /* setrunqueue(td); */ } return; } if (ke->ke_runq) kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); /* * We will not be on the run queue. So we must be * sleeping or similar. 
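sched_interact_score() above reduces the balance of voluntary sleep time versus run time to a 0..100 score, where anything below SCHED_INTERACT_THRESH counts as interactive and earns placement on the current queue; sched_interact_update() keeps the two counters bounded so old behaviour fades. A userland restatement of the scoring, fed with made-up tick counts:

#include <stdio.h>

#define SCHED_INTERACT_MAX	100
#define SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH	30

/* Same shape as sched_interact_score(), on plain ints. */
static int
interact_score(int slptime, int runtime)
{
	int div;

	if (runtime > slptime) {
		div = runtime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - slptime / div));
	}
	if (slptime > runtime) {
		div = slptime / SCHED_INTERACT_HALF;
		if (div < 1)
			div = 1;
		return (runtime / div);
	}
	return (0);		/* equal (or both zero) histories */
}

int
main(void)
{
	/* Invented (slptime, runtime) pairs, in ticks << 10 as above. */
	const int samples[3][2] = {
		{ 900 << 10, 100 << 10 },	/* mostly sleeping */
		{ 400 << 10, 600 << 10 },	/* runs more than it sleeps */
		{ 100 << 10, 900 << 10 },	/* almost always running */
	};
	int i, score;

	for (i = 0; i < 3; i++) {
		score = interact_score(samples[i][0], samples[i][1]);
		printf("slp %3d run %3d -> score %2d (%sinteractive)\n",
		    samples[i][0] >> 10, samples[i][1] >> 10, score,
		    score < SCHED_INTERACT_THRESH ? "" : "not ");
	}
	return (0);
}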
*/ if (td->td_proc->p_flag & P_SA) kse_reassign(ke); -} - -void -sched_switchin(struct thread *td) -{ - /* struct kse *ke = td->td_kse; */ - mtx_assert(&sched_lock, MA_OWNED); + sched_nest = sched_lock.mtx_recurse; + newtd = choosethread(); + if (td != newtd) + cpu_switch(td, newtd); + sched_lock.mtx_recurse = sched_nest; + sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); } void sched_nice(struct ksegrp *kg, int nice) { struct kse *ke; struct thread *td; struct kseq *kseq; PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); /* * We need to adjust the nice counts for running KSEs. */ if (kg->kg_pri_class == PRI_TIMESHARE) FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_runq == NULL) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq_nice_rem(kseq, kg->kg_nice); kseq_nice_add(kseq, nice); } kg->kg_nice = nice; sched_priority(kg); FOREACH_THREAD_IN_GROUP(kg, td) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_priority = prio; CTR2(KTR_ULE, "sleep kse %p (tick: %d)", td->td_kse, td->td_slptime); } void sched_wakeup(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ if (td->td_slptime) { struct ksegrp *kg; int hzticks; kg = td->td_ksegrp; hzticks = ticks - td->td_slptime; kg->kg_slptime += hzticks << 10; sched_interact_update(kg); sched_priority(kg); if (td->td_kse) sched_slice(td->td_kse); CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", td->td_kse, hzticks); td->td_slptime = 0; } setrunqueue(td); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct proc *p, struct proc *p1) { mtx_assert(&sched_lock, MA_OWNED); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ child->ke_runq = NULL; /* Grab our parents cpu estimation information. 
*/ child->ke_ticks = ke->ke_ticks; child->ke_ltick = ke->ke_ltick; child->ke_ftick = ke->ke_ftick; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); /* XXX Need something better here */ child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE; child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE; kg->kg_runtime += tickincr << 10; sched_interact_update(kg); child->kg_user_pri = kg->kg_user_pri; child->kg_nice = kg->kg_nice; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_class(struct ksegrp *kg, int class) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); if (kg->kg_pri_class == class) return; FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; kseq->ksq_loads[PRI_BASE(class)]++; if (kg->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, kg->kg_nice); else if (class == PRI_TIMESHARE) kseq_nice_add(kseq, kg->kg_nice); } kg->kg_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct proc *child) { /* XXX Need something better here */ mtx_assert(&sched_lock, MA_OWNED); sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); } void sched_exit_kse(struct kse *ke, struct kse *child) { kseq_rem(KSEQ_CPU(child->ke_cpu), child); } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { /* kg->kg_slptime += child->kg_slptime; */ kg->kg_runtime += child->kg_runtime; sched_interact_update(kg); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_clock(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; #if 0 struct kse *nke; #endif /* * sched_setup() apparently happens prior to stathz being set. We * need to resolve the timers earlier in the boot so we can avoid * calculating this here. */ if (realstathz == 0) { realstathz = stathz ? stathz : hz; tickincr = hz / realstathz; /* * XXX This does not work for values of stathz that are much * larger than hz. */ if (tickincr == 0) tickincr = 1; } ke = td->td_kse; kg = ke->ke_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((td != NULL), ("schedclock: null thread pointer")); /* Adjust ticks for pctcpu */ ke->ke_ticks++; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_flags & TDF_IDLETD) return; CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); /* * We only do slicing code for TIMESHARE ksegrps. */ if (kg->kg_pri_class != PRI_TIMESHARE) return; /* * Check for a higher priority task on the run queue. This can happen * on SMP if another processor woke up a process on our runq. */ kseq = KSEQ_SELF(); #if 0 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq, 0)) != NULL) { if (sched_strict && nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; else if (nke->ke_thread->td_priority < td->td_priority SCHED_PRIO_SLOP) if (nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; } #endif /* * We used a tick charge it to the ksegrp so that we can compute our * interactivity. 
*/ kg->kg_runtime += tickincr << 10; sched_interact_update(kg); /* * We used up one time slice. */ ke->ke_slice--; #ifdef SMP kseq->ksq_rslices--; #endif if (ke->ke_slice > 0) return; /* * We're out of time, recompute priorities and requeue. */ kseq_rem(kseq, ke); sched_priority(kg); sched_slice(ke); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; kseq_add(kseq, ke); td->td_flags |= TDF_NEEDRESCHED; } int sched_runnable(void) { struct kseq *kseq; int load; load = 1; mtx_lock_spin(&sched_lock); kseq = KSEQ_SELF(); if (kseq->ksq_load) goto out; #ifdef SMP /* * For SMP we may steal other processor's KSEs. Just search until we * verify that at least on other cpu has a runnable task. */ if (smp_started) { int i; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > kseq->ksq_cpus) goto out; } } #endif load = 0; out: mtx_unlock_spin(&sched_lock); return (load); } void sched_userret(struct thread *td) { struct ksegrp *kg; #if 0 struct kseq *kseq; struct kse *ke; #endif kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; /* * This optimization is temporarily disabled because it * breaks priority propagation. */ #if 0 kseq = KSEQ_SELF(); if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && #ifdef SMP kseq->ksq_load > kseq->ksq_cpus && #else kseq->ksq_load > 1 && #endif (ke = kseq_choose(kseq, 0)) != NULL && ke->ke_thread->td_priority < td->td_priority) #endif curthread->td_flags |= TDF_NEEDRESCHED; mtx_unlock_spin(&sched_lock); } } struct kse * sched_choose(void) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP retry: #endif kseq = KSEQ_SELF(); ke = kseq_choose(kseq, 0); if (ke) { runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority); } return (ke); } #ifdef SMP if (smp_started) { /* * Find the cpu with the highest load and steal one proc. */ if ((kseq = kseq_load_highest()) == NULL) return (NULL); /* * Remove this kse from this kseq and runq and then requeue * on the current processor. Then we will dequeue it * normally above. */ kseq_move(kseq, PCPU_GET(cpuid)); goto retry; } #endif return (NULL); } void sched_add(struct thread *td) { struct kseq *kseq; struct ksegrp *kg; struct kse *ke; ke = td->td_kse; kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); switch (PRI_BASE(kg->kg_pri_class)) { case PRI_ITHD: case PRI_REALTIME: kseq = KSEQ_SELF(); ke->ke_runq = kseq->ksq_curr; ke->ke_slice = SCHED_SLICE_MAX; ke->ke_cpu = PCPU_GET(cpuid); break; case PRI_TIMESHARE: kseq = KSEQ_CPU(ke->ke_cpu); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; break; case PRI_IDLE: kseq = KSEQ_CPU(ke->ke_cpu); /* * This is for priority prop. 
*/ if (ke->ke_thread->td_priority > PRI_MIN_IDLE) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = &kseq->ksq_idle; ke->ke_slice = SCHED_SLICE_MIN; break; default: panic("Unknown pri class.\n"); break; } ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(ke->ke_runq, ke); kseq_add(kseq, ke); } void sched_rem(struct thread *td) { struct kseq *kseq; struct kse *ke; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq = KSEQ_CPU(ke->ke_cpu); runq_remove(ke->ke_runq, ke); kseq_rem(kseq, ke); } fixpt_t sched_pctcpu(struct thread *td) { fixpt_t pctcpu; struct kse *ke; pctcpu = 0; ke = td->td_kse; mtx_lock_spin(&sched_lock); if (ke->ke_ticks) { int rtick; /* * Don't update more frequently than twice a second. Allowing * this causes the cpu usage to decay away too quickly due to * rounding errors. */ if (ke->ke_ltick < (ticks - (hz / 2))) sched_pctcpu_update(ke); /* How many rtick per second ? */ rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; mtx_unlock_spin(&sched_lock); return (pctcpu); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } diff --git a/sys/sys/sched.h b/sys/sys/sched.h index 7fc9df23cd99..cd804cc559a5 100644 --- a/sys/sys/sched.h +++ b/sys/sys/sched.h @@ -1,95 +1,94 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_SCHED_H_ #define _SYS_SCHED_H_ /* * General scheduling info. */ int sched_rr_interval(void); int sched_runnable(void); /* * Proc related scheduling hooks. */ void sched_exit(struct proc *p, struct proc *child); void sched_fork(struct proc *p, struct proc *child); /* * KSE Groups contain scheduling priority information. They record the * behavior of groups of KSEs and threads. 
*/ void sched_class(struct ksegrp *kg, int class); void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child); void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child); void sched_nice(struct ksegrp *kg, int nice); /* * Threads are switched in and out, block on resources, and have temporary * priorities inherited from their ksegs. */ void sched_exit_thread(struct thread *td, struct thread *child); void sched_fork_thread(struct thread *td, struct thread *child); void sched_prio(struct thread *td, u_char prio); void sched_sleep(struct thread *td, u_char prio); -void sched_switchin(struct thread *td); -void sched_switchout(struct thread *td); +void sched_switch(struct thread *td); void sched_userret(struct thread *td); void sched_wakeup(struct thread *td); /* * KSEs are moved on and off of run queues. */ void sched_add(struct thread *td); struct kse *sched_choose(void); void sched_clock(struct thread *td); void sched_exit_kse(struct kse *ke, struct kse *child); void sched_fork_kse(struct kse *ke, struct kse *child); void sched_rem(struct thread *td); /* * and they use up cpu time. */ fixpt_t sched_pctcpu(struct thread *td); /* * These procedures tell the process data structure allocation code how * many bytes to actually allocate. */ int sched_sizeof_kse(void); int sched_sizeof_ksegrp(void); int sched_sizeof_proc(void); int sched_sizeof_thread(void); extern struct ke_sched *kse0_sched; extern struct kg_sched *ksegrp0_sched; extern struct p_sched *proc0_sched; extern struct td_sched *thread0_sched; #endif /* !_SYS_SCHED_H_ */
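The hunks above fold sched_switchout()/sched_switchin() into a single sched_switch() and carry the rest of ULE's bookkeeping along with them. Two pieces of that bookkeeping are worth restating outside the diff: the sleep/run interactivity accounting (sched_interact_update() and sched_interact_score(), which sched_wakeup(), sched_fork_ksegrp() and sched_clock() feed) and the windowed %cpu estimate (sched_pctcpu_update() and sched_pctcpu()). The userland sketch below is illustrative only, not kernel code: struct kg_sketch, interact_update(), interact_score(), SLP_RUN_MAX and INTERACT_HALF are hypothetical stand-ins for struct ksegrp, sched_interact_update(), sched_interact_score(), SCHED_SLP_RUN_MAX and SCHED_INTERACT_HALF, whose real values are defined elsewhere in sched_ule.c and are not shown in this patch.

/*
 * Standalone sketch of ULE's interactivity bookkeeping.  The constants and
 * the struct are illustrative stand-ins; the kernel operates on struct
 * ksegrp and on tick counts shifted left by 10 for fixed-point precision.
 */
#include <stdio.h>

#define SLP_RUN_MAX	(5 * 1024 * 100)	/* hypothetical cap on slp + run */
#define INTERACT_HALF	50			/* half of a 0..100 score range */

struct kg_sketch {
	unsigned long	runtime;	/* cpu time consumed, fixed point */
	unsigned long	slptime;	/* voluntary sleep time, fixed point */
};

/* Scale both counters down by 4/5 until their sum fits under the cap. */
static void
interact_update(struct kg_sketch *kg)
{

	while (kg->runtime + kg->slptime > SLP_RUN_MAX) {
		kg->runtime = kg->runtime / 5 * 4;
		kg->slptime = kg->slptime / 5 * 4;
	}
}

/*
 * Map the sleep/run ratio onto 0..100: mostly-sleeping groups score low
 * (interactive), mostly-running groups score high (cpu hogs).
 */
static int
interact_score(const struct kg_sketch *kg)
{
	unsigned long div;

	if (kg->runtime > kg->slptime) {
		div = kg->runtime / INTERACT_HALF;
		if (div == 0)
			div = 1;
		return (INTERACT_HALF + (INTERACT_HALF - kg->slptime / div));
	}
	if (kg->slptime > kg->runtime) {
		div = kg->slptime / INTERACT_HALF;
		if (div == 0)
			div = 1;
		return (kg->runtime / div);
	}
	/* Both counters zero, or exactly equal. */
	return (0);
}

int
main(void)
{
	struct kg_sketch editor = { .runtime = 10 << 10, .slptime = 900 << 10 };
	struct kg_sketch builder = { .runtime = 900 << 10, .slptime = 10 << 10 };

	interact_update(&editor);
	interact_update(&builder);
	printf("interactive task scores %d, cpu hog scores %d\n",
	    interact_score(&editor), interact_score(&builder));
	return (0);
}

With those made-up numbers a group that mostly sleeps decays to a score of 0 while a pure cpu hog saturates at 100, which is the ordering SCHED_CURR() and sched_priority() key off when placing a kse on ksq_curr versus ksq_next. The second sketch does the same for the %cpu estimate; HZ, STATHZ, CPU_TIME and CPU_TICKS are again hypothetical stand-ins for hz, realstathz, SCHED_CPU_TIME and SCHED_CPU_TICKS, and only the FSCALE/FSHIFT fixed-point convention (FSHIFT is 11, so FSCALE is 2048 in a stock sys/param.h) is taken from the sources.

/*
 * Sketch of the windowed %cpu estimate kept by sched_pctcpu_update() and
 * reported by sched_pctcpu(), with illustrative window and clock values.
 */
#include <stdio.h>

#define HZ		1000
#define STATHZ		128		/* realstathz stand-in */
#define CPU_TIME	10		/* averaging window, in seconds */
#define CPU_TICKS	(HZ * CPU_TIME)
#define FSHIFT		11
#define FSCALE		(1 << FSHIFT)

int
main(void)
{
	long ke_ticks = 640;	/* stat ticks charged over the last window */
	long rtick, pctcpu;

	/* Stat ticks per second over the window, capped at the window size. */
	rtick = ke_ticks / CPU_TIME;
	if (rtick > CPU_TICKS)
		rtick = CPU_TICKS;

	/* Same fixed-point expression as sched_pctcpu(). */
	pctcpu = (FSCALE * ((FSCALE * rtick) / STATHZ)) >> FSHIFT;

	/* 640 ticks over 10s at stathz = 128 is half the cpu: prints 50%. */
	printf("pctcpu = %ld/%d (%.0f%%)\n",
	    pctcpu, FSCALE, 100.0 * pctcpu / FSCALE);
	return (0);
}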