Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c	(revision 121681)
+++ head/sys/kern/kern_fork.c	(revision 121682)
@@ -1,835 +1,835 @@
 /*
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/syscall.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/mac.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/unistd.h>	
 #include <sys/jail.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <sys/user.h>
 #include <machine/critical.h>
 
 #ifndef _SYS_SYSPROTO_H_
 struct fork_args {
 	int     dummy;
 };
 #endif
 
 static int forksleep; /* Place for fork1() to sleep on. */
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 fork(td, uap)
 	struct thread *td;
 	struct fork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 vfork(td, uap)
 	struct thread *td;
 	struct vfork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /*
  * MPSAFE
  */
 int
 rfork(td, uap)
 	struct thread *td;
 	struct rfork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	/* Don't allow kernel only flags. */
 	if ((uap->flags & RFKERNELONLY) != 0)
 		return (EINVAL);
 	error = fork1(td, uap->flags, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2 ? p2->p_pid : 0;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 int	nprocs = 1;		/* process 0 */
 int	lastpid = 0;
 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, 
     "Last used PID");
 
 /*
  * Random component to lastpid generation.  We mix in a random factor to make
  * it a little harder to predict.  We sanity check the modulus value to avoid
  * doing it in critical paths.  Don't let it be too small or we pointlessly
  * waste randomness entropy, and don't let it be impossibly large.  Using a
  * modulus that is too big causes a LOT more process table scans and slows
  * down fork processing as the pidchecked caching is defeated.
  */
 static int randompid = 0;
 
 static int
 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
 {
 	int error, pid;
 
 	sysctl_wire_old_buffer(req, sizeof(int));
 	sx_xlock(&allproc_lock);
 	pid = randompid;
 	error = sysctl_handle_int(oidp, &pid, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
 			pid = PID_MAX - 100;
 		else if (pid < 2)			/* NOP */
 			pid = 0;
 		else if (pid < 100)			/* Make it reasonable */
 			pid = 100;
 		randompid = pid;
 	}
 	sx_xunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
 
 int
 fork1(td, flags, pages, procp)
 	struct thread *td;
 	int flags;
 	int pages;
 	struct proc **procp;
 {
 	struct proc *p1, *p2, *pptr;
 	uid_t uid;
 	struct proc *newproc;
 	int ok, trypid;
 	static int curfail, pidchecked = 0;
 	static struct timeval lastfail;
 	struct filedesc *fd;
 	struct filedesc_to_leader *fdtol;
 	struct thread *td2;
 	struct kse *ke2;
 	struct ksegrp *kg2;
 	struct sigacts *newsigacts;
 	int error;
 
 	/* Can't copy and clear. */
 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
 		return (EINVAL);
 
 	p1 = td->td_proc;
 	mtx_lock(&Giant);
 
 	/*
 	 * Here we don't create a new process, but we divorce
 	 * certain parts of a process from itself.
 	 */
 	if ((flags & RFPROC) == 0) {
 		vm_forkproc(td, NULL, NULL, flags);
 
 		/*
 		 * Close all file descriptors.
 		 */
 		if (flags & RFCFDG) {
 			struct filedesc *fdtmp;
 			fdtmp = fdinit(td->td_proc->p_fd);
 			fdfree(td);
 			p1->p_fd = fdtmp;
 		}
 
 		/*
 		 * Unshare file descriptors (from parent.)
 		 */
 		if (flags & RFFDG) {
 			FILEDESC_LOCK(p1->p_fd);
 			if (p1->p_fd->fd_refcnt > 1) {
 				struct filedesc *newfd;
 
 				newfd = fdcopy(td->td_proc->p_fd);
 				FILEDESC_UNLOCK(p1->p_fd);
 				fdfree(td);
 				p1->p_fd = newfd;
 			} else
 				FILEDESC_UNLOCK(p1->p_fd);
 		}
 		mtx_unlock(&Giant);
 		*procp = NULL;
 		return (0);
 	}
 
 	/*
 	 * Note 1:1 allows for forking with one thread coming out on the
 	 * other side with the expectation that the process is about to
 	 * exec.
 	 */
 	if (p1->p_flag & P_SA) {
 		/*
 		 * Idle the other threads for a second.
 		 * Since the user space is copied, it must remain stable.
 		 * In addition, all threads (from the user perspective)
 		 * need to either be suspended or in the kernel,
 		 * where they will try restart in the parent and will
 		 * be aborted in the child.
 		 */
 		PROC_LOCK(p1);
 		if (thread_single(SINGLE_NO_EXIT)) {
 			/* Abort.. someone else is single threading before us */
 			PROC_UNLOCK(p1);
 			mtx_unlock(&Giant);
 			return (ERESTART);
 		}
 		PROC_UNLOCK(p1);
 		/*
 		 * All other activity in this process
 		 * is now suspended at the user boundary,
 		 * (or other safe places if we think of any).
 		 */
 	}
 
 	/* Allocate new proc. */
 	newproc = uma_zalloc(proc_zone, M_WAITOK);
 #ifdef MAC
 	mac_init_proc(newproc);
 #endif
 
 	/*
 	 * Although process entries are dynamically created, we still keep
 	 * a global limit on the maximum number we will create.  Don't allow
 	 * a nonprivileged user to use the last ten processes; don't let root
 	 * exceed the limit. The variable nprocs is the current number of
 	 * processes, maxproc is the limit.
 	 */
 	sx_xlock(&allproc_lock);
 	uid = td->td_ucred->cr_ruid;
 	if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the count of procs running with this uid. Don't allow
 	 * a nonprivileged user to exceed their current limit.
 	 */
 	PROC_LOCK(p1);
 	ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
 		(uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
 	PROC_UNLOCK(p1);
 	if (!ok) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the nprocs resource before blocking can occur.  There
 	 * are hard-limits as to the number of processes that can run.
 	 */
 	nprocs++;
 
 	/*
 	 * Find an unused process ID.  We remember a range of unused IDs
 	 * ready to use (from lastpid+1 through pidchecked-1).
 	 *
 	 * If RFHIGHPID is set (used during system boot), do not allocate
 	 * low-numbered pids.
 	 */
 	trypid = lastpid + 1;
 	if (flags & RFHIGHPID) {
 		if (trypid < 10)
 			trypid = 10;
 	} else {
 		if (randompid)
 			trypid += arc4random() % randompid;
 	}
 retry:
 	/*
 	 * If the process ID prototype has wrapped around,
 	 * restart somewhat above 0, as the low-numbered procs
 	 * tend to include daemons that don't exit.
 	 */
 	if (trypid >= PID_MAX) {
 		trypid = trypid % PID_MAX;
 		if (trypid < 100)
 			trypid += 100;
 		pidchecked = 0;
 	}
 	if (trypid >= pidchecked) {
 		int doingzomb = 0;
 
 		pidchecked = PID_MAX;
 		/*
 		 * Scan the active and zombie procs to check whether this pid
 		 * is in use.  Remember the lowest pid that's greater
 		 * than trypid, so we can avoid checking for a while.
 		 */
 		p2 = LIST_FIRST(&allproc);
 again:
 		for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
 			PROC_LOCK(p2);
 			while (p2->p_pid == trypid ||
 			    p2->p_pgrp->pg_id == trypid ||
 			    p2->p_session->s_sid == trypid) {
 				trypid++;
 				if (trypid >= pidchecked) {
 					PROC_UNLOCK(p2);
 					goto retry;
 				}
 			}
 			if (p2->p_pid > trypid && pidchecked > p2->p_pid)
 				pidchecked = p2->p_pid;
 			if (p2->p_pgrp->pg_id > trypid &&
 			    pidchecked > p2->p_pgrp->pg_id)
 				pidchecked = p2->p_pgrp->pg_id;
 			if (p2->p_session->s_sid > trypid &&
 			    pidchecked > p2->p_session->s_sid)
 				pidchecked = p2->p_session->s_sid;
 			PROC_UNLOCK(p2);
 		}
 		if (!doingzomb) {
 			doingzomb = 1;
 			p2 = LIST_FIRST(&zombproc);
 			goto again;
 		}
 	}
 
 	/*
 	 * RFHIGHPID does not mess with the lastpid counter during boot.
 	 */
 	if (flags & RFHIGHPID)
 		pidchecked = 0;
 	else
 		lastpid = trypid;
 
 	p2 = newproc;
 	p2->p_state = PRS_NEW;		/* protect against others */
 	p2->p_pid = trypid;
 	LIST_INSERT_HEAD(&allproc, p2, p_list);
 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
 	sx_xunlock(&allproc_lock);
 
 	/*
 	 * Malloc things while we don't hold any locks.
 	 */
 	if (flags & RFSIGSHARE)
 		newsigacts = NULL;
 	else
 		newsigacts = sigacts_alloc();
 
 	/*
 	 * Copy filedesc.
 	 */
 	if (flags & RFCFDG) {
 		fd = fdinit(td->td_proc->p_fd);
 		fdtol = NULL;
 	} else if (flags & RFFDG) {
 		FILEDESC_LOCK(p1->p_fd);
 		fd = fdcopy(td->td_proc->p_fd);
 		FILEDESC_UNLOCK(p1->p_fd);
 		fdtol = NULL;
 	} else {
 		fd = fdshare(p1->p_fd);
 		if (p1->p_fdtol == NULL)
 			p1->p_fdtol =
 				filedesc_to_leader_alloc(NULL,
 							 NULL,
 							 p1->p_leader);
 		if ((flags & RFTHREAD) != 0) {
 			/*
 			 * Shared file descriptor table and
 			 * shared process leaders.
 			 */
 			fdtol = p1->p_fdtol;
 			FILEDESC_LOCK(p1->p_fd);
 			fdtol->fdl_refcount++;
 			FILEDESC_UNLOCK(p1->p_fd);
 		} else {
 			/* 
 			 * Shared file descriptor table, and
 			 * different process leaders 
 			 */
 			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
 							 p1->p_fd,
 							 p2);
 		}
 	}
 	/*
 	 * Make a proc table entry for the new process.
 	 * Start by zeroing the section of proc that is zero-initialized,
 	 * then copy the section that is copied directly from the parent.
 	 */
 	td2 = FIRST_THREAD_IN_PROC(p2);
 	kg2 = FIRST_KSEGRP_IN_PROC(p2);
 	ke2 = FIRST_KSE_IN_KSEGRP(kg2);
 
 	/* Allocate and switch to an alternate kstack if specified */
 	if (pages != 0)
 		vm_thread_new_altkstack(td2, pages);
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))
 
 	bzero(&p2->p_startzero,
 	    (unsigned) RANGEOF(struct proc, p_startzero, p_endzero));
 	bzero(&ke2->ke_startzero,
 	    (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero));
 	bzero(&td2->td_startzero,
 	    (unsigned) RANGEOF(struct thread, td_startzero, td_endzero));
 	bzero(&kg2->kg_startzero,
 	    (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero));
 
 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
 	    (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy));
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy));
 	bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy,
 	    (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy));
 #undef RANGEOF
 
 	/* Set up the thread as an active thread (as if runnable). */
 	ke2->ke_state = KES_THREAD;
 	ke2->ke_thread = td2;
 	td2->td_kse = ke2;
 
 	/*
 	 * Duplicate sub-structures as needed.
 	 * Increase reference counts on shared objects.
 	 * The p_stats substruct is set in vm_forkproc.
 	 */
 	p2->p_flag = 0;
 	if (p1->p_flag & P_PROFIL)
 		startprofclock(p2);
 	mtx_lock_spin(&sched_lock);
 	p2->p_sflag = PS_INMEM;
 	/*
 	 * Allow the scheduler to adjust the priority of the child and
 	 * parent while we hold the sched_lock.
 	 */
 	sched_fork(p1, p2);
 
 	mtx_unlock_spin(&sched_lock);
 	p2->p_ucred = crhold(td->td_ucred);
 	td2->td_ucred = crhold(p2->p_ucred);	/* XXXKSE */
 
 	pargs_hold(p2->p_args);
 
 	if (flags & RFSIGSHARE) {
 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
 	} else {
 		sigacts_copy(newsigacts, p1->p_sigacts);
 		p2->p_sigacts = newsigacts;
 	}
 	if (flags & RFLINUXTHPN) 
 	        p2->p_sigparent = SIGUSR1;
 	else
 	        p2->p_sigparent = SIGCHLD;
 
 	/* Bump references to the text vnode (for procfs) */
 	p2->p_textvp = p1->p_textvp;
 	if (p2->p_textvp)
 		VREF(p2->p_textvp);
 	p2->p_fd = fd;
 	p2->p_fdtol = fdtol;
 	PROC_UNLOCK(p1);
 	PROC_UNLOCK(p2);
 
 	/*
 	 * p_limit is copy-on-write, bump refcnt,
 	 */
 	p2->p_limit = p1->p_limit;
 	p2->p_limit->p_refcnt++;
 
 	/*
 	 * Setup linkage for kernel based threading
 	 */
 	if((flags & RFTHREAD) != 0) {
 		mtx_lock(&ppeers_lock);
 		p2->p_peers = p1->p_peers;
 		p1->p_peers = p2;
 		p2->p_leader = p1->p_leader;
 		mtx_unlock(&ppeers_lock);
 		PROC_LOCK(p1->p_leader);
 		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(p1->p_leader);
 			/*
 			 * The task leader is exiting, so process p1 is
 			 * going to be killed shortly.  Since p1 obviously
 			 * isn't dead yet, we know that the leader is either
 			 * sending SIGKILL's to all the processes in this
 			 * task or is sleeping waiting for all the peers to
 			 * exit.  We let p1 complete the fork, but we need
 			 * to go ahead and kill the new process p2 since
 			 * the task leader may not get a chance to send
 			 * SIGKILL to it.  We leave it on the list so that
 			 * the task leader will wait for this new process
 			 * to commit suicide.
 			 */
 			PROC_LOCK(p2);
 			psignal(p2, SIGKILL);
 			PROC_UNLOCK(p2);
 		} else
 			PROC_UNLOCK(p1->p_leader);
 	} else {
 		p2->p_peers = NULL;
 		p2->p_leader = p2;
 	}
 
 	sx_xlock(&proctree_lock);
 	PGRP_LOCK(p1->p_pgrp);
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	/*
 	 * Preserve some more flags in subprocess.  P_PROFIL has already
 	 * been preserved.
 	 */
 	p2->p_flag |= p1->p_flag & (P_ALTSTACK | P_SUGID);
 	SESS_LOCK(p1->p_session);
 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
 		p2->p_flag |= P_CONTROLT;
 	SESS_UNLOCK(p1->p_session);
 	if (flags & RFPPWAIT)
 		p2->p_flag |= P_PPWAIT;
 
 	LIST_INSERT_AFTER(p1, p2, p_pglist);
 	PGRP_UNLOCK(p1->p_pgrp);
 	LIST_INIT(&p2->p_children);
 
 	callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
 
 #ifdef KTRACE
 	/*
 	 * Copy traceflag and tracefile if enabled.
 	 */
 	mtx_lock(&ktrace_mtx);
 	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
 	if (p1->p_traceflag & KTRFAC_INHERIT) {
 		p2->p_traceflag = p1->p_traceflag;
 		if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
 			VREF(p2->p_tracevp);
 			KASSERT(p1->p_tracecred != NULL,
 			    ("ktrace vnode with no cred"));
 			p2->p_tracecred = crhold(p1->p_tracecred);
 		}
 	}
 	mtx_unlock(&ktrace_mtx);
 #endif
 
 	/*
 	 * If PF_FORK is set, the child process inherits the
 	 * procfs ioctl flags from its parent.
 	 */
 	if (p1->p_pfsflags & PF_FORK) {
 		p2->p_stops = p1->p_stops;
 		p2->p_pfsflags = p1->p_pfsflags;
 	}
 
 	/*
 	 * This begins the section where we must prevent the parent
 	 * from being swapped.
 	 */
 	_PHOLD(p1);
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Attach the new process to its parent.
 	 *
 	 * If RFNOWAIT is set, the newly created process becomes a child
 	 * of init.  This effectively disassociates the child from the
 	 * parent.
 	 */
 	if (flags & RFNOWAIT)
 		pptr = initproc;
 	else
 		pptr = p1;
 	p2->p_pptr = pptr;
 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
 	sx_xunlock(&proctree_lock);
 
 	/* Inform accounting that we have forked. */
 	p2->p_acflag = AFORK;
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Finish creating the child process.  It will return via a different
 	 * execution path later.  (ie: directly into user mode)
 	 */
 	vm_forkproc(td, p2, td2, flags);
 
 	if (flags == (RFFDG | RFPROC)) {
 		cnt.v_forks++;
 		cnt.v_forkpages += p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize;
 	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
 		cnt.v_vforks++;
 		cnt.v_vforkpages += p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize;
 	} else if (p1 == &proc0) {
 		cnt.v_kthreads++;
 		cnt.v_kthreadpages += p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize;
 	} else {
 		cnt.v_rforks++;
 		cnt.v_rforkpages += p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize;
 	}
 
 	/*
 	 * Both processes are set up, now check if any loadable modules want
 	 * to adjust anything.
 	 *   What if they have an error? XXX
 	 */
 	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
 
 	/*
 	 * If RFSTOPPED not requested, make child runnable and add to
 	 * run queue.
 	 */
 	microuptime(&p2->p_stats->p_start);
 	if ((flags & RFSTOPPED) == 0) {
 		mtx_lock_spin(&sched_lock);
 		p2->p_state = PRS_NORMAL;
 		TD_SET_CAN_RUN(td2);
 		setrunqueue(td2);
 		mtx_unlock_spin(&sched_lock);
 	}
 
 	/*
 	 * Now can be swapped.
 	 */
 	PROC_LOCK(p1);
 	_PRELE(p1);
 
 	/*
 	 * Tell any interested parties about the new process.
 	 */
 	KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
 
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Preserve synchronization semantics of vfork.  If waiting for
 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
 	 * proc (in case of exit).
 	 */
 	PROC_LOCK(p2);
 	while (p2->p_flag & P_PPWAIT)
 		msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0);
 	PROC_UNLOCK(p2);
 
 	/*
 	 * If other threads are waiting, let them continue now
 	 */
 	if (p1->p_flag & P_SA) {
 		PROC_LOCK(p1);
 		thread_single_end();
 		PROC_UNLOCK(p1);
 	}
 
 	/*
 	 * Return child proc pointer to parent.
 	 */
 	mtx_unlock(&Giant);
 	*procp = p2;
 	return (0);
 fail:
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
 			uid);
 	sx_xunlock(&allproc_lock);
 	uma_zfree(proc_zone, newproc);
 	if (p1->p_flag & P_SA) {
 		PROC_LOCK(p1);
 		thread_single_end();
 		PROC_UNLOCK(p1);
 	}
 	tsleep(&forksleep, PUSER, "fork", hz / 2);
 	mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Handle the return of a child process from fork1().  This function
  * is called from the MD fork_trampoline() entry point.
  */
 void
 fork_exit(callout, arg, frame)
 	void (*callout)(void *, struct trapframe *);
 	void *arg;
 	struct trapframe *frame;
 {
 	struct proc *p;
 	struct thread *td;
 
 	/*
 	 * Processes normally resume in mi_switch() after being
 	 * cpu_switch()'ed to, but when children start up they arrive here
 	 * instead, so we must do much the same things as mi_switch() would.
 	 */
 
 	if ((td = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
 		thread_stash(td);
 	}
 	td = curthread;
 	p = td->td_proc;
 	td->td_oncpu = PCPU_GET(cpuid);
 	p->p_state = PRS_NORMAL;
 
 	/*
 	 * Finish setting up thread glue so that it begins execution in a
 	 * non-nested critical section with sched_lock held but not recursed.
 	 */
 	sched_lock.mtx_lock = (uintptr_t)td;
-	sched_lock.mtx_recurse = 0;
+	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 	cpu_critical_fork_exit();
 	CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid,
 	    p->p_comm);
 	if (PCPU_GET(switchtime.sec) == 0)
 		binuptime(PCPU_PTR(switchtime));
 	PCPU_SET(switchticks, ticks);
 	mtx_unlock_spin(&sched_lock);
 
 	/*
 	 * cpu_set_fork_handler intercepts this function call to
 	 * have this call a non-return function to stay in kernel mode.
 	 * initproc has its own fork handler, but it does return.
 	 */
 	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
 	callout(arg, frame);
 
 	/*
 	 * Check if a kernel thread misbehaved and returned from its main
 	 * function.
 	 */
 	PROC_LOCK(p);
 	if (p->p_flag & P_KTHREAD) {
 		PROC_UNLOCK(p);
 		mtx_lock(&Giant);
 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
 		    p->p_comm, p->p_pid);
 		kthread_exit(0);
 	}
 	PROC_UNLOCK(p);
 #ifdef DIAGNOSTIC
 	cred_free_thread(td);
 #endif
 	mtx_assert(&Giant, MA_NOTOWNED);
 }
 
 /*
  * Simplified back end of syscall(), used when returning from fork()
  * directly into user mode.  Giant is not held on entry, and must not
  * be held on return.  This function is passed in to fork_exit() as the
  * first parameter and is called when returning to a new userland process.
  */
 void
 fork_return(td, frame)
 	struct thread *td;
 	struct trapframe *frame;
 {
 
 	userret(td, frame, 0);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(SYS_fork, 0, 0);
 #endif
 	mtx_assert(&Giant, MA_NOTOWNED);
 }
Index: head/sys/kern/sched_4bsd.c
===================================================================
--- head/sys/kern/sched_4bsd.c	(revision 121681)
+++ head/sys/kern/sched_4bsd.c	(revision 121682)
@@ -1,733 +1,730 @@
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 
 /*
  * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
  * the range 100-256 Hz (approximately).
  */
 #define	ESTCPULIM(e) \
     min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
     RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
 #define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
 
 struct ke_sched {
 	int	ske_cpticks;	/* (j) Ticks of cpu time. */
 };
 
 static struct ke_sched ke_sched;
 
 struct ke_sched *kse0_sched = &ke_sched;
 struct kg_sched *ksegrp0_sched = NULL;
 struct p_sched *proc0_sched = NULL;
 struct td_sched *thread0_sched = NULL;
 
 static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
 #define	SCHED_QUANTUM	(hz / 10)	/* Default sched quantum */
 
 static struct callout schedcpu_callout;
 static struct callout roundrobin_callout;
 
 static void	roundrobin(void *arg);
 static void	schedcpu(void *arg);
 static void	sched_setup(void *dummy);
 static void	maybe_resched(struct thread *td);
 static void	updatepri(struct ksegrp *kg);
 static void	resetpriority(struct ksegrp *kg);
 
 SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
 
 /*
  * Global run queue.
  */
 static struct runq runq;
 SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq)
 
 static int
 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val;
 
 	new_val = sched_quantum * tick;
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
         if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (new_val < tick)
 		return (EINVAL);
 	sched_quantum = new_val / tick;
 	hogticks = 2 * sched_quantum;
 	return (0);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
 	0, sizeof sched_quantum, sysctl_kern_quantum, "I",
 	"Roundrobin scheduling quantum in microseconds");
 
 /*
  * Arrange to reschedule if necessary, taking the priorities and
  * schedulers into account.
  */
 static void
 maybe_resched(struct thread *td)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (td->td_priority < curthread->td_priority && curthread->td_kse)
 		curthread->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * Force switch among equal priority processes every 100ms.
  * We don't actually need to force a context switch of the current process.
  * The act of firing the event triggers a context switch to softclock() and
  * then switching back out again which is equivalent to a preemption, thus
  * no further work is needed on the local CPU.
  */
 /* ARGSUSED */
 static void
 roundrobin(void *arg)
 {
 
 #ifdef SMP
 	mtx_lock_spin(&sched_lock);
 	forward_roundrobin();
 	mtx_unlock_spin(&sched_lock);
 #endif
 
 	callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL);
 }
 
 /*
  * Constants for digital decay and forget:
  *	90% of (kg_estcpu) usage in 5 * loadav time
  *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
  *          Note that, as ps(1) mentions, this can let percentages
  *          total over 100% (I've seen 137.9% for 3 processes).
  *
  * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
  *
  * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
  * That is, the system wants to compute a value of decay such
  * that the following for loop:
  * 	for (i = 0; i < (5 * loadavg); i++)
  * 		kg_estcpu *= decay;
  * will compute
  * 	kg_estcpu *= 0.1;
  * for all values of loadavg:
  *
  * Mathematically this loop can be expressed by saying:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * The system computes decay as:
  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
  *
  * We wish to prove that the system's computation of decay
  * will always fulfill the equation:
  * 	decay ** (5 * loadavg) ~= .1
  *
  * If we compute b as:
  * 	b = 2 * loadavg
  * then
  * 	decay = b / (b + 1)
  *
  * We now need to prove two things:
  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
  *
  * Facts:
  *         For x close to zero, exp(x) =~ 1 + x, since
  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
  *         For x close to zero, ln(1+x) =~ x, since
  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
  *         ln(.1) =~ -2.30
  *
  * Proof of (1):
  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
  *	solving for factor,
  *      ln(factor) =~ (-2.30/5*loadav), or
  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
  *
  * Proof of (2):
  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
  *	solving for power,
  *      power*ln(b/(b+1)) =~ -2.30, or
  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
  *
  * Actual power values for the implemented algorithm are as follows:
  *      loadav: 1       2       3       4
  *      power:  5.68    10.32   14.94   19.55
  */
 
 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
 #define	loadfactor(loadav)	(2 * (loadav))
 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
 /* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 /*
  * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
  * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
  * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
  *
  * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
  *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
  *
  * If you don't want to bother with the faster/more-accurate formula, you
  * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
  * (more general) method of calculating the %age of CPU used by a process.
  */
 #define	CCPU_SHIFT	11
 
 /*
  * Recompute process priorities, every hz ticks.
  * MP-safe, called without the Giant mutex.
  */
 /* ARGSUSED */
 static void
 schedcpu(void *arg)
 {
 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 	struct thread *td;
 	struct proc *p;
 	struct kse *ke;
 	struct ksegrp *kg;
 	int awake, realstathz;
 
 	realstathz = stathz ? stathz : hz;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/*
 		 * Prevent state changes and protect run queue.
 		 */
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * Increment time in/out of memory.  We ignore overflow; with
 		 * 16-bit int's (remember them?) overflow takes 45 days.
 		 */
 		p->p_swtime++;
 		FOREACH_KSEGRP_IN_PROC(p, kg) { 
 			awake = 0;
 			FOREACH_KSE_IN_GROUP(kg, ke) {
 				/*
 				 * Increment sleep time (if sleeping).  We
 				 * ignore overflow, as above.
 				 */
 				/*
 				 * The kse slptimes are not touched in wakeup
 				 * because the thread may not HAVE a KSE.
 				 */
 				if (ke->ke_state == KES_ONRUNQ) {
 					awake = 1;
 					ke->ke_flags &= ~KEF_DIDRUN;
 				} else if ((ke->ke_state == KES_THREAD) &&
 				    (TD_IS_RUNNING(ke->ke_thread))) {
 					awake = 1;
 					/* Do not clear KEF_DIDRUN */
 				} else if (ke->ke_flags & KEF_DIDRUN) {
 					awake = 1;
 					ke->ke_flags &= ~KEF_DIDRUN;
 				}
 
 				/*
 				 * ke_pctcpu is only for ps and ttyinfo().
 				 * Do it per kse, and add them up at the end?
 				 * XXXKSE
 				 */
 				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
 				    FSHIFT;
 				/*
 				 * If the kse has been idle the entire second,
 				 * stop recalculating its priority until
 				 * it wakes up.
 				 */
 				if (ke->ke_sched->ske_cpticks == 0)
 					continue;
 #if	(FSHIFT >= CCPU_SHIFT)
 				ke->ke_pctcpu += (realstathz == 100)
 				    ? ((fixpt_t) ke->ke_sched->ske_cpticks) <<
 				    (FSHIFT - CCPU_SHIFT) :
 				    100 * (((fixpt_t) ke->ke_sched->ske_cpticks)
 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
 				ke->ke_pctcpu += ((FSCALE - ccpu) *
 				    (ke->ke_sched->ske_cpticks *
 				    FSCALE / realstathz)) >> FSHIFT;
 #endif
 				ke->ke_sched->ske_cpticks = 0;
 			} /* end of kse loop */
 			/* 
 			 * If there are ANY running threads in this KSEGRP,
 			 * then don't count it as sleeping.
 			 */
 			if (awake) {
 				if (kg->kg_slptime > 1) {
 					/*
 					 * In an ideal world, this should not
 					 * happen, because whoever woke us
 					 * up from the long sleep should have
 					 * unwound the slptime and reset our
 					 * priority before we run at the stale
 					 * priority.  Should KASSERT at some
 					 * point when all the cases are fixed.
 					 */
 					updatepri(kg);
 				}
 				kg->kg_slptime = 0;
 			} else
 				kg->kg_slptime++;
 			if (kg->kg_slptime > 1)
 				continue;
 			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
 		      	resetpriority(kg);
 			FOREACH_THREAD_IN_GROUP(kg, td) {
 				if (td->td_priority >= PUSER) {
 					sched_prio(td, kg->kg_user_pri);
 				}
 			}
 		} /* end of ksegrp loop */
 		mtx_unlock_spin(&sched_lock);
 	} /* end of process loop */
 	sx_sunlock(&allproc_lock);
 	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
 }
 
 /*
  * Recalculate the priority of a process after it has slept for a while.
  * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
  * least six times the loadfactor will decay kg_estcpu to zero.
  */
 static void
 updatepri(struct ksegrp *kg)
 {
 	register fixpt_t loadfac;
 	register unsigned int newcpu;
 
 	loadfac = loadfactor(averunnable.ldavg[0]);
 	if (kg->kg_slptime > 5 * loadfac)
 		kg->kg_estcpu = 0;
 	else {
 		newcpu = kg->kg_estcpu;
 		kg->kg_slptime--;	/* was incremented in schedcpu() */
 		while (newcpu && --kg->kg_slptime)
 			newcpu = decay_cpu(loadfac, newcpu);
 		kg->kg_estcpu = newcpu;
 	}
 	resetpriority(kg);
 }
 
 /*
  * Compute the priority of a process when running in user mode.
  * Arrange to reschedule if the resulting priority is better
  * than that of the current process.
  */
 static void
 resetpriority(struct ksegrp *kg)
 {
 	register unsigned int newpriority;
 	struct thread *td;
 
 	if (kg->kg_pri_class == PRI_TIMESHARE) {
 		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
 		    NICE_WEIGHT * (kg->kg_nice - PRIO_MIN);
 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
 		    PRI_MAX_TIMESHARE);
 		kg->kg_user_pri = newpriority;
 	}
 	FOREACH_THREAD_IN_GROUP(kg, td) {
 		maybe_resched(td);			/* XXXKSE silly */
 	}
 }
 
 /* ARGSUSED */
 static void
 sched_setup(void *dummy)
 {
 
 	if (sched_quantum == 0)
 		sched_quantum = SCHED_QUANTUM;
 	hogticks = 2 * sched_quantum;
 
 	callout_init(&schedcpu_callout, CALLOUT_MPSAFE);
 	callout_init(&roundrobin_callout, 0);
 
 	/* Kick off timeout driven events by calling first time. */
 	roundrobin(NULL);
 	schedcpu(NULL);
 }
 
 /* External interfaces start here */
 int
 sched_runnable(void)
 {
         return runq_check(&runq);
 }
 
 int 
 sched_rr_interval(void)
 {
 	if (sched_quantum == 0)
 		sched_quantum = SCHED_QUANTUM;
 	return (sched_quantum);
 }
 
 /*
  * We adjust the priority of the current process.  The priority of
  * a process gets worse as it accumulates CPU time.  The cpu usage
  * estimator (kg_estcpu) is increased here.  resetpriority() will
  * compute a different priority each time kg_estcpu increases by
  * INVERSE_ESTCPU_WEIGHT
  * (until MAXPRI is reached).  The cpu usage estimator ramps up
  * quite quickly when the process is running (linearly), and decays
  * away exponentially, at a rate which is proportionally slower when
  * the system is busy.  The basic principle is that the system will
  * 90% forget that the process used a lot of CPU time in 5 * loadav
  * seconds.  This causes the system to favor processes which haven't
  * run much recently, and to round-robin among other processes.
  */
 void
 sched_clock(struct thread *td)
 {
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg = td->td_ksegrp;
 	ke = td->td_kse;
 
 	ke->ke_sched->ske_cpticks++;
 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
 	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
 		resetpriority(kg);
 		if (td->td_priority >= PUSER)
 			td->td_priority = kg->kg_user_pri;
 	}
 }
 
 /*
  * charge childs scheduling cpu usage to parent.
  *
  * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
  * Charge it to the ksegrp that did the wait since process estcpu is sum of
  * all ksegrps, this is strictly as expected.  Assume that the child process
  * aggregated all the estcpu into the 'built-in' ksegrp.
  */
 void
 sched_exit(struct proc *p, struct proc *p1)
 {
 	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
 	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
 	sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
 }
 
 void
 sched_exit_kse(struct kse *ke, struct kse *child)
 {
 }
 
 void
 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 }
 
 void
 sched_fork(struct proc *p, struct proc *p1)
 {
 	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
 	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
 	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
 }
 
 void
 sched_fork_kse(struct kse *ke, struct kse *child)
 {
 	child->ke_sched->ske_cpticks = 0;
 }
 
 void
 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	child->kg_estcpu = kg->kg_estcpu;
 }
 
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
 }
 
 void
 sched_nice(struct ksegrp *kg, int nice)
 {
 
 	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg->kg_nice = nice;
 	resetpriority(kg);
 }
 
 void
 sched_class(struct ksegrp *kg, int class)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg->kg_pri_class = class;
 }
 
 /*
  * Adjust the priority of a thread.
  * This may include moving the thread within the KSEGRP,
  * changing the assignment of a kse to the thread,
  * and moving a KSE in the system run queue.
  */
 void
 sched_prio(struct thread *td, u_char prio)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (TD_ON_RUNQ(td)) {
 		adjustrunqueue(td, prio);
 	} else {
 		td->td_priority = prio;
 	}
 }
 
 void
 sched_sleep(struct thread *td, u_char prio)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	td->td_ksegrp->kg_slptime = 0;
 	td->td_priority = prio;
 }
 
 void
 sched_switch(struct thread *td)
 {
 	struct thread *newtd;
-	u_long sched_nest;
 	struct kse *ke;
 	struct proc *p;
 
 	ke = td->td_kse;
 	p = td->td_proc;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_state == KES_THREAD), ("mi_switch: kse state?"));
 
 	td->td_lastcpu = td->td_oncpu;
 	td->td_last_kse = ke;
 	td->td_oncpu = NOCPU;
 	td->td_flags &= ~TDF_NEEDRESCHED;
 	/*
 	 * At the last moment, if this thread is still marked RUNNING,
 	 * then put it back on the run queue as it has not been suspended
 	 * or stopped or any thing else similar.
 	 */
 	if (TD_IS_RUNNING(td)) {
 		/* Put us back on the run queue (kse and all). */
 		setrunqueue(td);
 	} else if (p->p_flag & P_SA) {
 		/*
 		 * We will not be on the run queue. So we must be
 		 * sleeping or similar. As it's available,
 		 * someone else can use the KSE if they need it.
 		 */
 		kse_reassign(ke);
 	}
-	sched_nest = sched_lock.mtx_recurse;
 	newtd = choosethread();
 	if (td != newtd)
 		cpu_switch(td, newtd);
-	sched_lock.mtx_recurse = sched_nest;
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
 }
 
 void
 sched_wakeup(struct thread *td)
 {
 	struct ksegrp *kg;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	kg = td->td_ksegrp;
 	if (kg->kg_slptime > 1)
 		updatepri(kg);
 	kg->kg_slptime = 0;
 	setrunqueue(td);
 	maybe_resched(td);
 }
 
 void
 sched_add(struct thread *td)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE"));
 	KASSERT((ke->ke_thread->td_kse != NULL),
 	    ("runq_add: No KSE on thread"));
 	KASSERT(ke->ke_state != KES_ONRUNQ,
 	    ("runq_add: kse %p (%s) already in run queue", ke,
 	    ke->ke_proc->p_comm));
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("runq_add: process swapped out"));
 	ke->ke_ksegrp->kg_runq_kses++;
 	ke->ke_state = KES_ONRUNQ;
 
 	runq_add(&runq, ke);
 }
 
 void
 sched_rem(struct thread *td)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("runq_remove: process swapped out"));
 	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	runq_remove(&runq, ke);
 	ke->ke_state = KES_THREAD;
 	ke->ke_ksegrp->kg_runq_kses--;
 }
 
 struct kse *
 sched_choose(void)
 {
 	struct kse *ke;
 
 	ke = runq_choose(&runq);
 
 	if (ke != NULL) {
 		runq_remove(&runq, ke);
 		ke->ke_state = KES_THREAD;
 
 		KASSERT((ke->ke_thread != NULL),
 		    ("runq_choose: No thread on KSE"));
 		KASSERT((ke->ke_thread->td_kse != NULL),
 		    ("runq_choose: No KSE on thread"));
 		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 		    ("runq_choose: process swapped out"));
 	}
 	return (ke);
 }
 
 void
 sched_userret(struct thread *td)
 {
 	struct ksegrp *kg;
 	/*
 	 * XXX we cheat slightly on the locking here to avoid locking in
 	 * the usual case.  Setting td_priority here is essentially an
 	 * incomplete workaround for not setting it properly elsewhere.
 	 * Now that some interrupt handlers are threads, not setting it
 	 * properly elsewhere can clobber it in the window between setting
 	 * it here and returning to user mode, so don't waste time setting
 	 * it perfectly here.
 	 */
 	kg = td->td_ksegrp;
 	if (td->td_priority != kg->kg_user_pri) {
 		mtx_lock_spin(&sched_lock);
 		td->td_priority = kg->kg_user_pri;
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 int
 sched_sizeof_kse(void)
 {
 	return (sizeof(struct kse) + sizeof(struct ke_sched));
 }
 int
 sched_sizeof_ksegrp(void)
 {
 	return (sizeof(struct ksegrp));
 }
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread));
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	if (ke)
 		return (ke->ke_pctcpu);
 
 	return (0);
 }
Index: head/sys/kern/sched_ule.c
===================================================================
--- head/sys/kern/sched_ule.c	(revision 121681)
+++ head/sys/kern/sched_ule.c	(revision 121682)
@@ -1,1347 +1,1344 @@
 /*-
  * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resource.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmmeter.h>
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #include <machine/cpu.h>
 
 #define KTR_ULE         KTR_NFS
 
 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 /* XXX This is bogus compatability crap for ps */
 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
 
 static void sched_setup(void *dummy);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
 
 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");
 
 static int sched_strict;
 SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");
 
 static int slice_min = 1;
 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
 
 static int slice_max = 10;
 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
 
 int realstathz;
 int tickincr = 1;
 
 #ifdef SMP
 /* Callout to handle load balancing SMP systems. */
 static struct callout kseq_lb_callout;
 #endif
 
 /*
  * These datastructures are allocated within their parent datastructure but
  * are scheduler specific.
  */
 
 struct ke_sched {
 	int		ske_slice;
 	struct runq	*ske_runq;
 	/* The following variables are only used for pctcpu calculation */
 	int		ske_ltick;	/* Last tick that we were running on */
 	int		ske_ftick;	/* First tick that we were running on */
 	int		ske_ticks;	/* Tick count */
 	/* CPU that we have affinity for. */
 	u_char		ske_cpu;
 };
 #define	ke_slice	ke_sched->ske_slice
 #define	ke_runq		ke_sched->ske_runq
 #define	ke_ltick	ke_sched->ske_ltick
 #define	ke_ftick	ke_sched->ske_ftick
 #define	ke_ticks	ke_sched->ske_ticks
 #define	ke_cpu		ke_sched->ske_cpu
 
 struct kg_sched {
 	int	skg_slptime;		/* Number of ticks we vol. slept */
 	int	skg_runtime;		/* Number of ticks we were running */
 };
 #define	kg_slptime	kg_sched->skg_slptime
 #define	kg_runtime	kg_sched->skg_runtime
 
 struct td_sched {
 	int	std_slptime;
 };
 #define	td_slptime	td_sched->std_slptime
 
 struct td_sched td_sched;
 struct ke_sched ke_sched;
 struct kg_sched kg_sched;
 
 struct ke_sched *kse0_sched = &ke_sched;
 struct kg_sched *ksegrp0_sched = &kg_sched;
 struct p_sched *proc0_sched = NULL;
 struct td_sched *thread0_sched = &td_sched;
 
 /*
  * The priority is primarily determined by the interactivity score.  Thus, we
  * give lower(better) priorities to kse groups that use less CPU.  The nice
  * value is then directly added to this to allow nice to have some effect
  * on latency.
  *
  * PRI_RANGE:	Total priority range for timeshare threads.
  * PRI_NRESV:	Number of nice values.
  * PRI_BASE:	The start of the dynamic range.
  */
 #define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
 #define	SCHED_PRI_NRESV		PRIO_TOTAL
 #define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
 #define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
 #define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
 #define	SCHED_PRI_INTERACT(score)					\
     ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
 
 /*
  * These determine the interactivity of a process.
  *
  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
  *		before throttling back.
  * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time at fork time.
  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
  * INTERACT_THRESH:	Threshhold for placement on the current runq.
  */
 #define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
 #define	SCHED_SLP_RUN_THROTTLE	(100)
 #define	SCHED_INTERACT_MAX	(100)
 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
 #define	SCHED_INTERACT_THRESH	(30)
 
 /*
  * These parameters and macros determine the size of the time slice that is
  * granted to each thread.
  *
  * SLICE_MIN:	Minimum time slice granted, in units of ticks.
  * SLICE_MAX:	Maximum time slice granted.
  * SLICE_RANGE:	Range of available time slices scaled by hz.
  * SLICE_SCALE:	The number slices granted per val in the range of [0, max].
  * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
  */
 #define	SCHED_SLICE_MIN			(slice_min)
 #define	SCHED_SLICE_MAX			(slice_max)
 #define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
 #define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
 #define	SCHED_SLICE_NICE(nice)						\
     (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
 
 /*
  * This macro determines whether or not the kse belongs on the current or
  * next run queue.
  * 
  * XXX nice value should effect how interactive a kg is.
  */
 #define	SCHED_INTERACTIVE(kg)						\
     (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
 #define	SCHED_CURR(kg, ke)						\
     (ke->ke_thread->td_priority != kg->kg_user_pri ||			\
     SCHED_INTERACTIVE(kg))
 
 /*
  * Cpu percentage computation macros and defines.
  *
  * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
  * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
  */
 
 #define	SCHED_CPU_TIME	10
 #define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
 
 /*
  * kseq - per processor runqs and statistics.
  */
 
 #define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */
 
 struct kseq {
 	struct runq	ksq_idle;		/* Queue of IDLE threads. */
 	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
 	struct runq	*ksq_next;		/* Next timeshare queue. */
 	struct runq	*ksq_curr;		/* Current queue. */
 	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
 	int		ksq_load;		/* Aggregate load. */
 	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
 	short		ksq_nicemin;		/* Least nice. */
 #ifdef SMP
 	int		ksq_cpus;	/* Count of CPUs in this kseq. */
 	unsigned int	ksq_rslices;	/* Slices on run queue */
 #endif
 };
 
 /*
  * One kse queue per processor.
  */
 #ifdef SMP
 struct kseq	kseq_cpu[MAXCPU];
 struct kseq	*kseq_idmap[MAXCPU];
 #define	KSEQ_SELF()	(kseq_idmap[PCPU_GET(cpuid)])
 #define	KSEQ_CPU(x)	(kseq_idmap[(x)])
 #else
 struct kseq	kseq_cpu;
 #define	KSEQ_SELF()	(&kseq_cpu)
 #define	KSEQ_CPU(x)	(&kseq_cpu)
 #endif
 
 static void sched_slice(struct kse *ke);
 static void sched_priority(struct ksegrp *kg);
 static int sched_interact_score(struct ksegrp *kg);
 static void sched_interact_update(struct ksegrp *kg);
 void sched_pctcpu_update(struct kse *ke);
 int sched_pickcpu(void);
 
 /* Operations on per processor queues */
 static struct kse * kseq_choose(struct kseq *kseq, int steal);
 static void kseq_setup(struct kseq *kseq);
 static void kseq_add(struct kseq *kseq, struct kse *ke);
 static void kseq_rem(struct kseq *kseq, struct kse *ke);
 static void kseq_nice_add(struct kseq *kseq, int nice);
 static void kseq_nice_rem(struct kseq *kseq, int nice);
 void kseq_print(int cpu);
 #ifdef SMP
 struct kseq * kseq_load_highest(void);
 void kseq_balance(void *arg);
 void kseq_move(struct kseq *from, int cpu);
 #endif
 
 void
 kseq_print(int cpu)
 {
 	struct kseq *kseq;
 	int i;
 
 	kseq = KSEQ_CPU(cpu);
 
 	printf("kseq:\n");
 	printf("\tload:           %d\n", kseq->ksq_load);
 	printf("\tload ITHD:      %d\n", kseq->ksq_loads[PRI_ITHD]);
 	printf("\tload REALTIME:  %d\n", kseq->ksq_loads[PRI_REALTIME]);
 	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
 	printf("\tload IDLE:      %d\n", kseq->ksq_loads[PRI_IDLE]);
 	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
 	printf("\tnice counts:\n");
 	for (i = 0; i < PRIO_TOTAL + 1; i++)
 		if (kseq->ksq_nice[i])
 			printf("\t\t%d = %d\n",
 			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
 }
 
 static void
 kseq_add(struct kseq *kseq, struct kse *ke)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
 	kseq->ksq_load++;
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 	CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
 	    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
 	    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
 #ifdef SMP
 	kseq->ksq_rslices += ke->ke_slice;
 #endif
 }
 
 static void
 kseq_rem(struct kseq *kseq, struct kse *ke)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
 	kseq->ksq_load--;
 	ke->ke_runq = NULL;
 	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
 		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
 #ifdef SMP
 	kseq->ksq_rslices -= ke->ke_slice;
 #endif
 }
 
 static void
 kseq_nice_add(struct kseq *kseq, int nice)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* Normalize to zero. */
 	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
 	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
 		kseq->ksq_nicemin = nice;
 }
 
 static void
 kseq_nice_rem(struct kseq *kseq, int nice) 
 {
 	int n;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	/* Normalize to zero. */
 	n = nice + SCHED_PRI_NHALF;
 	kseq->ksq_nice[n]--;
 	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
 
 	/*
 	 * If this wasn't the smallest nice value or there are more in
 	 * this bucket we can just return.  Otherwise we have to recalculate
 	 * the smallest nice.
 	 */
 	if (nice != kseq->ksq_nicemin ||
 	    kseq->ksq_nice[n] != 0 ||
 	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
 		return;
 
 	for (; n < SCHED_PRI_NRESV + 1; n++)
 		if (kseq->ksq_nice[n]) {
 			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
 			return;
 		}
 }
 
 #ifdef SMP
 /*
  * kseq_balance is a simple CPU load balancing algorithm.  It operates by
  * finding the least loaded and most loaded cpu and equalizing their load
  * by migrating some processes.
  *
  * Dealing only with two CPUs at a time has two advantages.  Firstly, most
  * installations will only have 2 cpus.  Secondly, load balancing too much at
  * once can have an unpleasant effect on the system.  The scheduler rarely has
  * enough information to make perfect decisions.  So this algorithm chooses
  * algorithm simplicity and more gradual effects on load in larger systems.
  *
  * It could be improved by considering the priorities and slices assigned to
  * each task prior to balancing them.  There are many pathological cases with
  * any approach and so the semi random algorithm below may work as well as any.
  *
  */
 void
 kseq_balance(void *arg)
 {
 	struct kseq *kseq;
 	int high_load;
 	int low_load;
 	int high_cpu;
 	int low_cpu;
 	int move;
 	int diff;
 	int i;
 
 	high_cpu = 0;
 	low_cpu = 0;
 	high_load = 0;
 	low_load = -1;
 
 	mtx_lock_spin(&sched_lock);
 	if (smp_started == 0)
 		goto out;
 
 	for (i = 0; i < mp_maxid; i++) {
 		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
 			continue;
 		kseq = KSEQ_CPU(i);
 		if (kseq->ksq_load > high_load) {
 			high_load = kseq->ksq_load;
 			high_cpu = i;
 		}
 		if (low_load == -1 || kseq->ksq_load < low_load) {
 			low_load = kseq->ksq_load;
 			low_cpu = i;
 		}
 	}
 
 	kseq = KSEQ_CPU(high_cpu);
 
 	/*
 	 * Nothing to do.
 	 */
 	if (high_load < kseq->ksq_cpus + 1)
 		goto out;
 
 	high_load -= kseq->ksq_cpus;
 
 	if (low_load >= high_load)
 		goto out;
 
 	diff = high_load - low_load;
 	move = diff / 2;
 	if (diff & 0x1)
 		move++;
 
 	for (i = 0; i < move; i++)
 		kseq_move(kseq, low_cpu);
 
 out:
 	mtx_unlock_spin(&sched_lock);
 	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);
 
 	return;
 }
 
 struct kseq *
 kseq_load_highest(void)
 {
 	struct kseq *kseq;
 	int load;
 	int cpu;
 	int i;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	cpu = 0;
 	load = 0;
 
 	for (i = 0; i < mp_maxid; i++) {
 		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
 			continue;
 		kseq = KSEQ_CPU(i);
 		if (kseq->ksq_load > load) {
 			load = kseq->ksq_load;
 			cpu = i;
 		}
 	}
 	kseq = KSEQ_CPU(cpu);
 
 	if (load > kseq->ksq_cpus)
 		return (kseq);
 
 	return (NULL);
 }
 
 void
 kseq_move(struct kseq *from, int cpu)
 {
 	struct kse *ke;
 
 	ke = kseq_choose(from, 1);
 	runq_remove(ke->ke_runq, ke);
 	ke->ke_state = KES_THREAD;
 	kseq_rem(from, ke);
 
 	ke->ke_cpu = cpu;
 	sched_add(ke->ke_thread);
 }
 #endif
 
 /*
  * Pick the highest priority task we have and return it.   If steal is 1 we
  * will return kses that have been denied slices due to their nice being too
  * low.  In the future we should prohibit stealing interrupt threads as well.
  */
 
 struct kse *
 kseq_choose(struct kseq *kseq, int steal)
 {
 	struct kse *ke;
 	struct runq *swap;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	swap = NULL;
 
 	for (;;) {
 		ke = runq_choose(kseq->ksq_curr);
 		if (ke == NULL) {
 			/*
 			 * We already swaped once and didn't get anywhere.
 			 */
 			if (swap)
 				break;
 			swap = kseq->ksq_curr;
 			kseq->ksq_curr = kseq->ksq_next;
 			kseq->ksq_next = swap;
 			continue;
 		}
 		/*
 		 * If we encounter a slice of 0 the kse is in a
 		 * TIMESHARE kse group and its nice was too far out
 		 * of the range that receives slices. 
 		 */
 		if (ke->ke_slice == 0 && steal == 0) {
 			runq_remove(ke->ke_runq, ke);
 			sched_slice(ke);
 			ke->ke_runq = kseq->ksq_next;
 			runq_add(ke->ke_runq, ke);
 			continue;
 		}
 		return (ke);
 	}
 
 	return (runq_choose(&kseq->ksq_idle));
 }
 
 static void
 kseq_setup(struct kseq *kseq)
 {
 	runq_init(&kseq->ksq_timeshare[0]);
 	runq_init(&kseq->ksq_timeshare[1]);
 	runq_init(&kseq->ksq_idle);
 
 	kseq->ksq_curr = &kseq->ksq_timeshare[0];
 	kseq->ksq_next = &kseq->ksq_timeshare[1];
 
 	kseq->ksq_loads[PRI_ITHD] = 0;
 	kseq->ksq_loads[PRI_REALTIME] = 0;
 	kseq->ksq_loads[PRI_TIMESHARE] = 0;
 	kseq->ksq_loads[PRI_IDLE] = 0;
 	kseq->ksq_load = 0;
 #ifdef SMP
 	kseq->ksq_rslices = 0;
 #endif
 }
 
 static void
 sched_setup(void *dummy)
 {
 #ifdef SMP
 	int i;
 #endif
 
 	slice_min = (hz/100);	/* 10ms */
 	slice_max = (hz/7);	/* ~140ms */
 
 #ifdef SMP
 	/* init kseqs */
 	/* Create the idmap. */
 #ifdef ULE_HTT_EXPERIMENTAL
 	if (smp_topology == NULL) {
 #else
 	if (1) {
 #endif
 		for (i = 0; i < MAXCPU; i++) {
 			kseq_setup(&kseq_cpu[i]);
 			kseq_idmap[i] = &kseq_cpu[i];
 			kseq_cpu[i].ksq_cpus = 1;
 		}
 	} else {
 		int j;
 
 		for (i = 0; i < smp_topology->ct_count; i++) {
 			struct cpu_group *cg;
 
 			cg = &smp_topology->ct_group[i];
 			kseq_setup(&kseq_cpu[i]);
 
 			for (j = 0; j < MAXCPU; j++)
 				if ((cg->cg_mask & (1 << j)) != 0)
 					kseq_idmap[j] = &kseq_cpu[i];
 			kseq_cpu[i].ksq_cpus = cg->cg_count;
 		}
 	}
 	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
 	kseq_balance(NULL);
 #else
 	kseq_setup(KSEQ_SELF());
 #endif
 	mtx_lock_spin(&sched_lock);
 	kseq_add(KSEQ_SELF(), &kse0);
 	mtx_unlock_spin(&sched_lock);
 }
 
 /*
  * Scale the scheduling priority according to the "interactivity" of this
  * process.
  */
 static void
 sched_priority(struct ksegrp *kg)
 {
 	int pri;
 
 	if (kg->kg_pri_class != PRI_TIMESHARE)
 		return;
 
 	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
 	pri += SCHED_PRI_BASE;
 	pri += kg->kg_nice;
 
 	if (pri > PRI_MAX_TIMESHARE)
 		pri = PRI_MAX_TIMESHARE;
 	else if (pri < PRI_MIN_TIMESHARE)
 		pri = PRI_MIN_TIMESHARE;
 
 	kg->kg_user_pri = pri;
 
 	return;
 }
 
 /*
  * Calculate a time slice based on the properties of the kseg and the runq
  * that we're on.  This is only for PRI_TIMESHARE ksegrps.
  */
 static void
 sched_slice(struct kse *ke)
 {
 	struct kseq *kseq;
 	struct ksegrp *kg;
 
 	kg = ke->ke_ksegrp;
 	kseq = KSEQ_CPU(ke->ke_cpu);
 
 	/*
 	 * Rationale:
 	 * KSEs in interactive ksegs get the minimum slice so that we
 	 * quickly notice if it abuses its advantage.
 	 *
 	 * KSEs in non-interactive ksegs are assigned a slice that is
 	 * based on the ksegs nice value relative to the least nice kseg
 	 * on the run queue for this cpu.
 	 *
 	 * If the KSE is less nice than all others it gets the maximum
 	 * slice and other KSEs will adjust their slice relative to
 	 * this when they first expire.
 	 *
 	 * There is 20 point window that starts relative to the least
 	 * nice kse on the run queue.  Slice size is determined by
 	 * the kse distance from the last nice ksegrp.
 	 *
 	 * If you are outside of the window you will get no slice and
 	 * you will be reevaluated each time you are selected on the
 	 * run queue.
 	 *	
 	 */
 
 	if (!SCHED_INTERACTIVE(kg)) {
 		int nice;
 
 		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
 		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
 		    kg->kg_nice < kseq->ksq_nicemin)
 			ke->ke_slice = SCHED_SLICE_MAX;
 		else if (nice <= SCHED_PRI_NTHRESH)
 			ke->ke_slice = SCHED_SLICE_NICE(nice);
 		else
 			ke->ke_slice = 0;
 	} else
 		ke->ke_slice = SCHED_SLICE_MIN;
 
 	CTR6(KTR_ULE,
 	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
 	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
 	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));
 
 	/*
 	 * Check to see if we need to scale back the slp and run time
 	 * in the kg.  This will cause us to forget old interactivity
 	 * while maintaining the current ratio.
 	 */
 	sched_interact_update(kg);
 
 	return;
 }
 
 static void
 sched_interact_update(struct ksegrp *kg)
 {
         int ratio;
 
 	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
 		ratio = ((SCHED_SLP_RUN_MAX * 15) / (kg->kg_runtime +
 		    kg->kg_slptime ));
 		kg->kg_runtime = (kg->kg_runtime * ratio) / 16;
 		kg->kg_slptime = (kg->kg_slptime * ratio) / 16;
 	}
 }
 
 static int
 sched_interact_score(struct ksegrp *kg)
 {
 	int div;
 
 	if (kg->kg_runtime > kg->kg_slptime) {
 		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
 		return (SCHED_INTERACT_HALF +
 		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
 	} if (kg->kg_slptime > kg->kg_runtime) {
 		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
 		return (kg->kg_runtime / div);
 	}
 
 	/*
 	 * This can happen if slptime and runtime are 0.
 	 */
 	return (0);
 
 }
 
 /*
  * This is only somewhat accurate since given many processes of the same
  * priority they will switch when their slices run out, which will be
  * at most SCHED_SLICE_MAX.
  */
 int
 sched_rr_interval(void)
 {
 	return (SCHED_SLICE_MAX);
 }
 
 void
 sched_pctcpu_update(struct kse *ke)
 {
 	/*
 	 * Adjust counters and watermark for pctcpu calc.
 	 */
 	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
 		/*
 		 * Shift the tick count out so that the divide doesn't
 		 * round away our results.
 		 */
 		ke->ke_ticks <<= 10;
 		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
 			    SCHED_CPU_TICKS;
 		ke->ke_ticks >>= 10;
 	} else
 		ke->ke_ticks = 0;
 	ke->ke_ltick = ticks;
 	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
 }
 
 #ifdef SMP
 /* XXX Should be changed to kseq_load_lowest() */
 int
 sched_pickcpu(void)
 {
 	struct kseq *kseq;
 	int load;
 	int cpu;
 	int i;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (!smp_started)
 		return (0);
 
 	load = 0;
 	cpu = 0;
 
 	for (i = 0; i < mp_maxid; i++) {
 		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
 			continue;
 		kseq = KSEQ_CPU(i);
 		if (kseq->ksq_load < load) {
 			cpu = i;
 			load = kseq->ksq_load;
 		}
 	}
 
 	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
 	return (cpu);
 }
 #else
 int
 sched_pickcpu(void)
 {
 	return (0);
 }
 #endif
 
 void
 sched_prio(struct thread *td, u_char prio)
 {
 	struct kse *ke;
 
 	ke = td->td_kse;
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (TD_ON_RUNQ(td)) {
 		/*
 		 * If the priority has been elevated due to priority
 		 * propagation, we may have to move ourselves to a new
 		 * queue.  We still call adjustrunqueue below in case kse
 		 * needs to fix things up.
 		 */
 		if (ke && ((td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
 		    prio < td->td_ksegrp->kg_user_pri) ||
 		    (td->td_ksegrp->kg_pri_class == PRI_IDLE &&
 		    prio < PRI_MIN_IDLE))) {
 			runq_remove(ke->ke_runq, ke);
 			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
 			runq_add(ke->ke_runq, ke);
 		}
 		adjustrunqueue(td, prio);
 	} else
 		td->td_priority = prio;
 }
 
 void
 sched_switch(struct thread *td)
 {
 	struct thread *newtd;
-	u_int sched_nest;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	ke = td->td_kse;
 
 	td->td_last_kse = ke;
         td->td_lastcpu = td->td_oncpu;
 	td->td_oncpu = NOCPU;
         td->td_flags &= ~TDF_NEEDRESCHED;
 
 	if (TD_IS_RUNNING(td)) {
 		if (td->td_proc->p_flag & P_SA) {
 			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
 			setrunqueue(td);
 		} else {
 			/*
 			 * This queue is always correct except for idle threads
 			 * which have a higher priority due to priority
 			 * propagation.
 			 */
 			if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) {
 				if (td->td_priority < PRI_MIN_IDLE)
 					ke->ke_runq = KSEQ_SELF()->ksq_curr;
 				else
 					ke->ke_runq = &KSEQ_SELF()->ksq_idle;
 			}
 			runq_add(ke->ke_runq, ke);
 			/* setrunqueue(td); */
 		}
 	} else {
 		if (ke->ke_runq)
 			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
 		/*
 		 * We will not be on the run queue. So we must be
 		 * sleeping or similar.
 		 */
 		if (td->td_proc->p_flag & P_SA)
 			kse_reassign(ke);
 	}
-	sched_nest = sched_lock.mtx_recurse;
 	newtd = choosethread();
 	if (td != newtd)
 		cpu_switch(td, newtd);
-	sched_lock.mtx_recurse = sched_nest;
 	sched_lock.mtx_lock = (uintptr_t)td;
 
 	td->td_oncpu = PCPU_GET(cpuid);
 }
 
 void
 sched_nice(struct ksegrp *kg, int nice)
 {
 	struct kse *ke;
 	struct thread *td;
 	struct kseq *kseq;
 
 	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
 	mtx_assert(&sched_lock, MA_OWNED);
 	/*
 	 * We need to adjust the nice counts for running KSEs.
 	 */
 	if (kg->kg_pri_class == PRI_TIMESHARE)
 		FOREACH_KSE_IN_GROUP(kg, ke) {
 			if (ke->ke_runq == NULL)
 				continue;
 			kseq = KSEQ_CPU(ke->ke_cpu);
 			kseq_nice_rem(kseq, kg->kg_nice);
 			kseq_nice_add(kseq, nice);
 		}
 	kg->kg_nice = nice;
 	sched_priority(kg);
 	FOREACH_THREAD_IN_GROUP(kg, td)
 		td->td_flags |= TDF_NEEDRESCHED;
 }
 
 void
 sched_sleep(struct thread *td, u_char prio)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	td->td_slptime = ticks;
 	td->td_priority = prio;
 
 	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
 	    td->td_kse, td->td_slptime);
 }
 
 void
 sched_wakeup(struct thread *td)
 {
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	/*
 	 * Let the kseg know how long we slept for.  This is because process
 	 * interactivity behavior is modeled in the kseg.
 	 */
 	if (td->td_slptime) {
 		struct ksegrp *kg;
 		int hzticks;
 
 		kg = td->td_ksegrp;
 		hzticks = ticks - td->td_slptime;
 		kg->kg_slptime += hzticks << 10;
 		sched_interact_update(kg);
 		sched_priority(kg);
 		if (td->td_kse)
 			sched_slice(td->td_kse);
 		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
 		    td->td_kse, hzticks);
 		td->td_slptime = 0;
 	}
 	setrunqueue(td);
         if (td->td_priority < curthread->td_priority)
                 curthread->td_flags |= TDF_NEEDRESCHED;
 }
 
 /*
  * Penalize the parent for creating a new child and initialize the child's
  * priority.
  */
 void
 sched_fork(struct proc *p, struct proc *p1)
 {
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
 	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
 	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
 }
 
 void
 sched_fork_kse(struct kse *ke, struct kse *child)
 {
 
 	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
 	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
 	child->ke_runq = NULL;
 
 	/* Grab our parents cpu estimation information. */
 	child->ke_ticks = ke->ke_ticks;
 	child->ke_ltick = ke->ke_ltick;
 	child->ke_ftick = ke->ke_ftick;
 }
 
 void
 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 
 	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
 	/* XXX Need something better here */
 
 	child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE;
 	child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE;
 	kg->kg_runtime += tickincr << 10;
 	sched_interact_update(kg);
 
 	child->kg_user_pri = kg->kg_user_pri;
 	child->kg_nice = kg->kg_nice;
 }
 
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
 }
 
 void
 sched_class(struct ksegrp *kg, int class)
 {
 	struct kseq *kseq;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (kg->kg_pri_class == class)
 		return;
 
 	FOREACH_KSE_IN_GROUP(kg, ke) {
 		if (ke->ke_state != KES_ONRUNQ &&
 		    ke->ke_state != KES_THREAD)
 			continue;
 		kseq = KSEQ_CPU(ke->ke_cpu);
 
 		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
 		kseq->ksq_loads[PRI_BASE(class)]++;
 
 		if (kg->kg_pri_class == PRI_TIMESHARE)
 			kseq_nice_rem(kseq, kg->kg_nice);
 		else if (class == PRI_TIMESHARE)
 			kseq_nice_add(kseq, kg->kg_nice);
 	}
 
 	kg->kg_pri_class = class;
 }
 
 /*
  * Return some of the child's priority and interactivity to the parent.
  */
 void
 sched_exit(struct proc *p, struct proc *child)
 {
 	/* XXX Need something better here */
 	mtx_assert(&sched_lock, MA_OWNED);
 	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
 	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
 }
 
 void
 sched_exit_kse(struct kse *ke, struct kse *child)
 {
 	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
 }
 
 void
 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
 {
 	/* kg->kg_slptime += child->kg_slptime; */
 	kg->kg_runtime += child->kg_runtime;
 	sched_interact_update(kg);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
 }
 
 void
 sched_clock(struct thread *td)
 {
 	struct kseq *kseq;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	/*
 	 * sched_setup() apparently happens prior to stathz being set.  We
 	 * need to resolve the timers earlier in the boot so we can avoid
 	 * calculating this here.
 	 */
 	if (realstathz == 0) {
 		realstathz = stathz ? stathz : hz;
 		tickincr = hz / realstathz;
 		/*
 		 * XXX This does not work for values of stathz that are much
 		 * larger than hz.
 		 */
 		if (tickincr == 0)
 			tickincr = 1;
 	}
 
 	ke = td->td_kse;
 	kg = ke->ke_ksegrp;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((td != NULL), ("schedclock: null thread pointer"));
 
 	/* Adjust ticks for pctcpu */
 	ke->ke_ticks++;
 	ke->ke_ltick = ticks;
 
 	/* Go up to one second beyond our max and then trim back down */
 	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
 		sched_pctcpu_update(ke);
 
 	if (td->td_flags & TDF_IDLETD)
 		return;
 
 	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
 	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);
 	/*
 	 * We only do slicing code for TIMESHARE ksegrps.
 	 */
 	if (kg->kg_pri_class != PRI_TIMESHARE)
 		return;
 	/*
 	 * We used a tick charge it to the ksegrp so that we can compute our
 	 * interactivity.
 	 */
 	kg->kg_runtime += tickincr << 10;
 	sched_interact_update(kg);
 
 	/*
 	 * We used up one time slice.
 	 */
 	ke->ke_slice--;
 	kseq = KSEQ_SELF();
 #ifdef SMP
 	kseq->ksq_rslices--;
 #endif
 
 	if (ke->ke_slice > 0)
 		return;
 	/*
 	 * We're out of time, recompute priorities and requeue.
 	 */
 	kseq_rem(kseq, ke);
 	sched_priority(kg);
 	sched_slice(ke);
 	if (SCHED_CURR(kg, ke))
 		ke->ke_runq = kseq->ksq_curr;
 	else
 		ke->ke_runq = kseq->ksq_next;
 	kseq_add(kseq, ke);
 	td->td_flags |= TDF_NEEDRESCHED;
 }
 
 int
 sched_runnable(void)
 {
 	struct kseq *kseq;
 	int load;
 
 	load = 1;
 
 	mtx_lock_spin(&sched_lock);
 	kseq = KSEQ_SELF();
 
 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
 		if (kseq->ksq_load > 0)
 			goto out;
 	} else
 		if (kseq->ksq_load - 1 > 0)
 			goto out;
 #ifdef SMP
 	/*
 	 * For SMP we may steal other processor's KSEs.  Just search until we
 	 * verify that at least on other cpu has a runnable task.
 	 */
 	if (smp_started) {
 		int i;
 
 		for (i = 0; i < mp_maxid; i++) {
 			if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
 				continue;
 			kseq = KSEQ_CPU(i);
 			if (kseq->ksq_load > kseq->ksq_cpus)
 				goto out;
 		}
 	}
 #endif
 	load = 0;
 out:
 	mtx_unlock_spin(&sched_lock);
 	return (load);
 }
 
 void
 sched_userret(struct thread *td)
 {
 	struct ksegrp *kg;
 
 	kg = td->td_ksegrp;
 	
 	if (td->td_priority != kg->kg_user_pri) {
 		mtx_lock_spin(&sched_lock);
 		td->td_priority = kg->kg_user_pri;
 		mtx_unlock_spin(&sched_lock);
 	}
 }
 
 struct kse *
 sched_choose(void)
 {
 	struct kseq *kseq;
 	struct kse *ke;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 #ifdef SMP
 retry:
 #endif
 	kseq = KSEQ_SELF();
 	ke = kseq_choose(kseq, 0);
 	if (ke) {
 		runq_remove(ke->ke_runq, ke);
 		ke->ke_state = KES_THREAD;
 
 		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
 			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
 			    ke, ke->ke_runq, ke->ke_slice,
 			    ke->ke_thread->td_priority);
 		}
 		return (ke);
 	}
 
 #ifdef SMP
 	if (smp_started) {
 		/*
 		 * Find the cpu with the highest load and steal one proc.
 		 */
 		if ((kseq = kseq_load_highest()) == NULL)
 			return (NULL);
 
 		/*
 		 * Remove this kse from this kseq and runq and then requeue
 		 * on the current processor.  Then we will dequeue it
 		 * normally above.
 		 */
 		kseq_move(kseq, PCPU_GET(cpuid));
 		goto retry;
 	}
 #endif
 
 	return (NULL);
 }
 
 void
 sched_add(struct thread *td)
 {
 	struct kseq *kseq;
 	struct ksegrp *kg;
 	struct kse *ke;
 
 	ke = td->td_kse;
 	kg = td->td_ksegrp;
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
 	KASSERT((ke->ke_thread->td_kse != NULL),
 	    ("sched_add: No KSE on thread"));
 	KASSERT(ke->ke_state != KES_ONRUNQ,
 	    ("sched_add: kse %p (%s) already in run queue", ke,
 	    ke->ke_proc->p_comm));
 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
 	    ("sched_add: process swapped out"));
 	KASSERT(ke->ke_runq == NULL,
 	    ("sched_add: KSE %p is still assigned to a run queue", ke));
 
 
 	switch (PRI_BASE(kg->kg_pri_class)) {
 	case PRI_ITHD:
 	case PRI_REALTIME:
 		kseq = KSEQ_SELF();
 		ke->ke_runq = kseq->ksq_curr;
 		ke->ke_slice = SCHED_SLICE_MAX;
 		ke->ke_cpu = PCPU_GET(cpuid);
 		break;
 	case PRI_TIMESHARE:
 		kseq = KSEQ_CPU(ke->ke_cpu);
 		if (SCHED_CURR(kg, ke))
 			ke->ke_runq = kseq->ksq_curr;
 		else
 			ke->ke_runq = kseq->ksq_next;
 		break;
 	case PRI_IDLE:
 		kseq = KSEQ_CPU(ke->ke_cpu);
 		/*
 		 * This is for priority prop.
 		 */
 		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
 			ke->ke_runq = kseq->ksq_curr;
 		else
 			ke->ke_runq = &kseq->ksq_idle;
 		ke->ke_slice = SCHED_SLICE_MIN;
 		break;
 	default:
 		panic("Unknown pri class.\n");
 		break;
 	}
 
 	ke->ke_ksegrp->kg_runq_kses++;
 	ke->ke_state = KES_ONRUNQ;
 
 	runq_add(ke->ke_runq, ke);
 	kseq_add(kseq, ke);
 }
 
 void
 sched_rem(struct thread *td)
 {
 	struct kseq *kseq;
 	struct kse *ke;
 
 	ke = td->td_kse;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));
 
 	ke->ke_state = KES_THREAD;
 	ke->ke_ksegrp->kg_runq_kses--;
 	kseq = KSEQ_CPU(ke->ke_cpu);
 	runq_remove(ke->ke_runq, ke);
 	kseq_rem(kseq, ke);
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	fixpt_t pctcpu;
 	struct kse *ke;
 
 	pctcpu = 0;
 	ke = td->td_kse;
 	if (ke == NULL)
 		return (0);
 
 	mtx_lock_spin(&sched_lock);
 	if (ke->ke_ticks) {
 		int rtick;
 
 		/*
 		 * Don't update more frequently than twice a second.  Allowing
 		 * this causes the cpu usage to decay away too quickly due to
 		 * rounding errors.
 		 */
 		if (ke->ke_ltick < (ticks - (hz / 2)))
 			sched_pctcpu_update(ke);
 		/* How many rtick per second ? */
 		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
 		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
 	}
 
 	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
 	mtx_unlock_spin(&sched_lock);
 
 	return (pctcpu);
 }
 
 int
 sched_sizeof_kse(void)
 {
 	return (sizeof(struct kse) + sizeof(struct ke_sched));
 }
 
 int
 sched_sizeof_ksegrp(void)
 {
 	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
 }
 
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
 
 int
 sched_sizeof_thread(void)
 {
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }