Index: head/sys/kern/kern_kse.c =================================================================== --- head/sys/kern/kern_kse.c (revision 106179) +++ head/sys/kern/kern_kse.c (revision 106180) @@ -1,1711 +1,1731 @@ /* * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * KSEGRP related storage. */ static uma_zone_t ksegrp_zone; static uma_zone_t kse_zone; static uma_zone_t thread_zone; /* DEBUG ONLY */ SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int oiks_debug = 1; /* 0 disable, 1 printf, 2 enter debugger */ SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW, &oiks_debug, 0, "OIKS thread debug"); static int max_threads_per_proc = 10; SYSCTL_INT(_kern_threads, OID_AUTO, max_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses); TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps); struct mtx zombie_thread_lock; MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock, "zombie_thread_lock", MTX_SPIN); void kse_purge(struct proc *p, struct thread *td); /* * Pepare a thread for use. */ static void thread_ctor(void *mem, int size, void *arg) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_flags |= TDF_UNBOUND; } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. 
*/ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif } /* * Initialize type-stable parts of a thread (when newly created). */ static void thread_init(void *mem, int size) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; mtx_lock(&Giant); pmap_new_thread(td, 0); mtx_unlock(&Giant); cpu_thread_setup(td); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; pmap_dispose_thread(td); } /* * KSE is linked onto the idle queue. */ void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; ke->ke_state = KES_UNQUEUED; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; } void kse_unlink(struct kse *ke) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; if (ke->ke_state == KES_IDLE) { kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); } TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); if (--kg->kg_kses == 0) { ksegrp_unlink(kg); } /* * Aggregate stats from the KSE */ kse_stash(ke); } void ksegrp_link(struct ksegrp *kg, struct proc *p) { TAILQ_INIT(&kg->kg_threads); TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ TAILQ_INIT(&kg->kg_iq); /* idle kses in ksegrp */ TAILQ_INIT(&kg->kg_lq); /* loan kses in ksegrp */ kg->kg_proc = p; /* the following counters are in the -zero- section and may not need clearing */ kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; kg->kg_idle_kses = 0; kg->kg_loan_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ /* link it in now that it's consistent */ p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } void ksegrp_unlink(struct ksegrp *kg) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); p = kg->kg_proc; KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)), ("kseg_unlink: residual threads or KSEs")); TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * Aggregate stats from the KSE */ ksegrp_stash(kg); } /* * for a newly created process, * link up a the structure and its initial threads etc. 
 */
void
proc_linkup(struct proc *p, struct ksegrp *kg,
	    struct kse *ke, struct thread *td)
{
	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
	p->p_numksegrps = 0;
	p->p_numthreads = 0;
	ksegrp_link(kg, p);
	kse_link(ke, kg);
	thread_link(td, kg);
}

int
kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
{
+	struct proc *p;
+	struct thread *td2;

-	return(ENOSYS);
+	p = td->td_proc;
+	mtx_lock_spin(&sched_lock);
+	FOREACH_THREAD_IN_PROC(p, td2) {
+		if (td2->td_mailbox == uap->tmbx) {
+			td2->td_flags |= TDF_INTERRUPT;
+			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) {
+				if (td2->td_flags & TDF_CVWAITQ)
+					cv_abort(td2);
+				else
+					abortsleep(td2);
+			}
+			mtx_unlock_spin(&sched_lock);
+			return 0;
+		}
+	}
+	mtx_unlock_spin(&sched_lock);
+	return(ESRCH);
}

int
kse_exit(struct thread *td, struct kse_exit_args *uap)
{
	struct proc *p;
	struct ksegrp *kg;

	p = td->td_proc;
	/* KSE-enabled processes only, please. */
	if (!(p->p_flag & P_KSES))
		return EINVAL;
	/* must be a bound thread */
	if (td->td_flags & TDF_UNBOUND)
		return EINVAL;
	kg = td->td_ksegrp;
	/* serialize killing kse */
	PROC_LOCK(p);
	mtx_lock_spin(&sched_lock);
	if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) {
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
		return (EDEADLK);
	}
	if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) {
		p->p_flag &= ~P_KSES;
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
	} else {
		while (mtx_owned(&Giant))
			mtx_unlock(&Giant);
		td->td_kse->ke_flags |= KEF_EXIT;
		thread_exit();
		/* NOTREACHED */
	}
	return 0;
}

int
kse_release(struct thread *td, struct kse_release_args *uap)
{
	struct proc *p;

	p = td->td_proc;
	/* KSE-enabled processes only, please. */
	if (p->p_flag & P_KSES) {
		PROC_LOCK(p);
		mtx_lock_spin(&sched_lock);
		thread_exit();
		/* NOTREACHED */
	}
	return (EINVAL);
}

/* struct kse_wakeup_args {
	struct kse_mailbox *mbx;
}; */
int
kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
{
	struct proc *p;
	struct kse *ke, *ke2;
	struct ksegrp *kg;

	p = td->td_proc;
	/* KSE-enabled processes only, please. */
	if (!(p->p_flag & P_KSES))
		return EINVAL;
	if (td->td_standin == NULL)
		td->td_standin = thread_alloc();
	ke = NULL;
	mtx_lock_spin(&sched_lock);
	if (uap->mbx) {
		FOREACH_KSEGRP_IN_PROC(p, kg) {
			FOREACH_KSE_IN_GROUP(kg, ke2) {
				if (ke2->ke_mailbox != uap->mbx)
					continue;
				if (ke2->ke_state == KES_IDLE) {
					ke = ke2;
					goto found;
				} else {
					mtx_unlock_spin(&sched_lock);
					td->td_retval[0] = 0;
					td->td_retval[1] = 0;
					return 0;
				}
			}
		}
	} else {
		kg = td->td_ksegrp;
		ke = TAILQ_FIRST(&kg->kg_iq);
	}
	if (ke == NULL) {
		mtx_unlock_spin(&sched_lock);
		return ESRCH;
	}
found:
	thread_schedule_upcall(td, ke);
	mtx_unlock_spin(&sched_lock);
	td->td_retval[0] = 0;
	td->td_retval[1] = 0;
	return 0;
}

/*
 * No new KSEG: first call: use current KSE, don't schedule an upcall
 * All other situations, do allocate a new KSE and schedule an upcall on it.
 */
/* struct kse_create_args {
	struct kse_mailbox *mbx;
	int newgroup;
}; */
int
kse_create(struct thread *td, struct kse_create_args *uap)
{
	struct kse *newke;
	struct kse *ke;
	struct ksegrp *newkg;
	struct ksegrp *kg;
	struct proc *p;
	struct kse_mailbox mbx;
	int err;

	p = td->td_proc;
	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
		return (err);

	p->p_flag |= P_KSES; /* easier to just set it than to test and set */
	kg = td->td_ksegrp;
	if (uap->newgroup) {
		/*
		 * If we want a new KSEGRP it doesn't matter whether
		 * we have already fired up KSE mode before or not.
* We put the process in KSE mode and create a new KSEGRP * and KSE. If our KSE has not got a mailbox yet then * that doesn't matter, just leave it that way. It will * ensure that this thread stay BOUND. It's possible * that the call came form a threaded library and the main * program knows nothing of threads. */ newkg = ksegrp_alloc(); bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&kg->kg_startcopy, &newkg->kg_startcopy, RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); newke = kse_alloc(); } else { /* * Otherwise, if we have already set this KSE * to have a mailbox, we want to make another KSE here, * but only if there are not already the limit, which * is 1 per CPU max. * * If the current KSE doesn't have a mailbox we just use it * and give it one. * * Because we don't like to access * the KSE outside of schedlock if we are UNBOUND, * (because it can change if we are preempted by an interrupt) * we can deduce it as having a mailbox if we are UNBOUND, * and only need to actually look at it if we are BOUND, * which is safe. */ if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) { #if 0 /* while debugging */ #ifdef SMP if (kg->kg_kses > mp_ncpus) #endif return (EPROCLIM); #endif newke = kse_alloc(); } else { newke = NULL; } newkg = NULL; } if (newke) { bzero(&newke->ke_startzero, RANGEOF(struct kse, ke_startzero, ke_endzero)); #if 0 bcopy(&ke->ke_startcopy, &newke->ke_startcopy, RANGEOF(struct kse, ke_startcopy, ke_endcopy)); #endif /* For the first call this may not have been set */ if (td->td_standin == NULL) { td->td_standin = thread_alloc(); } mtx_lock_spin(&sched_lock); if (newkg) ksegrp_link(newkg, p); else newkg = kg; kse_link(newke, newkg); if (p->p_sflag & PS_NEEDSIGCHK) newke->ke_flags |= KEF_ASTPENDING; newke->ke_mailbox = uap->mbx; newke->ke_upcall = mbx.km_func; bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t)); thread_schedule_upcall(td, newke); mtx_unlock_spin(&sched_lock); } else { /* * If we didn't allocate a new KSE then the we are using * the exisiting (BOUND) kse. */ ke = td->td_kse; ke->ke_mailbox = uap->mbx; ke->ke_upcall = mbx.km_func; bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t)); } /* * Fill out the KSE-mode specific fields of the new kse. */ td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } /* * Fill a ucontext_t with a thread's context information. * * This is an analogue to getcontext(3). */ void thread_getcontext(struct thread *td, ucontext_t *uc) { /* * XXX this is declared in a MD include file, i386/include/ucontext.h but * is used in MI code. */ #ifdef __i386__ get_mcontext(td, &uc->uc_mcontext); #endif uc->uc_sigmask = td->td_proc->p_sigmask; } /* * Set a thread's context from a ucontext_t. * * This is an analogue to setcontext(3). */ int thread_setcontext(struct thread *td, ucontext_t *uc) { int ret; /* * XXX this is declared in a MD include file, i386/include/ucontext.h but * is used in MI code. */ #ifdef __i386__ ret = set_mcontext(td, &uc->uc_mcontext); #else ret = ENOSYS; #endif if (ret == 0) { SIG_CANTMASK(uc->uc_sigmask); PROC_LOCK(td->td_proc); td->td_proc->p_sigmask = uc->uc_sigmask; PROC_UNLOCK(td->td_proc); } return (ret); } /* * Initialize global thread allocation resources. */ void threadinit(void) { #ifndef __ia64__ thread_zone = uma_zcreate("THREAD", sizeof (struct thread), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, 0); #else /* * XXX the ia64 kstack allocator is really lame and is at the mercy * of contigmallloc(). 
This hackery is to pre-construct a whole * pile of thread structures with associated kernel stacks early * in the system startup while contigmalloc() still works. Once we * have them, keep them. Sigh. */ thread_zone = uma_zcreate("THREAD", sizeof (struct thread), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); uma_prealloc(thread_zone, 512); /* XXX arbitary */ #endif ksegrp_zone = uma_zcreate("KSEGRP", sizeof (struct ksegrp), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); kse_zone = uma_zcreate("KSE", sizeof (struct kse), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Stash an embarasingly extra thread into the zombie thread queue. */ void thread_stash(struct thread *td) { mtx_lock_spin(&zombie_thread_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq); mtx_unlock_spin(&zombie_thread_lock); } /* * Stash an embarasingly extra kse into the zombie kse queue. */ void kse_stash(struct kse *ke) { mtx_lock_spin(&zombie_thread_lock); TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq); mtx_unlock_spin(&zombie_thread_lock); } /* * Stash an embarasingly extra ksegrp into the zombie ksegrp queue. */ void ksegrp_stash(struct ksegrp *kg) { mtx_lock_spin(&zombie_thread_lock); TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp); mtx_unlock_spin(&zombie_thread_lock); } /* * Reap zombie threads. */ void thread_reap(void) { struct thread *td_first, *td_next; struct kse *ke_first, *ke_next; struct ksegrp *kg_first, * kg_next; /* * don't even bother to lock if none at this instant * We really don't care about the next instant.. */ if ((!TAILQ_EMPTY(&zombie_threads)) || (!TAILQ_EMPTY(&zombie_kses)) || (!TAILQ_EMPTY(&zombie_ksegrps))) { mtx_lock_spin(&zombie_thread_lock); td_first = TAILQ_FIRST(&zombie_threads); ke_first = TAILQ_FIRST(&zombie_kses); kg_first = TAILQ_FIRST(&zombie_ksegrps); if (td_first) TAILQ_INIT(&zombie_threads); if (ke_first) TAILQ_INIT(&zombie_kses); if (kg_first) TAILQ_INIT(&zombie_ksegrps); mtx_unlock_spin(&zombie_thread_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_runq); thread_free(td_first); td_first = td_next; } while (ke_first) { ke_next = TAILQ_NEXT(ke_first, ke_procq); kse_free(ke_first); ke_first = ke_next; } while (kg_first) { kg_next = TAILQ_NEXT(kg_first, kg_ksegrp); ksegrp_free(kg_first); kg_first = kg_next; } } } /* * Allocate a ksegrp. */ struct ksegrp * ksegrp_alloc(void) { return (uma_zalloc(ksegrp_zone, M_WAITOK)); } /* * Allocate a kse. */ struct kse * kse_alloc(void) { return (uma_zalloc(kse_zone, M_WAITOK)); } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a ksegrp. */ void ksegrp_free(struct ksegrp *td) { uma_zfree(ksegrp_zone, td); } /* * Deallocate a kse. */ void kse_free(struct kse *td) { uma_zfree(kse_zone, td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { uma_zfree(thread_zone, td); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the ksegrp structure. */ int thread_export_context(struct thread *td) { struct proc *p; struct ksegrp *kg; uintptr_t mbx; void *addr; int error; ucontext_t uc; p = td->td_proc; kg = td->td_ksegrp; /* Export the user/machine context. 
*/ #if 0 addr = (caddr_t)td->td_mailbox + offsetof(struct kse_thr_mailbox, tm_context); #else /* if user pointer arithmetic is valid in the kernel */ addr = (void *)(&td->td_mailbox->tm_context); #endif error = copyin(addr, &uc, sizeof(ucontext_t)); if (error == 0) { thread_getcontext(td, &uc); error = copyout(&uc, addr, sizeof(ucontext_t)); } if (error) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (error); } /* get address in latest mbox of list pointer */ #if 0 addr = (caddr_t)td->td_mailbox + offsetof(struct kse_thr_mailbox , tm_next); #else /* if user pointer arithmetic is valid in the kernel */ addr = (void *)(&td->td_mailbox->tm_next); #endif /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = td->td_mailbox; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * Take the list of completed mailboxes for this KSEGRP and put them on this * KSE's mailbox as it's the next one going up. */ static int thread_link_mboxes(struct ksegrp *kg, struct kse *ke) { struct proc *p = kg->kg_proc; void *addr; uintptr_t mbx; #if 0 addr = (caddr_t)ke->ke_mailbox + offsetof(struct kse_mailbox, km_completed); #else /* if user pointer arithmetic is valid in the kernel */ addr = (void *)(&ke->ke_mailbox->km_completed); #endif for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } /* XXXKSE could use atomic CMPXCH here */ PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * Discard the current thread and exit from its context. * * Because we can't free a thread while we're operating under its context, * push the current thread into our KSE's ke_tdspare slot, freeing the * thread that might be there currently. Because we know that only this * processor will run our KSE, we needn't worry about someone else grabbing * our context before we do a cpu_throw. */ void thread_exit(void) { struct thread *td; struct kse *ke; struct proc *p; struct ksegrp *kg; td = curthread; kg = td->td_ksegrp; p = td->td_proc; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); KASSERT(ke != NULL, ("thread exiting without a kse")); KASSERT(kg != NULL, ("thread exiting without a kse group")); PROC_LOCK_ASSERT(p, MA_OWNED); CTR1(KTR_PROC, "thread_exit: thread %p", td); KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); if (ke->ke_tdspare != NULL) { thread_stash(ke->ke_tdspare); ke->ke_tdspare = NULL; } if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff. */ if (p->p_numthreads > 1) { /* * Unlink this thread from its proc and the kseg. * In keeping with the other structs we probably should * have a thread_unlink() that does some of this but it * would only be called from here (I think) so it would * be a waste. (might be useful for proc_fini() as well.) */ TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; TAILQ_REMOVE(&kg->kg_threads, td, td_kglist); kg->kg_numthreads--; /* * The test below is NOT true if we are the * sole exiting thread. 
P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } /* Reassign this thread's KSE. */ ke->ke_thread = NULL; td->td_kse = NULL; ke->ke_state = KES_UNQUEUED; KASSERT((ke->ke_bound != td), ("thread_exit: entered with ke_bound set")); /* * The reason for all this hoopla is * an attempt to stop our thread stack from being freed * until AFTER we have stopped running on it. * Since we are under schedlock, almost any method where * it is eventually freed by someone else is probably ok. * (Especially if they do it under schedlock). We could * almost free it here if we could be certain that * the uma code wouldn't pull it apart immediatly, * but unfortunatly we can not guarantee that. * * For threads that are exiting and NOT killing their * KSEs we can just stash it in the KSE, however * in the case where the KSE is also being deallocated, * we need to store it somewhere else. It turns out that * we will never free the last KSE, so there is always one * other KSE available. We might as well just choose one * and stash it there. Being under schedlock should make that * safe. * * In borrower threads, we can stash it in the lender * Where it won't be needed until this thread is long gone. * Borrower threads can't kill their KSE anyhow, so even * the KSE would be a safe place for them. It is not * necessary to have a KSE (or KSEGRP) at all beyond this * point, while we are under the protection of schedlock. * * Either give the KSE to another thread to use (or make * it idle), or free it entirely, possibly along with its * ksegrp if it's the last one. */ if (ke->ke_flags & KEF_EXIT) { kse_unlink(ke); /* * Designate another KSE to hold our thread. * Safe as long as we abide by whatever lock * we control it with.. The other KSE will not * be able to run it until we release the schelock, * but we need to be careful about it deciding to * write to the stack before then. Luckily * I believe that while another thread's * standin thread can be used in this way, the * spare thread for the KSE cannot be used without * holding schedlock at least once. */ ke = FIRST_KSE_IN_PROC(p); } else { kse_reassign(ke); } if (ke->ke_bound) { /* * WE are a borrower.. * stash our thread with the owner. */ if (ke->ke_bound->td_standin) { thread_stash(ke->ke_bound->td_standin); } ke->ke_bound->td_standin = td; } else { if (ke->ke_tdspare != NULL) { thread_stash(ke->ke_tdspare); ke->ke_tdspare = NULL; } ke->ke_tdspare = td; } PROC_UNLOCK(p); td->td_state = TDS_INACTIVE; td->td_proc = NULL; td->td_ksegrp = NULL; td->td_last_kse = NULL; } else { PROC_UNLOCK(p); } cpu_throw(); /* NOTREACHED */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. 
*/ void thread_link(struct thread *td, struct ksegrp *kg) { struct proc *p; p = kg->kg_proc; td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_ksegrp = kg; td->td_last_kse = NULL; LIST_INIT(&td->td_contested); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); p->p_numthreads++; kg->kg_numthreads++; if (oiks_debug && p->p_numthreads > max_threads_per_proc) { printf("OIKS %d\n", p->p_numthreads); if (oiks_debug > 1) Debugger("OIKS"); } td->td_kse = NULL; } void kse_purge(struct proc *p, struct thread *td) { struct kse *ke; struct ksegrp *kg; KASSERT(p->p_numthreads == 1, ("bad thread number")); mtx_lock_spin(&sched_lock); while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) { while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses--; if (ke->ke_tdspare) thread_stash(ke->ke_tdspare); kse_stash(ke); } TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) || ((kg->kg_kses == 1) && (kg == td->td_ksegrp)), ("wrong kg_kses")); if (kg != td->td_ksegrp) { ksegrp_stash(kg); } } TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp); p->p_numksegrps++; mtx_unlock_spin(&sched_lock); } /* * Create a thread and schedule it for upcall on the KSE given. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse *ke) { struct thread *td2; struct ksegrp *kg; int newkse; mtx_assert(&sched_lock, MA_OWNED); newkse = (ke != td->td_kse); /* * If the kse is already owned by another thread then we can't * schedule an upcall because the other thread must be BOUND * which means it is not in a position to take an upcall. * We must be borrowing the KSE to allow us to complete some in-kernel * work. When we complete, the Bound thread will have teh chance to * complete. This thread will sleep as planned. Hopefully there will * eventually be un unbound thread that can be converted to an * upcall to report the completion of this thread. */ if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) { return (NULL); } KASSERT((ke->ke_bound == NULL), ("kse already bound")); if (ke->ke_state == KES_IDLE) { kg = ke->ke_ksegrp; TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; ke->ke_state = KES_UNQUEUED; } if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { if (newkse) panic("no reserve thread when called with a new kse"); /* * If called from (e.g.) sleep and we do not have * a reserve thread, then we've used it, so do not * create an upcall. */ return(NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); bzero(&td2->td_startzero, (unsigned)RANGEOF(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); thread_link(td2, ke->ke_ksegrp); cpu_set_upcall(td2, td->td_pcb); /* * XXXKSE do we really need this? (default values for the * frame). */ bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe)); /* * Bind the new thread to the KSE, * and if it's our KSE, lend it back to ourself * so we can continue running. 
*/ td2->td_ucred = crhold(td->td_ucred); td2->td_flags = TDF_UPCALLING; /* note: BOUND */ td2->td_kse = ke; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; /* * If called from msleep(), we are working on the current * KSE so fake that we borrowed it. If called from * kse_create(), don't, as we have a new kse too. */ if (!newkse) { /* * This thread will be scheduled when the current thread * blocks, exits or tries to enter userspace, (which ever * happens first). When that happens the KSe will "revert" * to this thread in a BOUND manner. Since we are called * from msleep() this is going to be "very soon" in nearly * all cases. */ ke->ke_bound = td2; TD_SET_LOAN(td2); } else { ke->ke_bound = NULL; ke->ke_thread = td2; ke->ke_state = KES_THREAD; setrunqueue(td2); } return (td2); /* bogus.. should be a void function */ } /* * Schedule an upcall to notify a KSE process recieved signals. * * XXX - Modifying a sigset_t like this is totally bogus. */ struct thread * signal_upcall(struct proc *p, int sig) { struct thread *td, *td2; struct kse *ke; sigset_t ss; int error; PROC_LOCK_ASSERT(p, MA_OWNED); return (NULL); td = FIRST_THREAD_IN_PROC(p); ke = td->td_kse; PROC_UNLOCK(p); error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t)); PROC_LOCK(p); if (error) return (NULL); SIGADDSET(ss, sig); PROC_UNLOCK(p); error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t)); PROC_LOCK(p); if (error) return (NULL); if (td->td_standin == NULL) td->td_standin = thread_alloc(); mtx_lock_spin(&sched_lock); td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */ mtx_unlock_spin(&sched_lock); return (td2); } /* * setup done on the thread when it enters the kernel. * XXXKSE Presently only for syscalls but eventually all kernel entries. */ void thread_user_enter(struct proc *p, struct thread *td) { struct kse *ke; /* * First check that we shouldn't just abort. * But check if we are the single thread first! * XXX p_singlethread not locked, but should be safe. */ if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) { PROC_LOCK(p); mtx_lock_spin(&sched_lock); thread_exit(); /* NOTREACHED */ } /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. There is always the * possibility that we could do this lazily (in sleep()), * but for now do it every time. */ ke = td->td_kse; if (ke->ke_mailbox != NULL) { #if 0 td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox + offsetof(struct kse_mailbox, km_curthread)); #else /* if user pointer arithmetic is ok in the kernel */ td->td_mailbox = (void *)fuword( (void *)&ke->ke_mailbox->km_curthread); #endif if ((td->td_mailbox == NULL) || (td->td_mailbox == (void *)-1)) { td->td_mailbox = NULL; /* single thread it.. */ td->td_flags &= ~TDF_UNBOUND; } else { if (td->td_standin == NULL) td->td_standin = thread_alloc(); td->td_flags |= TDF_UNBOUND; } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. 
*/ int thread_userret(struct thread *td, struct trapframe *frame) { int error; int unbound; struct kse *ke; struct ksegrp *kg; struct thread *td2; struct proc *p; error = 0; unbound = td->td_flags & TDF_UNBOUND; kg = td->td_ksegrp; p = td->td_proc; /* * Originally bound threads never upcall but they may * loan out their KSE at this point. * Upcalls imply bound.. They also may want to do some Philantropy. * Unbound threads on the other hand either yield to other work * or transform into an upcall. * (having saved their context to user space in both cases) */ if (unbound ) { /* * We are an unbound thread, looking to return to * user space. * THere are several possibilities: * 1) we are using a borrowed KSE. save state and exit. * kse_reassign() will recycle the kse as needed, * 2) we are not.. save state, and then convert ourself * to be an upcall, bound to the KSE. * if there are others that need the kse, * give them a chance by doing an mi_switch(). * Because we are bound, control will eventually return * to us here. * *** * Save the thread's context, and link it * into the KSEGRP's list of completed threads. */ error = thread_export_context(td); td->td_mailbox = NULL; if (error) { /* * If we are not running on a borrowed KSE, then * failing to do the KSE operation just defaults * back to synchonous operation, so just return from * the syscall. If it IS borrowed, there is nothing * we can do. We just lose that context. We * probably should note this somewhere and send * the process a signal. */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); mtx_lock_spin(&sched_lock); if (td->td_kse->ke_bound == NULL) { td->td_flags &= ~TDF_UNBOUND; PROC_UNLOCK(td->td_proc); mtx_unlock_spin(&sched_lock); return (error); /* go sync */ } thread_exit(); } /* * if the KSE is owned and we are borrowing it, * don't make an upcall, just exit so that the owner * can get its KSE if it wants it. * Our context is already safely stored for later * use by the UTS. */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (td->td_kse->ke_bound) { thread_exit(); } PROC_UNLOCK(p); /* * Turn ourself into a bound upcall. * We will rely on kse_reassign() * to make us run at a later time. * We should look just like a sheduled upcall * from msleep() or cv_wait(). */ td->td_flags &= ~TDF_UNBOUND; td->td_flags |= TDF_UPCALLING; /* Only get here if we have become an upcall */ } else { mtx_lock_spin(&sched_lock); } /* * We ARE going back to userland with this KSE. * Check for threads that need to borrow it. * Optimisation: don't call mi_switch if no-one wants the KSE. * Any other thread that comes ready after this missed the boat. */ ke = td->td_kse; if ((td2 = kg->kg_last_assigned)) td2 = TAILQ_NEXT(td2, td_runq); else td2 = TAILQ_FIRST(&kg->kg_runq); if (td2) { /* * force a switch to more urgent 'in kernel' * work. Control will return to this thread * when there is no more work to do. * kse_reassign() will do tha for us. */ TD_SET_LOAN(td); ke->ke_bound = td; ke->ke_thread = NULL; mi_switch(); /* kse_reassign() will (re)find td2 */ } mtx_unlock_spin(&sched_lock); /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) { if (ke->ke_tdspare) { td->td_standin = ke->ke_tdspare; ke->ke_tdspare = NULL; } else { td->td_standin = thread_alloc(); } } /* * To get here, we know there is no other need for our * KSE so we can proceed. If not upcalling, go back to * userspace. If we are, get the upcall set up. 
 */
	if ((td->td_flags & TDF_UPCALLING) == 0)
		return (0);

	/*
	 * We must be an upcall to get this far.
	 * There is no more work to do and we are going to ride
	 * this thead/KSE up to userland as an upcall.
	 * Do the last parts of the setup needed for the upcall.
	 */
	CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
	    td, td->td_proc->p_pid, td->td_proc->p_comm);

	/*
	 * Set user context to the UTS.
	 */
	cpu_set_upcall_kse(td, ke);

	/*
	 * Put any completed mailboxes on this KSE's list.
	 */
	error = thread_link_mboxes(kg, ke);
	if (error)
		goto bad;

	/*
	 * Set state and mailbox.
	 * From now on we are just a bound outgoing process.
	 * **Problem** userret is often called several times.
	 * it would be nice if this all happenned only on the first time
	 * through. (the scan for extra work etc.)
	 */
+	mtx_lock_spin(&sched_lock);
	td->td_flags &= ~TDF_UPCALLING;
+	mtx_unlock_spin(&sched_lock);
#if 0
	error = suword((caddr_t)ke->ke_mailbox +
	    offsetof(struct kse_mailbox, km_curthread), 0);
#else	/* if user pointer arithmetic is ok in the kernel */
	error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0);
#endif
	if (!error)
		return (0);

bad:
	/*
	 * Things are going to be so screwed we should just kill the process.
	 * how do we do that?
	 */
	PROC_LOCK(td->td_proc);
	psignal(td->td_proc, SIGSEGV);
	PROC_UNLOCK(td->td_proc);
	return (error);	/* go sync */
}

/*
 * Enforce single-threading.
 *
 * Returns 1 if the caller must abort (another thread is waiting to
 * exit the process or similar). Process is locked!
 * Returns 0 when you are successfully the only thread running.
 * A process has successfully single threaded in the suspend mode when
 * There are no threads in user mode. Threads in the kernel must be
 * allowed to continue until they get to the user boundary. They may even
 * copy out their return values and data before suspending. They may however be
 * accellerated in reaching the user boundary as we will wake up
 * any sleeping threads that are interruptable. (PCATCH).
 */
int
thread_single(int force_exit)
{
	struct thread *td;
	struct thread *td2;
	struct proc *p;

	td = curthread;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	KASSERT((td != NULL), ("curthread is NULL"));

	if ((p->p_flag & P_KSES) == 0)
		return (0);

	/* Is someone already single threading? */
	if (p->p_singlethread)
		return (1);

	if (force_exit == SINGLE_EXIT)
		p->p_flag |= P_SINGLE_EXIT;
	else
		p->p_flag &= ~P_SINGLE_EXIT;
	p->p_flag |= P_STOPPED_SINGLE;
	p->p_singlethread = td;
	/* XXXKSE Which lock protects the below values? */
	while ((p->p_numthreads - p->p_suspcount) != 1) {
		mtx_lock_spin(&sched_lock);
		FOREACH_THREAD_IN_PROC(p, td2) {
			if (td2 == td)
				continue;
			if (TD_IS_INHIBITED(td2)) {
				if (force_exit == SINGLE_EXIT) {
					if (TD_IS_SUSPENDED(td2)) {
						thread_unsuspend_one(td2);
					}
					if (TD_ON_SLEEPQ(td2) &&
					    (td2->td_flags & TDF_SINTR)) {
						if (td2->td_flags & TDF_CVWAITQ)
							cv_abort(td2);
						else
							abortsleep(td2);
					}
				} else {
					if (TD_IS_SUSPENDED(td2))
						continue;
					/* maybe other inhibitted states too? */
					if (TD_IS_SLEEPING(td2))
						thread_suspend_one(td2);
				}
			}
		}
		/*
		 * Maybe we suspended some threads.. was it enough?
		 */
		if ((p->p_numthreads - p->p_suspcount) == 1) {
			mtx_unlock_spin(&sched_lock);
			break;
		}

		/*
		 * Wake us up when everyone else has suspended.
		 * In the mean time we suspend as well.
*/ thread_suspend_one(td); mtx_unlock(&Giant); PROC_UNLOCK(p); mi_switch(); mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); PROC_LOCK(p); } if (force_exit == SINGLE_EXIT) kse_purge(p, td); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; td = curthread; p = td->td_proc; kg = td->td_ksegrp; PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p)) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if (return_instead) return (1); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { mtx_lock_spin(&sched_lock); while (mtx_owned(&Giant)) mtx_unlock(&Giant); /* * free extra kses and ksegrps, we needn't worry * about if current thread is in same ksegrp as * p_singlethread and last kse in the group * could be killed, this is protected by kg_numthreads, * in this case, we deduce that kg_numthreads must > 1. */ ke = td->td_kse; if (ke->ke_bound == NULL && ((kg->kg_kses != 1) || (kg->kg_numthreads == 1))) ke->ke_flags |= KEF_EXIT; thread_exit(); } /* * When a thread suspends, it just * moves to the processes's suspend queue * and stays there. * * XXXKSE if TDF_BOUND is true * it will not release it's KSE which might * lead to deadlock if there are not enough KSEs * to complete all waiting threads. * Maybe be able to 'lend' it out again. * (lent kse's can not go back to userland?) * and can only be lent in STOPPED state. 
*/ mtx_lock_spin(&sched_lock); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount+1 == p->p_numthreads)) { mtx_unlock_spin(&sched_lock); PROC_LOCK(p->p_pptr); if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) { psignal(p->p_pptr, SIGCHLD); } PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); } mtx_assert(&Giant, MA_NOTOWNED); thread_suspend_one(td); PROC_UNLOCK(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); p->p_suspcount++; TD_SET_SUSPENDED(td); TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq); /* * Hack: If we are suspending but are on the sleep queue * then we are in msleep or the cv equivalent. We * want to look like we have two Inhibitors. * May already be set.. doesn't matter. */ if (TD_ON_SLEEPQ(td)) TD_SET_SLEEPING(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_suspended, td, td_runq); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~P_STOPPED_SINGLE; p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { mtx_lock_spin(&sched_lock); while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } mtx_unlock_spin(&sched_lock); } } Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 106179) +++ head/sys/kern/kern_synch.c (revision 106180) @@ -1,647 +1,657 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 * $FreeBSD$ */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) int hogticks; int lbolt; static struct callout loadav_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; static void endtsleep(void *); static void loadav(void *arg); /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. */ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; hogticks = (hz / 10) * 2; /* Default only. */ for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. 
If priority includes the PDROP
 * flag the mutex is not entered before returning.
 */
int
msleep(ident, mtx, priority, wmesg, timo)
	void *ident;
	struct mtx *mtx;
	int priority, timo;
	const char *wmesg;
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int sig, catch = priority & PCATCH;
	int rval = 0;
	WITNESS_SAVE_DECL(mtx);

#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(1, 0);
#endif
	WITNESS_SLEEP(0, &mtx->mtx_object);
	KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
	    ("sleeping without a mutex"));
	/*
	 * If we are capable of async syscalls and there isn't already
	 * another one ready to return, start a new thread
	 * and queue it as ready to run. Note that there is danger here
	 * because we need to make sure that we don't sleep allocating
	 * the thread (recursion here might be bad).
	 * Hence the TDF_INMSLEEP flag.
	 */
	if (p->p_flag & P_KSES) {
		/*
		 * Just don't bother if we are exiting
-		 * and not the exiting thread.
+		 * and not the exiting thread or thread was marked as
+		 * interrupted.
		 */
-		if ((p->p_flag & P_WEXIT) && catch && (p->p_singlethread != td))
+		if (catch &&
+		    (((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) ||
+		    (td->td_flags & TDF_INTERRUPT))) {
+			td->td_flags &= ~TDF_INTERRUPT;
			return (EINTR);
+		}
		mtx_lock_spin(&sched_lock);
		if ((td->td_flags & (TDF_UNBOUND|TDF_INMSLEEP)) == TDF_UNBOUND) {
			/*
			 * Arrange for an upcall to be readied.
			 * it will not actually happen until all
			 * pending in-kernel work for this KSEGRP
			 * has been done.
			 */
			/* Don't recurse here! */
			td->td_flags |= TDF_INMSLEEP;
			thread_schedule_upcall(td, td->td_kse);
			td->td_flags &= ~TDF_INMSLEEP;
		}
	} else {
		mtx_lock_spin(&sched_lock);
	}
	if (cold ) {
		/*
		 * During autoconfiguration, just give interrupts
		 * a chance, then just return.
		 * Don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		if (mtx != NULL && priority & PDROP)
			mtx_unlock(mtx);
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	DROP_GIANT();

	if (mtx != NULL) {
		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
		WITNESS_SAVE(&mtx->mtx_object, mtx);
		mtx_unlock(mtx);
		if (priority & PDROP)
			mtx = NULL;
	}

	KASSERT(p != NULL, ("msleep1"));
	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));

	CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)",
	    td, p->p_pid, p->p_comm, wmesg, ident);

	td->td_wchan = ident;
	td->td_wmesg = wmesg;
	TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq);
	TD_SET_ON_SLEEPQ(td);
	if (timo)
		callout_reset(&td->td_slpcallout, timo, endtsleep, td);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling thread_suspend_check, as we could stop there, and
	 * a wakeup or a SIGCONT (or both) could occur while we were stopped.
	 * without resuming us, thus we must be ready for sleep
	 * when cursig is called. If the wakeup happens while we're
	 * stopped, td->td_wchan will be 0 upon return from cursig.
	 */
	if (catch) {
		CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)",
		    td, p->p_pid, p->p_comm);
		td->td_flags |= TDF_SINTR;
		mtx_unlock_spin(&sched_lock);
		PROC_LOCK(p);
		sig = cursig(td);
		if (sig == 0 && thread_suspend_check(1))
			sig = SIGSTOP;
		mtx_lock_spin(&sched_lock);
		PROC_UNLOCK(p);
		if (sig != 0) {
			if (TD_ON_SLEEPQ(td))
				unsleep(td);
		} else if (!TD_ON_SLEEPQ(td))
			catch = 0;
	} else
		sig = 0;

	/*
	 * Let the scheduler know we're about to voluntarily go to sleep.
	 */
	sched_sleep(td, priority & PRIMASK);

	if (TD_ON_SLEEPQ(td)) {
		p->p_stats->p_ru.ru_nvcsw++;
		TD_SET_SLEEPING(td);
		mi_switch();
	}
	/*
	 * We're awake from voluntary sleep.
	 */
	CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)",
	    td, p->p_pid, p->p_comm);
	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
	td->td_flags &= ~TDF_SINTR;
	if (td->td_flags & TDF_TIMEOUT) {
		td->td_flags &= ~TDF_TIMEOUT;
		if (sig == 0)
			rval = EWOULDBLOCK;
	} else if (td->td_flags & TDF_TIMOFAIL) {
		td->td_flags &= ~TDF_TIMOFAIL;
	} else if (timo && callout_stop(&td->td_slpcallout) == 0) {
		/*
		 * This isn't supposed to be pretty.  If we are here, then
		 * the endtsleep() callout is currently executing on another
		 * CPU and is either spinning on the sched_lock or will be
		 * soon.  If we don't synchronize here, there is a chance
		 * that this process may msleep() again before the callout
		 * has a chance to run and the callout may end up waking up
		 * the wrong msleep().  Yuck.
		 */
		TD_SET_SLEEPING(td);
		p->p_stats->p_ru.ru_nivcsw++;
		mi_switch();
		td->td_flags &= ~TDF_TIMOFAIL;
+	}
+	if ((td->td_flags & TDF_INTERRUPT) && (priority & PCATCH) &&
+	    (rval == 0)) {
+		td->td_flags &= ~TDF_INTERRUPT;
+		rval = EINTR;
	}
	mtx_unlock_spin(&sched_lock);

	if (rval == 0 && catch) {
		PROC_LOCK(p);
		/* XXX: shouldn't we always be calling cursig() */
		if (sig != 0 || (sig = cursig(td))) {
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				rval = EINTR;
			else
				rval = ERESTART;
		}
		PROC_UNLOCK(p);
	}
#ifdef KTRACE
	if (KTRPOINT(td, KTR_CSW))
		ktrcsw(0, 0);
#endif
	PICKUP_GIANT();
	if (mtx != NULL) {
		mtx_lock(mtx);
		WITNESS_RESTORE(&mtx->mtx_object, mtx);
	}
	return (rval);
}

/*
 * Implement timeout for msleep()
 *
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 * MP-safe, called without the Giant mutex.
 */
static void
endtsleep(arg)
	void *arg;
{
	register struct thread *td = arg;

	CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)",
	    td, td->td_proc->p_pid, td->td_proc->p_comm);
	mtx_lock_spin(&sched_lock);
	/*
	 * This is the other half of the synchronization with msleep()
	 * described above.  If the TDS_TIMEOUT flag is set, we lost the
	 * race and just need to put the process back on the runqueue.
	 */
	if (TD_ON_SLEEPQ(td)) {
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
		TD_CLR_ON_SLEEPQ(td);
		td->td_flags |= TDF_TIMEOUT;
	} else {
		td->td_flags |= TDF_TIMOFAIL;
	}
	TD_CLR_SLEEPING(td);
	setrunnable(td);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Abort a thread, as if an interrupt had occured.  Only abort
 * interruptable waits (unfortunatly it isn't only safe to abort others).
 * This is about identical to cv_abort().
 * Think about merging them?
 * Also, whatever the signal code does...
 */
void
abortsleep(struct thread *td)
{

	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * If the TDF_TIMEOUT flag is set, just leave. A
	 * timeout is scheduled anyhow.
	 */
	if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) {
		if (TD_ON_SLEEPQ(td)) {
			unsleep(td);
			TD_CLR_SLEEPING(td);
			setrunnable(td);
		}
	}
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(struct thread *td)
{

	mtx_lock_spin(&sched_lock);
	if (TD_ON_SLEEPQ(td)) {
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq);
		TD_CLR_ON_SLEEPQ(td);
	}
	mtx_unlock_spin(&sched_lock);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
*/ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; struct thread *ntd; struct proc *p; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); goto restart; } } mtx_unlock_spin(&sched_lock); } /* * Make a process sleeping on the specified identifier runnable. * May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; register struct proc *p; struct thread *ntd; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); break; } } mtx_unlock_spin(&sched_lock); } /* * The machine independent parts of mi_switch(). */ void mi_switch(void) { struct bintime new_switchtime; struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; /* XXX */ struct kse *ke = td->td_kse; u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_ON_RUNQ(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1, ("mi_switch: switch in a critical section")); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ binuptime(&new_switchtime); bintime_add(&p->p_runtime, &new_switchtime); bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); #ifdef DDB /* * Don't perform context switches from the debugger. */ if (db_active) { mtx_unlock_spin(&sched_lock); db_error("Context switches not allowed in the debugger."); } #endif /* * Check if the process exceeds its cpu resource allocation. If * over max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_runtime.sec > p->p_cpulimit) { p->p_sflag |= PS_XCPU; ke->ke_flags |= KEF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); sched_nest = sched_lock.mtx_recurse; sched_switchout(td); cpu_switch(); /* SHAZAM!!*/ sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; sched_switchin(td); /* * Start setting up stats etc. for the incoming thread. * Similar code in fork_exit() is returned to by cpu_switch() * in the case of a new thread/process. */ CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Call the switchin function while still holding the scheduler lock * (used by the idlezero code and the general page-zeroing code) */ if (td->td_switchin) td->td_switchin(); } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. 
*/ void setrunnable(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; struct proc *p; struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { switch (td->td_state) { case TDS_RUNQ: case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; nrun++; /* XXXKSE */ default: break; } nextproc: continue; } } sx_sunlock(&allproc_lock); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&loadav_callout, 0); /* Kick off timeout driven events by calling first time. */ loadav(NULL); } /* * General purpose yield system call */ int yield(struct thread *td, struct yield_args *uap) { struct ksegrp *kg = td->td_ksegrp; mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(); mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 106179) +++ head/sys/kern/kern_thread.c (revision 106180) @@ -1,1711 +1,1731 @@ /* * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * KSEGRP related storage. */ static uma_zone_t ksegrp_zone; static uma_zone_t kse_zone; static uma_zone_t thread_zone; /* DEBUG ONLY */ SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int oiks_debug = 1; /* 0 disable, 1 printf, 2 enter debugger */ SYSCTL_INT(_kern_threads, OID_AUTO, oiks, CTLFLAG_RW, &oiks_debug, 0, "OIKS thread debug"); static int max_threads_per_proc = 10; SYSCTL_INT(_kern_threads, OID_AUTO, max_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) struct threadqueue zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses); TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps); struct mtx zombie_thread_lock; MTX_SYSINIT(zombie_thread_lock, &zombie_thread_lock, "zombie_thread_lock", MTX_SPIN); void kse_purge(struct proc *p, struct thread *td); /* * Pepare a thread for use. */ static void thread_ctor(void *mem, int size, void *arg) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_flags |= TDF_UNBOUND; } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif } /* * Initialize type-stable parts of a thread (when newly created). */ static void thread_init(void *mem, int size) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; mtx_lock(&Giant); pmap_new_thread(td, 0); mtx_unlock(&Giant); cpu_thread_setup(td); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; KASSERT((size == sizeof(struct thread)), ("size mismatch: %d != %d\n", size, (int)sizeof(struct thread))); td = (struct thread *)mem; pmap_dispose_thread(td); } /* * KSE is linked onto the idle queue. 
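 *
 * thread_ctor()/thread_dtor() above run on every allocation and free from
 * the thread zone, while thread_init()/thread_fini() run only when backing
 * memory enters or leaves the zone.  Below is a rough userland imitation of
 * that two-level split, far simpler than uma(9); struct obj, obj_zone_* and
 * the one-slot free cache are invented for the example.
 */

#include <stdio.h>
#include <stdlib.h>

struct obj {
	int	o_state;
	int	o_once;			/* survives between allocations */
};

struct obj_zone {
	void	(*z_ctor)(struct obj *);	/* per allocation */
	void	(*z_init)(struct obj *);	/* per backing object */
	struct obj *z_cached;			/* one-slot free cache */
};

static struct obj *
obj_zone_alloc(struct obj_zone *z)
{
	struct obj *o;

	if (z->z_cached != NULL) {
		o = z->z_cached;		/* reuse: skip the init step */
		z->z_cached = NULL;
	} else {
		o = calloc(1, sizeof(*o));
		if (o == NULL)
			return (NULL);
		z->z_init(o);			/* once per backing object */
	}
	z->z_ctor(o);				/* on every allocation */
	return (o);
}

static void
obj_zone_free(struct obj_zone *z, struct obj *o)
{
	if (z->z_cached == NULL)
		z->z_cached = o;		/* keep it for reuse */
	else
		free(o);
}

static void obj_ctor(struct obj *o) { o->o_state = 1; }
static void obj_init(struct obj *o) { o->o_once = 42; }

int
main(void)
{
	struct obj_zone zone = { obj_ctor, obj_init, NULL };
	struct obj *o = obj_zone_alloc(&zone);

	if (o == NULL)
		return (1);
	printf("state %d once %d\n", o->o_state, o->o_once);
	obj_zone_free(&zone, o);
	o = obj_zone_alloc(&zone);		/* comes from the cache */
	printf("state %d once %d\n", o->o_state, o->o_once);
	obj_zone_free(&zone, o);
	free(zone.z_cached);
	return (0);
}

/*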
*/ void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; ke->ke_state = KES_UNQUEUED; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; } void kse_unlink(struct kse *ke) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; if (ke->ke_state == KES_IDLE) { kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); } TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); if (--kg->kg_kses == 0) { ksegrp_unlink(kg); } /* * Aggregate stats from the KSE */ kse_stash(ke); } void ksegrp_link(struct ksegrp *kg, struct proc *p) { TAILQ_INIT(&kg->kg_threads); TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ TAILQ_INIT(&kg->kg_iq); /* idle kses in ksegrp */ TAILQ_INIT(&kg->kg_lq); /* loan kses in ksegrp */ kg->kg_proc = p; /* the following counters are in the -zero- section and may not need clearing */ kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; kg->kg_idle_kses = 0; kg->kg_loan_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ /* link it in now that it's consistent */ p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } void ksegrp_unlink(struct ksegrp *kg) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); p = kg->kg_proc; KASSERT(((kg->kg_numthreads == 0) && (kg->kg_kses == 0)), ("kseg_unlink: residual threads or KSEs")); TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * Aggregate stats from the KSE */ ksegrp_stash(kg); } /* * for a newly created process, * link up a the structure and its initial threads etc. */ void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td) { TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_suspended); /* Threads suspended */ p->p_numksegrps = 0; p->p_numthreads = 0; ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); } int kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap) { + struct proc *p; + struct thread *td2; - return(ENOSYS); + p = td->td_proc; + mtx_lock_spin(&sched_lock); + FOREACH_THREAD_IN_PROC(p, td2) { + if (td2->td_mailbox == uap->tmbx) { + td2->td_flags |= TDF_INTERRUPT; + if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { + if (td2->td_flags & TDF_CVWAITQ) + cv_abort(td2); + else + abortsleep(td2); + } + mtx_unlock_spin(&sched_lock); + return 0; + } + } + mtx_unlock_spin(&sched_lock); + return(ESRCH); } int kse_exit(struct thread *td, struct kse_exit_args *uap) { struct proc *p; struct ksegrp *kg; p = td->td_proc; /* KSE-enabled processes only, please. */ if (!(p->p_flag & P_KSES)) return EINVAL; /* must be a bound thread */ if (td->td_flags & TDF_UNBOUND) return EINVAL; kg = td->td_ksegrp; /* serialize killing kse */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if ((kg->kg_kses == 1) && (kg->kg_numthreads > 1)) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (EDEADLK); } if ((p->p_numthreads == 1) && (p->p_numksegrps == 1)) { p->p_flag &= ~P_KSES; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } else { while (mtx_owned(&Giant)) mtx_unlock(&Giant); td->td_kse->ke_flags |= KEF_EXIT; thread_exit(); /* NOTREACHED */ } return 0; } int kse_release(struct thread *td, struct kse_release_args *uap) { struct proc *p; p = td->td_proc; /* KSE-enabled processes only, please. 
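 *
 * kse_link(), kse_unlink() and the new kse_thr_interrupt() above all work
 * on <sys/queue.h> tail queues: insert, remove, and the FOREACH_*_IN_*
 * walk-and-match loops.  The minimal standalone program below uses the same
 * TAILQ primitives; struct node and n_link are names invented for the
 * example.
 */

#include <sys/queue.h>
#include <stdio.h>

struct node {
	int			n_id;
	TAILQ_ENTRY(node)	n_link;		/* cf. td_plist / ke_kglist */
};

static TAILQ_HEAD(, node) nodes = TAILQ_HEAD_INITIALIZER(nodes);

int
main(void)
{
	struct node a, b, *np;

	a.n_id = 1;
	b.n_id = 2;
	TAILQ_INSERT_HEAD(&nodes, &a, n_link);
	TAILQ_INSERT_HEAD(&nodes, &b, n_link);

	/* Walk and match, the shape of FOREACH_THREAD_IN_PROC() above. */
	TAILQ_FOREACH(np, &nodes, n_link)
		if (np->n_id == 1)
			printf("found node %d\n", np->n_id);

	TAILQ_REMOVE(&nodes, &a, n_link);	/* cf. kse_unlink() */
	printf("first is now %d\n", TAILQ_FIRST(&nodes)->n_id);
	return (0);
}

/*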
*/ if (p->p_flag & P_KSES) { PROC_LOCK(p); mtx_lock_spin(&sched_lock); thread_exit(); /* NOTREACHED */ } return (EINVAL); } /* struct kse_wakeup_args { struct kse_mailbox *mbx; }; */ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { struct proc *p; struct kse *ke, *ke2; struct ksegrp *kg; p = td->td_proc; /* KSE-enabled processes only, please. */ if (!(p->p_flag & P_KSES)) return EINVAL; if (td->td_standin == NULL) td->td_standin = thread_alloc(); ke = NULL; mtx_lock_spin(&sched_lock); if (uap->mbx) { FOREACH_KSEGRP_IN_PROC(p, kg) { FOREACH_KSE_IN_GROUP(kg, ke2) { if (ke2->ke_mailbox != uap->mbx) continue; if (ke2->ke_state == KES_IDLE) { ke = ke2; goto found; } else { mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; td->td_retval[1] = 0; return 0; } } } } else { kg = td->td_ksegrp; ke = TAILQ_FIRST(&kg->kg_iq); } if (ke == NULL) { mtx_unlock_spin(&sched_lock); return ESRCH; } found: thread_schedule_upcall(td, ke); mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; td->td_retval[1] = 0; return 0; } /* * No new KSEG: first call: use current KSE, don't schedule an upcall * All other situations, do allocate a new KSE and schedule an upcall on it. */ /* struct kse_create_args { struct kse_mailbox *mbx; int newgroup; }; */ int kse_create(struct thread *td, struct kse_create_args *uap) { struct kse *newke; struct kse *ke; struct ksegrp *newkg; struct ksegrp *kg; struct proc *p; struct kse_mailbox mbx; int err; p = td->td_proc; if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) return (err); p->p_flag |= P_KSES; /* easier to just set it than to test and set */ kg = td->td_ksegrp; if (uap->newgroup) { /* * If we want a new KSEGRP it doesn't matter whether * we have already fired up KSE mode before or not. * We put the process in KSE mode and create a new KSEGRP * and KSE. If our KSE has not got a mailbox yet then * that doesn't matter, just leave it that way. It will * ensure that this thread stay BOUND. It's possible * that the call came form a threaded library and the main * program knows nothing of threads. */ newkg = ksegrp_alloc(); bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&kg->kg_startcopy, &newkg->kg_startcopy, RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); newke = kse_alloc(); } else { /* * Otherwise, if we have already set this KSE * to have a mailbox, we want to make another KSE here, * but only if there are not already the limit, which * is 1 per CPU max. * * If the current KSE doesn't have a mailbox we just use it * and give it one. * * Because we don't like to access * the KSE outside of schedlock if we are UNBOUND, * (because it can change if we are preempted by an interrupt) * we can deduce it as having a mailbox if we are UNBOUND, * and only need to actually look at it if we are BOUND, * which is safe. 
*/ if ((td->td_flags & TDF_UNBOUND) || td->td_kse->ke_mailbox) { #if 0 /* while debugging */ #ifdef SMP if (kg->kg_kses > mp_ncpus) #endif return (EPROCLIM); #endif newke = kse_alloc(); } else { newke = NULL; } newkg = NULL; } if (newke) { bzero(&newke->ke_startzero, RANGEOF(struct kse, ke_startzero, ke_endzero)); #if 0 bcopy(&ke->ke_startcopy, &newke->ke_startcopy, RANGEOF(struct kse, ke_startcopy, ke_endcopy)); #endif /* For the first call this may not have been set */ if (td->td_standin == NULL) { td->td_standin = thread_alloc(); } mtx_lock_spin(&sched_lock); if (newkg) ksegrp_link(newkg, p); else newkg = kg; kse_link(newke, newkg); if (p->p_sflag & PS_NEEDSIGCHK) newke->ke_flags |= KEF_ASTPENDING; newke->ke_mailbox = uap->mbx; newke->ke_upcall = mbx.km_func; bcopy(&mbx.km_stack, &newke->ke_stack, sizeof(stack_t)); thread_schedule_upcall(td, newke); mtx_unlock_spin(&sched_lock); } else { /* * If we didn't allocate a new KSE then the we are using * the exisiting (BOUND) kse. */ ke = td->td_kse; ke->ke_mailbox = uap->mbx; ke->ke_upcall = mbx.km_func; bcopy(&mbx.km_stack, &ke->ke_stack, sizeof(stack_t)); } /* * Fill out the KSE-mode specific fields of the new kse. */ td->td_retval[0] = 0; td->td_retval[1] = 0; return (0); } /* * Fill a ucontext_t with a thread's context information. * * This is an analogue to getcontext(3). */ void thread_getcontext(struct thread *td, ucontext_t *uc) { /* * XXX this is declared in a MD include file, i386/include/ucontext.h but * is used in MI code. */ #ifdef __i386__ get_mcontext(td, &uc->uc_mcontext); #endif uc->uc_sigmask = td->td_proc->p_sigmask; } /* * Set a thread's context from a ucontext_t. * * This is an analogue to setcontext(3). */ int thread_setcontext(struct thread *td, ucontext_t *uc) { int ret; /* * XXX this is declared in a MD include file, i386/include/ucontext.h but * is used in MI code. */ #ifdef __i386__ ret = set_mcontext(td, &uc->uc_mcontext); #else ret = ENOSYS; #endif if (ret == 0) { SIG_CANTMASK(uc->uc_sigmask); PROC_LOCK(td->td_proc); td->td_proc->p_sigmask = uc->uc_sigmask; PROC_UNLOCK(td->td_proc); } return (ret); } /* * Initialize global thread allocation resources. */ void threadinit(void) { #ifndef __ia64__ thread_zone = uma_zcreate("THREAD", sizeof (struct thread), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, 0); #else /* * XXX the ia64 kstack allocator is really lame and is at the mercy * of contigmallloc(). This hackery is to pre-construct a whole * pile of thread structures with associated kernel stacks early * in the system startup while contigmalloc() still works. Once we * have them, keep them. Sigh. */ thread_zone = uma_zcreate("THREAD", sizeof (struct thread), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); uma_prealloc(thread_zone, 512); /* XXX arbitary */ #endif ksegrp_zone = uma_zcreate("KSEGRP", sizeof (struct ksegrp), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); kse_zone = uma_zcreate("KSE", sizeof (struct kse), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Stash an embarasingly extra thread into the zombie thread queue. */ void thread_stash(struct thread *td) { mtx_lock_spin(&zombie_thread_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq); mtx_unlock_spin(&zombie_thread_lock); } /* * Stash an embarasingly extra kse into the zombie kse queue. 
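 *
 * thread_getcontext() and thread_setcontext() above are described as kernel
 * analogues of getcontext(3) and setcontext(3).  The standalone program
 * below uses the userland ucontext(3) API to save a context and switch onto
 * a separate stack, roughly the switch the UTS upcall machinery arranges;
 * uts_entry and the 64 KB stack size are arbitrary choices for the example.
 */

#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

static ucontext_t main_ctx, uts_ctx;

/* Stands in for the userland scheduler entry point (km_func above). */
static void
uts_entry(void)
{
	printf("running on the UTS stack\n");
	/* uc_link resumes main_ctx when this function returns. */
}

int
main(void)
{
	char *stack = malloc(64 * 1024);

	if (stack == NULL)
		return (1);
	getcontext(&uts_ctx);
	uts_ctx.uc_stack.ss_sp = stack;
	uts_ctx.uc_stack.ss_size = 64 * 1024;
	uts_ctx.uc_link = &main_ctx;
	makecontext(&uts_ctx, uts_entry, 0);

	/* Save our context and jump to the "UTS", as an upcall would. */
	swapcontext(&main_ctx, &uts_ctx);
	printf("back on the original stack\n");
	free(stack);
	return (0);
}

/*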
*/ void kse_stash(struct kse *ke) { mtx_lock_spin(&zombie_thread_lock); TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq); mtx_unlock_spin(&zombie_thread_lock); } /* * Stash an embarasingly extra ksegrp into the zombie ksegrp queue. */ void ksegrp_stash(struct ksegrp *kg) { mtx_lock_spin(&zombie_thread_lock); TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp); mtx_unlock_spin(&zombie_thread_lock); } /* * Reap zombie threads. */ void thread_reap(void) { struct thread *td_first, *td_next; struct kse *ke_first, *ke_next; struct ksegrp *kg_first, * kg_next; /* * don't even bother to lock if none at this instant * We really don't care about the next instant.. */ if ((!TAILQ_EMPTY(&zombie_threads)) || (!TAILQ_EMPTY(&zombie_kses)) || (!TAILQ_EMPTY(&zombie_ksegrps))) { mtx_lock_spin(&zombie_thread_lock); td_first = TAILQ_FIRST(&zombie_threads); ke_first = TAILQ_FIRST(&zombie_kses); kg_first = TAILQ_FIRST(&zombie_ksegrps); if (td_first) TAILQ_INIT(&zombie_threads); if (ke_first) TAILQ_INIT(&zombie_kses); if (kg_first) TAILQ_INIT(&zombie_ksegrps); mtx_unlock_spin(&zombie_thread_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_runq); thread_free(td_first); td_first = td_next; } while (ke_first) { ke_next = TAILQ_NEXT(ke_first, ke_procq); kse_free(ke_first); ke_first = ke_next; } while (kg_first) { kg_next = TAILQ_NEXT(kg_first, kg_ksegrp); ksegrp_free(kg_first); kg_first = kg_next; } } } /* * Allocate a ksegrp. */ struct ksegrp * ksegrp_alloc(void) { return (uma_zalloc(ksegrp_zone, M_WAITOK)); } /* * Allocate a kse. */ struct kse * kse_alloc(void) { return (uma_zalloc(kse_zone, M_WAITOK)); } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a ksegrp. */ void ksegrp_free(struct ksegrp *td) { uma_zfree(ksegrp_zone, td); } /* * Deallocate a kse. */ void kse_free(struct kse *td) { uma_zfree(kse_zone, td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { uma_zfree(thread_zone, td); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the ksegrp structure. */ int thread_export_context(struct thread *td) { struct proc *p; struct ksegrp *kg; uintptr_t mbx; void *addr; int error; ucontext_t uc; p = td->td_proc; kg = td->td_ksegrp; /* Export the user/machine context. 
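 *
 * The stash/reap routines above defer freeing structures that may still be
 * in use (for example a stack still being run on) by parking them on
 * spin-lock protected zombie lists; thread_reap() later detaches each whole
 * list under the lock and frees the items outside it.  A small userland
 * analogue with a pthread mutex follows; zombie_stash(), zombie_reap() and
 * struct zombie are invented names.
 */

#include <sys/queue.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct zombie {
	int			z_id;
	TAILQ_ENTRY(zombie)	z_link;
};

static TAILQ_HEAD(, zombie) zombies = TAILQ_HEAD_INITIALIZER(zombies);
static pthread_mutex_t zombie_lock = PTHREAD_MUTEX_INITIALIZER;

/* Defer the free: just park the item (cf. thread_stash()). */
static void
zombie_stash(struct zombie *z)
{
	pthread_mutex_lock(&zombie_lock);
	TAILQ_INSERT_HEAD(&zombies, z, z_link);
	pthread_mutex_unlock(&zombie_lock);
}

/* Reap later: detach the whole list under the lock, free outside it. */
static void
zombie_reap(void)
{
	struct zombie *z, *znext;

	if (TAILQ_EMPTY(&zombies))	/* unlocked peek, as thread_reap() does */
		return;
	pthread_mutex_lock(&zombie_lock);
	z = TAILQ_FIRST(&zombies);
	TAILQ_INIT(&zombies);		/* list is now privately owned */
	pthread_mutex_unlock(&zombie_lock);
	while (z != NULL) {
		znext = TAILQ_NEXT(z, z_link);
		printf("freeing zombie %d\n", z->z_id);
		free(z);
		z = znext;
	}
}

int
main(void)
{
	struct zombie *z = malloc(sizeof(*z));

	if (z == NULL)
		return (1);
	z->z_id = 42;
	zombie_stash(z);
	zombie_reap();
	return (0);
}

/*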
*/ #if 0 addr = (caddr_t)td->td_mailbox + offsetof(struct kse_thr_mailbox, tm_context); #else /* if user pointer arithmetic is valid in the kernel */ addr = (void *)(&td->td_mailbox->tm_context); #endif error = copyin(addr, &uc, sizeof(ucontext_t)); if (error == 0) { thread_getcontext(td, &uc); error = copyout(&uc, addr, sizeof(ucontext_t)); } if (error) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (error); } /* get address in latest mbox of list pointer */ #if 0 addr = (caddr_t)td->td_mailbox + offsetof(struct kse_thr_mailbox , tm_next); #else /* if user pointer arithmetic is valid in the kernel */ addr = (void *)(&td->td_mailbox->tm_next); #endif /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = td->td_mailbox; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * Take the list of completed mailboxes for this KSEGRP and put them on this * KSE's mailbox as it's the next one going up. */ static int thread_link_mboxes(struct ksegrp *kg, struct kse *ke) { struct proc *p = kg->kg_proc; void *addr; uintptr_t mbx; #if 0 addr = (caddr_t)ke->ke_mailbox + offsetof(struct kse_mailbox, km_completed); #else /* if user pointer arithmetic is valid in the kernel */ addr = (void *)(&ke->ke_mailbox->km_completed); #endif for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } /* XXXKSE could use atomic CMPXCH here */ PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * Discard the current thread and exit from its context. * * Because we can't free a thread while we're operating under its context, * push the current thread into our KSE's ke_tdspare slot, freeing the * thread that might be there currently. Because we know that only this * processor will run our KSE, we needn't worry about someone else grabbing * our context before we do a cpu_throw. */ void thread_exit(void) { struct thread *td; struct kse *ke; struct proc *p; struct ksegrp *kg; td = curthread; kg = td->td_ksegrp; p = td->td_proc; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); KASSERT(ke != NULL, ("thread exiting without a kse")); KASSERT(kg != NULL, ("thread exiting without a kse group")); PROC_LOCK_ASSERT(p, MA_OWNED); CTR1(KTR_PROC, "thread_exit: thread %p", td); KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); if (ke->ke_tdspare != NULL) { thread_stash(ke->ke_tdspare); ke->ke_tdspare = NULL; } if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff. */ if (p->p_numthreads > 1) { /* * Unlink this thread from its proc and the kseg. * In keeping with the other structs we probably should * have a thread_unlink() that does some of this but it * would only be called from here (I think) so it would * be a waste. (might be useful for proc_fini() as well.) */ TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; TAILQ_REMOVE(&kg->kg_threads, td, td_kglist); kg->kg_numthreads--; /* * The test below is NOT true if we are the * sole exiting thread. 
P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } /* Reassign this thread's KSE. */ ke->ke_thread = NULL; td->td_kse = NULL; ke->ke_state = KES_UNQUEUED; KASSERT((ke->ke_bound != td), ("thread_exit: entered with ke_bound set")); /* * The reason for all this hoopla is * an attempt to stop our thread stack from being freed * until AFTER we have stopped running on it. * Since we are under schedlock, almost any method where * it is eventually freed by someone else is probably ok. * (Especially if they do it under schedlock). We could * almost free it here if we could be certain that * the uma code wouldn't pull it apart immediatly, * but unfortunatly we can not guarantee that. * * For threads that are exiting and NOT killing their * KSEs we can just stash it in the KSE, however * in the case where the KSE is also being deallocated, * we need to store it somewhere else. It turns out that * we will never free the last KSE, so there is always one * other KSE available. We might as well just choose one * and stash it there. Being under schedlock should make that * safe. * * In borrower threads, we can stash it in the lender * Where it won't be needed until this thread is long gone. * Borrower threads can't kill their KSE anyhow, so even * the KSE would be a safe place for them. It is not * necessary to have a KSE (or KSEGRP) at all beyond this * point, while we are under the protection of schedlock. * * Either give the KSE to another thread to use (or make * it idle), or free it entirely, possibly along with its * ksegrp if it's the last one. */ if (ke->ke_flags & KEF_EXIT) { kse_unlink(ke); /* * Designate another KSE to hold our thread. * Safe as long as we abide by whatever lock * we control it with.. The other KSE will not * be able to run it until we release the schelock, * but we need to be careful about it deciding to * write to the stack before then. Luckily * I believe that while another thread's * standin thread can be used in this way, the * spare thread for the KSE cannot be used without * holding schedlock at least once. */ ke = FIRST_KSE_IN_PROC(p); } else { kse_reassign(ke); } if (ke->ke_bound) { /* * WE are a borrower.. * stash our thread with the owner. */ if (ke->ke_bound->td_standin) { thread_stash(ke->ke_bound->td_standin); } ke->ke_bound->td_standin = td; } else { if (ke->ke_tdspare != NULL) { thread_stash(ke->ke_tdspare); ke->ke_tdspare = NULL; } ke->ke_tdspare = td; } PROC_UNLOCK(p); td->td_state = TDS_INACTIVE; td->td_proc = NULL; td->td_ksegrp = NULL; td->td_last_kse = NULL; } else { PROC_UNLOCK(p); } cpu_throw(); /* NOTREACHED */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. 
*/ void thread_link(struct thread *td, struct ksegrp *kg) { struct proc *p; p = kg->kg_proc; td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_ksegrp = kg; td->td_last_kse = NULL; LIST_INIT(&td->td_contested); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); p->p_numthreads++; kg->kg_numthreads++; if (oiks_debug && p->p_numthreads > max_threads_per_proc) { printf("OIKS %d\n", p->p_numthreads); if (oiks_debug > 1) Debugger("OIKS"); } td->td_kse = NULL; } void kse_purge(struct proc *p, struct thread *td) { struct kse *ke; struct ksegrp *kg; KASSERT(p->p_numthreads == 1, ("bad thread number")); mtx_lock_spin(&sched_lock); while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) { while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses--; if (ke->ke_tdspare) thread_stash(ke->ke_tdspare); kse_stash(ke); } TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) || ((kg->kg_kses == 1) && (kg == td->td_ksegrp)), ("wrong kg_kses")); if (kg != td->td_ksegrp) { ksegrp_stash(kg); } } TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp); p->p_numksegrps++; mtx_unlock_spin(&sched_lock); } /* * Create a thread and schedule it for upcall on the KSE given. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse *ke) { struct thread *td2; struct ksegrp *kg; int newkse; mtx_assert(&sched_lock, MA_OWNED); newkse = (ke != td->td_kse); /* * If the kse is already owned by another thread then we can't * schedule an upcall because the other thread must be BOUND * which means it is not in a position to take an upcall. * We must be borrowing the KSE to allow us to complete some in-kernel * work. When we complete, the Bound thread will have teh chance to * complete. This thread will sleep as planned. Hopefully there will * eventually be un unbound thread that can be converted to an * upcall to report the completion of this thread. */ if (ke->ke_bound && ((ke->ke_bound->td_flags & TDF_UNBOUND) == 0)) { return (NULL); } KASSERT((ke->ke_bound == NULL), ("kse already bound")); if (ke->ke_state == KES_IDLE) { kg = ke->ke_ksegrp; TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; ke->ke_state = KES_UNQUEUED; } if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { if (newkse) panic("no reserve thread when called with a new kse"); /* * If called from (e.g.) sleep and we do not have * a reserve thread, then we've used it, so do not * create an upcall. */ return(NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); bzero(&td2->td_startzero, (unsigned)RANGEOF(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); thread_link(td2, ke->ke_ksegrp); cpu_set_upcall(td2, td->td_pcb); /* * XXXKSE do we really need this? (default values for the * frame). */ bcopy(td->td_frame, td2->td_frame, sizeof(struct trapframe)); /* * Bind the new thread to the KSE, * and if it's our KSE, lend it back to ourself * so we can continue running. 
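 *
 * thread_schedule_upcall() above clears and copies whole sections of the
 * new thread with RANGEOF(), relying on the td_startzero/td_endzero and
 * td_startcopy/td_endcopy marker members.  The standalone sketch below
 * shows the same offsetof() trick on an invented struct toy; all of its
 * field names are made up for the example.
 */

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start))

struct toy {
	int	t_keep;			/* outside both ranges */
#define	t_startzero	t_flags
	int	t_flags;
	int	t_count;
#define	t_endzero	t_prio
#define	t_startcopy	t_endzero
	int	t_prio;
	int	t_nice;
#define	t_endcopy	t_tail
	int	t_tail;
};

int
main(void)
{
	struct toy parent = { 7, 0x10, 3, 120, -5, 99 };
	struct toy child;

	memset(&child, 0xff, sizeof(child));	/* stale recycled memory */

	/* Zero only [t_startzero, t_endzero) ... */
	bzero(&child.t_startzero, RANGEOF(struct toy, t_startzero, t_endzero));
	/* ... and copy only [t_startcopy, t_endcopy) from the parent. */
	bcopy(&parent.t_startcopy, &child.t_startcopy,
	    RANGEOF(struct toy, t_startcopy, t_endcopy));

	printf("flags %#x count %d prio %d nice %d\n",
	    child.t_flags, child.t_count, child.t_prio, child.t_nice);
	return (0);
}

/*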
*/ td2->td_ucred = crhold(td->td_ucred); td2->td_flags = TDF_UPCALLING; /* note: BOUND */ td2->td_kse = ke; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; /* * If called from msleep(), we are working on the current * KSE so fake that we borrowed it. If called from * kse_create(), don't, as we have a new kse too. */ if (!newkse) { /* * This thread will be scheduled when the current thread * blocks, exits or tries to enter userspace, (which ever * happens first). When that happens the KSe will "revert" * to this thread in a BOUND manner. Since we are called * from msleep() this is going to be "very soon" in nearly * all cases. */ ke->ke_bound = td2; TD_SET_LOAN(td2); } else { ke->ke_bound = NULL; ke->ke_thread = td2; ke->ke_state = KES_THREAD; setrunqueue(td2); } return (td2); /* bogus.. should be a void function */ } /* * Schedule an upcall to notify a KSE process recieved signals. * * XXX - Modifying a sigset_t like this is totally bogus. */ struct thread * signal_upcall(struct proc *p, int sig) { struct thread *td, *td2; struct kse *ke; sigset_t ss; int error; PROC_LOCK_ASSERT(p, MA_OWNED); return (NULL); td = FIRST_THREAD_IN_PROC(p); ke = td->td_kse; PROC_UNLOCK(p); error = copyin(&ke->ke_mailbox->km_sigscaught, &ss, sizeof(sigset_t)); PROC_LOCK(p); if (error) return (NULL); SIGADDSET(ss, sig); PROC_UNLOCK(p); error = copyout(&ss, &ke->ke_mailbox->km_sigscaught, sizeof(sigset_t)); PROC_LOCK(p); if (error) return (NULL); if (td->td_standin == NULL) td->td_standin = thread_alloc(); mtx_lock_spin(&sched_lock); td2 = thread_schedule_upcall(td, ke); /* Bogus JRE */ mtx_unlock_spin(&sched_lock); return (td2); } /* * setup done on the thread when it enters the kernel. * XXXKSE Presently only for syscalls but eventually all kernel entries. */ void thread_user_enter(struct proc *p, struct thread *td) { struct kse *ke; /* * First check that we shouldn't just abort. * But check if we are the single thread first! * XXX p_singlethread not locked, but should be safe. */ if ((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) { PROC_LOCK(p); mtx_lock_spin(&sched_lock); thread_exit(); /* NOTREACHED */ } /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. There is always the * possibility that we could do this lazily (in sleep()), * but for now do it every time. */ ke = td->td_kse; if (ke->ke_mailbox != NULL) { #if 0 td->td_mailbox = (void *)fuword((caddr_t)ke->ke_mailbox + offsetof(struct kse_mailbox, km_curthread)); #else /* if user pointer arithmetic is ok in the kernel */ td->td_mailbox = (void *)fuword( (void *)&ke->ke_mailbox->km_curthread); #endif if ((td->td_mailbox == NULL) || (td->td_mailbox == (void *)-1)) { td->td_mailbox = NULL; /* single thread it.. */ td->td_flags &= ~TDF_UNBOUND; } else { if (td->td_standin == NULL) td->td_standin = thread_alloc(); td->td_flags |= TDF_UNBOUND; } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. 
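 *
 * signal_upcall() above copies the km_sigscaught set in from user space,
 * adds one signal with SIGADDSET(), and copies it back for the UTS to
 * inspect.  The short program below manipulates a sigset_t with the
 * portable userland calls (sigemptyset(3)/sigaddset(3)/sigismember(3)),
 * which is the view a UTS would have of that set; the choice of
 * SIGUSR1/SIGUSR2 is arbitrary.
 */

#include <signal.h>
#include <stdio.h>

int
main(void)
{
	sigset_t caught;

	sigemptyset(&caught);
	sigaddset(&caught, SIGUSR1);		/* cf. SIGADDSET(ss, sig) */
	printf("SIGUSR1 recorded: %d\n", sigismember(&caught, SIGUSR1));
	printf("SIGUSR2 recorded: %d\n", sigismember(&caught, SIGUSR2));
	return (0);
}

/*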
*/ int thread_userret(struct thread *td, struct trapframe *frame) { int error; int unbound; struct kse *ke; struct ksegrp *kg; struct thread *td2; struct proc *p; error = 0; unbound = td->td_flags & TDF_UNBOUND; kg = td->td_ksegrp; p = td->td_proc; /* * Originally bound threads never upcall but they may * loan out their KSE at this point. * Upcalls imply bound.. They also may want to do some Philantropy. * Unbound threads on the other hand either yield to other work * or transform into an upcall. * (having saved their context to user space in both cases) */ if (unbound ) { /* * We are an unbound thread, looking to return to * user space. * THere are several possibilities: * 1) we are using a borrowed KSE. save state and exit. * kse_reassign() will recycle the kse as needed, * 2) we are not.. save state, and then convert ourself * to be an upcall, bound to the KSE. * if there are others that need the kse, * give them a chance by doing an mi_switch(). * Because we are bound, control will eventually return * to us here. * *** * Save the thread's context, and link it * into the KSEGRP's list of completed threads. */ error = thread_export_context(td); td->td_mailbox = NULL; if (error) { /* * If we are not running on a borrowed KSE, then * failing to do the KSE operation just defaults * back to synchonous operation, so just return from * the syscall. If it IS borrowed, there is nothing * we can do. We just lose that context. We * probably should note this somewhere and send * the process a signal. */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); mtx_lock_spin(&sched_lock); if (td->td_kse->ke_bound == NULL) { td->td_flags &= ~TDF_UNBOUND; PROC_UNLOCK(td->td_proc); mtx_unlock_spin(&sched_lock); return (error); /* go sync */ } thread_exit(); } /* * if the KSE is owned and we are borrowing it, * don't make an upcall, just exit so that the owner * can get its KSE if it wants it. * Our context is already safely stored for later * use by the UTS. */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (td->td_kse->ke_bound) { thread_exit(); } PROC_UNLOCK(p); /* * Turn ourself into a bound upcall. * We will rely on kse_reassign() * to make us run at a later time. * We should look just like a sheduled upcall * from msleep() or cv_wait(). */ td->td_flags &= ~TDF_UNBOUND; td->td_flags |= TDF_UPCALLING; /* Only get here if we have become an upcall */ } else { mtx_lock_spin(&sched_lock); } /* * We ARE going back to userland with this KSE. * Check for threads that need to borrow it. * Optimisation: don't call mi_switch if no-one wants the KSE. * Any other thread that comes ready after this missed the boat. */ ke = td->td_kse; if ((td2 = kg->kg_last_assigned)) td2 = TAILQ_NEXT(td2, td_runq); else td2 = TAILQ_FIRST(&kg->kg_runq); if (td2) { /* * force a switch to more urgent 'in kernel' * work. Control will return to this thread * when there is no more work to do. * kse_reassign() will do tha for us. */ TD_SET_LOAN(td); ke->ke_bound = td; ke->ke_thread = NULL; mi_switch(); /* kse_reassign() will (re)find td2 */ } mtx_unlock_spin(&sched_lock); /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) { if (ke->ke_tdspare) { td->td_standin = ke->ke_tdspare; ke->ke_tdspare = NULL; } else { td->td_standin = thread_alloc(); } } /* * To get here, we know there is no other need for our * KSE so we can proceed. If not upcalling, go back to * userspace. If we are, get the upcall set up. 
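 *
 * Just above, thread_userret() tops up td_standin (possibly from
 * ke_tdspare) so that a spare struct thread is always on hand before code
 * runs in a context where allocating one is not allowed.  The toy program
 * below shows that pre-allocated-spare pattern in isolation; struct ctx,
 * prepare() and grab_ctx() are invented names.
 */

#include <stdio.h>
#include <stdlib.h>

struct ctx {
	int	c_id;
};

static struct ctx *spare;	/* plays the role of td_standin/ke_tdspare */

/* Replenish the spare while allocation is still allowed. */
static void
prepare(void)
{
	if (spare == NULL)
		spare = malloc(sizeof(*spare));
}

/* In the "no allocation allowed" path, consume the spare instead. */
static struct ctx *
grab_ctx(void)
{
	struct ctx *c = spare;

	spare = NULL;
	return (c);
}

int
main(void)
{
	struct ctx *c;

	prepare();
	c = grab_ctx();
	if (c == NULL)
		return (1);
	c->c_id = 1;
	printf("got spare context %d\n", c->c_id);
	free(c);
	return (0);
}

/*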
*/ if ((td->td_flags & TDF_UPCALLING) == 0) return (0); /* * We must be an upcall to get this far. * There is no more work to do and we are going to ride * this thead/KSE up to userland as an upcall. * Do the last parts of the setup needed for the upcall. */ CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); /* * Set user context to the UTS. */ cpu_set_upcall_kse(td, ke); /* * Put any completed mailboxes on this KSE's list. */ error = thread_link_mboxes(kg, ke); if (error) goto bad; /* * Set state and mailbox. * From now on we are just a bound outgoing process. * **Problem** userret is often called several times. * it would be nice if this all happenned only on the first time * through. (the scan for extra work etc.) */ + mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_UPCALLING; + mtx_unlock_spin(&sched_lock); #if 0 error = suword((caddr_t)ke->ke_mailbox + offsetof(struct kse_mailbox, km_curthread), 0); #else /* if user pointer arithmetic is ok in the kernel */ error = suword((caddr_t)&ke->ke_mailbox->km_curthread, 0); #endif if (!error) return (0); bad: /* * Things are going to be so screwed we should just kill the process. * how do we do that? */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); PROC_UNLOCK(td->td_proc); return (error); /* go sync */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accellerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(int force_exit) { struct thread *td; struct thread *td2; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); if ((p->p_flag & P_KSES) == 0) return (0); /* Is someone already single threading? */ if (p->p_singlethread) return (1); if (force_exit == SINGLE_EXIT) p->p_flag |= P_SINGLE_EXIT; else p->p_flag &= ~P_SINGLE_EXIT; p->p_flag |= P_STOPPED_SINGLE; p->p_singlethread = td; /* XXXKSE Which lock protects the below values? */ while ((p->p_numthreads - p->p_suspcount) != 1) { mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; if (TD_IS_INHIBITED(td2)) { if (force_exit == SINGLE_EXIT) { if (TD_IS_SUSPENDED(td2)) { thread_unsuspend_one(td2); } if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } } else { if (TD_IS_SUSPENDED(td2)) continue; /* maybe other inhibitted states too? */ if (TD_IS_SLEEPING(td2)) thread_suspend_one(td2); } } } /* * Maybe we suspended some threads.. was it enough? */ if ((p->p_numthreads - p->p_suspcount) == 1) { mtx_unlock_spin(&sched_lock); break; } /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. 
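 *
 * The change above takes sched_lock around clearing TDF_UPCALLING because
 * td_flags is updated with read-modify-write sequences from more than one
 * context.  The program below is a userland illustration, with a pthread
 * mutex standing in for sched_lock, of how serializing "flags &= ~bit"
 * keeps a concurrent writer's update from being lost; the FLAG_* names are
 * invented.
 */

#include <pthread.h>
#include <stdio.h>

#define FLAG_UPCALLING	0x01
#define FLAG_ASTPENDING	0x02

static int flags = FLAG_UPCALLING | FLAG_ASTPENDING;
static pthread_mutex_t flags_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * An unlocked "flags &= ~bit" is a load, a mask and a store; two CPUs doing
 * that at once can overwrite each other's stores.  Holding the lock for the
 * whole operation makes each clear atomic with respect to other writers.
 */
static void
clear_flag(int bit)
{
	pthread_mutex_lock(&flags_lock);
	flags &= ~bit;
	pthread_mutex_unlock(&flags_lock);
}

static void *
other_writer(void *arg)
{
	(void)arg;
	clear_flag(FLAG_ASTPENDING);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, other_writer, NULL);
	clear_flag(FLAG_UPCALLING);
	pthread_join(t, NULL);
	printf("flags now %#x\n", flags);	/* 0: neither clear was lost */
	return (0);
}

/*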
*/ thread_suspend_one(td); mtx_unlock(&Giant); PROC_UNLOCK(p); mi_switch(); mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); PROC_LOCK(p); } if (force_exit == SINGLE_EXIT) kse_purge(p, td); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; td = curthread; p = td->td_proc; kg = td->td_ksegrp; PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p)) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if (return_instead) return (1); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { mtx_lock_spin(&sched_lock); while (mtx_owned(&Giant)) mtx_unlock(&Giant); /* * free extra kses and ksegrps, we needn't worry * about if current thread is in same ksegrp as * p_singlethread and last kse in the group * could be killed, this is protected by kg_numthreads, * in this case, we deduce that kg_numthreads must > 1. */ ke = td->td_kse; if (ke->ke_bound == NULL && ((kg->kg_kses != 1) || (kg->kg_numthreads == 1))) ke->ke_flags |= KEF_EXIT; thread_exit(); } /* * When a thread suspends, it just * moves to the processes's suspend queue * and stays there. * * XXXKSE if TDF_BOUND is true * it will not release it's KSE which might * lead to deadlock if there are not enough KSEs * to complete all waiting threads. * Maybe be able to 'lend' it out again. * (lent kse's can not go back to userland?) * and can only be lent in STOPPED state. 
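 *
 * The comment block above for thread_suspend_check() gives a small decision
 * table keyed on P_SINGLE_EXIT and the return_instead argument.  The
 * throwaway function below simply restates that table as code so the four
 * cases can be checked at a glance; the outcome labels are invented, and
 * the real function suspends or exits rather than returning one of them.
 */

#include <stdio.h>

enum outcome {
	RETURNS_0_WHEN_DONE,	/* suspend until single-threading ends */
	RETURNS_0_OR_1_NOW,	/* caller decides, immediately */
	THREAD_EXITS,		/* thread_exit(), never returns */
	RETURNS_1_NOW		/* caller must abort, immediately */
};

static enum outcome
suspend_check_outcome(int single_exit, int return_instead)
{
	if (!single_exit)
		return (return_instead ? RETURNS_0_OR_1_NOW :
		    RETURNS_0_WHEN_DONE);
	return (return_instead ? RETURNS_1_NOW : THREAD_EXITS);
}

int
main(void)
{
	printf("%d %d %d %d\n",
	    suspend_check_outcome(0, 0), suspend_check_outcome(0, 1),
	    suspend_check_outcome(1, 0), suspend_check_outcome(1, 1));
	return (0);
}

/*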
*/ mtx_lock_spin(&sched_lock); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount+1 == p->p_numthreads)) { mtx_unlock_spin(&sched_lock); PROC_LOCK(p->p_pptr); if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0) { psignal(p->p_pptr, SIGCHLD); } PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); } mtx_assert(&Giant, MA_NOTOWNED); thread_suspend_one(td); PROC_UNLOCK(p); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); p->p_suspcount++; TD_SET_SUSPENDED(td); TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq); /* * Hack: If we are suspending but are on the sleep queue * then we are in msleep or the cv equivalent. We * want to look like we have two Inhibitors. * May already be set.. doesn't matter. */ if (TD_ON_SLEEPQ(td)) TD_SET_SLEEPING(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_suspended, td, td_runq); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. */ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~P_STOPPED_SINGLE; p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { mtx_lock_spin(&sched_lock); while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } mtx_unlock_spin(&sched_lock); } } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 106179) +++ head/sys/sys/proc.h (revision 106180) @@ -1,938 +1,939 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #include #include #include /* XXX */ #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { int s_count; /* (m) Ref cnt; pgrps in session. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct tty *s_ttyp; /* (m) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Pgrp id. */ int pg_jobc; /* (m) job cntl proc count */ struct mtx pg_mtx; /* Mutex to protect members */ }; struct procsig { sigset_t ps_sigignore; /* Signals being ignored. */ sigset_t ps_sigcatch; /* Signals being caught by user. */ int ps_flag; struct sigacts *ps_sigacts; /* Signal actions, state. */ int ps_refcnt; }; #define PS_NOCLDWAIT 0x0001 /* No zombies if child dies */ #define PS_NOCLDSTOP 0x0002 /* No SIGCHLD when children stop. */ #define PS_CLDSIGIGN 0x0004 /* The SIGCHLD handler is SIG_IGN. */ /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. 
The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by sched_lock mtx * k - only accessed by curthread * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * p - select lock (sellock) * r - p_peers lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct ithd; struct nlminfo; struct trapframe; /* * Here we define the four structures used for process information. * * The first is the thread. It might be though of as a "Kernel * Schedulable Entity Context". * This structure contains all the information as to where a thread of * execution is now, or was when it was suspended, why it was suspended, * and anything else that will be needed to restart it when it is * rescheduled. Always associated with a KSE when running, but can be * reassigned to an equivalent KSE when being restarted for * load balancing. Each of these is associated with a kernel stack * and a pcb. * * It is important to remember that a particular thread structure only * exists as long as the system call or kernel entrance (e.g. by pagefault) * which it is currently executing. It should threfore NEVER be referenced * by pointers in long lived structures that live longer than a single * request. If several threads complete their work at the same time, * they will all rewind their stacks to the user boundary, report their * completion state, and all but one will be freed. That last one will * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel * entrance. (basically to save freeing and then re-allocating it) The KSE * keeps a cached thread available to allow it to quickly * get one when it needs a new one. There is also a system * cache of free threads. Threads have priority and partake in priority * inherritance schemes. */ struct thread; /* * The second structure is the Kernel Schedulable Entity. (KSE) * It represents the ability to take a slot in the scheduler queue. * As long as this is scheduled, it could continue to run any threads that * are assigned to the KSEGRP (see later) until either it runs out * of runnable threads of high enough priority, or CPU. * It runs on one CPU and is assigned a quantum of time. When a thread is * blocked, The KSE continues to run and will search for another thread * in a runnable state amongst those it has. It May decide to return to user * mode with a new 'empty' thread if there are no runnable threads. * Threads are temporarily associated with a KSE for scheduling reasons. */ struct kse; /* * The KSEGRP is allocated resources across a number of CPUs. * (Including a number of CPUxQUANTA. 
It parcels these QUANTA up among * Its KSEs, each of which should be running in a different CPU. * BASE priority and total available quanta are properties of a KSEGRP. * Multiple KSEGRPs in a single process compete against each other * for total quanta in the same way that a forked child competes against * it's parent process. */ struct ksegrp; /* * A process is the owner of all system resources allocated to a task * except CPU quanta. * All KSEGs under one process see, and have the same access to, these * resources (e.g. files, memory, sockets, permissions kqueues). * A process may compete for CPU cycles on the same basis as a * forked process cluster by spawning several KSEGRPs. */ struct proc; /*************** * In pictures: With a single run queue used by all processors: RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD | / []---THREAD KSEG---THREAD--THREAD--THREAD [] []---THREAD---THREAD (processors run THREADs from the KSEG until they are exhausted or the KSEG exhausts its quantum) With PER-CPU run queues: KSEs on the separate run queues directly They would be given priorities calculated from the KSEG. * *****************/ /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * The first KSE available in the correct group will run this thread. * If several are available, use the one on the same CPU as last time. * When waing to be run, threads are hung off the KSEGRP in priority order. * with N runnable and queued KSEs in the KSEGRP, the first N threads * are linked to them. Other threads are not yet assigned. */ struct thread { struct proc *td_proc; /* Associated process. */ struct ksegrp *td_ksegrp; /* Associated KSEG. */ TAILQ_ENTRY(thread) td_plist; /* All threads in this proc */ TAILQ_ENTRY(thread) td_kglist; /* All threads in this ksegrp */ /* The two queues below should someday be merged */ TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. XXXKSE */ TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. XXXKSE */ TAILQ_ENTRY(thread) td_runq; /* (j) Run queue(s). XXXKSE */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ /* Cleared during fork1() or thread_sched_upcall() */ #define td_startzero td_flags int td_flags; /* (j) TDF_* flags. */ int td_inhibitors; /* (j) Why can not run */ struct kse *td_last_kse; /* (j) Previous value of td_kse */ struct kse *td_kse; /* (j) Current KSE if running. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ void *td_wchan; /* (j) Sleep address. */ const char *td_wmesg; /* (j) Reason for sleep. */ u_char td_lastcpu; /* (j) Last cpu we were on. */ u_char td_inktr; /* (k) Currently handling a KTR. */ u_char td_inktrace; /* (k) Currently handling a KTRACE. */ short td_locks; /* (k) DEBUG: lockmgr count of locks */ struct mtx *td_blocked; /* (j) Mutex process is blocked on. */ struct ithd *td_ithd; /* (b) For interrupt threads only. */ const char *td_lockname; /* (j) Name of lock blocked on. */ LIST_HEAD(, mtx) td_contested; /* (j) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ struct kse_thr_mailbox *td_mailbox; /* the userland mailbox address */ struct ucred *td_ucred; /* (k) Reference to credentials. */ void (*td_switchin)(void); /* (k) Switchin special func. */ struct thread *td_standin; /* (?) use this for an upcall */ u_int td_critnest; /* (k) Critical section nest level. 
*/ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall() */ #define td_startcopy td_endzero u_char td_base_pri; /* (j) Thread base kernel priority. */ u_char td_priority; /* (j) Thread active priority. */ #define td_endcopy td_pcb /* * fields that must be manually set in fork1() or thread_sched_upcall() * or already have been set in the allocator, contstructor, etc.. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x20, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* Kernel VA of kstack. */ int td_kstack_pages; /* Size of the kstack */ struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */ vm_offset_t td_altkstack; /* Kernel VA of alternate kstack. */ int td_altkstack_pages; /* Size of the alternate kstack */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ }; /* flags kept in td_flags */ #define TDF_UNBOUND 0x000001 /* May give away the kse, uses the kg runq. */ #define TDF_INPANIC 0x000002 /* Caused a panic, let it drive crashdump. */ #define TDF_SINTR 0x000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x000010 /* Timing out during sleep. */ #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */ #define TDF_CVWAITQ 0x000080 /* Thread is on a cv_waitq (not slpq). */ #define TDF_UPCALLING 0x000100 /* This thread is doing an upcall. */ #define TDF_ONSLEEPQ 0x000200 /* On the sleep queue. */ #define TDF_INMSLEEP 0x000400 /* Don't recurse in msleep(). */ #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */ +#define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */ #define TDF_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ #define TDI_SUSPENDED 0x01 /* On suspension queue. */ #define TDI_SLEEPING 0x02 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x04 /* Stack not in mem.. bad juju if run. */ #define TDI_LOCK 0x08 /* Stopped on a lock. */ #define TDI_IWAIT 0x10 /* Awaiting interrupt. 
*/ #define TDI_LOAN 0x20 /* bound thread's KSE is lent */ #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_LENT(td) ((td)->td_inhibitors & TDI_LOAN) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= inhib; \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & inhib) && \ (((td)->td_inhibitors &= ~inhib) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_LOAN(td) TD_SET_INHIB((td), TDI_LOAN) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_CLR_LOAN(td) TD_CLR_INHIB((td), TDI_LOAN) #define TD_SET_RUNNING(td) do {(td)->td_state = TDS_RUNNING; } while (0) #define TD_SET_RUNQ(td) do {(td)->td_state = TDS_RUNQ; } while (0) #define TD_SET_CAN_RUN(td) do {(td)->td_state = TDS_CAN_RUN; } while (0) #define TD_SET_ON_SLEEPQ(td) do {(td)->td_flags |= TDF_ONSLEEPQ; } while (0) #define TD_CLR_ON_SLEEPQ(td) do { \ (td)->td_flags &= ~TDF_ONSLEEPQ; \ (td)->td_wchan = NULL; \ } while (0) /* * Traps for young players: * The main thread variable that controls whether a thread acts as a threaded * or unthreaded thread is the td_bound counter (0 == unbound). * UPCALLS run with the UNBOUND flags clear, after they are first scheduled. * i.e. they bind themselves to whatever thread thay are first scheduled with. * You may see BOUND threads in KSE processes but you should never see * UNBOUND threads in non KSE processes. */ /* * The schedulable entity that can be given a context to run. * A process may have several of these. Probably one per processor * but posibly a few more. In this universe they are grouped * with a KSEG that contains the priority and niceness * for the group. */ struct kse { struct proc *ke_proc; /* Associated process. */ struct ksegrp *ke_ksegrp; /* Associated KSEG. */ TAILQ_ENTRY(kse) ke_kglist; /* Queue of all KSEs in ke_ksegrp. */ TAILQ_ENTRY(kse) ke_kgrlist; /* Queue of all KSEs in this state. */ TAILQ_ENTRY(kse) ke_procq; /* (j) Run queue. */ #define ke_startzero ke_flags int ke_flags; /* (j) KEF_* flags. */ struct thread *ke_thread; /* Active associated thread. */ struct thread *ke_bound; /* Thread bound to this KSE (*) */ int ke_cpticks; /* (j) Ticks of cpu time. */ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ u_int64_t ke_uu; /* (j) Previous user time in usec. */ u_int64_t ke_su; /* (j) Previous system time in usec. */ u_int64_t ke_iu; /* (j) Previous intr time in usec. 
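 *
 * The TD_SET_INHIB()/TD_CLR_INHIB() macros above let a thread carry several
 * inhibitor bits at once and only return it to TDS_CAN_RUN when the last
 * one is cleared.  The standalone program below restates that accounting on
 * a toy structure so the "last inhibitor wins" behaviour is visible; struct
 * tstate and the lowercase helpers are invented for the example.
 */

#include <stdio.h>

#define TOY_SUSPENDED	0x01
#define TOY_SLEEPING	0x02

enum toystate { TOY_CAN_RUN, TOY_INHIBITED };

struct tstate {
	enum toystate	state;
	int		inhibitors;
};

static void
set_inhib(struct tstate *t, int inhib)
{
	t->state = TOY_INHIBITED;
	t->inhibitors |= inhib;
}

static void
clr_inhib(struct tstate *t, int inhib)
{
	/* Only the clear of the final bit flips the state back. */
	if ((t->inhibitors & inhib) != 0 &&
	    (t->inhibitors &= ~inhib) == 0)
		t->state = TOY_CAN_RUN;
}

int
main(void)
{
	struct tstate t = { TOY_CAN_RUN, 0 };

	set_inhib(&t, TOY_SLEEPING);
	set_inhib(&t, TOY_SUSPENDED);	/* e.g. suspended while asleep */
	clr_inhib(&t, TOY_SLEEPING);
	printf("after wakeup: %s\n",
	    t.state == TOY_INHIBITED ? "still inhibited" : "can run");
	clr_inhib(&t, TOY_SUSPENDED);
	printf("after unsuspend: %s\n",
	    t.state == TOY_INHIBITED ? "still inhibited" : "can run");
	return (0);
}

/*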
/*
 * The schedulable entity that can be given a context to run.  A process may
 * have several of these.  Probably one per processor, but possibly a few
 * more.  In this universe they are grouped with a KSEG that contains the
 * priority and niceness for the group.
 */
struct kse {
	struct proc	*ke_proc;	/* Associated process. */
	struct ksegrp	*ke_ksegrp;	/* Associated KSEG. */
	TAILQ_ENTRY(kse) ke_kglist;	/* Queue of all KSEs in ke_ksegrp. */
	TAILQ_ENTRY(kse) ke_kgrlist;	/* Queue of all KSEs in this state. */
	TAILQ_ENTRY(kse) ke_procq;	/* (j) Run queue. */
#define	ke_startzero ke_flags
	int		ke_flags;	/* (j) KEF_* flags. */
	struct thread	*ke_thread;	/* Active associated thread. */
	struct thread	*ke_bound;	/* Thread bound to this KSE (*). */
	int		ke_cpticks;	/* (j) Ticks of cpu time. */
	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
	u_int64_t	ke_uu;		/* (j) Previous user time in usec. */
	u_int64_t	ke_su;		/* (j) Previous system time in usec. */
	u_int64_t	ke_iu;		/* (j) Previous intr time in usec. */
	u_int64_t	ke_uticks;	/* (j) Statclock hits in user mode. */
	u_int64_t	ke_sticks;	/* (j) Statclock hits in system mode. */
	u_int64_t	ke_iticks;	/* (j) Statclock hits in intr. */
	u_char		ke_oncpu;	/* (j) Which cpu we are on. */
	char		ke_rqindex;	/* (j) Run queue index. */
	enum {
		KES_IDLE = 0x10,
		KES_ONRUNQ,
		KES_UNQUEUED,		/* in transit */
		KES_THREAD		/* slaved to thread state */
	} ke_state;			/* (j) S* process status. */
	struct kse_mailbox *ke_mailbox;	/* The userland mailbox address. */
	stack_t		ke_stack;
	void		*ke_upcall;
	struct thread	*ke_tdspare;	/* Spare thread for upcalls. */
#define	ke_endzero ke_dummy
	u_char		ke_dummy;
};

/* flags kept in ke_flags */
#define	KEF_OWEUPC	0x00002	/* Owe process an addupc() call at next ast. */
#define	KEF_IDLEKSE	0x00004	/* A 'per-CPU idle process'; has one thread. */
#define	KEF_LOANED	0x00008	/* On loan from the bound thread to another. */
#define	KEF_USER	0x00200	/* Process is not officially in the kernel. */
#define	KEF_ASTPENDING	0x00400	/* KSE has a pending ast. */
#define	KEF_NEEDRESCHED	0x00800	/* Process needs to yield. */
#define	KEF_ONLOANQ	0x01000	/* KSE is on loan queue. */
#define	KEF_DIDRUN	0x02000	/* KSE actually ran. */
#define	KEF_EXIT	0x04000	/* KSE is being killed. */

/*
 * (*) A bound KSE with a bound thread in a KSE process may be lent to
 * other threads, as long as those threads do not leave the kernel.
 * The other threads must be either exiting, or be unbound with a valid
 * mailbox so that they can save their state there rather than going
 * to user space.  While this happens the real bound thread is still linked
 * to the KSE via the ke_bound field, and the KSE has its KEF_LOANED
 * flag set.
 */
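/*
 * Illustrative sketch (not part of the original header): an INVARIANTS-style
 * check of the loan relationship described in the comment above.  The helper
 * name kse_loan_assert() is hypothetical, and the assumption that the bound
 * thread always carries TDI_LOAN while its KSE is marked KEF_LOANED is drawn
 * from the flag descriptions here, not from the implementation itself.
 */
#if 0
static __inline void
kse_loan_assert(struct kse *ke)
{
	if (ke->ke_flags & KEF_LOANED) {
		/* A loaned KSE still remembers its bound thread. */
		KASSERT(ke->ke_bound != NULL,
		    ("loaned KSE has no bound thread"));
		/* The bound thread is marked as having lent its KSE. */
		KASSERT(TD_LENT(ke->ke_bound),
		    ("bound thread not marked TDI_LOAN"));
	}
}
#endif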
/*
 * Kernel-scheduled entity group (KSEG).  The scheduler considers each KSEG
 * to be an indivisible unit from a time-sharing perspective, though each
 * KSEG may contain multiple KSEs.
 */
struct ksegrp {
	struct proc	*kg_proc;	/* Process that contains this KSEG. */
	TAILQ_ENTRY(ksegrp) kg_ksegrp;	/* Queue of KSEGs in kg_proc. */
	TAILQ_HEAD(, kse) kg_kseq;	/* (ke_kglist) All KSEs. */
	TAILQ_HEAD(, kse) kg_iq;	/* (ke_kgrlist) Idle KSEs. */
	TAILQ_HEAD(, kse) kg_lq;	/* (ke_kgrlist) Loan KSEs. */
	TAILQ_HEAD(, thread) kg_threads; /* (td_kglist) All threads. */
	TAILQ_HEAD(, thread) kg_runq;	/* (td_runq) Waiting RUNNABLE threads. */
	TAILQ_HEAD(, thread) kg_slpq;	/* (td_runq) NONRUNNABLE threads. */
#define	kg_startzero kg_estcpu
	u_int		kg_estcpu;	/* Sum of the same field in KSEs. */
	u_int		kg_slptime;	/* (j) How long completely blocked. */
	struct thread	*kg_last_assigned; /* Last thread assigned to a KSE. */
	int		kg_runnable;	/* Num runnable threads on queue. */
	int		kg_runq_kses;	/* Num KSEs on runq. */
	int		kg_loan_kses;	/* Num KSEs on loan queue. */
	struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. */
#define	kg_endzero kg_pri_class
#define	kg_startcopy kg_endzero
	u_char		kg_pri_class;	/* (j) Scheduling class. */
	u_char		kg_user_pri;	/* (j) User pri from estcpu and nice. */
	char		kg_nice;	/* (j?/k?) Process "nice" value. */
#define	kg_endcopy kg_numthreads
	int		kg_numthreads;	/* Num threads in total. */
	int		kg_idle_kses;	/* Num KSEs idle. */
	int		kg_kses;	/* Num KSEs in group. */
};

/*
 * The old-fashioned process.  May have multiple threads, KSEGRPs
 * and KSEs.  Starts off with a single embedded KSEGRP, KSE and THREAD.
 */
struct proc {
	LIST_ENTRY(proc) p_list;	/* (d) List of all processes. */
	TAILQ_HEAD(, ksegrp) p_ksegrps;	/* (kg_ksegrp) All KSEGs. */
	TAILQ_HEAD(, thread) p_threads;	/* (td_plist) Threads. (shortcut) */
	TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */
	struct ucred	*p_ucred;	/* (c) Process owner's identity. */
	struct filedesc	*p_fd;		/* (b) Ptr to open files structure. */
	/* Accumulated stats for all KSEs? */
	struct pstats	*p_stats;	/* (b) Accounting/statistics (CPU). */
	struct plimit	*p_limit;	/* (m) Process limits. */
	struct vm_object *p_upages_obj;	/* (a) Upages object. */
	struct procsig	*p_procsig;	/* (c) Signal actions, state (CPU). */
	/* struct ksegrp p_ksegrp;  struct kse p_kse; */

	/*
	 * The following don't make too much sense..
	 * See the td_ or ke_ versions of the same flags.
	 */
	int		p_flag;		/* (c) P_* flags. */
	int		p_sflag;	/* (j) PS_* flags. */
	enum {
		PRS_NEW = 0,		/* In creation */
		PRS_NORMAL,		/* KSEs can be run */
		PRS_ZOMBIE
	} p_state;			/* (j) S* process status. */
	pid_t		p_pid;		/* (b) Process identifier. */
	LIST_ENTRY(proc) p_hash;	/* (d) Hash chain. */
	LIST_ENTRY(proc) p_pglist;	/* (g + e) List of processes in pgrp. */
	struct proc	*p_pptr;	/* (c + e) Pointer to parent process. */
	LIST_ENTRY(proc) p_sibling;	/* (e) List of sibling processes. */
	LIST_HEAD(, proc) p_children;	/* (e) Pointer to list of children. */
	struct mtx	p_mtx;		/* (k) Lock for this struct. */

/* The following fields are all zeroed upon creation in fork. */
#define	p_startzero	p_oppid
	pid_t		p_oppid;	/* (c + e) Save ppid in ptrace. XXX */
	struct vmspace	*p_vmspace;	/* (b) Address space. */
	u_int		p_swtime;	/* (j) Time swapped in or out. */
	struct itimerval p_realtimer;	/* (h?/k?) Alarm timer. */
	struct bintime	p_runtime;	/* (j) Real time. */
	int		p_traceflag;	/* (o) Kernel trace points. */
	struct vnode	*p_tracep;	/* (c + o) Trace to vnode. */
	sigset_t	p_siglist;	/* (c) Sigs arrived, not delivered. */
	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
	char		p_lock;		/* (c) Proclock (prevent swap) count. */
	struct klist	p_klist;	/* (c) Knotes attached to this proc. */
	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
	int		p_sigparent;	/* (c) Signal to parent on exit. */
	sigset_t	p_oldsigmask;	/* (c) Saved mask from pre sigpause. */
	int		p_sig;		/* (n) For core dump/debugger XXX. */
	u_long		p_code;		/* (n) For core dump/debugger XXX. */
	u_int		p_stops;	/* (c) Stop event bitmask. */
	u_int		p_stype;	/* (c) Stop event type. */
	char		p_step;		/* (c) Process is stopped. */
	u_char		p_pfsflags;	/* (c) Procfs flags. */
	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
	void		*p_aioinfo;	/* (c) ASYNC I/O info. */
	struct thread	*p_singlethread; /* (j) If single threading, this is it. */
	int		p_suspcount;	/* (j) # threads in suspended mode. */
	int		p_userthreads;	/* (j) # threads in userland. */
/* End area that is zeroed on creation. */
#define	p_endzero	p_sigmask

/* The following fields are all copied upon creation in fork. */
#define	p_startcopy	p_endzero
	sigset_t	p_sigmask;	/* (c) Current signal mask. */
	stack_t		p_sigstk;	/* (c) Stack ptr and on-stack flag. */
	u_int		p_magic;	/* (b) Magic number. */
	char		p_comm[MAXCOMLEN + 1];	/* (b) Process name. */
	struct pgrp	*p_pgrp;	/* (c + e) Pointer to process group. */
	struct sysentvec *p_sysent;	/* (b) Syscall dispatch info. */
	struct pargs	*p_args;	/* (c) Process arguments. */
	rlim_t		p_cpulimit;	/* (j) Current CPU limit in seconds. */
/* End area that is copied on creation. */
#define	p_endcopy	p_xstat
	u_short		p_xstat;	/* (c) Exit status; also stop sig. */
	int		p_numthreads;	/* (?) Number of threads. */
	int		p_numksegrps;	/* (?) Number of ksegrps. */
	struct mdproc	p_md;		/* (c) Any machine-dependent fields. */
	struct callout	p_itcallout;	/* (h) Interval timer callout. */
	struct user	*p_uarea;	/* (k) Kernel VA of u-area (CPU). */
	u_short		p_acflag;	/* (c) Accounting flags. */
	struct rusage	*p_ru;		/* (a) Exit information. XXX */
	struct proc	*p_peers;	/* (r) */
	struct proc	*p_leader;	/* (b) */
	void		*p_emuldata;	/* (c) Emulator state data. */
};

#define	p_rlimit	p_limit->pl_rlimit
#define	p_sigacts	p_procsig->ps_sigacts
#define	p_sigignore	p_procsig->ps_sigignore
#define	p_sigcatch	p_procsig->ps_sigcatch
#define	p_session	p_pgrp->pg_session
#define	p_pgid		p_pgrp->pg_id

#define	NOCPU	0xff		/* For when we aren't on a CPU. (SMP) */

/* Status values (p_stat). */

/* These flags are kept in p_flag. */
#define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
#define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
#define	P_KTHREAD	0x00004	/* Kernel thread. (*) */
#define	P_NOLOAD	0x00008	/* Ignore during load avg calculations. */
#define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
#define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
#define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
#define	P_WAITED	0x01000	/* Someone is waiting for us. */
#define	P_WEXIT		0x02000	/* Working on exiting. */
#define	P_EXEC		0x04000	/* Process called exec. */
#define	P_KSES		0x08000	/* Process is using KSEs. */
#define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */

/* Flags that control how threads may be suspended for some reason. */
#define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
#define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
#define	P_STOPPED_SINGLE 0x80000 /* Only one thread can continue */
				 /* (not to user). */
#define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
#define	P_TRACED	0x00800	/* Debugged process being traced. */
#define	P_STOPPED	(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
#define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)

/* Should be moved to machine-dependent areas. */
#define	P_UNUSED100000	0x100000
#define	P_COWINPROGRESS	0x400000 /* Snapshot copy-on-write in progress. */

#define	P_JAILED	0x1000000 /* Process is in jail. */
#define	P_OLDMASK	0x2000000 /* Need to restore mask after suspend. */
#define	P_ALTSTACK	0x4000000 /* Have alternate signal stack. */
#define	P_INEXEC	0x8000000 /* Process is in execve(). */

/* These flags are kept in p_sflag and are protected with sched_lock. */
#define	PS_INMEM	0x00001	/* Loaded into memory. */
#define	PS_XCPU		0x00002	/* Exceeded CPU limit. */
#define	PS_PROFIL	0x00004	/* Has started profiling. */
#define	PS_ALRMPEND	0x00020	/* Pending SIGVTALRM needs to be posted. */
#define	PS_PROFPEND	0x00040	/* Pending SIGPROF needs to be posted. */
#define	PS_SWAPINREQ	0x00100	/* Swapin request due to wakeup. */
#define	PS_SWAPPING	0x00200	/* Process is being swapped. */
#define	PS_NEEDSIGCHK	0x02000	/* Process may need signal delivery. */
#define	PS_SWAPPINGIN	0x04000	/* Swapin in progress. */

/* Used only in legacy conversion code. */
#define	SIDL	1		/* Process being created by fork. */
#define	SRUN	2		/* Currently runnable. */
#define	SSLEEP	3		/* Sleeping on an address. */
#define	SSTOP	4		/* Process debugging or suspension. */
#define	SZOMB	5		/* Awaiting collection by parent. */
#define	SWAIT	6		/* Waiting for interrupt. */
#define	SLOCK	7		/* Blocked on a lock. */
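/*
 * Illustrative sketch (not part of the original header): how the composite
 * P_STOPPED mask and P_SHOULDSTOP() above are intended to be read.  The
 * function name example_proc_is_stopped() is hypothetical, and holding the
 * process lock around the p_flag test is an assumption of the sketch.
 */
#if 0
static __inline int
example_proc_is_stopped(struct proc *p)
{
	PROC_LOCK_ASSERT(p, MA_OWNED);

	/*
	 * P_SHOULDSTOP() is true if the process was stopped by a signal,
	 * by the debugger, or because one thread is single-threading it.
	 */
	if (P_SHOULDSTOP(p)) {
		if (p->p_flag & P_STOPPED_TRACE)
			return (2);	/* Stopped for tracing. */
		return (1);		/* Stopped by signal or single-threading. */
	}
	return (0);
}
#endif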
#define	P_MAGIC		0xbeefface

#ifdef _KERNEL

#ifdef MALLOC_DECLARE
MALLOC_DECLARE(M_PARGS);
MALLOC_DECLARE(M_PGRP);
MALLOC_DECLARE(M_SESSION);
MALLOC_DECLARE(M_SUBPROC);
MALLOC_DECLARE(M_ZOMBIE);
#endif

#define	FOREACH_PROC_IN_SYSTEM(p)					\
	LIST_FOREACH((p), &allproc, p_list)
#define	FOREACH_KSEGRP_IN_PROC(p, kg)					\
	TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp)
#define	FOREACH_THREAD_IN_GROUP(kg, td)					\
	TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist)
#define	FOREACH_KSE_IN_GROUP(kg, ke)					\
	TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist)
#define	FOREACH_THREAD_IN_PROC(p, td)					\
	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)

/* XXXKSE the lines below should probably only be used in 1:1 code. */
#define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&p->p_threads)
#define	FIRST_KSEGRP_IN_PROC(p)	TAILQ_FIRST(&p->p_ksegrps)
#define	FIRST_KSE_IN_KSEGRP(kg)	TAILQ_FIRST(&kg->kg_kseq)
#define	FIRST_KSE_IN_PROC(p)	FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p))

static __inline int
sigonstack(size_t sp)
{
	register struct thread *td = curthread;
	struct proc *p = td->td_proc;

	return ((p->p_flag & P_ALTSTACK) ?
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
	    ((p->p_sigstk.ss_size == 0) ?
	    (p->p_sigstk.ss_flags & SS_ONSTACK) :
	    ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size))
#else
	    ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size)
#endif
	    : 0);
}

/* Handy macro to determine if p1 can mangle p2. */
#define	PRISON_CHECK(p1, p2) \
	((p1)->p_prison == NULL || (p1)->p_prison == (p2)->p_prison)

/*
 * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t,
 * as it is used to represent "no process group".
 */
#define	PID_MAX		99999
#define	NO_PID		100000

#define	SESS_LEADER(p)	((p)->p_session->s_leader == (p))
#define	SESSHOLD(s)	((s)->s_count++)
#define	SESSRELE(s) {							\
	if (--(s)->s_count == 0)					\
		FREE(s, M_SESSION);					\
}

#define	STOPEVENT(p, e, v) do {						\
	PROC_LOCK(p);							\
	_STOPEVENT((p), (e), (v));					\
	PROC_UNLOCK(p);							\
} while (0)
#define	_STOPEVENT(p, e, v) do {					\
	PROC_LOCK_ASSERT(p, MA_OWNED);					\
	if ((p)->p_stops & (e)) {					\
		stopevent((p), (e), (v));				\
	}								\
} while (0)

/* Lock and unlock a process. */
#define	PROC_LOCK(p)	mtx_lock(&(p)->p_mtx)
#define	PROC_TRYLOCK(p)	mtx_trylock(&(p)->p_mtx)
#define	PROC_UNLOCK(p)	mtx_unlock(&(p)->p_mtx)
#define	PROC_LOCKED(p)	mtx_owned(&(p)->p_mtx)
#define	PROC_LOCK_ASSERT(p, type)	mtx_assert(&(p)->p_mtx, (type))

/* Lock and unlock a process group. */
#define	PGRP_LOCK(pg)	mtx_lock(&(pg)->pg_mtx)
#define	PGRP_UNLOCK(pg)	mtx_unlock(&(pg)->pg_mtx)
#define	PGRP_LOCKED(pg)	mtx_owned(&(pg)->pg_mtx)
#define	PGRP_LOCK_ASSERT(pg, type)	mtx_assert(&(pg)->pg_mtx, (type))

#define	PGRP_LOCK_PGSIGNAL(pg)						\
	do {								\
		if ((pg) != NULL)					\
			PGRP_LOCK(pg);					\
	} while (0);
#define	PGRP_UNLOCK_PGSIGNAL(pg)					\
	do {								\
		if ((pg) != NULL)					\
			PGRP_UNLOCK(pg);				\
	} while (0);

/* Lock and unlock a session. */
#define	SESS_LOCK(s)	mtx_lock(&(s)->s_mtx)
#define	SESS_UNLOCK(s)	mtx_unlock(&(s)->s_mtx)
#define	SESS_LOCKED(s)	mtx_owned(&(s)->s_mtx)
#define	SESS_LOCK_ASSERT(s, type)	mtx_assert(&(s)->s_mtx, (type))
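/*
 * Illustrative sketch (not part of the original header): walking the
 * proc -> ksegrp -> thread containment with the FOREACH_*() macros above.
 * The function name example_count_threads() is hypothetical; it merely
 * recomputes what p_numthreads already tracks, as a way of showing the
 * iteration pattern.  The process lock is shown for illustration only;
 * the precise locking requirements of these lists are not stated here.
 */
#if 0
static int
example_count_threads(struct proc *p)
{
	struct ksegrp *kg;
	struct thread *td;
	int count = 0;

	PROC_LOCK(p);
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		FOREACH_THREAD_IN_GROUP(kg, td)
			count++;
	}
	/* FOREACH_THREAD_IN_PROC(p, td) would walk the same set directly. */
	PROC_UNLOCK(p);
	return (count);
}
#endif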
/* Hold process U-area in memory, normally for ptrace/procfs work. */
#define	PHOLD(p) do {							\
	PROC_LOCK(p);							\
	_PHOLD(p);							\
	PROC_UNLOCK(p);							\
} while (0)
#define	_PHOLD(p) do {							\
	PROC_LOCK_ASSERT((p), MA_OWNED);				\
	if ((p)->p_lock++ == 0) {					\
		mtx_lock_spin(&sched_lock);				\
		faultin((p));						\
		mtx_unlock_spin(&sched_lock);				\
	}								\
} while (0)

#define	PRELE(p) do {							\
	PROC_LOCK((p));							\
	_PRELE((p));							\
	PROC_UNLOCK((p));						\
} while (0)
#define	_PRELE(p) do {							\
	PROC_LOCK_ASSERT((p), MA_OWNED);				\
	(--(p)->p_lock);						\
} while (0)

/* Check whether a thread is safe to be swapped out. */
#define	thread_safetoswapout(td)	(TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td))

/* Lock and unlock process arguments. */
#define	PARGS_LOCK(p)	mtx_lock(&pargs_ref_lock)
#define	PARGS_UNLOCK(p)	mtx_unlock(&pargs_ref_lock)

#define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
extern u_long pidhash;

#define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
extern u_long pgrphash;

extern struct sx allproc_lock;
extern struct sx proctree_lock;
extern struct mtx pargs_ref_lock;
extern struct mtx ppeers_lock;
extern struct proc proc0;		/* Process slot for swapper. */
extern struct thread thread0;		/* Primary thread in proc0. */
extern struct ksegrp ksegrp0;		/* Primary ksegrp in proc0. */
extern struct kse kse0;			/* Primary kse in proc0. */
extern int hogticks;			/* Limit on kernel cpu hogs. */
extern int nprocs, maxproc;		/* Current and max number of procs. */
extern int maxprocperuid;		/* Max procs per uid. */
extern u_long ps_arg_cache_limit;
extern int ps_argsopen;
extern int ps_showallprocs;
extern int sched_quantum;		/* Scheduling quantum in ticks. */

LIST_HEAD(proclist, proc);
TAILQ_HEAD(procqueue, proc);
TAILQ_HEAD(threadqueue, thread);
extern struct proclist allproc;		/* List of all processes. */
extern struct proclist zombproc;	/* List of zombie processes. */
extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
extern struct proc *updateproc;		/* Process slot for syncer (sic). */
extern struct uma_zone *proc_zone;
extern int lastpid;

/*
 * XXX macros for scheduler.  Shouldn't be here, but currently needed for
 * bounding the dubious p_estcpu inheritance in wait1().
 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
 * the range 100-256 Hz (approximately).
 */
#define	ESTCPULIM(e) \
	min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \
	    RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1)
#define	INVERSE_ESTCPU_WEIGHT	8	/* 1 / (priorities per estcpu level). */
#define	NICE_WEIGHT		1	/* Priorities per nice level. */

struct	proc *pfind(pid_t);		/* Find process by id. */
struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
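/*
 * Illustrative sketch (not part of the original header): the usual pairing
 * of pfind() with _PHOLD()/PRELE() to keep a process's U-area resident,
 * e.g. for ptrace/procfs-style access.  The function name
 * example_hold_proc() is hypothetical, and the assumption that pfind()
 * returns the process with its lock held (as the _PHOLD() lock assertion
 * requires) is the sketch's, not the header's.
 */
#if 0
static struct proc *
example_hold_proc(pid_t pid)
{
	struct proc *p;

	if ((p = pfind(pid)) == NULL)
		return (NULL);
	_PHOLD(p);		/* Bump p_lock; fault the U-area in if needed. */
	PROC_UNLOCK(p);
	return (p);		/* Caller later does PRELE(p) to drop the hold. */
}
#endif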
void	adjustrunqueue(struct thread *, int newpri);
void	ast(struct trapframe *framep);
struct	thread *choosethread(void);
int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
	    struct session *sess);
int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
void	faultin(struct proc *p);
void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
int	fork1(struct thread *, int, int, struct proc **);
void	fork_exit(void (*)(void *, struct trapframe *), void *,
	    struct trapframe *);
void	fork_return(struct thread *, struct trapframe *);
int	inferior(struct proc *p);
int	leavepgrp(struct proc *p);
void	mi_switch(void);
int	p_candebug(struct thread *td, struct proc *p);
int	p_cansee(struct thread *td, struct proc *p);
int	p_cansched(struct thread *td, struct proc *p);
int	p_cansignal(struct thread *td, struct proc *p, int signum);
struct	pargs *pargs_alloc(int len);
void	pargs_drop(struct pargs *pa);
void	pargs_free(struct pargs *pa);
void	pargs_hold(struct pargs *pa);
void	procinit(void);
void	threadinit(void);
void	proc_linkup(struct proc *p, struct ksegrp *kg,
	    struct kse *ke, struct thread *td);
void	proc_reparent(struct proc *child, struct proc *newparent);
int	securelevel_ge(struct ucred *cr, int level);
int	securelevel_gt(struct ucred *cr, int level);
void	setrunnable(struct thread *);
void	setrunqueue(struct thread *);
void	setsugid(struct proc *p);
void	sleepinit(void);
void	stopevent(struct proc *, u_int, u_int);
void	cpu_idle(void);
void	cpu_switch(void);
void	cpu_throw(void) __dead2;
void	unsleep(struct thread *);
void	userret(struct thread *, struct trapframe *, u_int);

void	cpu_exit(struct thread *);
void	cpu_sched_exit(struct thread *);
void	exit1(struct thread *, int) __dead2;
void	cpu_fork(struct thread *, struct proc *, struct thread *, int);
void	cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
void	cpu_wait(struct proc *);
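/*
 * Illustrative sketch (not part of the original header): the typical shape
 * of a permission check with the p_can*() family declared above.  The
 * function name example_try_signal() is hypothetical, psignal() is declared
 * in <sys/signalvar.h>, and the assumption that p_cansignal() wants the
 * target's process lock held is the sketch's, not the header's.
 */
#if 0
static int
example_try_signal(struct thread *td, struct proc *p, int signum)
{
	int error;

	PROC_LOCK(p);
	error = p_cansignal(td, p, signum);	/* 0 means permitted. */
	if (error == 0)
		psignal(p, signum);
	PROC_UNLOCK(p);
	return (error);
}
#endif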
/* New in KSE. */
struct	ksegrp *ksegrp_alloc(void);
void	ksegrp_free(struct ksegrp *kg);
void	ksegrp_stash(struct ksegrp *kg);
struct	kse *kse_alloc(void);
void	kse_free(struct kse *ke);
void	kse_stash(struct kse *ke);
void	cpu_set_upcall(struct thread *td, void *pcb);
void	cpu_set_upcall_kse(struct thread *td, struct kse *ke);
void	cpu_thread_exit(struct thread *);
void	cpu_thread_setup(struct thread *td);
void	kse_reassign(struct kse *ke);
void	kse_link(struct kse *ke, struct ksegrp *kg);
void	kse_unlink(struct kse *ke);
void	ksegrp_link(struct ksegrp *kg, struct proc *p);
void	ksegrp_unlink(struct ksegrp *kg);
void	make_kse_runnable(struct kse *ke);
struct	thread *signal_upcall(struct proc *p, int sig);
struct	thread *thread_alloc(void);
void	thread_exit(void) __dead2;
int	thread_export_context(struct thread *td);
void	thread_free(struct thread *td);
void	thread_getcontext(struct thread *td, ucontext_t *uc);
void	thread_link(struct thread *td, struct ksegrp *kg);
void	thread_reap(void);
struct	thread *thread_schedule_upcall(struct thread *td, struct kse *ke);
int	thread_setcontext(struct thread *td, ucontext_t *uc);
int	thread_single(int how);
#define	SINGLE_NO_EXIT	0	/* Values for 'how'. */
#define	SINGLE_EXIT	1
void	thread_single_end(void);
void	thread_stash(struct thread *td);
int	thread_suspend_check(int how);
void	thread_suspend_one(struct thread *td);
void	thread_unsuspend(struct proc *p);
void	thread_unsuspend_one(struct thread *td);
int	thread_userret(struct thread *td, struct trapframe *frame);
void	thread_user_enter(struct proc *p, struct thread *td);
void	thread_sanity_check(struct thread *td, char *);
#endif	/* _KERNEL */
#endif	/* !_SYS_PROC_H_ */