Index: head/sys/alpha/alpha/trap.c =================================================================== --- head/sys/alpha/alpha/trap.c (revision 116360) +++ head/sys/alpha/alpha/trap.c (revision 116361) @@ -1,1307 +1,1307 @@ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); /* #include "opt_fix_unaligned_vax_fp.h" */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif #include /* for handle_opdec() */ unsigned long Sfloat_to_reg(unsigned int); unsigned int reg_to_Sfloat(unsigned long); unsigned long Tfloat_reg_cvt(unsigned long); #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg(unsigned int); unsigned int reg_to_Ffloat(unsigned long); unsigned long Gfloat_reg_cvt(unsigned long); #endif int unaligned_fixup(unsigned long, unsigned long, unsigned long, struct thread *); int handle_opdec(struct thread *td, u_int64_t *ucodep); static void printtrap(const unsigned long, const unsigned long, const unsigned long, const unsigned long, struct trapframe *, int, int); #ifdef WITNESS extern char *syscallnames[]; #endif static const char *arith_exceptions[] = { "software completion", "invalid operation", "division by zero", "overflow", "underflow", "inexact result", "integer overflow", }; static const char *instruction_faults[] = { "bpt", "bugchk", "gentrap", "FEN", "opDec" }; static const char *interrupt_types[] = { "interprocessor", "clock", "correctable error", "machine check", "I/O device", "performance counter" }; static const char *mmfault_types[] = { "translation not valid", "access violation", "fault on read", "fault on execute", "fault on write" }; static const char *mmfault_causes[] = { "instruction fetch", "load instructon", "store instruction" }; static void printtrap(a0, a1, a2, entry, framep, isfatal, user) const unsigned long a0, a1, a2, entry; struct trapframe *framep; int isfatal, user; { char ubuf[64]; const char *entryname; unsigned long i; switch (entry) { case ALPHA_KENTRY_INT: entryname = "interrupt"; break; case ALPHA_KENTRY_ARITH: entryname = "arithmetic trap"; break; case ALPHA_KENTRY_MM: entryname = "memory management fault"; break; case ALPHA_KENTRY_IF: entryname = "instruction fault"; break; case ALPHA_KENTRY_UNA: entryname = "unaligned access 
fault"; break; case ALPHA_KENTRY_SYS: entryname = "system call"; break; default: snprintf(ubuf, sizeof(ubuf), "type %lx", entry); entryname = (const char *) ubuf; break; } printf("\n"); printf("%s %s trap:\n", isfatal? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" trap entry = 0x%lx (%s)\n", entry, entryname); #ifdef SMP printf(" cpuid = %d\n", PCPU_GET(cpuid)); #endif switch (entry) { case ALPHA_KENTRY_INT: printf(" interrupt type = "); if (a0 < 5) { printf("%s\n", interrupt_types[a0]); if (a0 > 1) { printf(" vector = 0x%lx\n", a1); if (a0 < 3) printf(" logout area = 0x%lx\n", a2); } } else printf("0x%lx (unknown)\n", a0); break; case ALPHA_KENTRY_ARITH: printf(" exception type = "); for (i = 0; i < 7; i++) if (a0 & (1 << i)) { printf("%s", arith_exceptions[i]); if (a0 & (~0 - (1 << i))) printf(", "); } printf("\n"); printf(" register mask = 0x%lx", a1); break; case ALPHA_KENTRY_MM: printf(" faulting va = 0x%lx\n", a0); printf(" type = "); if (a1 < 5) printf("%s\n", mmfault_types[a1]); else printf("0x%lx (unknown)\n", a1); printf(" cause = "); i = a2 + 1; if (i < 3) printf("%s\n", mmfault_causes[i]); else printf("0x%lx (unknown)\n", a2); break; case ALPHA_KENTRY_IF: printf(" fault type = "); if (a0 < 5) printf("%s\n", instruction_faults[a0]); else printf("0x%lx (unknown)\n", a0); break; case ALPHA_KENTRY_UNA: printf(" faulting va = 0x%lx\n", a0); printf(" opcode = 0x%lx\n", a1); printf(" register = 0x%lx\n", a2); break; default: printf(" a0 = 0x%lx\n", a0); printf(" a1 = 0x%lx\n", a1); printf(" a2 = 0x%lx\n", a2); break; } printf(" pc = 0x%lx\n", framep->tf_regs[FRAME_PC]); printf(" ra = 0x%lx\n", framep->tf_regs[FRAME_RA]); printf(" sp = 0x%lx\n", framep->tf_regs[FRAME_SP]); if (curthread != NULL && (curthread->td_proc->p_flag & P_KTHREAD) == 0) printf(" usp = 0x%lx\n", alpha_pal_rdusp()); printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_proc->p_comm); printf("\n"); } /* * Trap is called from locore to handle most types of processor traps. * System calls are broken out for efficiency and ASTs are broken out * to make the code a bit cleaner and more representative of the * Alpha architecture. */ /*ARGSUSED*/ void trap(a0, a1, a2, entry, framep) const unsigned long a0, a1, a2, entry; struct trapframe *framep; { register struct thread *td; register struct proc *p; register int i; u_int64_t ucode; u_int sticks; int user; #ifdef SMP register_t s; #endif /* * Find our per-cpu globals. */ #ifdef SMP s = intr_disable(); #endif pcpup = (struct pcpu *) alpha_pal_rdval(); td = curthread; #ifdef SMP if (td == NULL) { printtrap(a0, a1, a2, entry, framep, 1, 0); cpu_halt(); } td->td_md.md_kernnest++; intr_restore(s); #endif p = td->td_proc; /* GIANT_REQUIRED; * Giant hasn't been acquired yet. */ cnt.v_trap++; ucode = 0; user = (framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) != 0; CTR5(KTR_TRAP, "%s trap: pid %d, (%lx, %lx, %lx)", user ? "user" : "kernel", p->p_pid, a0, a1, a2); if (user) { sticks = td->td_sticks; td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); } #ifdef DIAGNOSTIC if (user) alpha_fpstate_check(td); #endif switch (entry) { case ALPHA_KENTRY_UNA: /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). 
*/ if (user) { mtx_lock(&Giant); i = unaligned_fixup(a0, a1, a2, td); mtx_unlock(&Giant); if (i == 0) goto out; ucode = a0; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; case ALPHA_KENTRY_ARITH: /* * If user-land, give a SIGFPE if software completion * is not requested or if the completion fails. */ if (user) { mtx_lock(&Giant); if (a0 & EXCSUM_SWC) if (fp_software_completion(a1, td)) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); i = SIGFPE; ucode = a0; /* exception summary */ break; } /* Always fatal in kernel. Should never happen. */ goto dopanic; case ALPHA_KENTRY_IF: /* * These are always fatal in kernel, and should never happen. */ if (!user) { #ifdef DDB /* * ...unless, of course, DDB is configured; BUGCHK * is used to invoke the kernel debugger, and we * might have set a breakpoint. */ if (a0 == ALPHA_IF_CODE_BUGCHK || a0 == ALPHA_IF_CODE_BPT) { if (kdb_trap(a0, a1, a2, entry, framep)) goto out; } /* * If we get here, DDB did _not_ handle the * trap, and we need to PANIC! */ #endif goto dopanic; } i = 0; switch (a0) { case ALPHA_IF_CODE_GENTRAP: if (framep->tf_regs[FRAME_A0] == -2) { /* weird! */ i = SIGFPE; ucode = a0; /* exception summary */ break; } /* FALLTHROUGH */ case ALPHA_IF_CODE_BPT: case ALPHA_IF_CODE_BUGCHK: if (td->td_md.md_flags & (MDTD_STEP1|MDTD_STEP2)) { mtx_lock(&Giant); ptrace_clear_single_step(td); td->td_frame->tf_regs[FRAME_PC] -= 4; mtx_unlock(&Giant); } ucode = a0; /* trap type */ i = SIGTRAP; break; case ALPHA_IF_CODE_OPDEC: i = handle_opdec(td, &ucode); if (i == 0) goto out; break; case ALPHA_IF_CODE_FEN: /* * on exit from the kernel, if thread == fpcurthread, * FP is enabled. */ if (PCPU_GET(fpcurthread) == td) { printf("trap: fp disabled for fpcurthread == %p", td); goto dopanic; } alpha_fpstate_switch(td); goto out; default: printf("trap: unknown IF type 0x%lx\n", a0); goto dopanic; } break; case ALPHA_KENTRY_MM: switch (a1) { case ALPHA_MMCSR_FOR: case ALPHA_MMCSR_FOE: case ALPHA_MMCSR_FOW: pmap_emulate_reference(p->p_vmspace, a0, user, a1 == ALPHA_MMCSR_FOW); goto out; case ALPHA_MMCSR_INVALTRANS: case ALPHA_MMCSR_ACCESS: { register vm_offset_t va; register struct vmspace *vm = NULL; register vm_map_t map; vm_prot_t ftype = 0; int rv; /* * If it was caused by fuswintr or suswintr, * just punt. Note that we check the faulting * address against the address accessed by * [fs]uswintr, in case another fault happens * when they are running. */ if (!user && td != NULL && td->td_pcb->pcb_onfault == (unsigned long)fswintrberr && td->td_pcb->pcb_accessaddr == a0) { framep->tf_regs[FRAME_PC] = td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = 0; goto out; } /* * It is only a kernel address space fault iff: * 1. !user and * 2. pcb_onfault not set or * 3. pcb_onfault set but kernel space data fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. * * For the purposes of the Linux emulator, we allow * kernel accesses to a small region of the * user stack which the emulator uses to * translate syscall arguments. 
*/ if (!user && ((a0 >= VM_MIN_KERNEL_ADDRESS) || (td == NULL) || (td->td_pcb->pcb_onfault == 0))) { if (a0 >= trunc_page(PS_STRINGS - szsigcode - SPARE_USRSPACE) && a0 < round_page(PS_STRINGS - szsigcode)) { vm = p->p_vmspace; map = &vm->vm_map; } else { map = kernel_map; } } else { vm = p->p_vmspace; map = &vm->vm_map; } switch (a2) { case -1: /* instruction fetch fault */ case 0: /* load instruction */ ftype = VM_PROT_READ; break; case 1: /* store instruction */ ftype = VM_PROT_WRITE; break; #ifdef DIAGNOSTIC default: /* XXX gcc -Wuninitialized */ goto dopanic; #endif } va = trunc_page((vm_offset_t)a0); if (map != kernel_map) { /* * Keep swapout from messing with us * during thiscritical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process * locking or stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) goto out; if (!user) { /* Check for copyin/copyout fault */ if (td != NULL && td->td_pcb->pcb_onfault != 0) { framep->tf_regs[FRAME_PC] = td->td_pcb->pcb_onfault; td->td_pcb->pcb_onfault = 0; goto out; } goto dopanic; } ucode = a0; i = SIGSEGV; #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif break; } default: printf("trap: unknown MMCSR value 0x%lx\n", a1); goto dopanic; } break; default: goto dopanic; } #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif framep->tf_regs[FRAME_TRAPARG_A0] = a0; framep->tf_regs[FRAME_TRAPARG_A1] = a1; framep->tf_regs[FRAME_TRAPARG_A2] = a2; trapsignal(td, i, ucode); out: if (user) { framep->tf_regs[FRAME_SP] = alpha_pal_rdusp(); userret(td, framep, sticks); mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC cred_free_thread(td); #endif } return; dopanic: printtrap(a0, a1, a2, entry, framep, 1, user); /* XXX dump registers */ #ifdef DDB kdb_trap(a0, a1, a2, entry, framep); #endif panic("trap"); } /* * Process a system call. * * System calls are strange beasts. They are passed the syscall number * in v0, and the arguments in the registers (as normal). They return * an error flag in a3 (if a3 != 0 on return, the syscall had an error), * and the return value (if any) in v0. * * The assembly stub takes care of moving the call number into a register * we can get to, and moves all of the argument registers into their places * in the trap frame. On return, it restores the callee-saved registers, * a3, and v0 from the frame before returning to the user process. */ void syscall(code, framep) u_int64_t code; struct trapframe *framep; { struct sysent *callp; struct thread *td; struct proc *p; int error = 0; u_int64_t opc; u_int sticks; u_int64_t args[10]; /* XXX */ u_int hidden = 0, nargs; #ifdef SMP register_t s; #endif /* * Find our per-cpu globals. */ #ifdef SMP s = intr_disable(); #endif pcpup = (struct pcpu *) alpha_pal_rdval(); td = curthread; #ifdef SMP td->td_md.md_kernnest++; intr_restore(s); #endif p = td->td_proc; framep->tf_regs[FRAME_TRAPARG_A0] = 0; framep->tf_regs[FRAME_TRAPARG_A1] = 0; framep->tf_regs[FRAME_TRAPARG_A2] = 0; #if notdef /* can't happen, ever. 
*/ if ((framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) == 0) panic("syscall"); #endif cnt.v_syscall++; td->td_frame = framep; opc = framep->tf_regs[FRAME_PC] - 4; sticks = td->td_sticks; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); #ifdef DIAGNOSTIC alpha_fpstate_check(td); #endif if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the alpha, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. */ code = framep->tf_regs[FRAME_A0]; hidden = 1; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; nargs = (callp->sy_narg & SYF_ARGMASK) + hidden; switch (nargs) { default: if (nargs > 10) /* XXX */ panic("syscall: too many args (%d)", nargs); error = copyin((caddr_t)(alpha_pal_rdusp()), &args[6], (nargs - 6) * sizeof(u_int64_t)); case 6: args[5] = framep->tf_regs[FRAME_A5]; case 5: args[4] = framep->tf_regs[FRAME_A4]; case 4: args[3] = framep->tf_regs[FRAME_A3]; case 3: args[2] = framep->tf_regs[FRAME_A2]; case 2: args[1] = framep->tf_regs[FRAME_A1]; case 1: args[0] = framep->tf_regs[FRAME_A0]; case 0: break; } #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, (callp->sy_narg & SYF_ARGMASK), args + hidden); #endif /* * Try to run the syscall without the MP lock if the syscall * is MP safe */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(td, args + hidden); } switch (error) { case 0: framep->tf_regs[FRAME_V0] = td->td_retval[0]; framep->tf_regs[FRAME_A4] = td->td_retval[1]; framep->tf_regs[FRAME_A3] = 0; break; case ERESTART: framep->tf_regs[FRAME_PC] = opc; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_regs[FRAME_V0] = error; framep->tf_regs[FRAME_A3] = 1; break; } /* * Release Giant if we had to get it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); userret(td, framep, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } /* * Unaligned access handler. It's not clear that this can get much slower... * */ const static int reg_to_framereg[32] = { FRAME_V0, FRAME_T0, FRAME_T1, FRAME_T2, FRAME_T3, FRAME_T4, FRAME_T5, FRAME_T6, FRAME_T7, FRAME_S0, FRAME_S1, FRAME_S2, FRAME_S3, FRAME_S4, FRAME_S5, FRAME_S6, FRAME_A0, FRAME_A1, FRAME_A2, FRAME_A3, FRAME_A4, FRAME_A5, FRAME_T8, FRAME_T9, FRAME_T10, FRAME_T11, FRAME_RA, FRAME_T12, FRAME_AT, FRAME_GP, FRAME_SP, -1, }; #define irp(td, reg) \ ((reg_to_framereg[(reg)] == -1) ? 
NULL : \ &(td)->td_frame->tf_regs[reg_to_framereg[(reg)]]) #define frp(td, reg) \ (&(td)->td_pcb->pcb_fp.fpr_regs[(reg)]) #define unaligned_load(storage, ptrf, mod) \ if (copyin((caddr_t)va, &(storage), sizeof (storage)) == 0 && \ (regptr = ptrf(td, reg)) != NULL) \ signal = 0; \ else \ break; \ *regptr = mod (storage); #define unaligned_store(storage, ptrf, mod) \ if ((regptr = ptrf(td, reg)) == NULL) \ (storage) = 0; \ else \ (storage) = mod (*regptr); \ if (copyout(&(storage), (caddr_t)va, sizeof (storage)) == 0) \ signal = 0; \ else \ break; #define unaligned_load_integer(storage) \ unaligned_load(storage, irp, ) #define unaligned_store_integer(storage) \ unaligned_store(storage, irp, ) #define unaligned_load_floating(storage, mod) \ alpha_fpstate_save(td, 1); \ unaligned_load(storage, frp, mod) #define unaligned_store_floating(storage, mod) \ alpha_fpstate_save(td, 0); \ unaligned_store(storage, frp, mod) unsigned long Sfloat_to_reg(s) unsigned int s; { unsigned long sign, expn, frac; unsigned long result; sign = (s & 0x80000000) >> 31; expn = (s & 0x7f800000) >> 23; frac = (s & 0x007fffff) >> 0; /* map exponent part, as appropriate. */ if (expn == 0xff) expn = 0x7ff; else if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frac << 29); return (result); } unsigned int reg_to_Sfloat(r) unsigned long r; { unsigned long sign, expn, frac; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frac = (r & 0x000fffffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 31) | (expn << 23) | (frac << 0); return (result); } /* * Conversion of T floating datums to and from register format * requires no bit reordering whatsoever. */ unsigned long Tfloat_reg_cvt(input) unsigned long input; { return (input); } #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg(f) unsigned int f; { unsigned long sign, expn, frlo, frhi; unsigned long result; sign = (f & 0x00008000) >> 15; expn = (f & 0x00007f80) >> 7; frhi = (f & 0x0000007f) >> 0; frlo = (f & 0xffff0000) >> 16; /* map exponent part, as appropriate. */ if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frhi << 45) | (frlo << 29); return (result); } unsigned int reg_to_Ffloat(r) unsigned long r; { unsigned long sign, expn, frhi, frlo; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frhi = (r & 0x000fe00000000000) >> 45; frlo = (r & 0x00001fffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 15) | (expn << 7) | (frhi << 0) | (frlo << 16); return (result); } /* * Conversion of G floating datums to and from register format is * symmetrical. Just swap shorts in the quad... 
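For normal (non-zero, non-NaN) values, the exponent remapping done by Sfloat_to_reg() above is exactly the IEEE single-to-double rebias: the new 11-bit exponent is the old 8-bit exponent plus 0x380 (1023 - 127), and the 23-bit fraction is shifted up by 29 bits into the 52-bit field. A small host-side check (illustration only, not kernel code, assumes IEEE floats on the host):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	float f = 1.5f;			/* S-float pattern 0x3fc00000 */
	double d = (double)f;		/* register-format pattern */
	uint32_t s;
	uint64_t r;

	memcpy(&s, &f, sizeof(s));
	memcpy(&r, &d, sizeof(r));
	/* Prints 0x3fc00000 -> 0x3ff8000000000000, matching
	 * Sfloat_to_reg(): sign<<63 | (0x7f + 0x380)<<52 | frac<<29. */
	printf("0x%08x -> 0x%016llx\n", s, (unsigned long long)r);
	return (0);
}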
*/ unsigned long Gfloat_reg_cvt(input) unsigned long input; { unsigned long a, b, c, d; unsigned long result; a = (input & 0x000000000000ffff) >> 0; b = (input & 0x00000000ffff0000) >> 16; c = (input & 0x0000ffff00000000) >> 32; d = (input & 0xffff000000000000) >> 48; result = (a << 48) | (b << 32) | (c << 16) | (d << 0); return (result); } #endif /* FIX_UNALIGNED_VAX_FP */ extern int alpha_unaligned_print, alpha_unaligned_fix; extern int alpha_unaligned_sigbus; struct unaligned_fixup_data { const char *type; /* opcode name */ int size; /* size, 0 if fixup not supported */ }; int unaligned_fixup(va, opcode, reg, td) unsigned long va, opcode, reg; struct thread *td; { int doprint, dofix, dosigbus; int signal, size; const char *type; struct proc *p; unsigned long *regptr, longdata, uac; int intdata; /* signed to get extension when storing */ u_int16_t worddata; /* unsigned to _avoid_ extension */ const struct unaligned_fixup_data tab_0c[0x2] = { { "ldwu", 2 }, { "stw", 2 }, }; const struct unaligned_fixup_data tab_20[0x10] = { #ifdef FIX_UNALIGNED_VAX_FP { "ldf", 4 }, { "ldg", 8 }, #else { "ldf", 0 }, { "ldg", 0 }, #endif { "lds", 4 }, { "ldt", 8 }, #ifdef FIX_UNALIGNED_VAX_FP { "stf", 4 }, { "stg", 8 }, #else { "stf", 0 }, { "stg", 0 }, #endif { "sts", 4 }, { "stt", 8 }, { "ldl", 4 }, { "ldq", 8 }, { "ldl_l", 0 }, { "ldq_l", 0 }, /* can't fix */ { "stl", 4 }, { "stq", 8 }, { "stl_c", 0 }, { "stq_c", 0 }, /* can't fix */ }; /* * Figure out what actions to take. * */ if (td) { p = td->td_proc; uac = p->p_md.md_uac; } else { uac = 0; p = NULL; } doprint = alpha_unaligned_print && !(uac & MDP_UAC_NOPRINT); dofix = alpha_unaligned_fix && !(uac & MDP_UAC_NOFIX); dosigbus = alpha_unaligned_sigbus | (uac & MDP_UAC_SIGBUS); /* * Find out which opcode it is. Arrange to have the opcode * printed if it's an unknown opcode. */ if (opcode >= 0x0c && opcode <= 0x0d) { type = tab_0c[opcode - 0x0c].type; size = tab_0c[opcode - 0x0c].size; } else if (opcode >= 0x20 && opcode <= 0x2f) { type = tab_20[opcode - 0x20].type; size = tab_20[opcode - 0x20].size; } else { type = "0x%lx"; size = 0; } /* * See if the user can access the memory in question. * Even if it's an unknown opcode, SEGV if the access * should have failed. */ if (!useracc((caddr_t)va, size ? size : 1, VM_PROT_WRITE)) { signal = SIGSEGV; goto out; } /* * If we're supposed to be noisy, squawk now. */ if (doprint) { uprintf( "pid %d (%s): unaligned access: va=0x%lx pc=0x%lx ra=0x%lx op=", p->p_pid, p->p_comm, va, td->td_frame->tf_regs[FRAME_PC], td->td_frame->tf_regs[FRAME_RA]); uprintf(type,opcode); uprintf("\n"); } /* * If we should try to fix it and know how, give it a shot. * * We never allow bad data to be unknowingly used by the * user process. That is, if we decide not to fix up an * access we cause a SIGBUS rather than letting the user * process go on without warning. * * If we're trying to do a fixup, we assume that things * will be botched. If everything works out OK, * unaligned_{load,store}_* clears the signal flag. 
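The fixup path that follows never dereferences the misaligned address directly: unaligned_load_*/unaligned_store_* go through copyin()/copyout(), so the access is performed bytewise against the user address space and a bad address simply fails instead of faulting the kernel. A stripped-down user-space analogue of the load side (hypothetical helper, little-endian assumption as in the XXX comments below):

#include <stdint.h>
#include <string.h>

/* Emulate an unaligned 32-bit load the way the fixup code does:
 * copy the bytes rather than issuing a 4-byte load at the odd
 * address, then sign-extend to 64 bits as the Alpha ldl would. */
static int64_t
emulated_ldl(const void *va)
{
	int32_t v;

	memcpy(&v, va, sizeof(v));
	return ((int64_t)v);
}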
*/ signal = SIGBUS; if (dofix && size != 0) { switch (opcode) { case 0x0c: /* ldwu */ /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ unaligned_load_integer(worddata); break; case 0x0d: /* stw */ /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ unaligned_store_integer(worddata); break; #ifdef FIX_UNALIGNED_VAX_FP case 0x20: /* ldf */ unaligned_load_floating(intdata, Ffloat_to_reg); break; case 0x21: /* ldg */ unaligned_load_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x22: /* lds */ unaligned_load_floating(intdata, Sfloat_to_reg); break; case 0x23: /* ldt */ unaligned_load_floating(longdata, Tfloat_reg_cvt); break; #ifdef FIX_UNALIGNED_VAX_FP case 0x24: /* stf */ unaligned_store_floating(intdata, reg_to_Ffloat); break; case 0x25: /* stg */ unaligned_store_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x26: /* sts */ unaligned_store_floating(intdata, reg_to_Sfloat); break; case 0x27: /* stt */ unaligned_store_floating(longdata, Tfloat_reg_cvt); break; case 0x28: /* ldl */ unaligned_load_integer(intdata); break; case 0x29: /* ldq */ unaligned_load_integer(longdata); break; case 0x2c: /* stl */ unaligned_store_integer(intdata); break; case 0x2d: /* stq */ unaligned_store_integer(longdata); break; #ifdef DIAGNOSTIC default: panic("unaligned_fixup: can't get here"); #endif } } /* * Force SIGBUS if requested. */ if (dosigbus) signal = SIGBUS; out: return (signal); } /* * Reserved/unimplemented instruction (opDec fault) handler * * Argument is the process that caused it. No useful information * is passed to the trap handler other than the fault type. The * address of the instruction that caused the fault is 4 less than * the PC stored in the trap frame. * * If the instruction is emulated successfully, this function returns 0. * Otherwise, this function returns the signal to deliver to the process, * and fills in *ucodep with the code to be delivered. */ int handle_opdec(td, ucodep) struct thread *td; u_int64_t *ucodep; { alpha_instruction inst; register_t *regptr, memaddr; u_int64_t inst_pc; int sig; /* * Read USP into frame in case it's going to be used or modified. * This keeps us from having to check for it in lots of places * later. */ td->td_frame->tf_regs[FRAME_SP] = alpha_pal_rdusp(); inst_pc = memaddr = td->td_frame->tf_regs[FRAME_PC] - 4; if (copyin((caddr_t)inst_pc, &inst, sizeof (inst)) != 0) { /* * really, this should never happen, but in case it * does we handle it. */ printf("WARNING: handle_opdec() couldn't fetch instruction\n"); goto sigsegv; } switch (inst.generic_format.opcode) { case op_ldbu: case op_ldwu: case op_stw: case op_stb: regptr = irp(td, inst.mem_format.rs); if (regptr != NULL) memaddr = *regptr; else memaddr = 0; memaddr += inst.mem_format.displacement; regptr = irp(td, inst.mem_format.rd); if (inst.mem_format.opcode == op_ldwu || inst.mem_format.opcode == op_stw) { if (memaddr & 0x01) { sig = unaligned_fixup(memaddr, inst.mem_format.opcode, inst.mem_format.rd, td); if (sig) goto unaligned_fixup_sig; break; } } if (inst.mem_format.opcode == op_ldbu) { u_int8_t b; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ if (copyin((caddr_t)memaddr, &b, sizeof (b)) != 0) goto sigsegv; if (regptr != NULL) *regptr = b; } else if (inst.mem_format.opcode == op_ldwu) { u_int16_t w; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ if (copyin((caddr_t)memaddr, &w, sizeof (w)) != 0) goto sigsegv; if (regptr != NULL) *regptr = w; } else if (inst.mem_format.opcode == op_stw) { u_int16_t w; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ w = (regptr != NULL) ? 
*regptr : 0; if (copyout(&w, (caddr_t)memaddr, sizeof (w)) != 0) goto sigsegv; } else if (inst.mem_format.opcode == op_stb) { u_int8_t b; /* XXX ONLY WORKS ON LITTLE-ENDIAN ALPHA */ b = (regptr != NULL) ? *regptr : 0; if (copyout(&b, (caddr_t)memaddr, sizeof (b)) != 0) goto sigsegv; } break; case op_intmisc: if (inst.operate_generic_format.function == op_sextb && inst.operate_generic_format.ra == 31) { int8_t b; if (inst.operate_generic_format.is_lit) { b = inst.operate_lit_format.literal; } else { if (inst.operate_reg_format.sbz != 0) goto sigill; regptr = irp(td, inst.operate_reg_format.rt); b = (regptr != NULL) ? *regptr : 0; } regptr = irp(td, inst.operate_generic_format.rc); if (regptr != NULL) *regptr = b; break; } if (inst.operate_generic_format.function == op_sextw && inst.operate_generic_format.ra == 31) { int16_t w; if (inst.operate_generic_format.is_lit) { w = inst.operate_lit_format.literal; } else { if (inst.operate_reg_format.sbz != 0) goto sigill; regptr = irp(td, inst.operate_reg_format.rt); w = (regptr != NULL) ? *regptr : 0; } regptr = irp(td, inst.operate_generic_format.rc); if (regptr != NULL) *regptr = w; break; } goto sigill; default: goto sigill; } /* * Write back USP. Note that in the error cases below, * nothing will have been successfully modified so we don't * have to write it out. */ alpha_pal_wrusp(td->td_frame->tf_regs[FRAME_SP]); return (0); sigill: *ucodep = ALPHA_IF_CODE_OPDEC; /* trap type */ return (SIGILL); sigsegv: sig = SIGSEGV; td->td_frame->tf_regs[FRAME_PC] = inst_pc; /* re-run instr. */ unaligned_fixup_sig: *ucodep = memaddr; /* faulting address */ return (sig); } Index: head/sys/alpha/linux/linux_sysvec.c =================================================================== --- head/sys/alpha/linux/linux_sysvec.c (revision 116360) +++ head/sys/alpha/linux/linux_sysvec.c (revision 116361) @@ -1,287 +1,287 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #ifndef COMPAT_43 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!" 
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #undef szsigcode MODULE_VERSION(linux, 1); MODULE_DEPEND(linux, osf1, 1, 1, 1); MODULE_DEPEND(linux, sysvmsg, 1, 1, 1); MODULE_DEPEND(linux, sysvsem, 1, 1, 1); MODULE_DEPEND(linux, sysvshm, 1, 1, 1); MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); void osendsig(sig_t catcher, int sig, sigset_t *mask, u_long code); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *iparams); static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { Elf64_Auxargs *args; register_t *pos; KASSERT(curthread->td_proc == imgp->proc && - (curthread->td_proc->p_flag & P_THREADED) == 0, + (curthread->td_proc->p_flag & P_SA) == 0, ("unsafe elf_linux_fixup(), should be curproc")); args = (Elf64_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->argc + imgp->envc + 2); if (args->trace) AUXARGS_ENTRY(pos, AT_DEBUG, 1); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; **stack_base = (register_t)imgp->argc; return 0; } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(imgp) struct image_params *imgp; { const char *head; int error; head = (const char *)imgp->image_header; error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds * attempt to use the alternate path for the interpreter. If * an alternate path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { char *rpath = NULL; linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL, imgp->interpreter_name, &rpath, 0); if (rpath != imgp->interpreter_name) { int len = strlen(rpath) + 1; if (len <= MAXSHELLCMDLEN) { memcpy(imgp->interpreter_name, rpath, len); } free(rpath, M_TEMP); } } } return(error); } /* * To maintain OSF/1 compat, linux uses BSD signals & errnos on their * alpha port. This greatly simplfies things for us. 
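The SHELLMAGIC test used by exec_linux_imgact_try() reads the first two bytes of the image header as a short, so the constant is byte-order dependent: on little-endian Alpha '#' (0x23) is the low byte and '!' (0x21) the high byte, giving 0x2123. A host-side illustration (not kernel code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char head[2] = { '#', '!' };
	uint16_t magic;

	memcpy(&magic, head, sizeof(magic));
	/* 0x2123 on a little-endian host, 0x2321 on big-endian. */
	printf("shell magic = 0x%04x\n", magic);
	return (0);
}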
*/ struct sysentvec elf_linux_sysvec = { LINUX_SYS_MAXSYSCALL, linux_sysent, 0, 0, NULL, 0, NULL, NULL, elf_linux_fixup, osendsig, linux_sigcode, &linux_szsigcode, NULL, "Linux ELF", elf64_coredump, exec_linux_imgact_try, LINUX_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs }; static Elf64_Brandinfo linux_brand = { ELFOSABI_LINUX, EM_ALPHA, "Linux", "/compat/linux", "/lib/ld-linux.so.1", &elf_linux_sysvec }; static Elf64_Brandinfo linux_glibc2brand = { ELFOSABI_LINUX, EM_ALPHA, "Linux", "/compat/linux", "/lib/ld-linux.so.2", &elf_linux_sysvec }; Elf64_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf64_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf64_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); linux_mib_destroy(); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: break; } return error; } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DUMMY(rt_sigreturn); DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 116360) +++ head/sys/amd64/amd64/pmap.c (revision 116361) @@ -1,3005 +1,3005 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD$ */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
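The index helpers defined below (pmap_pml4e_index, pmap_pdpe_index, pmap_pde_index, pmap_pte_index) each extract one 9-bit field from the virtual address, one field per level of the four-level amd64 page-table tree, with the low 12 bits left over as the page offset. A standalone sketch of that decomposition, with the shift constants written out explicitly rather than taken from the pmap headers:

#include <stdint.h>
#include <stdio.h>

/* Split a canonical amd64 virtual address into its four 9-bit
 * table indexes plus the 12-bit page offset (4 KB pages, 512
 * entries per page-table page). */
static void
va_decompose(uint64_t va)
{
	unsigned off  = va & 0xfff;
	unsigned pte  = (va >> 12) & 0x1ff;
	unsigned pde  = (va >> 21) & 0x1ff;
	unsigned pdpe = (va >> 30) & 0x1ff;
	unsigned pml4 = (va >> 39) & 0x1ff;

	printf("pml4=%u pdpe=%u pde=%u pte=%u offset=0x%x\n",
	    pml4, pdpe, pde, pte, off);
}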
*/ #include "opt_msgbuf.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static pt_entry_t protection_codes[8]; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */ static int nkpt; static int ndmpdp; vm_offset_t kernel_vm_end; static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ static u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *ptmmap; caddr_t CADDR1 = 0, ptvmmap = 0; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void amd64_protection_init(void); static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem); static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex); static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 2MB. 
This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return newaddr; } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return va >> PDRSHIFT; } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pde_index(vm_offset_t va) { return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pdpe_index(vm_offset_t va) { return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pml4e_index(vm_offset_t va) { return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { if (!pmap) return NULL; return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pdp_entry_t *pdpe; pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return NULL; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pd_entry_t *pde; pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return NULL; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pt_entry_t *pte; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (PTmap + (amd64_btop(va) & mask)); } static u_int64_t allocpages(int n) { u_int64_t ret; ret = avail_start; bzero((void *)ret, n * PAGE_SIZE); avail_start += n * PAGE_SIZE; return (ret); } static void create_pagetables(void) { int i; /* Allocate pages */ KPTphys = allocpages(NKPT); KPML4phys = allocpages(1); KPDPphys = allocpages(NKPML4E); KPDphys = allocpages(NKPDPE); ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; if (ndmpdp < 1) ndmpdp = 1; DMPDPphys = allocpages(NDMPML4E); DMPDphys = allocpages(ndmpdp); /* Fill in the underlying page table pages */ /* Read-only from zero to physfree */ /* XXX not fully used, underneath 2M pages */ for (i = 0; (i << PAGE_SHIFT) < avail_start; i++) { ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V; } /* Now map the page tables at their location within PTmap */ for (i = 0; i < NKPT; i++) { ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; } #if 0 /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < avail_start; i++) { 
((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS; } #endif /* And connect up the PD to the PDP */ for (i = 0; i < NKPDPE; i++) { ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; } /* Now set up the direct map space using 2MB pages */ for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS; } /* And the direct map space's PDP */ for (i = 0; i < ndmpdp; i++) { ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; /* Connect the KVA slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr) vm_paddr_t *firstaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = *firstaddr; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(); *firstaddr = avail_start; virtual_avail = (vm_offset_t) KERNBASE + avail_start; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE | CR4_PSE); load_cr3(KPML4phys); /* * Initialize protection array. */ amd64_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1 is only used for the memory test. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. 
*/ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) virtual_avail = va; *CMAP1 = 0; invltlb(); } static void * pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (void *)kmem_alloc(kernel_map, bytes); } void * uma_small_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { static vm_pindex_t colour; vm_page_t m; int pflags; void *va; *flags = UMA_SLAB_PRIV; if ((wait & (M_NOWAIT|M_USE_RESERVE)) == M_NOWAIT) pflags = VM_ALLOC_INTERRUPT; else pflags = VM_ALLOC_SYSTEM; if (wait & M_ZERO) pflags |= VM_ALLOC_ZERO; for (;;) { m = vm_page_alloc(NULL, colour++, pflags | VM_ALLOC_NOOBJ); if (m == NULL) { if (wait & M_NOWAIT) return (NULL); else VM_WAIT; } else break; } va = (void *)PHYS_TO_DMAP(m->phys_addr); if ((m->flags & PG_ZERO) == 0) pagezero(va); return (va); } void uma_small_free(void *mem, int size, u_int8_t flags) { vm_page_t m; m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)mem)); vm_page_lock_queues(); vm_page_free(m); vm_page_unlock_queues(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_paddr_t phys_start, phys_end; { int i; int initial_pvs; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ initial_pvs = vm_page_array_size; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); uma_zone_set_allocf(pvzone, pmap_pv_allocf); uma_prealloc(pvzone, initial_pvs); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. */ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } /* * Normal invalidation functions. * We inline these within pmap.c for speed. 
*/ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde, *pdep; if (pmap == 0) return 0; pdep = pmap_pde(pmap, va); if (pdep) { pde = *pdep; if (pde) { if ((pde & PG_PS) != 0) { rtval = (pde & ~PDRMASK) | (va & PDRMASK); return rtval; } pte = pmap_pte(pmap, va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } } return 0; } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = pmap_pde(kernel_pmap, va); if (*pde & PG_PS) { pa = (*pde & ~(NBPDR - 1)) | (va & (NBPDR - 1)); } else { pa = *vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return pa; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. 
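pmap_qenter() above and pmap_qremove() just below form a matched pair for short-lived kernel mappings of wired pages; the caller owns the kernel VA range and the page list. A hedged usage sketch (assumes this file's kernel headers; 'kva' and 'pages' are supplied by the caller, e.g. from a previously reserved KVA range, and error handling is omitted):

/* Sketch only: temporarily map 'npages' wired pages at 'kva',
 * zero them through the mapping, then remove the mappings again. */
static void
zero_pages_tmp(vm_offset_t kva, vm_page_t *pages, int npages)
{
	int i;

	pmap_qenter(kva, pages, npages);	/* install mappings */
	for (i = 0; i < npages; i++)
		bzero((void *)(kva + i * PAGE_SIZE), PAGE_SIZE);
	pmap_qremove(kva, npages);		/* tear them down */
}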
*/ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pplookp")) goto retry; vm_page_unlock_queues(); } return m; } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); *pd = 0; } --pmap->pm_stats.resident_count; if (m->pindex < NUPDE) { /* Unhold the PD page */ vm_page_t pdpg; pdpg = vm_page_lookup(pmap->pm_pteobj, NUPDE + pmap_pdpe_index(va)); while (vm_page_sleep_if_busy(pdpg, FALSE, "pulook")) vm_page_lock_queues(); vm_page_unhold(pdpg); if (pdpg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, pdpg); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* Unhold the PDP page */ vm_page_t pdppg; pdppg = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + pmap_pml4e_index(va)); while (vm_page_sleep_if_busy(pdppg, FALSE, "pulooK")) vm_page_lock_queues(); vm_page_unhold(pdppg); if (pdppg->hold_count == 0) _pmap_unwire_pte_hold(pmap, va, pdppg); } if (pmap_is_current(pmap)) { /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); } /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, va, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
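 *
 * To restate the protocol: every user mapping installed in a page table page
 * takes a hold on that page (see pmap_allocpte()), and removing a mapping
 * drops that hold here.  Only when the last hold is gone does
 * _pmap_unwire_pte_hold() unmap the page table page, drop its wire count and
 * free it back to the system.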
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { vm_pindex_t ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = pmap_pde_pindex(va); if (pmap->pm_pteobj->root && pmap->pm_pteobj->root->pindex == ptepindex) { mpte = pmap->pm_pteobj->root; } else { while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL && vm_page_sleep_if_busy(mpte, FALSE, "pulook")) vm_page_lock_queues(); } } return pmap_unwire_pte_hold(pmap, va, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t pml4pg; /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + NUPML4E + 1); /* * allocate the page directory page */ pml4pg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + NUPML4E, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); vm_page_lock_queues(); vm_page_flag_clear(pml4pg, PG_BUSY); pml4pg->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); if ((pml4pg->flags & PG_ZERO) == 0) bzero(pmap->pm_pml4, PAGE_SIZE); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Wire in kernel global address entries. To avoid a race condition * between pmap initialization and pmap_growkernel, this procedure * should be called after the vmspace is attached to the process * but before this pmap is activated. */ void pmap_pinit2(pmap) struct pmap *pmap; { /* XXX: Remove this stub when no longer called */ } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; vm_pindex_t ptepindex; { vm_page_t m, pdppg, pdpg; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. 
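 *
 * The page table page index encodes the level: indices at or above
 * NUPDE + NUPDPE are PDP pages hung off the PML4, indices at or above NUPDE
 * are PD pages hung off a PDP, and lower indices are PT pages hung off a PD.
 * A missing intermediate level is created by recursing on the corresponding
 * higher-level index.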
*/ pmap->pm_stats.resident_count++; if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ _pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index); } else { /* Add reference to pdp page */ pdppg = pmap_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + pml4index); pdppg->hold_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. */ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ _pmap_allocpte(pmap, NUPDE + pdpindex); pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ _pmap_allocpte(pmap, NUPDE + pdpindex); } else { /* Add reference to the pd page */ pdpg = pmap_page_lookup(pmap->pm_pteobj, NUPDE + pdpindex); pdpg->hold_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. */ if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); vm_page_lock_queues(); m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO); vm_page_wakeup(m); vm_page_unlock_queues(); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { vm_pindex_t ptepindex; pd_entry_t *pd; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { *pd = 0; pd = 0; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != 0 && (*pd & PG_V) != 0) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { m = pmap->pm_pteobj->root; } else { m = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ m = _pmap_allocpte(pmap, ptepindex); return m; } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. 
* Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { vm_object_t object; vm_page_t m; object = pmap->pm_pteobj; KASSERT(object->ref_count == 1, ("pmap_release: pteobj reference count %d != 1", object->ref_count)); KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); vm_page_lock_queues(); while ((m = TAILQ_FIRST(&object->memq)) != NULL) { m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_busy(m); vm_page_free(m); } KASSERT(TAILQ_EMPTY(&object->memq), ("pmap_release: leaking page table pages")); vm_page_unlock_queues(); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { int s; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; s = splhigh(); mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); while (kernel_vm_end < addr) { if ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. 
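 *
 * In practice the code below searches whichever list is expected to be
 * shorter -- the page's pv list (matching on pmap and va) or the pmap's pv
 * list (matching on va) -- unlinks the entry from both lists, and frees it.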
*/ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; int s; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; pte = pmap_pte(pmap, va); if (pte == NULL || (*pte & PG_V) == 0) return; pmap_remove_pte(pmap, pte, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva); return; } } anyvalid = 0; for (; sva < eva; sva = pdnxt) { if (pmap->pm_stats.resident_count == 0) break; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; pde = pmap_pde(pmap, sva); if (pde == 0) continue; ptpaddr = *pde; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. 
*/ if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pte = pmap_pte(pmap, sva); if (pte == NULL || *pte == 0) continue; anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva)) break; } } if (anyvalid) pmap_invalidate_all(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte(pv->pv_pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr, *pde; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; for (; sva < eva; sva = pdnxt) { pdnxt = (sva + NBPDR) & ~PDRMASK; pde = pmap_pde(pmap, sva); if (pde == NULL) continue; ptpaddr = *pde; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pt_entry_t pbits; pt_entry_t *pte; vm_page_t m; pte = pmap_pte(pmap, sva); if (pte == NULL) continue; pbits = *pte; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0 && pmap_track_modified(sva)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits); vm_page_dirty(m); pbits &= ~PG_M; } } pbits &= ~PG_RW; if (pbits != *pte) { pte_store(pte, pbits); anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. 
That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; register pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", origpte, va); } } #endif pte = pmap_pte(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) panic("pmap_enter: invalid page directory va=%#lx\n", va); pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 2MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { pte_store(pte, origpte | PG_RW); pmap_invalidate_page(pmap, va); } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); err = pmap_remove_pte(pmap, pte, va); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%lx", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { pte_store(pte, newpte | PG_A); /*if (origpte)*/ { pmap_invalidate_page(pmap, va); } } } /* * this code makes some *MAJOR* assumptions: * 1. 
Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 2MB page"); if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { mpte = pmap->pm_pteobj->root; } else { mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, va, mpte); vm_page_unlock_queues(); } return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int limit) { vm_pindex_t tmpidx; int psize; vm_page_t p, mpte; if (pmap == NULL || object == NULL) return; /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. 
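 *
 * Specifically, for device objects whose mapping is 2MB-aligned in both
 * address and size, the block below installs PG_PS superpage PDEs that point
 * directly at the object's physical pages instead of building 4K page tables.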
*/ if ((object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; int npdes; pd_entry_t ptepa, *pde; pde = pmap_pde(pmap, addr); if (pde != 0 && (*pde & PG_V) != 0) return; retry: p = vm_page_lookup(object, pindex); if (p != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(pde, ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; pde++; } pmap_invalidate_all(kernel_pmap); return; } psize = amd64_btop(size); if ((object->type != OBJT_VNODE) || ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { return; } if (psize + pindex > object->size) { if (object->size < pindex) return; psize = object->size - pindex; } mpte = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((limit & MAP_PREFAULT_MADVISE) && cnt.v_free_count < cnt.v_free_reserved) { break; } vm_page_lock_queues(); if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); vm_page_unlock_queues(); mpte = pmap_enter_quick(pmap, addr + amd64_ptob(tmpidx), p, mpte); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); } } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. 
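 *
 * Concretely: for a fault at address A it probes A-1, A+1, A-2, A+2, ... out
 * to PFBAK pages behind and PFFOR pages ahead of the fault (see
 * pmap_prefault_pageorder[] below), and enters any resident, idle pages it
 * finds with pmap_enter_quick().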
*/ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; pd_entry_t *pde; if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; pde = pmap_pde(pmap, addr); if (pde == NULL || (*pde & PG_V) == 0) continue; pte = vtopte(addr); if ((*pte & PG_V) == 0) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; vm_page_lock_queues(); if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_busy(m); vm_page_unlock_queues(); mpte = pmap_enter_quick(pmap, addr, m, mpte); vm_page_lock_queues(); vm_page_wakeup(m); } vm_page_unlock_queues(); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pte = pmap_pte(pmap, va); if (wired && (*pte & PG_W) == 0) { pmap->pm_stats.wired_count++; *pte |= PG_W; } else if (!wired && (*pte & PG_W) != 0) { pmap->pm_stats.wired_count--; *pte &= ~PG_W; } } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; vm_page_t m; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr, *pde; vm_pindex_t ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. 
*/ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = pmap_pde_pindex(addr); pde = pmap_pde(src_pmap, addr); if (pde) srcptepaddr = *pde; else continue; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { pde = pmap_pde(dst_pmap, addr); if (pde == 0) { /* * XXX should do an allocpte here to * instantiate the pde */ continue; } if (*pde == 0) { *pde = srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); dst_pte = pmap_pte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); } else { vm_page_lock_queues(); pmap_unwire_pte_hold(dst_pmap, addr, dstmpte); vm_page_unlock_queues(); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; } } } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); bcopy((void *)src, (void *)dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
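 *
 * Illustrative sketch: a hypothetical caller asking "does this pmap still map
 * this page?".  The page queues lock is held because it protects the pv list
 * that is searched.
 */

static int
example_page_mapped_by(pmap_t pm, vm_page_t m)
{
	int mapped;

	vm_page_lock_queues();
	mapped = pmap_page_exists_quick(pm, m);
	vm_page_unlock_queues();
	return (mapped);
}

/*
 * pmap_page_exists_quick() itself: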
*/ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; if (tpte == 0) { printf("TPTE at %p IS ZERO @ VA %08lx\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } m = PHYS_TO_VM_PAGE(tpte); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pv->pv_pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_FIRST(&m->md.pv_list) == NULL) { vm_page_flag_clear(m, PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); pmap_invalidate_all(pmap); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. 
*/ if (!pmap_track_modified(pv->pv_va)) continue; #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (*pte & PG_M) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem) { register pv_entry_t pv; register pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) return; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (setem) { *pte |= bit; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } else { pt_entry_t pbits = *pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } pte_store(pte, pbits & ~(PG_M|PG_RW)); } else { pte_store(pte, pbits & ~bit); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } } } if (!setem && bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_RW, FALSE); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; pt_entry_t *pte; pt_entry_t v; int s; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { pte_store(pte, v & ~PG_A); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { break; } } } while ((pv = pvn) != NULL && pv != pvf); } splx(s); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_changebit(m, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. 
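 *
 * Illustrative sketch: the reference- and modify-bit helpers here are
 * typically used together for page aging, roughly as below.  The helper is
 * hypothetical and only loosely modeled on how a page daemon might use them.
 */

static void
example_age_page(vm_page_t m)
{
	vm_page_lock_queues();
	/* Harvest and clear the PTE reference bits for this page... */
	if (pmap_ts_referenced(m) == 0) {
		/* ...for an unreferenced page, preserve its modified state... */
		if (pmap_is_modified(m))
			vm_page_dirty(m);
		/* ...and clear the modify bits so future writes are noticed. */
		pmap_clear_modify(m);
	}
	vm_page_unlock_queues();
}

/*
 * pmap_clear_reference() follows: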
*/ void pmap_clear_reference(vm_page_t m) { pmap_changebit(m, PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void amd64_protection_init() { register long *kp, prot; #if 0 #define PG_NX (1ul << 63) #else #define PG_NX 0 #endif kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: *kp++ = PG_NX; break; case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: *kp++ = PG_RW | PG_NX; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(pa, size) vm_paddr_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0; ) { pmap_kenter(tmpva, pa); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pte = vtopte(tmpva); pte_clear(pte); } pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte(pmap, addr); if (ptep == 0) { return 0; } if ((pte = *ptep) != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; pmap_t pmap; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); pmap->pm_active |= PCPU_GET(cpumask); cr3 = vtophys(pmap->pm_pml4); /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu */ - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { /* Make sure all other cr3 entries are updated. */ /* what if they are running? 
XXXKSE (maybe abort them) */ FOREACH_THREAD_IN_PROC(p, td) { td->td_pcb->pcb_cr3 = cr3; } } else { td->td_pcb->pcb_cr3 = cr3; } load_cr3(cr3); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 116360) +++ head/sys/amd64/amd64/trap.c (revision 116361) @@ -1,817 +1,817 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * AMD64 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void trap(struct trapframe frame); extern void syscall(struct trapframe frame); static int trap_pfault(struct trapframe *, int); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif #ifdef DEVICE_POLLING extern u_int32_t poll_in_trap; extern int ether_poll(int count); #endif /* DEVICE_POLLING */ /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct thread *td = curthread; struct proc *p = td->td_proc; u_int sticks = 0; int i = 0, ucode = 0, type, code; atomic_add_int(&cnt.v_trap, 1); type = frame.tf_trapno; #ifdef DDB if (db_active) { vm_offset_t eva; eva = (type == T_PAGEFLT ? frame.tf_addr : 0); trap_fatal(&frame, eva); goto out; } #endif if ((frame.tf_rflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. */ if (ISPL(frame.tf_cs) == SEL_UPL) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We shouldn't enable interrupts while in a critical * section. 
*/ if (td->td_critnest == 0) enable_intr(); } } code = frame.tf_err; if (type == T_PAGEFLT) { /* * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. */ if (td->td_critnest != 0) trap_fatal(&frame, frame.tf_addr); } #ifdef DEVICE_POLLING if (poll_in_trap) ether_poll(poll_in_trap); #endif /* DEVICE_POLLING */ if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_rflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = npxtrap(); if (ucode == -1) goto userout; i = SIGFPE; break; case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) goto userout; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = 0; /* XXX */ i = SIGFPE; break; } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); goto out; case T_DNA: /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) { printf("npxdna in kernel mode!\n"); goto out; } break; case T_STKFLT: /* stack fault */ break; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
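 *
 * Two recovery paths follow: a fault taken at doreti_iret is redirected to
 * doreti_iret_fault, and a fault in a routine that registered a pcb_onfault
 * handler (copyin/copyout-style code) is redirected to that handler.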
*/ if (frame.tf_rip == (long)doreti_iret) { frame.tf_rip = (long)doreti_iret_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_rflags & PSL_NT) { frame.tf_rflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* DEV_ISA */ } trap_fatal(&frame, 0); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(td, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", frame.tf_addr); uprintf("\n"); } #endif user: userret(td, &frame, sticks); mtx_assert(&Giant, MA_NOTOWNED); userout: #ifdef DIAGNOSTIC cred_free_thread(td); #endif out: return; } static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; vm_offset_t eva = frame->tf_addr; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } return((rv == KERN_PROTECTION_FAILURE) ? 
SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss; long esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%lx\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%lx:0x%lx\n", frame->tf_cs & 0xffff, frame->tf_rip); if (ISPL(frame->tf_cs) == SEL_UPL) { ss = frame->tf_ss & 0xffff; esp = frame->tf_rsp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (long)&frame->tf_rsp; } printf("stack pointer = 0x%x:0x%lx\n", ss, esp); printf("frame pointer = 0x%x:0x%lx\n", ss, frame->tf_rbp); printf("code segment = base 0x%lx, limit 0x%lx, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, long %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_rflags & PSL_T) printf("trace trap, "); if (frame->tf_rflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_rflags & PSL_NT) printf("nested task, "); if (frame->tf_rflags & PSL_RF) printf("resume, "); printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). */ void dblfault_handler() { printf("\nFatal double fault\n"); panic("double fault"); } /* * syscall - system call request C handler * * A system call is essentially treated as a trap. */ void syscall(frame) struct trapframe frame; { caddr_t params; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_rflags; u_int sticks; int error; int narg; register_t args[8]; register_t *argp; u_int code; int reg, regcnt; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ PCPU_LAZY_INC(cnt.v_syscall); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); /* try to stabilize the system XXX */ panic("syscall"); /* NOT REACHED */ mtx_unlock(&Giant); } #endif reg = 0; regcnt = 6; sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); params = (caddr_t)frame.tf_rsp + sizeof(register_t); code = frame.tf_rax; orig_tf_rflags = frame.tf_rflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. 
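syscall() above takes the system call number from %rax and up to six arguments from registers before falling back to the user stack, and the SYS_syscall/SYS___syscall indirection handled just after this takes the real number from the first argument instead. A minimal userland illustration of that indirection (assumption: a libc providing syscall(2) and <sys/syscall.h>; not part of this change):

#include <sys/syscall.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Both calls reach the same kernel entry; the first one goes
	 * through the SYS_syscall indirection. */
	printf("indirect getpid: %ld\n", (long)syscall(SYS_getpid));
	printf("direct   getpid: %ld\n", (long)getpid());
	return (0);
}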
*/ (*p->p_sysent->sv_prepsyscall)(&frame, (int *)args, &code, &params); } else { if (code == SYS_syscall || code == SYS___syscall) { code = frame.tf_rdi; reg++; regcnt--; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (narg <= regcnt) { argp = &frame.tf_rdi; argp += reg; error = 0; } else { KASSERT(narg <= sizeof(args) / sizeof(args[0]), ("Too many syscall arguments!")); KASSERT(params != NULL, ("copyin args with no params!")); argp = &frame.tf_rdi; argp += reg; bcopy(argp, args, sizeof(args[0]) * regcnt); error = copyin(params, &args[regcnt], (narg - regcnt) * sizeof(args[0])); argp = &args[0]; } #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, argp); #endif /* * Try to run the syscall without Giant if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame.tf_rdx; STOPEVENT(p, S_SCE, narg); error = (*callp->sy_call)(td, argp); } switch (error) { case 0: frame.tf_rax = td->td_retval[0]; frame.tf_rdx = td->td_retval[1]; frame.tf_rflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, we know that 'syscall' is 2 bytes. * We have to do a full context restore so that %r10 * (which was holding the value of %rcx) is restored for * the next iteration. */ frame.tf_rip -= frame.tf_err; frame.tf_r10 = frame.tf_rcx; td->td_pcb->pcb_flags |= PCB_FULLCTX; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_rax = error; frame.tf_rflags |= PSL_C; break; } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Traced syscall. */ if (orig_tf_rflags & PSL_T) { frame.tf_rflags &= ~PSL_T; trapsignal(td, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(td, &frame, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/compat/svr4/svr4_sysvec.c =================================================================== --- head/sys/compat/svr4/svr4_sysvec.c (revision 116360) +++ head/sys/compat/svr4/svr4_sysvec.c (revision 116361) @@ -1,421 +1,421 @@ /* * Copyright (c) 1998 Mark Newton * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #ifndef COMPAT_43 #error "Unable to compile SVR4-emulator due to missing COMPAT_43 option!" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int bsd_to_svr4_errno[ELAST+1] = { 0, SVR4_EPERM, SVR4_ENOENT, SVR4_ESRCH, SVR4_EINTR, SVR4_EIO, SVR4_ENXIO, SVR4_E2BIG, SVR4_ENOEXEC, SVR4_EBADF, SVR4_ECHILD, SVR4_EDEADLK, SVR4_ENOMEM, SVR4_EACCES, SVR4_EFAULT, SVR4_ENOTBLK, SVR4_EBUSY, SVR4_EEXIST, SVR4_EXDEV, SVR4_ENODEV, SVR4_ENOTDIR, SVR4_EISDIR, SVR4_EINVAL, SVR4_ENFILE, SVR4_EMFILE, SVR4_ENOTTY, SVR4_ETXTBSY, SVR4_EFBIG, SVR4_ENOSPC, SVR4_ESPIPE, SVR4_EROFS, SVR4_EMLINK, SVR4_EPIPE, SVR4_EDOM, SVR4_ERANGE, SVR4_EAGAIN, SVR4_EINPROGRESS, SVR4_EALREADY, SVR4_ENOTSOCK, SVR4_EDESTADDRREQ, SVR4_EMSGSIZE, SVR4_EPROTOTYPE, SVR4_ENOPROTOOPT, SVR4_EPROTONOSUPPORT, SVR4_ESOCKTNOSUPPORT, SVR4_EOPNOTSUPP, SVR4_EPFNOSUPPORT, SVR4_EAFNOSUPPORT, SVR4_EADDRINUSE, SVR4_EADDRNOTAVAIL, SVR4_ENETDOWN, SVR4_ENETUNREACH, SVR4_ENETRESET, SVR4_ECONNABORTED, SVR4_ECONNRESET, SVR4_ENOBUFS, SVR4_EISCONN, SVR4_ENOTCONN, SVR4_ESHUTDOWN, SVR4_ETOOMANYREFS, SVR4_ETIMEDOUT, SVR4_ECONNREFUSED, SVR4_ELOOP, SVR4_ENAMETOOLONG, SVR4_EHOSTDOWN, SVR4_EHOSTUNREACH, SVR4_ENOTEMPTY, SVR4_EPROCLIM, SVR4_EUSERS, SVR4_EDQUOT, SVR4_ESTALE, SVR4_EREMOTE, SVR4_EBADRPC, SVR4_ERPCMISMATCH, SVR4_EPROGUNAVAIL, SVR4_EPROGMISMATCH, SVR4_EPROCUNAVAIL, SVR4_ENOLCK, SVR4_ENOSYS, SVR4_EFTYPE, SVR4_EAUTH, SVR4_ENEEDAUTH, SVR4_EIDRM, SVR4_ENOMSG, }; static int svr4_fixup(register_t **stack_base, struct image_params *imgp); extern struct sysent svr4_sysent[]; #undef szsigcode #undef sigcode extern int svr4_szsigcode; extern char svr4_sigcode[]; struct sysentvec svr4_sysvec = { SVR4_SYS_MAXSYSCALL, svr4_sysent, 0xff, SVR4_SIGTBLSZ, bsd_to_svr4_sig, ELAST, /* ELAST */ bsd_to_svr4_errno, NULL, svr4_fixup, svr4_sendsig, svr4_sigcode, &svr4_szsigcode, NULL, "SVR4", elf32_coredump, NULL, SVR4_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_setregs }; Elf32_Brandinfo svr4_brand = { ELFOSABI_SYSV, EM_386, /* XXX only implemented for x86 so far. 
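bsd_to_svr4_errno[] above is installed as sv_errtbl in svr4_sysvec, the table the native syscall return path consults when an emulated process takes an error. A hypothetical helper sketching that lookup (assumes the table and ELAST are in scope as in this file; illustration only, not part of this change):

static int
svr4_translate_errno(int bsd_error)
{
	/* Out-of-range errors are passed through unchanged. */
	if (bsd_error < 0 || bsd_error > ELAST)
		return (bsd_error);
	return (bsd_to_svr4_errno[bsd_error]);
}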
*/ "SVR4", svr4_emul_path, "/lib/libc.so.1", &svr4_sysvec }; const char svr4_emul_path[] = "/compat/svr4"; static int svr4_fixup(register_t **stack_base, struct image_params *imgp) { Elf32_Auxargs *args; register_t *pos; KASSERT(curthread->td_proc == imgp->proc && - (curthread->td_proc->p_flag & P_THREADED) == 0, + (curthread->td_proc->p_flag & P_SA) == 0, ("unsafe svr4_fixup(), should be curproc")); args = (Elf32_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->argc + imgp->envc + 2); if (args->trace) AUXARGS_ENTRY(pos, AT_DEBUG, 1); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; **stack_base = (register_t)imgp->argc; return 0; } /* * Search an alternate path before passing pathname arguments on * to system calls. Useful for keeping a separate 'emulation tree'. * * If cflag is set, we check if an attempt can be made to create * the named file, i.e. we check if the directory it should * be in exists. * * Code shamelessly stolen by Mark Newton from IBCS2 emulation code. */ int svr4_emul_find(td, sgp, prefix, path, pbuf, cflag) struct thread *td; caddr_t *sgp; /* Pointer to stackgap memory */ const char *prefix; char *path; char **pbuf; int cflag; { struct nameidata nd; struct nameidata ndroot; struct vattr vat; struct vattr vatroot; int error; char *ptr, *buf, *cp; size_t sz, len; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pbuf = path; for (ptr = buf; (*ptr = *prefix) != '\0'; ptr++, prefix++) continue; sz = MAXPATHLEN - (ptr - buf); /* * If sgp is not given then the path is already in kernel space */ if (sgp == NULL) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { free(buf, M_TEMP); return error; } if (*ptr != '/') { free(buf, M_TEMP); return EINVAL; } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (cflag) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } NDFREE(&nd, NDF_ONLY_PNBUF); *cp = '/'; } else { NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } NDFREE(&nd, NDF_ONLY_PNBUF); /* * We now compare the vnode of the svr4_root to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, svr4_emul_path, td); if ((error = namei(&ndroot)) != 0) { /* Cannot happen! 
*/ free(buf, M_TEMP); vrele(nd.ni_vp); return error; } NDFREE(&ndroot, NDF_ONLY_PNBUF); if ((error = VOP_GETATTR(nd.ni_vp, &vat, td->td_ucred, td)) != 0) { goto done; } if ((error = VOP_GETATTR(ndroot.ni_vp, &vatroot, td->td_ucred, td)) != 0) { goto done; } if (vat.va_fsid == vatroot.va_fsid && vat.va_fileid == vatroot.va_fileid) { error = ENOENT; goto done; } } if (sgp == NULL) *pbuf = buf; else { sz = &ptr[len] - buf; *pbuf = stackgap_alloc(sgp, sz + 1); error = copyout(buf, *pbuf, sz); free(buf, M_TEMP); } done: vrele(nd.ni_vp); if (!cflag) vrele(ndroot.ni_vp); return error; } static int svr4_elf_modevent(module_t mod, int type, void *data) { int error; error = 0; switch(type) { case MOD_LOAD: if (elf32_insert_brand_entry(&svr4_brand) < 0) error = EINVAL; if (error) printf("cannot insert svr4 elf brand handler\n"); else if (bootverbose) printf("svr4 ELF exec handler installed\n"); break; case MOD_UNLOAD: /* Only allow the emulator to be removed if it isn't in use. */ if (elf32_brand_inuse(&svr4_brand) != 0) { error = EBUSY; } else if (elf32_remove_brand_entry(&svr4_brand) < 0) { error = EINVAL; } if (error) printf("Could not deinstall ELF interpreter entry (error %d)\n", error); else if (bootverbose) printf("svr4 ELF exec handler removed\n"); break; default: break; } return error; } static moduledata_t svr4_elf_mod = { "svr4elf", svr4_elf_modevent, 0 }; DECLARE_MODULE(svr4elf, svr4_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); MODULE_DEPEND(svr4elf, streams, 1, 1, 1); Index: head/sys/ddb/db_ps.c =================================================================== --- head/sys/ddb/db_ps.c (revision 116360) +++ head/sys/ddb/db_ps.c (revision 116361) @@ -1,233 +1,233 @@ /*- * Copyright (c) 1993 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
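svr4_elf_modevent() above follows the standard modevent pattern: install the ELF brand on MOD_LOAD and refuse MOD_UNLOAD while the brand is still in use. A stripped-down sketch of that pattern for a hypothetical module (assumes the usual <sys/module.h>/<sys/kernel.h> headers; illustration only, not part of this change):

static int
example_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		printf("example module loaded\n");
		return (0);
	case MOD_UNLOAD:
		printf("example module unloaded\n");
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = { "example", example_modevent, NULL };
DECLARE_MODULE(example, example_mod, SI_SUB_EXEC, SI_ORDER_ANY);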
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static void dumpthread(volatile struct proc *p, volatile struct thread *td); void db_ps(dummy1, dummy2, dummy3, dummy4) db_expr_t dummy1; boolean_t dummy2; db_expr_t dummy3; char * dummy4; { int np; int nl = 0; volatile struct proc *p, *pp; volatile struct thread *td; char *state; np = nprocs; /* sx_slock(&allproc_lock); */ if (!LIST_EMPTY(&allproc)) p = LIST_FIRST(&allproc); else p = &proc0; db_printf(" pid proc addr uid ppid pgrp flag stat wmesg wchan cmd\n"); while (--np >= 0) { /* * XXX just take 20 for now... */ if (nl++ >= 20) { int c; db_printf("--More--"); c = cngetc(); db_printf("\r"); /* * A whole screenfull or just one line? */ switch (c) { case '\n': /* just one line */ nl = 20; break; case ' ': nl = 0; /* another screenfull */ break; default: /* exit */ db_printf("\n"); return; } } if (p == NULL) { printf("oops, ran out of processes early!\n"); break; } /* PROC_LOCK(p); */ pp = p->p_pptr; if (pp == NULL) pp = p; switch(p->p_state) { case PRS_NORMAL: if (P_SHOULDSTOP(p)) state = "stop"; else state = ""; break; case PRS_NEW: state = "new "; break; case PRS_ZOMBIE: state = "zomb"; break; default: state = "Unkn"; break; } db_printf("%5d %8p %8p %4d %5d %5d %07x %s", p->p_pid, (volatile void *)p, (void *)p->p_uarea, p->p_ucred != NULL ? p->p_ucred->cr_ruid : 0, pp->p_pid, p->p_pgrp != NULL ? p->p_pgrp->pg_id : 0, p->p_flag, state); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) db_printf("(threaded) %s\n", p->p_comm); FOREACH_THREAD_IN_PROC(p, td) { dumpthread(p, td); nl++; } /* PROC_UNLOCK(p); */ p = LIST_NEXT(p, p_list); if (p == NULL && np > 0) p = LIST_FIRST(&zombproc); } /* sx_sunlock(&allproc_lock); */ } static void dumpthread(volatile struct proc *p, volatile struct thread *td) { - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) db_printf( " thread %p ksegrp %p ", td, td->td_ksegrp); if (TD_ON_SLEEPQ(td)) { if (td->td_flags & TDF_CVWAITQ) if (TD_IS_SLEEPING(td)) db_printf("[CV]"); else db_printf("[CVQ"); else if (TD_IS_SLEEPING(td)) db_printf("[SLP]"); else db_printf("[SLPQ"); db_printf("%s %p]", td->td_wmesg, (void *)td->td_wchan); } switch (td->td_state) { case TDS_INHIBITED: if (TD_ON_LOCK(td)) { db_printf("[LOCK %6s %8p]", td->td_lockname, (void *)td->td_blocked); } #if 0 /* covered above */ if (TD_IS_SLEEPING(td)) { db_printf("[SLP]"); } #endif if (TD_IS_SWAPPED(td)) { db_printf("[SWAP]"); } if (TD_IS_SUSPENDED(td)) { db_printf("[SUSP]"); } if (TD_AWAITING_INTR(td)) { db_printf("[IWAIT]"); } break; case TDS_CAN_RUN: db_printf("[Can run]"); break; case TDS_RUNQ: db_printf("[RUNQ]"); break; case TDS_RUNNING: db_printf("[CPU %d]", td->td_oncpu); break; case TDS_INACTIVE: db_printf("[INACTIVE]"); break; default: db_printf("[UNK: %#x]", td->td_state); } - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { if (td->td_kse) db_printf("[kse %p]", td->td_kse); db_printf("\n"); } else db_printf(" %s\n", p->p_comm); } #define INKERNEL(va) (((vm_offset_t)(va)) >= USRSTACK) void db_show_one_thread(db_expr_t addr, boolean_t have_addr, db_expr_t count, char *modif) { struct proc *p; struct thread *td; if (!have_addr) td = curthread; else if (!INKERNEL(addr)) { printf("bad thread address"); return; } else td = (struct thread *)addr; /* quick sanity check */ if ((p = td->td_proc) != td->td_ksegrp->kg_proc) return; printf("Proc %p ",p); dumpthread(p, td); #ifdef __i386__ db_stack_thread((db_expr_t)td, 1, count, modif); #endif } Index: 
head/sys/fs/procfs/procfs_status.c =================================================================== --- head/sys/fs/procfs/procfs_status.c (revision 116360) +++ head/sys/fs/procfs/procfs_status.c (revision 116361) @@ -1,214 +1,214 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 * * From: * $Id: procfs_status.c,v 3.1 1993/12/15 09:40:17 jsp Exp $ * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int procfs_doprocstatus(PFS_FILL_ARGS) { struct session *sess; struct thread *tdfirst; struct tty *tp; struct ucred *cr; const char *wmesg; char *pc; char *sep; int pid, ppid, pgid, sid; int i; pid = p->p_pid; PROC_LOCK(p); ppid = p->p_pptr ? p->p_pptr->p_pid : 0; pgid = p->p_pgrp->pg_id; sess = p->p_pgrp->pg_session; SESS_LOCK(sess); sid = sess->s_leader ? sess->s_leader->p_pid : 0; /* comm pid ppid pgid sid maj,min ctty,sldr start ut st wmsg euid ruid rgid,egid,groups[1 .. 
NGROUPS] */ pc = p->p_comm; do { if (*pc < 33 || *pc > 126 || *pc == '\\') sbuf_printf(sb, "\\%03o", *pc); else sbuf_putc(sb, *pc); } while (*++pc); sbuf_printf(sb, " %d %d %d %d ", pid, ppid, pgid, sid); if ((p->p_flag & P_CONTROLT) && (tp = sess->s_ttyp)) sbuf_printf(sb, "%d,%d ", major(tp->t_dev), minor(tp->t_dev)); else sbuf_printf(sb, "%d,%d ", -1, -1); sep = ""; if (sess->s_ttyvp) { sbuf_printf(sb, "%sctty", sep); sep = ","; } if (SESS_LEADER(p)) { sbuf_printf(sb, "%ssldr", sep); sep = ","; } SESS_UNLOCK(sess); if (*sep != ',') { sbuf_printf(sb, "noflags"); } mtx_lock_spin(&sched_lock); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) wmesg = "-kse- "; else { tdfirst = FIRST_THREAD_IN_PROC(p); if (tdfirst->td_wchan != NULL) { KASSERT(tdfirst->td_wmesg != NULL, ("wchan %p has no wmesg", tdfirst->td_wchan)); wmesg = tdfirst->td_wmesg; } else wmesg = "nochan"; } if (p->p_sflag & PS_INMEM) { struct timeval start, ut, st; calcru(p, &ut, &st, (struct timeval *) NULL); mtx_unlock_spin(&sched_lock); start = p->p_stats->p_start; timevaladd(&start, &boottime); sbuf_printf(sb, " %ld,%ld %ld,%ld %ld,%ld", start.tv_sec, start.tv_usec, ut.tv_sec, ut.tv_usec, st.tv_sec, st.tv_usec); } else { mtx_unlock_spin(&sched_lock); sbuf_printf(sb, " -1,-1 -1,-1 -1,-1"); } sbuf_printf(sb, " %s", wmesg); cr = p->p_ucred; sbuf_printf(sb, " %lu %lu %lu", (u_long)cr->cr_uid, (u_long)cr->cr_ruid, (u_long)cr->cr_rgid); /* egid (cr->cr_svgid) is equal to cr_ngroups[0] see also getegid(2) in /sys/kern/kern_prot.c */ for (i = 0; i < cr->cr_ngroups; i++) { sbuf_printf(sb, ",%lu", (u_long)cr->cr_groups[i]); } if (jailed(p->p_ucred)) { mtx_lock(&p->p_ucred->cr_prison->pr_mtx); sbuf_printf(sb, " %s", p->p_ucred->cr_prison->pr_host); mtx_unlock(&p->p_ucred->cr_prison->pr_mtx); } else { sbuf_printf(sb, " -"); } PROC_UNLOCK(p); sbuf_printf(sb, "\n"); return (0); } int procfs_doproccmdline(PFS_FILL_ARGS) { struct ps_strings pstr; int error, i; /* * If we are using the ps/cmdline caching, use that. Otherwise * revert back to the old way which only implements full cmdline * for the currept process and just p->p_comm for all other * processes. * Note that if the argv is no longer available, we deliberately * don't fall back on p->p_comm or return an error: the authentic * Linux behaviour is to return zero-length in this case. */ PROC_LOCK(p); if (p->p_args && (ps_argsopen || !p_cansee(td, p))) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); return (0); } PROC_UNLOCK(p); if (p != td->td_proc) { sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm); } else { error = copyin((void *)p->p_sysent->sv_psstrings, &pstr, sizeof(pstr)); if (error) return (error); for (i = 0; i < pstr.ps_nargvstr; i++) { sbuf_copyin(sb, pstr.ps_argvstr[i], 0); sbuf_printf(sb, "%c", '\0'); } } return (0); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 116360) +++ head/sys/i386/i386/pmap.c (revision 116361) @@ -1,3397 +1,3397 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ #include __FBSDID("$FreeBSD$"); /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Manages physical address maps. 
* * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_kstack_pages.h" #include "opt_swtch.h" #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(SMP) || defined(APIC_IO) #include #include #include #include #endif /* SMP || APIC_IO */ #define PMAP_KEEP_PDIRS #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #define MINPV 2048 #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; #if defined(SMP) && defined(LAZY_SWITCH) static struct mtx lazypmap_lock; #endif vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
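The pmap_pde()/pdir_pde() macros above rely on the non-PAE i386 address split: 10 bits of page directory index, 10 bits of page table index, 12 bits of page offset. A small arithmetic sketch (constants written out for clarity; the kernel uses PDRSHIFT/PAGE_SHIFT; illustration only):

#include <stdio.h>

int
main(void)
{
	unsigned int va  = 0xdeadbeef;
	unsigned int pdi = va >> 22;		/* PDRSHIFT */
	unsigned int pti = (va >> 12) & 0x3ff;	/* PAGE_SHIFT, NPTEPG - 1 */
	unsigned int off = va & 0xfff;		/* PAGE_MASK */

	printf("va=%#x pdi=%u pti=%u off=%#x\n", va, pdi, pti, off);
	return (0);
}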
*/ static int pgeflag; /* PG_G or-in */ static int pseflag; /* PG_PS or-in */ static int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; #ifdef PAE static uma_zone_t pdptzone; #endif /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_pagedaemon_waken; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP2, *CMAP3, *ptmmap; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR2, CADDR3; static struct mtx CMAPCADDR12_lock; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static pt_entry_t *pt_crashdumpmap; static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0; static pt_entry_t *PADDR1 = 0; static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(void); static void i386_protection_init(void); static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem); static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static int pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex); static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); static void *pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #ifdef PAE static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #endif static pd_entry_t pdir4mb; CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifdef I686_CPU_not /* Problem seems to have gone away */ /* Deal with un-resolved Pentium4 issues */ if (cpu_class == CPUCLASS_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && (cpu_id & 0xf00) == 0xf00) return newaddr; #endif #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_paddr_t firstaddr; vm_paddr_t loadaddr; { vm_offset_t va; pt_entry_t *pte; int i; avail_start = firstaddr; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. 
It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * Initialize the kernel pmap (which is statically allocated). */ kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvlist); LIST_INIT(&allpmaps); #if defined(SMP) && defined(LAZY_SWITCH) mtx_init(&lazypmap_lock, "lazypmap", NULL, MTX_SPIN); #endif mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) mtx_init(&CMAPCADDR12_lock, "CMAPCADDR12", NULL, MTX_DEF); /* * Crashdump maps. */ SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS); /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. * XXX ptmmap is not used. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. * XXX msgbufmap is not used. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); virtual_avail = va; *CMAP1 = *CMAP2 = 0; for (i = 0; i < NKPT; i++) PTD[i] = 0; pgeflag = 0; #ifndef DISABLE_PG_G if (cpu_feature & CPUID_PGE) pgeflag = PG_G; #endif #ifdef I686_CPU_not /* Problem seems to have gone away */ /* Deal with un-resolved Pentium4 issues */ if (cpu_class == CPUCLASS_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && (cpu_id & 0xf00) == 0xf00) { printf("Warning: Pentium 4 cpu: PG_G disabled (global flag)\n"); pgeflag = 0; } #endif /* * Initialize the 4MB page size flag */ pseflag = 0; /* * The 4MB page version of the initial * kernel page mapping. */ pdir4mb = 0; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) pseflag = PG_PS; #endif #ifdef I686_CPU_not /* Problem seems to have gone away */ /* Deal with un-resolved Pentium4 issues */ if (cpu_class == CPUCLASS_686 && strcmp(cpu_vendor, "GenuineIntel") == 0 && (cpu_id & 0xf00) == 0xf00) { printf("Warning: Pentium 4 cpu: PG_PS disabled (4MB pages)\n"); pseflag = 0; } #endif #ifndef DISABLE_PSE if (pseflag) { pd_entry_t ptditmp; /* * Note that we have enabled PSE mode */ ptditmp = *(PTmap + i386_btop(KERNBASE)); ptditmp &= ~(NBPDR - 1); ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag; pdir4mb = ptditmp; } #endif #ifndef SMP /* * Turn on PGE/PSE. SMP does this later on since the * 4K page tables are required for AP boot (for now). * XXX fixme. */ pmap_set_opt(); #endif #ifdef SMP if (cpu_apic_address == 0) panic("pmap_bootstrap: no local apic! 
(non-SMP hardware?)"); /* local apic is mapped on last page */ SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V | PG_RW | PG_N | pgeflag | (cpu_apic_address & PG_FRAME)); #endif invltlb(); } /* * Enable 4MB page mode for MP startup. Turn on PG_G support. * BSP will run this after all the AP's have started up. */ void pmap_set_opt(void) { pt_entry_t *pte; vm_offset_t va, endva; if (pgeflag && (cpu_feature & CPUID_PGE)) { load_cr4(rcr4() | CR4_PGE); invltlb(); /* Insurance */ } #ifndef DISABLE_PSE if (pseflag && (cpu_feature & CPUID_PSE)) { load_cr4(rcr4() | CR4_PSE); invltlb(); /* Insurance */ } #endif if (PCPU_GET(cpuid) == 0) { #ifndef DISABLE_PSE if (pdir4mb) { kernel_pmap->pm_pdir[KPTDI] = PTD[KPTDI] = pdir4mb; invltlb(); /* Insurance */ } #endif if (pgeflag) { /* Turn on PG_G for text, data, bss pages. */ va = (vm_offset_t)btext; #ifndef DISABLE_PSE if (pseflag && (cpu_feature & CPUID_PSE)) { if (va < KERNBASE + (1 << PDRSHIFT)) va = KERNBASE + (1 << PDRSHIFT); } #endif endva = KERNBASE + KERNend; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; va += PAGE_SIZE; } invltlb(); /* Insurance */ } /* * We do not need to broadcast the invltlb here, because * each AP does it the moment it is released from the boot * lock. See ap_init(). */ } } static void * pmap_pv_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (void *)kmem_alloc(kernel_map, bytes); } #ifdef PAE static void * pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (contigmalloc(PAGE_SIZE, NULL, 0, 0x0ULL, 0xffffffffULL, 1, 0)); } #endif /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_paddr_t phys_start, phys_end; { int i; int initial_pvs; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ for(i = 0; i < vm_page_array_size; i++) { vm_page_t m; m = &vm_page_array[i]; TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * init the pv free list */ initial_pvs = vm_page_array_size; if (initial_pvs < MINPV) initial_pvs = MINPV; pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM); uma_zone_set_allocf(pvzone, pmap_pv_allocf); uma_prealloc(pvzone, initial_pvs); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, 0); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Initialize the address space (zone) for the pv_entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ void pmap_init2() { int shpgperproc = PMAP_SHPGPERPROC; TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); pv_entry_max = shpgperproc * maxproc + vm_page_array_size; TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } /*************************************************** * Low level helper routines..... ***************************************************/ #if defined(PMAP_DIAGNOSTIC) /* * This code checks for non-writeable/modified pages. * This should be an invalid condition. 
*/ static int pmap_nw_modified(pt_entry_t ptea) { int pte; pte = (int) ptea; if ((pte & (PG_M|PG_RW)) == PG_M) return 1; else return 0; } #endif /* * this routine defines the region(s) of memory that should * not be tested for the modified bit. */ static PMAP_INLINE int pmap_track_modified(vm_offset_t va) { if ((va < kmi.clean_sva) || (va >= kmi.clean_eva)) return 1; else return 0; } #ifdef I386_CPU /* * i386 only has "invalidate everything" and no SMP to worry about. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #else /* !I386_CPU */ #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. */ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active */ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } critical_exit(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active */ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } critical_exit(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif critical_enter(); /* * We need to disable interrupt preemption but MUST NOT have * interrupts disabled here. * XXX we may need to hold schedlock to get a coherent pm_active */ if (pmap->pm_active == -1 || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } critical_exit(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } #endif /* !SMP */ #endif /* !I386_CPU */ /* * Are we current address space or kernel? 
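The SMP invalidation routines above only send shootdown IPIs to CPUs whose bits are set in pm_active, while the local CPU invalidates directly. A toy sketch of that mask arithmetic with made-up values (illustration only):

#include <stdio.h>

int
main(void)
{
	unsigned int pm_active  = 0x5;			/* pmap active on CPUs 0 and 2 */
	unsigned int cpumask    = 0x1;			/* we are CPU 0 */
	unsigned int other_cpus = ~cpumask & 0xf;	/* hypothetical 4-CPU box */

	if (pm_active & cpumask)
		printf("invalidate locally\n");
	if (pm_active & other_cpus)
		printf("IPI the CPUs in mask %#x\n", pm_active & other_cpus);
	return (0);
}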
*/ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME)); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. */ pt_entry_t * pmap_pte_quick(pmap, va) register pmap_t pmap; vm_offset_t va; { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return vtopte(va); newpf = *pde & PG_FRAME; if (((*PMAP1) & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR1); } return PADDR1 + (i386_btop(va) & (NPTEPG - 1)); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; if (pmap == 0) return 0; pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) { rtval = (pde & ~PDRMASK) | (va & PDRMASK); return rtval; } pte = pmap_pte_quick(pmap, va); rtval = ((*pte & PG_FRAME) | (va & PAGE_MASK)); return rtval; } return 0; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kenter(va, VM_PAGE_TO_PHYS(*m)); va += PAGE_SIZE; m++; } pmap_invalidate_range(kernel_pmap, sva, va); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. 
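pmap_extract() above handles 4MB (PG_PS) mappings by taking the frame from the PDE and the low 22 bits from the virtual address. Worked example with made-up values (illustration only):

#include <stdio.h>

int
main(void)
{
	unsigned int pde     = 0x00c001e3;	/* 4MB frame 0x00c00000, PS|V set */
	unsigned int va      = 0xc0123456;
	unsigned int pdrmask = (1u << 22) - 1;	/* PDRMASK */
	unsigned int pa      = (pde & ~pdrmask) | (va & pdrmask);

	printf("pa = %#x\n", pa);		/* 0xd23456 */
	return (0);
}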
*/ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } static vm_page_t pmap_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; retry: m = vm_page_lookup(object, pindex); if (m != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pplookp")) goto retry; vm_page_unlock_queues(); } return m; } /*************************************************** * Page table page management routines..... ***************************************************/ /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { while (vm_page_sleep_if_busy(m, FALSE, "pmuwpt")) vm_page_lock_queues(); if (m->hold_count == 0) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; if (pmap_is_current(pmap)) { /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); } /* * If the page is finally unwired, simply free it. */ --m->wire_count; if (m->wire_count == 0) { vm_page_busy(m); vm_page_free_zero(m); atomic_subtract_int(&cnt.v_wire_count, 1); } return 1; } return 0; } static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m) { vm_page_unhold(m); if (m->hold_count == 0) return _pmap_unwire_pte_hold(pmap, m); else return 0; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte) { unsigned ptepindex; if (va >= VM_MAXUSER_ADDRESS) return 0; if (mpte == NULL) { ptepindex = (va >> PDRSHIFT); if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { mpte = pmap->pm_pteobj->root; } else { while ((mpte = vm_page_lookup(pmap->pm_pteobj, ptepindex)) != NULL && vm_page_sleep_if_busy(mpte, FALSE, "pulook")) vm_page_lock_queues(); } } return pmap_unwire_pte_hold(pmap, mpte); } void pmap_pinit0(pmap) struct pmap *pmap; { pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { vm_page_t ptdpg[NPGPTD]; vm_paddr_t pa; int i; /* * No need to allocate page table space yet but we do need a valid * page directory table. 
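_pmap_unwire_pte_hold()/pmap_unwire_pte_hold() above keep a page table page alive while any PTE still holds it: each mapping contributes a hold, and the wire count is only released once the holds are gone. A toy model of that accounting with hypothetical structures (illustration only, not the kernel's types):

#include <stdbool.h>
#include <stdio.h>

struct fake_ptpage {
	int hold_count;		/* live PTEs mapped through this page */
	int wire_count;		/* keeps the page from being freed */
};

static bool
unwire_pte_hold(struct fake_ptpage *m)
{
	if (--m->hold_count > 0)
		return (false);
	if (--m->wire_count == 0) {
		printf("page table page can be freed\n");
		return (true);
	}
	return (false);
}

int
main(void)
{
	struct fake_ptpage m = { .hold_count = 2, .wire_count = 1 };

	unwire_pte_hold(&m);	/* one PTE gone, page stays */
	unwire_pte_hold(&m);	/* last PTE gone, page freeable */
	return (0);
}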
*/ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_pageable(kernel_map, NBPTD); #ifdef PAE pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif } /* * allocate object for the ptes */ if (pmap->pm_pteobj == NULL) pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + NPGPTD); /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD; i++) { ptdpg[i] = vm_page_grab(pmap->pm_pteobj, PTDPTDI + i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); vm_page_lock_queues(); vm_page_flag_clear(ptdpg[i], PG_BUSY); ptdpg[i]->valid = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) { if ((ptdpg[i]->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); } mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #ifdef PAE pmap->pm_pdpt[i] = pa | PG_V; #endif } pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Wire in kernel global address entries. To avoid a race condition * between pmap initialization and pmap_growkernel, this procedure * should be called after the vmspace is attached to the process * but before this pmap is activated. */ void pmap_pinit2(pmap) struct pmap *pmap; { /* XXX: Remove this stub when no longer called */ } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap, ptepindex) pmap_t pmap; unsigned ptepindex; { vm_paddr_t ptepa; vm_offset_t pteva; vm_page_t m; /* * Find or fabricate a new pagetable page */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); /* * Increment the hold count for the page table page * (denoting a new mapping.) */ m->hold_count++; /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); /* * Try to use the new mapping, but if we cannot, then * do it with the routine that maps the page explicitly. */ if ((m->flags & PG_ZERO) == 0) { if (pmap_is_current(pmap)) { pteva = VM_MAXUSER_ADDRESS + i386_ptob(ptepindex); bzero((caddr_t) pteva, PAGE_SIZE); } else { pmap_zero_page(m); } } vm_page_lock_queues(); m->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(m, PG_ZERO); vm_page_wakeup(m); vm_page_unlock_queues(); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va) { unsigned ptepindex; pd_entry_t ptepa; vm_page_t m; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. 
*/ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { /* * In order to get the page table page, try the * hint first. */ if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { m = pmap->pm_pteobj->root; } else { m = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } m->hold_count++; return m; } /* * Here if the pte page isn't mapped, or if it has been deallocated. */ return _pmap_allocpte(pmap, ptepindex); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ #ifdef LAZY_SWITCH #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static u_int *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { u_int mymask = PCPU_GET(cpumask); if (rcr3() == lazyptd) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); #ifdef SWTCH_OPTIM_STATS atomic_add_int(&lazy_flush_smpfixup, 1); } else { if (*lazymask & mymask) lazy_flush_smpbadcr3++; else lazy_flush_smpmiss++; #endif } atomic_clear_int(lazymask, mymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int mymask) { if (rcr3() == lazyptd) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); #ifdef SWTCH_OPTIM_STATS lazy_flush_fixup++; } else { if (*lazymask & mymask) lazy_flush_smpbadcr3++; else lazy_flush_smpmiss++; #endif } atomic_clear_int(lazymask, mymask); } static void pmap_lazyfix(pmap_t pmap) { u_int mymask = PCPU_GET(cpumask); u_int mask; register u_int spins; while ((mask = pmap->pm_active) != 0) { spins = 50000000; mask = mask & -mask; /* Find least significant set bit */ mtx_lock_spin(&lazypmap_lock); #ifdef PAE lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif if (mask == mymask) { lazymask = &pmap->pm_active; pmap_lazyfix_self(mymask); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } #ifdef SWTCH_OPTIM_STATS lazy_flush_smpipi++; #endif } mtx_unlock_spin(&lazypmap_lock); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); pmap->pm_active &= ~(PCPU_GET(cpumask)); #ifdef SWTCH_OPTIM_STATS lazy_flush_fixup++; #endif } } #endif /* SMP */ #endif /* LAZY_SWITCH */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
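/*
 * [Editorial sketch -- not part of this commit.]  pmap_lazyfix() above peels
 * CPUs off pm_active one at a time with "mask = mask & -mask", which isolates
 * the least significant set bit of the bitmap.  A standalone illustration of
 * that trick; the sample mask is an assumption:
 */
#include <stdio.h>

int
main(void)
{
	unsigned active = 0x16;				/* hypothetical: CPUs 1, 2 and 4 */

	while (active != 0) {
		unsigned one = active & -active;	/* lowest set bit only */
		printf("shoot down CPU mask 0x%x\n", one);
		active &= ~one;				/* that CPU is done */
	}
	return (0);
}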
*/ void pmap_release(pmap_t pmap) { vm_object_t object; vm_page_t m; int i; object = pmap->pm_pteobj; KASSERT(object->ref_count == 1, ("pmap_release: pteobj reference count %d != 1", object->ref_count)); KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); #ifdef LAZY_SWITCH pmap_lazyfix(pmap); #endif mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); #ifdef SMP pmap->pm_pdir[MPPTDI] = 0; #endif pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); vm_page_lock_queues(); for (i = 0; i < NPGPTD; i++) { m = TAILQ_FIRST(&object->memq); #ifdef PAE KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; atomic_subtract_int(&cnt.v_wire_count, 1); vm_page_busy(m); vm_page_free_zero(m); } KASSERT(TAILQ_EMPTY(&object->memq), ("pmap_release: leaking page table pages")); vm_page_unlock_queues(); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; int s; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; pt_entry_t *pde; s = splhigh(); mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, kernel_vm_end); pde_store(pde, newpdir); } mtx_unlock_spin(&allpmaps_lock); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
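/*
 * [Editorial sketch -- not part of this commit.]  pmap_growkernel() above
 * rounds the requested address up to a page-table boundary and then advances
 * kernel_vm_end one 4 MB page-table span at a time with
 * "(x + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1)".  The arithmetic in
 * isolation; the span size and sample addresses are assumptions:
 */
#include <stdio.h>

#define PTSPAN		(4UL * 1024 * 1024)	/* PAGE_SIZE * NPTEPG, non-PAE i386 */
#define ROUNDUP2(x, y)	(((x) + (y) - 1) & ~((y) - 1))	/* y must be a power of 2 */

int
main(void)
{
	unsigned long end = 0xc1000000UL;	/* assumed current kernel_vm_end */
	unsigned long addr = 0xc1a12345UL;	/* requested growth target */

	addr = ROUNDUP2(addr, PTSPAN);
	while (end < addr) {
		printf("allocate page table covering 0x%lx\n", end);
		end = (end + PTSPAN) & ~(PTSPAN - 1);
	}
	return (0);
}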
*/ static pv_entry_t get_pv_entry(void) { pv_entry_count++; if (pv_entry_high_water && (pv_entry_count > pv_entry_high_water) && (pmap_pagedaemon_waken == 0)) { pmap_pagedaemon_waken = 1; wakeup (&vm_pages_needed); } return uma_zalloc(pvzone, M_NOWAIT); } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; int rtval; int s; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } rtval = 0; if (pv) { rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } splx(s); return rtval; } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m) { int s; pv_entry_t pv; s = splvm(); pv = get_pv_entry(); pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_ptem = mpte; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; splx(s); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va) { pt_entry_t oldpte; vm_page_t m; oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte); if (oldpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) oldpte)) { printf( "pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n", va, oldpte); } #endif if (pmap_track_modified(va)) vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); return pmap_remove_entry(pmap, m, va); } else { return pmap_unuse_pt(pmap, va, NULL); } return 0; } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va); pmap_invalidate_page(pmap, va); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; int anyvalid; if (pmap == NULL) return; if (pmap->pm_stats.resident_count == 0) return; /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); return; } anyvalid = 0; for (; sva < eva; sva = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. 
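/*
 * [Editorial sketch -- not part of this commit.]  get_pv_entry() above wakes
 * the pagedaemon at most once per episode: the allocation counter is compared
 * against a high-water mark and a one-shot flag suppresses repeated wakeups.
 * The shape of that check, with hypothetical names and a tiny threshold
 * standing in for the kernel globals:
 */
#include <stdio.h>

static int pv_count, pv_high_water = 3, pagedaemon_waken;

static void
note_pv_alloc(void)
{
	pv_count++;
	if (pv_high_water && pv_count > pv_high_water &&
	    pagedaemon_waken == 0) {
		pagedaemon_waken = 1;		/* wakeup(&vm_pages_needed) in the kernel */
		printf("wake pagedaemon at pv_count=%d\n", pv_count);
	}
}

int
main(void)
{
	int i;

	for (i = 0; i < 6; i++)
		note_pv_alloc();
	printf("woken exactly once: %d\n", pagedaemon_waken);
	return (0);
}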
*/ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { if ((pte = pmap_pte_quick(pmap, sva)) == NULL || *pte == 0) continue; anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva)) break; } } if (anyvalid) pmap_invalidate_all(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { register pv_entry_t pv; pt_entry_t *pte, tpte; int s; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pv->pv_pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pv->pv_pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) tpte)) { printf( "pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n", pv->pv_va, tpte); } #endif if (pmap_track_modified(pv->pv_va)) vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; int anychanged; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; anychanged = 0; for (; sva < eva; sva = pdnxt) { unsigned pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. 
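/*
 * [Editorial sketch -- not part of this commit.]  Both pmap_remove() and
 * pmap_protect() walk a VA range one page-directory entry at a time:
 * "pdnxt = (sva + NBPDR) & ~PDRMASK" jumps to the start of the next 4 MB
 * slot and is then clipped to eva.  Illustrated standalone; the sample
 * range is an assumption:
 */
#include <stdio.h>

#define NBPDR	(4UL * 1024 * 1024)	/* bytes mapped by one PDE, non-PAE */
#define PDRMASK	(NBPDR - 1)

int
main(void)
{
	unsigned long sva = 0x081fe000UL, eva = 0x08602000UL, pdnxt;

	for (; sva < eva; sva = pdnxt) {
		pdnxt = (sva + NBPDR) & ~PDRMASK;	/* next 4 MB boundary */
		if (pdnxt > eva)
			pdnxt = eva;			/* clip to end of range */
		printf("process [0x%lx, 0x%lx)\n", sva, pdnxt);
	}
	return (0);
}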
*/ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (; sva != pdnxt; sva += PAGE_SIZE) { pt_entry_t pbits; pt_entry_t *pte; vm_page_t m; if ((pte = pmap_pte_quick(pmap, sva)) == NULL) continue; pbits = *pte; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0 && pmap_track_modified(sva)) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits); vm_page_dirty(m); pbits &= ~PG_M; } } pbits &= ~PG_RW; if (pbits != *pte) { pte_store(pte, pbits); anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; register pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte; if (pmap == NULL) return; va &= PG_FRAME; #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } } #endif pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m) & PG_FRAME; origpte = *pte; opa = origpte & PG_FRAME; if (origpte & PG_PS) panic("pmap_enter: attempted pmap_enter on 4MB page"); /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; #if defined(PMAP_DIAGNOSTIC) if (pmap_nw_modified((pt_entry_t) origpte)) { printf( "pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n", va, origpte); } #endif /* * Remove extra pte reference */ if (mpte) mpte->hold_count--; if ((prot & VM_PROT_WRITE) && (origpte & PG_V)) { if ((origpte & PG_RW) == 0) { pte_store(pte, origpte | PG_RW); pmap_invalidate_page(pmap, va); } return; } /* * We might be turning off write access to the page, * so we go ahead and sense modify status. 
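/*
 * [Editorial sketch -- not part of this commit.]  pmap_protect() above
 * downgrades a writable mapping by folding the hardware-modified bit into
 * the vm_page dirty state and then stripping PG_M and PG_RW from the PTE.
 * The bit values below are the usual i386 ones but are restated here as
 * assumptions:
 */
#include <stdio.h>

#define PG_V	0x001	/* valid */
#define PG_RW	0x002	/* writable */
#define PG_A	0x020	/* accessed */
#define PG_M	0x040	/* modified (dirty) */

int
main(void)
{
	unsigned pte = 0x123000 | PG_V | PG_RW | PG_A | PG_M; /* pa | flags */
	int page_dirty = 0;

	if (pte & PG_M)
		page_dirty = 1;			/* vm_page_dirty(m) in the kernel */
	pte &= ~(PG_M | PG_RW);			/* now read-only and clean */

	printf("pte 0x%x, software dirty bit %d\n", pte, page_dirty);
	return (0);
}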
*/ if (origpte & PG_MANAGED) { if ((origpte & PG_M) && pmap_track_modified(va)) { vm_page_t om; om = PHYS_TO_VM_PAGE(opa); vm_page_dirty(om); } pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { int err; vm_page_lock_queues(); err = pmap_remove_pte(pmap, pte, va); vm_page_unlock_queues(); if (err) panic("pmap_enter: pte vanished, va: 0x%x", va); } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_initialized && (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, mpte, m); pa |= PG_MANAGED; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V); if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { pte_store(pte, newpte | PG_A); /*if (origpte)*/ { pmap_invalidate_page(pmap, va); } } } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ static vm_page_t pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->hold_count++; } else { retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); if (pmap->pm_pteobj->root && (pmap->pm_pteobj->root->pindex == ptepindex)) { mpte = pmap->pm_pteobj->root; } else { mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex); } if (mpte == NULL) goto retry; mpte->hold_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { vm_page_lock_queues(); pmap_unwire_pte_hold(pmap, mpte); vm_page_unlock_queues(); } return 0; } /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) pmap_insert_entry(pmap, va, mpte, m); /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. 
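/*
 * [Editorial sketch -- not part of this commit.]  At the end of pmap_enter()
 * the new PTE is compared against the old one with PG_M and PG_A masked off,
 * so a mapping that differs only in hardware-maintained bits is not
 * rewritten.  The comparison in isolation; bit values and the sample
 * physical address are assumptions:
 */
#include <stdio.h>

#define PG_V	0x001
#define PG_RW	0x002
#define PG_U	0x004
#define PG_A	0x020
#define PG_M	0x040

int
main(void)
{
	unsigned long pa = 0x123000UL;
	unsigned long origpte = pa | PG_V | PG_RW | PG_U | PG_A | PG_M;
	unsigned long newpte  = pa | PG_V | PG_RW | PG_U;

	if ((origpte & ~(unsigned long)(PG_M | PG_A)) != newpte)
		printf("rewrite PTE and invalidate the TLB entry\n");
	else
		printf("mapping unchanged, leave the PTE alone\n");
	return (0);
}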
This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_offset_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); #ifndef I386_CPU invlpg(va); #else invltlb(); #endif return ((void *)crashdumpmap); } #define MAX_INIT_PT (96) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int limit) { vm_offset_t tmpidx; int psize; vm_page_t p, mpte; if (pmap == NULL || object == NULL) return; VM_OBJECT_LOCK(object); /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ if (pseflag && (object->type == OBJT_DEVICE) && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; pd_entry_t ptepa; if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) goto unlock_return; retry: p = vm_page_lookup(object, pindex); if (p != NULL) { vm_page_lock_queues(); if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) goto unlock_return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); goto unlock_return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) { goto unlock_return; } p->valid = VM_PAGE_BITS_ALL; pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(&pmap->pm_pdir[ptepindex], ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } pmap_invalidate_all(kernel_pmap); goto unlock_return; } psize = i386_btop(size); if ((object->type != OBJT_VNODE) || ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { goto unlock_return; } if (psize + pindex > object->size) { if (object->size < pindex) goto unlock_return; psize = object->size - pindex; } mpte = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((limit & MAP_PREFAULT_MADVISE) && cnt.v_free_count < cnt.v_free_reserved) { break; } vm_page_lock_queues(); if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((p->queue - p->pc) == PQ_CACHE) vm_page_deactivate(p); vm_page_busy(p); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); mpte = pmap_enter_quick(pmap, addr + i386_ptob(tmpidx), p, mpte); VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_wakeup(p); } vm_page_unlock_queues(); } unlock_return: VM_OBJECT_UNLOCK(object); } /* * pmap_prefault provides a quick way of clustering * pagefaults into a processes address space. 
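/*
 * [Editorial sketch -- not part of this commit.]  For a suitably aligned
 * device object, pmap_object_init_pt() above installs one 4 MB PG_PS
 * directory entry per NBPDR of backing store instead of 1024 individual
 * PTEs.  The loop shape, with an assumed physical base and region size:
 */
#include <stdio.h>

#define PDRSHIFT	22
#define NBPDR		(1UL << PDRSHIFT)

int
main(void)
{
	unsigned long ptepa = 0xe0000000UL;	 /* assumed 4 MB-aligned phys addr */
	unsigned long size = 16UL * 1024 * 1024; /* 16 MB region */
	int i, npdes = size >> PDRSHIFT;

	for (i = 0; i < npdes; i++) {
		printf("pde[%d] = 0x%lx | PG_PS | PG_V | PG_RW | PG_U\n",
		    i, ptepa);
		ptepa += NBPDR;
	}
	return (0);
}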
It is a "cousin" * of pmap_object_init_pt, except it runs at page fault time instead * of mmap time. */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int pmap_prefault_pageorder[] = { -1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE }; void pmap_prefault(pmap, addra, entry) pmap_t pmap; vm_offset_t addra; vm_map_entry_t entry; { int i; vm_offset_t starta; vm_offset_t addr; vm_pindex_t pindex; vm_page_t m, mpte; vm_object_t object; if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } mpte = NULL; for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t lobject; pt_entry_t *pte; addr = addra + pmap_prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if ((*pmap_pde(pmap, addr)) == 0) continue; pte = vtopte(addr); if (*pte) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; for (m = vm_page_lookup(lobject, pindex); (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); lobject = lobject->backing_object) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += (lobject->backing_object_offset >> PAGE_SHIFT); m = vm_page_lookup(lobject->backing_object, pindex); } /* * give-up when a page is not in memory */ if (m == NULL) break; vm_page_lock_queues(); if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { if ((m->queue - m->pc) == PQ_CACHE) { vm_page_deactivate(m); } vm_page_busy(m); vm_page_unlock_queues(); mpte = pmap_enter_quick(pmap, addr, m, mpte); vm_page_lock_queues(); vm_page_wakeup(m); } vm_page_unlock_queues(); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte_quick(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; vm_page_t m; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables\n"); /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. 
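/*
 * [Editorial sketch -- not part of this commit.]  pmap_prefault() probes
 * pages alternating on either side of the faulting address: -1, +1, -2, +2,
 * ... up to PFBAK/PFFOR pages away, mirroring pmap_prefault_pageorder[].
 * Generating those candidate addresses standalone; the fault address is an
 * assumption:
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define PFBAK		4
#define PFFOR		4

static const int pageorder[] = {
	-1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE,
	-3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE
};

int
main(void)
{
	unsigned long addra = 0x08050000UL;	/* hypothetical fault address */
	unsigned i;

	for (i = 0; i < sizeof(pageorder) / sizeof(pageorder[0]); i++)
		printf("probe 0x%lx\n", addra + pageorder[i]);
	return (0);
}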
*/ if (cnt.v_free_count < cnt.v_free_reserved || pv_entry_count > pv_entry_high_water) break; pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex); if ((srcmpte == NULL) || (srcmpte->hold_count == 0) || (srcmpte->flags & PG_BUSY)) continue; if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { /* * We have to check after allocpte for the * pte still being around... allocpte can * block. */ dstmpte = pmap_allocpte(dst_pmap, addr); dst_pte = pmap_pte_quick(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* * Clear the modified and * accessed (referenced) bits * during the copy. */ m = PHYS_TO_VM_PAGE(ptetemp); *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); } else { vm_page_lock_queues(); pmap_unwire_pte_hold(dst_pmap, dstmpte); vm_page_unlock_queues(); } if (dstmpte->hold_count >= srcmpte->hold_count) break; } addr += PAGE_SIZE; src_pte++; } } } #ifdef SMP /* * pmap_zpi_switchin*() * * These functions allow us to avoid doing IPIs alltogether in certain * temporary page-mapping situations (page zeroing). Instead to deal * with being preempted and moved onto a different cpu we invalidate * the page when the scheduler switches us in. This does not occur * very often so we remain relatively optimal with very little effort. */ static void pmap_zpi_switchin12(void) { invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); } static void pmap_zpi_switchin2(void) { invlpg((u_int)CADDR2); } static void pmap_zpi_switchin3(void) { invlpg((u_int)CADDR3); } #endif /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin2; #endif invlpg((u_int)CADDR2); #endif #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR2); else #endif bzero(CADDR2, PAGE_SIZE); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP2 = 0; mtx_unlock(&CMAPCADDR12_lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. 
*/ void pmap_zero_page_area(vm_page_t m, int off, int size) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP2) panic("pmap_zero_page: CMAP2 busy"); *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin2; #endif invlpg((u_int)CADDR2); #endif #if defined(I686_CPU) if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE) i686_pagezero(CADDR2); else #endif bzero((char *)CADDR2 + off, size); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP2 = 0; mtx_unlock(&CMAPCADDR12_lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin3; #endif invlpg((u_int)CADDR3); #endif #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) i686_pagezero(CADDR3); else #endif bzero(CADDR3, PAGE_SIZE); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP3 = 0; } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { mtx_lock(&CMAPCADDR12_lock); if (*CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*CMAP2) panic("pmap_copy_page: CMAP2 busy"); *CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; #ifdef I386_CPU invltlb(); #else #ifdef SMP curthread->td_switchin = pmap_zpi_switchin12; #endif invlpg((u_int)CADDR1); invlpg((u_int)CADDR2); #endif bcopy(CADDR1, CADDR2, PAGE_SIZE); #ifdef SMP curthread->td_switchin = NULL; #endif *CMAP1 = 0; *CMAP2 = 0; mtx_unlock(&CMAPCADDR12_lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap, m) pmap_t pmap; vm_page_t m; { pv_entry_t pv; int loops = 0; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } loops++; if (loops >= 16) break; } splx(s); return (FALSE); } #define PMAP_REMOVE_PAGES_CURPROC_ONLY /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
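/*
 * [Editorial sketch -- not part of this commit.]  pmap_page_exists_quick()
 * above only inspects the first 16 pv entries of a page before giving up, so
 * a FALSE answer is merely a hint for page aging.  The bounded scan, with a
 * plain array standing in for the pv list and made-up pmap identifiers:
 */
#include <stdio.h>

int
main(void)
{
	int pv_pmaps[] = { 3, 7, 9, 11, 42, 5 };	/* hypothetical owners */
	int target = 42, loops = 0, found = 0;
	unsigned i, n = sizeof(pv_pmaps) / sizeof(pv_pmaps[0]);

	for (i = 0; i < n; i++) {
		if (pv_pmaps[i] == target) {
			found = 1;
			break;
		}
		if (++loops >= 16)			/* give up after 16 entries */
			break;
	}
	printf("found=%d after %d loops\n", found, loops);
	return (0);
}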
*/ void pmap_remove_pages(pmap, sva, eva) pmap_t pmap; vm_offset_t sva, eva; { pt_entry_t *pte, tpte; vm_page_t m; pv_entry_t pv, npv; int s; #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY if (!curthread || (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); s = splvm(); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_va >= eva || pv->pv_va < sva) { npv = TAILQ_NEXT(pv, pv_plist); continue; } #ifdef PMAP_REMOVE_PAGES_CURPROC_ONLY pte = vtopte(pv->pv_va); #else pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); #endif tpte = *pte; if (tpte == 0) { printf("TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { npv = TAILQ_NEXT(pv, pv_plist); continue; } m = PHYS_TO_VM_PAGE(tpte); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pv->pv_pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_FIRST(&m->md.pv_list) == NULL) { vm_page_flag_clear(m, PG_WRITEABLE); } pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem); free_pv_entry(pv); } splx(s); pmap_invalidate_all(pmap); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return FALSE; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * if the bit being tested is the modified bit, then * mark clean_map and ptes as never * modified. */ if (!pmap_track_modified(pv->pv_va)) continue; #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (*pte & PG_M) { splx(s); return TRUE; } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(vm_page_t m, int bit, boolean_t setem) { register pv_entry_t pv; register pt_entry_t *pte; int s; if (!pmap_initialized || (m->flags & PG_FICTITIOUS) || (!setem && bit == PG_RW && (m->flags & PG_WRITEABLE) == 0)) return; s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? 
*/ TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (!pmap_track_modified(pv->pv_va)) continue; } #if defined(PMAP_DIAGNOSTIC) if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%x\n", pv->pv_va); continue; } #endif pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (setem) { *pte |= bit; pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } else { pt_entry_t pbits = *pte; if (pbits & bit) { if (bit == PG_RW) { if (pbits & PG_M) { vm_page_dirty(m); } pte_store(pte, pbits & ~(PG_M|PG_RW)); } else { pte_store(pte, pbits & ~bit); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } } } if (!setem && bit == PG_RW) vm_page_flag_clear(m, PG_WRITEABLE); splx(s); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(vm_page_t m, vm_prot_t prot) { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { pmap_changebit(m, PG_RW, FALSE); } else { pmap_remove_all(m); } } } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { register pv_entry_t pv, pvf, pvn; pt_entry_t *pte; pt_entry_t v; int s; int rtval = 0; if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) return (rtval); s = splvm(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); if (!pmap_track_modified(pv->pv_va)) continue; pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); if (pte && ((v = pte_load(pte)) & PG_A) != 0) { pte_store(pte, v & ~PG_A); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); rtval++; if (rtval > 4) { break; } } } while ((pv = pvn) != NULL && pv != pvf); } splx(s); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pmap_changebit(m, PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pmap_changebit(m, PG_A, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
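/*
 * [Editorial sketch -- not part of this commit.]  i386_protection_init()
 * below fills an 8-entry table indexed by the VM_PROT_{READ,WRITE,EXECUTE}
 * bits.  Because i386 page tables have no execute bit, every combination
 * collapses to either "readable" (0) or "writable" (PG_RW).  The same table
 * built with a loop instead of the switch; bit values are assumptions:
 */
#include <stdio.h>

#define VM_PROT_READ	0x1
#define VM_PROT_WRITE	0x2
#define VM_PROT_EXECUTE	0x4
#define PG_RW		0x2

int
main(void)
{
	int codes[8], prot;

	for (prot = 0; prot < 8; prot++)
		codes[prot] = (prot & VM_PROT_WRITE) ? PG_RW : 0;
	for (prot = 0; prot < 8; prot++)
		printf("prot %d -> pte bits 0x%x\n", prot, codes[prot]);
	return (0);
}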
*/ void * pmap_mapdev(pa, size) vm_paddr_t pa; vm_size_t size; { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = pa & PG_FRAME; for (tmpva = va; size > 0; ) { pmap_kenter(tmpva, pa); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); return ((void *)(va + offset)); } void pmap_unmapdev(va, size) vm_offset_t va; vm_size_t size; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; base = va & PG_FRAME; offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pte = vtopte(tmpva); pte_clear(pte); } pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap, addr) pmap_t pmap; vm_offset_t addr; { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; ptep = pmap_pte_quick(pmap, addr); if (ptep == 0) { return 0; } if ((pte = *ptep) != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { struct proc *p = td->td_proc; pmap_t pmap; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); #if defined(SMP) atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else pmap->pm_active |= 1; #endif #ifdef PAE cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* XXXKSE this is wrong. * pmap_activate is for the current thread on the current cpu */ - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { /* Make sure all other cr3 entries are updated. */ /* what if they are running? 
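/*
 * [Editorial sketch -- not part of this commit.]  pmap_mapdev() above keeps
 * the sub-page offset of the physical address, rounds the mapping size up to
 * whole pages, and returns va + offset so callers see the exact register
 * address.  The arithmetic by itself; the device address and size are
 * assumptions:
 */
#include <stdio.h>

#define PAGE_SIZE	4096
#define PAGE_MASK	(PAGE_SIZE - 1)
#define PG_FRAME	(~(unsigned long)PAGE_MASK)

int
main(void)
{
	unsigned long pa = 0xfee00c30UL;	/* hypothetical device register */
	unsigned long size = 0x20;
	unsigned long offset = pa & PAGE_MASK;

	size = (offset + size + PAGE_MASK) & ~(unsigned long)PAGE_MASK;
	pa &= PG_FRAME;				/* page-aligned mapping base */
	printf("map %lu byte(s) at pa 0x%lx, return va + 0x%lx\n",
	    size, pa, offset);
	return (0);
}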
XXXKSE (maybe abort them) */ FOREACH_THREAD_IN_PROC(p, td) { td->td_pcb->pcb_cr3 = cr3; } } else { td->td_pcb->pcb_cr3 = cr3; } load_cr3(cr3); #ifdef SWTCH_OPTIM_STATS tlb_flush_count++; #endif critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte_quick(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte_quick(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(pa) vm_paddr_t pa; { pv_entry_t pv; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { printf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/sys_machdep.c =================================================================== --- head/sys/i386/i386/sys_machdep.c (revision 116360) +++ head/sys/i386/i386/sys_machdep.c (revision 116361) @@ -1,567 +1,567 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)sys_machdep.c 5.5 (Berkeley) 1/19/91 * */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included by sys/user.h */ #include #include #include /* for kernel_map */ #define MAX_LD 8192 #define LD_PER_PAGE 512 #define NEW_MAX_LD(num) ((num + LD_PER_PAGE) & ~(LD_PER_PAGE-1)) #define SIZE_FROM_LARGEST_LD(num) (NEW_MAX_LD(num) << 3) static int i386_get_ldt(struct thread *, char *); static int i386_set_ldt(struct thread *, char *); static int i386_get_ioperm(struct thread *, char *); static int i386_set_ioperm(struct thread *, char *); #ifdef SMP static void set_user_ldt_rv(struct thread *); #endif #ifndef _SYS_SYSPROTO_H_ struct sysarch_args { int op; char *parms; }; #endif int sysarch(td, uap) struct thread *td; register struct sysarch_args *uap; { int error; mtx_lock(&Giant); switch(uap->op) { case I386_GET_LDT: error = i386_get_ldt(td, uap->parms); break; case I386_SET_LDT: error = i386_set_ldt(td, uap->parms); break; case I386_GET_IOPERM: error = i386_get_ioperm(td, uap->parms); break; case I386_SET_IOPERM: error = i386_set_ioperm(td, uap->parms); break; case I386_VM86: error = vm86_sysarch(td, uap->parms); break; default: error = EINVAL; break; } mtx_unlock(&Giant); return (error); } int i386_extend_pcb(struct thread *td) { int i, offset; u_long *addr; struct pcb_ext *ext; struct soft_segment_descriptor ssd = { 0, /* segment base address (overwritten) */ ctob(IOPAGES + 1) - 1, /* length */ SDT_SYS386TSS, /* segment type */ 0, /* priority level */ 1, /* descriptor present */ 0, 0, 0, /* default 32 size */ 0 /* granularity */ }; - if (td->td_proc->p_flag & P_THREADED) + if (td->td_proc->p_flag & P_SA) return (EINVAL); /* XXXKSE */ /* XXXKSE All the code below only works in 1:1 needs changing */ ext = (struct pcb_ext *)kmem_alloc(kernel_map, ctob(IOPAGES+1)); if (ext == 0) return (ENOMEM); bzero(ext, sizeof(struct pcb_ext)); /* -16 is so we can convert a trapframe into vm86trapframe inplace */ ext->ext_tss.tss_esp0 = td->td_kstack + ctob(KSTACK_PAGES) - sizeof(struct pcb) - 16; ext->ext_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); /* * The last byte of the i/o map must be followed by an 0xff byte. * We arbitrarily allocate 16 bytes here, to keep the starting * address on a doubleword boundary. 
*/ offset = PAGE_SIZE - 16; ext->ext_tss.tss_ioopt = (offset - ((unsigned)&ext->ext_tss - (unsigned)ext)) << 16; ext->ext_iomap = (caddr_t)ext + offset; ext->ext_vm86.vm86_intmap = (caddr_t)ext + offset - 32; addr = (u_long *)ext->ext_vm86.vm86_intmap; for (i = 0; i < (ctob(IOPAGES) + 32 + 16) / sizeof(u_long); i++) *addr++ = ~0; ssd.ssd_base = (unsigned)&ext->ext_tss; ssd.ssd_limit -= ((unsigned)&ext->ext_tss - (unsigned)ext); ssdtosd(&ssd, &ext->ext_tssd); KASSERT(td->td_proc == curthread->td_proc, ("giving TSS to !curproc")); KASSERT(td->td_pcb->pcb_ext == 0, ("already have a TSS!")); mtx_lock_spin(&sched_lock); td->td_pcb->pcb_ext = ext; /* switch to the new TSS after syscall completes */ td->td_flags |= TDF_NEEDRESCHED; mtx_unlock_spin(&sched_lock); return 0; } static int i386_set_ioperm(td, args) struct thread *td; char *args; { int i, error; struct i386_ioperm_args ua; char *iomap; if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) return (error); #ifdef MAC if ((error = mac_check_sysarch_ioperm(td->td_ucred)) != 0) return (error); #endif if ((error = suser(td)) != 0) return (error); if ((error = securelevel_gt(td->td_ucred, 0)) != 0) return (error); /* * XXX * While this is restricted to root, we should probably figure out * whether any other driver is using this i/o address, as so not to * cause confusion. This probably requires a global 'usage registry'. */ if (td->td_pcb->pcb_ext == 0) if ((error = i386_extend_pcb(td)) != 0) return (error); iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; if (ua.start + ua.length > IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); for (i = ua.start; i < ua.start + ua.length; i++) { if (ua.enable) iomap[i >> 3] &= ~(1 << (i & 7)); else iomap[i >> 3] |= (1 << (i & 7)); } return (error); } static int i386_get_ioperm(td, args) struct thread *td; char *args; { int i, state, error; struct i386_ioperm_args ua; char *iomap; if ((error = copyin(args, &ua, sizeof(struct i386_ioperm_args))) != 0) return (error); if (ua.start >= IOPAGES * PAGE_SIZE * NBBY) return (EINVAL); if (td->td_pcb->pcb_ext == 0) { ua.length = 0; goto done; } iomap = (char *)td->td_pcb->pcb_ext->ext_iomap; i = ua.start; state = (iomap[i >> 3] >> (i & 7)) & 1; ua.enable = !state; ua.length = 1; for (i = ua.start + 1; i < IOPAGES * PAGE_SIZE * NBBY; i++) { if (state != ((iomap[i >> 3] >> (i & 7)) & 1)) break; ua.length++; } done: error = copyout(&ua, args, sizeof(struct i386_ioperm_args)); return (error); } /* * Update the GDT entry pointing to the LDT to point to the LDT of the * current process. * * This must be called with sched_lock held. Unfortunately, we can't use a * mtx_assert() here because cpu_switch() calls this function after changing * curproc but before sched_lock's owner is updated in mi_switch(). */ void set_user_ldt(struct mdproc *mdp) { struct proc_ldt *pldt; pldt = mdp->md_ldt; #ifdef SMP gdt[PCPU_GET(cpuid) * NGDT + GUSERLDT_SEL].sd = pldt->ldt_sd; #else gdt[GUSERLDT_SEL].sd = pldt->ldt_sd; #endif lldt(GSEL(GUSERLDT_SEL, SEL_KPL)); PCPU_SET(currentldt, GSEL(GUSERLDT_SEL, SEL_KPL)); } #ifdef SMP static void set_user_ldt_rv(struct thread *td) { if (td->td_proc != curthread->td_proc) return; mtx_lock_spin(&sched_lock); set_user_ldt(&td->td_proc->p_md); mtx_unlock_spin(&sched_lock); } #endif /* * Must be called with either sched_lock free or held but not recursed. * If it does not return NULL, it will return with it owned. 
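/*
 * [Editorial sketch -- not part of this commit.]  In the TSS I/O permission
 * bitmap a *cleared* bit grants access to a port, so i386_set_ioperm() above
 * clears "iomap[i >> 3] &= ~(1 << (i & 7))" to enable a port and sets the
 * bit to disable it.  A standalone demonstration on a small bitmap; the port
 * range is an assumption:
 */
#include <stdio.h>
#include <string.h>

int
main(void)
{
	unsigned char iomap[8];			/* covers ports 0..63 here */
	unsigned start = 0x20, length = 4, i;

	memset(iomap, 0xff, sizeof(iomap));	/* all ports denied */
	for (i = start; i < start + length; i++)
		iomap[i >> 3] &= ~(1 << (i & 7));	/* grant access */

	for (i = start - 1; i <= start + length; i++)
		printf("port 0x%x %s\n", i,
		    (iomap[i >> 3] >> (i & 7)) & 1 ? "denied" : "allowed");
	return (0);
}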
*/ struct proc_ldt * user_ldt_alloc(struct mdproc *mdp, int len) { struct proc_ldt *pldt, *new_ldt; if (mtx_owned(&sched_lock)) mtx_unlock_spin(&sched_lock); mtx_assert(&sched_lock, MA_NOTOWNED); MALLOC(new_ldt, struct proc_ldt *, sizeof(struct proc_ldt), M_SUBPROC, M_WAITOK); new_ldt->ldt_len = len = NEW_MAX_LD(len); new_ldt->ldt_base = (caddr_t)kmem_alloc(kernel_map, len * sizeof(union descriptor)); if (new_ldt->ldt_base == NULL) { FREE(new_ldt, M_SUBPROC); return NULL; } new_ldt->ldt_refcnt = 1; new_ldt->ldt_active = 0; mtx_lock_spin(&sched_lock); gdt_segs[GUSERLDT_SEL].ssd_base = (unsigned)new_ldt->ldt_base; gdt_segs[GUSERLDT_SEL].ssd_limit = len * sizeof(union descriptor) - 1; ssdtosd(&gdt_segs[GUSERLDT_SEL], &new_ldt->ldt_sd); if ((pldt = mdp->md_ldt)) { if (len > pldt->ldt_len) len = pldt->ldt_len; bcopy(pldt->ldt_base, new_ldt->ldt_base, len * sizeof(union descriptor)); } else { bcopy(ldt, new_ldt->ldt_base, sizeof(ldt)); } return new_ldt; } /* * Must be called either with sched_lock free or held but not recursed. * If md_ldt is not NULL, it will return with sched_lock released. */ void user_ldt_free(struct thread *td) { struct mdproc *mdp = &td->td_proc->p_md; struct proc_ldt *pldt = mdp->md_ldt; if (pldt == NULL) return; if (!mtx_owned(&sched_lock)) mtx_lock_spin(&sched_lock); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); if (td == PCPU_GET(curthread)) { lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); } mdp->md_ldt = NULL; if (--pldt->ldt_refcnt == 0) { mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)pldt->ldt_base, pldt->ldt_len * sizeof(union descriptor)); FREE(pldt, M_SUBPROC); } else mtx_unlock_spin(&sched_lock); } static int i386_get_ldt(td, args) struct thread *td; char *args; { int error = 0; struct proc_ldt *pldt = td->td_proc->p_md.md_ldt; int nldt, num; union descriptor *lp; struct i386_ldt_args ua, *uap = &ua; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_get_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of LDTs exist */ if ((uap->start < 0) || (uap->num <= 0)) return(EINVAL); if (pldt) { nldt = pldt->ldt_len; num = min(uap->num, nldt); lp = &((union descriptor *)(pldt->ldt_base))[uap->start]; } else { nldt = sizeof(ldt)/sizeof(ldt[0]); num = min(uap->num, nldt); lp = &ldt[uap->start]; } if (uap->start + num > nldt) return(EINVAL); error = copyout(lp, uap->descs, num * sizeof(union descriptor)); if (!error) td->td_retval[0] = num; return(error); } static int i386_set_ldt(td, args) struct thread *td; char *args; { int error = 0, i, n; int largest_ld; struct mdproc *mdp = &td->td_proc->p_md; struct proc_ldt *pldt = mdp->md_ldt; struct i386_ldt_args ua, *uap = &ua; union descriptor *descs; caddr_t old_ldt_base; int descs_size, old_ldt_len; register_t savecrit; if ((error = copyin(args, uap, sizeof(struct i386_ldt_args))) < 0) return(error); #ifdef DEBUG printf("i386_set_ldt: start=%d num=%d descs=%p\n", uap->start, uap->num, (void *)uap->descs); #endif /* verify range of descriptors to modify */ if ((uap->start < 0) || (uap->start >= MAX_LD) || (uap->num < 0) || (uap->num > MAX_LD)) { return(EINVAL); } largest_ld = uap->start + uap->num - 1; if (largest_ld >= MAX_LD) return(EINVAL); /* allocate user ldt */ if (!pldt || largest_ld >= pldt->ldt_len) { struct proc_ldt *new_ldt = user_ldt_alloc(mdp, largest_ld); if (new_ldt == NULL) return ENOMEM; if (pldt) { old_ldt_base = pldt->ldt_base; old_ldt_len = pldt->ldt_len; pldt->ldt_sd 
= new_ldt->ldt_sd; pldt->ldt_base = new_ldt->ldt_base; pldt->ldt_len = new_ldt->ldt_len; mtx_unlock_spin(&sched_lock); kmem_free(kernel_map, (vm_offset_t)old_ldt_base, old_ldt_len * sizeof(union descriptor)); FREE(new_ldt, M_SUBPROC); #ifndef SMP mtx_lock_spin(&sched_lock); #endif } else { mdp->md_ldt = pldt = new_ldt; #ifdef SMP mtx_unlock_spin(&sched_lock); #endif } #ifdef SMP /* signal other cpus to reload ldt */ smp_rendezvous(NULL, (void (*)(void *))set_user_ldt_rv, NULL, td); #else set_user_ldt(mdp); mtx_unlock_spin(&sched_lock); #endif } descs_size = uap->num * sizeof(union descriptor); descs = (union descriptor *)kmem_alloc(kernel_map, descs_size); if (descs == NULL) return (ENOMEM); error = copyin(&uap->descs[0], descs, descs_size); if (error) { kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (error); } /* Check descriptors for access violations */ for (i = 0, n = uap->start; i < uap->num; i++, n++) { union descriptor *dp; dp = &descs[i]; switch (dp->sd.sd_type) { case SDT_SYSNULL: /* system null */ dp->sd.sd_p = 0; break; case SDT_SYS286TSS: /* system 286 TSS available */ case SDT_SYSLDT: /* system local descriptor table */ case SDT_SYS286BSY: /* system 286 TSS busy */ case SDT_SYSTASKGT: /* system task gate */ case SDT_SYS286IGT: /* system 286 interrupt gate */ case SDT_SYS286TGT: /* system 286 trap gate */ case SDT_SYSNULL2: /* undefined by Intel */ case SDT_SYS386TSS: /* system 386 TSS available */ case SDT_SYSNULL3: /* undefined by Intel */ case SDT_SYS386BSY: /* system 386 TSS busy */ case SDT_SYSNULL4: /* undefined by Intel */ case SDT_SYS386IGT: /* system 386 interrupt gate */ case SDT_SYS386TGT: /* system 386 trap gate */ case SDT_SYS286CGT: /* system 286 call gate */ case SDT_SYS386CGT: /* system 386 call gate */ /* I can't think of any reason to allow a user proc * to create a segment of these types. They are * for OS use only. */ kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return EACCES; /*NOTREACHED*/ /* memory segment types */ case SDT_MEMEC: /* memory execute only conforming */ case SDT_MEMEAC: /* memory execute only accessed conforming */ case SDT_MEMERC: /* memory execute read conforming */ case SDT_MEMERAC: /* memory execute read accessed conforming */ /* Must be "present" if executable and conforming. */ if (dp->sd.sd_p == 0) { kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (EACCES); } break; case SDT_MEMRO: /* memory read only */ case SDT_MEMROA: /* memory read only accessed */ case SDT_MEMRW: /* memory read write */ case SDT_MEMRWA: /* memory read write accessed */ case SDT_MEMROD: /* memory read only expand dwn limit */ case SDT_MEMRODA: /* memory read only expand dwn lim accessed */ case SDT_MEMRWD: /* memory read write expand dwn limit */ case SDT_MEMRWDA: /* memory read write expand dwn lim acessed */ case SDT_MEME: /* memory execute only */ case SDT_MEMEA: /* memory execute only accessed */ case SDT_MEMER: /* memory execute read */ case SDT_MEMERA: /* memory execute read accessed */ break; default: kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return(EINVAL); /*NOTREACHED*/ } /* Only user (ring-3) descriptors may be present. 
*/ if ((dp->sd.sd_p != 0) && (dp->sd.sd_dpl != SEL_UPL)) { kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (EACCES); } } /* Fill in range */ savecrit = intr_disable(); bcopy(descs, &((union descriptor *)(pldt->ldt_base))[uap->start], uap->num * sizeof(union descriptor)); td->td_retval[0] = uap->start; intr_restore(savecrit); kmem_free(kernel_map, (vm_offset_t)descs, descs_size); return (0); } Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 116360) +++ head/sys/i386/i386/trap.c (revision 116361) @@ -1,1095 +1,1095 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate)(struct trapframe *); extern void trap(struct trapframe frame); #ifdef I386_CPU extern int trapwrite(unsigned addr); #endif extern void syscall(struct trapframe frame); static int trap_pfault(struct trapframe *, int, vm_offset_t); static void trap_fatal(struct trapframe *, vm_offset_t); void dblfault_handler(void); extern inthand_t IDTVEC(lcall_syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "", /* 7 unused */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif #ifdef DEVICE_POLLING extern u_int32_t poll_in_trap; extern int ether_poll(int count); #endif /* DEVICE_POLLING */ /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct thread *td = curthread; struct proc *p = td->td_proc; u_int sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); type = frame.tf_trapno; #ifdef DDB if (db_active) { eva = (type == T_PAGEFLT ? rcr2() : 0); trap_fatal(&frame, eva); goto out; } #endif if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. 
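 *
 * A stand-alone sketch of the user/kernel decision applied just below and
 * throughout this handler: the low two bits of the saved %cs give the
 * privilege level (3 means user), and the VM bit in EFLAGS marks vm86
 * mode.  The bit values are the architectural x86 ones; the helper names
 * are invented for this sketch.
 */

#define SKETCH_EFLAGS_VM	0x00020000u	/* vm86 mode */
#define SKETCH_EFLAGS_IF	0x00000200u	/* interrupts enabled */

static int
trapped_from_user(unsigned int cs, unsigned int eflags)
{
	return ((cs & 3) == 3 || (eflags & SKETCH_EFLAGS_VM) != 0);
}

static int
trapped_with_interrupts_off(unsigned int eflags)
{
	return ((eflags & SKETCH_EFLAGS_IF) == 0);
}

/*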
*/ if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP && frame.tf_eip != (int)cpu_switch_load_gs) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * Page faults need interrupts diasabled until later, * and we shouldn't enable interrupts while in a * critical section. */ if (type != T_PAGEFLT && td->td_critnest == 0) enable_intr(); } } eva = 0; code = frame.tf_err; if (type == T_PAGEFLT) { /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. * * If we get a page fault while in a critical section, then * it is most likely a fatal kernel page fault. The kernel * is already going to panic trying to get a sleep lock to * do the VM lookup, so just consider it a fatal trap so the * kernel can print out a useful trap message and even get * to the debugger. */ eva = rcr2(); if (td->td_critnest == 0) enable_intr(); else trap_fatal(&frame, eva); } #ifdef DEVICE_POLLING if (poll_in_trap) ether_poll(poll_in_trap); #endif /* DEVICE_POLLING */ if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) { /* user trap */ sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ #ifdef DEV_NPX ucode = npxtrap(); if (ucode == -1) goto userout; #else ucode = code; #endif i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i == 0) goto user; break; } /* FALLTHROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE, eva); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * The f00f hack workaround has triggered, so * treat the fault as an illegal instruction * (T_PRIVINFLT) instead of a page fault. */ type = frame.tf_trapno = T_PRIVINFLT; /* Proceed as in that case. */ ucode = type; i = SIGILL; break; } #endif if (i == -1) goto userout; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto userout; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... 
going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto userout; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto userout; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto userout; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; case T_XMMFLT: /* SIMD floating-point exception */ ucode = 0; /* XXX */ i = SIGFPE; break; } } else { /* kernel trap */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE, eva); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { i = vm86_emulate((struct vm86frame *)&frame); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; #if 0 PROC_LOCK(p); psignal(p, SIGBUS); PROC_UNLOCK(p); #endif goto out; } if (td->td_intr_nesting_level != 0) break; /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. 
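 *
 * The recovery idiom used above for doreti_iret and friends -- redirect a
 * faulting %eip to a known fixup label -- can be sketched as a small
 * lookup table.  The structure and function names are invented here; the
 * kernel hard-codes the handful of doreti_* labels instead.
 */

#include <stddef.h>
#include <stdint.h>

struct eip_fixup {
	uintptr_t fault_eip;		/* instruction that may fault */
	uintptr_t recover_eip;		/* where to resume instead */
};

static uintptr_t
lookup_fixup(const struct eip_fixup *tab, size_t n, uintptr_t eip)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (tab[i].fault_eip == eip)
			return (tab[i].recover_eip);
	return (0);			/* no fixup: treat the fault as fatal */
}

/*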
*/ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(lcall_syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(lcall_syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * FALLTHROUGH (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALLTHROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } trap_fatal(&frame, eva); goto out; } /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(td, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif user: userret(td, &frame, sticks); mtx_assert(&Giant, MA_NOTOWNED); userout: #ifdef DIAGNOSTIC cred_free_thread(td); #endif out: return; } static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct thread *td = curthread; struct proc *p = td->td_proc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
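 *
 * A minimal sketch of the routing decision made here: the fault address
 * selects either the kernel map or the faulting process's map, and a
 * user-mode fault on a kernel address is refused outright.  The KERNBASE
 * value below is only an illustrative split point.
 */

#include <stdint.h>

#define SKETCH_KERNBASE	0xc0000000u	/* illustrative user/kernel split */

enum fault_target { FAULT_REFUSE, FAULT_KERNEL_MAP, FAULT_PROCESS_MAP };

static enum fault_target
classify_fault(uint32_t va, int usermode)
{
	if (va >= SKETCH_KERNBASE)
		return (usermode ? FAULT_REFUSE : FAULT_KERNEL_MAP);
	return (FAULT_PROCESS_MAP);
}

/*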
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (td->td_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic("%s", trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
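 *
 * The fault-code decoding printed by trap_fatal() above stands on its own:
 * the three low bits of the x86 page-fault error code mean "protection
 * violation" (bit 0), "write access" (bit 1) and "user mode" (bit 2).
 */

#include <stdio.h>

static void
print_pf_error_code(unsigned int code)
{
	printf("fault code = %s %s, %s\n",
	    (code & 0x4) ? "user" : "supervisor",
	    (code & 0x2) ? "write" : "read",
	    (code & 0x1) ? "protection violation" : "page not present");
}

/*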
* * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif panic("double fault"); } #ifdef I386_CPU /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * it the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct thread *td; struct proc *p; vm_offset_t va; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); td = curthread; p = td->td_proc; vm = p->p_vmspace; PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); if (rv != KERN_SUCCESS) return 1; return (0); } #endif /* * syscall - system call request C handler * * A system call is essentially treated as a trap. */ void syscall(frame) struct trapframe frame; { caddr_t params; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_tf_eflags; u_int sticks; int error; int narg; int args[8]; u_int code; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ PCPU_LAZY_INC(cnt.v_syscall); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); /* try to stabilize the system XXX */ panic("syscall"); /* NOT REACHED */ mtx_unlock(&Giant); } #endif sticks = td->td_sticks; td->td_frame = &frame; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; orig_tf_eflags = frame.tf_eflags; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (params != NULL && narg != 0) error = copyin(params, (caddr_t)args, (u_int)(narg * sizeof(int))); else error = 0; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, args); #endif /* * Try to run the syscall without Giant if the syscall * is MP safe. 
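 *
 * A user-space sketch of the indirect-syscall decoding above: normally the
 * number arrives in %eax and the arguments start at the first stack word,
 * but the two indirection forms pull the real number from the stack first,
 * one int wide or one quad wide (to keep the remaining arguments
 * quad-aligned).  The values 0 and 198 stand in for SYS_syscall and
 * SYS___syscall.
 */

#include <stdint.h>
#include <string.h>

static unsigned int
decode_syscall(unsigned int code, const char *params, const char **argp)
{
	if (code == 0) {			/* "syscall" indirection */
		memcpy(&code, params, sizeof(int));
		params += sizeof(int);
	} else if (code == 198) {		/* "__syscall" indirection */
		memcpy(&code, params, sizeof(int));
		params += sizeof(int64_t);
	}
	*argp = params;				/* arguments start here */
	return (code);
}

/*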
*/ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); error = (*callp->sy_call)(td, args); } switch (error) { case 0: frame.tf_eax = td->td_retval[0]; frame.tf_edx = td->td_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Traced syscall. */ if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) { frame.tf_eflags &= ~PSL_T; trapsignal(td, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(td, &frame, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/i386/linux/linux_sysvec.c =================================================================== --- head/sys/i386/linux/linux_sysvec.c (revision 116360) +++ head/sys/i386/linux/linux_sysvec.c (revision 116361) @@ -1,973 +1,973 @@ /*- * Copyright (c) 1994-1996 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #ifndef COMPAT_43 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!" 
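
/*
 * A compact sketch of the return disposition in the i386 syscall() above:
 * success stores the result and clears the carry flag, ERESTART rewinds
 * the PC by the size of the trapping instruction (7 for the lcall form, 2
 * for int 0x80, stashed in tf_err on entry), EJUSTRETURN leaves the frame
 * untouched, and anything else is reported as an errno with carry set.
 * The frame layout and error constants below are stand-ins, not the
 * kernel's struct trapframe or errno.h values.
 */

#define SKETCH_CARRY		0x1u
#define SKETCH_ERESTART		(-1)
#define SKETCH_EJUSTRETURN	(-2)

struct sketch_frame {
	unsigned int eax, edx, eip, eflags, err;
};

static void
dispose_return(struct sketch_frame *f, int error, unsigned int r0,
    unsigned int r1)
{
	switch (error) {
	case 0:
		f->eax = r0;
		f->edx = r1;
		f->eflags &= ~SKETCH_CARRY;
		break;
	case SKETCH_ERESTART:
		f->eip -= f->err;	/* re-execute the trapping instruction */
		break;
	case SKETCH_EJUSTRETURN:
		break;
	default:
		f->eax = (unsigned int)error;
		f->eflags |= SKETCH_CARRY;
		break;
	}
}
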
#endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(linux, 1); MODULE_DEPEND(linux, sysvmsg, 1, 1, 1); MODULE_DEPEND(linux, sysvsem, 1, 1, 1); MODULE_DEPEND(linux, sysvshm, 1, 1, 1); MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures"); #if BYTE_ORDER == LITTLE_ENDIAN #define SHELLMAGIC 0x2123 /* #! */ #else #define SHELLMAGIC 0x2321 #endif /* * Allow the sendsig functions to use the ldebug() facility * even though they are not syscalls themselves. Map them * to syscall 0. This is slightly less bogus than using * ldebug(sigreturn). */ #define LINUX_SYS_linux_rt_sendsig 0 #define LINUX_SYS_linux_sendsig 0 extern char linux_sigcode[]; extern int linux_szsigcode; extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); static int linux_fixup(register_t **stack_base, struct image_params *iparams); static int elf_linux_fixup(register_t **stack_base, struct image_params *iparams); static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params); static void linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code); static void exec_linux_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings); /* * Linux syscalls return negative errno's, we do positive and map them */ static int bsd_to_linux_errno[ELAST + 1] = { -0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, -6, -6, -43, -42, -75, -6, -84 }; int bsd_to_linux_signal[LINUX_SIGTBLSZ] = { LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL, LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE, LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS, LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG, LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD, LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU, LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH, 0, LINUX_SIGUSR1, LINUX_SIGUSR2 }; int linux_to_bsd_signal[LINUX_SIGTBLSZ] = { SIGHUP, SIGINT, SIGQUIT, SIGILL, SIGTRAP, SIGABRT, SIGBUS, SIGFPE, SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2, SIGPIPE, SIGALRM, SIGTERM, SIGBUS, SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU, SIGURG, SIGXCPU, SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH, SIGIO, SIGURG, SIGSYS }; #define LINUX_T_UNKNOWN 255 static int _bsd_to_linux_trapcode[] = { LINUX_T_UNKNOWN, /* 0 */ 6, /* 1 T_PRIVINFLT */ LINUX_T_UNKNOWN, /* 2 */ 3, /* 3 T_BPTFLT */ LINUX_T_UNKNOWN, /* 4 */ LINUX_T_UNKNOWN, /* 5 */ 16, /* 6 T_ARITHTRAP */ 254, /* 7 T_ASTFLT */ LINUX_T_UNKNOWN, /* 8 */ 13, /* 9 T_PROTFLT */ 1, /* 10 T_TRCTRAP */ LINUX_T_UNKNOWN, /* 11 */ 14, /* 12 T_PAGEFLT */ LINUX_T_UNKNOWN, /* 13 */ 17, /* 14 T_ALIGNFLT */ LINUX_T_UNKNOWN, /* 15 */ LINUX_T_UNKNOWN, /* 16 */ LINUX_T_UNKNOWN, /* 17 */ 0, /* 18 T_DIVIDE */ 2, /* 19 T_NMI */ 4, /* 20 T_OFLOW */ 5, /* 21 T_BOUND */ 7, /* 22 T_DNA */ 8, /* 23 T_DOUBLEFLT */ 9, /* 24 T_FPOPFLT */ 10, /* 25 T_TSSFLT */ 11, /* 26 T_SEGNPFLT */ 12, /* 
27 T_STKFLT */ 18, /* 28 T_MCHK */ 19, /* 29 T_XMMFLT */ 15 /* 30 T_RESERVED */ }; #define bsd_to_linux_trapcode(code) \ ((code)argc + 1); (*stack_base)--; **stack_base = (intptr_t)(void *)envp; (*stack_base)--; **stack_base = (intptr_t)(void *)argv; (*stack_base)--; **stack_base = imgp->argc; return 0; } static int elf_linux_fixup(register_t **stack_base, struct image_params *imgp) { Elf32_Auxargs *args; register_t *pos; KASSERT(curthread->td_proc == imgp->proc && - (curthread->td_proc->p_flag & P_THREADED) == 0, + (curthread->td_proc->p_flag & P_SA) == 0, ("unsafe elf_linux_fixup(), should be curproc")); args = (Elf32_Auxargs *)imgp->auxargs; pos = *stack_base + (imgp->argc + imgp->envc + 2); if (args->trace) AUXARGS_ENTRY(pos, AT_DEBUG, 1); if (args->execfd != -1) AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); AUXARGS_ENTRY(pos, AT_PHENT, args->phent); AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); AUXARGS_ENTRY(pos, AT_BASE, args->base); AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); AUXARGS_ENTRY(pos, AT_NULL, 0); free(imgp->auxargs, M_TEMP); imgp->auxargs = NULL; (*stack_base)--; **stack_base = (register_t)imgp->argc; return 0; } extern int _ucodesel, _udatasel; extern unsigned long linux_sznonrtsigcode; static void linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_rt_sigframe *fp, frame; int oonstack; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_rt_sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct l_rt_sigframe)); } else fp = (struct l_rt_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; frame.sf_siginfo = &fp->sf_si; frame.sf_ucontext = &fp->sf_sc; /* Fill in POSIX parts */ frame.sf_si.lsi_signo = sig; frame.sf_si.lsi_code = code; frame.sf_si.lsi_addr = (void *)regs->tf_err; /* * Build the signal context to be used by sigreturn. */ frame.sf_sc.uc_flags = 0; /* XXX ??? */ frame.sf_sc.uc_link = NULL; /* XXX ??? */ frame.sf_sc.uc_stack.ss_sp = p->p_sigstk.ss_sp; frame.sf_sc.uc_stack.ss_size = p->p_sigstk.ss_size; frame.sf_sc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? 
LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; PROC_UNLOCK(p); bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask); frame.sf_sc.uc_mcontext.sc_mask = frame.sf_sc.uc_sigmask.__bits[0]; frame.sf_sc.uc_mcontext.sc_gs = rgs(); frame.sf_sc.uc_mcontext.sc_fs = regs->tf_fs; frame.sf_sc.uc_mcontext.sc_es = regs->tf_es; frame.sf_sc.uc_mcontext.sc_ds = regs->tf_ds; frame.sf_sc.uc_mcontext.sc_edi = regs->tf_edi; frame.sf_sc.uc_mcontext.sc_esi = regs->tf_esi; frame.sf_sc.uc_mcontext.sc_ebp = regs->tf_ebp; frame.sf_sc.uc_mcontext.sc_ebx = regs->tf_ebx; frame.sf_sc.uc_mcontext.sc_edx = regs->tf_edx; frame.sf_sc.uc_mcontext.sc_ecx = regs->tf_ecx; frame.sf_sc.uc_mcontext.sc_eax = regs->tf_eax; frame.sf_sc.uc_mcontext.sc_eip = regs->tf_eip; frame.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_eflags; frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.uc_mcontext.sc_ss = regs->tf_ss; frame.sf_sc.uc_mcontext.sc_err = regs->tf_err; frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), frame.sf_sc.uc_stack.ss_flags, p->p_sigstk.ss_sp, p->p_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask); #endif if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG if (ldebug(rt_sendsig)) printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"), fp, oonstack); #endif PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode) + linux_sznonrtsigcode; regs->tf_eflags &= ~(PSL_T | PSL_VM); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ static void linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code) { struct thread *td = curthread; struct proc *p = td->td_proc; struct sigacts *psp; struct trapframe *regs; struct l_sigframe *fp, frame; l_sigset_t lmask; int oonstack, i; PROC_LOCK_ASSERT(p, MA_OWNED); psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ linux_rt_sendsig(catcher, sig, mask, code); return; } regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); #ifdef DEBUG if (ldebug(sendsig)) printf(ARGS(sendsig, "%p, %d, %p, %lu"), catcher, sig, (void*)mask, code); #endif /* * Allocate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct l_sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct l_sigframe)); } else fp = (struct l_sigframe *)regs->tf_esp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Build the argument list for the signal handler. */ if (p->p_sysent->sv_sigtbl) if (sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; bzero(&frame, sizeof(frame)); frame.sf_handler = catcher; frame.sf_sig = sig; bsd_to_linux_sigset(mask, &lmask); /* * Build the signal context to be used by sigreturn. 
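 *
 * The frame-placement rule shared by both sendsig variants above, as a
 * stand-alone helper: when an alternate stack is configured, requested for
 * this signal, and not already in use, the frame is carved from the top of
 * that stack; otherwise it is pushed immediately below the interrupted
 * stack pointer.  The parameter names are invented for this sketch.
 */

#include <stddef.h>
#include <stdint.h>

static uintptr_t
place_sigframe(uintptr_t sp, uintptr_t altstack_base, size_t altstack_size,
    size_t framesize, int use_altstack, int already_on_altstack)
{
	if (use_altstack && !already_on_altstack)
		return (altstack_base + altstack_size - framesize);
	return (sp - framesize);
}

/*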
*/ frame.sf_sc.sc_mask = lmask.__bits[0]; frame.sf_sc.sc_gs = rgs(); frame.sf_sc.sc_fs = regs->tf_fs; frame.sf_sc.sc_es = regs->tf_es; frame.sf_sc.sc_ds = regs->tf_ds; frame.sf_sc.sc_edi = regs->tf_edi; frame.sf_sc.sc_esi = regs->tf_esi; frame.sf_sc.sc_ebp = regs->tf_ebp; frame.sf_sc.sc_ebx = regs->tf_ebx; frame.sf_sc.sc_edx = regs->tf_edx; frame.sf_sc.sc_ecx = regs->tf_ecx; frame.sf_sc.sc_eax = regs->tf_eax; frame.sf_sc.sc_eip = regs->tf_eip; frame.sf_sc.sc_cs = regs->tf_cs; frame.sf_sc.sc_eflags = regs->tf_eflags; frame.sf_sc.sc_esp_at_signal = regs->tf_esp; frame.sf_sc.sc_ss = regs->tf_ss; frame.sf_sc.sc_err = regs->tf_err; frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code); for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) frame.sf_extramask[i] = lmask.__bits[i+1]; if (copyout(&frame, fp, sizeof(frame)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); } /* * Build context to run handler in. */ regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~(PSL_T | PSL_VM); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args) { struct proc *p = td->td_proc; struct l_sigframe frame; struct trapframe *regs; l_sigset_t lmask; int eflags, i; regs = td->td_frame; #ifdef DEBUG if (ldebug(sigreturn)) printf(ARGS(sigreturn, "%p"), (void *)args->sfp); #endif /* * The trampoline code hands us the sigframe. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->sfp, &frame, sizeof(frame)) != 0) return (EFAULT); /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = frame.sf_sc.sc_eflags; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(frame.sf_sc.sc_cs)) { trapsignal(td, SIGBUS, T_PROTFLT); return(EINVAL); } lmask.__bits[0] = frame.sf_sc.sc_mask; for (i = 0; i < (LINUX_NSIG_WORDS-1); i++) lmask.__bits[i+1] = frame.sf_extramask[i]; PROC_LOCK(p); linux_to_bsd_sigset(&lmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); /* * Restore signal context. */ /* %gs was restored by the trampoline. 
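 *
 * The two sigreturn sanity checks above, in stand-alone form: only bits in
 * a user-changeable mask may differ from the live EFLAGS, and the restored
 * %cs must be a ring-3 selector.  The mask value below is a placeholder,
 * not the kernel's PSL_USERCHANGE definition.
 */

#define SKETCH_USERCHANGE	0x00000dd5u	/* illustrative mask only */

static int
eflags_secure(unsigned int new_ef, unsigned int old_ef)
{
	return (((new_ef ^ old_ef) & ~SKETCH_USERCHANGE) == 0);
}

static int
cs_secure(unsigned int cs)
{
	return ((cs & 3) == 3);		/* RPL must be user (ring 3) */
}

/*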
*/ regs->tf_fs = frame.sf_sc.sc_fs; regs->tf_es = frame.sf_sc.sc_es; regs->tf_ds = frame.sf_sc.sc_ds; regs->tf_edi = frame.sf_sc.sc_edi; regs->tf_esi = frame.sf_sc.sc_esi; regs->tf_ebp = frame.sf_sc.sc_ebp; regs->tf_ebx = frame.sf_sc.sc_ebx; regs->tf_edx = frame.sf_sc.sc_edx; regs->tf_ecx = frame.sf_sc.sc_ecx; regs->tf_eax = frame.sf_sc.sc_eax; regs->tf_eip = frame.sf_sc.sc_eip; regs->tf_cs = frame.sf_sc.sc_cs; regs->tf_eflags = eflags; regs->tf_esp = frame.sf_sc.sc_esp_at_signal; regs->tf_ss = frame.sf_sc.sc_ss; return (EJUSTRETURN); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by rt_sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * psl to gain improper privileges or to cause * a machine fault. */ int linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) { struct proc *p = td->td_proc; struct l_ucontext uc; struct l_sigcontext *context; l_stack_t *lss; stack_t ss; struct trapframe *regs; int eflags; regs = td->td_frame; #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp); #endif /* * The trampoline code hands us the ucontext. * It is unsafe to keep track of it ourselves, in the event that a * program jumps out of a signal handler. */ if (copyin(args->ucp, &uc, sizeof(uc)) != 0) return (EFAULT); context = &uc.uc_mcontext; /* * Check for security violations. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = context->sc_eflags; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) return(EINVAL); /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(context->sc_cs)) { trapsignal(td, SIGBUS, T_PROTFLT); return(EINVAL); } PROC_LOCK(p); linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); /* * Restore signal context */ /* %gs was restored by the trampoline. */ regs->tf_fs = context->sc_fs; regs->tf_es = context->sc_es; regs->tf_ds = context->sc_ds; regs->tf_edi = context->sc_edi; regs->tf_esi = context->sc_esi; regs->tf_ebp = context->sc_ebp; regs->tf_ebx = context->sc_ebx; regs->tf_edx = context->sc_edx; regs->tf_ecx = context->sc_ecx; regs->tf_eax = context->sc_eax; regs->tf_eip = context->sc_eip; regs->tf_cs = context->sc_cs; regs->tf_eflags = eflags; regs->tf_esp = context->sc_esp_at_signal; regs->tf_ss = context->sc_ss; /* * call sigaltstack & ignore results.. 
*/ lss = &uc.uc_stack; ss.ss_sp = lss->ss_sp; ss.ss_size = lss->ss_size; ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags); #ifdef DEBUG if (ldebug(rt_sigreturn)) printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%x, mask: 0x%x"), ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask); #endif (void)kern_sigaltstack(td, &ss, NULL); return (EJUSTRETURN); } /* * MPSAFE */ static void linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params) { args[0] = tf->tf_ebx; args[1] = tf->tf_ecx; args[2] = tf->tf_edx; args[3] = tf->tf_esi; args[4] = tf->tf_edi; args[5] = tf->tf_ebp; /* Unconfirmed */ *params = NULL; /* no copyin */ } /* * Dump core, into a file named as described in the comments for * expand_name(), unless the process was setuid/setgid. */ static int linux_aout_coredump(struct thread *td, struct vnode *vp, off_t limit) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vmspace *vm = p->p_vmspace; char *tempuser; int error; if (ctob((uarea_pages + kstack_pages) + vm->vm_dsize + vm->vm_ssize) >= limit) return (EFAULT); tempuser = malloc(ctob(uarea_pages + kstack_pages), M_TEMP, M_WAITOK | M_ZERO); if (tempuser == NULL) return (ENOMEM); PROC_LOCK(p); fill_kinfo_proc(p, &p->p_uarea->u_kproc); PROC_UNLOCK(p); bcopy(p->p_uarea, tempuser, sizeof(struct user)); bcopy(td->td_frame, tempuser + ctob(uarea_pages) + ((caddr_t)td->td_frame - (caddr_t)td->td_kstack), sizeof(struct trapframe)); error = vn_rdwr(UIO_WRITE, vp, (caddr_t)tempuser, ctob(uarea_pages + kstack_pages), (off_t)0, UIO_SYSSPACE, IO_UNIT, cred, NOCRED, (int *)NULL, td); free(tempuser, M_TEMP); if (error == 0) error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, (int)ctob(vm->vm_dsize), (off_t)ctob(uarea_pages + kstack_pages), UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td); if (error == 0) error = vn_rdwr_inchunks(UIO_WRITE, vp, (caddr_t)trunc_page(USRSTACK - ctob(vm->vm_ssize)), round_page(ctob(vm->vm_ssize)), (off_t)ctob(uarea_pages + kstack_pages) + ctob(vm->vm_dsize), UIO_USERSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, (int *) NULL, td); return (error); } /* * If a linux binary is exec'ing something, try this image activator * first. We override standard shell script execution in order to * be able to modify the interpreter path. We only do this if a linux * binary is doing the exec, so we do not create an EXEC module for it. */ static int exec_linux_imgact_try(struct image_params *iparams); static int exec_linux_imgact_try(struct image_params *imgp) { const char *head = (const char *)imgp->image_header; int error = -1; /* * The interpreter for shell scripts run from a linux binary needs * to be located in /compat/linux if possible in order to recursively * maintain linux path emulation. */ if (((const short *)head)[0] == SHELLMAGIC) { /* * Run our normal shell image activator. If it succeeds attempt * to use the alternate path for the interpreter. If an alternate * path is found, use our stringspace to store it. */ if ((error = exec_shell_imgact(imgp)) == 0) { char *rpath = NULL; linux_emul_find(FIRST_THREAD_IN_PROC(imgp->proc), NULL, imgp->interpreter_name, &rpath, 0); if (rpath != imgp->interpreter_name) { int len = strlen(rpath) + 1; if (len <= MAXSHELLCMDLEN) { memcpy(imgp->interpreter_name, rpath, len); } free(rpath, M_TEMP); } } } return(error); } /* * exec_setregs may initialize some registers differently than Linux * does, thus potentially confusing Linux binaries. If necessary, we * override the exec_setregs default(s) here. 
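 *
 * The SHELLMAGIC test used by the image activator above reads the first
 * two bytes of the image as one 16-bit word, which is why the constant
 * depends on byte order: '#' is 0x23 and '!' is 0x21, giving 0x2123 on a
 * little-endian host and 0x2321 on a big-endian one.  A byte-wise check,
 * sketched here, sidesteps the endianness question entirely.
 */

static int
is_shell_script(const unsigned char *image_header)
{
	return (image_header[0] == '#' && image_header[1] == '!');
}

/*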
*/ static void exec_linux_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct pcb *pcb = td->td_pcb; exec_setregs(td, entry, stack, ps_strings); /* Linux sets %gs to 0, we default to _udatasel */ pcb->pcb_gs = 0; load_gs(0); } struct sysentvec linux_sysvec = { LINUX_SYS_MAXSYSCALL, linux_sysent, 0xff, LINUX_SIGTBLSZ, bsd_to_linux_signal, ELAST + 1, bsd_to_linux_errno, translate_traps, linux_fixup, linux_sendsig, linux_sigcode, &linux_szsigcode, linux_prepsyscall, "Linux a.out", linux_aout_coredump, exec_linux_imgact_try, LINUX_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_linux_setregs }; struct sysentvec elf_linux_sysvec = { LINUX_SYS_MAXSYSCALL, linux_sysent, 0xff, LINUX_SIGTBLSZ, bsd_to_linux_signal, ELAST + 1, bsd_to_linux_errno, translate_traps, elf_linux_fixup, linux_sendsig, linux_sigcode, &linux_szsigcode, linux_prepsyscall, "Linux ELF", elf32_coredump, exec_linux_imgact_try, LINUX_MINSIGSTKSZ, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, exec_copyout_strings, exec_linux_setregs }; static Elf32_Brandinfo linux_brand = { ELFOSABI_LINUX, EM_386, "Linux", "/compat/linux", "/lib/ld-linux.so.1", &elf_linux_sysvec }; static Elf32_Brandinfo linux_glibc2brand = { ELFOSABI_LINUX, EM_386, "Linux", "/compat/linux", "/lib/ld-linux.so.2", &elf_linux_sysvec }; Elf32_Brandinfo *linux_brandlist[] = { &linux_brand, &linux_glibc2brand, NULL }; static int linux_elf_modevent(module_t mod, int type, void *data) { Elf32_Brandinfo **brandinfo; int error; struct linux_ioctl_handler **lihp; error = 0; switch(type) { case MOD_LOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_insert_brand_entry(*brandinfo) < 0) error = EINVAL; if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_register_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler installed\n"); } else printf("cannot insert Linux ELF brand handler\n"); break; case MOD_UNLOAD: for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_brand_inuse(*brandinfo)) error = EBUSY; if (error == 0) { for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; ++brandinfo) if (elf32_remove_brand_entry(*brandinfo) < 0) error = EINVAL; } if (error == 0) { SET_FOREACH(lihp, linux_ioctl_handler_set) linux_ioctl_unregister_handler(*lihp); if (bootverbose) printf("Linux ELF exec handler removed\n"); linux_mib_destroy(); } else printf("Could not deinstall ELF interpreter entry\n"); break; default: break; } return error; } static moduledata_t linux_elf_mod = { "linuxelf", linux_elf_modevent, 0 }; DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); Index: head/sys/ia64/ia64/trap.c =================================================================== --- head/sys/ia64/ia64/trap.c (revision 116360) +++ head/sys/ia64/ia64/trap.c (revision 116361) @@ -1,1216 +1,1216 @@ /* $FreeBSD$ */ /* From: src/sys/alpha/alpha/trap.c,v 1.33 */ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. 
Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif static int print_usertrap = 0; SYSCTL_INT(_machdep, OID_AUTO, print_usertrap, CTLFLAG_RW, &print_usertrap, 0, ""); extern int unaligned_fixup(struct trapframe *framep, struct thread *td); static void break_syscall(struct trapframe *tf); static void ia32_syscall(struct trapframe *framep); /* * EFI-Provided FPSWA interface (Floating Point SoftWare Assist */ /* The function entry address */ extern FPSWA_INTERFACE *fpswa_interface; /* Copy of the faulting instruction bundle */ typedef struct { u_int64_t bundle_low64; u_int64_t bundle_high64; } FPSWA_BUNDLE; /* * The fp state descriptor... tell FPSWA where the "true" copy is. * We save some registers in the trapframe, so we have to point some of * these there. 
The rest of the registers are "live" */ typedef struct { u_int64_t bitmask_low64; /* f63 - f2 */ u_int64_t bitmask_high64; /* f127 - f64 */ struct _ia64_fpreg *fp_low_preserved; /* f2 - f5 */ struct _ia64_fpreg *fp_low_volatile; /* f6 - f15 */ struct _ia64_fpreg *fp_high_preserved; /* f16 - f31 */ struct _ia64_fpreg *fp_high_volatile; /* f32 - f127 */ } FP_STATE; #ifdef WITNESS extern char *syscallnames[]; #endif static const char *ia64_vector_names[] = { "VHPT Translation", /* 0 */ "Instruction TLB", /* 1 */ "Data TLB", /* 2 */ "Alternate Instruction TLB", /* 3 */ "Alternate Data TLB", /* 4 */ "Data Nested TLB", /* 5 */ "Instruction Key Miss", /* 6 */ "Data Key Miss", /* 7 */ "Dirty-Bit", /* 8 */ "Instruction Access-Bit", /* 9 */ "Data Access-Bit", /* 10 */ "Break Instruction", /* 11 */ "External Interrupt", /* 12 */ "Reserved 13", /* 13 */ "Reserved 14", /* 14 */ "Reserved 15", /* 15 */ "Reserved 16", /* 16 */ "Reserved 17", /* 17 */ "Reserved 18", /* 18 */ "Reserved 19", /* 19 */ "Page Not Present", /* 20 */ "Key Permission", /* 21 */ "Instruction Access Rights", /* 22 */ "Data Access Rights", /* 23 */ "General Exception", /* 24 */ "Disabled FP-Register", /* 25 */ "NaT Consumption", /* 26 */ "Speculation", /* 27 */ "Reserved 28", /* 28 */ "Debug", /* 29 */ "Unaligned Reference", /* 30 */ "Unsupported Data Reference", /* 31 */ "Floating-point Fault", /* 32 */ "Floating-point Trap", /* 33 */ "Lower-Privilege Transfer Trap", /* 34 */ "Taken Branch Trap", /* 35 */ "Single Step Trap", /* 36 */ "Reserved 37", /* 37 */ "Reserved 38", /* 38 */ "Reserved 39", /* 39 */ "Reserved 40", /* 40 */ "Reserved 41", /* 41 */ "Reserved 42", /* 42 */ "Reserved 43", /* 43 */ "Reserved 44", /* 44 */ "IA-32 Exception", /* 45 */ "IA-32 Intercept", /* 46 */ "IA-32 Interrupt", /* 47 */ "Reserved 48", /* 48 */ "Reserved 49", /* 49 */ "Reserved 50", /* 50 */ "Reserved 51", /* 51 */ "Reserved 52", /* 52 */ "Reserved 53", /* 53 */ "Reserved 54", /* 54 */ "Reserved 55", /* 55 */ "Reserved 56", /* 56 */ "Reserved 57", /* 57 */ "Reserved 58", /* 58 */ "Reserved 59", /* 59 */ "Reserved 60", /* 60 */ "Reserved 61", /* 61 */ "Reserved 62", /* 62 */ "Reserved 63", /* 63 */ "Reserved 64", /* 64 */ "Reserved 65", /* 65 */ "Reserved 66", /* 66 */ "Reserved 67", /* 67 */ }; struct bitname { u_int64_t mask; const char* name; }; static void printbits(u_int64_t mask, struct bitname *bn, int count) { int i, first = 1; u_int64_t bit; for (i = 0; i < count; i++) { /* * Handle fields wider than one bit. 
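 *
 * The extraction done just below works for any contiguous mask:
 * mask & ~(mask - 1) isolates the lowest set bit, and dividing by that
 * bit shifts the field down to bit 0.  A stand-alone version (the mask
 * must be non-zero):
 */

#include <stdint.h>

static uint64_t
field_value(uint64_t word, uint64_t mask)
{
	uint64_t low = mask & ~(mask - 1);	/* lowest set bit of the mask */

	return ((word & mask) / low);
}

/*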
*/ bit = bn[i].mask & ~(bn[i].mask - 1); if (bn[i].mask > bit) { if (first) first = 0; else printf(","); printf("%s=%ld", bn[i].name, (mask & bn[i].mask) / bit); } else if (mask & bit) { if (first) first = 0; else printf(","); printf("%s", bn[i].name); } } } struct bitname psr_bits[] = { {IA64_PSR_BE, "be"}, {IA64_PSR_UP, "up"}, {IA64_PSR_AC, "ac"}, {IA64_PSR_MFL, "mfl"}, {IA64_PSR_MFH, "mfh"}, {IA64_PSR_IC, "ic"}, {IA64_PSR_I, "i"}, {IA64_PSR_PK, "pk"}, {IA64_PSR_DT, "dt"}, {IA64_PSR_DFL, "dfl"}, {IA64_PSR_DFH, "dfh"}, {IA64_PSR_SP, "sp"}, {IA64_PSR_PP, "pp"}, {IA64_PSR_DI, "di"}, {IA64_PSR_SI, "si"}, {IA64_PSR_DB, "db"}, {IA64_PSR_LP, "lp"}, {IA64_PSR_TB, "tb"}, {IA64_PSR_RT, "rt"}, {IA64_PSR_CPL, "cpl"}, {IA64_PSR_IS, "is"}, {IA64_PSR_MC, "mc"}, {IA64_PSR_IT, "it"}, {IA64_PSR_ID, "id"}, {IA64_PSR_DA, "da"}, {IA64_PSR_DD, "dd"}, {IA64_PSR_SS, "ss"}, {IA64_PSR_RI, "ri"}, {IA64_PSR_ED, "ed"}, {IA64_PSR_BN, "bn"}, {IA64_PSR_IA, "ia"}, }; static void printpsr(u_int64_t psr) { printbits(psr, psr_bits, sizeof(psr_bits)/sizeof(psr_bits[0])); } struct bitname isr_bits[] = { {IA64_ISR_CODE, "code"}, {IA64_ISR_VECTOR, "vector"}, {IA64_ISR_X, "x"}, {IA64_ISR_W, "w"}, {IA64_ISR_R, "r"}, {IA64_ISR_NA, "na"}, {IA64_ISR_SP, "sp"}, {IA64_ISR_RS, "rs"}, {IA64_ISR_IR, "ir"}, {IA64_ISR_NI, "ni"}, {IA64_ISR_SO, "so"}, {IA64_ISR_EI, "ei"}, {IA64_ISR_ED, "ed"}, }; static void printisr(u_int64_t isr) { printbits(isr, isr_bits, sizeof(isr_bits)/sizeof(isr_bits[0])); } static void printtrap(int vector, struct trapframe *framep, int isfatal, int user) { printf("\n"); printf("%s %s trap (cpu %d):\n", isfatal? "fatal" : "handled", user ? "user" : "kernel", PCPU_GET(cpuid)); printf("\n"); printf(" trap vector = 0x%x (%s)\n", vector, ia64_vector_names[vector]); printf(" cr.iip = 0x%lx\n", framep->tf_special.iip); printf(" cr.ipsr = 0x%lx (", framep->tf_special.psr); printpsr(framep->tf_special.psr); printf(")\n"); printf(" cr.isr = 0x%lx (", framep->tf_special.isr); printisr(framep->tf_special.isr); printf(")\n"); printf(" cr.ifa = 0x%lx\n", framep->tf_special.ifa); if (framep->tf_special.psr & IA64_PSR_IS) { printf(" ar.cflg = 0x%lx\n", ia64_get_cflg()); printf(" ar.csd = 0x%lx\n", ia64_get_csd()); printf(" ar.ssd = 0x%lx\n", ia64_get_ssd()); } printf(" curthread = %p\n", curthread); if (curthread != NULL) printf(" pid = %d, comm = %s\n", curthread->td_proc->p_pid, curthread->td_proc->p_comm); printf("\n"); } /* * */ int do_ast(struct trapframe *tf) { disable_intr(); while (curthread->td_flags & (TDF_ASTPENDING|TDF_NEEDRESCHED)) { enable_intr(); ast(tf); disable_intr(); } /* * Keep interrupts disabled. We return r10 as a favor to the EPC * syscall code so that it can quicky determine if the syscall * needs to be restarted or not. */ return (tf->tf_scratch.gr10); } /* * Trap is called from exception.s to handle most types of processor traps. */ /*ARGSUSED*/ void trap(int vector, struct trapframe *framep) { struct proc *p; struct thread *td; u_int64_t ucode; int i, user; u_int sticks; user = ((framep->tf_special.iip >> 61) < 5) ? 1 : 0; /* Short-circuit break instruction based system calls. */ if (vector == IA64_VEC_BREAK && framep->tf_special.ifa == 0x100000) { break_syscall(framep); return; } /* Sanitize the FP state in case the user has trashed it. 
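 *
 * The user/kernel test above leans on the ia64 region layout: the top
 * three bits of a virtual address pick one of eight regions, and this
 * kernel keeps regions 5-7 for itself, so an instruction pointer whose
 * region number is below 5 must belong to user space.  Stand-alone:
 */

#include <stdint.h>

static int
address_is_user(uint64_t va)
{
	return ((va >> 61) < 5);	/* regions 0-4: user, 5-7: kernel */
}

/*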
*/ ia64_set_fpsr(IA64_FPSR_DEFAULT); atomic_add_int(&cnt.v_trap, 1); td = curthread; p = td->td_proc; ucode = 0; if (user) { sticks = td->td_sticks; td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ KASSERT(cold || td->td_ucred != NULL, ("kernel trap doesn't have ucred")); } switch (vector) { case IA64_VEC_UNALIGNED_REFERENCE: { /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). */ if (user) { i = unaligned_fixup(framep, td); if (i == 0) goto out; ucode = framep->tf_special.ifa; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; } case IA64_VEC_FLOATING_POINT_FAULT: { FP_STATE fp_state; FPSWA_RET fpswa_ret; FPSWA_BUNDLE bundle; /* Always fatal in kernel. Should never happen. */ if (!user) goto dopanic; if (fpswa_interface == NULL) { i = SIGFPE; ucode = 0; break; } mtx_lock(&Giant); i = copyin((void *)(framep->tf_special.iip), &bundle, 16); mtx_unlock(&Giant); if (i) { i = SIGBUS; /* EFAULT, basically */ ucode = /*a0*/ 0; /* exception summary */ break; } /* f6-f15 are saved in exception_save */ fp_state.bitmask_low64 = 0xffc0; /* bits 6 - 15 */ fp_state.bitmask_high64 = 0x0; fp_state.fp_low_preserved = NULL; fp_state.fp_low_volatile = &framep->tf_scratch_fp.fr6; fp_state.fp_high_preserved = NULL; fp_state.fp_high_volatile = NULL; /* The docs are unclear. Is Fpswa reentrant? */ fpswa_ret = fpswa_interface->Fpswa(1, &bundle, &framep->tf_special.psr, &framep->tf_special.fpsr, &framep->tf_special.isr, &framep->tf_special.pr, &framep->tf_special.cfm, &fp_state); if (fpswa_ret.status == 0) { /* fixed. update ipsr and iip to next insn */ int ei; ei = (framep->tf_special.isr >> 41) & 0x03; if (ei == 0) { /* no template for this case */ framep->tf_special.psr &= ~IA64_ISR_EI; framep->tf_special.psr |= IA64_ISR_EI_1; } else if (ei == 1) { /* MFI or MFB */ framep->tf_special.psr &= ~IA64_ISR_EI; framep->tf_special.psr |= IA64_ISR_EI_2; } else if (ei == 2) { /* MMF */ framep->tf_special.psr &= ~IA64_ISR_EI; framep->tf_special.iip += 0x10; } goto out; } else if (fpswa_ret.status == -1) { printf("FATAL: FPSWA err1 %lx, err2 %lx, err3 %lx\n", fpswa_ret.err1, fpswa_ret.err2, fpswa_ret.err3); panic("fpswa fatal error on fp fault"); } else if (fpswa_ret.status > 0) { #if 0 if (fpswa_ret.status & 1) { /* * New exception needs to be raised. * If set then the following bits also apply: * & 2 -> fault was converted to a trap * & 4 -> SIMD caused the exception */ i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } #endif i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } else { panic("bad fpswa return code %lx", fpswa_ret.status); } } case IA64_VEC_FLOATING_POINT_TRAP: { FP_STATE fp_state; FPSWA_RET fpswa_ret; FPSWA_BUNDLE bundle; /* Always fatal in kernel. Should never happen. 
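When FPSWA reports success in the floating-point fault case above, the handler must step past the faulting instruction: the slot number from cr.isr selects the next slot within the current 16-byte bundle, and slot 2 wraps to slot 0 of the following bundle. A simplified sketch of that advance, using plain integers in place of the psr/iip fields and assuming the usual three slots per bundle:

	#include <stdio.h>
	#include <stdint.h>

	/* Advance an (iip, slot) pair by one instruction slot. */
	static void
	next_slot(uint64_t *iip, unsigned *slot)
	{
		if (*slot < 2)
			(*slot)++;		/* stay within the same bundle */
		else {
			*slot = 0;		/* wrap to the next 16-byte bundle */
			*iip += 16;
		}
	}

	int
	main(void)
	{
		uint64_t iip = 0x2000;
		unsigned slot = 0;
		int i;

		for (i = 0; i < 4; i++) {
			printf("iip=0x%llx slot=%u\n", (unsigned long long)iip, slot);
			next_slot(&iip, &slot);
		}
		return (0);
	}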
*/ if (!user) goto dopanic; if (fpswa_interface == NULL) { i = SIGFPE; ucode = 0; break; } mtx_lock(&Giant); i = copyin((void *)(framep->tf_special.iip), &bundle, 16); mtx_unlock(&Giant); if (i) { i = SIGBUS; /* EFAULT, basically */ ucode = /*a0*/ 0; /* exception summary */ break; } /* f6-f15 are saved in exception_save */ fp_state.bitmask_low64 = 0xffc0; /* bits 6 - 15 */ fp_state.bitmask_high64 = 0x0; fp_state.fp_low_preserved = NULL; fp_state.fp_low_volatile = &framep->tf_scratch_fp.fr6; fp_state.fp_high_preserved = NULL; fp_state.fp_high_volatile = NULL; /* The docs are unclear. Is Fpswa reentrant? */ fpswa_ret = fpswa_interface->Fpswa(0, &bundle, &framep->tf_special.psr, &framep->tf_special.fpsr, &framep->tf_special.isr, &framep->tf_special.pr, &framep->tf_special.cfm, &fp_state); if (fpswa_ret.status == 0) { /* fixed */ /* * should we increment iip like the fault case? * or has fpswa done something like normalizing a * register so that we should just rerun it? */ goto out; } else if (fpswa_ret.status == -1) { printf("FATAL: FPSWA err1 %lx, err2 %lx, err3 %lx\n", fpswa_ret.err1, fpswa_ret.err2, fpswa_ret.err3); panic("fpswa fatal error on fp trap"); } else if (fpswa_ret.status > 0) { i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } else { panic("bad fpswa return code %lx", fpswa_ret.status); } } case IA64_VEC_DISABLED_FP: { /* High FP registers are disabled. */ struct pcpu *pcpu; struct pcb *pcb; struct thread *thr; /* Always fatal in kernel. Should never happen. */ if (!user) goto dopanic; pcb = td->td_pcb; pcpu = pcb->pcb_fpcpu; #if 0 printf("XXX: td %p: highfp on cpu %p\n", td, pcpu); #endif /* * The pcpu variable holds the address of the per-CPU * structure of the CPU currently holding this threads * high FP registers (or NULL if no CPU holds these * registers). We have to interrupt that CPU and wait * for it to have saved the registers. */ if (pcpu != NULL) { thr = pcpu->pc_fpcurthread; KASSERT(thr == td, ("High FP state out of sync")); if (pcpu == pcpup) { /* * Short-circuit handling the trap when this * CPU already holds the high FP registers for * this thread. We really shouldn't get the * trap in the first place, but since it's * only a performance issue and not a * correctness issue, we emit a message for * now, enable the high FP registers and * return. */ printf("XXX: bogusly disabled high FP regs\n"); framep->tf_special.psr &= ~IA64_PSR_DFH; goto out; } #ifdef SMP /* * Interrupt the other CPU so that it saves the high * FP registers of this thread. Note that this can * only happen for the SMP case. */ ipi_send(pcpu->pc_lid, IPI_HIGH_FP); #endif #ifdef DIAGNOSTICS } else { KASSERT(PCPU_GET(fpcurthread) != td, ("High FP state out of sync")); #endif } thr = PCPU_GET(fpcurthread); #if 0 printf("XXX: cpu %p: highfp belongs to td %p\n", pcpup, thr); #endif /* * The thr variable holds the thread that owns the high FP * registers currently on this CPU. Free this CPU so that * we can load the current threads high FP registers. */ if (thr != NULL) { KASSERT(thr != td, ("High FP state out of sync")); pcb = thr->td_pcb; KASSERT(pcb->pcb_fpcpu == pcpup, ("High FP state out of sync")); ia64_highfp_save(thr); } /* * Wait for the other CPU to have saved out high FP * registers (if applicable). 
*/ while (pcpu && pcpu->pc_fpcurthread == td); ia64_highfp_load(td); framep->tf_special.psr &= ~IA64_PSR_DFH; goto out; break; } case IA64_VEC_PAGE_NOT_PRESENT: case IA64_VEC_INST_ACCESS_RIGHTS: case IA64_VEC_DATA_ACCESS_RIGHTS: { vm_offset_t va; struct vmspace *vm; vm_map_t map; vm_prot_t ftype; int rv; rv = 0; va = framep->tf_special.ifa; /* * If it was caused by fuswintr or suswintr, just punt. Note * that we check the faulting address against the address * accessed by [fs]uswintr, in case another fault happens when * they are running. */ if (!user && td != NULL && td->td_pcb->pcb_accessaddr == va && td->td_pcb->pcb_onfault == (unsigned long)fswintrberr) { framep->tf_special.iip = td->td_pcb->pcb_onfault; framep->tf_special.psr &= ~IA64_PSR_RI; td->td_pcb->pcb_onfault = 0; goto out; } va = trunc_page((vm_offset_t)va); if (va >= VM_MIN_KERNEL_ADDRESS) { /* * Don't allow user-mode faults for kernel virtual * addresses */ if (user) goto no_fault_in; map = kernel_map; } else { vm = (p != NULL) ? p->p_vmspace : NULL; if (vm == NULL) goto no_fault_in; map = &vm->vm_map; } if (framep->tf_special.isr & IA64_ISR_X) ftype = VM_PROT_EXECUTE; else if (framep->tf_special.isr & IA64_ISR_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or * stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) goto out; no_fault_in: /* * Additionally check the privilege level. We don't want to * panic when we're in the gateway page, running at user * level. This happens for the signal trampolines. Note that * when that happens, user is defined as 0 above. We need to * set user to 1 to force calling userret() and do_ast(). */ if (!TRAPF_USERMODE(framep)) { /* Check for copyin/copyout fault. */ if (td != NULL && td->td_pcb->pcb_onfault != 0) { framep->tf_special.iip = td->td_pcb->pcb_onfault; framep->tf_special.psr &= ~IA64_PSR_RI; td->td_pcb->pcb_onfault = 0; goto out; } goto dopanic; } else user = 1; ucode = va; i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; break; } case IA64_VEC_BREAK: case IA64_VEC_DEBUG: case IA64_VEC_SINGLE_STEP_TRAP: case IA64_VEC_TAKEN_BRANCH_TRAP: { /* * These are always fatal in kernel, and should never happen. */ if (!user) { #ifdef DDB /* * ...unless, of course, DDB is configured. */ if (kdb_trap(vector, framep)) return; /* * If we get here, DDB did _not_ handle the * trap, and we need to PANIC! 
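The page-fault path above derives the access type being requested from cr.isr: the X bit means an instruction fetch, the W bit a store, and anything else is treated as a load. A small user-space stand-in for that classification is below; ISR_X and ISR_W are hypothetical placeholders for the real IA64_ISR_* masks defined in the ia64 headers.

	#include <stdio.h>
	#include <stdint.h>

	/* Hypothetical stand-ins for IA64_ISR_X and IA64_ISR_W. */
	#define ISR_X	(1ULL << 32)
	#define ISR_W	(1ULL << 33)

	enum fault_type { FAULT_READ, FAULT_WRITE, FAULT_EXECUTE };

	static enum fault_type
	classify(uint64_t isr)
	{
		if (isr & ISR_X)
			return (FAULT_EXECUTE);	/* instruction fetch */
		if (isr & ISR_W)
			return (FAULT_WRITE);	/* store */
		return (FAULT_READ);		/* load, or anything else */
	}

	int
	main(void)
	{
		static const char *name[] = { "read", "write", "execute" };

		printf("%s\n", name[classify(ISR_W)]);
		return (0);
	}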
*/ #endif goto dopanic; } i = SIGTRAP; break; } case IA64_VEC_GENERAL_EXCEPTION: { if (user) { ucode = vector; i = SIGILL; break; } goto dopanic; } case IA64_VEC_UNSUPP_DATA_REFERENCE: case IA64_VEC_LOWER_PRIVILEGE_TRANSFER: { if (user) { ucode = vector; i = SIGBUS; break; } goto dopanic; } case IA64_VEC_IA32_EXCEPTION: { u_int64_t isr = framep->tf_special.isr; switch ((isr >> 16) & 0xffff) { case IA32_EXCEPTION_DIVIDE: ucode = FPE_INTDIV; i = SIGFPE; break; case IA32_EXCEPTION_DEBUG: case IA32_EXCEPTION_BREAK: i = SIGTRAP; break; case IA32_EXCEPTION_OVERFLOW: ucode = FPE_INTOVF; i = SIGFPE; break; case IA32_EXCEPTION_BOUND: ucode = FPE_FLTSUB; i = SIGFPE; break; case IA32_EXCEPTION_DNA: ucode = 0; i = SIGFPE; break; case IA32_EXCEPTION_NOT_PRESENT: case IA32_EXCEPTION_STACK_FAULT: case IA32_EXCEPTION_GPFAULT: ucode = (isr & 0xffff) + BUS_SEGM_FAULT; i = SIGBUS; break; case IA32_EXCEPTION_FPERROR: ucode = 0; /* XXX */ i = SIGFPE; break; case IA32_EXCEPTION_ALIGNMENT_CHECK: ucode = framep->tf_special.ifa; /* VA */ i = SIGBUS; break; case IA32_EXCEPTION_STREAMING_SIMD: ucode = 0; /* XXX */ i = SIGFPE; break; default: goto dopanic; } break; } case IA64_VEC_IA32_INTERRUPT: { /* * INT n instruction - probably a syscall. */ if (((framep->tf_special.isr >> 16) & 0xffff) == 0x80) { ia32_syscall(framep); goto out; } else { ucode = (framep->tf_special.isr >> 16) & 0xffff; i = SIGILL; break; } } case IA64_VEC_IA32_INTERCEPT: { /* * Maybe need to emulate ia32 instruction. */ goto dopanic; } default: goto dopanic; } if (print_usertrap) printtrap(vector, framep, 1, user); trapsignal(td, i, ucode); out: if (user) { userret(td, framep, sticks); mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC cred_free_thread(td); #endif do_ast(framep); } return; dopanic: printtrap(vector, framep, 1, user); #ifdef DDB kdb_trap(vector, framep); #endif panic("trap"); } /* * Handle break instruction based system calls. */ void break_syscall(struct trapframe *tf) { uint64_t *bsp, *tfp; uint64_t iip, psr; int error, nargs; /* Save address of break instruction. */ iip = tf->tf_special.iip; psr = tf->tf_special.psr; /* Advance to the next instruction. */ tf->tf_special.psr += IA64_PSR_RI_1; if ((tf->tf_special.psr & IA64_PSR_RI) > IA64_PSR_RI_2) { tf->tf_special.iip += 16; tf->tf_special.psr &= ~IA64_PSR_RI; } /* * Copy the arguments on the register stack into the trapframe * to avoid having interleaved NaT collections. */ tfp = &tf->tf_scratch.gr16; nargs = tf->tf_special.cfm & 0x7f; bsp = (uint64_t*)(curthread->td_kstack + tf->tf_special.ndirty); bsp -= (((uintptr_t)bsp & 0x1ff) < (nargs << 3)) ? (nargs + 1): nargs; while (nargs--) { *tfp++ = *bsp++; if (((uintptr_t)bsp & 0x1ff) == 0x1f8) bsp++; } error = syscall(tf); if (error == ERESTART) { tf->tf_special.iip = iip; tf->tf_special.psr = psr; } do_ast(tf); } /* * Process a system call. * * See syscall.s for details as to how we get here. In order to support * the ERESTART case, we return the error to our caller. They deal with * the hairy details. 
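break_syscall() above copies the incoming arguments out of the register-stack backing store, which stores one NaT-collection word for every 63 data slots (at offset 0x1f8 within each 512-byte span). That is why the copy loop skips a slot whenever the pointer lands on that offset, and why the starting pointer is backed up by one extra slot when the arguments straddle a collection word. A standalone sketch of the forward walk, using a simulated backing-store array rather than real RSE state:

	#include <stdio.h>
	#include <stdint.h>

	#define SLOTS		256
	#define NAT_SLOT(i)	(((i) & 63) == 63)	/* one collection word per 63 data slots */

	int
	main(void)
	{
		uint64_t store[SLOTS], args[8];
		int i, n = 8, src = 60;		/* pretend the first argument is at slot 60 */

		/* Fill the simulated backing store; collection slots get a marker. */
		for (i = 0; i < SLOTS; i++)
			store[i] = NAT_SLOT(i) ? ~0ULL : (uint64_t)i;

		/* Copy n argument slots, stepping over collection words. */
		for (i = 0; i < n; i++) {
			args[i] = store[src++];
			if (NAT_SLOT(src))
				src++;		/* skip the NaT collection word */
		}

		for (i = 0; i < n; i++)
			printf("arg%d = %llu\n", i, (unsigned long long)args[i]);
		return (0);
	}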
*/ int syscall(struct trapframe *tf) { struct sysent *callp; struct proc *p; struct thread *td; u_int64_t *args; int code, error; u_int sticks; code = tf->tf_scratch.gr15; args = &tf->tf_scratch.gr16; atomic_add_int(&cnt.v_syscall, 1); td = curthread; p = td->td_proc; td->td_frame = tf; sticks = td->td_sticks; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(tf, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the ia64, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. */ code = args[0]; args++; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; /* * Try to run the syscall without Giant if the syscall is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, (callp->sy_narg & SYF_ARGMASK), args); #endif td->td_retval[0] = 0; td->td_retval[1] = 0; tf->tf_scratch.gr10 = EJUSTRETURN; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(td, args); if (error != EJUSTRETURN) { /* * Save the "raw" error code in r10. We use this to handle * syscall restarts (see do_ast()). */ tf->tf_scratch.gr10 = error; if (error == 0) { tf->tf_scratch.gr8 = td->td_retval[0]; tf->tf_scratch.gr9 = td->td_retval[1]; } else if (error != ERESTART) { if (error < p->p_sysent->sv_errsize) error = p->p_sysent->sv_errtbl[error]; /* * Translated error codes are returned in r8. User * processes use the translated error code. */ tf->tf_scratch.gr8 = error; } } /* * Release Giant if we had to get it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); userret(td, tf, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); return (error); } #include static void ia32_syscall(struct trapframe *framep) { caddr_t params; int i; struct sysent *callp; struct thread *td = curthread; struct proc *p = td->td_proc; register_t orig_eflags; u_int sticks; int error; int narg; u_int32_t args[8]; u_int64_t args64[8]; u_int code; /* * note: PCPU_LAZY_INC() can only be used if we can afford * occassional inaccuracy in the count. */ cnt.v_syscall++; sticks = td->td_sticks; td->td_frame = framep; if (td->td_ucred != p->p_ucred) cred_update_thread(td); params = (caddr_t)(framep->tf_special.sp & ((1L<<32)-1)) + sizeof(u_int32_t); code = framep->tf_scratch.gr8; /* eax */ orig_eflags = ia64_get_eflag(); if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. 
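The syscall() path above keeps two copies of the result: r10 carries the raw error (EJUSTRETURN when the registers are already set up, ERESTART when the break/EPC code should re-execute the instruction) and r8 carries either the first return value or the errno after translation. break_syscall() saves iip and psr before advancing them so that an ERESTART can simply restore them. A condensed sketch of that convention, with ordinary struct fields standing in for the trapframe and the error translation omitted:

	#include <stdio.h>

	#define ERESTART	(-1)	/* kernel-internal: re-run the instruction */
	#define EJUSTRETURN	(-2)	/* kernel-internal: registers already set up */

	struct frame {
		long	gr8;		/* return value / translated error */
		long	gr10;		/* raw error, checked by the return path */
		long	iip, psr;	/* enough state to re-execute the syscall */
	};

	static void
	finish_syscall(struct frame *tf, int error, long retval,
	    long saved_iip, long saved_psr)
	{
		tf->gr10 = error;
		if (error == 0)
			tf->gr8 = retval;
		else if (error == ERESTART) {
			tf->iip = saved_iip;	/* back up to the break/epc instruction */
			tf->psr = saved_psr;
		} else if (error != EJUSTRETURN)
			tf->gr8 = error;	/* user space sees the errno in r8 */
	}

	int
	main(void)
	{
		struct frame tf = { 0, 0, 0x2010, 0 };

		finish_syscall(&tf, ERESTART, 0, 0x2000, 0);
		printf("restart: iip=0x%lx gr10=%ld\n", tf.iip, tf.gr10);
		finish_syscall(&tf, 0, 42, 0x2000, 0);
		printf("success: gr8=%ld gr10=%ld\n", tf.gr8, tf.gr10);
		return (0);
	}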
*/ code = fuword32(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. * We use a 32-bit fetch in case params is not * aligned. */ code = fuword32(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin and the ktrsyscall()/ktrsysret() code is MP-aware */ if (params != NULL && narg != 0) error = copyin(params, (caddr_t)args, (u_int)(narg * sizeof(int))); else error = 0; for (i = 0; i < narg; i++) args64[i] = args[i]; #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, args64); #endif /* * Try to run the syscall without Giant if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = framep->tf_scratch.gr10; /* edx */ STOPEVENT(p, S_SCE, narg); error = (*callp->sy_call)(td, args64); } switch (error) { case 0: framep->tf_scratch.gr8 = td->td_retval[0]; /* eax */ framep->tf_scratch.gr10 = td->td_retval[1]; /* edx */ ia64_set_eflag(ia64_get_eflag() & ~PSL_C); break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. XXX Assume int 0x80. */ framep->tf_special.iip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_scratch.gr8 = error; ia64_set_eflag(ia64_get_eflag() | PSL_C); break; } /* * Traced syscall. */ if ((orig_eflags & PSL_T) && !(orig_eflags & PSL_VM)) { ia64_set_eflag(ia64_get_eflag() & ~PSL_T); trapsignal(td, SIGTRAP, 0); } /* * Release Giant if we previously set it. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Handle reschedule and other end-of-syscall issues */ userret(td, framep, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_clock.c =================================================================== --- head/sys/kern/kern_clock.c (revision 116360) +++ head/sys/kern/kern_clock.c (revision 116361) @@ -1,483 +1,483 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
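For IA-32 binaries the error convention in ia32_syscall() above is the native i386 one: on success eax/edx (gr8/gr10 here) hold the return values and the carry flag is cleared, on error the errno goes in eax and carry is set, and ERESTART instead rewinds the instruction pointer by the two-byte length of `int 0x80`. A small sketch of just the success/failure split, with a plain psl_c flag standing in for the real PSL_C bit of ar.eflag:

	#include <stdio.h>

	struct ia32_result {
		unsigned int	eax, edx;
		int		psl_c;		/* carry flag: set on error */
	};

	static void
	set_result(struct ia32_result *r, int error, unsigned int rv0, unsigned int rv1)
	{
		if (error == 0) {
			r->eax = rv0;
			r->edx = rv1;
			r->psl_c = 0;		/* success: clear carry */
		} else {
			r->eax = (unsigned int)error;
			r->psl_c = 1;		/* failure: errno in eax, carry set */
		}
	}

	int
	main(void)
	{
		struct ia32_result r;

		set_result(&r, 0, 42, 0);
		printf("eax=%u carry=%d\n", r.eax, r.psl_c);
		set_result(&r, 2 /* ENOENT */, 0, 0);
		printf("eax=%u carry=%d\n", r.eax, r.psl_c);
		return (0);
	}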
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef GPROF #include #endif #ifdef DEVICE_POLLING extern void hardclock_device_poll(void); #endif /* DEVICE_POLLING */ static void initclocks(void *dummy); SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) /* Some of these don't belong here, but it's easiest to concentrate them. */ long cp_time[CPUSTATES]; SYSCTL_OPAQUE(_kern, OID_AUTO, cp_time, CTLFLAG_RD, &cp_time, sizeof(cp_time), "LU", "CPU time statistics"); /* * Clock handling routines. * * This code is written to operate with two timers that run independently of * each other. * * The main timer, running hz times per second, is used to trigger interval * timers, timeouts and rescheduling as needed. * * The second timer handles kernel and user profiling, * and does resource use estimation. If the second timer is programmable, * it is randomized to avoid aliasing between the two clocks. For example, * the randomization prevents an adversary from always giving up the cpu * just before its quantum expires. Otherwise, it would never accumulate * cpu ticks. The mean frequency of the second timer is stathz. * * If no second timer exists, stathz will be zero; in this case we drive * profiling and statistics off the main clock. This WILL NOT be accurate; * do not do it unless absolutely necessary. * * The statistics clock may (or may not) be run at a higher rate while * profiling. This profile clock runs at profhz. We require that profhz * be an integral multiple of stathz. * * If the statistics clock is running fast, it must be divided by the ratio * profhz/stathz for statistics. (For profiling, every tick counts.) * * Time-of-day is maintained using a "timecounter", which may or may * not be related to the hardware generating the above mentioned * interrupts. 
*/ int stathz; int profhz; int profprocs; int ticks; int psratio; /* * Initialize clock frequencies and start both clocks running. */ /* ARGSUSED*/ static void initclocks(dummy) void *dummy; { register int i; /* * Set divisors to 1 (normal case) and let the machine-specific * code do its bit. */ cpu_initclocks(); /* * Compute profhz/stathz, and fix profhz if needed. */ i = stathz ? stathz : hz; if (profhz == 0) profhz = i; psratio = profhz / i; } /* * Each time the real-time timer fires, this function is called on all CPUs. * Note that hardclock() calls hardclock_process() for the boot CPU, so only * the other CPUs in the system need to call this function. */ void hardclock_process(frame) register struct clockframe *frame; { struct pstats *pstats; struct thread *td = curthread; struct proc *p = td->td_proc; /* * Run current process's virtual and profile time, as needed. */ mtx_lock_spin_flags(&sched_lock, MTX_QUIET); - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { /* XXXKSE What to do? */ } else { pstats = p->p_stats; if (CLKF_USERMODE(frame) && timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { p->p_sflag |= PS_ALRMPEND; td->td_flags |= TDF_ASTPENDING; } if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { p->p_sflag |= PS_PROFPEND; td->td_flags |= TDF_ASTPENDING; } } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } /* * The real-time timer, interrupting hz times per second. */ void hardclock(frame) register struct clockframe *frame; { int need_softclock = 0; CTR0(KTR_CLK, "hardclock fired"); hardclock_process(frame); tc_ticktock(); /* * If no separate statistics clock is available, run it from here. * * XXX: this only works for UP */ if (stathz == 0) { profclock(frame); statclock(frame); } #ifdef DEVICE_POLLING hardclock_device_poll(); /* this is very short and quick */ #endif /* DEVICE_POLLING */ /* * Process callouts at a very low cpu priority, so we don't keep the * relatively high clock interrupt priority any longer than necessary. */ mtx_lock_spin_flags(&callout_lock, MTX_QUIET); ticks++; if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { need_softclock = 1; } else if (softticks + 1 == ticks) ++softticks; mtx_unlock_spin_flags(&callout_lock, MTX_QUIET); /* * swi_sched acquires sched_lock, so we don't want to call it with * callout_lock held; incorrect locking order. */ if (need_softclock) swi_sched(softclock_ih, 0); } /* * Compute number of ticks in the specified amount of time. */ int tvtohz(tv) struct timeval *tv; { register unsigned long ticks; register long sec, usec; /* * If the number of usecs in the whole seconds part of the time * difference fits in a long, then the total number of usecs will * fit in an unsigned long. Compute the total and convert it to * ticks, rounding up and adding 1 to allow for the current tick * to expire. Rounding also depends on unsigned long arithmetic * to avoid overflow. * * Otherwise, if the number of ticks in the whole seconds part of * the time difference fits in a long, then convert the parts to * ticks separately and add, using similar rounding methods and * overflow avoidance. This method would work in the previous * case but it is slightly slower and assumes that hz is integral. * * Otherwise, round the time difference down to the maximum * representable value. * * If ints have 32 bits, then the maximum value for any timeout in * 10ms ticks is 248 days. 
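hardclock() above only wakes the softclock thread when the bucket for the new tick has work queued: the callout wheel is a power-of-two array of lists hashed by expiry tick, so the lookup is simply `ticks & callwheelmask`. A minimal stand-alone model of that hashing follows, with plain counters in place of callout structures; the real softclock also compares each entry's stored absolute tick, so entries that hash to the same bucket but belong to a later lap of the wheel are not run early.

	#include <stdio.h>

	#define WHEEL_SIZE	256			/* must be a power of two */
	#define WHEEL_MASK	(WHEEL_SIZE - 1)

	static int wheel[WHEEL_SIZE];			/* pending callouts per bucket */

	static void
	schedule_after(int now, int delta)
	{
		wheel[(now + delta) & WHEEL_MASK]++;	/* hash on the expiry tick */
	}

	int
	main(void)
	{
		int ticks = 1000;

		schedule_after(ticks, 5);
		schedule_after(ticks, 5 + WHEEL_SIZE);	/* same bucket, one lap later */

		/* hardclock-style check: anything hashed to the current tick? */
		for (; ticks <= 1006; ticks++)
			if (wheel[ticks & WHEEL_MASK])
				printf("tick %d: bucket %d has %d entries\n",
				    ticks, ticks & WHEEL_MASK, wheel[ticks & WHEEL_MASK]);
		return (0);
	}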
*/ sec = tv->tv_sec; usec = tv->tv_usec; if (usec < 0) { sec--; usec += 1000000; } if (sec < 0) { #ifdef DIAGNOSTIC if (usec > 0) { sec++; usec -= 1000000; } printf("tvotohz: negative time difference %ld sec %ld usec\n", sec, usec); #endif ticks = 1; } else if (sec <= LONG_MAX / 1000000) ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1; else if (sec <= LONG_MAX / hz) ticks = sec * hz + ((unsigned long)usec + (tick - 1)) / tick + 1; else ticks = LONG_MAX; if (ticks > INT_MAX) ticks = INT_MAX; return ((int)ticks); } /* * Start profiling on a process. * * Kernel profiling passes proc0 which never exits and hence * keeps the profile clock running constantly. */ void startprofclock(p) register struct proc *p; { /* * XXX; Right now sched_lock protects statclock(), but perhaps * it should be protected later on by a time_lock, which would * cover psdiv, etc. as well. */ PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_STOPPROF) return; if ((p->p_flag & P_PROFIL) == 0) { mtx_lock_spin(&sched_lock); p->p_flag |= P_PROFIL; if (++profprocs == 1) cpu_startprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Stop profiling on a process. */ void stopprofclock(p) register struct proc *p; { PROC_LOCK_ASSERT(p, MA_OWNED); if (p->p_flag & P_PROFIL) { if (p->p_profthreads != 0) { p->p_flag |= P_STOPPROF; while (p->p_profthreads != 0) msleep(&p->p_profthreads, &p->p_mtx, PPAUSE, "stopprof", NULL); p->p_flag &= ~P_STOPPROF; } mtx_lock_spin(&sched_lock); p->p_flag &= ~P_PROFIL; if (--profprocs == 0) cpu_stopprofclock(); mtx_unlock_spin(&sched_lock); } } /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. Most of the statistics are only * used by user-level statistics programs. The main exceptions are * ke->ke_uticks, p->p_sticks, p->p_iticks, and p->p_estcpu. * This should be called by all active processors. */ void statclock(frame) register struct clockframe *frame; { struct pstats *pstats; struct rusage *ru; struct vmspace *vm; struct thread *td; struct kse *ke; struct proc *p; long rss; td = curthread; p = td->td_proc; mtx_lock_spin_flags(&sched_lock, MTX_QUIET); ke = td->td_kse; if (CLKF_USERMODE(frame)) { /* * Charge the time as appropriate. */ - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_statclock(1); p->p_uticks++; if (ke->ke_ksegrp->kg_nice > NZERO) cp_time[CP_NICE]++; else cp_time[CP_USER]++; } else { /* * Came from kernel mode, so we were: * - handling an interrupt, * - doing syscall or trap work on behalf of the current * user process, or * - spinning in the idle loop. * Whichever it is, charge the time as appropriate. * Note that we charge interrupts to the current process, * regardless of whether they are ``for'' that process, * so that we know how much of its real time was spent * in ``non-process'' (i.e., interrupt) work. */ if ((td->td_ithd != NULL) || td->td_intr_nesting_level >= 2) { p->p_iticks++; cp_time[CP_INTR]++; } else { - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_statclock(0); td->td_sticks++; p->p_sticks++; if (p != PCPU_GET(idlethread)->td_proc) cp_time[CP_SYS]++; else cp_time[CP_IDLE]++; } } sched_clock(ke); /* Update resource usage integrals and maximums. 
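tvtohz() above converts a timeval into a tick count while dodging overflow: it rounds the microseconds up to whole ticks, adds one tick for the partially elapsed current tick, and falls back to coarser arithmetic (or clamps) once the seconds value is large. A simplified user-space rendition of the same computation, assuming hz = 100 and omitting the negative-usec normalization:

	#include <limits.h>
	#include <stdio.h>

	#define HZ	100
	#define TICK	(1000000 / HZ)		/* microseconds per tick */

	static int
	tv_to_ticks(long sec, long usec)
	{
		unsigned long t;

		if (sec < 0 || (sec == 0 && usec <= 0))
			return (1);		/* already past: expire on the next tick */
		if (sec <= LONG_MAX / 1000000)
			t = (sec * 1000000 + (unsigned long)usec + (TICK - 1)) / TICK + 1;
		else if (sec <= LONG_MAX / HZ)
			t = sec * HZ + ((unsigned long)usec + (TICK - 1)) / TICK + 1;
		else
			t = LONG_MAX;
		return (t > INT_MAX ? INT_MAX : (int)t);
	}

	int
	main(void)
	{
		printf("%d\n", tv_to_ticks(0, 25000));	/* 25ms -> 4 ticks (3 rounded up, +1 current) */
		printf("%d\n", tv_to_ticks(2, 0));	/* 2s   -> 201 ticks */
		return (0);
	}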
*/ if ((pstats = p->p_stats) != NULL && (ru = &pstats->p_ru) != NULL && (vm = p->p_vmspace) != NULL) { ru->ru_ixrss += pgtok(vm->vm_tsize); ru->ru_idrss += pgtok(vm->vm_dsize); ru->ru_isrss += pgtok(vm->vm_ssize); rss = pgtok(vmspace_resident_count(vm)); if (ru->ru_maxrss < rss) ru->ru_maxrss = rss; } mtx_unlock_spin_flags(&sched_lock, MTX_QUIET); } void profclock(frame) register struct clockframe *frame; { struct thread *td; #ifdef GPROF struct gmonparam *g; int i; #endif td = curthread; if (CLKF_USERMODE(frame)) { /* * Came from user mode; CPU was in user state. * If this process is being profiled, record the tick. * if there is no related user location yet, don't * bother trying to count it. */ td = curthread; if (td->td_proc->p_flag & P_PROFIL) addupc_intr(td, CLKF_PC(frame), 1); } #ifdef GPROF else { /* * Kernel statistics are just like addupc_intr, only easier. */ g = &_gmonparam; if (g->state == GMON_PROF_ON) { i = CLKF_PC(frame) - g->lowpc; if (i < g->textsize) { i /= HISTFRACTION * sizeof(*g->kcount); g->kcount[i]++; } } } #endif } /* * Return information about system clocks. */ static int sysctl_kern_clockrate(SYSCTL_HANDLER_ARGS) { struct clockinfo clkinfo; /* * Construct clockinfo structure. */ bzero(&clkinfo, sizeof(clkinfo)); clkinfo.hz = hz; clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); } SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, 0, 0, sysctl_kern_clockrate, "S,clockinfo", "Rate and period of various kernel clocks"); Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 116360) +++ head/sys/kern/kern_exec.c (revision 116361) @@ -1,1208 +1,1208 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
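profclock()'s kernel branch above is a straight PC-sample histogram: the sampled program counter is offset from lowpc, scaled down by HISTFRACTION times the counter size, and the matching counter is bumped. A small user-space model of that bucketing; the text range and scale here are made up, where the real values come from struct gmonparam.

	#include <stdio.h>
	#include <stdint.h>

	#define LOWPC		0x1000u
	#define TEXTSIZE	0x4000u
	#define HISTFRACTION	2	/* each counter covers HISTFRACTION * sizeof(counter) bytes */

	static unsigned short kcount[TEXTSIZE / (HISTFRACTION * sizeof(unsigned short))];

	static void
	sample(uintptr_t pc)
	{
		uintptr_t i = pc - LOWPC;

		if (i < TEXTSIZE) {
			i /= HISTFRACTION * sizeof(unsigned short);
			kcount[i]++;
		}
	}

	int
	main(void)
	{
		sample(0x1234);
		sample(0x1235);			/* lands in the same bucket */
		printf("bucket %u = %d\n",
		    (unsigned)((0x1234u - LOWPC) / (HISTFRACTION * sizeof(unsigned short))),
		    kcount[(0x1234u - LOWPC) / (HISTFRACTION * sizeof(unsigned short))]);
		return (0);
	}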
*/ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS); static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS); static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS); static int kern_execve(struct thread *td, char *fname, char **argv, char **envv, struct mac *mac_p); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_ps_strings, "LU", ""); /* XXX This should be vm_size_t. */ SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD, NULL, 0, sysctl_kern_usrstack, "LU", ""); SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD, NULL, 0, sysctl_kern_stackprot, "I", ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); int ps_argsopen = 1; SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, ""); #ifdef __ia64__ /* XXX HACK */ static int regstkpages = 256; SYSCTL_INT(_machdep, OID_AUTO, regstkpages, CTLFLAG_RW, ®stkpages, 0, ""); #endif static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings, sizeof(p->p_sysent->sv_psstrings))); } static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack, sizeof(p->p_sysent->sv_usrstack))); } static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS) { struct proc *p; p = curproc; return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot, sizeof(p->p_sysent->sv_stackprot))); } /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; /* * In-kernel implementation of execve(). All arguments are assumed to be * userspace pointers from the passed thread. * * MPSAFE */ static int kern_execve(td, fname, argv, envv, mac_p) struct thread *td; char *fname; char **argv; char **envv; struct mac *mac_p; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred = NULL, *oldcred; struct uidinfo *euip; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first)(struct image_params *); struct pargs *oldargs = NULL, *newargs = NULL; struct sigacts *oldsigacts, *newsigacts; #ifdef KTRACE struct vnode *tracevp = NULL; struct ucred *tracecred = NULL; #endif struct vnode *textvp = NULL; int credential_changing; int textset; #ifdef MAC struct label interplabel; /* label of the interpreted vnode */ struct label execlabel; /* optional label argument */ int will_transition, interplabelvalid = 0; #endif imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. 
*/ PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); - if (p->p_flag & P_THREADED || p->p_numthreads > 1) { + if (p->p_flag & P_SA || p->p_numthreads > 1) { if (thread_single(SINGLE_EXIT)) { PROC_UNLOCK(p); return (ERESTART); /* Try again later. */ } /* * If we get here all other threads are dead, * so unset the associated flags and lose KSE mode. */ - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; td->td_mailbox = NULL; thread_single_end(); } p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* * Initialize part of the common data */ imgp->proc = p; imgp->userspace_argv = argv; imgp->userspace_envv = envv; imgp->execlabel = NULL; imgp->attr = &attr; imgp->argc = imgp->envc = 0; imgp->argv0 = NULL; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; imgp->vp = NULL; imgp->object = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; #ifdef MAC error = mac_execve_enter(imgp, mac_p, &execlabel); if (error) { mtx_lock(&Giant); goto exec_fail; } #endif /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); if (imgp->stringbase == NULL) { error = ENOMEM; mtx_lock(&Giant); goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; imgp->image_header = imgp->stringbase + ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, fname, td); mtx_lock(&Giant); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); goto exec_fail; } imgp->vp = ndp->ni_vp; imgp->fname = fname; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) goto exec_fail_dealloc; if (VOP_GETVOBJECT(imgp->vp, &imgp->object) == 0) vm_object_reference(imgp->object); /* * Set VV_TEXT now so no one can write to the executable while we're * activating it. * * Remember if this was set before and unset it in case this is not * actually an executable image. */ textset = imgp->vp->v_vflag & VV_TEXT; imgp->vp->v_vflag |= VV_TEXT; error = exec_map_first_page(imgp); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. */ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) { if (textset == 0) imgp->vp->v_vflag &= ~VV_TEXT; error = ENOEXEC; } goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* * VV_TEXT needs to be unset for scripts. There is a short * period before we determine that something is a script where * VV_TEXT will be set. The vnode lock is held over this * entire period so nothing should illegitimately be blocked. 
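The exec path above tries each registered image activator in turn: a return of -1 means "not my format, keep looking", 0 means the image was claimed and set up, and anything else is a hard error, with ENOEXEC reported if nobody claims the image. Scripts work by having their activator set imgp->interpreted so the whole lookup restarts on the interpreter. A stripped-down model of that dispatch loop, with toy header checks in place of the real execsw entries:

	#include <errno.h>
	#include <stdio.h>

	/* Toy activators: -1 = not mine, 0 = claimed, >0 = hard error. */
	static int try_elf(const char *hdr)    { return ((hdr[0] == 0x7f) ? 0 : -1); }
	static int try_script(const char *hdr) { return ((hdr[0] == '#' && hdr[1] == '!') ? 0 : -1); }

	static int (*activators[])(const char *) = { try_elf, try_script, NULL };

	static int
	activate(const char *hdr)
	{
		int error = -1, i;

		for (i = 0; error == -1 && activators[i] != NULL; i++)
			error = activators[i](hdr);
		return (error == -1 ? ENOEXEC : error);
	}

	int
	main(void)
	{
		printf("script: %d\n", activate("#!/bin/sh"));
		printf("other:  %d\n", activate("GIF89a"));
		return (0);
	}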
*/ imgp->vp->v_vflag &= ~VV_TEXT; /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); #ifdef MAC mac_init_vnode_label(&interplabel); mac_copy_vnode_label(&ndp->ni_vp->v_label, &interplabel); interplabelvalid = 1; #endif vput(ndp->ni_vp); vm_object_deallocate(imgp->object); imgp->object = NULL; /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } /* * Copy out strings (args and env) and initialize stack base */ if (p->p_sysent->sv_copyout_strings) stack_base = (*p->p_sysent->sv_copyout_strings)(imgp); else stack_base = exec_copyout_strings(imgp); /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; tmp = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p->p_fd); fdfree(td); p->p_fd = tmp; } else FILEDESC_UNLOCK(p->p_fd); /* * Malloc things before we need locks. */ newcred = crget(); euip = uifind(attr.va_uid); i = imgp->endargs - imgp->stringbase; if (ps_arg_cache_limit >= i + sizeof(struct pargs)) newargs = pargs_alloc(i); /* close files on exec */ fdcloseexec(td); /* Get a reference to the vnode prior to locking the proc */ VREF(ndp->ni_vp); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. */ PROC_LOCK(p); if (sigacts_shared(p->p_sigacts)) { oldsigacts = p->p_sigacts; PROC_UNLOCK(p); newsigacts = sigacts_alloc(); sigacts_copy(newsigacts, oldsigacts); PROC_LOCK(p); p->p_sigacts = newsigacts; } else oldsigacts = NULL; /* Stop profiling */ stopprofclock(p); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup(p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. * * XXXMAC: For the time being, use NOSUID to also prohibit * transitions on the file system. */ oldcred = p->p_ucred; credential_changing = 0; credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid; credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid; #ifdef MAC will_transition = mac_execve_will_transition(oldcred, imgp->vp, interplabelvalid ? &interplabel : NULL, imgp); credential_changing |= will_transition; #endif if (credential_changing && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. 
*/ setsugid(p); #ifdef KTRACE if (p->p_tracevp != NULL && suser_cred(oldcred, PRISON_ROOT)) { mtx_lock(&ktrace_mtx); p->p_traceflag = 0; tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); } #endif /* * Close any file descriptors 0..2 that reference procfs, * then make sure file descriptors 0..2 are in use. * * setugidsafety() may call closef() and then pfind() * which may grab the process lock. * fdcheckstd() may call falloc() which may block to * allocate memory, so temporarily drop the process lock. */ PROC_UNLOCK(p); setugidsafety(td); error = fdcheckstd(td); if (error != 0) goto done1; PROC_LOCK(p); /* * Set the new credentials. */ crcopy(newcred, oldcred); if (attr.va_mode & VSUID) change_euid(newcred, euip); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); #ifdef MAC if (will_transition) { mac_execve_transition(oldcred, newcred, imgp->vp, interplabelvalid ? &interplabel : NULL, imgp); } #endif /* * Implement correct POSIX saved-id behavior. * * XXXMAC: Note that the current logic will save the * uid and gid if a MAC domain transition occurs, even * though maybe it shouldn't. */ change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the * saved uid/gid should only be updated if the new ruid is * not equal to the old ruid, or the new euid is not equal * to the old euid and the new euid is not equal to the old * ruid. The FreeBSD code always updates the saved uid/gid. * Also, this code uses the new (replaced) euid and egid as * the source, which may or may not be the right ones to use. */ if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { crcopy(newcred, oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); p->p_ucred = newcred; newcred = NULL; } } /* * Store the vp for use in procfs. This vnode was referenced prior * to locking the proc lock. */ textvp = p->p_textvp; p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ KNOTE(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. */ if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Free any previous argument cache */ oldargs = p->p_args; p->p_args = NULL; /* Cache arguments if they fit inside our allowance */ if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { bcopy(imgp->stringbase, newargs->ar_args, i); p->p_args = newargs; newargs = NULL; } PROC_UNLOCK(p); /* Set values passed into the program in registers. */ if (p->p_sysent->sv_setregs) (*p->p_sysent->sv_setregs)(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); else exec_setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); done1: /* * Free any resources malloc'd earlier that we didn't use. */ uifree(euip); if (newcred == NULL) crfree(oldcred); else crfree(newcred); /* * Handle deferred decrement of ref counts. 
*/ if (textvp != NULL) vrele(textvp); if (ndp->ni_vp && error != 0) vrele(ndp->ni_vp); #ifdef KTRACE if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif if (oldargs != NULL) pargs_drop(oldargs); if (newargs != NULL) pargs_drop(newargs); if (oldsigacts != NULL) sigacts_free(oldsigacts); exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage) exec_unmap_first_page(imgp); if (imgp->vp) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(imgp->vp); } if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); if (imgp->object) vm_object_deallocate(imgp->object); if (error == 0) { /* * Stop the process here if its stop event mask has * the S_EXEC bit set. */ STOPEVENT(p, S_EXEC, 0); goto done2; } exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ #ifdef MAC mac_execve_exit(imgp); if (interplabelvalid) mac_destroy_vnode_label(&interplabel); #endif exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ error = 0; } done2: #ifdef MAC mac_execve_exit(imgp); if (interplabelvalid) mac_destroy_vnode_label(&interplabel); #endif mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * MPSAFE */ int execve(td, uap) struct thread *td; struct execve_args /* { char *fname; char **argv; char **envv; } */ *uap; { return (kern_execve(td, uap->fname, uap->argv, uap->envv, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct __mac_execve_args { char *fname; char **argv; char **envv; struct mac *mac_p; }; #endif /* * MPSAFE */ int __mac_execve(td, uap) struct thread *td; struct __mac_execve_args /* { char *fname; char **argv; char **envv; struct mac *mac_p; } */ *uap; { #ifdef MAC return (kern_execve(td, uap->fname, uap->argv, uap->envv, uap->mac_p)); #else return (ENOSYS); #endif } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; GIANT_REQUIRED; if (imgp->firstpage) { exec_unmap_first_page(imgp); } VOP_GETVOBJECT(imgp->vp, &object); VM_OBJECT_LOCK(object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); vm_page_lock_queues(); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { vm_page_unlock_queues(); initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { vm_page_lock_queues(); if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) { vm_page_unlock_queues(); break; } if (ma[i]->valid) { vm_page_unlock_queues(); break; } vm_page_busy(ma[i]); vm_page_unlock_queues(); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); vm_page_lock_queues(); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { pmap_remove_all(ma[0]); vm_page_free(ma[0]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (EIO); } } VM_OBJECT_UNLOCK(object); vm_page_wire(ma[0]); vm_page_wakeup(ma[0]); vm_page_unlock_queues(); pmap_qenter((vm_offset_t)imgp->image_header, ma, 1); imgp->firstpage = ma[0]; return (0); } void exec_unmap_first_page(imgp) struct image_params *imgp; { GIANT_REQUIRED; if (imgp->firstpage) { 
pmap_qremove((vm_offset_t)imgp->image_header, 1); vm_page_lock_queues(); vm_page_unwire(imgp->firstpage, 1); vm_page_unlock_queues(); imgp->firstpage = NULL; } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp, sv) struct image_params *imgp; struct sysentvec *sv; { int error; struct proc *p = imgp->proc; struct vmspace *vmspace = p->p_vmspace; vm_offset_t stack_addr; vm_map_t map; GIANT_REQUIRED; stack_addr = sv->sv_usrstack - maxssiz; imgp->vmspace_destroyed = 1; EVENTHANDLER_INVOKE(process_exec, p); /* * Blow away entire process VM, if address space not shared, * otherwise, create a new VM space so that other threads are * not disrupted */ map = &vmspace->vm_map; if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser && vm_map_max(map) == sv->sv_maxuser) { shmexit(vmspace); vm_page_lock_queues(); pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map), vm_map_max(map)); vm_page_unlock_queues(); vm_map_remove(map, vm_map_min(map), vm_map_max(map)); } else { vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser); vmspace = p->p_vmspace; map = &vmspace->vm_map; } /* Allocate a new stack */ error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz, sv->sv_stackprot, VM_PROT_ALL, 0); if (error) return (error); #ifdef __ia64__ { /* * Allocate backing store. We really need something * similar to vm_map_stack which can allow the backing * store to grow upwards. This will do for now. */ vm_offset_t bsaddr; bsaddr = p->p_sysent->sv_usrstack - 2 * maxssiz; error = vm_map_find(map, 0, 0, &bsaddr, regstkpages * PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); FIRST_THREAD_IN_PROC(p)->td_md.md_bspstore = bsaddr; } #endif /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the * VM_STACK case, but they are still used to monitor the size of the * process stack so we can check the stack rlimit. */ vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz; return (0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error; size_t length; /* * extract arguments first */ argv = imgp->userspace_argv; if (argv) { argp = (caddr_t)(intptr_t)fuword(argv); if (argp == (caddr_t)-1) return (EFAULT); if (argp) argv++; if (imgp->argv0) argp = imgp->argv0; if (argp) { do { if (argp == (caddr_t)-1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return (E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } while ((argp = (caddr_t)(intptr_t)fuword(argv++))); } } imgp->endargs = imgp->stringp; /* * extract environment strings */ envv = imgp->userspace_envv; if (envv) { while ((envp = (caddr_t)(intptr_t)fuword(envv++))) { if (envp == (caddr_t)-1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return (E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. 
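exec_extract_strings() above walks the user argv/envv arrays with fuword()/copyinstr(), packing each NUL-terminated string into the temporary buffer and failing with E2BIG once the ARG_MAX budget is exhausted. A user-space approximation of the same accounting, with plain pointers and memcpy standing in for the user-memory accessors and a tiny budget so the overflow case is easy to trigger:

	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	#define BUDGET	32			/* tiny stand-in for ARG_MAX */

	static int
	pack_strings(char *const argv[], char *buf, size_t space)
	{
		char *p = buf;
		int i;

		for (i = 0; argv[i] != NULL; i++) {
			size_t len = strlen(argv[i]) + 1;	/* include the NUL */

			if (len > space)
				return (E2BIG);		/* budget exhausted */
			memcpy(p, argv[i], len);
			p += len;
			space -= len;
		}
		return (0);
	}

	int
	main(void)
	{
		char buf[BUDGET];
		char *ok[]  = { "ls", "-l", "/tmp", NULL };
		char *big[] = { "a-very-long-argument-string-that-will-not-fit", NULL };

		printf("ok:  %d\n", pack_strings(ok, buf, sizeof(buf)));
		printf("big: %d\n", pack_strings(big, buf, sizeof(buf)));
		return (0);
	}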
*/ register_t * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; register_t *stack_base; struct ps_strings *arginfo; struct proc *p; int szsigcode; /* * Calculate string base and vector table pointers. * Also deal with signal trampoline code for this exec type. */ p = imgp->proc; szsigcode = 0; arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; if (p->p_sysent->sv_szsigcode != NULL) szsigcode = *(p->p_sysent->sv_szsigcode); destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); /* * install sigcode */ if (szsigcode) copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo - szsigcode), szsigcode); /* * If we have a valid auxargs ptr, prepare some room * on the stack. */ if (imgp->auxargs) { /* * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for * lower compatibility. */ imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size : (AT_COUNT * 2); /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets,and imgp->auxarg_size is room * for argument of Runtime loader. */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + imgp->auxarg_size) * sizeof(char *)); } else /* * The '+ 2' is for the null pointers at the end of each of * the arg and env vector sets */ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2) * sizeof(char *)); /* * vectp also becomes our initial stack base */ stack_base = (register_t *)vectp; stringp = imgp->stringbase; argc = imgp->argc; envc = imgp->envc; /* * Copy out strings - arguments and environment. */ copyout(stringp, destp, ARG_MAX - imgp->stringspace); /* * Fill in "ps_strings" struct for ps, w, etc. */ suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nargvstr, argc); /* * Fill in argument portion of vector table. */ for (; argc > 0; --argc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* a null vector table pointer separates the argp's from the envp's */ suword(vectp++, 0); suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); suword(&arginfo->ps_nenvstr, envc); /* * Fill in environment portion of vector table. */ for (; envc > 0; --envc) { suword(vectp++, (long)(intptr_t)destp); while (*stringp++ != 0) destp++; destp++; } /* end of vector table is a null pointer */ suword(vectp, 0); return (stack_base); } /* * Check permissions of file to execute. * Called with imgp->vp locked. * Return 0 for success or error code on failure. */ int exec_check_permissions(imgp) struct image_params *imgp; { struct vnode *vp = imgp->vp; struct vattr *attr = imgp->attr; struct thread *td; int error; td = curthread; /* XXXKSE */ /* Get file attributes */ error = VOP_GETATTR(vp, attr, td->td_ucred, td); if (error) return (error); #ifdef MAC error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp); if (error) return (error); #endif /* * 1) Check if file execution is disabled for the filesystem that this * file resides on. * 2) Insure that at least one execute bit is on - otherwise root * will always succeed, and we don't want to happen unless the * file really is executable. * 3) Insure that the file is a regular file. */ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr->va_mode & 0111) == 0) || (attr->va_type != VREG)) return (EACCES); /* * Zero length files can't be exec'd */ if (attr->va_size == 0) return (ENOEXEC); /* * Check for execute permission to file based on current credentials. 
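exec_copyout_strings() above lays the new image's argument block out from the top of the stack down: the signal trampoline (if any), then the packed strings, then the argv/envv pointer vectors with their terminating NULLs, and the base of the vectors becomes the initial stack pointer. A self-contained user-space sketch of that layout inside an ordinary buffer; it handles only argv, with no environment, auxargs, or ps_strings bookkeeping.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define STACKSZ	256

	int
	main(void)
	{
		static char stack[STACKSZ];		/* pretend top-of-stack area */
		const char *argv[] = { "prog", "arg1", NULL };
		int argc = 2, i;
		char *destp = stack + STACKSZ;		/* strings are packed at the very top */
		char **vectp;

		for (i = 0; i < argc; i++)
			destp -= strlen(argv[i]) + 1;

		/*
		 * The pointer vector (argc entries plus a terminating NULL) sits
		 * just below the strings, rounded down to pointer alignment.
		 */
		vectp = (char **)((uintptr_t)(destp - (argc + 1) * sizeof(char *)) &
		    ~(uintptr_t)(sizeof(char *) - 1));

		for (i = 0; i < argc; i++) {
			strcpy(destp, argv[i]);
			vectp[i] = destp;
			destp += strlen(argv[i]) + 1;
		}
		vectp[argc] = NULL;

		for (i = 0; vectp[i] != NULL; i++)
			printf("argv[%d] @ offset %td: %s\n", i, vectp[i] - stack, vectp[i]);
		return (0);
	}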
*/ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) return (error); /* * Check number of open-for-writes on the file and deny execution * if there are any. */ if (vp->v_writecount) return (ETXTBSY); /* * Call filesystem specific open routine (which does nothing in the * general case). */ error = VOP_OPEN(vp, FREAD, td->td_ucred, td); return (error); } /* * Exec handler registration */ int exec_register(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 2; /* New slot and trailing NULL */ if (execsw) for (es = execsw; *es; es++) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; if (execsw) for (es = execsw; *es; es++) *xs++ = *es; *xs++ = execsw_arg; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } int exec_unregister(execsw_arg) const struct execsw *execsw_arg; { const struct execsw **es, **xs, **newexecsw; int count = 1; if (execsw == NULL) panic("unregister with no handlers left?\n"); for (es = execsw; *es; es++) { if (*es == execsw_arg) break; } if (*es == NULL) return (ENOENT); for (es = execsw; *es; es++) if (*es != execsw_arg) count++; newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); if (newexecsw == NULL) return (ENOMEM); xs = newexecsw; for (es = execsw; *es; es++) if (*es != execsw_arg) *xs++ = *es; *xs = NULL; if (execsw) free(execsw, M_TEMP); execsw = newexecsw; return (0); } Index: head/sys/kern/kern_exit.c =================================================================== --- head/sys/kern/kern_exit.c (revision 116360) +++ head/sys/kern/kern_exit.c (revision 116361) @@ -1,790 +1,790 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
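/*
 * Illustrative userland sketch, not part of this change: exec_register()
 * above grows a NULL-terminated array of handler pointers by allocating a
 * fresh array, copying the old entries, appending the new one, and freeing
 * the old array (exec_unregister() is the mirror image).  The same idiom
 * with invented names:
 */
#include <stdio.h>
#include <stdlib.h>

typedef int (*handler_t)(void);

static handler_t *handlers;            /* NULL-terminated vector, or NULL */

static int
handler_register(handler_t h)
{
    handler_t *ov = handlers, *nv, *p;
    size_t n = 2;                      /* new slot + trailing NULL */

    if (ov != NULL)
        for (p = ov; *p != NULL; p++)
            n++;
    if ((nv = malloc(n * sizeof(*nv))) == NULL)
        return (-1);
    p = nv;
    if (ov != NULL)
        for (handler_t *q = ov; *q != NULL; q++)
            *p++ = *q;
    *p++ = h;
    *p = NULL;
    free(ov);                          /* free(NULL) is a no-op */
    handlers = nv;
    return (0);
}

static int
dummy(void)
{
    return (42);
}

int
main(void)
{
    handler_register(dummy);
    printf("first handler returns %d\n", handlers[0]());
    return (0);
}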
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for acct_process() function prototype */ #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include /* Required to be non-static for SysVR4 emulator */ MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); static int wait1(struct thread *, struct wait_args *, int); /* * exit -- * Death of process. * * MPSAFE */ void sys_exit(struct thread *td, struct sys_exit_args *uap) { mtx_lock(&Giant); exit1(td, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ void exit1(struct thread *td, int rv) { struct proc *p, *nq, *q; struct tty *tp; struct vnode *ttyvp; struct vmspace *vm; struct vnode *vtmp; #ifdef KTRACE struct vnode *tracevp; struct ucred *tracecred; #endif GIANT_REQUIRED; p = td->td_proc; if (p == initproc) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } /* * MUST abort all other threads before proceeding past here. */ PROC_LOCK(p); - if (p->p_flag & P_THREADED || p->p_numthreads > 1) { + if (p->p_flag & P_SA || p->p_numthreads > 1) { /* * First check if some other thread got here before us.. * if so, act apropriatly, (exit or suspend); */ thread_suspend_check(0); /* * Kill off the other threads. This requires * Some co-operation from other parts of the kernel * so it may not be instant. * With this state set: * Any thread entering the kernel from userspace will * thread_exit() in trap(). Any thread attempting to * sleep will return immediatly * with EINTR or EWOULDBLOCK, which will hopefully force them * to back out to userland, freeing resources as they go, and * anything attempting to return to userland will thread_exit() * from userret(). thread_exit() will unsuspend us * when the last other thread exits. */ if (thread_single(SINGLE_EXIT)) { panic ("Exit: Single threading fouled up"); } /* * All other activity in this process is now stopped. * Remove excess KSEs and KSEGRPS. XXXKSE (when we have them) * ... * Turn off threading support. */ - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; thread_single_end(); /* Don't need this any more. */ } /* * With this state set: * Any thread entering the kernel from userspace will thread_exit() * in trap(). 
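/*
 * Illustrative userland sketch, not part of this change: the single-threading
 * performed above is the kernel side of the usual userland expectation that
 * exit(3) called from any one thread tears down the whole process, worker
 * threads included.  A minimal pthread demonstration:
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static void *
spinner(void *arg)
{
    (void)arg;
    for (;;) {
        printf("worker still alive\n");
        sleep(1);
    }
    return (NULL);
}

int
main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, spinner, NULL);
    sleep(2);
    /*
     * exit() from the initial thread: the kernel forces every other
     * thread out before the process becomes a zombie, so the worker
     * never outlives this call.
     */
    exit(0);
}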
Any thread attempting to sleep will return immediatly * with EINTR or EWOULDBLOCK, which will hopefully force them * to back out to userland, freeing resources as they go, and * anything attempting to return to userland will thread_exit() * from userret(). thread_exit() will do a wakeup on p->p_numthreads * if it transitions to 1. */ p->p_flag |= P_WEXIT; PROC_UNLOCK(p); /* Are we a task leader? */ if (p == p->p_leader) { mtx_lock(&ppeers_lock); q = p->p_peers; while (q != NULL) { PROC_LOCK(q); psignal(q, SIGKILL); PROC_UNLOCK(q); q = q->p_peers; } while (p->p_peers != NULL) msleep(p, &ppeers_lock, PWAIT, "exit1", 0); mtx_unlock(&ppeers_lock); } #ifdef PGINPROF vmsizmon(); #endif STOPEVENT(p, S_EXIT, rv); wakeup(&p->p_stype); /* Wakeup anyone in procfs' PIOCWAIT */ /* * Check if any loadable modules need anything done at process exit. * e.g. SYSV IPC stuff * XXX what if one of these generates an error? */ EVENTHANDLER_INVOKE(process_exit, p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ PROC_LOCK(p); stopprofclock(p); p->p_flag &= ~(P_TRACED | P_PPWAIT); SIGEMPTYSET(p->p_siglist); SIGEMPTYSET(td->td_siglist); /* * Stop the real interval timer. If the handler is currently * executing, prevent it from rearming itself and let it finish. */ if (timevalisset(&p->p_realtimer.it_value) && callout_stop(&p->p_itcallout) == 0) { timevalclear(&p->p_realtimer.it_interval); msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0); KASSERT(!timevalisset(&p->p_realtimer.it_value), ("realtime timer is still armed")); } PROC_UNLOCK(p); /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pid. */ funsetownlst(&p->p_sigiolst); /* * Close open files and release open-file table. * This may block! */ fdfree(td); /* * Remove ourself from our leader's peer list and wake our leader. */ mtx_lock(&ppeers_lock); if (p->p_leader->p_peers) { q = p->p_leader; while (q->p_peers != p) q = q->p_peers; q->p_peers = p->p_peers; wakeup(p->p_leader); } mtx_unlock(&ppeers_lock); /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. * * Processes sharing the same vmspace may exit in one order, and * get cleaned up by vmspace_exit() in a different order. The * last exiting process to reach this point releases as much of * the environment as it can, and the last process cleaned up * by vmspace_exit() (which decrements exitingcnt) cleans up the * remainder. */ ++vm->vm_exitingcnt; if (--vm->vm_refcnt == 0) { shmexit(vm); vm_page_lock_queues(); pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); vm_page_unlock_queues(); (void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map), vm_map_max(&vm->vm_map)); } sx_xlock(&proctree_lock); if (SESS_LEADER(p)) { struct session *sp; sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. 
*/ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { tp = sp->s_ttyp; if (sp->s_ttyp->t_pgrp) { PGRP_LOCK(sp->s_ttyp->t_pgrp); pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); PGRP_UNLOCK(sp->s_ttyp->t_pgrp); } /* XXX tp should be locked. */ sx_xunlock(&proctree_lock); (void) ttywait(tp); sx_xlock(&proctree_lock); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); sx_xunlock(&proctree_lock); VOP_REVOKE(ttyvp, REVOKEALL); vrele(ttyvp); sx_xlock(&proctree_lock); } } if (sp->s_ttyvp) { ttyvp = sp->s_ttyvp; SESS_LOCK(p->p_session); sp->s_ttyvp = NULL; SESS_UNLOCK(p->p_session); vrele(ttyvp); } /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } SESS_LOCK(p->p_session); sp->s_leader = NULL; SESS_UNLOCK(p->p_session); } fixjobc(p, p->p_pgrp, 0); sx_xunlock(&proctree_lock); (void)acct_process(td); #ifdef KTRACE /* * release trace file */ PROC_LOCK(p); mtx_lock(&ktrace_mtx); p->p_traceflag = 0; /* don't trace the vrele() */ tracevp = p->p_tracevp; p->p_tracevp = NULL; tracecred = p->p_tracecred; p->p_tracecred = NULL; mtx_unlock(&ktrace_mtx); PROC_UNLOCK(p); if (tracevp != NULL) vrele(tracevp); if (tracecred != NULL) crfree(tracecred); #endif /* * Release reference to text vnode */ if ((vtmp = p->p_textvp) != NULL) { p->p_textvp = NULL; vrele(vtmp); } /* * Release our limits structure. */ mtx_assert(&Giant, MA_OWNED); if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; } /* * Release this thread's reference to the ucred. The actual proc * reference will stay around until the proc is harvested by * wait(). At this point the ucred is immutable (no other threads * from this proc are around that can change it) so we leave the * per-thread ucred pointer intact in case it is needed although * in theory nothing should be using it at this point. */ crfree(td->td_ucred); /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); LIST_INSERT_HEAD(&zombproc, p, p_list); LIST_REMOVE(p, p_hash); sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); q = LIST_FIRST(&p->p_children); if (q != NULL) /* only need this if any child is S_ZOMB */ wakeup(initproc); for (; q != NULL; q = nq) { nq = LIST_NEXT(q, p_sibling); PROC_LOCK(q); proc_reparent(q, initproc); q->p_sigparent = SIGCHLD; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; psignal(q, SIGKILL); } PROC_UNLOCK(q); } /* * Save exit status and final rusage info, adding in child rusage * info and self times. */ PROC_LOCK(p); p->p_xstat = rv; *p->p_ru = p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); mtx_unlock_spin(&sched_lock); ruadd(p->p_ru, &p->p_stats->p_cru); /* * Notify interested parties of our demise. */ KNOTE(&p->p_klist, NOTE_EXIT); /* * Notify parent that we're gone. If parent has the PS_NOCLDWAIT * flag set, or if the handler is set to SIG_IGN, notify process * 1 instead (and hope it will handle this situation). 
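/*
 * Illustrative userland sketch, not part of this change: proc_reparent(),
 * used above to hand orphaned children to init, is a straightforward
 * <sys/queue.h> manipulation: unlink the child from its current parent's
 * sibling list, link it onto the new parent's list, and repoint the parent
 * pointer.  The same shape with toy structures:
 */
#include <sys/queue.h>
#include <stdio.h>

struct toyproc {
    const char            *name;
    struct toyproc        *parent;
    LIST_HEAD(, toyproc)  children;
    LIST_ENTRY(toyproc)   sibling;
};

static void
toy_reparent(struct toyproc *child, struct toyproc *parent)
{
    if (child->parent == parent)
        return;
    LIST_REMOVE(child, sibling);
    LIST_INSERT_HEAD(&parent->children, child, sibling);
    child->parent = parent;
}

int
main(void)
{
    struct toyproc init = { .name = "init" };
    struct toyproc shell = { .name = "sh" };
    struct toyproc job = { .name = "job" };

    LIST_INIT(&init.children);
    LIST_INIT(&shell.children);
    job.parent = &shell;
    LIST_INSERT_HEAD(&shell.children, &job, sibling);

    toy_reparent(&job, &init);         /* orphaned: goes to init */
    printf("%s's parent is now %s\n", job.name, job.parent->name);
    return (0);
}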
*/ PROC_LOCK(p->p_pptr); mtx_lock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_pptr->p_sigacts->ps_flag & (PS_NOCLDWAIT | PS_CLDSIGIGN)) { struct proc *pp; mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); pp = p->p_pptr; PROC_UNLOCK(pp); proc_reparent(p, initproc); PROC_LOCK(p->p_pptr); /* * If this was the last child of our parent, notify * parent, so in case he was wait(2)ing, he will * continue. */ if (LIST_EMPTY(&pp->p_children)) wakeup(pp); } else mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx); if (p->p_sigparent && p->p_pptr != initproc) psignal(p->p_pptr, p->p_sigparent); else psignal(p->p_pptr, SIGCHLD); PROC_UNLOCK(p->p_pptr); /* * If this is a kthread, then wakeup anyone waiting for it to exit. */ if (p->p_flag & P_KTHREAD) wakeup(p); PROC_UNLOCK(p); /* * Finally, call machine-dependent code to release the remaining * resources including address space. * The address space is released by "vmspace_exitfree(p)" in * vm_waitproc(). */ cpu_exit(td); PROC_LOCK(p); PROC_LOCK(p->p_pptr); sx_xunlock(&proctree_lock); mtx_lock_spin(&sched_lock); while (mtx_owned(&Giant)) mtx_unlock(&Giant); /* * We have to wait until after acquiring all locks before * changing p_state. If we block on a mutex then we will be * back at SRUN when we resume and our parent will never * harvest us. */ p->p_state = PRS_ZOMBIE; wakeup(p->p_pptr); PROC_UNLOCK(p->p_pptr); cnt.v_swtch++; binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); cpu_sched_exit(td); /* XXXKSE check if this should be in thread_exit */ /* * Allow the scheduler to adjust the priority of the * parent when a kseg is exiting. */ if (p->p_pid != 1) sched_exit(p->p_pptr, p); /* * Make sure the scheduler takes this thread out of its tables etc. * This will also release this thread's reference to the ucred. * Other thread parts to release include pcb bits and such. */ thread_exit(); } #ifdef COMPAT_43 /* * MPSAFE. The dirty work is handled by wait1(). */ int owait(struct thread *td, struct owait_args *uap __unused) { struct wait_args w; w.options = 0; w.rusage = NULL; w.pid = WAIT_ANY; w.status = NULL; return (wait1(td, &w, 1)); } #endif /* COMPAT_43 */ /* * MPSAFE. The dirty work is handled by wait1(). */ int wait4(struct thread *td, struct wait_args *uap) { return (wait1(td, uap, 0)); } /* * MPSAFE */ static int wait1(struct thread *td, struct wait_args *uap, int compat) { struct rusage ru; int nfound; struct proc *p, *q, *t; int status, error; q = td->td_proc; if (uap->pid == 0) { PROC_LOCK(q); uap->pid = -q->p_pgid; PROC_UNLOCK(q); } if (uap->options &~ (WUNTRACED|WNOHANG|WCONTINUED|WLINUXCLONE)) return (EINVAL); mtx_lock(&Giant); loop: nfound = 0; sx_xlock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (uap->pid != WAIT_ANY && p->p_pid != uap->pid && p->p_pgid != -uap->pid) { PROC_UNLOCK(p); continue; } /* * This special case handles a kthread spawned by linux_clone * (see linux_misc.c). The linux_wait4 and linux_waitpid * functions need to be able to distinguish between waiting * on a process and waiting on a thread. It is a thread if * p_sigparent is not SIGCHLD, and the WLINUXCLONE option * signifies we want to wait for threads and not processes. 
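/*
 * Illustrative userland sketch, not part of this change: the zombie
 * harvesting in wait1() above is what wait4(2)/waitpid(2) callers observe.
 * A minimal consumer that forks a child and decodes the status word the
 * kernel stored in p_xstat:
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    pid_t pid;
    int status;

    if ((pid = fork()) == 0)
        _exit(7);                      /* child */
    if (waitpid(pid, &status, 0) != pid) {
        perror("waitpid");
        return (1);
    }
    if (WIFEXITED(status))
        printf("child %d exited with %d\n", (int)pid, WEXITSTATUS(status));
    else if (WIFSIGNALED(status))
        printf("child %d killed by signal %d\n", (int)pid, WTERMSIG(status));
    return (0);
}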
*/ if ((p->p_sigparent != SIGCHLD) ^ ((uap->options & WLINUXCLONE) != 0)) { PROC_UNLOCK(p); continue; } nfound++; if (p->p_state == PRS_ZOMBIE) { td->td_retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) td->td_retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ PROC_UNLOCK(p); if ((error = copyout(&status, uap->status, sizeof(status)))) { sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (error); } PROC_LOCK(p); } if (uap->rusage) { bcopy(p->p_ru, &ru, sizeof(ru)); PROC_UNLOCK(p); if ((error = copyout(&ru, uap->rusage, sizeof (struct rusage)))) { sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (error); } } else PROC_UNLOCK(p); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) { PROC_LOCK(p); p->p_oppid = 0; proc_reparent(p, t); PROC_UNLOCK(p); psignal(t, SIGCHLD); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (0); } /* * Remove other references to this process to ensure * we have an exclusive reference. */ sx_xlock(&allproc_lock); LIST_REMOVE(p, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); LIST_REMOVE(p, p_sibling); leavepgrp(p); sx_xunlock(&proctree_lock); /* * As a side effect of this lock, we know that * all other writes to this proc are visible now, so * no more locking is needed for p. */ PROC_LOCK(p); p->p_xstat = 0; /* XXX: why? */ PROC_UNLOCK(p); PROC_LOCK(q); ruadd(&q->p_stats->p_cru, p->p_ru); PROC_UNLOCK(q); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0); /* * Free credentials, arguments, and sigacts */ crfree(p->p_ucred); p->p_ucred = NULL; pargs_drop(p->p_args); p->p_args = NULL; sigacts_free(p->p_sigacts); p->p_sigacts = NULL; /* * do any thread-system specific cleanups */ thread_wait(p); /* * Give vm and machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. 
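/*
 * Illustrative userland sketch, not part of this change: the rusage copyout
 * above is what makes the fourth argument of wait4(2) work; the parent gets
 * the times the child accumulated in p_ru.  On FreeBSD:
 */
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct rusage ru;
    pid_t pid;
    int status;
    volatile unsigned long n = 0;

    if ((pid = fork()) == 0) {
        while (n < 50000000UL)         /* burn a little CPU */
            n++;
        _exit(0);
    }
    if (wait4(pid, &status, 0, &ru) != pid)
        return (1);
    printf("child user time: %ld.%06lds\n",
        (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
    return (0);
}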
*/ vm_waitproc(p); #ifdef MAC mac_destroy_proc(p); #endif KASSERT(FIRST_THREAD_IN_PROC(p), ("wait1: no residual thread!")); uma_zfree(proc_zone, p); sx_xlock(&allproc_lock); nprocs--; sx_xunlock(&allproc_lock); mtx_unlock(&Giant); return (0); } mtx_lock_spin(&sched_lock); if (P_SHOULDSTOP(p) && (p->p_suspcount == p->p_numthreads) && ((p->p_flag & P_WAITED) == 0) && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { mtx_unlock_spin(&sched_lock); p->p_flag |= P_WAITED; sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) { td->td_retval[1] = W_STOPCODE(p->p_xstat); PROC_UNLOCK(p); error = 0; } else #endif if (uap->status) { status = W_STOPCODE(p->p_xstat); PROC_UNLOCK(p); error = copyout(&status, uap->status, sizeof(status)); } else { PROC_UNLOCK(p); error = 0; } mtx_unlock(&Giant); return (error); } mtx_unlock_spin(&sched_lock); if (uap->options & WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_xunlock(&proctree_lock); td->td_retval[0] = p->p_pid; p->p_flag &= ~P_CONTINUED; PROC_UNLOCK(p); if (uap->status) { status = SIGCONT; error = copyout(&status, uap->status, sizeof(status)); } else error = 0; mtx_unlock(&Giant); return (error); } PROC_UNLOCK(p); } if (nfound == 0) { sx_xunlock(&proctree_lock); mtx_unlock(&Giant); return (ECHILD); } if (uap->options & WNOHANG) { sx_xunlock(&proctree_lock); td->td_retval[0] = 0; mtx_unlock(&Giant); return (0); } PROC_LOCK(q); sx_xunlock(&proctree_lock); error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0); PROC_UNLOCK(q); if (error) { mtx_unlock(&Giant); return (error); } goto loop; } /* * Make process 'parent' the new parent of process 'child'. * Must be called with an exclusive hold of proctree lock. */ void proc_reparent(struct proc *child, struct proc *parent) { sx_assert(&proctree_lock, SX_XLOCKED); PROC_LOCK_ASSERT(child, MA_OWNED); if (child->p_pptr == parent) return; LIST_REMOVE(child, p_sibling); LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; } Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 116360) +++ head/sys/kern/kern_fork.c (revision 116361) @@ -1,829 +1,829 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif static int forksleep; /* Place for fork1() to sleep on. */ /* * MPSAFE */ /* ARGSUSED */ int fork(td, uap) struct thread *td; struct fork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return error; } /* * MPSAFE */ /* ARGSUSED */ int vfork(td, uap) struct thread *td; struct vfork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return error; } /* * MPSAFE */ int rfork(td, uap) struct thread *td; struct rfork_args *uap; { int error; struct proc *p2; /* Don't allow kernel only flags. */ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return error; } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. 
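/*
 * Illustrative sketch, not part of this change: the kern.randompid knob
 * described above just adds a bounded random offset to the next PID
 * candidate.  In isolation the computation is one line; arc4random(3) is
 * available in FreeBSD's libc:
 */
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    int lastpid = 700;
    int randompid = 1000;              /* sysctl kern.randompid modulus */
    int trypid;

    trypid = lastpid + 1;
    if (randompid != 0)
        trypid += arc4random() % randompid;
    printf("next pid candidate: %d\n", trypid);
    return (0);
}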
*/ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; sysctl_wire_old_buffer(req, sizeof(int)); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); int fork1(td, flags, pages, procp) struct thread *td; /* parent proc */ int flags; int pages; struct proc **procp; /* child proc */ { struct proc *p2, *pptr; uid_t uid; struct proc *newproc; int trypid; int ok; static int pidchecked = 0; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct proc *p1 = td->td_proc; struct thread *td2; struct kse *ke2; struct ksegrp *kg2; struct sigacts *newsigacts; int error; /* Can't copy and clear */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); mtx_lock(&Giant); /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { vm_forkproc(td, NULL, NULL, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent.) */ if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); if (p1->p_fd->fd_refcnt > 1) { struct filedesc *newfd; newfd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); fdfree(td); p1->p_fd = newfd; } else FILEDESC_UNLOCK(p1->p_fd); } mtx_unlock(&Giant); *procp = NULL; return (0); } /* * Note 1:1 allows for forking with one thread coming out on the * other side with the expectation that the process is about to * exec. */ - if (p1->p_flag & P_THREADED) { + if (p1->p_flag & P_SA) { /* * Idle the other threads for a second. * Since the user space is copied, it must remain stable. * In addition, all threads (from the user perspective) * need to either be suspended or in the kernel, * where they will try restart in the parent and will * be aborted in the child. */ PROC_LOCK(p1); if (thread_single(SINGLE_NO_EXIT)) { /* Abort.. someone else is single threading before us */ PROC_UNLOCK(p1); mtx_unlock(&Giant); return (ERESTART); } PROC_UNLOCK(p1); /* * All other activity in this process * is now suspended at the user boundary, * (or other safe places if we think of any). */ } /* Allocate new proc. */ newproc = uma_zalloc(proc_zone, M_WAITOK); #ifdef MAC mac_init_proc(newproc); #endif /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow * a nonprivileged user to use the last ten processes; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ sx_xlock(&allproc_lock); uid = td->td_ucred->cr_ruid; if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { error = EAGAIN; goto fail; } /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. */ PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); PROC_UNLOCK(p1); if (!ok) { error = EAGAIN; goto fail; } /* * Increment the nprocs resource before blocking can occur. 
There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) { trypid = 10; } } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { PROC_LOCK(p2); while (p2->p_pid == trypid || p2->p_pgrp->pg_id == trypid || p2->p_session->s_sid == trypid) { trypid++; if (trypid >= pidchecked) { PROC_UNLOCK(p2); goto retry; } } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; PROC_UNLOCK(p2); } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(&allproc_lock); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(td->td_proc->p_fd); fdtol = NULL; } else if (flags & RFFDG) { FILEDESC_LOCK(p1->p_fd); fd = fdcopy(td->td_proc->p_fd); FILEDESC_UNLOCK(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table and * shared process leaders. */ fdtol = p1->p_fdtol; FILEDESC_LOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_UNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and * different process leaders */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. 
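/*
 * Illustrative sketch, not part of this change: stripped of the allproc and
 * zombproc walking, the PID allocation loop above is "advance trypid past
 * every id in use, wrapping around PID_MAX and skipping the low range
 * reserved for long-lived daemons".  A simplified version over a plain
 * array of ids in use ("used" and "next_pid" are invented names):
 */
#include <stdio.h>

#define TOY_PID_MAX    99999

static int
in_use(const int *used, int nused, int pid)
{
    for (int i = 0; i < nused; i++)
        if (used[i] == pid)
            return (1);
    return (0);
}

static int
next_pid(const int *used, int nused, int trypid)
{
    for (;;) {
        if (trypid >= TOY_PID_MAX) {
            trypid %= TOY_PID_MAX;
            if (trypid < 100)          /* low pids belong to daemons */
                trypid += 100;
        }
        if (!in_use(used, nused, trypid))
            return (trypid);
        trypid++;
    }
}

int
main(void)
{
    int used[] = { 701, 702, 704 };

    printf("%d\n", next_pid(used, 3, 701));          /* prints 703 */
    printf("%d\n", next_pid(used, 3, TOY_PID_MAX));  /* wraps to 100 */
    return (0);
}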
*/ td2 = FIRST_THREAD_IN_PROC(p2); kg2 = FIRST_KSEGRP_IN_PROC(p2); ke2 = FIRST_KSE_IN_KSEGRP(kg2); /* Allocate and switch to an alternate kstack if specified */ if (pages != 0) vm_thread_new_altkstack(td2, pages); PROC_LOCK(p2); PROC_LOCK(p1); #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) bzero(&p2->p_startzero, (unsigned) RANGEOF(struct proc, p_startzero, p_endzero)); bzero(&ke2->ke_startzero, (unsigned) RANGEOF(struct kse, ke_startzero, ke_endzero)); bzero(&td2->td_startzero, (unsigned) RANGEOF(struct thread, td_startzero, td_endzero)); bzero(&kg2->kg_startzero, (unsigned) RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&p1->p_startcopy, &p2->p_startcopy, (unsigned) RANGEOF(struct proc, p_startcopy, p_endcopy)); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy, (unsigned) RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); #undef RANGEOF /* Set up the thread as an active thread (as if runnable). */ ke2->ke_state = KES_THREAD; ke2->ke_thread = td2; td2->td_kse = ke2; /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats substruct is set in vm_forkproc. */ p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; /* * Allow the scheduler to adjust the priority of the child and * parent while we hold the sched_lock. */ sched_fork(p1, p2); mtx_unlock_spin(&sched_lock); p2->p_ucred = crhold(td->td_ucred); td2->td_ucred = crhold(p2->p_ucred); /* XXXKSE */ pargs_hold(p2->p_args); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; /* Bump references to the text vnode (for procfs) */ p2->p_textvp = p1->p_textvp; if (p2->p_textvp) VREF(p2->p_textvp); p2->p_fd = fd; p2->p_fdtol = fdtol; PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * p_limit is copy-on-write, bump refcnt, */ p2->p_limit = p1->p_limit; p2->p_limit->p_refcnt++; /* * Setup linkage for kernel based threading */ if((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. 
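/*
 * Illustrative userland sketch, not part of this change: the RANGEOF()
 * trick used above zeroes or copies everything between two marker members
 * of a structure in one call, so a new field inherits the right treatment
 * just by being declared in the right region.  A userland rendition with a
 * toy structure:
 */
#include <stddef.h>
#include <stdio.h>
#include <strings.h>

#define RANGEOF(type, start, end) \
    (offsetof(type, end) - offsetof(type, start))

struct toyproc {
    /* start of zeroed-on-fork region */
    int t_startzero;
    int t_exitstatus;
    int t_pendingsigs;
    int t_endzero;
    /* start of copied-from-parent region */
    int t_startcopy;
    int t_nice;
    int t_flags;
    int t_endcopy;
};

int
main(void)
{
    struct toyproc parent = { .t_exitstatus = 1, .t_nice = 10, .t_flags = 0x4 };
    struct toyproc child;

    bzero((char *)&child + offsetof(struct toyproc, t_startzero),
        RANGEOF(struct toyproc, t_startzero, t_endzero));
    bcopy((char *)&parent + offsetof(struct toyproc, t_startcopy),
        (char *)&child + offsetof(struct toyproc, t_startcopy),
        RANGEOF(struct toyproc, t_startcopy, t_endcopy));

    printf("child: exitstatus=%d nice=%d flags=%#x\n",
        child.t_exitstatus, child.t_nice, child.t_flags);
    return (0);
}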
*/ p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK); SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); callout_init(&p2->p_itcallout, 1); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. */ mtx_lock(&ktrace_mtx); KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracevp = p1->p_tracevp) != NULL) { VREF(p2->p_tracevp); KASSERT(p1->p_tracecred != NULL, ("ktrace vnode with no cred")); p2->p_tracecred = crhold(p1->p_tracecred); } } mtx_unlock(&ktrace_mtx); #endif /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, flags); if (flags == (RFFDG | RFPROC)) { cnt.v_forks++; cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { cnt.v_vforks++; cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (p1 == &proc0) { cnt.v_kthreads++; cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else { cnt.v_rforks++; cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ microuptime(&p2->p_stats->p_start); if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); p2->p_state = PRS_NORMAL; TD_SET_CAN_RUN(td2); setrunqueue(td2); mtx_unlock_spin(&sched_lock); } /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * tell any interested parties about the new process */ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * If other threads are waiting, let them continue now */ - if (p1->p_flag & P_THREADED) { + if (p1->p_flag & P_SA) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } /* * Return child proc pointer to parent. 
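/*
 * Illustrative userland sketch, not part of this change: the P_PPWAIT sleep
 * loop above is the kernel side of vfork(2)'s contract; the parent does not
 * run again until the child has either exec'd or exited.  Observable from
 * userland (the child must restrict itself to _exit()/execve()):
 */
#include <sys/types.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    pid_t pid;

    printf("parent: about to vfork\n");
    fflush(stdout);
    if ((pid = vfork()) == 0) {
        /* Child borrows the parent's address space: keep it simple. */
        write(STDOUT_FILENO, "child: running\n", 15);
        _exit(0);
    }
    /* This line cannot print before the child's message. */
    printf("parent: resumed after child %d exited\n", (int)pid);
    return (0);
}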
*/ mtx_unlock(&Giant); *procp = p2; return (0); fail: sx_xunlock(&allproc_lock); uma_zfree(proc_zone, newproc); - if (p1->p_flag & P_THREADED) { + if (p1->p_flag & P_SA) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } tsleep(&forksleep, PUSER, "fork", hz / 2); mtx_unlock(&Giant); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(callout, arg, frame) void (*callout)(void *, struct trapframe *); void *arg; struct trapframe *frame; { struct thread *td; struct proc *p; if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } td = curthread; p = td->td_proc; td->td_oncpu = PCPU_GET(cpuid); p->p_state = PRS_NORMAL; /* * Finish setting up thread glue. We need to initialize * the thread into a td_critnest=1 state. Some platforms * may have already partially or fully initialized td_critnest * and/or td_md.md_savecrit (when applciable). * * see //critical.c */ sched_lock.mtx_lock = (uintptr_t)td; sched_lock.mtx_recurse = 0; cpu_critical_fork_exit(); CTR3(KTR_PROC, "fork_exit: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); mtx_unlock_spin(&sched_lock); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ PROC_LOCK(p); if (p->p_flag & P_KTHREAD) { PROC_UNLOCK(p); mtx_lock(&Giant); printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", p->p_comm, p->p_pid); kthread_exit(0); } PROC_UNLOCK(p); #ifdef DIAGNOSTIC cred_free_thread(td); #endif mtx_assert(&Giant, MA_NOTOWNED); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. */ void fork_return(td, frame) struct thread *td; struct trapframe *frame; { userret(td, frame, 0); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_kse.c =================================================================== --- head/sys/kern/kern_kse.c (revision 116360) +++ head/sys/kern/kern_kse.c (revision 116361) @@ -1,2022 +1,2022 @@ /* * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * KSEGRP related storage. */ static uma_zone_t ksegrp_zone; static uma_zone_t kse_zone; static uma_zone_t thread_zone; static uma_zone_t upcall_zone; /* DEBUG ONLY */ SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int thread_debug = 0; SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW, &thread_debug, 0, "thread debug"); static int max_threads_per_proc = 150; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_groups_per_proc = 50; SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW, &max_groups_per_proc, 0, "Limit on thread groups per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, ""); static int virtual_cpu; #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses); TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps); TAILQ_HEAD(, kse_upcall) zombie_upcalls = TAILQ_HEAD_INITIALIZER(zombie_upcalls); struct mtx kse_zombie_lock; MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); static void kse_purge(struct proc *p, struct thread *td); static void kse_purge_group(struct thread *td); static int thread_update_usr_ticks(struct thread *td, int user); static void thread_alloc_spare(struct thread *td, struct thread *spare); static int sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS) { int error, new_val; int def_val; #ifdef SMP def_val = mp_ncpus; #else def_val = 1; #endif if (virtual_cpu == 0) new_val = def_val; else new_val = virtual_cpu; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < 0) return (EINVAL); virtual_cpu = new_val; return (0); } /* DEBUG ONLY */ SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I", "debug virtual cpus"); /* * Prepare a thread for use. */ static void thread_ctor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; } /* * Reclaim a thread after use. 
*/ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif } /* * Initialize type-stable parts of a thread (when newly created). */ static void thread_init(void *mem, int size) { struct thread *td; td = (struct thread *)mem; mtx_lock(&Giant); vm_thread_new(td, 0); mtx_unlock(&Giant); cpu_thread_setup(td); td->td_sched = (struct td_sched *)&td[1]; } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; vm_thread_dispose(td); } /* * Initialize type-stable parts of a kse (when newly created). */ static void kse_init(void *mem, int size) { struct kse *ke; ke = (struct kse *)mem; ke->ke_sched = (struct ke_sched *)&ke[1]; } /* * Initialize type-stable parts of a ksegrp (when newly created). */ static void ksegrp_init(void *mem, int size) { struct ksegrp *kg; kg = (struct ksegrp *)mem; kg->kg_sched = (struct kg_sched *)&kg[1]; } /* * KSE is linked into kse group. */ void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; ke->ke_state = KES_UNQUEUED; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; ke->ke_flags = 0; } void kse_unlink(struct kse *ke) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); if (ke->ke_state == KES_IDLE) { TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; } if (--kg->kg_kses == 0) ksegrp_unlink(kg); /* * Aggregate stats from the KSE */ kse_stash(ke); } void ksegrp_link(struct ksegrp *kg, struct proc *p) { TAILQ_INIT(&kg->kg_threads); TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ TAILQ_INIT(&kg->kg_iq); /* all idle kses in ksegrp */ TAILQ_INIT(&kg->kg_upcalls); /* all upcall structure in ksegrp */ kg->kg_proc = p; /* * the following counters are in the -zero- section * and may not need clearing */ kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ kg->kg_idle_kses = 0; kg->kg_numupcalls = 0; /* link it in now that it's consistent */ p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } void ksegrp_unlink(struct ksegrp *kg) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads")); KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses")); KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls")); p = kg->kg_proc; TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * Aggregate stats from the KSE */ ksegrp_stash(kg); } struct kse_upcall * upcall_alloc(void) { struct kse_upcall *ku; ku = uma_zalloc(upcall_zone, M_WAITOK); bzero(ku, sizeof(*ku)); return (ku); } void upcall_free(struct kse_upcall *ku) { uma_zfree(upcall_zone, ku); } void upcall_link(struct kse_upcall *ku, struct ksegrp *kg) { mtx_assert(&sched_lock, MA_OWNED); 
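/*
 * Illustrative userland sketch, not part of this change: kse_link(),
 * ksegrp_link() and friends above are TAILQ insertions paired with counter
 * updates so a group always knows how many members it owns, and the unlink
 * side tears the group down when the count reaches zero.  The same pattern
 * with toy types:
 */
#include <sys/queue.h>
#include <stdio.h>

struct worker {
    int                 id;
    TAILQ_ENTRY(worker) link;
};

struct group {
    TAILQ_HEAD(, worker) workers;
    int                  nworkers;
};

static void
worker_link(struct group *g, struct worker *w)
{
    TAILQ_INSERT_HEAD(&g->workers, w, link);
    g->nworkers++;
}

static void
worker_unlink(struct group *g, struct worker *w)
{
    TAILQ_REMOVE(&g->workers, w, link);
    if (--g->nworkers == 0) {
        /* last member gone: tear the group down, as ksegrp_unlink() does */
    }
}

int
main(void)
{
    struct group g;
    struct worker a = { .id = 1 }, b = { .id = 2 };
    struct worker *w;

    TAILQ_INIT(&g.workers);
    g.nworkers = 0;
    worker_link(&g, &a);
    worker_link(&g, &b);
    worker_unlink(&g, &a);
    TAILQ_FOREACH(w, &g.workers, link)
        printf("worker %d still linked (%d total)\n", w->id, g.nworkers);
    return (0);
}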
TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link); ku->ku_ksegrp = kg; kg->kg_numupcalls++; } void upcall_unlink(struct kse_upcall *ku) { struct ksegrp *kg = ku->ku_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link); kg->kg_numupcalls--; upcall_stash(ku); } void upcall_remove(struct thread *td) { if (td->td_upcall) { td->td_upcall->ku_owner = NULL; upcall_unlink(td->td_upcall); td->td_upcall = 0; } } /* * For a newly created process, * link up all the structures and its initial threads etc. */ void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td) { TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_suspended); /* Threads suspended */ p->p_numksegrps = 0; p->p_numthreads = 0; ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); } /* struct kse_thr_interrupt_args { struct kse_thr_mailbox * tmbx; }; */ int kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap) { struct proc *p; struct thread *td2; p = td->td_proc; - if (!(p->p_flag & P_THREADED) || (uap->tmbx == NULL)) + if (!(p->p_flag & P_SA) || (uap->tmbx == NULL)) return (EINVAL); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) { td2->td_flags |= TDF_INTERRUPT; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } mtx_unlock_spin(&sched_lock); return (0); } } mtx_unlock_spin(&sched_lock); return (ESRCH); } /* struct kse_exit_args { register_t dummy; }; */ int kse_exit(struct thread *td, struct kse_exit_args *uap) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct kse_upcall *ku, *ku2; int error, count; p = td->td_proc; if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) return (EINVAL); kg = td->td_ksegrp; count = 0; PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_UPCALL_IN_GROUP(kg, ku2) { if (ku2->ku_flags & KUF_EXITING) count++; } if ((kg->kg_numupcalls - count) == 1 && (kg->kg_numthreads > 1)) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); error = suword(&ku->ku_mailbox->km_flags, ku->ku_mflags|KMF_DONE); PROC_LOCK(p); if (error) psignal(p, SIGSEGV); mtx_lock_spin(&sched_lock); upcall_remove(td); ke = td->td_kse; if (p->p_numthreads == 1) { kse_purge(p, td); - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } else { if (kg->kg_numthreads == 1) { /* Shutdown a group */ kse_purge_group(td); ke->ke_flags |= KEF_EXIT; } thread_stopped(p); thread_exit(); /* NOTREACHED */ } return (0); } /* * Either becomes an upcall or waits for an awakening event and * then becomes an upcall. Only error cases return. */ /* struct kse_release_args { struct timespec *timeout; }; */ int kse_release(struct thread *td, struct kse_release_args *uap) { struct proc *p; struct ksegrp *kg; struct timespec ts, ts2, ts3, timeout; struct timeval tv; int error; p = td->td_proc; kg = td->td_ksegrp; if (td->td_upcall == NULL || TD_CAN_UNBIND(td)) return (EINVAL); if (uap->timeout != NULL) { if ((error = copyin(uap->timeout, &timeout, sizeof(timeout)))) return (error); getnanouptime(&ts); timespecadd(&ts, &timeout); TIMESPEC_TO_TIMEVAL(&tv, &timeout); } mtx_lock_spin(&sched_lock); /* Change OURSELF to become an upcall. 
*/ td->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (p->p_sflag & PS_NEEDSIGCHK) td->td_flags |= TDF_ASTPENDING; #endif mtx_unlock_spin(&sched_lock); PROC_LOCK(p); while ((td->td_upcall->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_completed == NULL)) { kg->kg_upsleeps++; error = msleep(&kg->kg_completed, &p->p_mtx, PPAUSE|PCATCH, "kse_rel", (uap->timeout ? tvtohz(&tv) : 0)); kg->kg_upsleeps--; PROC_UNLOCK(p); if (uap->timeout == NULL || error != EWOULDBLOCK) return (0); getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) return (0); ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); PROC_LOCK(p); } PROC_UNLOCK(p); return (0); } /* struct kse_wakeup_args { struct kse_mailbox *mbx; }; */ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { struct proc *p; struct ksegrp *kg; struct kse_upcall *ku; struct thread *td2; p = td->td_proc; td2 = NULL; ku = NULL; /* KSE-enabled processes only, please. */ - if (!(p->p_flag & P_THREADED)) + if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (uap->mbx) { FOREACH_KSEGRP_IN_PROC(p, kg) { FOREACH_UPCALL_IN_GROUP(kg, ku) { if (ku->ku_mailbox == uap->mbx) break; } if (ku) break; } } else { kg = td->td_ksegrp; if (kg->kg_upsleeps) { wakeup_one(&kg->kg_completed); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } ku = TAILQ_FIRST(&kg->kg_upcalls); } if (ku) { if ((td2 = ku->ku_owner) == NULL) { panic("%s: no owner", __func__); } else if (TD_ON_SLEEPQ(td2) && (td2->td_wchan == &kg->kg_completed)) { abortsleep(td2); } else { ku->ku_flags |= KUF_DOUPCALL; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (ESRCH); } /* * No new KSEG: first call: use current KSE, don't schedule an upcall * All other situations, do allocate max new KSEs and schedule an upcall. */ /* struct kse_create_args { struct kse_mailbox *mbx; int newgroup; }; */ int kse_create(struct thread *td, struct kse_create_args *uap) { struct kse *newke; struct ksegrp *newkg; struct ksegrp *kg; struct proc *p; struct kse_mailbox mbx; struct kse_upcall *newku; int err, ncpus; p = td->td_proc; if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) return (err); /* Too bad, why hasn't kernel always a cpu counter !? */ #ifdef SMP ncpus = mp_ncpus; #else ncpus = 1; #endif if (thread_debug && virtual_cpu != 0) ncpus = virtual_cpu; /* Easier to just set it than to test and set */ PROC_LOCK(p); - p->p_flag |= P_THREADED; + p->p_flag |= P_SA; PROC_UNLOCK(p); kg = td->td_ksegrp; if (uap->newgroup) { /* Have race condition but it is cheap */ if (p->p_numksegrps >= max_groups_per_proc) return (EPROCLIM); /* * If we want a new KSEGRP it doesn't matter whether * we have already fired up KSE mode before or not. * We put the process in KSE mode and create a new KSEGRP. */ newkg = ksegrp_alloc(); bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&kg->kg_startcopy, &newkg->kg_startcopy, RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); mtx_lock_spin(&sched_lock); if (p->p_numksegrps >= max_groups_per_proc) { mtx_unlock_spin(&sched_lock); ksegrp_free(newkg); return (EPROCLIM); } ksegrp_link(newkg, p); mtx_unlock_spin(&sched_lock); } else { newkg = kg; } /* * Creating upcalls more than number of physical cpu does * not help performance. */ if (newkg->kg_numupcalls >= ncpus) return (EPROCLIM); if (newkg->kg_numupcalls == 0) { /* * Initialize KSE group, optimized for MP. 
* Create KSEs as many as physical cpus, this increases * concurrent even if userland is not MP safe and can only run * on single CPU (for early version of libpthread, it is true). * In ideal world, every physical cpu should execute a thread. * If there is enough KSEs, threads in kernel can be * executed parallel on different cpus with full speed, * Concurrent in kernel shouldn't be restricted by number of * upcalls userland provides. * Adding more upcall structures only increases concurrent * in userland. * Highest performance configuration is: * N kses = N upcalls = N phyiscal cpus */ while (newkg->kg_kses < ncpus) { newke = kse_alloc(); bzero(&newke->ke_startzero, RANGEOF(struct kse, ke_startzero, ke_endzero)); #if 0 mtx_lock_spin(&sched_lock); bcopy(&ke->ke_startcopy, &newke->ke_startcopy, RANGEOF(struct kse, ke_startcopy, ke_endcopy)); mtx_unlock_spin(&sched_lock); #endif mtx_lock_spin(&sched_lock); kse_link(newke, newkg); /* Add engine */ kse_reassign(newke); mtx_unlock_spin(&sched_lock); } } newku = upcall_alloc(); newku->ku_mailbox = uap->mbx; newku->ku_func = mbx.km_func; bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t)); /* For the first call this may not have been set */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (newkg->kg_numupcalls >= ncpus) { mtx_unlock_spin(&sched_lock); upcall_free(newku); return (EPROCLIM); } upcall_link(newku, newkg); if (mbx.km_quantum) newkg->kg_upquantum = max(1, mbx.km_quantum/tick); /* * Each upcall structure has an owner thread, find which * one owns it. */ if (uap->newgroup) { /* * Because new ksegrp hasn't thread, * create an initial upcall thread to own it. */ thread_schedule_upcall(td, newku); } else { /* * If current thread hasn't an upcall structure, * just assign the upcall to it. */ if (td->td_upcall == NULL) { newku->ku_owner = td; td->td_upcall = newku; } else { /* * Create a new upcall thread to own it. */ thread_schedule_upcall(td, newku); } } mtx_unlock_spin(&sched_lock); return (0); } /* * Initialize global thread allocation resources. */ void threadinit(void) { thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, 0); ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(), NULL, NULL, ksegrp_init, NULL, UMA_ALIGN_CACHE, 0); kse_zone = uma_zcreate("KSE", sched_sizeof_kse(), NULL, NULL, kse_init, NULL, UMA_ALIGN_CACHE, 0); upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Stash an embarasingly extra thread into the zombie thread queue. */ void thread_stash(struct thread *td) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra kse into the zombie kse queue. */ void kse_stash(struct kse *ke) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra upcall into the zombie upcall queue. */ void upcall_stash(struct kse_upcall *ku) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra ksegrp into the zombie ksegrp queue. */ void ksegrp_stash(struct ksegrp *kg) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp); mtx_unlock_spin(&kse_zombie_lock); } /* * Reap zombie kse resource. 
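/*
 * Illustrative userland sketch, not part of this change: the stash functions
 * above and the thread_reap() that follows defer freeing: a context that
 * cannot safely free an object right now pushes it onto a locked zombie
 * list, and a later, safer context drains the whole list at once, freeing
 * outside the lock.  A userland rendition with a pthread mutex:
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct zombie {
    struct zombie *next;
    int            id;
};

static struct zombie *zombies;                 /* stash list head */
static pthread_mutex_t zombie_lock = PTHREAD_MUTEX_INITIALIZER;

static void
zombie_stash(struct zombie *z)
{
    pthread_mutex_lock(&zombie_lock);
    z->next = zombies;
    zombies = z;
    pthread_mutex_unlock(&zombie_lock);
}

static void
zombie_reap(void)
{
    struct zombie *z, *next;

    /* Grab the whole list in one short critical section. */
    pthread_mutex_lock(&zombie_lock);
    z = zombies;
    zombies = NULL;
    pthread_mutex_unlock(&zombie_lock);

    for (; z != NULL; z = next) {              /* free outside the lock */
        next = z->next;
        printf("reaping zombie %d\n", z->id);
        free(z);
    }
}

int
main(void)
{
    for (int i = 0; i < 3; i++) {
        struct zombie *z = malloc(sizeof(*z));
        z->id = i;
        zombie_stash(z);
    }
    zombie_reap();
    return (0);
}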
*/ void thread_reap(void) { struct thread *td_first, *td_next; struct kse *ke_first, *ke_next; struct ksegrp *kg_first, * kg_next; struct kse_upcall *ku_first, *ku_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. */ if ((!TAILQ_EMPTY(&zombie_threads)) || (!TAILQ_EMPTY(&zombie_kses)) || (!TAILQ_EMPTY(&zombie_ksegrps)) || (!TAILQ_EMPTY(&zombie_upcalls))) { mtx_lock_spin(&kse_zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); ke_first = TAILQ_FIRST(&zombie_kses); kg_first = TAILQ_FIRST(&zombie_ksegrps); ku_first = TAILQ_FIRST(&zombie_upcalls); if (td_first) TAILQ_INIT(&zombie_threads); if (ke_first) TAILQ_INIT(&zombie_kses); if (kg_first) TAILQ_INIT(&zombie_ksegrps); if (ku_first) TAILQ_INIT(&zombie_upcalls); mtx_unlock_spin(&kse_zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_runq); if (td_first->td_ucred) crfree(td_first->td_ucred); thread_free(td_first); td_first = td_next; } while (ke_first) { ke_next = TAILQ_NEXT(ke_first, ke_procq); kse_free(ke_first); ke_first = ke_next; } while (kg_first) { kg_next = TAILQ_NEXT(kg_first, kg_ksegrp); ksegrp_free(kg_first); kg_first = kg_next; } while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); ku_first = ku_next; } } } /* * Allocate a ksegrp. */ struct ksegrp * ksegrp_alloc(void) { return (uma_zalloc(ksegrp_zone, M_WAITOK)); } /* * Allocate a kse. */ struct kse * kse_alloc(void) { return (uma_zalloc(kse_zone, M_WAITOK)); } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a ksegrp. */ void ksegrp_free(struct ksegrp *td) { uma_zfree(ksegrp_zone, td); } /* * Deallocate a kse. */ void kse_free(struct kse *td) { uma_zfree(kse_zone, td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { cpu_thread_clean(td); uma_zfree(thread_zone, td); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the ksegrp structure. */ int thread_export_context(struct thread *td) { struct proc *p; struct ksegrp *kg; uintptr_t mbx; void *addr; int error,temp; mcontext_t mc; p = td->td_proc; kg = td->td_ksegrp; /* Export the user/machine context. */ get_mcontext(td, &mc, 0); addr = (void *)(&td->td_mailbox->tm_context.uc_mcontext); error = copyout(&mc, addr, sizeof(mcontext_t)); if (error) goto bad; /* Exports clock ticks in kernel mode */ addr = (caddr_t)(&td->td_mailbox->tm_sticks); temp = fuword(addr) + td->td_usticks; if (suword(addr, temp)) { error = EFAULT; goto bad; } /* Get address in latest mbox of list pointer */ addr = (void *)(&td->td_mailbox->tm_next); /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { error = EFAULT; goto bad; } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = td->td_mailbox; /* * The thread context may be taken away by * other upcall threads when we unlock * process lock. it's no longer valid to * use it again in any other places. 
*/ td->td_mailbox = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } td->td_usticks = 0; return (0); bad: PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); /* The mailbox is bad, don't use it */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); } /* * Take the list of completed mailboxes for this KSEGRP and put them on this * upcall's mailbox as it's the next one going up. */ static int thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku) { struct proc *p = kg->kg_proc; void *addr; uintptr_t mbx; addr = (void *)(&ku->ku_mailbox->km_completed); for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * This function should be called at statclock interrupt time */ int thread_statclock(int user) { struct thread *td = curthread; if (td->td_ksegrp->kg_numupcalls == 0) return (-1); if (user) { /* Current always do via ast() */ mtx_lock_spin(&sched_lock); td->td_flags |= (TDF_USTATCLOCK|TDF_ASTPENDING); mtx_unlock_spin(&sched_lock); td->td_uuticks++; } else { if (td->td_mailbox != NULL) td->td_usticks++; else { /* XXXKSE * We will call thread_user_enter() for every * kernel entry in future, so if the thread mailbox * is NULL, it must be a UTS kernel, don't account * clock ticks for it. */ } } return (0); } /* * Export state clock ticks for userland */ static int thread_update_usr_ticks(struct thread *td, int user) { struct proc *p = td->td_proc; struct kse_thr_mailbox *tmbx; struct kse_upcall *ku; struct ksegrp *kg; caddr_t addr; uint uticks; if ((ku = td->td_upcall) == NULL) return (-1); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) return (-1); if (user) { uticks = td->td_uuticks; td->td_uuticks = 0; addr = (caddr_t)&tmbx->tm_uticks; } else { uticks = td->td_usticks; td->td_usticks = 0; addr = (caddr_t)&tmbx->tm_sticks; } if (uticks) { if (suword(addr, uticks+fuword(addr))) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (-2); } } kg = td->td_ksegrp; if (kg->kg_upquantum && ticks >= kg->kg_nextupcall) { mtx_lock_spin(&sched_lock); td->td_upcall->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } return (0); } /* * Discard the current thread and exit from its context. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { struct thread *td; struct kse *ke; struct proc *p; struct ksegrp *kg; td = curthread; kg = td->td_ksegrp; p = td->td_proc; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); KASSERT(ke != NULL, ("thread exiting without a kse")); KASSERT(kg != NULL, ("thread exiting without a kse group")); PROC_LOCK_ASSERT(p, MA_OWNED); CTR1(KTR_PROC, "thread_exit: thread %p", td); KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff. 
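/*
 * Illustrative model, not from the patch: the "store, then re-check under
 * the lock" retry loop that thread_export_context() and thread_link_mboxes()
 * use above to prepend to a list whose head lives in the kernel while the
 * link field lives in user memory.  suword()/fuword() are replaced by plain
 * stores and the process lock by a pthread mutex, so only the control flow
 * carries over.
 */
#include <pthread.h>
#include <stddef.h>

struct mbox {
        struct mbox *next;
};

static struct mbox *completed_head;     /* plays the role of kg_completed */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
export_mailbox(struct mbox *mb)
{
        struct mbox *head;

        for (;;) {
                head = completed_head;          /* snapshot the current head */
                mb->next = head;                /* "suword" into the mailbox */
                pthread_mutex_lock(&list_lock);
                if (head == completed_head) {
                        /* Nobody raced us while we wrote: publish the head. */
                        completed_head = mb;
                        pthread_mutex_unlock(&list_lock);
                        return;
                }
                /* The head moved while we were storing; try again. */
                pthread_mutex_unlock(&list_lock);
        }
}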
*/ if (p->p_numthreads > 1) { thread_unlink(td); if (p->p_maxthrwaits) wakeup(&p->p_numthreads); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } /* * Because each upcall structure has an owner thread, * owner thread exits only when process is in exiting * state, so upcall to userland is no longer needed, * deleting upcall structure is safe here. * So when all threads in a group is exited, all upcalls * in the group should be automatically freed. */ if (td->td_upcall) upcall_remove(td); ke->ke_state = KES_UNQUEUED; ke->ke_thread = NULL; /* * Decide what to do with the KSE attached to this thread. */ if (ke->ke_flags & KEF_EXIT) kse_unlink(ke); else kse_reassign(ke); PROC_UNLOCK(p); td->td_kse = NULL; td->td_state = TDS_INACTIVE; #if 0 td->td_proc = NULL; #endif td->td_ksegrp = NULL; td->td_last_kse = NULL; PCPU_SET(deadthread, td); } else { PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ mtx_assert(&sched_lock, MA_OWNED); #if !defined(__alpha__) && !defined(__powerpc__) cpu_throw(td, choosethread()); #else cpu_throw(); #endif panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant held, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; KASSERT((p->p_numthreads == 1), ("Muliple threads in wait1()")); KASSERT((p->p_numksegrps == 1), ("Muliple ksegrps in wait1()")); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_standin != NULL) { thread_free(td->td_standin); td->td_standin = NULL; } cpu_thread_clean(td); } thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. */ void thread_link(struct thread *td, struct ksegrp *kg) { struct proc *p; p = kg->kg_proc; td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_ksegrp = kg; td->td_last_kse = NULL; td->td_flags = 0; td->td_kse = NULL; LIST_INIT(&td->td_contested); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); p->p_numthreads++; kg->kg_numthreads++; } void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; struct ksegrp *kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; TAILQ_REMOVE(&kg->kg_threads, td, td_kglist); kg->kg_numthreads--; /* could clear a few other things here */ } /* * Purge a ksegrp resource. When a ksegrp is preparing to * exit, it calls this function. */ static void kse_purge_group(struct thread *td) { struct ksegrp *kg; struct kse *ke; kg = td->td_ksegrp; KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__)); while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); kse_unlink(ke); } KASSERT((kg->kg_kses == 1), ("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); } /* * Purge a process's KSE resource. When a process is preparing to * exit, it calls kse_purge to release any extra KSE resources in * the process. 
*/ static void kse_purge(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse *ke; KASSERT(p->p_numthreads == 1, ("bad thread number")); while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) { TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * There is no ownership for KSE, after all threads * in the group exited, it is possible that some KSEs * were left in idle queue, gc them now. */ while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses--; kse_stash(ke); } KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) || ((kg->kg_kses == 1) && (kg == td->td_ksegrp)), ("ksegrp has wrong kg_kses: %d", kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); if (kg != td->td_ksegrp) ksegrp_stash(kg); } TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp); p->p_numksegrps++; } /* * This function is intended to be used to initialize a spare thread * for upcall. Initialize thread's large data area outside sched_lock * for thread_schedule_upcall(). */ void thread_alloc_spare(struct thread *td, struct thread *spare) { if (td->td_standin) return; if (spare == NULL) spare = thread_alloc(); td->td_standin = spare; bzero(&spare->td_startzero, (unsigned)RANGEOF(struct thread, td_startzero, td_endzero)); spare->td_proc = td->td_proc; spare->td_ucred = crhold(td->td_ucred); } /* * Create a thread and schedule it for upcall on the KSE given. * Use our thread's standin so that we don't have to allocate one. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse_upcall *ku) { struct thread *td2; mtx_assert(&sched_lock, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, * the kse_upcall must be free. * td must have a spare thread. */ KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__)); if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { panic("no reserve thread when scheduling an upcall"); return (NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); thread_link(td2, ku->ku_ksegrp); /* inherit blocked thread's context */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ ku->ku_owner = td2; td2->td_upcall = ku; td2->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (td->td_proc->p_sflag & PS_NEEDSIGCHK) td2->td_flags |= TDF_ASTPENDING; #endif td2->td_kse = NULL; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; setrunqueue(td2); return (td2); /* bogus.. should be a void function */ } void thread_signal_add(struct thread *td, int sig) { struct kse_upcall *ku; struct proc *p; sigset_t ss; int error; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&p->p_sigacts->ps_mtx, MA_OWNED); td = curthread; ku = td->td_upcall; mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); error = copyin(&ku->ku_mailbox->km_sigscaught, &ss, sizeof(sigset_t)); if (error) goto error; SIGADDSET(ss, sig); error = copyout(&ss, &ku->ku_mailbox->km_sigscaught, sizeof(sigset_t)); if (error) goto error; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); return; error: PROC_LOCK(p); sigexit(td, SIGILL); } /* * Schedule an upcall to notify a KSE process recieved signals. 
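/*
 * Illustrative sketch, not from the patch: the standin/spare idea behind
 * thread_alloc_spare() and thread_schedule_upcall() above -- allocate the
 * spare object in a context that may sleep, so that handing it out later
 * under sched_lock is nothing more than a pointer swap.  All names here are
 * invented for the example.
 */
#include <stdlib.h>

struct spare_obj {
        int so_dummy;
};

static struct spare_obj *standin;       /* plays td->td_standin */

static void
prepare_spare(void)                     /* call where allocation may sleep */
{
        if (standin == NULL)
                standin = malloc(sizeof(*standin));
}

static struct spare_obj *
take_spare(void)                        /* call under a lock: must not allocate */
{
        struct spare_obj *so;

        so = standin;
        standin = NULL;                 /* owner must refill via prepare_spare() */
        return (so);
}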
* */ void thread_signal_upcall(struct thread *td) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); return; } void thread_switchout(struct thread *td) { struct kse_upcall *ku; mtx_assert(&sched_lock, MA_OWNED); /* * If the outgoing thread is in threaded group and has never * scheduled an upcall, decide whether this is a short * or long term event and thus whether or not to schedule * an upcall. * If it is a short term event, just suspend it in * a way that takes its KSE with it. * Select the events for which we want to schedule upcalls. * For now it's just sleep. * XXXKSE eventually almost any inhibition could do. */ if (TD_CAN_UNBIND(td) && (td->td_standin) && TD_ON_SLEEPQ(td)) { /* * Release ownership of upcall, and schedule an upcall * thread, this new upcall thread becomes the owner of * the upcall structure. */ ku = td->td_upcall; ku->ku_owner = NULL; td->td_upcall = NULL; td->td_flags &= ~TDF_CAN_UNBIND; thread_schedule_upcall(td, ku); } } /* * Setup done on the thread when it enters the kernel. * XXXKSE Presently only for syscalls but eventually all kernel entries. */ void thread_user_enter(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse_upcall *ku; struct kse_thr_mailbox *tmbx; kg = td->td_ksegrp; /* * First check that we shouldn't just abort. * But check if we are the single thread first! */ PROC_LOCK(p); if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } PROC_UNLOCK(p); /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. There is always the * possibility that we could do this lazily (in kse_reassign()), * but for now do it every time. */ kg = td->td_ksegrp; if (kg->kg_numupcalls) { ku = td->td_upcall; KASSERT(ku, ("%s: no upcall owned", __func__)); KASSERT((ku->ku_owner == td), ("%s: wrong owner", __func__)); KASSERT(!TD_CAN_UNBIND(td), ("%s: can unbind", __func__)); ku->ku_mflags = fuword((void *)&ku->ku_mailbox->km_flags); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) { td->td_mailbox = NULL; } else { td->td_mailbox = tmbx; if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (ku->ku_mflags & KMF_NOUPCALL) td->td_flags &= ~TDF_CAN_UNBIND; else td->td_flags |= TDF_CAN_UNBIND; mtx_unlock_spin(&sched_lock); } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. */ int thread_userret(struct thread *td, struct trapframe *frame) { int error = 0, upcalls, uts_crit; struct kse_upcall *ku; struct ksegrp *kg, *kg2; struct proc *p; struct timespec ts; p = td->td_proc; kg = td->td_ksegrp; /* Nothing to do with non-threaded group/process */ if (td->td_ksegrp->kg_numupcalls == 0) return (0); /* * Stat clock interrupt hit in userland, it * is returning from interrupt, charge thread's * userland time for UTS. 
*/ if (td->td_flags & TDF_USTATCLOCK) { thread_update_usr_ticks(td, 1); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_USTATCLOCK; mtx_unlock_spin(&sched_lock); if (kg->kg_completed || (td->td_upcall->ku_flags & KUF_DOUPCALL)) thread_user_enter(p, td); } uts_crit = (td->td_mailbox == NULL); ku = td->td_upcall; /* * Optimisation: * This thread has not started any upcall. * If there is no work to report other than ourself, * then it can return direct to userland. */ if (TD_CAN_UNBIND(td)) { mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_CAN_UNBIND; if ((td->td_flags & TDF_NEEDSIGCHK) == 0 && (kg->kg_completed == NULL) && (ku->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_upquantum && ticks < kg->kg_nextupcall)) { mtx_unlock_spin(&sched_lock); thread_update_usr_ticks(td, 0); nanotime(&ts); error = copyout(&ts, (caddr_t)&ku->ku_mailbox->km_timeofday, sizeof(ts)); td->td_mailbox = 0; ku->ku_mflags = 0; if (error) goto out; return (0); } mtx_unlock_spin(&sched_lock); error = thread_export_context(td); if (error) { /* * Failing to do the KSE operation just defaults * back to synchonous operation, so just return from * the syscall. */ goto out; } /* * There is something to report, and we own an upcall * strucuture, we can go to userland. * Turn ourself into an upcall thread. */ mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); } else if (td->td_mailbox && (ku == NULL)) { error = thread_export_context(td); /* possibly upcall with error? */ PROC_LOCK(p); /* * There are upcall threads waiting for * work to do, wake one of them up. * XXXKSE Maybe wake all of them up. */ if (!error && kg->kg_upsleeps) wakeup_one(&kg->kg_completed); mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind")); if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { upcalls = 0; FOREACH_KSEGRP_IN_PROC(p, kg2) { if (kg2->kg_numupcalls == 0) upcalls++; else upcalls += kg2->kg_numupcalls; } if (upcalls >= max_threads_per_proc) break; mtx_unlock_spin(&sched_lock); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", NULL)) { mtx_lock_spin(&sched_lock); break; } else { mtx_lock_spin(&sched_lock); } } p->p_maxthrwaits--; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } if (td->td_flags & TDF_UPCALLING) { uts_crit = 0; kg->kg_nextupcall = ticks+kg->kg_upquantum; /* * There is no more work to do and we are going to ride * this thread up to userland as an upcall. * Do the last parts of the setup needed for the upcall. */ CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) ku->ku_flags &= ~KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); /* * Set user context to the UTS */ if (!(ku->ku_mflags & KMF_NOUPCALL)) { cpu_set_upcall_kse(td, ku); error = suword(&ku->ku_mailbox->km_curthread, 0); if (error) goto out; } /* * Unhook the list of completed threads. * anything that completes after this gets to * come in next time. * Put the list of completed thread mailboxes on * this KSE's mailbox. 
*/ if (!(ku->ku_mflags & KMF_NOCOMPLETED) && (error = thread_link_mboxes(kg, ku)) != 0) goto out; } if (!uts_crit) { nanotime(&ts); error = copyout(&ts, &ku->ku_mailbox->km_timeofday, sizeof(ts)); } out: if (error) { /* * Things are going to be so screwed we should just kill * the process. * how do we do that? */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); PROC_UNLOCK(td->td_proc); } else { /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); } ku->ku_mflags = 0; /* * Clear thread mailbox first, then clear system tick count. * The order is important because thread_statclock() use * mailbox pointer to see if it is an userland thread or * an UTS kernel thread. */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); /* go sync */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accellerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(int force_exit) { struct thread *td; struct thread *td2; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); - if ((p->p_flag & P_THREADED) == 0 && p->p_numthreads == 1) + if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1) return (0); /* Is someone already single threading? */ if (p->p_singlethread) return (1); if (force_exit == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; } else p->p_flag &= ~P_SINGLE_EXIT; p->p_flag |= P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = td; while ((p->p_numthreads - p->p_suspcount) != 1) { FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { if (force_exit == SINGLE_EXIT) { if (TD_IS_SUSPENDED(td2)) { thread_unsuspend_one(td2); } if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } } else { if (TD_IS_SUSPENDED(td2)) continue; /* * maybe other inhibitted states too? * XXXKSE Is it totally safe to * suspend a non-interruptable thread? */ if (td2->td_inhibitors & (TDI_SLEEPING | TDI_SWAPPED)) thread_suspend_one(td2); } } } /* * Maybe we suspended some threads.. was it enough? */ if ((p->p_numthreads - p->p_suspcount) == 1) break; /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_one(td); DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock_spin(&sched_lock); } if (force_exit == SINGLE_EXIT) { if (td->td_upcall) upcall_remove(td); kse_purge(p, td); } mtx_unlock_spin(&sched_lock); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). 
* If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p)) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if (return_instead) return (1); mtx_lock_spin(&sched_lock); thread_stopped(p); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { while (mtx_owned(&Giant)) mtx_unlock(&Giant); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_exit(); else thr_exit1(); } /* * When a thread suspends, it just * moves to the processes's suspend queue * and stays there. */ thread_suspend_one(td); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq); /* * Hack: If we are suspending but are on the sleep queue * then we are in msleep or the cv equivalent. We * want to look like we have two Inhibitors. * May already be set.. doesn't matter. */ if (TD_ON_SLEEPQ(td)) TD_SET_SLEEPING(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_suspended, td, td_runq); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. 
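/*
 * Illustrative sketch, not from the patch: the decision table in the
 * thread_suspend_check() comment above, written out as a tiny function.
 * It assumes a single-threading stop is actually in effect (the
 * P_SHOULDSTOP() loop condition) and collapses "suspend" and "exit" into
 * return codes so the table can be read in isolation.
 */
enum sc_action {
        SC_CONTINUE,                    /* return 0 to the caller */
        SC_ABORT,                       /* return 1: caller backs out */
        SC_SUSPEND_THEN_CONTINUE,       /* park until single-threading ends */
        SC_EXIT                         /* thread_exit()/thr_exit1() */
};

static enum sc_action
suspend_check_model(int single_exit, int return_instead, int is_single_threader)
{
        if (is_single_threader)
                return (SC_CONTINUE);           /* the single threader never stops */
        if (return_instead)
                return (SC_ABORT);              /* both table rows: returns 1 immediately */
        if (single_exit)
                return (SC_EXIT);               /* process is exiting: this thread dies */
        return (SC_SUSPEND_THEN_CONTINUE);      /* resumes and returns 0 when ST ends */
}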
*/ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 116360) +++ head/sys/kern/kern_sig.c (revision 116361) @@ -1,2611 +1,2611 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined (__alpha__) && !defined(COMPAT_43) #error "You *really* need COMPAT_43 on the alpha for longjmp(3)" #endif #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ static int coredump(struct thread *); static char *expand_name(const char *, uid_t, pid_t); static int killpg1(struct thread *td, int sig, int pgid, int all); static int issignal(struct thread *p); static int sigprop(int sig); static void stop(struct proc *); static void tdsigwakeup(struct thread *td, int sig, sig_t action); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static int kern_sigtimedwait(struct thread *td, sigset_t set, siginfo_t *info, struct timespec *timeout); struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); /* * Signal properties and actions. 
* The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ #define SA_PROC 0x80 /* deliverable to any thread */ static int sigproptbl[NSIG] = { SA_KILL|SA_PROC, /* SIGHUP */ SA_KILL|SA_PROC, /* SIGINT */ SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL|SA_PROC, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL|SA_PROC, /* SIGPIPE */ SA_KILL|SA_PROC, /* SIGALRM */ SA_KILL|SA_PROC, /* SIGTERM */ SA_IGNORE|SA_PROC, /* SIGURG */ SA_STOP|SA_PROC, /* SIGSTOP */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */ SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */ SA_IGNORE|SA_PROC, /* SIGCHLD */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */ SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */ SA_IGNORE|SA_PROC, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL|SA_PROC, /* SIGVTALRM */ SA_KILL|SA_PROC, /* SIGPROF */ SA_IGNORE|SA_PROC, /* SIGWINCH */ SA_IGNORE|SA_PROC, /* SIGINFO */ SA_KILL|SA_PROC, /* SIGUSR1 */ SA_KILL|SA_PROC, /* SIGUSR2 */ }; /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). * XXXKSE the check for a pending stop is not done under KSE * * MP SAFE. */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); mtx_assert(&sched_lock, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_siglist or * unmasked in td_sigmask. */ void signotify(struct thread *td) { struct proc *p; sigset_t set; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If our mask changed we may have to move signal that were * previously masked by all threads to our siglist. */ set = p->p_siglist; SIGSETNAND(set, td->td_sigmask); SIGSETNAND(p->p_siglist, set); SIGSETOR(td->td_siglist, set); if (SIGPENDING(td)) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; mtx_unlock_spin(&sched_lock); } } int sigonstack(size_t sp) { struct proc *p = curthread->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); return ((p->p_flag & P_ALTSTACK) ? #if defined(COMPAT_43) || defined(COMPAT_SUNOS) ((p->p_sigstk.ss_size == 0) ? 
(p->p_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size)) #else ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < NSIG) return (sigproptbl[_SIG_IDX(sig)]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction * * MPSAFE */ int kern_sigaction(td, sig, act, oact, flags) struct thread *td; register int sig; struct sigaction *act, *oact; int flags; { struct sigacts *ps; struct thread *td0; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; oact->sa_flags = 0; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) oact->sa_flags |= SA_SIGINFO; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); #ifdef COMPAT_SUNOS if (act->sa_flags & SA_USERTRAMP) SIGADDSET(ps->ps_usertramp, sig); else SIGDELSET(ps->ps_usertramp, sig); #endif if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. 
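/*
 * Illustrative userland example, not from the patch: a sigaction(2) call
 * exercising the flag bits kern_sigaction() decodes above.  SA_SIGINFO puts
 * the signal in ps_siginfo so the three-argument handler form is used, and
 * leaving SA_RESTART out lands it in ps_sigintr, so interrupted syscalls
 * fail with EINTR instead of restarting.
 */
#include <signal.h>
#include <string.h>

static void
on_term(int sig, siginfo_t *si, void *uc)
{
        (void)sig;
        (void)si;
        (void)uc;
}

static int
install_term_handler(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = on_term;      /* three-argument handler */
        sa.sa_flags = SA_SIGINFO;       /* no SA_RESTART: EINTR on interruption */
        sigemptyset(&sa.sa_mask);       /* no extra signals blocked in the handler */
        return (sigaction(SIGTERM, &sa, NULL));
}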
*/ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ SIGDELSET(p->p_siglist, sig); FOREACH_THREAD_IN_PROC(p, td0) SIGDELSET(td0->td_siglist, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif /* * MPSAFE */ int sigaction(td, uap) struct thread *td; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif /* * MPSAFE */ int freebsd4_sigaction(td, uap) struct thread *td; register struct freebsd4_sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif /* * MPSAFE */ int osigaction(td, uap) struct thread *td; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) && !defined(__alpha__) /* Avoid replicating the same stub everywhere */ int osigreturn(td, uap) struct thread *td; struct osigreturn_args *uap; { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(ps->ps_sigignore, i); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset signals for an exec of the specified process. */ void execsigs(p) register struct proc *p; { register struct sigacts *ps; register int sig; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); SIGDELSET(ps->ps_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(p->p_siglist, sig); /* * There is only one thread at this point. */ SIGDELSET(FIRST_THREAD_IN_PROC(p)->td_siglist, sig); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Clear out the td's sigmask. Normal processes use the proc sigmask. */ SIGEMPTYSET(FIRST_THREAD_IN_PROC(p)->td_sigmask); /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ p->p_sigstk.ss_flags = SS_DISABLE; p->p_sigstk.ss_size = 0; p->p_sigstk.ss_sp = 0; p->p_flag &= ~P_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(td, how, set, oset, old) struct thread *td; int how; sigset_t *set, *oset; int old; { int error; PROC_LOCK(td->td_proc); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); SIGSETOR(td->td_sigmask, *set); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); break; case SIG_SETMASK: SIG_CANTMASK(*set); if (old) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; signotify(td); break; default: error = EINVAL; break; } } PROC_UNLOCK(td->td_proc); return (error); } /* * sigprocmask() - MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sigprocmask(td, uap) register struct thread *td; struct sigprocmask_args *uap; { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? 
&oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * osigprocmask() - MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(td, uap) register struct thread *td; struct osigprocmask_args *uap; { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif /* * MPSAFE */ int sigwait(struct thread *td, struct sigwait_args *uap) { siginfo_t info; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &info, NULL); if (error) return (error); error = copyout(&info.si_signo, uap->sig, sizeof(info.si_signo)); /* Repost if we got an error. */ if (error && info.si_signo) { PROC_LOCK(td->td_proc); tdsignal(td, info.si_signo); PROC_UNLOCK(td->td_proc); } return (error); } /* * MPSAFE */ int sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; siginfo_t info; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &info, timeout); if (error) return (error); error = copyout(&info, uap->info, sizeof(info)); /* Repost if we got an error. */ if (error && info.si_signo) { PROC_LOCK(td->td_proc); tdsignal(td, info.si_signo); PROC_UNLOCK(td->td_proc); } return (error); } /* * MPSAFE */ int sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { siginfo_t info; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &info, NULL); if (error) return (error); error = copyout(&info, uap->info, sizeof(info)); /* Repost if we got an error. */ if (error && info.si_signo) { PROC_LOCK(td->td_proc); tdsignal(td, info.si_signo); PROC_UNLOCK(td->td_proc); } return (error); } static int kern_sigtimedwait(struct thread *td, sigset_t set, siginfo_t *info, struct timespec *timeout) { register struct sigacts *ps; sigset_t oldmask; struct proc *p; int error; int sig; int hz; p = td->td_proc; error = 0; sig = 0; SIG_CANTMASK(set); PROC_LOCK(p); ps = p->p_sigacts; oldmask = td->td_sigmask; td->td_sigmask = set; signotify(td); mtx_lock(&ps->ps_mtx); sig = cursig(td); if (sig) goto out; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout) { struct timeval tv; if (timeout->tv_nsec > 1000000000) { error = EINVAL; goto out; } TIMESPEC_TO_TIMEVAL(&tv, timeout); hz = tvtohz(&tv); } else hz = 0; mtx_unlock(&ps->ps_mtx); error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "pause", hz); mtx_lock(&ps->ps_mtx); if (error == EINTR) error = 0; else if (error) goto out; sig = cursig(td); out: td->td_sigmask = oldmask; if (sig) { sig_t action; action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? 
&td->td_oldsigmask : &td->td_sigmask, 0); #endif _STOPEVENT(p, S_SIG, sig); if (action == SIG_DFL) sigexit(td, sig); /* NOTREACHED */ SIGDELSET(td->td_siglist, sig); info->si_signo = sig; info->si_code = 0; } else mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (error); } /* * MPSAFE */ int sigpending(td, uap) struct thread *td; struct sigpending_args *uap; { struct proc *p = td->td_proc; sigset_t siglist; PROC_LOCK(p); siglist = p->p_siglist; SIGSETOR(siglist, td->td_siglist); PROC_UNLOCK(p); return (copyout(&siglist, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif /* * MPSAFE */ int osigpending(td, uap) struct thread *td; struct osigpending_args *uap; { struct proc *p = td->td_proc; sigset_t siglist; PROC_LOCK(p); siglist = p->p_siglist; SIGSETOR(siglist, td->td_siglist); PROC_UNLOCK(p); SIG2OSIG(siglist, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* * MPSAFE */ /* ARGSUSED */ int osigvec(td, uap) struct thread *td; register struct osigvec_args *uap; { struct sigvec vec; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ #ifdef COMPAT_SUNOS nsap->sa_flags |= SA_USERTRAMP; #endif } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; #ifdef COMPAT_SUNOS vec.sv_flags &= ~SA_NOCLDSTOP; #endif error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif /* * MPSAFE */ int osigblock(td, uap) register struct thread *td; struct osigblock_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETOR(td->td_sigmask, set); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif /* * MPSAFE */ int osigsetmask(td, uap) struct thread *td; struct osigsetmask_args *uap; { struct proc *p = td->td_proc; sigset_t set; OSIG2SIG(uap->mask, set); SIG_CANTMASK(set); PROC_LOCK(p); SIG2OSIG(td->td_sigmask, td->td_retval[0]); SIGSETLO(td->td_sigmask, set); signotify(td); PROC_UNLOCK(p); return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Suspend process until signal, providing mask to be set * in the meantime. Note nonstandard calling convention: * libc stub passes mask, not pointer, to save a copyin. ***** XXXKSE this doesn't make sense under KSE. ***** Do we suspend the thread or all threads in the process? ***** How do we suspend threads running NOW on another processor? 
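/*
 * Illustrative userland example, not from the patch: the usual pattern for
 * the kern_sigtimedwait() path above -- block the signal first so it stays
 * pending, then collect it synchronously with sigtimedwait(2).
 */
#include <signal.h>
#include <stdio.h>
#include <time.h>

static int
wait_for_usr1(void)
{
        sigset_t set;
        siginfo_t info;
        struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };

        sigemptyset(&set);
        sigaddset(&set, SIGUSR1);
        if (sigprocmask(SIG_BLOCK, &set, NULL) == -1)   /* keep it pending */
                return (-1);
        if (sigtimedwait(&set, &info, &ts) == -1)       /* EAGAIN on timeout */
                return (-1);
        printf("got signal %d\n", info.si_signo);
        return (0);
}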
*/ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sigsuspend(td, uap) struct thread *td; struct sigsuspend_args *uap; { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; SIG_CANTMASK(mask); td->td_sigmask = mask; signotify(td); while (msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* * MPSAFE */ /* ARGSUSED */ int osigsuspend(td, uap) struct thread *td; struct osigsuspend_args *uap; { struct proc *p = td->td_proc; sigset_t mask; PROC_LOCK(p); td->td_oldsigmask = td->td_sigmask; td->td_pflags |= TDP_OLDMASK; OSIG2SIG(uap->mask, mask); SIG_CANTMASK(mask); SIGSETLO(td->td_sigmask, mask); signotify(td); while (msleep(p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0) /* void */; PROC_UNLOCK(p); /* always return EINTR rather than ERESTART... */ return (EINTR); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* * MPSAFE */ /* ARGSUSED */ int osigstack(td, uap) struct thread *td; register struct osigstack_args *uap; { struct proc *p = td->td_proc; struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } PROC_LOCK(p); oss.ss_sp = p->p_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { p->p_sigstk.ss_sp = nss.ss_sp; p->p_sigstk.ss_size = 0; p->p_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; p->p_flag |= P_ALTSTACK; } PROC_UNLOCK(p); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sigaltstack(td, uap) struct thread *td; register struct sigaltstack_args *uap; { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; PROC_LOCK(p); oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = p->p_sigstk; oss->ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? 
SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) { PROC_UNLOCK(p); return (EPERM); } if ((ss->ss_flags & ~SS_DISABLE) != 0) { PROC_UNLOCK(p); return (EINVAL); } if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) { PROC_UNLOCK(p); return (ENOMEM); } p->p_sigstk = *ss; p->p_flag |= P_ALTSTACK; } else { p->p_flag &= ~P_ALTSTACK; } } PROC_UNLOCK(p); return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(td, sig, pgid, all) register struct thread *td; int sig, pgid, all; { register struct proc *p; struct pgrp *pgrp; int nfound = 0; if (all) { /* * broadcast */ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) { PROC_UNLOCK(p); continue; } if (p->p_state == PRS_ZOMBIE) { PROC_UNLOCK(p); continue; } if (p_cansignal(td, p, sig) == 0) { nfound++; if (sig) psignal(p, sig); } PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (nfound ? 0 : ESRCH); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* * MPSAFE */ /* ARGSUSED */ int kill(td, uap) register struct thread *td; register struct kill_args *uap; { register struct proc *p; int error; if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); if (uap->pid > 0) { /* kill single process */ if ((p = pfind(uap->pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0)); } /* NOTREACHED */ } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* * MPSAFE */ /* ARGSUSED */ int okillpg(td, uap) struct thread *td; register struct okillpg_args *uap; { if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); return (killpg1(td, uap->signum, uap->pgid, 0)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Send a signal to a process group. */ void gsignal(pgid, sig) int pgid, sig; { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(pgrp, sig, checkctty) struct pgrp *pgrp; int sig, checkctty; { register struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (checkctty == 0 || p->p_flag & P_CONTROLT) psignal(p, sig); PROC_UNLOCK(p); } } } /* * Send a signal caused by a trap to the current thread. * If it will be caught immediately, deliver it with correct code. 
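/*
 * Illustrative userland example, not from the patch: the pid encodings that
 * kill() dispatches on above (error handling omitted).
 */
#include <sys/types.h>
#include <signal.h>

static void
kill_variants(pid_t child, pid_t pgrp)
{
        (void)kill(child, SIGTERM);     /* pid > 0: exactly one process */
        (void)kill(0, SIGHUP);          /* pid == 0: caller's own process group */
        (void)kill(-pgrp, SIGTERM);     /* pid < -1: process group 'pgrp', like killpg() */
        (void)kill(-1, SIGUSR1);        /* pid == -1: broadcast, permission-checked per process */
}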
* Otherwise, post it normally. * * MPSAFE */ void trapsignal(struct thread *td, int sig, u_long code) { struct sigacts *ps; struct proc *p; p = td->td_proc; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { p->p_stats->p_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig, &td->td_sigmask, code); SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); } else { mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsignal(td, sig); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If we know the signal is bound for a specific thread then we * assume that we are in that threads context. This is the case * for SIGXCPU, SIGILL, etc. Otherwise someone did a kill() from * userland and the real thread doesn't actually matter. */ if ((prop & SA_PROC) != 0 && curthread->td_proc == p) return (curthread); /* * We should search for the first thread that is blocked in * sigsuspend with this signal unmasked. */ /* XXX */ /* * Find the first thread in the proc that doesn't have this signal * masked. */ FOREACH_THREAD_IN_PROC(p, td) if (!SIGISMEMBER(td->td_sigmask, sig)) return (td); return (FIRST_THREAD_IN_PROC(p)); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * MPSAFE */ void psignal(struct proc *p, int sig) { struct thread *td; int prop; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); /* * Find a thread to deliver the signal to. */ td = sigtd(p, sig, prop); tdsignal(td, sig); } /* * MPSAFE */ void tdsignal(struct thread *td, int sig) { struct proc *p; register sig_t action; sigset_t *siglist; struct thread *td0; register int prop; struct sigacts *ps; KASSERT(_SIG_VALID(sig), ("tdsignal(): invalid signal %d\n", sig)); p = td->td_proc; ps = p->p_sigacts; PROC_LOCK_ASSERT(p, MA_OWNED); KNOTE(&p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); /* * If this thread is blocking this signal then we'll leave it in the * proc so that we can find it in the first thread that unblocks it. */ if (SIGISMEMBER(td->td_sigmask, sig)) siglist = &p->p_siglist; else siglist = &td->td_siglist; /* * If proc is traced, always give parent a chance; * if signal event is tracked by procfs, give *that* * a chance, as well. 
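sigtd() and tdsignal() above pick a delivery thread by searching for one that does not have the signal in td_sigmask, and park process-wide signals on p_siglist when every thread blocks them. The userland-visible contract is the usual POSIX one: a process-directed signal is delivered to some thread with it unblocked. A sketch of that behaviour, assuming pthreads (link with -pthread); the sleep() is a crude way to let the worker adjust its mask first.

	#include <sys/types.h>
	#include <pthread.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static volatile sig_atomic_t got;

	static void
	handler(int sig)
	{
		got = sig;
	}

	static void *
	worker(void *arg)
	{
		sigset_t set;

		(void)arg;
		/* Unblock SIGUSR1 in this thread only; every other thread keeps
		   it blocked, so process-directed delivery lands here, mirroring
		   the sigtd() search for an unblocking thread. */
		sigemptyset(&set);
		sigaddset(&set, SIGUSR1);
		pthread_sigmask(SIG_UNBLOCK, &set, NULL);
		while (got == 0)
			pause();
		printf("worker received signal %d\n", (int)got);
		return (NULL);
	}

	int
	main(void)
	{
		struct sigaction sa;
		pthread_t td;
		sigset_t set;

		sa.sa_handler = handler;
		sa.sa_flags = 0;
		sigemptyset(&sa.sa_mask);
		sigaction(SIGUSR1, &sa, NULL);

		/* Block SIGUSR1 before creating the worker; the worker re-enables it. */
		sigemptyset(&set);
		sigaddset(&set, SIGUSR1);
		pthread_sigmask(SIG_BLOCK, &set, NULL);

		pthread_create(&td, NULL, worker, NULL);
		sleep(1);
		kill(getpid(), SIGUSR1);
		pthread_join(td, NULL);
		return (0);
	}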
*/ if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) { action = SIG_DFL; } else { /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig) || (p->p_flag & P_WEXIT)) { mtx_unlock(&ps->ps_mtx); return; } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; mtx_unlock(&ps->ps_mtx); } if (prop & SA_CONT) { SIG_STOPSIGMASK(p->p_siglist); /* * XXX Should investigate leaving STOP and CONT sigs only in * the proc's siglist. */ FOREACH_THREAD_IN_PROC(p, td0) SIG_STOPSIGMASK(td0->td_siglist); } if (prop & SA_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SA_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) return; SIG_CONTSIGMASK(p->p_siglist); FOREACH_THREAD_IN_PROC(p, td0) SIG_CONTSIGMASK(td0->td_siglist); p->p_flag &= ~P_CONTINUED; } SIGADDSET(*siglist, sig); signotify(td); /* uses schedlock */ /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG))) return; /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediatly, such as * waking up threads so that they can cross the user boundary. * We try do the per-process part here. */ if (P_SHOULDSTOP(p)) { /* * The process is in stopped mode. All the threads should be * either winding down or already on the suspended queue. */ if (p->p_flag & P_TRACED) { /* * The traced process is already stopped, * so no further action is necessary. * No signal can restart us. */ goto out; } if (sig == SIGKILL) { /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED; goto runfast; } if (prop & SA_CONT) { /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in siglist as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * siglist. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; p->p_flag |= P_CONTINUED; if (action == SIG_DFL) { SIGDELSET(*siglist, sig); } else if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? * It would seem that the answer would be to * run an upcall in the next KSE to run, and * deliver the signal that way. In a NON KSE * process, we need to make sure that the * single thread is runnable asap. * XXXKSE for now however, make them all run. */ goto runfast; } /* * The signal is not ignored or caught. */ mtx_lock_spin(&sched_lock); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); goto out; } if (prop & SA_STOP) { /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. 
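The SA_STOP/SA_CONT handling above is what makes the classic job-control sequence observable from userland: a stop signal suspends every thread in the target process, and SIGCONT resumes it and clears pending stop signals. A short sketch of the visible behaviour using the standard waitpid() status macros; nothing here is specific to this revision.

	#include <sys/types.h>
	#include <sys/wait.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		pid_t pid;
		int status;

		if ((pid = fork()) == 0) {
			for (;;)
				pause();	/* child just idles */
		}

		sleep(1);
		kill(pid, SIGSTOP);
		waitpid(pid, &status, WUNTRACED);
		if (WIFSTOPPED(status))
			printf("child stopped by signal %d\n", WSTOPSIG(status));

		kill(pid, SIGCONT);		/* resumes the child, clears pending stops */
		sleep(1);
		kill(pid, SIGKILL);
		waitpid(pid, &status, 0);
		if (WIFSIGNALED(status))
			printf("child killed by signal %d\n", WTERMSIG(status));
		return (0);
	}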
*/ p->p_flag |= P_STOPPED_SIG; SIGDELSET(*siglist, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) { if (td->td_flags & TDF_CVWAITQ) cv_abort(td); else abortsleep(td); } mtx_unlock_spin(&sched_lock); goto out; /* * XXXKSE What about threads that are waiting on mutexes? * Shouldn't they abort too? * No, hopefully mutexes are short lived.. They'll * eventually hit thread_suspend_check(). */ } else if (p->p_state == PRS_NORMAL) { if ((p->p_flag & P_TRACED) || (action != SIG_DFL) || !(prop & SA_STOP)) { mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action); mtx_unlock_spin(&sched_lock); goto out; } if (prop & SA_STOP) { if (p->p_flag & P_PPWAIT) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td0) { if (TD_IS_SLEEPING(td0) && (td->td_flags & TDF_SINTR)) thread_suspend_one(td0); } thread_stopped(p); if (p->p_numthreads == p->p_suspcount) { SIGDELSET(p->p_siglist, p->p_xstat); FOREACH_THREAD_IN_PROC(p, td0) SIGDELSET(td0->td_siglist, p->p_xstat); } mtx_unlock_spin(&sched_lock); goto out; } else goto runfast; /* NOTREACHED */ } else { /* Not in "NORMAL" state. discard the signal. */ SIGDELSET(*siglist, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: mtx_lock_spin(&sched_lock); tdsigwakeup(td, sig, action); thread_unsuspend(p); mtx_unlock_spin(&sched_lock); out: /* If we jump here, sched_lock should not be owned. */ mtx_assert(&sched_lock, MA_NOTOWNED); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action) { struct proc *p = td->td_proc; register int prop; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); prop = sigprop(sig); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. */ if ((action == SIG_DFL) && (prop & SA_KILL)) { if (td->td_priority > PUSER) { td->td_priority = PUSER; } } if (TD_IS_SLEEPING(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) { return; } /* * Process is sleeping and traced. Make it runnable * so it can discover the signal in issignal() and stop * for its parent. */ if (p->p_flag & P_TRACED) { p->p_flag &= ~P_STOPPED_TRACE; } else { /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SA_CONT) && action == SIG_DFL) { SIGDELSET(p->p_siglist, sig); /* * It may be on either list in this state. * Remove from both for now. */ SIGDELSET(td->td_siglist, sig); return; } /* * Raise priority to at least PUSER. */ if (td->td_priority > PUSER) { td->td_priority = PUSER; } } if (td->td_flags & TDF_CVWAITQ) cv_abort(td); else abortsleep(td); } #ifdef SMP else { /* * Other states do nothing with the signal immediatly, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. 
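tdsigwakeup() only aborts sleeps that were entered interruptibly (TDF_SINTR); uninterruptible sleeps notice the signal later, on return through trap() or syscall(). From userland the same distinction shows up as a blocking system call failing with EINTR when a handler installed without SA_RESTART interrupts it. A minimal sketch:

	#include <errno.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static void
	handler(int sig)
	{
		(void)sig;	/* nothing to do; the point is to interrupt read() */
	}

	int
	main(void)
	{
		struct sigaction sa;
		char buf[1];
		ssize_t n;

		sa.sa_handler = handler;
		sa.sa_flags = 0;	/* no SA_RESTART: let read() fail with EINTR */
		sigemptyset(&sa.sa_mask);
		sigaction(SIGALRM, &sa, NULL);

		alarm(2);			/* interrupt the sleep below after 2 seconds */
		n = read(STDIN_FILENO, buf, 1);	/* blocks in an interruptible sleep */
		if (n == -1 && errno == EINTR)
			printf("read interrupted by a signal\n");
		return (0);
	}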
*/ if (TD_IS_RUNNING(td) && td != curthread) { forward_signal(td); } } #endif } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(td) struct thread *td; { struct proc *p; struct sigacts *ps; sigset_t sigpending; register int sig, prop; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_siglist; SIGSETNAND(sigpending, td->td_sigmask); if (p->p_flag & P_PPWAIT) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); sig = sig_ffs(&sigpending); _STOPEVENT(p, S_SIG, sig); /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { SIGDELSET(td->td_siglist, sig); continue; } if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { /* * If traced, always stop. */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Stopping for traced signal"); p->p_xstat = sig; PROC_LOCK(p->p_pptr); psignal(p->p_pptr, SIGCHLD); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); stop(p); /* uses schedlock too eventually */ thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock(&ps->ps_mtx); /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ SIGDELSET(td->td_siglist, sig); /* clear old signal */ sig = p->p_xstat; if (sig == 0) continue; /* * If the traced bit got turned off, go back up * to the top to rescan signals. This ensures * that p_sig* and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) continue; /* * Put the new signal into td_siglist. If the * signal is being masked, look for other signals. */ SIGADDSET(td->td_siglist, sig); if (SIGISMEMBER(td->td_sigmask, sig)) continue; signotify(td); } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process * with default action, stop here, * then clear the signal. However, * if process is member of an orphaned * process group, ignore tty stop signals. 
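issignal() repeatedly computes "pending and not blocked" (SIGSETNAND of td_siglist and td_sigmask) and then takes the lowest-numbered set bit with sig_ffs(). A self-contained sketch of the same selection on a single machine word; the real sigset_t spans several words and the helper names here are invented for illustration.

	#include <stdio.h>
	#include <strings.h>		/* ffs() */

	/*
	 * Return the lowest signal number that is pending and not masked,
	 * or 0 if there is none.  Bit (sig - 1) represents signal sig,
	 * matching the usual sigset encoding, so ffs() yields sig directly.
	 */
	static int
	first_deliverable(unsigned int pending, unsigned int mask)
	{
		return (ffs(pending & ~mask));
	}

	int
	main(void)
	{
		unsigned int pending = (1u << (10 - 1)) | (1u << (2 - 1));
		unsigned int mask = 1u << (2 - 1);	/* signal 2 is blocked */

		/* Signals 2 and 10 are pending, 2 is masked, so 10 is chosen. */
		printf("next signal: %d\n", first_deliverable(pending, mask));
		return (0);
	}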
*/ if (prop & SA_STOP) { if (p->p_flag & P_TRACED || (p->p_pgrp->pg_jobc == 0 && prop & SA_TTYSTOP)) break; /* == ignore */ mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.mtx_object, "Catching SIGSTOP"); p->p_flag |= P_STOPPED_SIG; p->p_xstat = sig; mtx_lock_spin(&sched_lock); thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); DROP_GIANT(); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock(&ps->ps_mtx); break; } else if (prop & SA_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SA_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } SIGDELSET(td->td_siglist, sig); /* take the signal! */ } /* NOTREACHED */ } /* * Put the argument process into the stopped state and notify the parent * via wakeup. Signals are handled elsewhere. The process must not be * on the run queue. Must be called with the proc p locked and the scheduler * lock held. */ static void stop(struct proc *p) { PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag |= P_STOPPED_SIG; p->p_flag &= ~P_WAITED; wakeup(p->p_pptr); } /* * MPSAFE */ void thread_stopped(struct proc *p) { struct proc *p1 = curthread->td_proc; struct sigacts *ps; int n; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); n = p->p_suspcount; if (p == p1) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { mtx_unlock_spin(&sched_lock); stop(p); PROC_LOCK(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); psignal(p->p_pptr, SIGCHLD); } else mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p->p_pptr); mtx_lock_spin(&sched_lock); } } /* * Take the action for the specified signal * from the current set of pending signals. */ void postsig(sig) register int sig; { struct thread *td = curthread; register struct proc *p = td->td_proc; struct sigacts *ps; sig_t action; sigset_t returnmask; int code; KASSERT(sig != 0, ("postsig")); PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(td->td_siglist, sig); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, 0); #endif _STOPEVENT(p, S_SIG, sig); if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig), ("postsig action")); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. 
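The TDP_OLDMASK/td_oldsigmask handling described here is what makes the classic sigsuspend() idiom race-free: the caller blocks the signal, tests its flag, atomically installs a wider-open mask while waiting, and gets the pre-suspend mask back once the handler has run. A userland sketch of that idiom (SIGUSR1 is an arbitrary choice):

	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	static volatile sig_atomic_t got_usr1;

	static void
	handler(int sig)
	{
		(void)sig;
		got_usr1 = 1;
	}

	int
	main(void)
	{
		struct sigaction sa;
		sigset_t block, oldmask;

		sa.sa_handler = handler;
		sa.sa_flags = 0;
		sigemptyset(&sa.sa_mask);
		sigaction(SIGUSR1, &sa, NULL);

		/* Block SIGUSR1 so there is no window between the test and the wait. */
		sigemptyset(&block);
		sigaddset(&block, SIGUSR1);
		sigprocmask(SIG_BLOCK, &block, &oldmask);

		printf("pid %d waiting for SIGUSR1\n", (int)getpid());
		while (!got_usr1)
			sigsuspend(&oldmask);	/* always returns -1 with errno == EINTR */

		/* On return the pre-sigsuspend mask (SIGUSR1 blocked) is in effect
		   again, exactly the restoration postsig() arranges via td_oldsigmask. */
		sigprocmask(SIG_SETMASK, &oldmask, NULL);
		return (0);
	}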
*/ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]); if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(td->td_sigmask, sig); if (SIGISMEMBER(ps->ps_sigreset, sig)) { /* * See kern_sigaction() for origin of this code. */ SIGDELSET(ps->ps_sigcatch, sig); if (sig != SIGCONT && sigprop(sig) & SA_IGNORE) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } p->p_stats->p_ru.ru_nsignals++; if (p->p_sig != sig) { code = 0; } else { code = p->p_code; p->p_code = 0; p->p_sig = 0; } - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_signal_add(curthread, sig); else (*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code); } } /* * Kill the current process for stated reason. */ void killproc(p, why) struct proc *p; char *why; { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. * * MPSAFE */ void sigexit(td, sig) struct thread *td; int sig; { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; if (sigprop(sig) & SA_CORE) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too */ PROC_UNLOCK(p); if (!mtx_owned(&Giant)) mtx_lock(&Giant); if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else { PROC_UNLOCK(p); if (!mtx_owned(&Giant)) mtx_lock(&Giant); } exit1(td, W_EXITCODE(0, sig)); /* NOTREACHED */ } static char corefilename[MAXPATHLEN+1] = {"%N.core"}; SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, sizeof(corefilename), "process corefile name format string"); /* * expand_name(name, uid, pid) * Expand the name described in corefilename, using name, uid, and pid. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). 
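expand_name() below interprets the kern.corefile format string with %N (process name), %P (pid) and %U (uid). The template itself is an ordinary sysctl string, so it can be inspected from userland with sysctlbyname(3); a hedged sketch, with error handling kept minimal:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		char fmt[256];
		size_t len = sizeof(fmt);

		/* Fetch the current core-file name template, e.g. "%N.core". */
		if (sysctlbyname("kern.corefile", fmt, &len, NULL, 0) == -1) {
			perror("sysctlbyname");
			return (1);
		}
		printf("kern.corefile = %s\n", fmt);
		return (0);
	}

Passing a new string via the newp/newlen arguments (with sufficient privilege) changes the template, e.g. to route all dumps to a central directory such as "/cores/%U/%N-%P".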
*/ static char * expand_name(name, uid, pid) const char *name; uid_t uid; pid_t pid; { const char *format, *appendstr; char *temp; char buf[11]; /* Buffer for pid/uid -- max 4B */ size_t i, l, n; format = corefilename; temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO); if (temp == NULL) return (NULL); for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': appendstr = "%"; break; case 'N': /* process name */ appendstr = name; break; case 'P': /* process id */ sprintf(buf, "%u", pid); appendstr = buf; break; case 'U': /* user id */ sprintf(buf, "%u", uid); appendstr = buf; break; default: appendstr = ""; log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); } l = strlen(appendstr); if ((n + l) >= MAXPATHLEN) goto toolong; memcpy(temp + n, appendstr, l); n += l; break; default: temp[n++] = format[i]; } } if (format[i] != '\0') goto toolong; return (temp); toolong: log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n", (long)pid, name, (u_long)uid); free(temp, M_TEMP); return (NULL); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; register struct vnode *vp; register struct ucred *cred = td->td_ucred; struct flock lf; struct nameidata nd; struct vattr vattr; int error, error1, flags; struct mount *mp; char *name; /* name of corefile */ off_t limit; PROC_LOCK(p); _STOPEVENT(p, S_CORE, 0); if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = p->p_rlimit[RLIMIT_CORE].rlim_cur; if (limit == 0) { PROC_UNLOCK(p); return 0; } PROC_UNLOCK(p); restart: name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid); if (name == NULL) return (EINVAL); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */ flags = O_CREAT | FWRITE | O_NOFOLLOW; error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR); free(name, M_TEMP); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* Don't dump to non-regular files or files with links. 
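coredump() checks p_rlimit[RLIMIT_CORE] before creating anything: a limit of zero suppresses the dump entirely, while a non-zero limit truncates an oversized dump rather than suppressing it. From userland the same knob is the ordinary resource limit; a small sketch:

	#include <sys/types.h>
	#include <sys/time.h>
	#include <sys/resource.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct rlimit rl;

		getrlimit(RLIMIT_CORE, &rl);
		printf("core limit: cur=%jd max=%jd\n",
		    (intmax_t)rl.rlim_cur, (intmax_t)rl.rlim_max);

		/* A soft limit of 0 makes coredump() return before creating a file. */
		rl.rlim_cur = 0;
		if (setrlimit(RLIMIT_CORE, &rl) == -1)
			perror("setrlimit");
		return (0);
	}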
*/ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) { VOP_UNLOCK(vp, 0, td); error = EFAULT; goto out2; } VOP_UNLOCK(vp, 0, td); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; error = VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK); if (error) goto out2; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); if ((error = vn_close(vp, FWRITE, cred, td)) != 0) return (error); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VOP_LEASE(vp, td, cred, LEASE_WRITE); VOP_SETATTR(vp, &vattr, cred, td); VOP_UNLOCK(vp, 0, td); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); error = p->p_sysent->sv_coredump ? p->p_sysent->sv_coredump(td, vp, limit) : ENOSYS; lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); vn_finished_write(mp); out2: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; return (error); } /* * Nonexistent system call-- signal process (may want to handle it). * Flag error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* * MPSAFE */ /* ARGSUSED */ int nosys(td, args) struct thread *td; struct nosys_args *args; { struct proc *p = td->td_proc; PROC_LOCK(p); psignal(p, SIGSYS); PROC_UNLOCK(p); return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using * stored credentials rather than those of the current process. */ void pgsigio(sigiop, sig, checkctty) struct sigio **sigiop; int sig, checkctty; { struct sigio *sigio; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ PROC_LOCK(p); SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); PROC_UNLOCK(p); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; PROC_LOCK(p); SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); PROC_UNLOCK(p); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. 
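filt_sigattach()/filt_signal() above are the kernel half of EVFILT_SIGNAL: tdsignal()'s KNOTE(&p->p_klist, NOTE_SIGNAL | sig) bumps kn_data once per delivery attempt, and EV_CLEAR is forced so the count resets after each retrieval. A userland sketch of consuming the filter, following the documented kqueue interface; the double kill() simply shows the coalesced count in kev.data.

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <signal.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct kevent kev;
		int kq;

		/* Ignore the signal so the default action doesn't kill us; the
		   knote still fires on every delivery attempt. */
		signal(SIGUSR1, SIG_IGN);

		kq = kqueue();
		EV_SET(&kev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
		kevent(kq, &kev, 1, NULL, 0, NULL);	/* register the knote */

		kill(getpid(), SIGUSR1);
		kill(getpid(), SIGUSR1);

		/* Wait for the event; kev.data counts deliveries since the last check. */
		if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
			printf("signal %d delivered %d time(s)\n",
			    (int)kev.ident, (int)kev.data);
		return (0);
	}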
*/ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); ps->ps_refcnt = 1; mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt--; if (ps->ps_refcnt == 0) { mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } else mtx_unlock(&ps->ps_mtx); } struct sigacts * sigacts_hold(struct sigacts *ps) { mtx_lock(&ps->ps_mtx); ps->ps_refcnt++; mtx_unlock(&ps->ps_mtx); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { int shared; mtx_lock(&ps->ps_mtx); shared = ps->ps_refcnt > 1; mtx_unlock(&ps->ps_mtx); return (shared); } Index: head/sys/kern/kern_switch.c =================================================================== --- head/sys/kern/kern_switch.c (revision 116360) +++ head/sys/kern/kern_switch.c (revision 116361) @@ -1,723 +1,723 @@ /* * Copyright (c) 2001 Jake Burkholder * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*** Here is the logic.. If there are N processors, then there are at most N KSEs (kernel schedulable entities) working to process threads that belong to a KSEGOUP (kg). If there are X of these KSEs actually running at the moment in question, then there are at most M (N-X) of these KSEs on the run queue, as running KSEs are not on the queue. Runnable threads are queued off the KSEGROUP in priority order. If there are M or more threads runnable, the top M threads (by priority) are 'preassigned' to the M KSEs not running. The KSEs take their priority from those threads and are put on the run queue. The last thread that had a priority high enough to have a KSE associated with it, AND IS ON THE RUN QUEUE is pointed to by kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs assigned as all the available KSEs are activly running, or because there are no threads queued, that pointer is NULL. 
When a KSE is removed from the run queue to become runnable, we know it was associated with the highest priority thread in the queue (at the head of the queue). If it is also the last assigned we know M was 1 and must now be 0. Since the thread is no longer queued that pointer must be removed from it. Since we know there were no more KSEs available, (M was 1 and is now 0) and since we are not FREEING our KSE but using it, we know there are STILL no more KSEs available, we can prove that the next thread in the ksegrp list will not have a KSE to assign to it, so we can show that the pointer must be made 'invalid' (NULL). The pointer exists so that when a new thread is made runnable, it can have its priority compared with the last assigned thread to see if it should 'steal' its KSE or not.. i.e. is it 'earlier' on the list than that thread or later.. If it's earlier, then the KSE is removed from the last assigned (which is now not assigned a KSE) and reassigned to the new thread, which is placed earlier in the list. The pointer is then backed up to the previous thread (which may or may not be the new thread). When a thread sleeps or is removed, the KSE becomes available and if there are queued threads that are not assigned KSEs, the highest priority one of them is assigned the KSE, which is then placed back on the run queue at the approipriate place, and the kg->kg_last_assigned pointer is adjusted down to point to it. The following diagram shows 2 KSEs and 3 threads from a single process. RUNQ: --->KSE---KSE--... (KSEs queued at priorities from threads) \ \____ \ \ KSEGROUP---thread--thread--thread (queued in priority order) \ / \_______________/ (last_assigned) The result of this scheme is that the M available KSEs are always queued at the priorities they have inherrited from the M highest priority threads for that KSEGROUP. If this situation changes, the KSEs are reassigned to keep this true. ***/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #if defined(SMP) && defined(__i386__) #include #endif #include CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS); void panc(char *string1, char *string2); #if 0 static void runq_readjust(struct runq *rq, struct kse *ke); #endif /************************************************************************ * Functions that manipulate runnability from a thread perspective. * ************************************************************************/ /* * Select the KSE that will be run next. From that find the thread, and * remove it from the KSEGRP's run queue. If there is thread clustering, * this will be what does it. 
*/ struct thread * choosethread(void) { struct kse *ke; struct thread *td; struct ksegrp *kg; #if defined(SMP) && defined(__i386__) if (smp_active == 0 && PCPU_GET(cpuid) != 0) { /* Shutting down, run idlethread on AP's */ td = PCPU_GET(idlethread); ke = td->td_kse; CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); ke->ke_flags |= KEF_DIDRUN; TD_SET_RUNNING(td); return (td); } #endif retry: ke = sched_choose(); if (ke) { td = ke->ke_thread; KASSERT((td->td_kse == ke), ("kse/thread mismatch")); kg = ke->ke_ksegrp; - if (td->td_proc->p_flag & P_THREADED) { + if (td->td_proc->p_flag & P_SA) { if (kg->kg_last_assigned == td) { kg->kg_last_assigned = TAILQ_PREV(td, threadqueue, td_runq); } TAILQ_REMOVE(&kg->kg_runq, td, td_runq); } kg->kg_runnable--; CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d", td, td->td_priority); } else { /* Simulate runq_choose() having returned the idle thread */ td = PCPU_GET(idlethread); ke = td->td_kse; CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td); } ke->ke_flags |= KEF_DIDRUN; /* * If we are in panic, only allow system threads, * plus the one we are running in, to be run. */ if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 && (td->td_flags & TDF_INPANIC) == 0)) { /* note that it is no longer on the run queue */ TD_SET_CAN_RUN(td); goto retry; } TD_SET_RUNNING(td); return (td); } /* * Given a surplus KSE, either assign a new runable thread to it * (and put it in the run queue) or put it in the ksegrp's idle KSE list. * Assumes that the original thread is not runnable. */ void kse_reassign(struct kse *ke) { struct ksegrp *kg; struct thread *td; struct thread *original; mtx_assert(&sched_lock, MA_OWNED); original = ke->ke_thread; KASSERT(original == NULL || TD_IS_INHIBITED(original), ("reassigning KSE with runnable thread")); kg = ke->ke_ksegrp; if (original) original->td_kse = NULL; /* * Find the first unassigned thread */ if ((td = kg->kg_last_assigned) != NULL) td = TAILQ_NEXT(td, td_runq); else td = TAILQ_FIRST(&kg->kg_runq); /* * If we found one, assign it the kse, otherwise idle the kse. */ if (td) { kg->kg_last_assigned = td; td->td_kse = ke; ke->ke_thread = td; sched_add(ke); CTR2(KTR_RUNQ, "kse_reassign: ke%p -> td%p", ke, td); return; } ke->ke_state = KES_IDLE; ke->ke_thread = NULL; TAILQ_INSERT_TAIL(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses++; CTR1(KTR_RUNQ, "kse_reassign: ke%p on idle queue", ke); return; } #if 0 /* * Remove a thread from its KSEGRP's run queue. * This in turn may remove it from a KSE if it was already assigned * to one, possibly causing a new thread to be assigned to the KSE * and the KSE getting a new priority. */ static void remrunqueue(struct thread *td) { struct thread *td2, *td3; struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue")); kg = td->td_ksegrp; ke = td->td_kse; CTR1(KTR_RUNQ, "remrunqueue: td%p", td); kg->kg_runnable--; TD_SET_CAN_RUN(td); /* * If it is not a threaded process, take the shortcut. */ - if ((td->td_proc->p_flag & P_THREADED) == 0) { + if ((td->td_proc->p_flag & P_SA) == 0) { /* Bring its kse with it, leave the thread attached */ sched_rem(ke); ke->ke_state = KES_THREAD; return; } td3 = TAILQ_PREV(td, threadqueue, td_runq); TAILQ_REMOVE(&kg->kg_runq, td, td_runq); if (ke) { /* * This thread has been assigned to a KSE. * We need to dissociate it and try assign the * KSE to the next available thread. Then, we should * see if we need to move the KSE in the run queues. 
*/ sched_rem(ke); ke->ke_state = KES_THREAD; td2 = kg->kg_last_assigned; KASSERT((td2 != NULL), ("last assigned has wrong value")); if (td2 == td) kg->kg_last_assigned = td3; kse_reassign(ke); } } #endif /* * Change the priority of a thread that is on the run queue. */ void adjustrunqueue( struct thread *td, int newpri) { struct ksegrp *kg; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue")); ke = td->td_kse; CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td); /* * If it is not a threaded process, take the shortcut. */ - if ((td->td_proc->p_flag & P_THREADED) == 0) { + if ((td->td_proc->p_flag & P_SA) == 0) { /* We only care about the kse in the run queue. */ td->td_priority = newpri; if (ke->ke_rqindex != (newpri / RQ_PPQ)) { sched_rem(ke); sched_add(ke); } return; } /* It is a threaded process */ kg = td->td_ksegrp; kg->kg_runnable--; TD_SET_CAN_RUN(td); if (ke) { if (kg->kg_last_assigned == td) { kg->kg_last_assigned = TAILQ_PREV(td, threadqueue, td_runq); } sched_rem(ke); } TAILQ_REMOVE(&kg->kg_runq, td, td_runq); td->td_priority = newpri; setrunqueue(td); } void setrunqueue(struct thread *td) { struct kse *ke; struct ksegrp *kg; struct thread *td2; struct thread *tda; CTR1(KTR_RUNQ, "setrunqueue: td%p", td); mtx_assert(&sched_lock, MA_OWNED); KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), ("setrunqueue: bad thread state")); TD_SET_RUNQ(td); kg = td->td_ksegrp; kg->kg_runnable++; - if ((td->td_proc->p_flag & P_THREADED) == 0) { + if ((td->td_proc->p_flag & P_SA) == 0) { /* * Common path optimisation: Only one of everything * and the KSE is always already attached. * Totally ignore the ksegrp run queue. */ sched_add(td->td_kse); return; } tda = kg->kg_last_assigned; if ((ke = td->td_kse) == NULL) { if (kg->kg_idle_kses) { /* * There is a free one so it's ours for the asking.. */ ke = TAILQ_FIRST(&kg->kg_iq); TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); ke->ke_state = KES_THREAD; kg->kg_idle_kses--; } else if (tda && (tda->td_priority > td->td_priority)) { /* * None free, but there is one we can commandeer. */ ke = tda->td_kse; tda->td_kse = NULL; ke->ke_thread = NULL; tda = kg->kg_last_assigned = TAILQ_PREV(tda, threadqueue, td_runq); sched_rem(ke); } } else { /* * Temporarily disassociate so it looks like the other cases. */ ke->ke_thread = NULL; td->td_kse = NULL; } /* * Add the thread to the ksegrp's run queue at * the appropriate place. */ TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { if (td2->td_priority > td->td_priority) { TAILQ_INSERT_BEFORE(td2, td, td_runq); break; } } if (td2 == NULL) { /* We ran off the end of the TAILQ or it was empty. */ TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq); } /* * If we have a ke to use, then put it on the run queue and * If needed, readjust the last_assigned pointer. */ if (ke) { if (tda == NULL) { /* * No pre-existing last assigned so whoever is first * gets the KSE we brought in.. (maybe us) */ td2 = TAILQ_FIRST(&kg->kg_runq); KASSERT((td2->td_kse == NULL), ("unexpected ke present")); td2->td_kse = ke; ke->ke_thread = td2; kg->kg_last_assigned = td2; } else if (tda->td_priority > td->td_priority) { /* * It's ours, grab it, but last_assigned is past us * so don't change it. */ td->td_kse = ke; ke->ke_thread = td; } else { /* * We are past last_assigned, so * put the new kse on whatever is next, * which may or may not be us. 
*/ td2 = TAILQ_NEXT(tda, td_runq); kg->kg_last_assigned = td2; td2->td_kse = ke; ke->ke_thread = td2; } sched_add(ke); } } /************************************************************************ * Critical section marker functions * ************************************************************************/ /* Critical sections that prevent preemption. */ void critical_enter(void) { struct thread *td; td = curthread; if (td->td_critnest == 0) cpu_critical_enter(); td->td_critnest++; } void critical_exit(void) { struct thread *td; td = curthread; if (td->td_critnest == 1) { td->td_critnest = 0; cpu_critical_exit(); } else { td->td_critnest--; } } /************************************************************************ * SYSTEM RUN QUEUE manipulations and tests * ************************************************************************/ /* * Initialize a run structure. */ void runq_init(struct runq *rq) { int i; bzero(rq, sizeof *rq); for (i = 0; i < RQ_NQS; i++) TAILQ_INIT(&rq->rq_queues[i]); } /* * Clear the status bit of the queue corresponding to priority level pri, * indicating that it is empty. */ static __inline void runq_clrbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* * Find the index of the first non-empty run queue. This is done by * scanning the status bits, a set bit indicates a non-empty queue. */ static __inline int runq_findbit(struct runq *rq) { struct rqbits *rqb; int pri; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW); CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", rqb->rqb_bits[i], i, pri); return (pri); } return (-1); } /* * Set the status bit of the queue corresponding to priority level pri, * indicating that it is non-empty. */ static __inline void runq_setbit(struct runq *rq, int pri) { struct rqbits *rqb; rqb = &rq->rq_status; CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", rqb->rqb_bits[RQB_WORD(pri)], rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), RQB_BIT(pri), RQB_WORD(pri)); rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); } /* * Add the KSE to the queue specified by its priority, and set the * corresponding status bit. */ void runq_add(struct runq *rq, struct kse *ke) { struct rqhead *rqh; int pri; pri = ke->ke_thread->td_priority / RQ_PPQ; ke->ke_rqindex = pri; runq_setbit(rq, pri); rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", ke->ke_proc, ke->ke_thread->td_priority, pri, rqh); TAILQ_INSERT_TAIL(rqh, ke, ke_procq); } /* * Return true if there are runnable processes of any priority on the run * queue, false otherwise. Has no side effects, does not modify the run * queue structure. */ int runq_check(struct runq *rq) { struct rqbits *rqb; int i; rqb = &rq->rq_status; for (i = 0; i < RQB_LEN; i++) if (rqb->rqb_bits[i]) { CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", rqb->rqb_bits[i], i); return (1); } CTR0(KTR_RUNQ, "runq_check: empty"); return (0); } /* * Find the highest priority process on the run queue. 
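runq_setbit()/runq_clrbit()/runq_findbit() above keep one status bit per priority queue so the scheduler can locate the highest-priority non-empty queue with a find-first-set scan instead of walking all RQ_NQS queues. A stripped-down, single-word sketch of the same idea; the real code spreads the bits across RQB_LEN words and hangs KSEs off TAILQs, and every name here is invented for illustration.

	#include <stdio.h>
	#include <strings.h>		/* ffs() */

	#define NQUEUES		32	/* one bit per priority queue in this sketch */

	struct miniq {
		unsigned int status;	/* bit i set => queue i is non-empty */
		int len[NQUEUES];	/* element counts stand in for the TAILQs */
	};

	static void
	miniq_add(struct miniq *q, int pri)
	{
		q->len[pri]++;
		q->status |= 1u << pri;			/* runq_setbit() */
	}

	static void
	miniq_remove(struct miniq *q, int pri)
	{
		if (--q->len[pri] == 0)
			q->status &= ~(1u << pri);	/* runq_clrbit() when empty */
	}

	/* runq_findbit(): lowest-numbered (best-priority) non-empty queue, or -1. */
	static int
	miniq_findbit(struct miniq *q)
	{
		return (q->status ? ffs(q->status) - 1 : -1);
	}

	int
	main(void)
	{
		struct miniq q = { 0, { 0 } };

		miniq_add(&q, 12);
		miniq_add(&q, 4);
		printf("best queue: %d\n", miniq_findbit(&q));	/* 4 */
		miniq_remove(&q, 4);
		printf("best queue: %d\n", miniq_findbit(&q));	/* 12 */
		return (0);
	}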
*/ struct kse * runq_choose(struct runq *rq) { struct rqhead *rqh; struct kse *ke; int pri; mtx_assert(&sched_lock, MA_OWNED); while ((pri = runq_findbit(rq)) != -1) { rqh = &rq->rq_queues[pri]; ke = TAILQ_FIRST(rqh); KASSERT(ke != NULL, ("runq_choose: no proc on busy queue")); CTR3(KTR_RUNQ, "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh); return (ke); } CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); return (NULL); } /* * Remove the KSE from the queue specified by its priority, and clear the * corresponding status bit if the queue becomes empty. * Caller must set ke->ke_state afterwards. */ void runq_remove(struct runq *rq, struct kse *ke) { struct rqhead *rqh; int pri; KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); pri = ke->ke_rqindex; rqh = &rq->rq_queues[pri]; CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p", ke, ke->ke_thread->td_priority, pri, rqh); KASSERT(ke != NULL, ("runq_remove: no proc on busy queue")); TAILQ_REMOVE(rqh, ke, ke_procq); if (TAILQ_EMPTY(rqh)) { CTR0(KTR_RUNQ, "runq_remove: empty"); runq_clrbit(rq, pri); } } #if 0 void panc(char *string1, char *string2) { printf("%s", string1); Debugger(string2); } void thread_sanity_check(struct thread *td, char *string) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct thread *td2 = NULL; unsigned int prevpri; int saw_lastassigned = 0; int unassigned = 0; int assigned = 0; p = td->td_proc; kg = td->td_ksegrp; ke = td->td_kse; if (ke) { if (p != ke->ke_proc) { panc(string, "wrong proc"); } if (ke->ke_thread != td) { panc(string, "wrong thread"); } } - if ((p->p_flag & P_THREADED) == 0) { + if ((p->p_flag & P_SA) == 0) { if (ke == NULL) { panc(string, "non KSE thread lost kse"); } } else { prevpri = 0; saw_lastassigned = 0; unassigned = 0; assigned = 0; TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) { if (td2->td_priority < prevpri) { panc(string, "thread runqueue unosorted"); } if ((td2->td_state == TDS_RUNQ) && td2->td_kse && (td2->td_kse->ke_state != KES_ONRUNQ)) { panc(string, "KSE wrong state"); } prevpri = td2->td_priority; if (td2->td_kse) { assigned++; if (unassigned) { panc(string, "unassigned before assigned"); } if (kg->kg_last_assigned == NULL) { panc(string, "lastassigned corrupt"); } if (saw_lastassigned) { panc(string, "last assigned not last"); } if (td2->td_kse->ke_thread != td2) { panc(string, "mismatched kse/thread"); } } else { unassigned++; } if (td2 == kg->kg_last_assigned) { saw_lastassigned = 1; if (td2->td_kse == NULL) { panc(string, "last assigned not assigned"); } } } if (kg->kg_last_assigned && (saw_lastassigned == 0)) { panc(string, "where on earth does lastassigned point?"); } #if 0 FOREACH_THREAD_IN_GROUP(kg, td2) { if (((td2->td_flags & TDF_UNBOUND) == 0) && (TD_ON_RUNQ(td2))) { assigned++; if (td2->td_kse == NULL) { panc(string, "BOUND thread with no KSE"); } } } #endif #if 0 if ((unassigned + assigned) != kg->kg_runnable) { panc(string, "wrong number in runnable"); } #endif } if (assigned == 12345) { printf("%p %p %p %p %p %d, %d", td, td2, ke, kg, p, assigned, saw_lastassigned); } } #endif Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 116360) +++ head/sys/kern/kern_synch.c (revision 116361) @@ -1,695 +1,695 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #ifdef __i386__ #include "opt_swtch.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #ifdef SWTCH_OPTIM_STATS #include #endif static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) int hogticks; int lbolt; static struct callout loadav_callout; static struct callout lbolt_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); static void endtsleep(void *); static void loadav(void *arg); static void lboltcb(void *arg); /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. 
*/ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; hogticks = (hz / 10) * 2; /* Default only. */ for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. */ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct thread *td = curthread; struct proc *p = td->td_proc; int sig, catch = priority & PCATCH; int rval = 0; WITNESS_SAVE_DECL(mtx); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif /* XXX: mtx == NULL ?? */ WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mtx->mtx_object, "Sleeping on \"%s\"", wmesg); KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL, ("sleeping without a mutex")); /* * If we are capable of async syscalls and there isn't already * another one ready to return, start a new thread * and queue it as ready to run. Note that there is danger here * because we need to make sure that we don't sleep allocating * the thread (recursion here might be bad). */ mtx_lock_spin(&sched_lock); - if (p->p_flag & P_THREADED || p->p_numthreads > 1) { + if (p->p_flag & P_SA || p->p_numthreads > 1) { /* * Just don't bother if we are exiting * and not the exiting thread or thread was marked as * interrupted. */ if (catch && (((p->p_flag & P_WEXIT) && (p->p_singlethread != td)) || (td->td_flags & TDF_INTERRUPT))) { td->td_flags &= ~TDF_INTERRUPT; mtx_unlock_spin(&sched_lock); return (EINTR); } } if (cold ) { /* * During autoconfiguration, just give interrupts * a chance, then just return. * Don't run any other procs or panic below, * in case this is the idle process and already asleep. */ if (mtx != NULL && priority & PDROP) mtx_unlock(mtx); mtx_unlock_spin(&sched_lock); return (0); } DROP_GIANT(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->mtx_object, mtx); mtx_unlock(mtx); if (priority & PDROP) mtx = NULL; } KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); CTR5(KTR_PROC, "msleep: thread %p (pid %d, %s) on %s (%p)", td, p->p_pid, p->p_comm, wmesg, ident); td->td_wchan = ident; td->td_wmesg = wmesg; TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], td, td_slpq); TD_SET_ON_SLEEPQ(td); if (timo) callout_reset(&td->td_slpcallout, timo, endtsleep, td); /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, and * a wakeup or a SIGCONT (or both) could occur while we were stopped. * without resuming us, thus we must be ready for sleep * when cursig is called. If the wakeup happens while we're * stopped, td->td_wchan will be 0 upon return from cursig. 
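msleep() releases the supplied mutex while the thread sleeps, reacquires it on wakeup unless PDROP was given, and reports a timeout as EWOULDBLOCK. The nearest userland analogue is a condition-variable wait with a deadline, where the mutex is likewise dropped during the wait and held again on return. A sketch of that pattern, assuming pthreads; the one-second deadline is arbitrary and ETIMEDOUT plays the role of EWOULDBLOCK.

	#include <errno.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <time.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
	static int ready;

	int
	main(void)
	{
		struct timespec ts;
		int error = 0;

		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;			/* roughly "timo = hz" */

		pthread_mutex_lock(&lock);
		while (!ready && error != ETIMEDOUT) {
			/* The mutex is released while waiting and held again on
			   return, just as msleep() drops and reacquires mtx. */
			error = pthread_cond_timedwait(&cond, &lock, &ts);
		}
		pthread_mutex_unlock(&lock);

		printf(error == ETIMEDOUT ? "timed out\n" : "woken\n");
		return (0);
	}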
*/ if (catch) { CTR3(KTR_PROC, "msleep caught: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); td->td_flags |= TDF_SINTR; mtx_unlock_spin(&sched_lock); PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); sig = cursig(td); mtx_unlock(&p->p_sigacts->ps_mtx); if (sig == 0 && thread_suspend_check(1)) sig = SIGSTOP; mtx_lock_spin(&sched_lock); PROC_UNLOCK(p); if (sig != 0) { if (TD_ON_SLEEPQ(td)) unsleep(td); } else if (!TD_ON_SLEEPQ(td)) catch = 0; } else sig = 0; /* * Let the scheduler know we're about to voluntarily go to sleep. */ sched_sleep(td, priority & PRIMASK); if (TD_ON_SLEEPQ(td)) { p->p_stats->p_ru.ru_nvcsw++; TD_SET_SLEEPING(td); mi_switch(); } /* * We're awake from voluntary sleep. */ CTR3(KTR_PROC, "msleep resume: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING")); td->td_flags &= ~TDF_SINTR; if (td->td_flags & TDF_TIMEOUT) { td->td_flags &= ~TDF_TIMEOUT; if (sig == 0) rval = EWOULDBLOCK; } else if (td->td_flags & TDF_TIMOFAIL) { td->td_flags &= ~TDF_TIMOFAIL; } else if (timo && callout_stop(&td->td_slpcallout) == 0) { /* * This isn't supposed to be pretty. If we are here, then * the endtsleep() callout is currently executing on another * CPU and is either spinning on the sched_lock or will be * soon. If we don't synchronize here, there is a chance * that this process may msleep() again before the callout * has a chance to run and the callout may end up waking up * the wrong msleep(). Yuck. */ TD_SET_SLEEPING(td); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); td->td_flags &= ~TDF_TIMOFAIL; } if ((td->td_flags & TDF_INTERRUPT) && (priority & PCATCH) && (rval == 0)) { td->td_flags &= ~TDF_INTERRUPT; rval = EINTR; } mtx_unlock_spin(&sched_lock); if (rval == 0 && catch) { PROC_LOCK(p); /* XXX: shouldn't we always be calling cursig() */ mtx_lock(&p->p_sigacts->ps_mtx); if (sig != 0 || (sig = cursig(td))) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(&mtx->mtx_object, mtx); } return (rval); } /* * Implement timeout for msleep() * * If process hasn't been awakened (wchan non-zero), * set timeout flag and undo the sleep. If proc * is stopped, just unsleep so it will remain stopped. * MP-safe, called without the Giant mutex. */ static void endtsleep(arg) void *arg; { register struct thread *td = arg; CTR3(KTR_PROC, "endtsleep: thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); /* * This is the other half of the synchronization with msleep() * described above. If the TDS_TIMEOUT flag is set, we lost the * race and just need to put the process back on the runqueue. */ if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_flags |= TDF_TIMEOUT; td->td_wmesg = NULL; } else { td->td_flags |= TDF_TIMOFAIL; } TD_CLR_SLEEPING(td); setrunnable(td); mtx_unlock_spin(&sched_lock); } /* * Abort a thread, as if an interrupt had occured. Only abort * interruptable waits (unfortunatly it isn't only safe to abort others). * This is about identical to cv_abort(). * Think about merging them? * Also, whatever the signal code does... */ void abortsleep(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * If the TDF_TIMEOUT flag is set, just leave. A * timeout is scheduled anyhow. 
*/ if ((td->td_flags & (TDF_TIMEOUT | TDF_SINTR)) == TDF_SINTR) { if (TD_ON_SLEEPQ(td)) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); } } } /* * Remove a process from its wait queue */ void unsleep(struct thread *td) { mtx_lock_spin(&sched_lock); if (TD_ON_SLEEPQ(td)) { TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_slpq); TD_CLR_ON_SLEEPQ(td); td->td_wmesg = NULL; } mtx_unlock_spin(&sched_lock); } /* * Make all processes sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; struct thread *ntd; struct proc *p; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); goto restart; } } mtx_unlock_spin(&sched_lock); } /* * Make a process sleeping on the specified identifier runnable. * May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct slpquehead *qp; register struct thread *td; register struct proc *p; struct thread *ntd; mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) { ntd = TAILQ_NEXT(td, td_slpq); if (td->td_wchan == ident) { unsleep(td); TD_CLR_SLEEPING(td); setrunnable(td); p = td->td_proc; CTR3(KTR_PROC,"wakeup1: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); break; } } mtx_unlock_spin(&sched_lock); } /* * The machine independent parts of mi_switch(). */ void mi_switch(void) { struct bintime new_switchtime; struct thread *td; #if !defined(__alpha__) && !defined(__powerpc__) struct thread *newtd; #endif struct proc *p; u_int sched_nest; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1, ("mi_switch: switch in a critical section")); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ binuptime(&new_switchtime); bintime_add(&p->p_runtime, &new_switchtime); bintime_sub(&p->p_runtime, PCPU_PTR(switchtime)); #ifdef DDB /* * Don't perform context switches from the debugger. */ if (db_active) { mtx_unlock_spin(&sched_lock); db_print_backtrace(); db_error("Context switches not allowed in the debugger."); } #endif /* * Check if the process exceeds its cpu resource allocation. If * over max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_runtime.sec > p->p_cpulimit) { p->p_sflag |= PS_XCPU; td->td_flags |= TDF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR3(KTR_PROC, "mi_switch: old thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); sched_nest = sched_lock.mtx_recurse; - if (td->td_proc->p_flag & P_THREADED) + if (td->td_proc->p_flag & P_SA) thread_switchout(td); sched_switchout(td); #if !defined(__alpha__) && !defined(__powerpc__) newtd = choosethread(); if (td != newtd) cpu_switch(td, newtd); /* SHAZAM!! 
*/ #ifdef SWTCH_OPTIM_STATS else stupid_switch++; #endif #else cpu_switch(); /* SHAZAM!!*/ #endif sched_lock.mtx_recurse = sched_nest; sched_lock.mtx_lock = (uintptr_t)td; sched_switchin(td); /* * Start setting up stats etc. for the incoming thread. * Similar code in fork_exit() is returned to by cpu_switch() * in the case of a new thread/process. */ CTR3(KTR_PROC, "mi_switch: new thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); if (PCPU_GET(switchtime.sec) == 0) binuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Call the switchin function while still holding the scheduler lock * (used by the idlezero code and the general page-zeroing code) */ if (td->td_switchin) td->td_switchin(); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. */ void setrunnable(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; /* XXX: intentional fall-through ? */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; struct proc *p; struct thread *td; avg = &averunnable; sx_slock(&allproc_lock); nrun = 0; FOREACH_PROC_IN_SYSTEM(p) { FOREACH_THREAD_IN_PROC(p, td) { switch (td->td_state) { case TDS_RUNQ: case TDS_RUNNING: if ((p->p_flag & P_NOLOAD) != 0) goto nextproc; nrun++; /* XXXKSE */ default: break; } nextproc: continue; } } sx_sunlock(&allproc_lock); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } static void lboltcb(void *arg) { wakeup(&lbolt); callout_reset(&lbolt_callout, hz, lboltcb, NULL); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&loadav_callout, 0); callout_init(&lbolt_callout, 1); /* Kick off timeout driven events by calling first time. 
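 * Both callbacks re-arm themselves: loadav() reschedules itself every
 * 4-6 seconds (randomized, see above) and lboltcb() every hz ticks, so one
 * initial call here is enough.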
*/ loadav(NULL); lboltcb(NULL); } /* * General purpose yield system call */ int yield(struct thread *td, struct yield_args *uap) { struct ksegrp *kg = td->td_ksegrp; mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); kg->kg_proc->p_stats->p_ru.ru_nvcsw++; sched_prio(td, PRI_MAX_TIMESHARE); mi_switch(); mtx_unlock_spin(&sched_lock); td->td_retval[0] = 0; return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 116360) +++ head/sys/kern/kern_thread.c (revision 116361) @@ -1,2022 +1,2022 @@ /* * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * KSEGRP related storage. 
*/ static uma_zone_t ksegrp_zone; static uma_zone_t kse_zone; static uma_zone_t thread_zone; static uma_zone_t upcall_zone; /* DEBUG ONLY */ SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); static int thread_debug = 0; SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW, &thread_debug, 0, "thread debug"); static int max_threads_per_proc = 150; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); static int max_groups_per_proc = 50; SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW, &max_groups_per_proc, 0, "Limit on thread groups per proc"); static int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, ""); static int virtual_cpu; #define RANGEOF(type, start, end) (offsetof(type, end) - offsetof(type, start)) TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); TAILQ_HEAD(, kse) zombie_kses = TAILQ_HEAD_INITIALIZER(zombie_kses); TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps); TAILQ_HEAD(, kse_upcall) zombie_upcalls = TAILQ_HEAD_INITIALIZER(zombie_upcalls); struct mtx kse_zombie_lock; MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); static void kse_purge(struct proc *p, struct thread *td); static void kse_purge_group(struct thread *td); static int thread_update_usr_ticks(struct thread *td, int user); static void thread_alloc_spare(struct thread *td, struct thread *spare); static int sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS) { int error, new_val; int def_val; #ifdef SMP def_val = mp_ncpus; #else def_val = 1; #endif if (virtual_cpu == 0) new_val = def_val; else new_val = virtual_cpu; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < 0) return (EINVAL); virtual_cpu = new_val; return (0); } /* DEBUG ONLY */ SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I", "debug virtual cpus"); /* * Prepare a thread for use. */ static void thread_ctor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. */ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif } /* * Initialize type-stable parts of a thread (when newly created). */ static void thread_init(void *mem, int size) { struct thread *td; td = (struct thread *)mem; mtx_lock(&Giant); vm_thread_new(td, 0); mtx_unlock(&Giant); cpu_thread_setup(td); td->td_sched = (struct td_sched *)&td[1]; } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; vm_thread_dispose(td); } /* * Initialize type-stable parts of a kse (when newly created). 
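 * As with thread_init() above, this runs only when UMA first creates the
 * item, not on every allocation from the zone.  The scheduler-private area
 * lives in the same allocation, directly after the structure (&ke[1]); the
 * extra space is sized via sched_sizeof_kse() in threadinit() below.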
*/ static void kse_init(void *mem, int size) { struct kse *ke; ke = (struct kse *)mem; ke->ke_sched = (struct ke_sched *)&ke[1]; } /* * Initialize type-stable parts of a ksegrp (when newly created). */ static void ksegrp_init(void *mem, int size) { struct ksegrp *kg; kg = (struct ksegrp *)mem; kg->kg_sched = (struct kg_sched *)&kg[1]; } /* * KSE is linked into kse group. */ void kse_link(struct kse *ke, struct ksegrp *kg) { struct proc *p = kg->kg_proc; TAILQ_INSERT_HEAD(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses++; ke->ke_state = KES_UNQUEUED; ke->ke_proc = p; ke->ke_ksegrp = kg; ke->ke_thread = NULL; ke->ke_oncpu = NOCPU; ke->ke_flags = 0; } void kse_unlink(struct kse *ke) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); if (ke->ke_state == KES_IDLE) { TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; } if (--kg->kg_kses == 0) ksegrp_unlink(kg); /* * Aggregate stats from the KSE */ kse_stash(ke); } void ksegrp_link(struct ksegrp *kg, struct proc *p) { TAILQ_INIT(&kg->kg_threads); TAILQ_INIT(&kg->kg_runq); /* links with td_runq */ TAILQ_INIT(&kg->kg_slpq); /* links with td_runq */ TAILQ_INIT(&kg->kg_kseq); /* all kses in ksegrp */ TAILQ_INIT(&kg->kg_iq); /* all idle kses in ksegrp */ TAILQ_INIT(&kg->kg_upcalls); /* all upcall structure in ksegrp */ kg->kg_proc = p; /* * the following counters are in the -zero- section * and may not need clearing */ kg->kg_numthreads = 0; kg->kg_runnable = 0; kg->kg_kses = 0; kg->kg_runq_kses = 0; /* XXXKSE change name */ kg->kg_idle_kses = 0; kg->kg_numupcalls = 0; /* link it in now that it's consistent */ p->p_numksegrps++; TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp); } void ksegrp_unlink(struct ksegrp *kg) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads")); KASSERT((kg->kg_kses == 0), ("ksegrp_unlink: residual kses")); KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls")); p = kg->kg_proc; TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * Aggregate stats from the KSE */ ksegrp_stash(kg); } struct kse_upcall * upcall_alloc(void) { struct kse_upcall *ku; ku = uma_zalloc(upcall_zone, M_WAITOK); bzero(ku, sizeof(*ku)); return (ku); } void upcall_free(struct kse_upcall *ku) { uma_zfree(upcall_zone, ku); } void upcall_link(struct kse_upcall *ku, struct ksegrp *kg) { mtx_assert(&sched_lock, MA_OWNED); TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link); ku->ku_ksegrp = kg; kg->kg_numupcalls++; } void upcall_unlink(struct kse_upcall *ku) { struct ksegrp *kg = ku->ku_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__)); TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link); kg->kg_numupcalls--; upcall_stash(ku); } void upcall_remove(struct thread *td) { if (td->td_upcall) { td->td_upcall->ku_owner = NULL; upcall_unlink(td->td_upcall); td->td_upcall = 0; } } /* * For a newly created process, * link up all the structures and its initial threads etc. 
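 * The proc's ksegrp, thread and suspended-thread lists start out empty;
 * ksegrp_link(), kse_link() and thread_link() then chain the initial ksegrp
 * into the proc, the initial kse into that ksegrp, and the initial thread
 * into the same ksegrp.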
*/ void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td) { TAILQ_INIT(&p->p_ksegrps); /* all ksegrps in proc */ TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_suspended); /* Threads suspended */ p->p_numksegrps = 0; p->p_numthreads = 0; ksegrp_link(kg, p); kse_link(ke, kg); thread_link(td, kg); } /* struct kse_thr_interrupt_args { struct kse_thr_mailbox * tmbx; }; */ int kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap) { struct proc *p; struct thread *td2; p = td->td_proc; - if (!(p->p_flag & P_THREADED) || (uap->tmbx == NULL)) + if (!(p->p_flag & P_SA) || (uap->tmbx == NULL)) return (EINVAL); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td2) { if (td2->td_mailbox == uap->tmbx) { td2->td_flags |= TDF_INTERRUPT; if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } mtx_unlock_spin(&sched_lock); return (0); } } mtx_unlock_spin(&sched_lock); return (ESRCH); } /* struct kse_exit_args { register_t dummy; }; */ int kse_exit(struct thread *td, struct kse_exit_args *uap) { struct proc *p; struct ksegrp *kg; struct kse *ke; struct kse_upcall *ku, *ku2; int error, count; p = td->td_proc; if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) return (EINVAL); kg = td->td_ksegrp; count = 0; PROC_LOCK(p); mtx_lock_spin(&sched_lock); FOREACH_UPCALL_IN_GROUP(kg, ku2) { if (ku2->ku_flags & KUF_EXITING) count++; } if ((kg->kg_numupcalls - count) == 1 && (kg->kg_numthreads > 1)) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (EDEADLK); } ku->ku_flags |= KUF_EXITING; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); error = suword(&ku->ku_mailbox->km_flags, ku->ku_mflags|KMF_DONE); PROC_LOCK(p); if (error) psignal(p, SIGSEGV); mtx_lock_spin(&sched_lock); upcall_remove(td); ke = td->td_kse; if (p->p_numthreads == 1) { kse_purge(p, td); - p->p_flag &= ~P_THREADED; + p->p_flag &= ~P_SA; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } else { if (kg->kg_numthreads == 1) { /* Shutdown a group */ kse_purge_group(td); ke->ke_flags |= KEF_EXIT; } thread_stopped(p); thread_exit(); /* NOTREACHED */ } return (0); } /* * Either becomes an upcall or waits for an awakening event and * then becomes an upcall. Only error cases return. */ /* struct kse_release_args { struct timespec *timeout; }; */ int kse_release(struct thread *td, struct kse_release_args *uap) { struct proc *p; struct ksegrp *kg; struct timespec ts, ts2, ts3, timeout; struct timeval tv; int error; p = td->td_proc; kg = td->td_ksegrp; if (td->td_upcall == NULL || TD_CAN_UNBIND(td)) return (EINVAL); if (uap->timeout != NULL) { if ((error = copyin(uap->timeout, &timeout, sizeof(timeout)))) return (error); getnanouptime(&ts); timespecadd(&ts, &timeout); TIMESPEC_TO_TIMEVAL(&tv, &timeout); } mtx_lock_spin(&sched_lock); /* Change OURSELF to become an upcall. */ td->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (p->p_sflag & PS_NEEDSIGCHK) td->td_flags |= TDF_ASTPENDING; #endif mtx_unlock_spin(&sched_lock); PROC_LOCK(p); while ((td->td_upcall->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_completed == NULL)) { kg->kg_upsleeps++; error = msleep(&kg->kg_completed, &p->p_mtx, PPAUSE|PCATCH, "kse_rel", (uap->timeout ? 
tvtohz(&tv) : 0)); kg->kg_upsleeps--; PROC_UNLOCK(p); if (uap->timeout == NULL || error != EWOULDBLOCK) return (0); getnanouptime(&ts2); if (timespeccmp(&ts2, &ts, >=)) return (0); ts3 = ts; timespecsub(&ts3, &ts2); TIMESPEC_TO_TIMEVAL(&tv, &ts3); PROC_LOCK(p); } PROC_UNLOCK(p); return (0); } /* struct kse_wakeup_args { struct kse_mailbox *mbx; }; */ int kse_wakeup(struct thread *td, struct kse_wakeup_args *uap) { struct proc *p; struct ksegrp *kg; struct kse_upcall *ku; struct thread *td2; p = td->td_proc; td2 = NULL; ku = NULL; /* KSE-enabled processes only, please. */ - if (!(p->p_flag & P_THREADED)) + if (!(p->p_flag & P_SA)) return (EINVAL); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (uap->mbx) { FOREACH_KSEGRP_IN_PROC(p, kg) { FOREACH_UPCALL_IN_GROUP(kg, ku) { if (ku->ku_mailbox == uap->mbx) break; } if (ku) break; } } else { kg = td->td_ksegrp; if (kg->kg_upsleeps) { wakeup_one(&kg->kg_completed); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } ku = TAILQ_FIRST(&kg->kg_upcalls); } if (ku) { if ((td2 = ku->ku_owner) == NULL) { panic("%s: no owner", __func__); } else if (TD_ON_SLEEPQ(td2) && (td2->td_wchan == &kg->kg_completed)) { abortsleep(td2); } else { ku->ku_flags |= KUF_DOUPCALL; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (0); } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); return (ESRCH); } /* * No new KSEG: first call: use current KSE, don't schedule an upcall * All other situations, do allocate max new KSEs and schedule an upcall. */ /* struct kse_create_args { struct kse_mailbox *mbx; int newgroup; }; */ int kse_create(struct thread *td, struct kse_create_args *uap) { struct kse *newke; struct ksegrp *newkg; struct ksegrp *kg; struct proc *p; struct kse_mailbox mbx; struct kse_upcall *newku; int err, ncpus; p = td->td_proc; if ((err = copyin(uap->mbx, &mbx, sizeof(mbx)))) return (err); /* Too bad, why hasn't kernel always a cpu counter !? */ #ifdef SMP ncpus = mp_ncpus; #else ncpus = 1; #endif if (thread_debug && virtual_cpu != 0) ncpus = virtual_cpu; /* Easier to just set it than to test and set */ PROC_LOCK(p); - p->p_flag |= P_THREADED; + p->p_flag |= P_SA; PROC_UNLOCK(p); kg = td->td_ksegrp; if (uap->newgroup) { /* Have race condition but it is cheap */ if (p->p_numksegrps >= max_groups_per_proc) return (EPROCLIM); /* * If we want a new KSEGRP it doesn't matter whether * we have already fired up KSE mode before or not. * We put the process in KSE mode and create a new KSEGRP. */ newkg = ksegrp_alloc(); bzero(&newkg->kg_startzero, RANGEOF(struct ksegrp, kg_startzero, kg_endzero)); bcopy(&kg->kg_startcopy, &newkg->kg_startcopy, RANGEOF(struct ksegrp, kg_startcopy, kg_endcopy)); mtx_lock_spin(&sched_lock); if (p->p_numksegrps >= max_groups_per_proc) { mtx_unlock_spin(&sched_lock); ksegrp_free(newkg); return (EPROCLIM); } ksegrp_link(newkg, p); mtx_unlock_spin(&sched_lock); } else { newkg = kg; } /* * Creating upcalls more than number of physical cpu does * not help performance. */ if (newkg->kg_numupcalls >= ncpus) return (EPROCLIM); if (newkg->kg_numupcalls == 0) { /* * Initialize KSE group, optimized for MP. * Create KSEs as many as physical cpus, this increases * concurrent even if userland is not MP safe and can only run * on single CPU (for early version of libpthread, it is true). * In ideal world, every physical cpu should execute a thread. * If there is enough KSEs, threads in kernel can be * executed parallel on different cpus with full speed, * Concurrent in kernel shouldn't be restricted by number of * upcalls userland provides. 
* Adding more upcall structures only increases concurrent * in userland. * Highest performance configuration is: * N kses = N upcalls = N phyiscal cpus */ while (newkg->kg_kses < ncpus) { newke = kse_alloc(); bzero(&newke->ke_startzero, RANGEOF(struct kse, ke_startzero, ke_endzero)); #if 0 mtx_lock_spin(&sched_lock); bcopy(&ke->ke_startcopy, &newke->ke_startcopy, RANGEOF(struct kse, ke_startcopy, ke_endcopy)); mtx_unlock_spin(&sched_lock); #endif mtx_lock_spin(&sched_lock); kse_link(newke, newkg); /* Add engine */ kse_reassign(newke); mtx_unlock_spin(&sched_lock); } } newku = upcall_alloc(); newku->ku_mailbox = uap->mbx; newku->ku_func = mbx.km_func; bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t)); /* For the first call this may not have been set */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (newkg->kg_numupcalls >= ncpus) { mtx_unlock_spin(&sched_lock); upcall_free(newku); return (EPROCLIM); } upcall_link(newku, newkg); if (mbx.km_quantum) newkg->kg_upquantum = max(1, mbx.km_quantum/tick); /* * Each upcall structure has an owner thread, find which * one owns it. */ if (uap->newgroup) { /* * Because new ksegrp hasn't thread, * create an initial upcall thread to own it. */ thread_schedule_upcall(td, newku); } else { /* * If current thread hasn't an upcall structure, * just assign the upcall to it. */ if (td->td_upcall == NULL) { newku->ku_owner = td; td->td_upcall = newku; } else { /* * Create a new upcall thread to own it. */ thread_schedule_upcall(td, newku); } } mtx_unlock_spin(&sched_lock); return (0); } /* * Initialize global thread allocation resources. */ void threadinit(void) { thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, UMA_ALIGN_CACHE, 0); ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(), NULL, NULL, ksegrp_init, NULL, UMA_ALIGN_CACHE, 0); kse_zone = uma_zcreate("KSE", sched_sizeof_kse(), NULL, NULL, kse_init, NULL, UMA_ALIGN_CACHE, 0); upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall), NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0); } /* * Stash an embarasingly extra thread into the zombie thread queue. */ void thread_stash(struct thread *td) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra kse into the zombie kse queue. */ void kse_stash(struct kse *ke) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_kses, ke, ke_procq); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra upcall into the zombie upcall queue. */ void upcall_stash(struct kse_upcall *ku) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link); mtx_unlock_spin(&kse_zombie_lock); } /* * Stash an embarasingly extra ksegrp into the zombie ksegrp queue. */ void ksegrp_stash(struct ksegrp *kg) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp); mtx_unlock_spin(&kse_zombie_lock); } /* * Reap zombie kse resource. */ void thread_reap(void) { struct thread *td_first, *td_next; struct kse *ke_first, *ke_next; struct ksegrp *kg_first, * kg_next; struct kse_upcall *ku_first, *ku_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. 
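 * If anything is found, the queues are snapshotted and reset as a whole
 * under kse_zombie_lock, and the actual freeing (crfree(), thread_free(),
 * kse_free(), ksegrp_free(), upcall_free()) is done after the spin lock
 * has been dropped.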
*/ if ((!TAILQ_EMPTY(&zombie_threads)) || (!TAILQ_EMPTY(&zombie_kses)) || (!TAILQ_EMPTY(&zombie_ksegrps)) || (!TAILQ_EMPTY(&zombie_upcalls))) { mtx_lock_spin(&kse_zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); ke_first = TAILQ_FIRST(&zombie_kses); kg_first = TAILQ_FIRST(&zombie_ksegrps); ku_first = TAILQ_FIRST(&zombie_upcalls); if (td_first) TAILQ_INIT(&zombie_threads); if (ke_first) TAILQ_INIT(&zombie_kses); if (kg_first) TAILQ_INIT(&zombie_ksegrps); if (ku_first) TAILQ_INIT(&zombie_upcalls); mtx_unlock_spin(&kse_zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_runq); if (td_first->td_ucred) crfree(td_first->td_ucred); thread_free(td_first); td_first = td_next; } while (ke_first) { ke_next = TAILQ_NEXT(ke_first, ke_procq); kse_free(ke_first); ke_first = ke_next; } while (kg_first) { kg_next = TAILQ_NEXT(kg_first, kg_ksegrp); ksegrp_free(kg_first); kg_first = kg_next; } while (ku_first) { ku_next = TAILQ_NEXT(ku_first, ku_link); upcall_free(ku_first); ku_first = ku_next; } } } /* * Allocate a ksegrp. */ struct ksegrp * ksegrp_alloc(void) { return (uma_zalloc(ksegrp_zone, M_WAITOK)); } /* * Allocate a kse. */ struct kse * kse_alloc(void) { return (uma_zalloc(kse_zone, M_WAITOK)); } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a ksegrp. */ void ksegrp_free(struct ksegrp *td) { uma_zfree(ksegrp_zone, td); } /* * Deallocate a kse. */ void kse_free(struct kse *td) { uma_zfree(kse_zone, td); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { cpu_thread_clean(td); uma_zfree(thread_zone, td); } /* * Store the thread context in the UTS's mailbox. * then add the mailbox at the head of a list we are building in user space. * The list is anchored in the ksegrp structure. */ int thread_export_context(struct thread *td) { struct proc *p; struct ksegrp *kg; uintptr_t mbx; void *addr; int error,temp; mcontext_t mc; p = td->td_proc; kg = td->td_ksegrp; /* Export the user/machine context. */ get_mcontext(td, &mc, 0); addr = (void *)(&td->td_mailbox->tm_context.uc_mcontext); error = copyout(&mc, addr, sizeof(mcontext_t)); if (error) goto bad; /* Exports clock ticks in kernel mode */ addr = (caddr_t)(&td->td_mailbox->tm_sticks); temp = fuword(addr) + td->td_usticks; if (suword(addr, temp)) { error = EFAULT; goto bad; } /* Get address in latest mbox of list pointer */ addr = (void *)(&td->td_mailbox->tm_next); /* * Put the saved address of the previous first * entry into this one */ for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { error = EFAULT; goto bad; } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = td->td_mailbox; /* * The thread context may be taken away by * other upcall threads when we unlock * process lock. it's no longer valid to * use it again in any other places. */ td->td_mailbox = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } td->td_usticks = 0; return (0); bad: PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); /* The mailbox is bad, don't use it */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); } /* * Take the list of completed mailboxes for this KSEGRP and put them on this * upcall's mailbox as it's the next one going up. 
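 * This uses the same store-and-recheck pattern as thread_export_context()
 * above: write the current head with suword(), then re-check kg_completed
 * under the proc lock and retry if another thread changed it meanwhile.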
*/ static int thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku) { struct proc *p = kg->kg_proc; void *addr; uintptr_t mbx; addr = (void *)(&ku->ku_mailbox->km_completed); for (;;) { mbx = (uintptr_t)kg->kg_completed; if (suword(addr, mbx)) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (EFAULT); } PROC_LOCK(p); if (mbx == (uintptr_t)kg->kg_completed) { kg->kg_completed = NULL; PROC_UNLOCK(p); break; } PROC_UNLOCK(p); } return (0); } /* * This function should be called at statclock interrupt time */ int thread_statclock(int user) { struct thread *td = curthread; if (td->td_ksegrp->kg_numupcalls == 0) return (-1); if (user) { /* Current always do via ast() */ mtx_lock_spin(&sched_lock); td->td_flags |= (TDF_USTATCLOCK|TDF_ASTPENDING); mtx_unlock_spin(&sched_lock); td->td_uuticks++; } else { if (td->td_mailbox != NULL) td->td_usticks++; else { /* XXXKSE * We will call thread_user_enter() for every * kernel entry in future, so if the thread mailbox * is NULL, it must be a UTS kernel, don't account * clock ticks for it. */ } } return (0); } /* * Export state clock ticks for userland */ static int thread_update_usr_ticks(struct thread *td, int user) { struct proc *p = td->td_proc; struct kse_thr_mailbox *tmbx; struct kse_upcall *ku; struct ksegrp *kg; caddr_t addr; uint uticks; if ((ku = td->td_upcall) == NULL) return (-1); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) return (-1); if (user) { uticks = td->td_uuticks; td->td_uuticks = 0; addr = (caddr_t)&tmbx->tm_uticks; } else { uticks = td->td_usticks; td->td_usticks = 0; addr = (caddr_t)&tmbx->tm_sticks; } if (uticks) { if (suword(addr, uticks+fuword(addr))) { PROC_LOCK(p); psignal(p, SIGSEGV); PROC_UNLOCK(p); return (-2); } } kg = td->td_ksegrp; if (kg->kg_upquantum && ticks >= kg->kg_nextupcall) { mtx_lock_spin(&sched_lock); td->td_upcall->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } return (0); } /* * Discard the current thread and exit from its context. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). */ void thread_exit(void) { struct thread *td; struct kse *ke; struct proc *p; struct ksegrp *kg; td = curthread; kg = td->td_ksegrp; p = td->td_proc; ke = td->td_kse; mtx_assert(&sched_lock, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); KASSERT(ke != NULL, ("thread exiting without a kse")); KASSERT(kg != NULL, ("thread exiting without a kse group")); PROC_LOCK_ASSERT(p, MA_OWNED); CTR1(KTR_PROC, "thread_exit: thread %p", td); KASSERT(!mtx_owned(&Giant), ("dying thread owns giant")); if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } cpu_thread_exit(td); /* XXXSMP */ /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff. */ if (p->p_numthreads > 1) { thread_unlink(td); if (p->p_maxthrwaits) wakeup(&p->p_numthreads); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. 
*/ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } /* * Because each upcall structure has an owner thread, * owner thread exits only when process is in exiting * state, so upcall to userland is no longer needed, * deleting upcall structure is safe here. * So when all threads in a group is exited, all upcalls * in the group should be automatically freed. */ if (td->td_upcall) upcall_remove(td); ke->ke_state = KES_UNQUEUED; ke->ke_thread = NULL; /* * Decide what to do with the KSE attached to this thread. */ if (ke->ke_flags & KEF_EXIT) kse_unlink(ke); else kse_reassign(ke); PROC_UNLOCK(p); td->td_kse = NULL; td->td_state = TDS_INACTIVE; #if 0 td->td_proc = NULL; #endif td->td_ksegrp = NULL; td->td_last_kse = NULL; PCPU_SET(deadthread, td); } else { PROC_UNLOCK(p); } /* XXX Shouldn't cpu_throw() here. */ mtx_assert(&sched_lock, MA_OWNED); #if !defined(__alpha__) && !defined(__powerpc__) cpu_throw(td, choosethread()); #else cpu_throw(); #endif panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant held, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; KASSERT((p->p_numthreads == 1), ("Muliple threads in wait1()")); KASSERT((p->p_numksegrps == 1), ("Muliple ksegrps in wait1()")); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_standin != NULL) { thread_free(td->td_standin); td->td_standin = NULL; } cpu_thread_clean(td); } thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. */ void thread_link(struct thread *td, struct ksegrp *kg) { struct proc *p; p = kg->kg_proc; td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_ksegrp = kg; td->td_last_kse = NULL; td->td_flags = 0; td->td_kse = NULL; LIST_INIT(&td->td_contested); callout_init(&td->td_slpcallout, 1); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist); p->p_numthreads++; kg->kg_numthreads++; } void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; struct ksegrp *kg = td->td_ksegrp; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; TAILQ_REMOVE(&kg->kg_threads, td, td_kglist); kg->kg_numthreads--; /* could clear a few other things here */ } /* * Purge a ksegrp resource. When a ksegrp is preparing to * exit, it calls this function. */ static void kse_purge_group(struct thread *td) { struct ksegrp *kg; struct kse *ke; kg = td->td_ksegrp; KASSERT(kg->kg_numthreads == 1, ("%s: bad thread number", __func__)); while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); kse_unlink(ke); } KASSERT((kg->kg_kses == 1), ("%s: ksegrp still has %d KSEs", __func__, kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); } /* * Purge a process's KSE resource. When a process is preparing to * exit, it calls kse_purge to release any extra KSE resources in * the process. 
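 * Each ksegrp is unlinked and its idle KSEs are stashed for thread_reap();
 * the calling thread's own ksegrp is relinked at the end so the exiting
 * process keeps exactly one ksegrp for its last thread.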
*/ static void kse_purge(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse *ke; KASSERT(p->p_numthreads == 1, ("bad thread number")); while ((kg = TAILQ_FIRST(&p->p_ksegrps)) != NULL) { TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp); p->p_numksegrps--; /* * There is no ownership for KSE, after all threads * in the group exited, it is possible that some KSEs * were left in idle queue, gc them now. */ while ((ke = TAILQ_FIRST(&kg->kg_iq)) != NULL) { KASSERT(ke->ke_state == KES_IDLE, ("%s: wrong idle KSE state", __func__)); TAILQ_REMOVE(&kg->kg_iq, ke, ke_kgrlist); kg->kg_idle_kses--; TAILQ_REMOVE(&kg->kg_kseq, ke, ke_kglist); kg->kg_kses--; kse_stash(ke); } KASSERT(((kg->kg_kses == 0) && (kg != td->td_ksegrp)) || ((kg->kg_kses == 1) && (kg == td->td_ksegrp)), ("ksegrp has wrong kg_kses: %d", kg->kg_kses)); KASSERT((kg->kg_numupcalls == 0), ("%s: ksegrp still has %d upcall datas", __func__, kg->kg_numupcalls)); if (kg != td->td_ksegrp) ksegrp_stash(kg); } TAILQ_INSERT_HEAD(&p->p_ksegrps, td->td_ksegrp, kg_ksegrp); p->p_numksegrps++; } /* * This function is intended to be used to initialize a spare thread * for upcall. Initialize thread's large data area outside sched_lock * for thread_schedule_upcall(). */ void thread_alloc_spare(struct thread *td, struct thread *spare) { if (td->td_standin) return; if (spare == NULL) spare = thread_alloc(); td->td_standin = spare; bzero(&spare->td_startzero, (unsigned)RANGEOF(struct thread, td_startzero, td_endzero)); spare->td_proc = td->td_proc; spare->td_ucred = crhold(td->td_ucred); } /* * Create a thread and schedule it for upcall on the KSE given. * Use our thread's standin so that we don't have to allocate one. */ struct thread * thread_schedule_upcall(struct thread *td, struct kse_upcall *ku) { struct thread *td2; mtx_assert(&sched_lock, MA_OWNED); /* * Schedule an upcall thread on specified kse_upcall, * the kse_upcall must be free. * td must have a spare thread. */ KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__)); if ((td2 = td->td_standin) != NULL) { td->td_standin = NULL; } else { panic("no reserve thread when scheduling an upcall"); return (NULL); } CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)", td2, td->td_proc->p_pid, td->td_proc->p_comm); bcopy(&td->td_startcopy, &td2->td_startcopy, (unsigned) RANGEOF(struct thread, td_startcopy, td_endcopy)); thread_link(td2, ku->ku_ksegrp); /* inherit blocked thread's context */ cpu_set_upcall(td2, td); /* Let the new thread become owner of the upcall */ ku->ku_owner = td2; td2->td_upcall = ku; td2->td_flags = TDF_UPCALLING; #if 0 /* XXX This shouldn't be necessary */ if (td->td_proc->p_sflag & PS_NEEDSIGCHK) td2->td_flags |= TDF_ASTPENDING; #endif td2->td_kse = NULL; td2->td_state = TDS_CAN_RUN; td2->td_inhibitors = 0; setrunqueue(td2); return (td2); /* bogus.. should be a void function */ } void thread_signal_add(struct thread *td, int sig) { struct kse_upcall *ku; struct proc *p; sigset_t ss; int error; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&p->p_sigacts->ps_mtx, MA_OWNED); td = curthread; ku = td->td_upcall; mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); error = copyin(&ku->ku_mailbox->km_sigscaught, &ss, sizeof(sigset_t)); if (error) goto error; SIGADDSET(ss, sig); error = copyout(&ss, &ku->ku_mailbox->km_sigscaught, sizeof(sigset_t)); if (error) goto error; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); return; error: PROC_LOCK(p); sigexit(td, SIGILL); } /* * Schedule an upcall to notify a KSE process recieved signals. 
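 * This just marks the current thread TDF_UPCALLING; the upcall itself is
 * delivered the next time the thread passes through thread_userret().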
* */ void thread_signal_upcall(struct thread *td) { mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); return; } void thread_switchout(struct thread *td) { struct kse_upcall *ku; mtx_assert(&sched_lock, MA_OWNED); /* * If the outgoing thread is in threaded group and has never * scheduled an upcall, decide whether this is a short * or long term event and thus whether or not to schedule * an upcall. * If it is a short term event, just suspend it in * a way that takes its KSE with it. * Select the events for which we want to schedule upcalls. * For now it's just sleep. * XXXKSE eventually almost any inhibition could do. */ if (TD_CAN_UNBIND(td) && (td->td_standin) && TD_ON_SLEEPQ(td)) { /* * Release ownership of upcall, and schedule an upcall * thread, this new upcall thread becomes the owner of * the upcall structure. */ ku = td->td_upcall; ku->ku_owner = NULL; td->td_upcall = NULL; td->td_flags &= ~TDF_CAN_UNBIND; thread_schedule_upcall(td, ku); } } /* * Setup done on the thread when it enters the kernel. * XXXKSE Presently only for syscalls but eventually all kernel entries. */ void thread_user_enter(struct proc *p, struct thread *td) { struct ksegrp *kg; struct kse_upcall *ku; struct kse_thr_mailbox *tmbx; kg = td->td_ksegrp; /* * First check that we shouldn't just abort. * But check if we are the single thread first! */ PROC_LOCK(p); if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } PROC_UNLOCK(p); /* * If we are doing a syscall in a KSE environment, * note where our mailbox is. There is always the * possibility that we could do this lazily (in kse_reassign()), * but for now do it every time. */ kg = td->td_ksegrp; if (kg->kg_numupcalls) { ku = td->td_upcall; KASSERT(ku, ("%s: no upcall owned", __func__)); KASSERT((ku->ku_owner == td), ("%s: wrong owner", __func__)); KASSERT(!TD_CAN_UNBIND(td), ("%s: can unbind", __func__)); ku->ku_mflags = fuword((void *)&ku->ku_mailbox->km_flags); tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread); if ((tmbx == NULL) || (tmbx == (void *)-1)) { td->td_mailbox = NULL; } else { td->td_mailbox = tmbx; if (td->td_standin == NULL) thread_alloc_spare(td, NULL); mtx_lock_spin(&sched_lock); if (ku->ku_mflags & KMF_NOUPCALL) td->td_flags &= ~TDF_CAN_UNBIND; else td->td_flags |= TDF_CAN_UNBIND; mtx_unlock_spin(&sched_lock); } } } /* * The extra work we go through if we are a threaded process when we * return to userland. * * If we are a KSE process and returning to user mode, check for * extra work to do before we return (e.g. for more syscalls * to complete first). If we were in a critical section, we should * just return to let it finish. Same if we were in the UTS (in * which case the mailbox's context's busy indicator will be set). * The only traps we suport will have set the mailbox. * We will clear it here. */ int thread_userret(struct thread *td, struct trapframe *frame) { int error = 0, upcalls, uts_crit; struct kse_upcall *ku; struct ksegrp *kg, *kg2; struct proc *p; struct timespec ts; p = td->td_proc; kg = td->td_ksegrp; /* Nothing to do with non-threaded group/process */ if (td->td_ksegrp->kg_numupcalls == 0) return (0); /* * Stat clock interrupt hit in userland, it * is returning from interrupt, charge thread's * userland time for UTS. 
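 * (TDF_USTATCLOCK is set by thread_statclock() when a statclock tick lands
 * in user mode; the ticks accumulated in td_uuticks are pushed out to the
 * thread mailbox below via thread_update_usr_ticks().)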
*/ if (td->td_flags & TDF_USTATCLOCK) { thread_update_usr_ticks(td, 1); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_USTATCLOCK; mtx_unlock_spin(&sched_lock); if (kg->kg_completed || (td->td_upcall->ku_flags & KUF_DOUPCALL)) thread_user_enter(p, td); } uts_crit = (td->td_mailbox == NULL); ku = td->td_upcall; /* * Optimisation: * This thread has not started any upcall. * If there is no work to report other than ourself, * then it can return direct to userland. */ if (TD_CAN_UNBIND(td)) { mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_CAN_UNBIND; if ((td->td_flags & TDF_NEEDSIGCHK) == 0 && (kg->kg_completed == NULL) && (ku->ku_flags & KUF_DOUPCALL) == 0 && (kg->kg_upquantum && ticks < kg->kg_nextupcall)) { mtx_unlock_spin(&sched_lock); thread_update_usr_ticks(td, 0); nanotime(&ts); error = copyout(&ts, (caddr_t)&ku->ku_mailbox->km_timeofday, sizeof(ts)); td->td_mailbox = 0; ku->ku_mflags = 0; if (error) goto out; return (0); } mtx_unlock_spin(&sched_lock); error = thread_export_context(td); if (error) { /* * Failing to do the KSE operation just defaults * back to synchonous operation, so just return from * the syscall. */ goto out; } /* * There is something to report, and we own an upcall * strucuture, we can go to userland. * Turn ourself into an upcall thread. */ mtx_lock_spin(&sched_lock); td->td_flags |= TDF_UPCALLING; mtx_unlock_spin(&sched_lock); } else if (td->td_mailbox && (ku == NULL)) { error = thread_export_context(td); /* possibly upcall with error? */ PROC_LOCK(p); /* * There are upcall threads waiting for * work to do, wake one of them up. * XXXKSE Maybe wake all of them up. */ if (!error && kg->kg_upsleeps) wakeup_one(&kg->kg_completed); mtx_lock_spin(&sched_lock); thread_stopped(p); thread_exit(); /* NOTREACHED */ } KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind")); if (p->p_numthreads > max_threads_per_proc) { max_threads_hits++; PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_maxthrwaits++; while (p->p_numthreads > max_threads_per_proc) { upcalls = 0; FOREACH_KSEGRP_IN_PROC(p, kg2) { if (kg2->kg_numupcalls == 0) upcalls++; else upcalls += kg2->kg_numupcalls; } if (upcalls >= max_threads_per_proc) break; mtx_unlock_spin(&sched_lock); if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH, "maxthreads", NULL)) { mtx_lock_spin(&sched_lock); break; } else { mtx_lock_spin(&sched_lock); } } p->p_maxthrwaits--; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } if (td->td_flags & TDF_UPCALLING) { uts_crit = 0; kg->kg_nextupcall = ticks+kg->kg_upquantum; /* * There is no more work to do and we are going to ride * this thread up to userland as an upcall. * Do the last parts of the setup needed for the upcall. */ CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)", td, td->td_proc->p_pid, td->td_proc->p_comm); mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_UPCALLING; if (ku->ku_flags & KUF_DOUPCALL) ku->ku_flags &= ~KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); /* * Set user context to the UTS */ if (!(ku->ku_mflags & KMF_NOUPCALL)) { cpu_set_upcall_kse(td, ku); error = suword(&ku->ku_mailbox->km_curthread, 0); if (error) goto out; } /* * Unhook the list of completed threads. * anything that completes after this gets to * come in next time. * Put the list of completed thread mailboxes on * this KSE's mailbox. 
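 * (That is the thread_link_mboxes() call below; it is skipped when the UTS
 * set KMF_NOCOMPLETED in its mailbox flags.)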
*/ if (!(ku->ku_mflags & KMF_NOCOMPLETED) && (error = thread_link_mboxes(kg, ku)) != 0) goto out; } if (!uts_crit) { nanotime(&ts); error = copyout(&ts, &ku->ku_mailbox->km_timeofday, sizeof(ts)); } out: if (error) { /* * Things are going to be so screwed we should just kill * the process. * how do we do that? */ PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGSEGV); PROC_UNLOCK(td->td_proc); } else { /* * Optimisation: * Ensure that we have a spare thread available, * for when we re-enter the kernel. */ if (td->td_standin == NULL) thread_alloc_spare(td, NULL); } ku->ku_mflags = 0; /* * Clear thread mailbox first, then clear system tick count. * The order is important because thread_statclock() use * mailbox pointer to see if it is an userland thread or * an UTS kernel thread. */ td->td_mailbox = NULL; td->td_usticks = 0; return (error); /* go sync */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accellerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). */ int thread_single(int force_exit) { struct thread *td; struct thread *td2; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); - if ((p->p_flag & P_THREADED) == 0 && p->p_numthreads == 1) + if ((p->p_flag & P_SA) == 0 && p->p_numthreads == 1) return (0); /* Is someone already single threading? */ if (p->p_singlethread) return (1); if (force_exit == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; } else p->p_flag &= ~P_SINGLE_EXIT; p->p_flag |= P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = td; while ((p->p_numthreads - p->p_suspcount) != 1) { FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { if (force_exit == SINGLE_EXIT) { if (TD_IS_SUSPENDED(td2)) { thread_unsuspend_one(td2); } if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_CVWAITQ) cv_abort(td2); else abortsleep(td2); } } else { if (TD_IS_SUSPENDED(td2)) continue; /* * maybe other inhibitted states too? * XXXKSE Is it totally safe to * suspend a non-interruptable thread? */ if (td2->td_inhibitors & (TDI_SLEEPING | TDI_SWAPPED)) thread_suspend_one(td2); } } } /* * Maybe we suspended some threads.. was it enough? */ if ((p->p_numthreads - p->p_suspcount) == 1) break; /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_suspend_one(td); DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); mtx_lock_spin(&sched_lock); } if (force_exit == SINGLE_EXIT) { if (td->td_upcall) upcall_remove(td); kse_purge(p, td); } mtx_unlock_spin(&sched_lock); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). 
* If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. * * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p)) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if (return_instead) return (1); mtx_lock_spin(&sched_lock); thread_stopped(p); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) { while (mtx_owned(&Giant)) mtx_unlock(&Giant); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_exit(); else thr_exit1(); } /* * When a thread suspends, it just * moves to the processes's suspend queue * and stays there. */ thread_suspend_one(td); if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } DROP_GIANT(); PROC_UNLOCK(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq); /* * Hack: If we are suspending but are on the sleep queue * then we are in msleep or the cv equivalent. We * want to look like we have two Inhibitors. * May already be set.. doesn't matter. */ if (TD_ON_SLEEPQ(td)) TD_SET_SLEEPING(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); TAILQ_REMOVE(&p->p_suspended, td, td_runq); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. 
*/ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { while (( td = TAILQ_FIRST(&p->p_suspended))) { thread_unsuspend_one(td); } } mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/sched_4bsd.c =================================================================== --- head/sys/kern/sched_4bsd.c (revision 116360) +++ head/sys/kern/sched_4bsd.c (revision 116361) @@ -1,715 +1,715 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include /* * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #define NICE_WEIGHT 1 /* Priorities per nice level. */ struct ke_sched { int ske_cpticks; /* (j) Ticks of cpu time. */ }; static struct ke_sched ke_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = NULL; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = NULL; static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ static struct callout schedcpu_callout; static struct callout roundrobin_callout; static void roundrobin(void *arg); static void schedcpu(void *arg); static void sched_setup(void *dummy); static void maybe_resched(struct thread *td); static void updatepri(struct ksegrp *kg); static void resetpriority(struct ksegrp *kg); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) /* * Global run queue. */ static struct runq runq; SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", "Roundrobin scheduling quantum in microseconds"); /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ static void maybe_resched(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); if (td->td_priority < curthread->td_priority && curthread->td_kse) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Force switch among equal priority processes every 100ms. * We don't actually need to force a context switch of the current process. * The act of firing the event triggers a context switch to softclock() and * then switching back out again which is equivalent to a preemption, thus * no further work is needed on the local CPU. */ /* ARGSUSED */ static void roundrobin(void *arg) { #ifdef SMP mtx_lock_spin(&sched_lock); forward_roundrobin(); mtx_unlock_spin(&sched_lock); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (p_estcpu) usage in 5 * loadav time * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates p_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. 
* That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * p_estcpu *= decay; * will compute * p_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(void *arg) { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; int realstathz; int awake; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { mtx_lock_spin(&sched_lock); p->p_swtime++; FOREACH_KSEGRP_IN_PROC(p, kg) { awake = 0; FOREACH_KSE_IN_GROUP(kg, ke) { /* * Increment time in/out of memory and sleep * time (if sleeping). We ignore overflow; * with 16-bit int's (remember them?) * overflow takes 45 days. */ /* * The kse slptimes are not touched in wakeup * because the thread may not HAVE a KSE. 
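 * (The ke_pctcpu update below decays by ccpu once per second; as
 * ccpu =~ exp(-1/20), roughly exp(-3), i.e. 5%, of the old value remains
 * after 60 passes, which is where the "95% in 60 seconds" figure comes from.)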
*/ if (ke->ke_state == KES_ONRUNQ) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } else if ((ke->ke_state == KES_THREAD) && (TD_IS_RUNNING(ke->ke_thread))) { awake = 1; /* Do not clear KEF_DIDRUN */ } else if (ke->ke_flags & KEF_DIDRUN) { awake = 1; ke->ke_flags &= ~KEF_DIDRUN; } /* * pctcpu is only for ps? * Do it per kse.. and add them up at the end? * XXXKSE */ ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> FSHIFT; /* * If the kse has been idle the entire second, * stop recalculating its priority until * it wakes up. */ if (ke->ke_sched->ske_cpticks == 0) continue; #if (FSHIFT >= CCPU_SHIFT) ke->ke_pctcpu += (realstathz == 100) ? ((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT) : 100 * (((fixpt_t) ke->ke_sched->ske_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else ke->ke_pctcpu += ((FSCALE - ccpu) * (ke->ke_sched->ske_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif ke->ke_sched->ske_cpticks = 0; } /* end of kse loop */ /* * If there are ANY running threads in this KSEGRP, * then don't count it as sleeping. */ if (awake) { if (kg->kg_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us * up from the long sleep should have * unwound the slptime and reset our * priority before we run at the stale * priority. Should KASSERT at some * point when all the cases are fixed. */ updatepri(kg); } kg->kg_slptime = 0; } else { kg->kg_slptime++; } if (kg->kg_slptime > 1) continue; kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu); resetpriority(kg); FOREACH_THREAD_IN_GROUP(kg, td) { if (td->td_priority >= PUSER) { sched_prio(td, kg->kg_user_pri); } } } /* end of ksegrp loop */ mtx_unlock_spin(&sched_lock); } /* end of process loop */ sx_sunlock(&allproc_lock); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max p_estcpu of 255, sleeping for at * least six times the loadfactor will decay p_estcpu to zero. */ static void updatepri(struct ksegrp *kg) { register unsigned int newcpu; register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); newcpu = kg->kg_estcpu; if (kg->kg_slptime > 5 * loadfac) kg->kg_estcpu = 0; else { kg->kg_slptime--; /* the first time was done in schedcpu */ while (newcpu && --kg->kg_slptime) newcpu = decay_cpu(loadfac, newcpu); kg->kg_estcpu = newcpu; } resetpriority(kg); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ static void resetpriority(struct ksegrp *kg) { register unsigned int newpriority; struct thread *td; if (kg->kg_pri_class == PRI_TIMESHARE) { newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (kg->kg_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); kg->kg_user_pri = newpriority; } FOREACH_THREAD_IN_GROUP(kg, td) { maybe_resched(td); /* XXXKSE silly */ } } /* ARGSUSED */ static void sched_setup(void *dummy) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; callout_init(&schedcpu_callout, 1); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* External interfaces start here */ int sched_runnable(void) { return runq_check(&runq); } int sched_rr_interval(void) { if (sched_quantum == 0) sched_quantum = SCHED_QUANTUM; return (sched_quantum); } /* * We adjust the priority of the current process. 
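A small sketch of the resetpriority() arithmetic for PRI_TIMESHARE ksegrps. The priority constants used here (PUSER = PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223, PRIO_MIN = -20) are assumed values from <sys/priority.h> and <sys/resource.h>, quoted only to make the numbers concrete.

#include <stdio.h>

#define PUSER                 160   /* assumed, equals PRI_MIN_TIMESHARE */
#define PRI_MIN_TIMESHARE     160
#define PRI_MAX_TIMESHARE     223
#define PRIO_MIN              (-20)
#define INVERSE_ESTCPU_WEIGHT 8
#define NICE_WEIGHT           1

static int user_pri(int estcpu, int nice)
{
    int pri;

    pri = PUSER + estcpu / INVERSE_ESTCPU_WEIGHT +
        NICE_WEIGHT * (nice - PRIO_MIN);
    if (pri < PRI_MIN_TIMESHARE)        /* clamp into the timeshare range */
        pri = PRI_MIN_TIMESHARE;
    if (pri > PRI_MAX_TIMESHARE)
        pri = PRI_MAX_TIMESHARE;
    return (pri);
}

int main(void)
{
    /* A nice 0 process loses one priority level per 8 estcpu ticks. */
    printf("estcpu 0,  nice  0 -> %d\n", user_pri(0, 0));    /* 180 */
    printf("estcpu 80, nice  0 -> %d\n", user_pri(80, 0));   /* 190 */
    printf("estcpu 0,  nice 20 -> %d\n", user_pri(0, 20));   /* 200 */
    return (0);
}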
The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (p_estcpu) is increased here. resetpriority() will * compute a different priority each time p_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void sched_clock(struct kse *ke) { struct ksegrp *kg; struct thread *td; mtx_assert(&sched_lock, MA_OWNED); kg = ke->ke_ksegrp; td = ke->ke_thread; ke->ke_sched->ske_cpticks++; kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1); if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(kg); if (td->td_priority >= PUSER) td->td_priority = kg->kg_user_pri; } } /* * charge childs scheduling cpu usage to parent. * * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp. * Charge it to the ksegrp that did the wait since process estcpu is sum of * all ksegrps, this is strictly as expected. Assume that the child process * aggregated all the estcpu into the 'built-in' ksegrp. */ void sched_exit(struct proc *p, struct proc *p1) { sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_exit_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_exit_kse(struct kse *ke, struct kse *child) { } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + child->kg_estcpu); } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_fork(struct proc *p, struct proc *p1) { sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_sched->ske_cpticks = 0; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { mtx_assert(&sched_lock, MA_OWNED); child->kg_estcpu = kg->kg_estcpu; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_nice(struct ksegrp *kg, int nice) { PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); kg->kg_nice = nice; resetpriority(kg); } void sched_class(struct ksegrp *kg, int class) { mtx_assert(&sched_lock, MA_OWNED); kg->kg_pri_class = class; } /* * Adjust the priority of a thread. * This may include moving the thread within the KSEGRP, * changing the assignment of a kse to the thread, * and moving a KSE in the system run queue. 
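A sketch of the charging cadence described above: the 4BSD sched_clock() bumps kg_estcpu once per statclock tick and only calls resetpriority() when the count crosses a multiple of INVERSE_ESTCPU_WEIGHT, so the user priority degrades by one step per eight ticks of CPU use. The estcpu ceiling below is an assumed stand-in for the ESTCPULIM() value.

#include <stdio.h>

#define INVERSE_ESTCPU_WEIGHT 8
#define ESTCPU_MAX            295     /* assumed ESTCPULIM() ceiling */

int main(void)
{
    unsigned int estcpu = 0;
    int tick, recomputes = 0;

    for (tick = 1; tick <= 128; tick++) {
        estcpu++;                          /* charge one statclock tick */
        if (estcpu > ESTCPU_MAX)           /* ESTCPULIM() clamp */
            estcpu = ESTCPU_MAX;
        if ((estcpu % INVERSE_ESTCPU_WEIGHT) == 0)
            recomputes++;                  /* resetpriority() would run */
    }
    printf("estcpu %u after 128 ticks, %d recomputations\n",
        estcpu, recomputes);               /* 128 and 16 */
    return (0);
}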
*/ void sched_prio(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); if (TD_ON_RUNQ(td)) { adjustrunqueue(td, prio); } else { td->td_priority = prio; } } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_ksegrp->kg_slptime = 0; td->td_priority = prio; } void sched_switchin(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); td->td_oncpu = PCPU_GET(cpuid); } void sched_switchout(struct thread *td) { struct kse *ke; struct proc *p; ke = td->td_kse; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_THREAD), ("mi_switch: kse state?")); td->td_lastcpu = td->td_oncpu; td->td_last_kse = ke; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; /* * At the last moment, if this thread is still marked RUNNING, * then put it back on the run queue as it has not been suspended * or stopped or any thing else similar. */ if (TD_IS_RUNNING(td)) { /* Put us back on the run queue (kse and all). */ setrunqueue(td); - } else if (p->p_flag & P_THREADED) { + } else if (p->p_flag & P_SA) { /* * We will not be on the run queue. So we must be * sleeping or similar. As it's available, * someone else can use the KSE if they need it. */ kse_reassign(ke); } } void sched_wakeup(struct thread *td) { struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); kg = td->td_ksegrp; if (kg->kg_slptime > 1) updatepri(kg); kg->kg_slptime = 0; setrunqueue(td); maybe_resched(td); } void sched_add(struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("runq_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("runq_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_add: process swapped out")); ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(&runq, ke); } void sched_rem(struct kse *ke) { KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_remove: process swapped out")); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); mtx_assert(&sched_lock, MA_OWNED); runq_remove(&runq, ke); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; } struct kse * sched_choose(void) { struct kse *ke; ke = runq_choose(&runq); if (ke != NULL) { runq_remove(&runq, ke); ke->ke_state = KES_THREAD; KASSERT((ke->ke_thread != NULL), ("runq_choose: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("runq_choose: No KSE on thread")); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("runq_choose: process swapped out")); } return (ke); } void sched_userret(struct thread *td) { struct ksegrp *kg; /* * XXX we cheat slightly on the locking here to avoid locking in * the usual case. Setting td_priority here is essentially an * incomplete workaround for not setting it properly elsewhere. * Now that some interrupt handlers are threads, not setting it * properly elsewhere can clobber it in the window between setting * it here and returning to user mode, so don't waste time setting * it perfectly here. 
*/ kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread)); } fixpt_t sched_pctcpu(struct kse *ke) { return (ke->ke_pctcpu); } Index: head/sys/kern/sched_ule.c =================================================================== --- head/sys/kern/sched_ule.c (revision 116360) +++ head/sys/kern/sched_ule.c (revision 116361) @@ -1,1280 +1,1280 @@ /*- * Copyright (c) 2002-2003, Jeffrey Roberson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #ifdef KTRACE #include #include #endif #include #define KTR_ULE KTR_NFS /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ /* XXX This is bogus compatability crap for ps */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); static void sched_setup(void *dummy); SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); static int sched_strict; SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); static int slice_min = 1; SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); static int slice_max = 2; SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); int realstathz; int tickincr = 1; #ifdef SMP /* Callout to handle load balancing SMP systems. */ static struct callout kseq_lb_callout; #endif /* * These datastructures are allocated within their parent datastructure but * are scheduler specific. 
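One plausible way to realize the layout implied by the sched_sizeof_*() functions and the comment above: the scheduler-private area is carved out of the same allocation, immediately after the generic structure. The allocation call and field names here are illustrative, not the kernel's zone code.

#include <stdio.h>
#include <stdlib.h>

struct ke_sched { int ske_cpticks; };

struct kse {
    int              ke_state;        /* generic, scheduler-visible fields */
    struct ke_sched *ke_sched;        /* points into the same allocation */
};

static size_t sched_sizeof_kse(void)
{
    return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int main(void)
{
    struct kse *ke = calloc(1, sched_sizeof_kse());

    if (ke == NULL)
        return (1);
    ke->ke_sched = (struct ke_sched *)(ke + 1);   /* private area follows */
    ke->ke_sched->ske_cpticks = 0;
    printf("kse %zu bytes + ke_sched %zu bytes in one object\n",
        sizeof(struct kse), sizeof(struct ke_sched));
    free(ke);
    return (0);
}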
*/ struct ke_sched { int ske_slice; struct runq *ske_runq; /* The following variables are only used for pctcpu calculation */ int ske_ltick; /* Last tick that we were running on */ int ske_ftick; /* First tick that we were running on */ int ske_ticks; /* Tick count */ /* CPU that we have affinity for. */ u_char ske_cpu; }; #define ke_slice ke_sched->ske_slice #define ke_runq ke_sched->ske_runq #define ke_ltick ke_sched->ske_ltick #define ke_ftick ke_sched->ske_ftick #define ke_ticks ke_sched->ske_ticks #define ke_cpu ke_sched->ske_cpu struct kg_sched { int skg_slptime; /* Number of ticks we vol. slept */ int skg_runtime; /* Number of ticks we were running */ }; #define kg_slptime kg_sched->skg_slptime #define kg_runtime kg_sched->skg_runtime struct td_sched { int std_slptime; }; #define td_slptime td_sched->std_slptime struct td_sched td_sched; struct ke_sched ke_sched; struct kg_sched kg_sched; struct ke_sched *kse0_sched = &ke_sched; struct kg_sched *ksegrp0_sched = &kg_sched; struct p_sched *proc0_sched = NULL; struct td_sched *thread0_sched = &td_sched; /* * This priority range has 20 priorities on either end that are reachable * only through nice values. * * PRI_RANGE: Total priority range for timeshare threads. * PRI_NRESV: Reserved priorities for nice. * PRI_BASE: The start of the dynamic range. * DYN_RANGE: Number of priorities that are available int the dynamic * priority range. */ #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) #define SCHED_PRI_NRESV PRIO_TOTAL #define SCHED_PRI_NHALF (PRIO_TOTAL / 2) #define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) #define SCHED_PRI_BASE ((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE) #define SCHED_DYN_RANGE (SCHED_PRI_RANGE - SCHED_PRI_NRESV) #define SCHED_PRI_INTERACT(score) \ ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE) /* * These determine the interactivity of a process. * * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate * before throttling back. * SLP_RUN_THROTTLE: Divisor for reducing slp/run time. * INTERACT_RANGE: Range of interactivity values. Smaller is better. * INTERACT_HALF: Convenience define, half of the interactivity range. * INTERACT_THRESH: Threshhold for placement on the current runq. */ #define SCHED_SLP_RUN_MAX ((hz / 10) << 10) #define SCHED_SLP_RUN_THROTTLE (10) #define SCHED_INTERACT_RANGE (100) #define SCHED_INTERACT_HALF (SCHED_INTERACT_RANGE / 2) #define SCHED_INTERACT_THRESH (10) /* * These parameters and macros determine the size of the time slice that is * granted to each thread. * * SLICE_MIN: Minimum time slice granted, in units of ticks. * SLICE_MAX: Maximum time slice granted. * SLICE_RANGE: Range of available time slices scaled by hz. * SLICE_SCALE: The number slices granted per val in the range of [0, max]. * SLICE_NICE: Determine the amount of slice granted to a scaled nice. */ #define SCHED_SLICE_MIN (slice_min) #define SCHED_SLICE_MAX (slice_max) #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) #define SCHED_SLICE_NICE(nice) \ (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) /* * This macro determines whether or not the kse belongs on the current or * next run queue. * * XXX nice value should effect how interactive a kg is. 
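A sketch of how the macros above translate an interactivity score and a nice value into a timeshare priority, mirroring what sched_priority() does further down. PRI_MIN_TIMESHARE = 160, PRI_MAX_TIMESHARE = 223 and PRIO_TOTAL = 40 are assumed values used only to make the arithmetic concrete.

#include <stdio.h>

#define PRI_MIN_TIMESHARE  160
#define PRI_MAX_TIMESHARE  223
#define PRIO_TOTAL         40

#define SCHED_PRI_RANGE      (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define SCHED_PRI_NRESV      PRIO_TOTAL
#define SCHED_PRI_BASE       ((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define SCHED_DYN_RANGE      (SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define SCHED_INTERACT_RANGE 100
#define SCHED_PRI_INTERACT(score) \
    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE)

static int ule_user_pri(int score, int nice)
{
    int pri = SCHED_PRI_INTERACT(score) + SCHED_PRI_BASE + nice;

    if (pri > PRI_MAX_TIMESHARE)
        pri = PRI_MAX_TIMESHARE;
    else if (pri < PRI_MIN_TIMESHARE)
        pri = PRI_MIN_TIMESHARE;
    return (pri);
}

int main(void)
{
    printf("score   0, nice   0 -> %d\n", ule_user_pri(0, 0));    /* 180 */
    printf("score 100, nice   0 -> %d\n", ule_user_pri(100, 0));  /* 204 */
    printf("score   0, nice -20 -> %d\n", ule_user_pri(0, -20));  /* 160 */
    return (0);
}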
*/ #define SCHED_INTERACTIVE(kg) \ (sched_interact_score(kg) < SCHED_INTERACT_THRESH) #define SCHED_CURR(kg, ke) \ (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg)) /* * Cpu percentage computation macros and defines. * * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. */ #define SCHED_CPU_TIME 10 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) /* * kseq - per processor runqs and statistics. */ #define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ struct kseq { struct runq ksq_idle; /* Queue of IDLE threads. */ struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ struct runq *ksq_next; /* Next timeshare queue. */ struct runq *ksq_curr; /* Current queue. */ int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ int ksq_load; /* Aggregate load. */ short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ short ksq_nicemin; /* Least nice. */ #ifdef SMP unsigned int ksq_rslices; /* Slices on run queue */ #endif }; /* * One kse queue per processor. */ #ifdef SMP struct kseq kseq_cpu[MAXCPU]; #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) #define KSEQ_CPU(x) (&kseq_cpu[(x)]) #else struct kseq kseq_cpu; #define KSEQ_SELF() (&kseq_cpu) #define KSEQ_CPU(x) (&kseq_cpu) #endif static void sched_slice(struct kse *ke); static void sched_priority(struct ksegrp *kg); static int sched_interact_score(struct ksegrp *kg); void sched_pctcpu_update(struct kse *ke); int sched_pickcpu(void); /* Operations on per processor queues */ static struct kse * kseq_choose(struct kseq *kseq); static void kseq_setup(struct kseq *kseq); static void kseq_add(struct kseq *kseq, struct kse *ke); static void kseq_rem(struct kseq *kseq, struct kse *ke); static void kseq_nice_add(struct kseq *kseq, int nice); static void kseq_nice_rem(struct kseq *kseq, int nice); void kseq_print(int cpu); #ifdef SMP struct kseq * kseq_load_highest(void); void kseq_balance(void *arg); void kseq_move(struct kseq *from, int cpu); #endif void kseq_print(int cpu) { struct kseq *kseq; int i; kseq = KSEQ_CPU(cpu); printf("kseq:\n"); printf("\tload: %d\n", kseq->ksq_load); printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); printf("\tnice counts:\n"); for (i = 0; i < PRIO_TOTAL + 1; i++) if (kseq->ksq_nice[i]) printf("\t\t%d = %d\n", i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); } static void kseq_add(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; kseq->ksq_load++; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices += ke->ke_slice; #endif } static void kseq_rem(struct kseq *kseq, struct kse *ke) { mtx_assert(&sched_lock, MA_OWNED); kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; kseq->ksq_load--; ke->ke_runq = NULL; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); #ifdef SMP kseq->ksq_rslices -= ke->ke_slice; #endif } static void kseq_nice_add(struct kseq *kseq, int nice) 
{ mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) kseq->ksq_nicemin = nice; } static void kseq_nice_rem(struct kseq *kseq, int nice) { int n; mtx_assert(&sched_lock, MA_OWNED); /* Normalize to zero. */ n = nice + SCHED_PRI_NHALF; kseq->ksq_nice[n]--; KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); /* * If this wasn't the smallest nice value or there are more in * this bucket we can just return. Otherwise we have to recalculate * the smallest nice. */ if (nice != kseq->ksq_nicemin || kseq->ksq_nice[n] != 0 || kseq->ksq_loads[PRI_TIMESHARE] == 0) return; for (; n < SCHED_PRI_NRESV + 1; n++) if (kseq->ksq_nice[n]) { kseq->ksq_nicemin = n - SCHED_PRI_NHALF; return; } } #ifdef SMP /* * kseq_balance is a simple CPU load balancing algorithm. It operates by * finding the least loaded and most loaded cpu and equalizing their load * by migrating some processes. * * Dealing only with two CPUs at a time has two advantages. Firstly, most * installations will only have 2 cpus. Secondly, load balancing too much at * once can have an unpleasant effect on the system. The scheduler rarely has * enough information to make perfect decisions. So this algorithm chooses * algorithm simplicity and more gradual effects on load in larger systems. * * It could be improved by considering the priorities and slices assigned to * each task prior to balancing them. There are many pathological cases with * any approach and so the semi random algorithm below may work as well as any. * */ void kseq_balance(void *arg) { struct kseq *kseq; int high_load; int low_load; int high_cpu; int low_cpu; int move; int diff; int i; high_cpu = 0; low_cpu = 0; high_load = 0; low_load = -1; mtx_lock_spin(&sched_lock); for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > high_load) { high_load = kseq->ksq_load; high_cpu = i; } if (low_load == -1 || kseq->ksq_load < low_load) { low_load = kseq->ksq_load; low_cpu = i; } } /* * Nothing to do. */ if (high_load < 2 || low_load == high_load) goto out; diff = high_load - low_load; move = diff / 2; if (diff & 0x1) move++; for (i = 0; i < move; i++) kseq_move(KSEQ_CPU(high_cpu), low_cpu); out: mtx_unlock_spin(&sched_lock); callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); return; } struct kseq * kseq_load_highest(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); cpu = 0; load = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > load) { load = kseq->ksq_load; cpu = i; } } if (load > 1) return (KSEQ_CPU(cpu)); return (NULL); } void kseq_move(struct kseq *from, int cpu) { struct kse *ke; ke = kseq_choose(from); runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; kseq_rem(from, ke); ke->ke_cpu = cpu; sched_add(ke); } #endif struct kse * kseq_choose(struct kseq *kseq) { struct kse *ke; struct runq *swap; mtx_assert(&sched_lock, MA_OWNED); swap = NULL; for (;;) { ke = runq_choose(kseq->ksq_curr); if (ke == NULL) { /* * We already swaped once and didn't get anywhere. */ if (swap) break; swap = kseq->ksq_curr; kseq->ksq_curr = kseq->ksq_next; kseq->ksq_next = swap; continue; } /* * If we encounter a slice of 0 the kse is in a * TIMESHARE kse group and its nice was too far out * of the range that receives slices. 
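A small sketch of the migration count that kseq_balance() settles on for one pass between the most and least loaded CPUs: half the load difference, rounded up, and nothing at all when the busiest queue holds fewer than two KSEs or the loads already match.

#include <stdio.h>

static int balance_moves(int high_load, int low_load)
{
    int diff, move;

    if (high_load < 2 || low_load == high_load)
        return (0);               /* nothing to do */
    diff = high_load - low_load;
    move = diff / 2;
    if (diff & 0x1)               /* round odd differences up */
        move++;
    return (move);
}

int main(void)
{
    printf("loads 7/2 -> move %d\n", balance_moves(7, 2));   /* 3 */
    printf("loads 5/4 -> move %d\n", balance_moves(5, 4));   /* 1 */
    printf("loads 1/0 -> move %d\n", balance_moves(1, 0));   /* 0 */
    return (0);
}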
*/ if (ke->ke_slice == 0) { runq_remove(ke->ke_runq, ke); sched_slice(ke); ke->ke_runq = kseq->ksq_next; runq_add(ke->ke_runq, ke); continue; } return (ke); } return (runq_choose(&kseq->ksq_idle)); } static void kseq_setup(struct kseq *kseq) { runq_init(&kseq->ksq_timeshare[0]); runq_init(&kseq->ksq_timeshare[1]); runq_init(&kseq->ksq_idle); kseq->ksq_curr = &kseq->ksq_timeshare[0]; kseq->ksq_next = &kseq->ksq_timeshare[1]; kseq->ksq_loads[PRI_ITHD] = 0; kseq->ksq_loads[PRI_REALTIME] = 0; kseq->ksq_loads[PRI_TIMESHARE] = 0; kseq->ksq_loads[PRI_IDLE] = 0; kseq->ksq_load = 0; #ifdef SMP kseq->ksq_rslices = 0; #endif } static void sched_setup(void *dummy) { int i; slice_min = (hz/100); slice_max = (hz/10); mtx_lock_spin(&sched_lock); /* init kseqs */ for (i = 0; i < MAXCPU; i++) kseq_setup(KSEQ_CPU(i)); kseq_add(KSEQ_SELF(), &kse0); mtx_unlock_spin(&sched_lock); #ifdef SMP callout_init(&kseq_lb_callout, 1); kseq_balance(NULL); #endif } /* * Scale the scheduling priority according to the "interactivity" of this * process. */ static void sched_priority(struct ksegrp *kg) { int pri; if (kg->kg_pri_class != PRI_TIMESHARE) return; pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); pri += SCHED_PRI_BASE; pri += kg->kg_nice; if (pri > PRI_MAX_TIMESHARE) pri = PRI_MAX_TIMESHARE; else if (pri < PRI_MIN_TIMESHARE) pri = PRI_MIN_TIMESHARE; kg->kg_user_pri = pri; return; } /* * Calculate a time slice based on the properties of the kseg and the runq * that we're on. This is only for PRI_TIMESHARE ksegrps. */ static void sched_slice(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; kg = ke->ke_ksegrp; kseq = KSEQ_CPU(ke->ke_cpu); /* * Rationale: * KSEs in interactive ksegs get the minimum slice so that we * quickly notice if it abuses its advantage. * * KSEs in non-interactive ksegs are assigned a slice that is * based on the ksegs nice value relative to the least nice kseg * on the run queue for this cpu. * * If the KSE is less nice than all others it gets the maximum * slice and other KSEs will adjust their slice relative to * this when they first expire. * * There is 20 point window that starts relative to the least * nice kse on the run queue. Slice size is determined by * the kse distance from the last nice ksegrp. * * If you are outside of the window you will get no slice and * you will be reevaluated each time you are selected on the * run queue. * */ if (!SCHED_INTERACTIVE(kg)) { int nice; nice = kg->kg_nice + (0 - kseq->ksq_nicemin); if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || kg->kg_nice < kseq->ksq_nicemin) ke->ke_slice = SCHED_SLICE_MAX; else if (nice <= SCHED_PRI_NTHRESH) ke->ke_slice = SCHED_SLICE_NICE(nice); else ke->ke_slice = 0; } else ke->ke_slice = SCHED_SLICE_MIN; CTR6(KTR_ULE, "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); /* * Check to see if we need to scale back the slp and run time * in the kg. This will cause us to forget old interactivity * while maintaining the current ratio. 
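A sketch of the slice assignment described in the rationale above, with hz assumed to be 100 so that slice_min is 1 tick and slice_max is 10, the values sched_setup() derives. nice_offset stands for the distance from ksq_nicemin, the least nice ksegrp on the queue.

#include <stdio.h>

#define HZ                100
#define SCHED_SLICE_MIN   (HZ / 100)
#define SCHED_SLICE_MAX   (HZ / 10)
#define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define SCHED_PRI_NTHRESH 19
#define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max))
#define SCHED_SLICE_NICE(nice) \
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

static int slice_for(int interactive, int nice_offset)
{
    if (interactive)
        return (SCHED_SLICE_MIN);           /* re-evaluated quickly */
    if (nice_offset <= 0)
        return (SCHED_SLICE_MAX);           /* least nice on this queue */
    if (nice_offset <= SCHED_PRI_NTHRESH)
        return (SCHED_SLICE_NICE(nice_offset));
    return (0);                             /* outside the 20 point window */
}

int main(void)
{
    int n;

    for (n = 0; n <= 20; n += 5)
        printf("nice offset %2d -> slice %2d ticks\n", n, slice_for(0, n));
    printf("interactive     -> slice %2d ticks\n", slice_for(1, 0));
    return (0);
}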
*/ CTR4(KTR_ULE, "Slp vs Run %p (Slp %d, Run %d, Score %d)", ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10, sched_interact_score(kg)); if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE; kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE; } CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)", ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10, sched_interact_score(kg)); return; } static int sched_interact_score(struct ksegrp *kg) { int big; int small; int base; if (kg->kg_runtime > kg->kg_slptime) { big = kg->kg_runtime; small = kg->kg_slptime; base = SCHED_INTERACT_HALF; } else { big = kg->kg_slptime; small = kg->kg_runtime; base = 0; } big /= SCHED_INTERACT_HALF; if (big != 0) small /= big; else small = 0; small += base; /* XXX Factor in nice */ return (small); } /* * This is only somewhat accurate since given many processes of the same * priority they will switch when their slices run out, which will be * at most SCHED_SLICE_MAX. */ int sched_rr_interval(void) { return (SCHED_SLICE_MAX); } void sched_pctcpu_update(struct kse *ke) { /* * Adjust counters and watermark for pctcpu calc. * * Shift the tick count out so that the divide doesn't round away * our results. */ ke->ke_ticks <<= 10; ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) * SCHED_CPU_TICKS; ke->ke_ticks >>= 10; ke->ke_ltick = ticks; ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; } #ifdef SMP /* XXX Should be changed to kseq_load_lowest() */ int sched_pickcpu(void) { struct kseq *kseq; int load; int cpu; int i; mtx_assert(&sched_lock, MA_OWNED); if (!smp_started) return (0); load = 0; cpu = 0; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load < load) { cpu = i; load = kseq->ksq_load; } } CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); return (cpu); } #else int sched_pickcpu(void) { return (0); } #endif void sched_prio(struct thread *td, u_char prio) { struct kse *ke; struct runq *rq; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_priority = prio; if (TD_ON_RUNQ(td)) { rq = ke->ke_runq; runq_remove(rq, ke); runq_add(rq, ke); } } void sched_switchout(struct thread *td) { struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); ke = td->td_kse; td->td_last_kse = ke; td->td_lastcpu = td->td_oncpu; td->td_oncpu = NOCPU; td->td_flags &= ~TDF_NEEDRESCHED; if (TD_IS_RUNNING(td)) { runq_add(ke->ke_runq, ke); /* setrunqueue(td); */ return; } if (ke->ke_runq) kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); /* * We will not be on the run queue. So we must be * sleeping or similar. */ - if (td->td_proc->p_flag & P_THREADED) + if (td->td_proc->p_flag & P_SA) kse_reassign(ke); } void sched_switchin(struct thread *td) { /* struct kse *ke = td->td_kse; */ mtx_assert(&sched_lock, MA_OWNED); td->td_oncpu = PCPU_GET(cpuid); if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && td->td_priority != td->td_ksegrp->kg_user_pri) curthread->td_flags |= TDF_NEEDRESCHED; } void sched_nice(struct ksegrp *kg, int nice) { struct kse *ke; struct thread *td; struct kseq *kseq; PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED); /* * We need to adjust the nice counts for running KSEs. 
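The interactivity calculation in sched_interact_score() boils down to roughly 50 * min(slptime, runtime) / max(slptime, runtime), offset into the upper half of the range when run time dominates. This user-space sketch reproduces it with illustrative inputs; scores below SCHED_INTERACT_THRESH count as interactive.

#include <stdio.h>

#define SCHED_INTERACT_RANGE  100
#define SCHED_INTERACT_HALF   (SCHED_INTERACT_RANGE / 2)
#define SCHED_INTERACT_THRESH 10

static int interact_score(int runtime, int slptime)
{
    int big, small, base;

    if (runtime > slptime) {
        big = runtime;
        small = slptime;
        base = SCHED_INTERACT_HALF;   /* CPU bound: upper half of range */
    } else {
        big = slptime;
        small = runtime;
        base = 0;                     /* sleeps more than it runs */
    }
    big /= SCHED_INTERACT_HALF;
    if (big != 0)
        small /= big;
    else
        small = 0;
    return (small + base);
}

int main(void)
{
    printf("run 100, sleep 900 -> %2d (interactive)\n",
        interact_score(100, 900));                            /* 5 */
    printf("run 500, sleep 500 -> %2d\n", interact_score(500, 500)); /* 50 */
    printf("run 900, sleep 100 -> %2d\n", interact_score(900, 100)); /* 55 */
    return (0);
}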
*/ if (kg->kg_pri_class == PRI_TIMESHARE) FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq_nice_rem(kseq, kg->kg_nice); kseq_nice_add(kseq, nice); } kg->kg_nice = nice; sched_priority(kg); FOREACH_THREAD_IN_GROUP(kg, td) td->td_flags |= TDF_NEEDRESCHED; } void sched_sleep(struct thread *td, u_char prio) { mtx_assert(&sched_lock, MA_OWNED); td->td_slptime = ticks; td->td_priority = prio; CTR2(KTR_ULE, "sleep kse %p (tick: %d)", td->td_kse, td->td_slptime); } void sched_wakeup(struct thread *td) { mtx_assert(&sched_lock, MA_OWNED); /* * Let the kseg know how long we slept for. This is because process * interactivity behavior is modeled in the kseg. */ if (td->td_slptime) { struct ksegrp *kg; int hzticks; kg = td->td_ksegrp; hzticks = ticks - td->td_slptime; kg->kg_slptime += hzticks << 10; sched_priority(kg); CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", td->td_kse, hzticks); td->td_slptime = 0; } setrunqueue(td); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* * Penalize the parent for creating a new child and initialize the child's * priority. */ void sched_fork(struct proc *p, struct proc *p1) { mtx_assert(&sched_lock, MA_OWNED); sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); } void sched_fork_kse(struct kse *ke, struct kse *child) { child->ke_slice = ke->ke_slice; child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ child->ke_runq = NULL; /* * Claim that we've been running for one second for statistical * purposes. */ child->ke_ticks = 0; child->ke_ltick = ticks; child->ke_ftick = ticks - hz; } void sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) { PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); /* XXX Need something better here */ if (kg->kg_slptime > kg->kg_runtime) { child->kg_slptime = SCHED_DYN_RANGE; child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE; } else { child->kg_runtime = SCHED_DYN_RANGE; child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE; } child->kg_user_pri = kg->kg_user_pri; child->kg_nice = kg->kg_nice; } void sched_fork_thread(struct thread *td, struct thread *child) { } void sched_class(struct ksegrp *kg, int class) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); if (kg->kg_pri_class == class) return; FOREACH_KSE_IN_GROUP(kg, ke) { if (ke->ke_state != KES_ONRUNQ && ke->ke_state != KES_THREAD) continue; kseq = KSEQ_CPU(ke->ke_cpu); kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; kseq->ksq_loads[PRI_BASE(class)]++; if (kg->kg_pri_class == PRI_TIMESHARE) kseq_nice_rem(kseq, kg->kg_nice); else if (class == PRI_TIMESHARE) kseq_nice_add(kseq, kg->kg_nice); } kg->kg_pri_class = class; } /* * Return some of the child's priority and interactivity to the parent. */ void sched_exit(struct proc *p, struct proc *child) { /* XXX Need something better here */ mtx_assert(&sched_lock, MA_OWNED); sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); } void sched_exit_kse(struct kse *ke, struct kse *child) { kseq_rem(KSEQ_CPU(child->ke_cpu), child); } void sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) { } void sched_exit_thread(struct thread *td, struct thread *child) { } void sched_clock(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; struct thread *td; #if 0 struct kse *nke; #endif /* * sched_setup() apparently happens prior to stathz being set. 
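A sketch of the bookkeeping done by sched_wakeup() together with the SLP_RUN_MAX throttle from sched_slice(): sleep ticks are scaled by 1024 (shifted left by 10) so they share units with kg_runtime, and once the combined history exceeds SCHED_SLP_RUN_MAX both are divided by SCHED_SLP_RUN_THROTTLE, forgetting old behaviour while preserving the ratio. hz and the tick values below are assumptions for illustration.

#include <stdio.h>

#define HZ                     100
#define SCHED_SLP_RUN_MAX      ((HZ / 10) << 10)
#define SCHED_SLP_RUN_THROTTLE 10

int main(void)
{
    int ticks = 5000;          /* current tick counter (illustrative) */
    int td_slptime = 4980;     /* tick value recorded by sched_sleep() */
    int kg_slptime = 0, kg_runtime = 4 << 10;
    int hzticks;

    hzticks = ticks - td_slptime;       /* slept for 20 ticks */
    kg_slptime += hzticks << 10;        /* same scaled units as runtime */

    /* sched_slice() later forgets old history but keeps the ratio. */
    if (kg_slptime + kg_runtime > SCHED_SLP_RUN_MAX) {
        kg_slptime /= SCHED_SLP_RUN_THROTTLE;
        kg_runtime /= SCHED_SLP_RUN_THROTTLE;
    }
    printf("slptime %d, runtime %d (scaled by 1024)\n",
        kg_slptime, kg_runtime);
    return (0);
}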
We * need to resolve the timers earlier in the boot so we can avoid * calculating this here. */ if (realstathz == 0) { realstathz = stathz ? stathz : hz; tickincr = hz / realstathz; /* * XXX This does not work for values of stathz that are much * larger than hz. */ if (tickincr == 0) tickincr = 1; } td = ke->ke_thread; kg = ke->ke_ksegrp; mtx_assert(&sched_lock, MA_OWNED); KASSERT((td != NULL), ("schedclock: null thread pointer")); /* Adjust ticks for pctcpu */ ke->ke_ticks++; ke->ke_ltick = ticks; /* Go up to one second beyond our max and then trim back down */ if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) sched_pctcpu_update(ke); if (td->td_flags & TDF_IDLETD) return; CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); /* * We only do slicing code for TIMESHARE ksegrps. */ if (kg->kg_pri_class != PRI_TIMESHARE) return; /* * Check for a higher priority task on the run queue. This can happen * on SMP if another processor woke up a process on our runq. */ kseq = KSEQ_SELF(); #if 0 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) { if (sched_strict && nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; else if (nke->ke_thread->td_priority < td->td_priority SCHED_PRIO_SLOP) if (nke->ke_thread->td_priority < td->td_priority) td->td_flags |= TDF_NEEDRESCHED; } #endif /* * We used a tick charge it to the ksegrp so that we can compute our * interactivity. */ kg->kg_runtime += tickincr << 10; /* * We used up one time slice. */ ke->ke_slice--; #ifdef SMP kseq->ksq_rslices--; #endif if (ke->ke_slice > 0) return; /* * We're out of time, recompute priorities and requeue. */ kseq_rem(kseq, ke); sched_priority(kg); sched_slice(ke); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; kseq_add(kseq, ke); td->td_flags |= TDF_NEEDRESCHED; } int sched_runnable(void) { struct kseq *kseq; int load; load = 1; mtx_lock_spin(&sched_lock); kseq = KSEQ_SELF(); if (kseq->ksq_load) goto out; #ifdef SMP /* * For SMP we may steal other processor's KSEs. Just search until we * verify that at least on other cpu has a runnable task. */ if (smp_started) { int i; for (i = 0; i < mp_maxid; i++) { if (CPU_ABSENT(i)) continue; kseq = KSEQ_CPU(i); if (kseq->ksq_load > 1) goto out; } } #endif load = 0; out: mtx_unlock_spin(&sched_lock); return (load); } void sched_userret(struct thread *td) { struct ksegrp *kg; kg = td->td_ksegrp; if (td->td_priority != kg->kg_user_pri) { mtx_lock_spin(&sched_lock); td->td_priority = kg->kg_user_pri; mtx_unlock_spin(&sched_lock); } } struct kse * sched_choose(void) { struct kseq *kseq; struct kse *ke; mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP retry: #endif kseq = KSEQ_SELF(); ke = kseq_choose(kseq); if (ke) { runq_remove(ke->ke_runq, ke); ke->ke_state = KES_THREAD; if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority); } return (ke); } #ifdef SMP if (smp_started) { /* * Find the cpu with the highest load and steal one proc. */ if ((kseq = kseq_load_highest()) == NULL) return (NULL); /* * Remove this kse from this kseq and runq and then requeue * on the current processor. Then we will dequeue it * normally above. 
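A sketch of the tickincr setup done lazily in ULE's sched_clock(): statclock ticks are converted into hz-tick units before being charged to kg_runtime, and the XXX case where stathz exceeds hz collapses to an increment of one. The hz/stathz pairs below are illustrative.

#include <stdio.h>

static int tick_increment(int hz, int stathz)
{
    int realstathz = stathz ? stathz : hz;
    int tickincr = hz / realstathz;

    if (tickincr == 0)      /* stathz > hz collapses to 1, as noted above */
        tickincr = 1;
    return (tickincr);
}

int main(void)
{
    printf("hz 1000, stathz 128 -> tickincr %d\n", tick_increment(1000, 128));
    printf("hz  100, stathz 128 -> tickincr %d\n", tick_increment(100, 128));
    printf("hz  100, stathz   0 -> tickincr %d\n", tick_increment(100, 0));
    return (0);
}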
*/ kseq_move(kseq, PCPU_GET(cpuid)); goto retry; } #endif return (NULL); } void sched_add(struct kse *ke) { struct kseq *kseq; struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); KASSERT((ke->ke_thread->td_kse != NULL), ("sched_add: No KSE on thread")); KASSERT(ke->ke_state != KES_ONRUNQ, ("sched_add: kse %p (%s) already in run queue", ke, ke->ke_proc->p_comm)); KASSERT(ke->ke_proc->p_sflag & PS_INMEM, ("sched_add: process swapped out")); KASSERT(ke->ke_runq == NULL, ("sched_add: KSE %p is still assigned to a run queue", ke)); kg = ke->ke_ksegrp; switch (PRI_BASE(kg->kg_pri_class)) { case PRI_ITHD: case PRI_REALTIME: kseq = KSEQ_SELF(); ke->ke_runq = kseq->ksq_curr; ke->ke_slice = SCHED_SLICE_MAX; ke->ke_cpu = PCPU_GET(cpuid); break; case PRI_TIMESHARE: kseq = KSEQ_CPU(ke->ke_cpu); if (SCHED_CURR(kg, ke)) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = kseq->ksq_next; break; case PRI_IDLE: kseq = KSEQ_CPU(ke->ke_cpu); /* * This is for priority prop. */ if (ke->ke_thread->td_priority < PRI_MAX_TIMESHARE) ke->ke_runq = kseq->ksq_curr; else ke->ke_runq = &kseq->ksq_idle; ke->ke_slice = SCHED_SLICE_MIN; break; default: panic("Unknown pri class.\n"); break; } ke->ke_ksegrp->kg_runq_kses++; ke->ke_state = KES_ONRUNQ; runq_add(ke->ke_runq, ke); kseq_add(kseq, ke); } void sched_rem(struct kse *ke) { struct kseq *kseq; mtx_assert(&sched_lock, MA_OWNED); KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); ke->ke_state = KES_THREAD; ke->ke_ksegrp->kg_runq_kses--; kseq = KSEQ_CPU(ke->ke_cpu); runq_remove(ke->ke_runq, ke); kseq_rem(kseq, ke); } fixpt_t sched_pctcpu(struct kse *ke) { fixpt_t pctcpu; pctcpu = 0; mtx_lock_spin(&sched_lock); if (ke->ke_ticks) { int rtick; /* Update to account for time potentially spent sleeping */ ke->ke_ltick = ticks; sched_pctcpu_update(ke); /* How many rtick per second ? */ rtick = ke->ke_ticks / SCHED_CPU_TIME; pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; } ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; mtx_unlock_spin(&sched_lock); return (pctcpu); } int sched_sizeof_kse(void) { return (sizeof(struct kse) + sizeof(struct ke_sched)); } int sched_sizeof_ksegrp(void) { return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); } int sched_sizeof_proc(void) { return (sizeof(struct proc)); } int sched_sizeof_thread(void) { return (sizeof(struct thread) + sizeof(struct td_sched)); } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 116360) +++ head/sys/kern/subr_trap.c (revision 116361) @@ -1,272 +1,272 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #ifdef __i386__ #include "opt_npx.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Define the code needed before returning to user mode, for * trap and syscall. * * MPSAFE */ void userret(td, frame, oticks) struct thread *td; struct trapframe *frame; u_int oticks; { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); #ifdef INVARIANTS /* Check that we called signotify() enough. */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); #endif /* * Let the scheduler adjust our priority etc. */ sched_userret(td); /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. * Don't bother doing all the work if the stop bits are not set * at this time.. If we miss it, we miss it.. no big deal. */ if (P_SHOULDSTOP(p)) { PROC_LOCK(p); thread_suspend_check(0); /* Can suspend or kill */ PROC_UNLOCK(p); } /* * Do special thread processing, e.g. upcall tweaking and such. */ - if (p->p_flag & P_THREADED) { + if (p->p_flag & P_SA) { thread_userret(td, frame); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { quad_t ticks; mtx_lock_spin(&sched_lock); ticks = td->td_sticks - oticks; mtx_unlock_spin(&sched_lock); addupc_task(td, TRAPF_PC(frame), (u_int)ticks * psratio); } } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. 
*/ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; struct kse *ke; struct ksegrp *kg; struct rlimit *rlim; u_int prticks, sticks; int sflag; int flags; int sig; #if defined(DEV_NPX) && !defined(SMP) int ucode; #endif td = curthread; p = td->td_proc; kg = td->td_ksegrp; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); mtx_assert(&sched_lock, MA_NOTOWNED); td->td_frame = framep; /* * This updates the p_sflag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. */ mtx_lock_spin(&sched_lock); ke = td->td_kse; sticks = td->td_sticks; flags = td->td_flags; sflag = p->p_sflag; p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND | PS_XCPU); #ifdef MAC p->p_sflag &= ~PS_MACPEND; #endif td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_OWEUPC); cnt.v_soft++; prticks = 0; if (flags & TDF_OWEUPC && p->p_flag & P_PROFIL) { prticks = p->p_stats->p_prof.pr_ticks; p->p_stats->p_prof.pr_ticks = 0; } mtx_unlock_spin(&sched_lock); /* * XXXKSE While the fact that we owe a user profiling * tick is stored per KSE in this code, the statistics * themselves are still stored per process. * This should probably change, by which I mean that * possibly the location of both might change. */ if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (flags & TDF_OWEUPC && p->p_flag & P_PROFIL) addupc_task(td, p->p_stats->p_prof.pr_addr, prticks); if (sflag & PS_ALRMPEND) { PROC_LOCK(p); psignal(p, SIGVTALRM); PROC_UNLOCK(p); } #if defined(DEV_NPX) && !defined(SMP) if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags, PCB_NPXTRAP); ucode = npxtrap(); if (ucode != -1) { trapsignal(td, SIGFPE, ucode); } } #endif if (sflag & PS_PROFPEND) { PROC_LOCK(p); psignal(p, SIGPROF); PROC_UNLOCK(p); } if (sflag & PS_XCPU) { PROC_LOCK(p); rlim = &p->p_rlimit[RLIMIT_CPU]; mtx_lock_spin(&sched_lock); if (p->p_runtime.sec >= rlim->rlim_max) { mtx_unlock_spin(&sched_lock); killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim->rlim_max) p->p_cpulimit += 5; mtx_unlock_spin(&sched_lock); psignal(p, SIGXCPU); } PROC_UNLOCK(p); } #ifdef MAC if (sflag & PS_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { mtx_lock_spin(&sched_lock); sched_prio(td, kg->kg_user_pri); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); } if (flags & TDF_NEEDSIGCHK) { int sigs; sigs = 0; PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { postsig(sig); sigs++; } mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); - if (p->p_flag & P_THREADED && sigs) { + if (p->p_flag & P_SA && sigs) { struct kse_upcall *ku = td->td_upcall; if ((void *)TRAPF_PC(framep) != ku->ku_func) { mtx_lock_spin(&sched_lock); ku->ku_flags |= KUF_DOUPCALL; mtx_unlock_spin(&sched_lock); } } } userret(td, framep, sticks); #ifdef DIAGNOSTIC cred_free_thread(td); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/tty.c =================================================================== --- head/sys/kern/tty.c (revision 116360) +++ head/sys/kern/tty.c (revision 116361) @@ -1,2723 +1,2723 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University 
of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Copyright (c) 2002 Networks Associates Technologies, Inc. * All rights reserved. * * Portions of this software were developed for the FreeBSD Project by * ThinkSec AS and NAI Labs, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 * ("CBOSS"), as part of the DARPA CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty.c 8.8 (Berkeley) 1/21/94 */ /*- * TODO: * o Fix races for sending the start char in ttyflush(). * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). * With luck, there will be MIN chars before select() returns(). * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. * o Don't allow input in TS_ZOMBIE case. It would be visible through * FIONREAD. * o Do the new sio locking stuff here and use it to avoid special * case for EXTPROC? * o Lock PENDIN too? * o Move EXTPROC and/or PENDIN to t_state? * o Wrap most of ttioctl in spltty/splx. * o Implement TIOCNOTTY or remove it from . * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. * o Don't allow certain termios flags to affect disciplines other * than TTYDISC. Cancel their effects before switch disciplines * and ignore them if they are set while we are in another * discipline. * o Now that historical speed conversions are handled here, don't * do them in drivers. * o Check for TS_CARR_ON being set while everything is closed and not * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, * so it would live until the next open even if carrier drops. 
* o Restore TS_WOPEN since it is useful in pstat. It must be cleared * only when _all_ openers leave open(). */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_tty.h" #include #include #include #include #include #include #include #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #include #endif #include #define TTYDEFCHARS #include #undef TTYDEFCHARS #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures"); long tk_cancc; long tk_nin; long tk_nout; long tk_rawcc; static int proc_compare(struct proc *p1, struct proc *p2); static int ttnread(struct tty *tp); static void ttyecho(int c, struct tty *tp); static int ttyoutput(int c, struct tty *tp); static void ttypend(struct tty *tp); static void ttyretype(struct tty *tp); static void ttyrub(int c, struct tty *tp); static void ttyrubo(struct tty *tp, int cnt); static void ttyunblock(struct tty *tp); static int ttywflush(struct tty *tp); static int filt_ttyread(struct knote *kn, long hint); static void filt_ttyrdetach(struct knote *kn); static int filt_ttywrite(struct knote *kn, long hint); static void filt_ttywdetach(struct knote *kn); /* * Table with character classes and parity. The 8th bit indicates parity, * the 7th bit indicates the character is an alphameric or underscore (for * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits * are 0 then the character needs no special processing on output; classes * other than 0 might be translated or (not currently) require delays. */ #define E 0x00 /* Even parity. */ #define O 0x80 /* Odd parity. */ #define PARITY(c) (char_type[c] & O) #define ALPHA 0x40 /* Alpha or underscore. */ #define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) #define CCLASSMASK 0x3f #define CCLASS(c) (char_type[c] & CCLASSMASK) #define BS BACKSPACE #define CC CONTROL #define CR RETURN #define NA ORDINARY | ALPHA #define NL NEWLINE #define NO ORDINARY #define TB TAB #define VT VTAB static u_char const char_type[] = { E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ /* * Meta chars; should be settable per character set; * for now, treat them all as normal characters. 
*/ NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, }; #undef BS #undef CC #undef CR #undef NA #undef NL #undef NO #undef TB #undef VT /* Macros to clear/set/test flags. */ #define SET(t, f) (t) |= (f) #define CLR(t, f) (t) &= ~(f) #define ISSET(t, f) ((t) & (f)) #undef MAX_INPUT /* XXX wrong in */ #define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */ /* * list of struct tty where pstat(8) can pick it up with sysctl */ static SLIST_HEAD(, tty) tty_list; static int drainwait = 5*60; SYSCTL_INT(_kern, OID_AUTO, drainwait, CTLFLAG_RW, &drainwait, 0, "Output drain timeout in seconds"); /* * Initial open of tty, or (re)entry to standard tty line discipline. */ int ttyopen(dev_t device, struct tty *tp) { int s; s = spltty(); tp->t_dev = device; if (!ISSET(tp->t_state, TS_ISOPEN)) { SET(tp->t_state, TS_ISOPEN); if (ISSET(tp->t_cflag, CLOCAL)) SET(tp->t_state, TS_CONNECTED); bzero(&tp->t_winsize, sizeof(tp->t_winsize)); } /* XXX don't hang forever on output */ if (tp->t_timeout < 0) tp->t_timeout = drainwait*hz; ttsetwater(tp); splx(s); return (0); } /* * Handle close() on a tty line: flush and set to initial state, * bumping generation number so that pending read/write calls * can detect recycling of the tty. * XXX our caller should have done `spltty(); l_close(); ttyclose();' * and l_close() should have flushed, but we repeat the spltty() and * the flush in case there are buggy callers. */ int ttyclose(struct tty *tp) { int s; funsetown(&tp->t_sigio); s = spltty(); if (constty == tp) constty = NULL; ttyflush(tp, FREAD | FWRITE); clist_free_cblocks(&tp->t_canq); clist_free_cblocks(&tp->t_outq); clist_free_cblocks(&tp->t_rawq); tp->t_gen++; tp->t_line = TTYDISC; tp->t_pgrp = NULL; tp->t_session = NULL; tp->t_state = 0; splx(s); return (0); } #define FLUSHQ(q) { \ if ((q)->c_cc) \ ndflush(q, (q)->c_cc); \ } /* Is 'c' a line delimiter ("break" character)? */ #define TTBREAKC(c, lflag) \ ((c) == '\n' || (((c) == cc[VEOF] || \ (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ (c) != _POSIX_VDISABLE)) /* * Process input of a single character received on a tty. */ int ttyinput(int c, struct tty *tp) { tcflag_t iflag, lflag; cc_t *cc; int i, err; /* * If input is pending take it first. */ lflag = tp->t_lflag; if (ISSET(lflag, PENDIN)) ttypend(tp); /* * Gather stats. */ if (ISSET(lflag, ICANON)) { ++tk_cancc; ++tp->t_cancc; } else { ++tk_rawcc; ++tp->t_rawcc; } ++tk_nin; /* * Block further input iff: * current input > threshold AND input is available to user program * AND input flow control is enabled and not yet invoked. * The 3 is slop for PARMRK. */ iflag = tp->t_iflag; if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 && (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && !ISSET(tp->t_state, TS_TBLOCK)) ttyblock(tp); /* Handle exceptional conditions (break, parity, framing). 
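A user-space sketch of the TTBREAKC() test defined above: a newline always delimits a canonical line, while VEOF/VEOL/VEOL2 only do so when actually configured (not _POSIX_VDISABLE). The cc[] contents and the IEXTEN flag value here are illustrative defaults, not the kernel's definitions.

#include <stdio.h>

#define _POSIX_VDISABLE 0xff
#define IEXTEN          0x1           /* illustrative flag value */

enum { VEOF, VEOL, VEOL2, NCCS };

static int is_break(unsigned char c, unsigned char *cc, int lflag)
{
    return (c == '\n' ||
        ((c == cc[VEOF] || c == cc[VEOL] ||
        (c == cc[VEOL2] && (lflag & IEXTEN))) && c != _POSIX_VDISABLE));
}

int main(void)
{
    unsigned char cc[NCCS];

    cc[VEOF] = 0x04;                   /* ^D, the usual default */
    cc[VEOL] = _POSIX_VDISABLE;        /* unset */
    cc[VEOL2] = _POSIX_VDISABLE;

    printf("'\\n' -> %d\n", is_break('\n', cc, IEXTEN));   /* 1 */
    printf("^D   -> %d\n", is_break(0x04, cc, IEXTEN));    /* 1 */
    printf("'a'  -> %d\n", is_break('a', cc, IEXTEN));     /* 0 */
    return (0);
}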
*/ cc = tp->t_cc; err = (ISSET(c, TTY_ERRORMASK)); if (err) { CLR(c, TTY_ERRORMASK); if (ISSET(err, TTY_BI)) { if (ISSET(iflag, IGNBRK)) return (0); if (ISSET(iflag, BRKINT)) { ttyflush(tp, FREAD | FWRITE); if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGINT, 1); PGRP_UNLOCK(tp->t_pgrp); } goto endcase; } if (ISSET(iflag, PARMRK)) goto parmrk; } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) || ISSET(err, TTY_FE)) { if (ISSET(iflag, IGNPAR)) return (0); else if (ISSET(iflag, PARMRK)) { parmrk: if (tp->t_rawq.c_cc + tp->t_canq.c_cc > MAX_INPUT - 3) goto input_overflow; (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); (void)putc(0 | TTY_QUOTE, &tp->t_rawq); (void)putc(c | TTY_QUOTE, &tp->t_rawq); goto endcase; } else c = 0; } } if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) CLR(c, 0x80); if (!ISSET(lflag, EXTPROC)) { /* * Check for literal nexting very first */ if (ISSET(tp->t_state, TS_LNCH)) { SET(c, TTY_QUOTE); CLR(tp->t_state, TS_LNCH); } /* * Scan for special characters. This code * is really just a big case statement with * non-constant cases. The bottom of the * case statement is labeled ``endcase'', so goto * it after a case match, or similar. */ /* * Control chars which aren't controlled * by ICANON, ISIG, or IXON. */ if (ISSET(lflag, IEXTEN)) { if (CCEQ(cc[VLNEXT], c)) { if (ISSET(lflag, ECHO)) { if (ISSET(lflag, ECHOE)) { (void)ttyoutput('^', tp); (void)ttyoutput('\b', tp); } else ttyecho(c, tp); } SET(tp->t_state, TS_LNCH); goto endcase; } if (CCEQ(cc[VDISCARD], c)) { if (ISSET(lflag, FLUSHO)) CLR(tp->t_lflag, FLUSHO); else { ttyflush(tp, FWRITE); ttyecho(c, tp); if (tp->t_rawq.c_cc + tp->t_canq.c_cc) ttyretype(tp); SET(tp->t_lflag, FLUSHO); } goto startoutput; } } /* * Signals. */ if (ISSET(lflag, ISIG)) { if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { if (!ISSET(lflag, NOFLSH)) ttyflush(tp, FREAD | FWRITE); ttyecho(c, tp); if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); PGRP_UNLOCK(tp->t_pgrp); } goto endcase; } if (CCEQ(cc[VSUSP], c)) { if (!ISSET(lflag, NOFLSH)) ttyflush(tp, FREAD); ttyecho(c, tp); if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGTSTP, 1); PGRP_UNLOCK(tp->t_pgrp); } goto endcase; } } /* * Handle start/stop characters. */ if (ISSET(iflag, IXON)) { if (CCEQ(cc[VSTOP], c)) { if (!ISSET(tp->t_state, TS_TTSTOP)) { SET(tp->t_state, TS_TTSTOP); (*tp->t_stop)(tp, 0); return (0); } if (!CCEQ(cc[VSTART], c)) return (0); /* * if VSTART == VSTOP then toggle */ goto endcase; } if (CCEQ(cc[VSTART], c)) goto restartoutput; } /* * IGNCR, ICRNL, & INLCR */ if (c == '\r') { if (ISSET(iflag, IGNCR)) return (0); else if (ISSET(iflag, ICRNL)) c = '\n'; } else if (c == '\n' && ISSET(iflag, INLCR)) c = '\r'; } if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { /* * From here on down canonical mode character * processing takes place. */ /* * erase or erase2 (^H / ^?) 
*/ if (CCEQ(cc[VERASE], c) || CCEQ(cc[VERASE2], c) ) { if (tp->t_rawq.c_cc) ttyrub(unputc(&tp->t_rawq), tp); goto endcase; } /* * kill (^U) */ if (CCEQ(cc[VKILL], c)) { if (ISSET(lflag, ECHOKE) && tp->t_rawq.c_cc == tp->t_rocount && !ISSET(lflag, ECHOPRT)) while (tp->t_rawq.c_cc) ttyrub(unputc(&tp->t_rawq), tp); else { ttyecho(c, tp); if (ISSET(lflag, ECHOK) || ISSET(lflag, ECHOKE)) ttyecho('\n', tp); FLUSHQ(&tp->t_rawq); tp->t_rocount = 0; } CLR(tp->t_state, TS_LOCAL); goto endcase; } /* * word erase (^W) */ if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { int ctype; /* * erase whitespace */ while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') ttyrub(c, tp); if (c == -1) goto endcase; /* * erase last char of word and remember the * next chars type (for ALTWERASE) */ ttyrub(c, tp); c = unputc(&tp->t_rawq); if (c == -1) goto endcase; if (c == ' ' || c == '\t') { (void)putc(c, &tp->t_rawq); goto endcase; } ctype = ISALPHA(c); /* * erase rest of word */ do { ttyrub(c, tp); c = unputc(&tp->t_rawq); if (c == -1) goto endcase; } while (c != ' ' && c != '\t' && (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); (void)putc(c, &tp->t_rawq); goto endcase; } /* * reprint line (^R) */ if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { ttyretype(tp); goto endcase; } /* * ^T - kernel info and generate SIGINFO */ if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { if (ISSET(lflag, ISIG) && tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGINFO, 1); PGRP_UNLOCK(tp->t_pgrp); } if (!ISSET(lflag, NOKERNINFO)) ttyinfo(tp); goto endcase; } } /* * Check for input buffer overflow */ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { input_overflow: if (ISSET(iflag, IMAXBEL)) { if (tp->t_outq.c_cc < tp->t_ohiwat) (void)ttyoutput(CTRL('g'), tp); } goto endcase; } if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * Put data char in q for user and * wakeup on seeing a line delimiter. */ if (putc(c, &tp->t_rawq) >= 0) { if (!ISSET(lflag, ICANON)) { ttwakeup(tp); ttyecho(c, tp); goto endcase; } if (TTBREAKC(c, lflag)) { tp->t_rocount = 0; catq(&tp->t_rawq, &tp->t_canq); ttwakeup(tp); } else if (tp->t_rocount++ == 0) tp->t_rocol = tp->t_column; if (ISSET(tp->t_state, TS_ERASE)) { /* * end of prterase \.../ */ CLR(tp->t_state, TS_ERASE); (void)ttyoutput('/', tp); } i = tp->t_column; ttyecho(c, tp); if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { /* * Place the cursor over the '^' of the ^D. */ i = imin(2, tp->t_column - i); while (i > 0) { (void)ttyoutput('\b', tp); i--; } } } endcase: /* * IXANY means allow any character to restart output. */ if (ISSET(tp->t_state, TS_TTSTOP) && !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) return (0); restartoutput: CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_TTSTOP); startoutput: return (ttstart(tp)); } /* * Output a single character on a tty, doing output processing * as needed (expanding tabs, newline processing, etc.). * Returns < 0 if succeeds, otherwise returns char to resend. * Must be recursive. */ static int ttyoutput(int c, struct tty *tp) { tcflag_t oflag; int col, s; oflag = tp->t_oflag; if (!ISSET(oflag, OPOST)) { if (ISSET(tp->t_lflag, FLUSHO)) return (-1); if (putc(c, &tp->t_outq)) return (c); tk_nout++; tp->t_outcc++; return (-1); } /* * Do tab expansion if OXTABS is set. Special case if we external * processing, we don't do the tab expansion because we'll probably * get it wrong. 
If tab expansion needs to be done, let it happen * externally. */ CLR(c, ~TTY_CHARMASK); if (c == '\t' && ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { c = 8 - (tp->t_column & 7); if (!ISSET(tp->t_lflag, FLUSHO)) { s = spltty(); /* Don't interrupt tabs. */ c -= b_to_q(" ", c, &tp->t_outq); tk_nout += c; tp->t_outcc += c; splx(s); } tp->t_column += c; return (c ? -1 : '\t'); } if (c == CEOT && ISSET(oflag, ONOEOT)) return (-1); /* * Newline translation: if ONLCR is set, * translate newline into "\r\n". */ if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq)) return (c); } /* If OCRNL is set, translate "\r" into "\n". */ else if (c == '\r' && ISSET(tp->t_oflag, OCRNL)) c = '\n'; /* If ONOCR is set, don't transmit CRs when on column 0. */ else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0) return (-1); tk_nout++; tp->t_outcc++; if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) return (c); col = tp->t_column; switch (CCLASS(c)) { case BACKSPACE: if (col > 0) --col; break; case CONTROL: break; case NEWLINE: if (ISSET(tp->t_oflag, ONLCR | ONLRET)) col = 0; break; case RETURN: col = 0; break; case ORDINARY: ++col; break; case TAB: col = (col + 8) & ~7; break; } tp->t_column = col; return (-1); } /* * Ioctls for all tty devices. Called after line-discipline specific ioctl * has been called to do discipline-specific functions and/or reject any * of these ioctl commands. */ /* ARGSUSED */ int ttioctl(struct tty *tp, u_long cmd, void *data, int flag) { struct proc *p; struct thread *td; struct pgrp *pgrp; int s, error; td = curthread; /* XXX */ p = td->td_proc; /* If the ioctl involves modification, hang if in the background. */ switch (cmd) { case TIOCCBRK: case TIOCCONS: case TIOCDRAIN: case TIOCEXCL: case TIOCFLUSH: #ifdef TIOCHPCL case TIOCHPCL: #endif case TIOCNXCL: case TIOCSBRK: case TIOCSCTTY: case TIOCSDRAINWAIT: case TIOCSETA: case TIOCSETAF: case TIOCSETAW: case TIOCSETD: case TIOCSPGRP: case TIOCSTART: case TIOCSTAT: case TIOCSTI: case TIOCSTOP: case TIOCSWINSZ: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) case TIOCLBIC: case TIOCLBIS: case TIOCLSET: case TIOCSETC: case OTIOCSETD: case TIOCSETN: case TIOCSETP: case TIOCSLTC: #endif sx_slock(&proctree_lock); PROC_LOCK(p); while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) && !SIGISMEMBER(p->p_sigacts->ps_sigignore, SIGTTOU) && !SIGISMEMBER(td->td_sigmask, SIGTTOU)) { pgrp = p->p_pgrp; PROC_UNLOCK(p); if (pgrp->pg_jobc == 0) { sx_sunlock(&proctree_lock); return (EIO); } PGRP_LOCK(pgrp); sx_sunlock(&proctree_lock); pgsignal(pgrp, SIGTTOU, 1); PGRP_UNLOCK(pgrp); error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", 0); if (error) return (error); sx_slock(&proctree_lock); PROC_LOCK(p); } PROC_UNLOCK(p); sx_sunlock(&proctree_lock); break; } switch (cmd) { /* Process the ioctl. */ case FIOASYNC: /* set/clear async i/o */ s = spltty(); if (*(int *)data) SET(tp->t_state, TS_ASYNC); else CLR(tp->t_state, TS_ASYNC); splx(s); break; case FIONBIO: /* set/clear non-blocking i/o */ break; /* XXX: delete. 
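The tab expansion at the top of ttyoutput() above advances to the next 8-column tab stop by emitting 8 - (t_column & 7) spaces. A tiny stand-alone check of that arithmetic, just to make the worked cases explicit:

#include <assert.h>

/* Spaces needed to advance from column `col' to the next 8-column tab stop,
   mirroring the `8 - (tp->t_column & 7)' expression in ttyoutput(). */
static int
tabstop_fill(int col)
{
	return (8 - (col & 7));
}

int
main(void)
{
	assert(tabstop_fill(0) == 8);	/* column 0 -> column 8 */
	assert(tabstop_fill(3) == 5);	/* column 3 -> column 8 */
	assert(tabstop_fill(7) == 1);	/* column 7 -> column 8 */
	assert(tabstop_fill(8) == 8);	/* column 8 -> column 16 */
	return (0);
}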
*/ case FIONREAD: /* get # bytes to read */ s = spltty(); *(int *)data = ttnread(tp); splx(s); break; case FIOSETOWN: /* * Policy -- Don't allow FIOSETOWN on someone else's * controlling tty */ if (tp->t_session != NULL && !isctty(p, tp)) return (ENOTTY); error = fsetown(*(int *)data, &tp->t_sigio); if (error) return (error); break; case FIOGETOWN: if (tp->t_session != NULL && !isctty(p, tp)) return (ENOTTY); *(int *)data = fgetown(&tp->t_sigio); break; case TIOCEXCL: /* set exclusive use of tty */ s = spltty(); SET(tp->t_state, TS_XCLUDE); splx(s); break; case TIOCFLUSH: { /* flush buffers */ int flags = *(int *)data; if (flags == 0) flags = FREAD | FWRITE; else flags &= FREAD | FWRITE; ttyflush(tp, flags); break; } case TIOCCONS: /* become virtual console */ if (*(int *)data) { struct nameidata nid; if (constty && constty != tp && ISSET(constty->t_state, TS_CONNECTED)) return (EBUSY); /* Ensure user can open the real console. */ NDINIT(&nid, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, "/dev/console", td); if ((error = namei(&nid)) != 0) return (error); NDFREE(&nid, NDF_ONLY_PNBUF); error = VOP_ACCESS(nid.ni_vp, VREAD, td->td_ucred, td); vput(nid.ni_vp); if (error) return (error); constty = tp; } else if (tp == constty) constty = NULL; break; case TIOCDRAIN: /* wait till output drained */ error = ttywait(tp); if (error) return (error); break; case TIOCGETA: { /* get termios struct */ struct termios *t = (struct termios *)data; bcopy(&tp->t_termios, t, sizeof(struct termios)); break; } case TIOCGETD: /* get line discipline */ *(int *)data = tp->t_line; break; case TIOCGWINSZ: /* get window size */ *(struct winsize *)data = tp->t_winsize; break; case TIOCGPGRP: /* get pgrp of tty */ if (!isctty(p, tp)) return (ENOTTY); *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; break; #ifdef TIOCHPCL case TIOCHPCL: /* hang up on last close */ s = spltty(); SET(tp->t_cflag, HUPCL); splx(s); break; #endif case TIOCNXCL: /* reset exclusive use of tty */ s = spltty(); CLR(tp->t_state, TS_XCLUDE); splx(s); break; case TIOCOUTQ: /* output queue size */ *(int *)data = tp->t_outq.c_cc; break; case TIOCSETA: /* set termios struct */ case TIOCSETAW: /* drain output, set */ case TIOCSETAF: { /* drn out, fls in, set */ struct termios *t = (struct termios *)data; if (t->c_ispeed == 0) t->c_ispeed = t->c_ospeed; if (t->c_ispeed == 0) t->c_ispeed = tp->t_ospeed; if (t->c_ispeed == 0) return (EINVAL); s = spltty(); if (cmd == TIOCSETAW || cmd == TIOCSETAF) { error = ttywait(tp); if (error) { splx(s); return (error); } if (cmd == TIOCSETAF) ttyflush(tp, FREAD); } if (!ISSET(t->c_cflag, CIGNORE)) { /* * Set device hardware. */ if (tp->t_param && (error = (*tp->t_param)(tp, t))) { splx(s); return (error); } if (ISSET(t->c_cflag, CLOCAL) && !ISSET(tp->t_cflag, CLOCAL)) { /* * XXX disconnections would be too hard to * get rid of without this kludge. The only * way to get rid of controlling terminals * is to exit from the session leader. 
*/ CLR(tp->t_state, TS_ZOMBIE); wakeup(TSA_CARR_ON(tp)); ttwakeup(tp); ttwwakeup(tp); } if ((ISSET(tp->t_state, TS_CARR_ON) || ISSET(t->c_cflag, CLOCAL)) && !ISSET(tp->t_state, TS_ZOMBIE)) SET(tp->t_state, TS_CONNECTED); else CLR(tp->t_state, TS_CONNECTED); tp->t_cflag = t->c_cflag; tp->t_ispeed = t->c_ispeed; if (t->c_ospeed != 0) tp->t_ospeed = t->c_ospeed; ttsetwater(tp); } if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && cmd != TIOCSETAF) { if (ISSET(t->c_lflag, ICANON)) SET(tp->t_lflag, PENDIN); else { /* * XXX we really shouldn't allow toggling * ICANON while we're in a non-termios line * discipline. Now we have to worry about * panicing for a null queue. */ if (tp->t_canq.c_cbreserved > 0 && tp->t_rawq.c_cbreserved > 0) { catq(&tp->t_rawq, &tp->t_canq); /* * XXX the queue limits may be * different, so the old queue * swapping method no longer works. */ catq(&tp->t_canq, &tp->t_rawq); } CLR(tp->t_lflag, PENDIN); } ttwakeup(tp); } tp->t_iflag = t->c_iflag; tp->t_oflag = t->c_oflag; /* * Make the EXTPROC bit read only. */ if (ISSET(tp->t_lflag, EXTPROC)) SET(t->c_lflag, EXTPROC); else CLR(t->c_lflag, EXTPROC); tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); if (t->c_cc[VMIN] != tp->t_cc[VMIN] || t->c_cc[VTIME] != tp->t_cc[VTIME]) ttwakeup(tp); bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); splx(s); break; } case TIOCSETD: { /* set line discipline */ int t = *(int *)data; dev_t device = tp->t_dev; if ((u_int)t >= nlinesw) return (ENXIO); if (t != tp->t_line) { s = spltty(); (*linesw[tp->t_line].l_close)(tp, flag); error = (*linesw[t].l_open)(device, tp); if (error) { (void)(*linesw[tp->t_line].l_open)(device, tp); splx(s); return (error); } tp->t_line = t; splx(s); } break; } case TIOCSTART: /* start output, like ^Q */ s = spltty(); if (ISSET(tp->t_state, TS_TTSTOP) || ISSET(tp->t_lflag, FLUSHO)) { CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); } splx(s); break; case TIOCSTI: /* simulate terminal input */ if ((flag & FREAD) == 0 && suser(td)) return (EPERM); if (!isctty(p, tp) && suser(td)) return (EACCES); s = spltty(); (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); splx(s); break; case TIOCSTOP: /* stop output, like ^S */ s = spltty(); if (!ISSET(tp->t_state, TS_TTSTOP)) { SET(tp->t_state, TS_TTSTOP); (*tp->t_stop)(tp, 0); } splx(s); break; case TIOCSCTTY: /* become controlling tty */ /* Session ctty vnode pointer set in vnode layer. 
*/ sx_slock(&proctree_lock); if (!SESS_LEADER(p) || ((p->p_session->s_ttyvp || tp->t_session) && (tp->t_session != p->p_session))) { sx_sunlock(&proctree_lock); return (EPERM); } tp->t_session = p->p_session; tp->t_pgrp = p->p_pgrp; SESS_LOCK(p->p_session); p->p_session->s_ttyp = tp; SESS_UNLOCK(p->p_session); PROC_LOCK(p); p->p_flag |= P_CONTROLT; PROC_UNLOCK(p); sx_sunlock(&proctree_lock); break; case TIOCSPGRP: { /* set pgrp of tty */ sx_slock(&proctree_lock); pgrp = pgfind(*(int *)data); if (!isctty(p, tp)) { if (pgrp != NULL) PGRP_UNLOCK(pgrp); sx_sunlock(&proctree_lock); return (ENOTTY); } if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (EPERM); } PGRP_UNLOCK(pgrp); if (pgrp->pg_session != p->p_session) { sx_sunlock(&proctree_lock); return (EPERM); } sx_sunlock(&proctree_lock); tp->t_pgrp = pgrp; break; } case TIOCSTAT: /* simulate control-T */ s = spltty(); ttyinfo(tp); splx(s); break; case TIOCSWINSZ: /* set window size */ if (bcmp((caddr_t)&tp->t_winsize, data, sizeof (struct winsize))) { tp->t_winsize = *(struct winsize *)data; if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGWINCH, 1); PGRP_UNLOCK(tp->t_pgrp); } } break; case TIOCSDRAINWAIT: error = suser(td); if (error) return (error); tp->t_timeout = *(int *)data * hz; wakeup(TSA_OCOMPLETE(tp)); wakeup(TSA_OLOWAT(tp)); break; case TIOCGDRAINWAIT: *(int *)data = tp->t_timeout / hz; break; default: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) return (ttcompat(tp, cmd, data, flag)); #else return (ENOIOCTL); #endif } return (0); } int ttypoll(dev_t dev, int events, struct thread *td) { int s; int revents = 0; struct tty *tp; tp = dev->si_tty; if (tp == NULL) /* XXX used to return ENXIO, but that means true! */ return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)) | POLLHUP); s = spltty(); if (events & (POLLIN | POLLRDNORM)) { if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &tp->t_rsel); } if (events & (POLLOUT | POLLWRNORM)) { if ((tp->t_outq.c_cc <= tp->t_olowat && ISSET(tp->t_state, TS_CONNECTED)) || ISSET(tp->t_state, TS_ZOMBIE)) revents |= events & (POLLOUT | POLLWRNORM); else selrecord(td, &tp->t_wsel); } splx(s); return (revents); } static struct filterops ttyread_filtops = { 1, NULL, filt_ttyrdetach, filt_ttyread }; static struct filterops ttywrite_filtops = { 1, NULL, filt_ttywdetach, filt_ttywrite }; int ttykqfilter(dev_t dev, struct knote *kn) { struct tty *tp = dev->si_tty; struct klist *klist; int s; switch (kn->kn_filter) { case EVFILT_READ: klist = &tp->t_rsel.si_note; kn->kn_fop = &ttyread_filtops; break; case EVFILT_WRITE: klist = &tp->t_wsel.si_note; kn->kn_fop = &ttywrite_filtops; break; default: return (1); } kn->kn_hook = (caddr_t)dev; s = spltty(); SLIST_INSERT_HEAD(klist, kn, kn_selnext); splx(s); return (0); } static void filt_ttyrdetach(struct knote *kn) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; int s = spltty(); SLIST_REMOVE(&tp->t_rsel.si_note, kn, knote, kn_selnext); splx(s); } static int filt_ttyread(struct knote *kn, long hint) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; kn->kn_data = ttnread(tp); if (ISSET(tp->t_state, TS_ZOMBIE)) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_data > 0); } static void filt_ttywdetach(struct knote *kn) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; int s = spltty(); SLIST_REMOVE(&tp->t_wsel.si_note, kn, knote, kn_selnext); splx(s); } static int filt_ttywrite(struct knote *kn, long hint) { struct tty *tp = ((dev_t)kn->kn_hook)->si_tty; 
kn->kn_data = tp->t_outq.c_cc; if (ISSET(tp->t_state, TS_ZOMBIE)) return (1); return (kn->kn_data <= tp->t_olowat && ISSET(tp->t_state, TS_CONNECTED)); } /* * Must be called at spltty(). */ static int ttnread(struct tty *tp) { int nread; if (ISSET(tp->t_lflag, PENDIN)) ttypend(tp); nread = tp->t_canq.c_cc; if (!ISSET(tp->t_lflag, ICANON)) { nread += tp->t_rawq.c_cc; if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) nread = 0; } return (nread); } /* * Wait for output to drain. */ int ttywait(struct tty *tp) { int error, s; error = 0; s = spltty(); while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { (*tp->t_oproc)(tp); if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && ISSET(tp->t_state, TS_CONNECTED)) { SET(tp->t_state, TS_SO_OCOMPLETE); error = ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI | PCATCH, "ttywai", tp->t_timeout); if (error) { if (error == EWOULDBLOCK) error = EIO; break; } } else break; } if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) error = EIO; splx(s); return (error); } /* * Flush if successfully wait. */ static int ttywflush(struct tty *tp) { int error; if ((error = ttywait(tp)) == 0) ttyflush(tp, FREAD); return (error); } /* * Flush tty read and/or write queues, notifying anyone waiting. */ void ttyflush(struct tty *tp, int rw) { int s; s = spltty(); #if 0 again: #endif if (rw & FWRITE) { FLUSHQ(&tp->t_outq); CLR(tp->t_state, TS_TTSTOP); } (*tp->t_stop)(tp, rw); if (rw & FREAD) { FLUSHQ(&tp->t_canq); FLUSHQ(&tp->t_rawq); CLR(tp->t_lflag, PENDIN); tp->t_rocount = 0; tp->t_rocol = 0; CLR(tp->t_state, TS_LOCAL); ttwakeup(tp); if (ISSET(tp->t_state, TS_TBLOCK)) { if (rw & FWRITE) FLUSHQ(&tp->t_outq); ttyunblock(tp); /* * Don't let leave any state that might clobber the * next line discipline (although we should do more * to send the START char). Not clearing the state * may have caused the "putc to a clist with no * reserved cblocks" panic/printf. */ CLR(tp->t_state, TS_TBLOCK); #if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ if (ISSET(tp->t_iflag, IXOFF)) { /* * XXX wait a bit in the hope that the stop * character (if any) will go out. Waiting * isn't good since it allows races. This * will be fixed when the stop character is * put in a special queue. Don't bother with * the checks in ttywait() since the timeout * will save us. */ SET(tp->t_state, TS_SO_OCOMPLETE); ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, "ttyfls", hz / 10); /* * Don't try sending the stop character again. */ CLR(tp->t_state, TS_TBLOCK); goto again; } #endif } } if (rw & FWRITE) { FLUSHQ(&tp->t_outq); ttwwakeup(tp); } splx(s); } /* * Copy in the default termios characters. */ void termioschars(struct termios *t) { bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); } /* * Old interface. */ void ttychars(struct tty *tp) { termioschars(&tp->t_termios); } /* * Handle input high water. Send stop character for the IXOFF case. Turn * on our input flow control bit and propagate the changes to the driver. * XXX the stop character should be put in a special high priority queue. */ void ttyblock(struct tty *tp) { SET(tp->t_state, TS_TBLOCK); if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) CLR(tp->t_state, TS_TBLOCK); /* try again later */ ttstart(tp); } /* * Handle input low water. Send start character for the IXOFF case. Turn * off our input flow control bit and propagate the changes to the driver. 
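The ttykqfilter()/filt_ttyread() path above is what a userland kevent(2) consumer ends up exercising. A minimal sketch of waiting for tty input with EVFILT_READ; error handling is trimmed and the device path is only an example:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent ev, out;
	int fd, kq, n;

	fd = open("/dev/ttyd0", O_RDONLY | O_NONBLOCK);	/* example device */
	kq = kqueue();
	EV_SET(&ev, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	kevent(kq, &ev, 1, NULL, 0, NULL);		/* register the filter */
	n = kevent(kq, NULL, 0, &out, 1, NULL);		/* block until readable */
	if (n > 0)
		printf("%d byte(s) readable (data from filt_ttyread)\n",
		    (int)out.data);
	close(kq);
	close(fd);
	return (0);
}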
* XXX the start character should be put in a special high priority queue. */ static void ttyunblock(struct tty *tp) { CLR(tp->t_state, TS_TBLOCK); if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && putc(tp->t_cc[VSTART], &tp->t_outq) != 0) SET(tp->t_state, TS_TBLOCK); /* try again later */ ttstart(tp); } #ifdef notyet /* Not used by any current (i386) drivers. */ /* * Restart after an inter-char delay. */ void ttrstrt(void *tp_arg) { struct tty *tp; int s; KASSERT(tp_arg != NULL, ("ttrstrt")); tp = tp_arg; s = spltty(); CLR(tp->t_state, TS_TIMEOUT); ttstart(tp); splx(s); } #endif int ttstart(struct tty *tp) { if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */ (*tp->t_oproc)(tp); return (0); } /* * "close" a line discipline */ int ttylclose(struct tty *tp, int flag) { if (flag & FNONBLOCK || ttywflush(tp)) ttyflush(tp, FREAD | FWRITE); return (0); } /* * Handle modem control transition on a tty. * Flag indicates new state of carrier. * Returns 0 if the line should be turned off, otherwise 1. */ int ttymodem(struct tty *tp, int flag) { if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { /* * MDMBUF: do flow control according to carrier flag * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP * works if IXON and IXANY are clear. */ if (flag) { CLR(tp->t_state, TS_CAR_OFLOW); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { SET(tp->t_state, TS_CAR_OFLOW); SET(tp->t_state, TS_TTSTOP); (*tp->t_stop)(tp, 0); } } else if (flag == 0) { /* * Lost carrier. */ CLR(tp->t_state, TS_CARR_ON); if (ISSET(tp->t_state, TS_ISOPEN) && !ISSET(tp->t_cflag, CLOCAL)) { SET(tp->t_state, TS_ZOMBIE); CLR(tp->t_state, TS_CONNECTED); if (tp->t_session) { sx_slock(&proctree_lock); if (tp->t_session->s_leader) { struct proc *p; p = tp->t_session->s_leader; PROC_LOCK(p); psignal(p, SIGHUP); PROC_UNLOCK(p); } sx_sunlock(&proctree_lock); } ttyflush(tp, FREAD | FWRITE); return (0); } } else { /* * Carrier now on. */ SET(tp->t_state, TS_CARR_ON); if (!ISSET(tp->t_state, TS_ZOMBIE)) SET(tp->t_state, TS_CONNECTED); wakeup(TSA_CARR_ON(tp)); ttwakeup(tp); ttwwakeup(tp); } return (1); } /* * Reinput pending characters after state switch * call at spltty(). */ static void ttypend(struct tty *tp) { struct clist tq; int c; CLR(tp->t_lflag, PENDIN); SET(tp->t_state, TS_TYPEN); /* * XXX this assumes too much about clist internals. It may even * fail if the cblock slush pool is empty. We can't allocate more * cblocks here because we are called from an interrupt handler * and clist_alloc_cblocks() can wait. */ tq = tp->t_rawq; bzero(&tp->t_rawq, sizeof tp->t_rawq); tp->t_rawq.c_cbmax = tq.c_cbmax; tp->t_rawq.c_cbreserved = tq.c_cbreserved; while ((c = getc(&tq)) >= 0) ttyinput(c, tp); CLR(tp->t_state, TS_TYPEN); } /* * Process a read call on a tty device. */ int ttread(struct tty *tp, struct uio *uio, int flag) { struct clist *qp; int c; tcflag_t lflag; cc_t *cc = tp->t_cc; struct thread *td; struct proc *p; int s, first, error = 0; int has_stime = 0, last_cc = 0; long slp = 0; /* XXX this should be renamed `timo'. */ struct timeval stime; struct pgrp *pg; td = curthread; p = td->td_proc; loop: s = spltty(); lflag = tp->t_lflag; /* * take pending input first */ if (ISSET(lflag, PENDIN)) { ttypend(tp); splx(s); /* reduce latency */ s = spltty(); lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ } /* * Hang process if it's in the background. 
*/ if (isbackground(p, tp)) { splx(s); sx_slock(&proctree_lock); PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_sigignore, SIGTTIN) || SIGISMEMBER(td->td_sigmask, SIGTTIN) || (p->p_flag & P_PPWAIT) || p->p_pgrp->pg_jobc == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); return (EIO); } pg = p->p_pgrp; PROC_UNLOCK(p); PGRP_LOCK(pg); sx_sunlock(&proctree_lock); pgsignal(pg, SIGTTIN, 1); PGRP_UNLOCK(pg); error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); if (error) return (error); goto loop; } if (ISSET(tp->t_state, TS_ZOMBIE)) { splx(s); return (0); /* EOF */ } /* * If canonical, use the canonical queue, * else use the raw queue. * * (should get rid of clists...) */ qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; if (flag & IO_NDELAY) { if (qp->c_cc > 0) goto read; if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { splx(s); return (0); } splx(s); return (EWOULDBLOCK); } if (!ISSET(lflag, ICANON)) { int m = cc[VMIN]; long t = cc[VTIME]; struct timeval timecopy; /* * Check each of the four combinations. * (m > 0 && t == 0) is the normal read case. * It should be fairly efficient, so we check that and its * companion case (m == 0 && t == 0) first. * For the other two cases, we compute the target sleep time * into slp. */ if (t == 0) { if (qp->c_cc < m) goto sleep; if (qp->c_cc > 0) goto read; /* m, t and qp->c_cc are all 0. 0 is enough input. */ splx(s); return (0); } t *= 100000; /* time in us */ #define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ ((t1).tv_usec - (t2).tv_usec)) if (m > 0) { if (qp->c_cc <= 0) goto sleep; if (qp->c_cc >= m) goto read; getmicrotime(&timecopy); if (!has_stime) { /* first character, start timer */ has_stime = 1; stime = timecopy; slp = t; } else if (qp->c_cc > last_cc) { /* got a character, restart timer */ stime = timecopy; slp = t; } else { /* nothing, check expiration */ slp = t - diff(timecopy, stime); if (slp <= 0) goto read; } last_cc = qp->c_cc; } else { /* m == 0 */ if (qp->c_cc > 0) goto read; getmicrotime(&timecopy); if (!has_stime) { has_stime = 1; stime = timecopy; slp = t; } else { slp = t - diff(timecopy, stime); if (slp <= 0) { /* Timed out, but 0 is enough input. */ splx(s); return (0); } } } #undef diff /* * Rounding down may make us wake up just short * of the target, so we round up. * The formula is ceiling(slp * hz/1000000). * 32-bit arithmetic is enough for hz < 169. * XXX see tvtohz() for how to avoid overflow if hz * is large (divide by `tick' and/or arrange to * use tvtohz() if hz is large). */ slp = (long) (((u_long)slp * hz) + 999999) / 1000000; goto sleep; } if (qp->c_cc <= 0) { sleep: /* * There is no input, or not enough input and we can block. */ error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, ISSET(tp->t_state, TS_CONNECTED) ? "ttyin" : "ttyhup", (int)slp); splx(s); if (error == EWOULDBLOCK) error = 0; else if (error) return (error); /* * XXX what happens if another process eats some input * while we are asleep (not just here)? It would be * safest to detect changes and reset our state variables * (has_stime and last_cc). */ slp = 0; goto loop; } read: splx(s); /* * Input present, check for input mapping and processing. */ first = 1; if (ISSET(lflag, ICANON | ISIG)) goto slowcase; for (;;) { char ibuf[IBUFSIZ]; int icc; icc = imin(uio->uio_resid, IBUFSIZ); icc = q_to_b(qp, ibuf, icc); if (icc <= 0) { if (first) goto loop; break; } error = uiomove(ibuf, icc, uio); /* * XXX if there was an error then we should ungetc() the * unmoved chars and reduce icc here. 
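The non-canonical read path above converts VTIME (tenths of a second) to microseconds and then rounds the remaining sleep up to clock ticks with ceil(slp * hz / 1000000). A stand-alone version of that conversion, with hz chosen arbitrarily for the example:

#include <assert.h>

/* VTIME is expressed in tenths of a second. */
static long
vtime_to_usec(int vtime)
{
	return ((long)vtime * 100000);
}

/* Round a microsecond interval up to clock ticks: ceil(usec * hz / 1000000),
   the same formula ttread() uses before calling ttysleep(). */
static long
usec_to_ticks(unsigned long usec, int hz)
{
	return ((long)((usec * hz + 999999) / 1000000));
}

int
main(void)
{
	int hz = 100;			/* example clock rate */

	assert(vtime_to_usec(5) == 500000);		/* 0.5 second */
	assert(usec_to_ticks(500000, hz) == 50);	/* exactly 50 ticks */
	assert(usec_to_ticks(10, hz) == 1);		/* rounds up, never 0 */
	return (0);
}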
*/ if (error) break; if (uio->uio_resid == 0) break; first = 0; } goto out; slowcase: for (;;) { c = getc(qp); if (c < 0) { if (first) goto loop; break; } /* * delayed suspend (^Y) */ if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { if (tp->t_pgrp != NULL) { PGRP_LOCK(tp->t_pgrp); pgsignal(tp->t_pgrp, SIGTSTP, 1); PGRP_UNLOCK(tp->t_pgrp); } if (first) { error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg3", 0); if (error) break; goto loop; } break; } /* * Interpret EOF only in canonical mode. */ if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) break; /* * Give user character. */ error = ureadc(c, uio); if (error) /* XXX should ungetc(c, qp). */ break; if (uio->uio_resid == 0) break; /* * In canonical mode check for a "break character" * marking the end of a "line of input". */ if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) break; first = 0; } out: /* * Look to unblock input now that (presumably) * the input queue has gone down. */ s = spltty(); if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat) ttyunblock(tp); splx(s); return (error); } /* * Check the output queue on tp for space for a kernel message (from uprintf * or tprintf). Allow some space over the normal hiwater mark so we don't * lose messages due to normal flow control, but don't let the tty run amok. * Sleeps here are not interruptible, but we return prematurely if new signals * arrive. */ int ttycheckoutq(struct tty *tp, int wait) { int hiwat, s; sigset_t oldmask; struct thread *td; struct proc *p; td = curthread; p = td->td_proc; hiwat = tp->t_ohiwat; SIGEMPTYSET(oldmask); s = spltty(); if (wait) { PROC_LOCK(p); oldmask = td->td_siglist; PROC_UNLOCK(p); } if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) while (tp->t_outq.c_cc > hiwat) { ttstart(tp); if (tp->t_outq.c_cc <= hiwat) break; if (!wait) { splx(s); return (0); } PROC_LOCK(p); if (!SIGSETEQ(td->td_siglist, oldmask)) { PROC_UNLOCK(p); splx(s); return (0); } PROC_UNLOCK(p); SET(tp->t_state, TS_SO_OLOWAT); tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); } splx(s); return (1); } /* * Process a write call on a tty device. */ int ttwrite(struct tty *tp, struct uio *uio, int flag) { char *cp = NULL; int cc, ce; struct thread *td; struct proc *p; int i, hiwat, cnt, error, s; char obuf[OBUFSIZ]; hiwat = tp->t_ohiwat; cnt = uio->uio_resid; error = 0; cc = 0; td = curthread; p = td->td_proc; loop: s = spltty(); if (ISSET(tp->t_state, TS_ZOMBIE)) { splx(s); if (uio->uio_resid == cnt) error = EIO; goto out; } if (!ISSET(tp->t_state, TS_CONNECTED)) { if (flag & IO_NDELAY) { splx(s); error = EWOULDBLOCK; goto out; } error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, "ttydcd", 0); splx(s); if (error) goto out; goto loop; } splx(s); /* * Hang the process if it's in the background. */ sx_slock(&proctree_lock); PROC_LOCK(p); if (isbackground(p, tp) && ISSET(tp->t_lflag, TOSTOP) && !(p->p_flag & P_PPWAIT) && !SIGISMEMBER(p->p_sigacts->ps_sigignore, SIGTTOU) && !SIGISMEMBER(td->td_sigmask, SIGTTOU)) { if (p->p_pgrp->pg_jobc == 0) { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); error = EIO; goto out; } PROC_UNLOCK(p); PGRP_LOCK(p->p_pgrp); sx_sunlock(&proctree_lock); pgsignal(p->p_pgrp, SIGTTOU, 1); PGRP_UNLOCK(p->p_pgrp); error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); if (error) goto out; goto loop; } else { PROC_UNLOCK(p); sx_sunlock(&proctree_lock); } /* * Process the user's data in at most OBUFSIZ chunks. Perform any * output translation. 
Keep track of high water mark, sleep on * overflow awaiting device aid in acquiring new space. */ while (uio->uio_resid > 0 || cc > 0) { if (ISSET(tp->t_lflag, FLUSHO)) { uio->uio_resid = 0; return (0); } if (tp->t_outq.c_cc > hiwat) goto ovhiwat; /* * Grab a hunk of data from the user, unless we have some * leftover from last time. */ if (cc == 0) { cc = imin(uio->uio_resid, OBUFSIZ); cp = obuf; error = uiomove(cp, cc, uio); if (error) { cc = 0; break; } } /* * If nothing fancy need be done, grab those characters we * can handle without any of ttyoutput's processing and * just transfer them to the output q. For those chars * which require special processing (as indicated by the * bits in char_type), call ttyoutput. After processing * a hunk of data, look for FLUSHO so ^O's will take effect * immediately. */ while (cc > 0) { if (!ISSET(tp->t_oflag, OPOST)) ce = cc; else { ce = cc - scanc((u_int)cc, (u_char *)cp, char_type, CCLASSMASK); /* * If ce is zero, then we're processing * a special character through ttyoutput. */ if (ce == 0) { tp->t_rocount = 0; if (ttyoutput(*cp, tp) >= 0) { /* No Clists, wait a bit. */ ttstart(tp); if (flag & IO_NDELAY) { error = EWOULDBLOCK; goto out; } error = ttysleep(tp, &lbolt, TTOPRI|PCATCH, "ttybf1", 0); if (error) goto out; goto loop; } cp++; cc--; if (ISSET(tp->t_lflag, FLUSHO) || tp->t_outq.c_cc > hiwat) goto ovhiwat; continue; } } /* * A bunch of normal characters have been found. * Transfer them en masse to the output queue and * continue processing at the top of the loop. * If there are any further characters in this * <= OBUFSIZ chunk, the first should be a character * requiring special handling by ttyoutput. */ tp->t_rocount = 0; i = b_to_q(cp, ce, &tp->t_outq); ce -= i; tp->t_column += ce; cp += ce, cc -= ce, tk_nout += ce; tp->t_outcc += ce; if (i > 0) { /* No Clists, wait a bit. */ ttstart(tp); if (flag & IO_NDELAY) { error = EWOULDBLOCK; goto out; } error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybf2", 0); if (error) goto out; goto loop; } if (ISSET(tp->t_lflag, FLUSHO) || tp->t_outq.c_cc > hiwat) break; } ttstart(tp); } out: /* * If cc is nonzero, we leave the uio structure inconsistent, as the * offset and iov pointers have moved forward, but it doesn't matter * (the call will either return short or restart with a new uio). */ uio->uio_resid += cc; return (error); ovhiwat: ttstart(tp); s = spltty(); /* * This can only occur if FLUSHO is set in t_lflag, * or if ttstart/oproc is synchronous (or very fast). */ if (tp->t_outq.c_cc <= hiwat) { splx(s); goto loop; } if (flag & IO_NDELAY) { splx(s); uio->uio_resid += cc; return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); } SET(tp->t_state, TS_SO_OLOWAT); error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", tp->t_timeout); splx(s); if (error == EWOULDBLOCK) error = EIO; if (error) goto out; goto loop; } /* * Rubout one character from the rawq of tp * as cleanly as possible. 
*/ static void ttyrub(int c, struct tty *tp) { char *cp; int savecol; int tabc, s; if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) return; CLR(tp->t_lflag, FLUSHO); if (ISSET(tp->t_lflag, ECHOE)) { if (tp->t_rocount == 0) { /* * Screwed by ttwrite; retype */ ttyretype(tp); return; } if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) ttyrubo(tp, 2); else { CLR(c, ~TTY_CHARMASK); switch (CCLASS(c)) { case ORDINARY: ttyrubo(tp, 1); break; case BACKSPACE: case CONTROL: case NEWLINE: case RETURN: case VTAB: if (ISSET(tp->t_lflag, ECHOCTL)) ttyrubo(tp, 2); break; case TAB: if (tp->t_rocount < tp->t_rawq.c_cc) { ttyretype(tp); return; } s = spltty(); savecol = tp->t_column; SET(tp->t_state, TS_CNTTB); SET(tp->t_lflag, FLUSHO); tp->t_column = tp->t_rocol; cp = tp->t_rawq.c_cf; if (cp) tabc = *cp; /* XXX FIX NEXTC */ for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) ttyecho(tabc, tp); CLR(tp->t_lflag, FLUSHO); CLR(tp->t_state, TS_CNTTB); splx(s); /* savecol will now be length of the tab. */ savecol -= tp->t_column; tp->t_column += savecol; if (savecol > 8) savecol = 8; /* overflow screw */ while (--savecol >= 0) (void)ttyoutput('\b', tp); break; default: /* XXX */ #define PANICSTR "ttyrub: would panic c = %d, val = %d\n" (void)printf(PANICSTR, c, CCLASS(c)); #ifdef notdef panic(PANICSTR, c, CCLASS(c)); #endif } } } else if (ISSET(tp->t_lflag, ECHOPRT)) { if (!ISSET(tp->t_state, TS_ERASE)) { SET(tp->t_state, TS_ERASE); (void)ttyoutput('\\', tp); } ttyecho(c, tp); } else { ttyecho(tp->t_cc[VERASE], tp); /* * This code may be executed not only when an ERASE key * is pressed, but also when ^U (KILL) or ^W (WERASE) are. * So, I didn't think it was worthwhile to pass the extra * information (which would need an extra parameter, * changing every call) needed to distinguish the ERASE2 * case from the ERASE. */ } --tp->t_rocount; } /* * Back over cnt characters, erasing them. */ static void ttyrubo(struct tty *tp, int cnt) { while (cnt-- > 0) { (void)ttyoutput('\b', tp); (void)ttyoutput(' ', tp); (void)ttyoutput('\b', tp); } } /* * ttyretype -- * Reprint the rawq line. Note, it is assumed that c_cc has already * been checked. */ static void ttyretype(struct tty *tp) { char *cp; int s, c; /* Echo the reprint character. */ if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) ttyecho(tp->t_cc[VREPRINT], tp); (void)ttyoutput('\n', tp); /* * XXX * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE * BIT OF FIRST CHAR. */ s = spltty(); for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) ttyecho(c, tp); for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0); cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) ttyecho(c, tp); CLR(tp->t_state, TS_ERASE); splx(s); tp->t_rocount = tp->t_rawq.c_cc; tp->t_rocol = 0; } /* * Echo a typed character to the terminal. */ static void ttyecho(int c, struct tty *tp) { if (!ISSET(tp->t_state, TS_CNTTB)) CLR(tp->t_lflag, FLUSHO); if ((!ISSET(tp->t_lflag, ECHO) && (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || ISSET(tp->t_lflag, EXTPROC)) return; if (ISSET(tp->t_lflag, ECHOCTL) && ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || ISSET(c, TTY_CHARMASK) == 0177)) { (void)ttyoutput('^', tp); CLR(c, ~TTY_CHARMASK); if (c == 0177) c = '?'; else c += 'A' - 1; } (void)ttyoutput(c, tp); } /* * Wake up any readers on a tty. 
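Under ECHOCTL, ttyecho() above prints control characters as a caret pair: 0177 becomes "^?" and anything else maps through c + 'A' - 1. A small sketch of just that mapping:

#include <assert.h>
#include <stdio.h>

/* Map a control character to its visible form, the way ttyecho() does
   under ECHOCTL: 0177 becomes "^?", 003 becomes "^C", and so on. */
static void
echo_ctl(int c, char out[3])
{
	out[0] = '^';
	out[1] = (c == 0177) ? '?' : c + 'A' - 1;
	out[2] = '\0';
}

int
main(void)
{
	char buf[3];

	echo_ctl(003, buf);		/* ETX, i.e. ^C */
	assert(buf[1] == 'C');
	echo_ctl(0177, buf);		/* DEL */
	assert(buf[1] == '?');
	printf("ok\n");
	return (0);
}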
*/ void ttwakeup(struct tty *tp) { if (SEL_WAITING(&tp->t_rsel)) selwakeup(&tp->t_rsel); if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); wakeup(TSA_HUP_OR_INPUT(tp)); KNOTE(&tp->t_rsel.si_note, 0); } /* * Wake up any writers on a tty. */ void ttwwakeup(struct tty *tp) { if (SEL_WAITING(&tp->t_wsel) && tp->t_outq.c_cc <= tp->t_olowat) selwakeup(&tp->t_wsel); if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL)); if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { CLR(tp->t_state, TS_SO_OCOMPLETE); wakeup(TSA_OCOMPLETE(tp)); } if (ISSET(tp->t_state, TS_SO_OLOWAT) && tp->t_outq.c_cc <= tp->t_olowat) { CLR(tp->t_state, TS_SO_OLOWAT); wakeup(TSA_OLOWAT(tp)); } KNOTE(&tp->t_wsel.si_note, 0); } /* * Look up a code for a specified speed in a conversion table; * used by drivers to map software speed values to hardware parameters. */ int ttspeedtab(int speed, struct speedtab *table) { for ( ; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return (-1); } /* * Set input and output watermarks and buffer sizes. For input, the * high watermark is about one second's worth of input above empty, the * low watermark is slightly below high water, and the buffer size is a * driver-dependent amount above high water. For output, the watermarks * are near the ends of the buffer, with about 1 second's worth of input * between them. All this only applies to the standard line discipline. */ void ttsetwater(struct tty *tp) { int cps, ttmaxhiwat, x; /* Input. */ clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); switch (tp->t_ispeedwat) { case (speed_t)-1: cps = tp->t_ispeed / 10; break; case 0: /* * This case is for old drivers that don't know about * t_ispeedwat. Arrange for them to get the old buffer * sizes and watermarks. */ cps = TTYHOG - 2 * 256; tp->t_ififosize = 2 * 256; break; default: cps = tp->t_ispeedwat / 10; break; } tp->t_ihiwat = cps; tp->t_ilowat = 7 * cps / 8; x = cps + tp->t_ififosize; clist_alloc_cblocks(&tp->t_rawq, x, x); /* Output. */ switch (tp->t_ospeedwat) { case (speed_t)-1: cps = tp->t_ospeed / 10; ttmaxhiwat = 2 * TTMAXHIWAT; break; case 0: cps = tp->t_ospeed / 10; ttmaxhiwat = TTMAXHIWAT; break; default: cps = tp->t_ospeedwat / 10; ttmaxhiwat = 8 * TTMAXHIWAT; break; } #define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); x += cps; x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */ tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */ x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */ x += OBUFSIZ + 100; clist_alloc_cblocks(&tp->t_outq, x, x); #undef CLAMP } /* * Report on state of foreground process group. */ void ttyinfo(struct tty *tp) { struct proc *p, *pick; struct timeval utime, stime; const char *stmp, *sprefix; long ltmp; int tmp; struct thread *td; if (ttycheckoutq(tp,0) == 0) return; /* Print load average. 
*/ tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); if (tp->t_session == NULL) ttyprintf(tp, "not a controlling terminal\n"); else if (tp->t_pgrp == NULL) ttyprintf(tp, "no foreground process group\n"); else { PGRP_LOCK(tp->t_pgrp); if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == 0) { PGRP_UNLOCK(tp->t_pgrp); ttyprintf(tp, "empty foreground process group\n"); } else { mtx_lock_spin(&sched_lock); /* Pick interesting process. */ for (pick = NULL; p != 0; p = LIST_NEXT(p, p_pglist)) if (proc_compare(pick, p)) pick = p; PGRP_UNLOCK(tp->t_pgrp); td = FIRST_THREAD_IN_PROC(pick); sprefix = ""; - if (pick->p_flag & P_THREADED) { + if (pick->p_flag & P_SA) { stmp = "KSE" ; /* XXXKSE */ } else { if (td) { if (TD_ON_RUNQ(td) || (TD_IS_RUNNING(td))) { stmp = "running"; } else if (TD_ON_LOCK(td)) { stmp = td->td_lockname; sprefix = "*"; } else if (td->td_wmesg) { stmp = td->td_wmesg; } else { stmp = "iowait"; } } else { stmp = "threadless"; panic("ttyinfo: no thread!?"); } } calcru(pick, &utime, &stime, NULL); if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE) { ltmp = 0; } else { ltmp = pgtok( vmspace_resident_count(pick->p_vmspace)); } mtx_unlock_spin(&sched_lock); ttyprintf(tp, " cmd: %s %d [%s%s] ", pick->p_comm, pick->p_pid, sprefix, stmp); /* Print user time. */ ttyprintf(tp, "%ld.%02ldu ", utime.tv_sec, utime.tv_usec / 10000); /* Print system time. */ ttyprintf(tp, "%ld.%02lds ", (long)stime.tv_sec, stime.tv_usec / 10000); /* Print percentage cpu, resident set size. */ ttyprintf(tp, "%d%% %ldk\n", tmp / 100, ltmp); } } tp->t_rocount = 0; /* so pending input will be retyped if BS */ } /* * Returns 1 if p2 is "better" than p1 * * The algorithm for picking the "interesting" process is thus: * * 1) Only foreground processes are eligible - implied. * 2) Runnable processes are favored over anything else. The runner * with the highest cpu utilization is picked (p_estcpu). Ties are * broken by picking the highest pid. * 3) The sleeper with the shortest sleep time is next. With ties, * we pick out just "short-term" sleepers (P_SINTR == 0). * 4) Further ties are broken by picking the highest pid. 
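The load average printed at the top of ttyinfo() above comes out of the kernel's fixed-point representation, with FSHIFT bits of fraction. A stand-alone version of the conversion, assuming the common FSHIFT value of 11:

#include <assert.h>
#include <stdio.h>

#define FSHIFT	11		/* assumption: the usual value of FSHIFT */
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	long ldavg = 2806;	/* 1.37 in fixed point: 1.37 * 2048 ~= 2806 */
	int tmp;

	/* Same rounding as ttyinfo(): scale to hundredths, round, shift. */
	tmp = (ldavg * 100 + FSCALE / 2) >> FSHIFT;
	assert(tmp == 137);
	printf("load: %d.%02d\n", tmp / 100, tmp % 100);	/* "load: 1.37" */
	return (0);
}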
*/ #define ISRUN(p, val) \ do { \ struct thread *td; \ val = 0; \ FOREACH_THREAD_IN_PROC(p, td) { \ if (TD_ON_RUNQ(td) || \ TD_IS_RUNNING(td)) { \ val = 1; \ break; \ } \ } \ } while (0) #define TESTAB(a, b) ((a)<<1 | (b)) #define ONLYA 2 #define ONLYB 1 #define BOTH 3 static int proc_compare(struct proc *p1, struct proc *p2) { int esta, estb; struct ksegrp *kg; mtx_assert(&sched_lock, MA_OWNED); if (p1 == NULL) return (1); ISRUN(p1, esta); ISRUN(p2, estb); /* * see if at least one of them is runnable */ switch (TESTAB(esta, estb)) { case ONLYA: return (0); case ONLYB: return (1); case BOTH: /* * tie - favor one with highest recent cpu utilization */ esta = estb = 0; FOREACH_KSEGRP_IN_PROC(p1,kg) { esta += kg->kg_estcpu; } FOREACH_KSEGRP_IN_PROC(p2,kg) { estb += kg->kg_estcpu; } if (estb > esta) return (1); if (esta > estb) return (0); return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } /* * weed out zombies */ switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) { case ONLYA: return (1); case ONLYB: return (0); case BOTH: return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } #if 0 /* XXXKSE */ /* * pick the one with the smallest sleep time */ if (p2->p_slptime > p1->p_slptime) return (0); if (p1->p_slptime > p2->p_slptime) return (1); /* * favor one sleeping in a non-interruptible sleep */ if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0) return (1); if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0) return (0); #endif return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ } /* * Output char to tty; console putchar style. */ int tputchar(int c, struct tty *tp) { int s; s = spltty(); if (!ISSET(tp->t_state, TS_CONNECTED)) { splx(s); return (-1); } if (c == '\n') (void)ttyoutput('\r', tp); (void)ttyoutput(c, tp); ttstart(tp); splx(s); return (0); } /* * Sleep on chan, returning ERESTART if tty changed while we napped and * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If * the tty is revoked, restarting a pending call will redo validation done * at the start of the call. */ int ttysleep(struct tty *tp, void *chan, int pri, char *wmesg, int timo) { int error; int gen; gen = tp->t_gen; error = tsleep(chan, pri, wmesg, timo); if (error) return (error); return (tp->t_gen == gen ? 0 : ERESTART); } /* * Allocate a tty struct. Clists in the struct will be allocated by * ttyopen(). */ struct tty * ttymalloc(struct tty *tp) { if (tp) return(tp); tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO); ttyregister(tp); return (tp); } #if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ /* * Free a tty struct. Clists in the struct should have been freed by * ttyclose(). 
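ttysleep() above relies on the t_gen generation counter so a sleeper can detect that the tty was flushed or recycled while it was blocked. The pattern in isolation; the names here are illustrative, not kernel APIs:

#include <errno.h>

#ifndef ERESTART
#define ERESTART (-1)		/* kernel-internal value; placeholder here */
#endif

struct resource {
	int	gen;		/* bumped whenever the resource is recycled */
};

/* Pretend to block; in the kernel this is where tsleep() would go. */
static int
block_on(struct resource *r)
{
	(void)r;
	return (0);
}

/* Sleep, then tell the caller to restart if the resource was recycled,
   mirroring the generation check in ttysleep(). */
static int
sleep_checked(struct resource *r)
{
	int gen, error;

	gen = r->gen;
	error = block_on(r);
	if (error)
		return (error);
	return (r->gen == gen ? 0 : ERESTART);
}

int
main(void)
{
	struct resource r = { 0 };

	return (sleep_checked(&r));	/* 0: nothing recycled the resource */
}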
*/ void ttyfree(struct tty *tp) { free(tp, M_TTYS); } #endif /* 0 */ void ttyregister(struct tty *tp) { tp->t_timeout = -1; SLIST_INSERT_HEAD(&tty_list, tp, t_list); } static int sysctl_kern_ttys(SYSCTL_HANDLER_ARGS) { struct tty *tp; struct xtty xt; int error; SLIST_FOREACH(tp, &tty_list, t_list) { bzero(&xt, sizeof xt); xt.xt_size = sizeof xt; #define XT_COPY(field) xt.xt_##field = tp->t_##field xt.xt_rawcc = tp->t_rawq.c_cc; xt.xt_cancc = tp->t_canq.c_cc; xt.xt_outcc = tp->t_outq.c_cc; XT_COPY(line); if (tp->t_dev) xt.xt_dev = dev2udev(tp->t_dev); XT_COPY(state); XT_COPY(flags); XT_COPY(timeout); if (tp->t_pgrp) xt.xt_pgid = tp->t_pgrp->pg_id; if (tp->t_session) xt.xt_sid = tp->t_session->s_sid; XT_COPY(termios); XT_COPY(winsize); XT_COPY(column); XT_COPY(rocount); XT_COPY(rocol); XT_COPY(ififosize); XT_COPY(ihiwat); XT_COPY(ilowat); XT_COPY(ispeedwat); XT_COPY(ohiwat); XT_COPY(olowat); XT_COPY(ospeedwat); #undef XT_COPY error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) return (error); } return (0); } SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_ttys, "S,xtty", "All ttys"); SYSCTL_LONG(_kern, OID_AUTO, tty_nin, CTLFLAG_RD, &tk_nin, 0, "Total TTY in characters"); SYSCTL_LONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, &tk_nout, 0, "Total TTY out characters"); void nottystop(struct tty *tp, int rw) { return; } int ttyread(dev_t dev, struct uio *uio, int flag) { struct tty *tp; tp = dev->si_tty; if (tp == NULL) return (ENODEV); return ((*linesw[tp->t_line].l_read)(tp, uio, flag)); } int ttywrite(dev_t dev, struct uio *uio, int flag) { struct tty *tp; tp = dev->si_tty; if (tp == NULL) return (ENODEV); return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); } Index: head/sys/sparc64/sparc64/trap.c =================================================================== --- head/sys/sparc64/sparc64/trap.c (revision 116360) +++ head/sys/sparc64/sparc64/trap.c (revision 116361) @@ -1,630 +1,630 @@ /*- * Copyright (c) 2001, Jake Burkholder * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * from: FreeBSD: src/sys/i386/i386/trap.c,v 1.197 2001/07/19 * $FreeBSD$ */ #include "opt_ddb.h" #include "opt_ktr.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void trap(struct trapframe *tf); void syscall(struct trapframe *tf); static int trap_pfault(struct thread *td, struct trapframe *tf); extern char copy_fault[]; extern char copy_nofault_begin[]; extern char copy_nofault_end[]; extern char fs_fault[]; extern char fs_nofault_begin[]; extern char fs_nofault_end[]; extern char fs_nofault_intr_begin[]; extern char fs_nofault_intr_end[]; extern char *syscallnames[]; const char *trap_msg[] = { "reserved", "instruction access exception", "instruction access error", "instruction access protection", "illtrap instruction", "illegal instruction", "privileged opcode", "floating point disabled", "floating point exception ieee 754", "floating point exception other", "tag overflow", "division by zero", "data access exception", "data access error", "data access protection", "memory address not aligned", "privileged action", "async data error", "trap instruction 16", "trap instruction 17", "trap instruction 18", "trap instruction 19", "trap instruction 20", "trap instruction 21", "trap instruction 22", "trap instruction 23", "trap instruction 24", "trap instruction 25", "trap instruction 26", "trap instruction 27", "trap instruction 28", "trap instruction 29", "trap instruction 30", "trap instruction 31", "fast instruction access mmu miss", "fast data access mmu miss", "interrupt", "physical address watchpoint", "virtual address watchpoint", "corrected ecc error", "spill", "fill", "fill", "breakpoint", "clean window", "range check", "fix alignment", "integer overflow", "syscall", "restore physical watchpoint", "restore virtual watchpoint", "kernel stack fault", }; const int trap_sig[] = { SIGILL, /* reserved */ SIGILL, /* instruction access exception */ SIGILL, /* instruction access error */ SIGILL, /* instruction access protection */ SIGILL, /* illtrap instruction */ SIGILL, /* illegal instruction */ SIGBUS, /* privileged opcode */ SIGFPE, /* floating point disabled */ SIGFPE, /* floating point exception ieee 754 */ SIGFPE, /* floating point exception other */ SIGEMT, /* tag overflow */ SIGFPE, /* division by zero */ SIGILL, /* data access exception */ SIGILL, /* data access error */ SIGBUS, /* data access protection */ SIGBUS, /* memory address not aligned */ SIGBUS, /* privileged action */ SIGBUS, /* async data error */ SIGILL, /* trap instruction 16 */ SIGILL, /* trap instruction 17 */ SIGILL, /* trap instruction 18 */ SIGILL, /* trap instruction 19 */ SIGILL, /* trap instruction 20 */ SIGILL, /* trap instruction 21 
*/ SIGILL, /* trap instruction 22 */ SIGILL, /* trap instruction 23 */ SIGILL, /* trap instruction 24 */ SIGILL, /* trap instruction 25 */ SIGILL, /* trap instruction 26 */ SIGILL, /* trap instruction 27 */ SIGILL, /* trap instruction 28 */ SIGILL, /* trap instruction 29 */ SIGILL, /* trap instruction 30 */ SIGILL, /* trap instruction 31 */ SIGSEGV, /* fast instruction access mmu miss */ SIGSEGV, /* fast data access mmu miss */ -1, /* interrupt */ -1, /* physical address watchpoint */ -1, /* virtual address watchpoint */ -1, /* corrected ecc error */ SIGILL, /* spill */ SIGILL, /* fill */ SIGILL, /* fill */ SIGTRAP, /* breakpoint */ SIGILL, /* clean window */ SIGILL, /* range check */ SIGILL, /* fix alignment */ SIGILL, /* integer overflow */ SIGSYS, /* syscall */ -1, /* restore physical watchpoint */ -1, /* restore virtual watchpoint */ -1, /* kernel stack fault */ }; CTASSERT(sizeof(struct trapframe) == 256); int debugger_on_signal = 0; SYSCTL_INT(_debug, OID_AUTO, debugger_on_signal, CTLFLAG_RW, &debugger_on_signal, 0, ""); void trap(struct trapframe *tf) { struct thread *td; struct proc *p; u_int sticks; int error; int sig; td = PCPU_GET(curthread); CTR4(KTR_TRAP, "trap: %p type=%s (%s) pil=%#lx", td, trap_msg[tf->tf_type & ~T_KERNEL], (TRAPF_USERMODE(tf) ? "user" : "kernel"), rdpr(pil)); atomic_add_int(&cnt.v_trap, 1); if ((tf->tf_tstate & TSTATE_PRIV) == 0) { KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); p = td->td_proc; sticks = td->td_sticks; td->td_frame = tf; if (td->td_ucred != p->p_ucred) cred_update_thread(td); switch (tf->tf_type) { case T_DATA_MISS: case T_DATA_PROTECTION: case T_INSTRUCTION_MISS: sig = trap_pfault(td, tf); break; case T_FILL: sig = rwindow_load(td, tf, 2); break; case T_FILL_RET: sig = rwindow_load(td, tf, 1); break; case T_SPILL: sig = rwindow_save(td); break; default: if (tf->tf_type < 0 || tf->tf_type >= T_MAX || trap_sig[tf->tf_type] == -1) panic("trap: bad trap type"); sig = trap_sig[tf->tf_type]; break; } if (sig != 0) { /* Translate fault for emulators. 
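The user-mode branch above is a plain table dispatch: tf_type indexes trap_sig[], a -1 entry means the trap never reaches userland, and anything else is handed to trapsignal(). A reduced sketch of that dispatch with a small made-up table:

#include <assert.h>
#include <signal.h>

/* A cut-down stand-in for trap_sig[]: index by trap type, -1 = fatal. */
static const int demo_trap_sig[] = {
	SIGILL,		/* 0: reserved */
	SIGFPE,		/* 1: division by zero */
	SIGSEGV,	/* 2: data access miss */
	-1,		/* 3: interrupt, never a user signal */
};
#define DEMO_T_MAX (sizeof(demo_trap_sig) / sizeof(demo_trap_sig[0]))

static int
sig_for_trap(unsigned type)
{
	if (type >= DEMO_T_MAX || demo_trap_sig[type] == -1)
		return (-1);		/* the kernel would panic here */
	return (demo_trap_sig[type]);
}

int
main(void)
{
	assert(sig_for_trap(1) == SIGFPE);
	assert(sig_for_trap(3) == -1);
	return (0);
}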
*/ if (p->p_sysent->sv_transtrap != NULL) { sig = p->p_sysent->sv_transtrap(sig, tf->tf_type); } if (debugger_on_signal && (sig == 4 || sig == 10 || sig == 11)) Debugger("trapsig"); trapsignal(td, sig, tf->tf_type); } userret(td, tf, sticks); mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC cred_free_thread(td); #endif } else { KASSERT((tf->tf_type & T_KERNEL) != 0, ("trap: kernel trap isn't")); switch (tf->tf_type & ~T_KERNEL) { #ifdef DDB case T_BREAKPOINT: case T_KSTACK_FAULT: error = (kdb_trap(tf) == 0); break; #ifdef notyet case T_PA_WATCHPOINT: case T_VA_WATCHPOINT: error = db_watch_trap(tf); break; #endif #endif case T_DATA_MISS: case T_DATA_PROTECTION: case T_INSTRUCTION_MISS: error = trap_pfault(td, tf); break; case T_DATA_EXCEPTION: case T_MEM_ADDRESS_NOT_ALIGNED: if ((tf->tf_sfsr & MMU_SFSR_FV) != 0 && MMU_SFSR_GET_ASI(tf->tf_sfsr) == ASI_AIUP) { if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; error = 0; break; } } error = 1; break; default: error = 1; break; } if (error != 0) panic("trap: %s", trap_msg[tf->tf_type & ~T_KERNEL]); } CTR1(KTR_TRAP, "trap: td=%p return", td); } static int trap_pfault(struct thread *td, struct trapframe *tf) { struct vmspace *vm; struct pcb *pcb; struct proc *p; vm_offset_t va; vm_prot_t prot; u_long ctx; int flags; int type; int rv; if (td == NULL) return (-1); KASSERT(td->td_pcb != NULL, ("trap_pfault: pcb NULL")); KASSERT(td->td_proc != NULL, ("trap_pfault: curproc NULL")); KASSERT(td->td_proc->p_vmspace != NULL, ("trap_pfault: vmspace NULL")); p = td->td_proc; rv = KERN_SUCCESS; ctx = TLB_TAR_CTX(tf->tf_tar); pcb = td->td_pcb; type = tf->tf_type & ~T_KERNEL; va = TLB_TAR_VA(tf->tf_tar); CTR4(KTR_TRAP, "trap_pfault: td=%p pm_ctx=%#lx va=%#lx ctx=%#lx", td, p->p_vmspace->vm_pmap.pm_context[PCPU_GET(cpuid)], va, ctx); if (type == T_DATA_PROTECTION) { prot = VM_PROT_WRITE; flags = VM_FAULT_DIRTY; } else { if (type == T_DATA_MISS) prot = VM_PROT_READ; else prot = VM_PROT_READ | VM_PROT_EXECUTE; flags = VM_FAULT_NORMAL; } if (ctx != TLB_CTX_KERNEL) { if ((tf->tf_tstate & TSTATE_PRIV) != 0 && (tf->tf_tpc >= (u_long)fs_nofault_intr_begin && tf->tf_tpc <= (u_long)fs_nofault_intr_end)) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } /* * This is a fault on non-kernel virtual memory. */ vm = p->p_vmspace; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* Fault in the user page. */ rv = vm_fault(&vm->vm_map, va, prot, flags); /* * Now the process can be swapped again. */ PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * This is a fault on kernel virtual memory. Attempts to * access kernel memory from user mode cause privileged * action traps, not page fault. */ KASSERT(tf->tf_tstate & TSTATE_PRIV, ("trap_pfault: fault on nucleus context from user mode")); /* * Don't have to worry about process locking or stacks in the * kernel. 
*/ rv = vm_fault(kernel_map, va, prot, VM_FAULT_NORMAL); } CTR3(KTR_TRAP, "trap_pfault: return td=%p va=%#lx rv=%d", td, va, rv); if (rv == KERN_SUCCESS) return (0); if (ctx != TLB_CTX_KERNEL && (tf->tf_tstate & TSTATE_PRIV) != 0) { if (tf->tf_tpc >= (u_long)fs_nofault_begin && tf->tf_tpc <= (u_long)fs_nofault_end) { tf->tf_tpc = (u_long)fs_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } if (tf->tf_tpc >= (u_long)copy_nofault_begin && tf->tf_tpc <= (u_long)copy_nofault_end) { tf->tf_tpc = (u_long)copy_fault; tf->tf_tnpc = tf->tf_tpc + 4; return (0); } } return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } /* Maximum number of arguments that can be passed via the out registers. */ #define REG_MAXARGS 6 /* * Syscall handler. The arguments to the syscall are passed in the o registers * by the caller, and are saved in the trap frame. The syscall number is passed * in %g1 (and also saved in the trap frame). */ void syscall(struct trapframe *tf) { struct sysent *callp; struct thread *td; register_t args[8]; register_t *argp; struct proc *p; u_int sticks; u_long code; u_long tpc; int reg; int regcnt; int narg; int error; td = PCPU_GET(curthread); KASSERT(td != NULL, ("trap: curthread NULL")); KASSERT(td->td_proc != NULL, ("trap: curproc NULL")); p = td->td_proc; atomic_add_int(&cnt.v_syscall, 1); narg = 0; error = 0; reg = 0; regcnt = REG_MAXARGS; sticks = td->td_sticks; td->td_frame = tf; if (td->td_ucred != p->p_ucred) cred_update_thread(td); - if (p->p_flag & P_THREADED) + if (p->p_flag & P_SA) thread_user_enter(p, td); code = tf->tf_global[1]; /* * For syscalls, we don't want to retry the faulting instruction * (usually), instead we need to advance one instruction. */ tpc = tf->tf_tpc; TF_DONE(tf); if (p->p_sysent->sv_prepsyscall) { /* * The prep code is MP aware. */ #if 0 (*p->p_sysent->sv_prepsyscall)(tf, args, &code, ¶ms); #endif } else if (code == SYS_syscall || code == SYS___syscall) { code = tf->tf_out[reg++]; regcnt--; } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; if (narg <= regcnt) { argp = &tf->tf_out[reg]; error = 0; } else { KASSERT(narg <= sizeof(args) / sizeof(args[0]), ("Too many syscall arguments!")); argp = args; bcopy(&tf->tf_out[reg], args, sizeof(args[0]) * regcnt); error = copyin((void *)(tf->tf_out[6] + SPOFF + offsetof(struct frame, fr_pad[6])), &args[regcnt], (narg - regcnt) * sizeof(args[0])); } CTR5(KTR_SYSC, "syscall: td=%p %s(%#lx, %#lx, %#lx)", td, syscallnames[code], argp[0], argp[1], argp[2]); /* * Try to run the syscall without the MP lock if the syscall * is MP safe. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSCALL)) ktrsyscall(code, narg, argp); #endif if (error == 0) { td->td_retval[0] = 0; td->td_retval[1] = 0; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(td, argp); CTR5(KTR_SYSC, "syscall: p=%p error=%d %s return %#lx %#lx ", p, error, syscallnames[code], td->td_retval[0], td->td_retval[1]); } /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: tf->tf_out[0] = td->td_retval[0]; tf->tf_out[1] = td->td_retval[1]; tf->tf_tstate &= ~TSTATE_XCC_C; break; case ERESTART: /* * Undo the tpc advancement we have done above, we want to * reexecute the system call. 
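Syscall argument gathering above takes up to REG_MAXARGS arguments from the out registers saved in the trap frame and copyin()s any remainder from the caller's stack. A stand-alone sketch of just that split, with memcpy standing in for the trap frame access and for copyin:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define REG_MAXARGS	6	/* args that fit in the out registers */

typedef uint64_t register_t;

/* Gather `narg' arguments: up to REG_MAXARGS from `regs', the rest from
   `stack', the same shape as the register/copyin split in syscall(). */
static void
gather_args(const register_t *regs, const register_t *stack, int narg,
    register_t *out)
{
	int fromregs = narg < REG_MAXARGS ? narg : REG_MAXARGS;

	memcpy(out, regs, fromregs * sizeof(register_t));
	if (narg > fromregs)
		memcpy(out + fromregs, stack,
		    (narg - fromregs) * sizeof(register_t));
}

int
main(void)
{
	register_t regs[REG_MAXARGS] = { 1, 2, 3, 4, 5, 6 };
	register_t stack[2] = { 7, 8 };
	register_t args[8];

	gather_args(regs, stack, 8, args);	/* an 8-argument call */
	assert(args[5] == 6 && args[6] == 7 && args[7] == 8);
	return (0);
}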
*/ tf->tf_tpc = tpc; tf->tf_tnpc -= 4; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } tf->tf_out[0] = error; tf->tf_tstate |= TSTATE_XCC_C; break; } /* * Release Giant if we had to get it. Don't use mtx_owned(), * we want to catch broken syscalls. */ if ((callp->sy_narg & SYF_MPSAFE) == 0) mtx_unlock(&Giant); /* * Handle reschedule and other end-of-syscall issues */ userret(td, tf, sticks); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(code, error, td->td_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); #ifdef DIAGNOSTIC cred_free_thread(td); #endif WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning", (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"); mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 116360) +++ head/sys/sys/proc.h (revision 116361) @@ -1,933 +1,933 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. 
*/ #ifndef _KERNEL #include #endif #include #include #include #include #include /* XXX */ #include #include #include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #else #include #endif #include #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. * * List of locks * (m) locked by s_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct session { int s_count; /* (m) Ref cnt; pgrps in session. */ struct proc *s_leader; /* (m + e) Session leader. */ struct vnode *s_ttyvp; /* (m) Vnode of controlling tty. */ struct tty *s_ttyp; /* (m) Controlling tty. */ pid_t s_sid; /* (c) Session ID. */ /* (m) Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; struct mtx s_mtx; /* Mutex to protect members */ }; /* * One structure allocated per process group. * * List of locks * (m) locked by pg_mtx mtx * (e) locked by proctree_lock sx * (c) const until freeing */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* (e) Hash chain. */ LIST_HEAD(, proc) pg_members; /* (m + e) Pointer to pgrp members. */ struct session *pg_session; /* (c) Pointer to session. */ struct sigiolst pg_sigiolst; /* (m) List of sigio sources. */ pid_t pg_id; /* (c) Pgrp id. */ int pg_jobc; /* (m) job cntl proc count */ struct mtx pg_mtx; /* Mutex to protect members */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[1]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. * * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * (exception aiods switch vmspaces, but they are also * marked 'P_SYSTEM' so hopefully it will be left alone) * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by sched_lock mtx * k - only accessed by curthread * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * o - ktrace lock * p - select lock (sellock) * r - p_peers lock * x - created at fork, only changes during single threading in exec * z - zombie threads/kse/ksegroup lock * * If the locking key specifies two identifiers (for example, p_pptr) then * either lock is sufficient for read access, but both locks must be held * for write access. */ struct ithd; struct ke_sched; struct kg_sched; struct nlminfo; struct p_sched; struct td_sched; struct trapframe; /* * Here we define the four structures used for process information. * * The first is the thread. It might be though of as a "Kernel * Schedulable Entity Context". 
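/*
 * A minimal sketch (not part of this change) of the "two identifiers"
 * rule in the locking key above: p_pptr is marked (c + e), so holding
 * either the proc mutex or proctree_lock is assumed to be enough to
 * read it, while a writer such as proc_reparent() is assumed to need
 * both.  The helper names are hypothetical; proctree_lock, initproc
 * and proc_reparent() are declared further down in this header, and
 * the usual sys/param.h, sys/lock.h, sys/sx.h, sys/mutex.h and
 * sys/proc.h includes are assumed.
 */
static pid_t
example_parent_pid(struct proc *p)
{
	pid_t ppid;

	PROC_LOCK(p);			/* (c) alone suffices for a read */
	ppid = (p->p_pptr != NULL) ? p->p_pptr->p_pid : 0;
	PROC_UNLOCK(p);
	return (ppid);
}

static void
example_reparent_to_init(struct proc *child)
{
	sx_xlock(&proctree_lock);	/* (e) ... */
	PROC_LOCK(child);		/* ... plus (c): both held to modify */
	proc_reparent(child, initproc);
	PROC_UNLOCK(child);
	sx_xunlock(&proctree_lock);
}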
* This structure contains all the information as to where a thread of * execution is now, or was when it was suspended, why it was suspended, * and anything else that will be needed to restart it when it is * rescheduled. Always associated with a KSE when running, but can be * reassigned to an equivalent KSE when being restarted for * load balancing. Each of these is associated with a kernel stack * and a pcb. * * It is important to remember that a particular thread structure only * exists as long as the system call or kernel entrance (e.g. by pagefault) * which it is currently executing. It should therefore NEVER be referenced * by pointers in long lived structures that live longer than a single * request. If several threads complete their work at the same time, * they will all rewind their stacks to the user boundary, report their * completion state, and all but one will be freed. That last one will * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel * entrance. (basically to save freeing and then re-allocating it) The KSE * keeps a cached thread available to allow it to quickly * get one when it needs a new one. There is also a system * cache of free threads. Threads have priority and partake in priority * inheritance schemes. */ struct thread; /* * The second structure is the Kernel Schedulable Entity. (KSE) * It represents the ability to take a slot in the scheduler queue. * As long as this is scheduled, it could continue to run any threads that * are assigned to the KSEGRP (see later) until either it runs out * of runnable threads of high enough priority, or CPU. * It runs on one CPU and is assigned a quantum of time. When a thread is * blocked, The KSE continues to run and will search for another thread * in a runnable state amongst those it has. It May decide to return to user * mode with a new 'empty' thread if there are no runnable threads. * Threads are temporarily associated with a KSE for scheduling reasons. */ struct kse; /* * The KSEGRP is allocated resources across a number of CPUs. * (Including a number of CPUxQUANTA. It parcels these QUANTA up among * its KSEs, each of which should be running in a different CPU. * BASE priority and total available quanta are properties of a KSEGRP. * Multiple KSEGRPs in a single process compete against each other * for total quanta in the same way that a forked child competes against * it's parent process. */ struct ksegrp; /* * A process is the owner of all system resources allocated to a task * except CPU quanta. * All KSEGs under one process see, and have the same access to, these * resources (e.g. files, memory, sockets, permissions kqueues). * A process may compete for CPU cycles on the same basis as a * forked process cluster by spawning several KSEGRPs. */ struct proc; /*************** * In pictures: With a single run queue used by all processors: RUNQ: --->KSE---KSE--... SLEEPQ:[]---THREAD---THREAD---THREAD | / []---THREAD KSEG---THREAD--THREAD--THREAD [] []---THREAD---THREAD (processors run THREADs from the KSEG until they are exhausted or the KSEG exhausts its quantum) With PER-CPU run queues: KSEs on the separate run queues directly They would be given priorities calculated from the KSEG. * *****************/ /* * Kernel runnable context (thread). * This is what is put to sleep and reactivated. * The first KSE available in the correct group will run this thread. * If several are available, use the one on the same CPU as last time. 
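/*
 * A minimal sketch of the proc -> ksegrp -> thread containment
 * described above, using the FOREACH_* iterators declared further down
 * in this header.  Purely illustrative (the helper name is made up);
 * the caller is assumed to hold the locks covering the fields printed.
 */
static void
example_dump_kse_hierarchy(struct proc *p)
{
	struct ksegrp *kg;
	struct thread *td;

	printf("pid %d (%s): %d thread(s), %d ksegrp(s)\n",
	    p->p_pid, p->p_comm, p->p_numthreads, p->p_numksegrps);
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		printf("  ksegrp %p: %d thread(s), %d KSE(s)\n",
		    (void *)kg, kg->kg_numthreads, kg->kg_kses);
		FOREACH_THREAD_IN_GROUP(kg, td)
			printf("    thread %p, state %d\n",
			    (void *)td, (int)td->td_state);
	}
}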
* When waiting to be run, threads are hung off the KSEGRP in priority order. * with N runnable and queued KSEs in the KSEGRP, the first N threads * are linked to them. Other threads are not yet assigned. */ struct thread { struct proc *td_proc; /* (*) Associated process. */ struct ksegrp *td_ksegrp; /* (*) Associated KSEG. */ TAILQ_ENTRY(thread) td_plist; /* (*) All threads in this proc */ TAILQ_ENTRY(thread) td_kglist; /* (*) All threads in this ksegrp */ /* The two queues below should someday be merged */ TAILQ_ENTRY(thread) td_slpq; /* (j) Sleep queue. XXXKSE */ TAILQ_ENTRY(thread) td_lockq; /* (j) Lock queue. XXXKSE */ TAILQ_ENTRY(thread) td_runq; /* (j/z) Run queue(s). XXXKSE */ TAILQ_HEAD(, selinfo) td_selq; /* (p) List of selinfos. */ /* Cleared during fork1() or thread_sched_upcall() */ #define td_startzero td_flags int td_flags; /* (j) TDF_* flags. */ int td_inhibitors; /* (j) Why can not run */ int td_pflags; /* (k) Private thread (TDP_*) flags. */ struct kse *td_last_kse; /* (j) Previous value of td_kse */ struct kse *td_kse; /* (j) Current KSE if running. */ int td_dupfd; /* (k) Ret value from fdopen. XXX */ void *td_wchan; /* (j) Sleep address. */ const char *td_wmesg; /* (j) Reason for sleep. */ u_char td_lastcpu; /* (j) Last cpu we were on. */ u_char td_oncpu; /* (j) Which cpu we are on. */ short td_locks; /* (k) DEBUG: lockmgr count of locks */ struct mtx *td_blocked; /* (j) Mutex process is blocked on. */ struct ithd *td_ithd; /* (b) For interrupt threads only. */ const char *td_lockname; /* (j) Name of lock blocked on. */ LIST_HEAD(, mtx) td_contested; /* (j) Contested locks. */ struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */ int td_intr_nesting_level; /* (k) Interrupt recursion. */ struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address */ struct ucred *td_ucred; /* (k) Reference to credentials. */ void (*td_switchin)(void); /* (k) Switchin special func. */ struct thread *td_standin; /* (*) Use this for an upcall */ u_int td_prticks; /* (*) Profclock hits in sys for user */ struct kse_upcall *td_upcall; /* (*) Upcall structure. */ u_int64_t td_sticks; /* (j) Statclock hits in system mode. */ u_int td_uuticks; /* (*) Statclock hits in user, for UTS */ u_int td_usticks; /* (*) Statclock hits in kernel, for UTS */ u_int td_critnest; /* (k) Critical section nest level. */ sigset_t td_oldsigmask; /* (k) Saved mask from pre sigpause. */ sigset_t td_sigmask; /* (c) Current signal mask. */ sigset_t td_siglist; /* (c) Sigs arrived, not delivered. */ TAILQ_ENTRY(thread) td_umtx; /* (c?) Link for when we're blocked. */ #define td_endzero td_base_pri /* Copied during fork1() or thread_sched_upcall() */ #define td_startcopy td_endzero u_char td_base_pri; /* (j) Thread base kernel priority. */ u_char td_priority; /* (j) Thread active priority. */ #define td_endcopy td_pcb /* * fields that must be manually set in fork1() or thread_sched_upcall() * or already have been set in the allocator, contstructor, etc.. */ struct pcb *td_pcb; /* (k) Kernel VA of pcb and kstack. */ enum { TDS_INACTIVE = 0x0, TDS_INHIBITED, TDS_CAN_RUN, TDS_RUNQ, TDS_RUNNING } td_state; register_t td_retval[2]; /* (k) Syscall aux returns. */ struct callout td_slpcallout; /* (h) Callout for sleep. */ struct trapframe *td_frame; /* (k) */ struct vm_object *td_kstack_obj;/* (a) Kstack object. */ vm_offset_t td_kstack; /* (a) Kernel VA of kstack. */ int td_kstack_pages; /* (a) Size of the kstack */ struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. 
*/ vm_offset_t td_altkstack; /* (a) Kernel VA of alternate kstack. */ int td_altkstack_pages; /* (a) Size of the alternate kstack */ struct mdthread td_md; /* (k) Any machine-dependent fields. */ struct td_sched *td_sched; /* (*) Scheduler specific data */ }; /* flags kept in td_flags */ #define TDF_INPANIC 0x000002 /* Caused a panic, let it drive crashdump. */ #define TDF_CAN_UNBIND 0x000004 /* Only temporarily bound. */ #define TDF_SINTR 0x000008 /* Sleep is interruptible. */ #define TDF_TIMEOUT 0x000010 /* Timing out during sleep. */ #define TDF_IDLETD 0x000020 /* This is one of the per-CPU idle threads */ #define TDF_SELECT 0x000040 /* Selecting; wakeup/waiting danger. */ #define TDF_CVWAITQ 0x000080 /* Thread is on a cv_waitq (not slpq). */ #define TDF_UPCALLING 0x000100 /* This thread is doing an upcall. */ #define TDF_ONSLEEPQ 0x000200 /* On the sleep queue. */ #define TDF_ASTPENDING 0x000800 /* Thread has some asynchronous events. */ #define TDF_TIMOFAIL 0x001000 /* Timeout from sleep after we were awake. */ #define TDF_INTERRUPT 0x002000 /* Thread is marked as interrupted. */ #define TDF_USTATCLOCK 0x004000 /* Stat clock hits in userland. */ #define TDF_OWEUPC 0x008000 /* Owe thread an addupc() call at next AST. */ #define TDF_NEEDRESCHED 0x010000 /* Thread needs to yield. */ #define TDF_NEEDSIGCHK 0x020000 /* Thread may need signal delivery. */ #define TDF_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ /* "private" flags kept in td_pflags */ #define TDP_OLDMASK 0x0001 /* Need to restore mask after suspend. */ #define TDP_INKTR 0x0002 /* Thread is currently in KTR code. */ #define TDP_INKTRACE 0x0004 /* Thread is currently in KTRACE code. */ #define TDI_SUSPENDED 0x0001 /* On suspension queue. */ #define TDI_SLEEPING 0x0002 /* Actually asleep! (tricky). */ #define TDI_SWAPPED 0x0004 /* Stack not in mem.. bad juju if run. */ #define TDI_LOCK 0x0008 /* Stopped on a lock. */ #define TDI_IWAIT 0x0010 /* Awaiting interrupt. 
*/ #define TD_CAN_UNBIND(td) \ (((td)->td_flags & TDF_CAN_UNBIND) == TDF_CAN_UNBIND && \ ((td)->td_upcall != NULL)) #define TD_IS_SLEEPING(td) ((td)->td_inhibitors & TDI_SLEEPING) #define TD_ON_SLEEPQ(td) ((td)->td_wchan != NULL) #define TD_IS_SUSPENDED(td) ((td)->td_inhibitors & TDI_SUSPENDED) #define TD_IS_SWAPPED(td) ((td)->td_inhibitors & TDI_SWAPPED) #define TD_ON_LOCK(td) ((td)->td_inhibitors & TDI_LOCK) #define TD_AWAITING_INTR(td) ((td)->td_inhibitors & TDI_IWAIT) #define TD_IS_RUNNING(td) ((td)->td_state == TDS_RUNNING) #define TD_ON_RUNQ(td) ((td)->td_state == TDS_RUNQ) #define TD_CAN_RUN(td) ((td)->td_state == TDS_CAN_RUN) #define TD_IS_INHIBITED(td) ((td)->td_state == TDS_INHIBITED) #define TD_SET_INHIB(td, inhib) do { \ (td)->td_state = TDS_INHIBITED; \ (td)->td_inhibitors |= (inhib); \ } while (0) #define TD_CLR_INHIB(td, inhib) do { \ if (((td)->td_inhibitors & (inhib)) && \ (((td)->td_inhibitors &= ~(inhib)) == 0)) \ (td)->td_state = TDS_CAN_RUN; \ } while (0) #define TD_SET_SLEEPING(td) TD_SET_INHIB((td), TDI_SLEEPING) #define TD_SET_SWAPPED(td) TD_SET_INHIB((td), TDI_SWAPPED) #define TD_SET_LOCK(td) TD_SET_INHIB((td), TDI_LOCK) #define TD_SET_SUSPENDED(td) TD_SET_INHIB((td), TDI_SUSPENDED) #define TD_SET_IWAIT(td) TD_SET_INHIB((td), TDI_IWAIT) #define TD_SET_EXITING(td) TD_SET_INHIB((td), TDI_EXITING) #define TD_CLR_SLEEPING(td) TD_CLR_INHIB((td), TDI_SLEEPING) #define TD_CLR_SWAPPED(td) TD_CLR_INHIB((td), TDI_SWAPPED) #define TD_CLR_LOCK(td) TD_CLR_INHIB((td), TDI_LOCK) #define TD_CLR_SUSPENDED(td) TD_CLR_INHIB((td), TDI_SUSPENDED) #define TD_CLR_IWAIT(td) TD_CLR_INHIB((td), TDI_IWAIT) #define TD_SET_RUNNING(td) do {(td)->td_state = TDS_RUNNING; } while (0) #define TD_SET_RUNQ(td) do {(td)->td_state = TDS_RUNQ; } while (0) #define TD_SET_CAN_RUN(td) do {(td)->td_state = TDS_CAN_RUN; } while (0) #define TD_SET_ON_SLEEPQ(td) do {(td)->td_flags |= TDF_ONSLEEPQ; } while (0) #define TD_CLR_ON_SLEEPQ(td) do { \ (td)->td_flags &= ~TDF_ONSLEEPQ; \ (td)->td_wchan = NULL; \ } while (0) /* * The schedulable entity that can be given a context to run. * A process may have several of these. Probably one per processor * but posibly a few more. In this universe they are grouped * with a KSEG that contains the priority and niceness * for the group. */ struct kse { struct proc *ke_proc; /* (*) Associated process. */ struct ksegrp *ke_ksegrp; /* (*) Associated KSEG. */ TAILQ_ENTRY(kse) ke_kglist; /* (*) Queue of KSEs in ke_ksegrp. */ TAILQ_ENTRY(kse) ke_kgrlist; /* (*) Queue of KSEs in this state. */ TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ #define ke_startzero ke_flags int ke_flags; /* (j) KEF_* flags. */ struct thread *ke_thread; /* (*) Active associated thread. */ fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ u_char ke_oncpu; /* (j) Which cpu we are on. */ char ke_rqindex; /* (j) Run queue index. */ enum { KES_UNUSED = 0x0, KES_IDLE, KES_ONRUNQ, KES_UNQUEUED, /* in transit */ KES_THREAD /* slaved to thread state */ } ke_state; /* (j) KSE status. */ #define ke_endzero ke_dummy u_char ke_dummy; struct ke_sched *ke_sched; /* (*) Scheduler specific data */ }; /* flags kept in ke_flags */ #define KEF_DIDRUN 0x02000 /* KSE actually ran. */ #define KEF_EXIT 0x04000 /* KSE is being killed. */ /* * The upcall management structure. * The upcall is used when returning to userland. If a thread does not have * an upcall on return to userland the thread exports its context and exits. */ struct kse_upcall { TAILQ_ENTRY(kse_upcall) ku_link; /* List of upcalls in KSEG. 
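/*
 * A minimal sketch of the inhibitor macros above: a blocking path sets
 * td_state to TDS_INHIBITED with TDI_SLEEPING in td_inhibitors, and
 * the wakeup side clears the inhibitor, dropping the thread back to
 * TDS_CAN_RUN once no other inhibitors remain.  The helpers are
 * hypothetical and assume sched_lock (a spin mutex in this tree)
 * covers the (j) fields touched; the real sleep path lives in msleep().
 */
static void
example_block_thread(struct thread *td, void *wchan, const char *wmesg)
{
	mtx_lock_spin(&sched_lock);
	td->td_wchan = wchan;		/* (j) sleep address */
	td->td_wmesg = wmesg;		/* (j) reason for sleep */
	TD_SET_SLEEPING(td);		/* TDS_INHIBITED, TDI_SLEEPING set */
	mtx_unlock_spin(&sched_lock);
}

static void
example_unblock_thread(struct thread *td)
{
	mtx_lock_spin(&sched_lock);
	td->td_wchan = NULL;
	TD_CLR_SLEEPING(td);		/* TDS_CAN_RUN if last inhibitor */
	if (TD_CAN_RUN(td))
		setrunqueue(td);	/* declared later in this header */
	mtx_unlock_spin(&sched_lock);
}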
*/ struct ksegrp *ku_ksegrp; /* Associated KSEG. */ struct thread *ku_owner; /* owning thread */ int ku_flags; /* KUF_* flags. */ struct kse_mailbox *ku_mailbox; /* userland mailbox address. */ stack_t ku_stack; /* userland upcall stack. */ void *ku_func; /* userland upcall function. */ unsigned int ku_mflags; /* cached upcall mailbox flags */ }; #define KUF_DOUPCALL 0x00001 /* Do upcall now, don't wait */ #define KUF_EXITING 0x00002 /* Upcall structure is exiting */ /* * Kernel-scheduled entity group (KSEG). The scheduler considers each KSEG to * be an indivisible unit from a time-sharing perspective, though each KSEG may * contain multiple KSEs. */ struct ksegrp { struct proc *kg_proc; /* (*) Process that contains this KSEG. */ TAILQ_ENTRY(ksegrp) kg_ksegrp; /* (*) Queue of KSEGs in kg_proc. */ TAILQ_HEAD(, kse) kg_kseq; /* (ke_kglist) All KSEs. */ TAILQ_HEAD(, kse) kg_iq; /* (ke_kgrlist) All idle KSEs. */ TAILQ_HEAD(, thread) kg_threads;/* (td_kglist) All threads. */ TAILQ_HEAD(, thread) kg_runq; /* (td_runq) waiting RUNNABLE threads */ TAILQ_HEAD(, thread) kg_slpq; /* (td_runq) NONRUNNABLE threads. */ TAILQ_HEAD(, kse_upcall) kg_upcalls; /* All upcalls in the group */ #define kg_startzero kg_estcpu u_int kg_estcpu; /* (j) Sum of the same field in KSEs. */ u_int kg_slptime; /* (j) How long completely blocked. */ struct thread *kg_last_assigned; /* (j) Last thread assigned to a KSE. */ int kg_runnable; /* (j) Num runnable threads on queue. */ int kg_runq_kses; /* (j) Num KSEs on runq. */ int kg_idle_kses; /* (j) Num KSEs on iq */ int kg_numupcalls; /* (j) Num upcalls */ int kg_upsleeps; /* (c) Num threads in kse_release() */ struct kse_thr_mailbox *kg_completed; /* (c) Completed thread mboxes. */ int kg_nextupcall; /* (*) Next upcall time */ int kg_upquantum; /* (*) Quantum to schedule an upcall */ #define kg_endzero kg_pri_class #define kg_startcopy kg_endzero u_char kg_pri_class; /* (j) Scheduling class. */ u_char kg_user_pri; /* (j) User pri from estcpu and nice. */ char kg_nice; /* (c + j) Process "nice" value. */ #define kg_endcopy kg_numthreads int kg_numthreads; /* (j) Num threads in total */ int kg_kses; /* (j) Num KSEs in group. */ struct kg_sched *kg_sched; /* (*) Scheduler specific data */ }; /* * The old fashionned process. May have multiple threads, KSEGRPs * and KSEs. Starts off with a single embedded KSEGRP, KSE and THREAD. */ struct proc { LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ TAILQ_HEAD(, ksegrp) p_ksegrps; /* (kg_ksegrp) All KSEGs. */ TAILQ_HEAD(, thread) p_threads; /* (td_plist) Threads. (shortcut) */ TAILQ_HEAD(, thread) p_suspended; /* (td_runq) Suspended threads. */ struct ucred *p_ucred; /* (c) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ struct filedesc_to_leader *p_fdtol; /* (b) Ptr to tracking node */ /* Accumulated stats for all KSEs? */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (c*) Process limits. */ struct vm_object *p_upages_obj; /* (a) Upages object. */ struct sigacts *p_sigacts; /* (x) Signal actions, state (CPU). */ /*struct ksegrp p_ksegrp; struct kse p_kse; */ /* * The following don't make too much sense.. * See the td_ or ke_ versions of the same flags */ int p_flag; /* (c) P_* flags. */ int p_sflag; /* (j) PS_* flags. */ enum { PRS_NEW = 0, /* In creation */ PRS_NORMAL, /* KSEs can be run */ PRS_ZOMBIE } p_state; /* (j/c) S* process status. */ pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. 
*/ LIST_ENTRY(proc) p_pglist; /* (g + e) List of processes in pgrp. */ struct proc *p_pptr; /* (c + e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ struct mtx p_mtx; /* (n) Lock for this struct. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c + e) Save ppid in ptrace. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ u_int p_swtime; /* (j) Time swapped in or out. */ struct itimerval p_realtimer; /* (c) Alarm timer. */ struct bintime p_runtime; /* (j) Real time. */ u_int64_t p_uu; /* (j) Previous user time in usec. */ u_int64_t p_su; /* (j) Previous system time in usec. */ u_int64_t p_iu; /* (j) Previous intr time in usec. */ u_int64_t p_uticks; /* (j) Statclock hits in user mode. */ u_int64_t p_sticks; /* (j) Statclock hits in system mode. */ u_int64_t p_iticks; /* (j) Statclock hits in intr. */ int p_profthreads; /* (c) Num threads in addupc_task */ int p_maxthrwaits; /* (c) Max threads num waiters */ int p_traceflag; /* (o) Kernel trace points. */ struct vnode *p_tracevp; /* (c + o) Trace to vnode. */ struct ucred *p_tracecred; /* (o) Credentials to trace with. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ sigset_t p_siglist; /* (c) Sigs not delivered to a td. */ char p_lock; /* (c) Proclock (prevent swap) count. */ struct klist p_klist; /* (c) Knotes attached to this proc. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ u_int p_stops; /* (c) Stop event bitmask. */ u_int p_stype; /* (c) Stop event type. */ char p_step; /* (c) Process is stopped. */ u_char p_pfsflags; /* (c) Procfs flags. */ struct nlminfo *p_nlminfo; /* (?) Only used by/for lockd. */ void *p_aioinfo; /* (?) ASYNC I/O info. */ struct thread *p_singlethread;/* (c + j) If single threading this is it */ int p_suspcount; /* (c) # threads in suspended mode */ /* End area that is zeroed on creation. */ #define p_endzero p_sigstk /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_endzero stack_t p_sigstk; /* (c) Stack ptr and on-stack flag. */ u_int p_magic; /* (b) Magic number. */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (c + e) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) Syscall dispatch info. */ struct pargs *p_args; /* (c) Process arguments. */ rlim_t p_cpulimit; /* (j) Current CPU limit in seconds. */ /* End area that is copied on creation. */ #define p_endcopy p_xstat u_short p_xstat; /* (c) Exit status; also stop sig. */ int p_numthreads; /* (j) Number of threads. */ int p_numksegrps; /* (?) number of ksegrps */ struct mdproc p_md; /* Any machine-dependent fields. */ struct callout p_itcallout; /* (h + c) Interval timer callout. */ struct user *p_uarea; /* (k) Kernel VA of u-area (CPU) */ u_short p_acflag; /* (c) Accounting flags. */ struct rusage *p_ru; /* (a) Exit information. XXX */ struct proc *p_peers; /* (r) */ struct proc *p_leader; /* (b) */ void *p_emuldata; /* (c) Emulator state data. 
*/ struct label p_label; /* (*) Process (not subject) MAC label */ struct p_sched *p_sched; /* (*) Scheduler specific data */ }; #define p_rlimit p_limit->pl_rlimit #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id #define NOCPU 0xff /* For when we aren't on a CPU. (SMP) */ /* Status values (p_stat). */ /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread. (*)*/ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_STOPPROF 0x00040 /* Has thread in requesting to stop prof */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_SINGLE_EXIT 0x00400 /* Threads suspending should exit, not wait */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Someone is waiting for us */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ -#define P_THREADED 0x08000 /* Process is using threads. */ +#define P_SA 0x08000 /* Using scheduler activations. */ #define P_CONTINUED 0x10000 /* Proc has continued from a stopped state. */ #define P_STOPPED_SIG 0x20000 /* Stopped due to SIGSTOP/SIGTSTP */ #define P_STOPPED_TRACE 0x40000 /* Stopped because of tracing */ #define P_STOPPED_SINGLE 0x80000 /* Only one thread can continue */ /* (not to user) */ #define P_PROTECTED 0x100000 /* Do not kill on memory overcommit. */ /* Should be moved to machine-dependent areas. */ #define P_COWINPROGRESS 0x400000 /* Snapshot copy-on-write in progress. */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_ALTSTACK 0x2000000 /* Have alternate signal stack. */ #define P_INEXEC 0x4000000 /* Process is in execve(). */ #define P_STOPPED (P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE) #define P_SHOULDSTOP(p) ((p)->p_flag & P_STOPPED) /* These flags are kept in p_sflag and are protected with sched_lock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_XCPU 0x00002 /* Exceeded CPU limit. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ #define PS_SWAPINREQ 0x00100 /* Swapin request due to wakeup. */ #define PS_SWAPPINGOUT 0x00200 /* Process is being swapped out. */ #define PS_SWAPPINGIN 0x04000 /* Process is being swapped in. */ #define PS_MACPEND 0x08000 /* Ast()-based MAC event pending. */ /* used only in legacy conversion code */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SLOCK 7 /* Blocked on a lock. 
*/ #define P_MAGIC 0xbeefface #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_PGRP); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif #define FOREACH_PROC_IN_SYSTEM(p) \ LIST_FOREACH((p), &allproc, p_list) #define FOREACH_KSEGRP_IN_PROC(p, kg) \ TAILQ_FOREACH((kg), &(p)->p_ksegrps, kg_ksegrp) #define FOREACH_THREAD_IN_GROUP(kg, td) \ TAILQ_FOREACH((td), &(kg)->kg_threads, td_kglist) #define FOREACH_KSE_IN_GROUP(kg, ke) \ TAILQ_FOREACH((ke), &(kg)->kg_kseq, ke_kglist) #define FOREACH_UPCALL_IN_GROUP(kg, ku) \ TAILQ_FOREACH((ku), &(kg)->kg_upcalls, ku_link) #define FOREACH_THREAD_IN_PROC(p, td) \ TAILQ_FOREACH((td), &(p)->p_threads, td_plist) /* XXXKSE the lines below should probably only be used in 1:1 code */ #define FIRST_THREAD_IN_PROC(p) TAILQ_FIRST(&p->p_threads) #define FIRST_KSEGRP_IN_PROC(p) TAILQ_FIRST(&p->p_ksegrps) #define FIRST_KSE_IN_KSEGRP(kg) TAILQ_FIRST(&kg->kg_kseq) #define FIRST_KSE_IN_PROC(p) FIRST_KSE_IN_KSEGRP(FIRST_KSEGRP_IN_PROC(p)) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } #define STOPEVENT(p, e, v) do { \ PROC_LOCK(p); \ _STOPEVENT((p), (e), (v)); \ PROC_UNLOCK(p); \ } while (0) #define _STOPEVENT(p, e, v) do { \ PROC_LOCK_ASSERT(p, MA_OWNED); \ if ((p)->p_stops & (e)) { \ stopevent((p), (e), (v)); \ } \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_TRYLOCK(p) mtx_trylock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) #define PROC_LOCKED(p) mtx_owned(&(p)->p_mtx) #define PROC_LOCK_ASSERT(p, type) mtx_assert(&(p)->p_mtx, (type)) /* Lock and unlock a process group. */ #define PGRP_LOCK(pg) mtx_lock(&(pg)->pg_mtx) #define PGRP_UNLOCK(pg) mtx_unlock(&(pg)->pg_mtx) #define PGRP_LOCKED(pg) mtx_owned(&(pg)->pg_mtx) #define PGRP_LOCK_ASSERT(pg, type) mtx_assert(&(pg)->pg_mtx, (type)) #define PGRP_LOCK_PGSIGNAL(pg) \ do { \ if ((pg) != NULL) \ PGRP_LOCK(pg); \ } while (0); #define PGRP_UNLOCK_PGSIGNAL(pg) \ do { \ if ((pg) != NULL) \ PGRP_UNLOCK(pg); \ } while (0); /* Lock and unlock a session. */ #define SESS_LOCK(s) mtx_lock(&(s)->s_mtx) #define SESS_UNLOCK(s) mtx_unlock(&(s)->s_mtx) #define SESS_LOCKED(s) mtx_owned(&(s)->s_mtx) #define SESS_LOCK_ASSERT(s, type) mtx_assert(&(s)->s_mtx, (type)) /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ _PHOLD(p); \ PROC_UNLOCK(p); \ } while (0) #define _PHOLD(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (p)->p_lock++; \ if (((p)->p_sflag & PS_INMEM) == 0) \ faultin((p)); \ } while (0) #define PRELE(p) do { \ PROC_LOCK((p)); \ _PRELE((p)); \ PROC_UNLOCK((p)); \ } while (0) #define _PRELE(p) do { \ PROC_LOCK_ASSERT((p), MA_OWNED); \ (--(p)->p_lock); \ } while (0) /* Check whether a thread is safe to be swapped out. */ #define thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td)) /* Lock and unlock process arguments. 
*/ #define PARGS_LOCK(p) mtx_lock(&pargs_ref_lock) #define PARGS_UNLOCK(p) mtx_unlock(&pargs_ref_lock) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct sx allproc_lock; extern struct sx proctree_lock; extern struct mtx pargs_ref_lock; extern struct mtx ppeers_lock; extern struct proc proc0; /* Process slot for swapper. */ extern struct thread thread0; /* Primary thread in proc0 */ extern struct ksegrp ksegrp0; /* Primary ksegrp in proc0 */ extern struct kse kse0; /* Primary kse in proc0 */ extern struct vmspace vmspace0; /* VM space for proc0. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; extern int ps_argsopen; extern int ps_showallprocs; extern int sched_quantum; /* Scheduling quantum in ticks. */ LIST_HEAD(proclist, proc); TAILQ_HEAD(procqueue, proc); TAILQ_HEAD(threadqueue, thread); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */ extern struct proc *updateproc; /* Process slot for syncer (sic). */ extern struct uma_zone *proc_zone; extern int lastpid; struct proc *pfind(pid_t); /* Find process by id. */ struct pgrp *pgfind(pid_t); /* Find process group by id. */ struct proc *zpfind(pid_t); /* Find zombie process by id. */ void adjustrunqueue(struct thread *, int newpri); void ast(struct trapframe *framep); struct thread *choosethread(void); int cr_cansignal(struct ucred *cred, struct proc *proc, int signum); int enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp, struct session *sess); int enterthispgrp(struct proc *p, struct pgrp *pgrp); void faultin(struct proc *p); void fixjobc(struct proc *p, struct pgrp *pgrp, int entering); int fork1(struct thread *, int, int, struct proc **); void fork_exit(void (*)(void *, struct trapframe *), void *, struct trapframe *); void fork_return(struct thread *, struct trapframe *); int inferior(struct proc *p); int leavepgrp(struct proc *p); void mi_switch(void); int p_candebug(struct thread *td, struct proc *p); int p_cansee(struct thread *td, struct proc *p); int p_cansched(struct thread *td, struct proc *p); int p_cansignal(struct thread *td, struct proc *p, int signum); struct pargs *pargs_alloc(int len); void pargs_drop(struct pargs *pa); void pargs_free(struct pargs *pa); void pargs_hold(struct pargs *pa); void procinit(void); void threadinit(void); void proc_linkup(struct proc *p, struct ksegrp *kg, struct kse *ke, struct thread *td); void proc_reparent(struct proc *child, struct proc *newparent); int securelevel_ge(struct ucred *cr, int level); int securelevel_gt(struct ucred *cr, int level); void setrunnable(struct thread *); void setrunqueue(struct thread *); void setsugid(struct proc *p); int sigonstack(size_t sp); void sleepinit(void); void stopevent(struct proc *, u_int, u_int); void cpu_idle(void); #if !defined(__alpha__) && !defined(__powerpc__) void cpu_switch(struct thread *old, struct thread *new); void cpu_throw(struct thread *old, struct thread *new) __dead2; #else void cpu_switch(void); void cpu_throw(void) __dead2; #endif void unsleep(struct thread *); void userret(struct thread *, struct 
trapframe *, u_int); void cpu_exit(struct thread *); void cpu_sched_exit(struct thread *); void exit1(struct thread *, int) __dead2; void cpu_fork(struct thread *, struct proc *, struct thread *, int); void cpu_set_fork_handler(struct thread *, void (*)(void *), void *); /* New in KSE. */ struct ksegrp *ksegrp_alloc(void); void ksegrp_free(struct ksegrp *kg); void ksegrp_stash(struct ksegrp *kg); struct kse *kse_alloc(void); void kse_free(struct kse *ke); void kse_stash(struct kse *ke); void cpu_set_upcall(struct thread *td, struct thread *td0); void cpu_set_upcall_kse(struct thread *td, struct kse_upcall *ku); void cpu_thread_clean(struct thread *); void cpu_thread_exit(struct thread *); void cpu_thread_setup(struct thread *td); void kse_reassign(struct kse *ke); void kse_link(struct kse *ke, struct ksegrp *kg); void kse_unlink(struct kse *ke); void ksegrp_link(struct ksegrp *kg, struct proc *p); void ksegrp_unlink(struct ksegrp *kg); void thread_signal_add(struct thread *td, int sig); void thread_signal_upcall(struct thread *td); struct thread *thread_alloc(void); void thread_exit(void) __dead2; int thread_export_context(struct thread *td); void thread_free(struct thread *td); void thread_link(struct thread *td, struct ksegrp *kg); void thread_reap(void); struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku); int thread_single(int how); #define SINGLE_NO_EXIT 0 /* values for 'how' */ #define SINGLE_EXIT 1 void thread_single_end(void); void thread_stash(struct thread *td); int thread_suspend_check(int how); void thread_suspend_one(struct thread *td); void thread_unlink(struct thread *td); void thread_unsuspend(struct proc *p); void thread_unsuspend_one(struct thread *td); int thread_userret(struct thread *td, struct trapframe *frame); void thread_user_enter(struct proc *p, struct thread *td); void thread_wait(struct proc *p); int thread_statclock(int user); struct kse_upcall *upcall_alloc(void); void upcall_free(struct kse_upcall *ku); void upcall_link(struct kse_upcall *ku, struct ksegrp *kg); void upcall_unlink(struct kse_upcall *ku); void upcall_remove(struct thread *td); void upcall_stash(struct kse_upcall *ke); void thread_sanity_check(struct thread *td, char *); void thread_stopped(struct proc *p); void thread_switchout(struct thread *td); void thr_exit1(void); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */
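/*
 * A minimal sketch of walking the global process list declared above:
 * allproc is protected by the allproc_lock sx lock, and the (c) fields
 * of each process want that process's own mutex while it is examined.
 * Hypothetical debugging helper only.
 */
static void
example_list_processes(void)
{
	struct proc *p;

	sx_slock(&allproc_lock);
	FOREACH_PROC_IN_SYSTEM(p) {
		PROC_LOCK(p);
		printf("pid %5d  %s\n", p->p_pid, p->p_comm);
		PROC_UNLOCK(p);
	}
	sx_sunlock(&allproc_lock);
}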