Index: head/sys/alpha/alpha/trap.c =================================================================== --- head/sys/alpha/alpha/trap.c (revision 72375) +++ head/sys/alpha/alpha/trap.c (revision 72376) @@ -1,1153 +1,1152 @@ /* $FreeBSD$ */ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ /* #include "opt_fix_unaligned_vax_fp.h" */ #include "opt_ddb.h" #include "opt_ktrace.h" #include "opt_simos.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif unsigned long Sfloat_to_reg __P((unsigned int)); unsigned int reg_to_Sfloat __P((unsigned long)); unsigned long Tfloat_reg_cvt __P((unsigned long)); #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg __P((unsigned int)); unsigned int reg_to_Ffloat __P((unsigned long)); unsigned long Gfloat_reg_cvt __P((unsigned long)); #endif int unaligned_fixup __P((unsigned long, unsigned long, unsigned long, struct proc *)); static void printtrap __P((const unsigned long, const unsigned long, const unsigned long, const unsigned long, struct trapframe *, int, int)); #ifdef WITNESS extern char *syscallnames[]; #endif void alpha_clear_resched(void); void alpha_clear_resched(void) { clear_resched(); } /* * Define the code needed before returning to user mode, for * trap and syscall. */ void userret(p, frame, oticks) register struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; /* take pending signals */ while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, a clock interrupt could * change our priority without changing run queues * (the running process is not kept on a run queue). * If this happened after we setrunqueue ourselves but * before we switch()'ed, we might not be on the queue * indicated by our priority. 
*/ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * If profiling, charge recent system time to the trapped pc. */ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, frame->tf_regs[FRAME_PC], (int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } static void printtrap(a0, a1, a2, entry, framep, isfatal, user) const unsigned long a0, a1, a2, entry; struct trapframe *framep; int isfatal, user; { char ubuf[64]; const char *entryname; switch (entry) { case ALPHA_KENTRY_INT: entryname = "interrupt"; break; case ALPHA_KENTRY_ARITH: entryname = "arithmetic trap"; break; case ALPHA_KENTRY_MM: entryname = "memory management fault"; break; case ALPHA_KENTRY_IF: entryname = "instruction fault"; break; case ALPHA_KENTRY_UNA: entryname = "unaligned access fault"; break; case ALPHA_KENTRY_SYS: entryname = "system call"; break; default: snprintf(ubuf, sizeof(ubuf), "type %lx", entry); entryname = (const char *) ubuf; break; } printf("\n"); printf("%s %s trap:\n", isfatal? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" trap entry = 0x%lx (%s)\n", entry, entryname); printf(" a0 = 0x%lx\n", a0); printf(" a1 = 0x%lx\n", a1); printf(" a2 = 0x%lx\n", a2); printf(" pc = 0x%lx\n", framep->tf_regs[FRAME_PC]); printf(" ra = 0x%lx\n", framep->tf_regs[FRAME_RA]); printf(" curproc = %p\n", curproc); if (curproc != NULL) printf(" pid = %d, comm = %s\n", curproc->p_pid, curproc->p_comm); printf("\n"); } /* * Trap is called from locore to handle most types of processor traps. 
* System calls are broken out for efficiency and ASTs are broken out * to make the code a bit cleaner and more representative of the * Alpha architecture. */ /*ARGSUSED*/ void trap(a0, a1, a2, entry, framep) const unsigned long a0, a1, a2, entry; struct trapframe *framep; { register struct proc *p; register int i; u_int64_t ucode; u_quad_t sticks; int user; /* * Find our per-cpu globals. */ globalp = (struct globaldata *) alpha_pal_rdval(); cnt.v_trap++; p = curproc; ucode = 0; user = (framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) != 0; if (user) { mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_tf = framep; #if 0 /* This is to catch some weird stuff on the UDB (mj) */ if (framep->tf_regs[FRAME_PC] > 0 && framep->tf_regs[FRAME_PC] < 0x120000000) { printf("PC Out of Whack\n"); printtrap(a0, a1, a2, entry, framep, 1, user); } #endif } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ } #ifdef DIAGNOSTIC if (user) alpha_fpstate_check(p); #endif switch (entry) { case ALPHA_KENTRY_UNA: /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). */ if (user) { mtx_lock(&Giant); if ((i = unaligned_fixup(a0, a1, a2, p)) == 0) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); ucode = a0; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; case ALPHA_KENTRY_ARITH: /* * If user-land, give a SIGFPE if software completion * is not requested or if the completion fails. 
*/ if (user) { mtx_lock(&Giant); if (a0 & EXCSUM_SWC) if (fp_software_completion(a1, p)) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); i = SIGFPE; ucode = a0; /* exception summary */ break; } /* Always fatal in kernel. Should never happen. */ goto dopanic; case ALPHA_KENTRY_IF: /* * These are always fatal in kernel, and should never * happen. */ if (!user) { #ifdef DDB /* * ...unless, of course, DDB is configured; BUGCHK * is used to invoke the kernel debugger, and we * might have set a breakpoint. */ if (a0 == ALPHA_IF_CODE_BUGCHK || a0 == ALPHA_IF_CODE_BPT #ifdef SIMOS || a0 == ALPHA_IF_CODE_GENTRAP #endif ) { if (kdb_trap(a0, a1, a2, entry, framep)) goto out; } /* * If we get here, DDB did _not_ handle the * trap, and we need to PANIC! */ #endif goto dopanic; } i = 0; switch (a0) { case ALPHA_IF_CODE_GENTRAP: if (framep->tf_regs[FRAME_A0] == -2) { /* weird! */ i = SIGFPE; ucode = a0; /* exception summary */ break; } /* FALLTHROUTH */ case ALPHA_IF_CODE_BPT: case ALPHA_IF_CODE_BUGCHK: if (p->p_md.md_flags & (MDP_STEP1|MDP_STEP2)) { ptrace_clear_single_step(p); p->p_md.md_tf->tf_regs[FRAME_PC] -= 4; } ucode = a0; /* trap type */ i = SIGTRAP; break; case ALPHA_IF_CODE_OPDEC: ucode = a0; /* trap type */ i = SIGILL; break; case ALPHA_IF_CODE_FEN: /* * on exit from the kernel, if proc == fpcurproc, * FP is enabled. */ if (PCPU_GET(fpcurproc) == p) { printf("trap: fp disabled for fpcurproc == %p", p); goto dopanic; } alpha_fpstate_switch(p); goto out; default: printf("trap: unknown IF type 0x%lx\n", a0); goto dopanic; } break; case ALPHA_KENTRY_MM: switch (a1) { case ALPHA_MMCSR_FOR: case ALPHA_MMCSR_FOE: pmap_emulate_reference(p, a0, user, 0); goto out; case ALPHA_MMCSR_FOW: pmap_emulate_reference(p, a0, user, 1); goto out; case ALPHA_MMCSR_INVALTRANS: case ALPHA_MMCSR_ACCESS: { register vm_offset_t va; register struct vmspace *vm = NULL; register vm_map_t map; vm_prot_t ftype = 0; int rv; /* * If it was caused by fuswintr or suswintr, * just punt. 
Note that we check the faulting * address against the address accessed by * [fs]uswintr, in case another fault happens * when they are running. */ if (!user && p != NULL && p->p_addr->u_pcb.pcb_onfault == (unsigned long)fswintrberr && p->p_addr->u_pcb.pcb_accessaddr == a0) { framep->tf_regs[FRAME_PC] = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; goto out; } mtx_lock(&Giant); /* * It is only a kernel address space fault iff: * 1. !user and * 2. pcb_onfault not set or * 3. pcb_onfault set but kernel space data fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. * * For the purposes of the Linux emulator, we allow * kernel accesses to a small region of the * user stack which the emulator uses to * translate syscall arguments. */ if (!user && ((a0 >= VM_MIN_KERNEL_ADDRESS) || (p == NULL) || (p->p_addr->u_pcb.pcb_onfault == 0))) { if (a0 >= trunc_page(PS_STRINGS - szsigcode - SPARE_USRSPACE) && a0 < round_page(PS_STRINGS - szsigcode)) { vm = p->p_vmspace; map = &vm->vm_map; } else { map = kernel_map; } } else { vm = p->p_vmspace; map = &vm->vm_map; } switch (a2) { case -1: /* instruction fetch fault */ case 0: /* load instruction */ ftype = VM_PROT_READ; break; case 1: /* store instruction */ ftype = VM_PROT_WRITE; break; #ifdef DIAGNOSTIC default: /* XXX gcc -Wuninitialized */ goto dopanic; #endif } va = trunc_page((vm_offset_t)a0); if (map != kernel_map) { /* * Keep swapout from messing with us * during thiscritical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? 
VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process * locking or stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } nogo:; /* * If this was a stack access we keep track of the * maximum accessed stack size. Also, if vm_fault * gets a protection failure it is due to accessing * the stack region outside the current limit and * we need to reflect that as an access error. */ if (map != kernel_map && (caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (rv == KERN_SUCCESS) { unsigned nss; nss = alpha_btop(round_page(USRSTACK - va)); if (nss > vm->vm_ssize) vm->vm_ssize = nss; } else if (rv == KERN_PROTECTION_FAILURE) rv = KERN_INVALID_ADDRESS; } if (rv == KERN_SUCCESS) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); if (!user) { /* Check for copyin/copyout fault */ if (p != NULL && p->p_addr->u_pcb.pcb_onfault != 0) { framep->tf_regs[FRAME_PC] = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; goto out; } goto dopanic; } ucode = a0; i = SIGSEGV; #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif break; } default: printf("trap: unknown MMCSR value 0x%lx\n", a1); goto dopanic; } break; default: goto dopanic; } #ifdef DEBUG printtrap(a0, a1, a2, entry, framep, 1, user); #endif framep->tf_regs[FRAME_TRAPARG_A0] = a0; framep->tf_regs[FRAME_TRAPARG_A1] = a1; framep->tf_regs[FRAME_TRAPARG_A2] = a2; trapsignal(p, i, ucode); out: if (user) { framep->tf_regs[FRAME_SP] = alpha_pal_rdusp(); userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } return; dopanic: printtrap(a0, a1, a2, entry, framep, 1, user); /* XXX dump registers */ #ifdef DDB kdb_trap(a0, a1, a2, entry, framep); #endif panic("trap"); } /* * Process a system call. * * System calls are strange beasts. They are passed the syscall number * in v0, and the arguments in the registers (as normal). 
They return * an error flag in a3 (if a3 != 0 on return, the syscall had an error), * and the return value (if any) in v0. * * The assembly stub takes care of moving the call number into a register * we can get to, and moves all of the argument registers into their places * in the trap frame. On return, it restores the callee-saved registers, * a3, and v0 from the frame before returning to the user process. */ void syscall(code, framep) u_int64_t code; struct trapframe *framep; { struct sysent *callp; struct proc *p; int error = 0; u_int64_t opc; u_quad_t sticks; u_int64_t args[10]; /* XXX */ u_int hidden = 0, nargs; /* * Find our per-cpu globals. */ globalp = (struct globaldata *) alpha_pal_rdval(); mtx_lock(&Giant); framep->tf_regs[FRAME_TRAPARG_A0] = 0; framep->tf_regs[FRAME_TRAPARG_A1] = 0; framep->tf_regs[FRAME_TRAPARG_A2] = 0; #if notdef /* can't happen, ever. */ if ((framep->tf_regs[FRAME_PS] & ALPHA_PSL_USERMODE) == 0) panic("syscall"); #endif cnt.v_syscall++; p = curproc; p->p_md.md_tf = framep; opc = framep->tf_regs[FRAME_PC] - 4; mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); #ifdef DIAGNOSTIC alpha_fpstate_check(p); #endif if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the alpha, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. 
*/ code = framep->tf_regs[FRAME_A0]; hidden = 1; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; nargs = (callp->sy_narg & SYF_ARGMASK) + hidden; switch (nargs) { default: if (nargs > 10) /* XXX */ panic("syscall: too many args (%d)", nargs); error = copyin((caddr_t)(alpha_pal_rdusp()), &args[6], (nargs - 6) * sizeof(u_int64_t)); case 6: args[5] = framep->tf_regs[FRAME_A5]; case 5: args[4] = framep->tf_regs[FRAME_A4]; case 4: args[3] = framep->tf_regs[FRAME_A3]; case 3: args[2] = framep->tf_regs[FRAME_A2]; case 2: args[1] = framep->tf_regs[FRAME_A1]; case 1: args[0] = framep->tf_regs[FRAME_A0]; case 0: break; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, (callp->sy_narg & SYF_ARGMASK), args + hidden); #endif if (error == 0) { p->p_retval[0] = 0; p->p_retval[1] = 0; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(p, args + hidden); } switch (error) { case 0: framep->tf_regs[FRAME_V0] = p->p_retval[0]; framep->tf_regs[FRAME_A4] = p->p_retval[1]; framep->tf_regs[FRAME_A3] = 0; break; case ERESTART: framep->tf_regs[FRAME_PC] = opc; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_regs[FRAME_V0] = error; framep->tf_regs[FRAME_A3] = 1; break; } userret(p, framep, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, p->p_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } /* * Process an asynchronous software trap. * This is relatively easy. */ void ast(framep) struct trapframe *framep; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; p->p_md.md_tf = framep; astoff(); cnt.v_soft++; mtx_intr_enable(&sched_lock); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } /* * Unaligned access handler. It's not clear that this can get much slower... * */ const static int reg_to_framereg[32] = { FRAME_V0, FRAME_T0, FRAME_T1, FRAME_T2, FRAME_T3, FRAME_T4, FRAME_T5, FRAME_T6, FRAME_T7, FRAME_S0, FRAME_S1, FRAME_S2, FRAME_S3, FRAME_S4, FRAME_S5, FRAME_S6, FRAME_A0, FRAME_A1, FRAME_A2, FRAME_A3, FRAME_A4, FRAME_A5, FRAME_T8, FRAME_T9, FRAME_T10, FRAME_T11, FRAME_RA, FRAME_T12, FRAME_AT, FRAME_GP, FRAME_SP, -1, }; #define irp(p, reg) \ ((reg_to_framereg[(reg)] == -1) ? 
NULL : \ &(p)->p_md.md_tf->tf_regs[reg_to_framereg[(reg)]]) #define frp(p, reg) \ (&(p)->p_addr->u_pcb.pcb_fp.fpr_regs[(reg)]) #define unaligned_load(storage, ptrf, mod) \ if (copyin((caddr_t)va, &(storage), sizeof (storage)) == 0 && \ (regptr = ptrf(p, reg)) != NULL) \ signal = 0; \ else \ break; \ *regptr = mod (storage); #define unaligned_store(storage, ptrf, mod) \ if ((regptr = ptrf(p, reg)) == NULL) \ break; \ (storage) = mod (*regptr); \ if (copyout(&(storage), (caddr_t)va, sizeof (storage)) == 0) \ signal = 0; \ else \ break; #define unaligned_load_integer(storage) \ unaligned_load(storage, irp, ) #define unaligned_store_integer(storage) \ unaligned_store(storage, irp, ) #define unaligned_load_floating(storage, mod) \ alpha_fpstate_save(p, 1); \ unaligned_load(storage, frp, mod) #define unaligned_store_floating(storage, mod) \ alpha_fpstate_save(p, 0); \ unaligned_store(storage, frp, mod) unsigned long Sfloat_to_reg(s) unsigned int s; { unsigned long sign, expn, frac; unsigned long result; sign = (s & 0x80000000) >> 31; expn = (s & 0x7f800000) >> 23; frac = (s & 0x007fffff) >> 0; /* map exponent part, as appropriate. */ if (expn == 0xff) expn = 0x7ff; else if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frac << 29); return (result); } unsigned int reg_to_Sfloat(r) unsigned long r; { unsigned long sign, expn, frac; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frac = (r & 0x000fffffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 31) | (expn << 23) | (frac << 0); return (result); } /* * Conversion of T floating datums to and from register format * requires no bit reordering whatsoever. 
*/ unsigned long Tfloat_reg_cvt(input) unsigned long input; { return (input); } #ifdef FIX_UNALIGNED_VAX_FP unsigned long Ffloat_to_reg(f) unsigned int f; { unsigned long sign, expn, frlo, frhi; unsigned long result; sign = (f & 0x00008000) >> 15; expn = (f & 0x00007f80) >> 7; frhi = (f & 0x0000007f) >> 0; frlo = (f & 0xffff0000) >> 16; /* map exponent part, as appropriate. */ if ((expn & 0x80) != 0) expn = (0x400 | (expn & ~0x80)); else if ((expn & 0x80) == 0 && expn != 0) expn = (0x380 | (expn & ~0x80)); result = (sign << 63) | (expn << 52) | (frhi << 45) | (frlo << 29); return (result); } unsigned int reg_to_Ffloat(r) unsigned long r; { unsigned long sign, expn, frhi, frlo; unsigned int result; sign = (r & 0x8000000000000000) >> 63; expn = (r & 0x7ff0000000000000) >> 52; frhi = (r & 0x000fe00000000000) >> 45; frlo = (r & 0x00001fffe0000000) >> 29; /* map exponent part, as appropriate. */ expn = (expn & 0x7f) | ((expn & 0x400) != 0 ? 0x80 : 0x00); result = (sign << 15) | (expn << 7) | (frhi << 0) | (frlo << 16); return (result); } /* * Conversion of G floating datums to and from register format is * symmetrical. Just swap shorts in the quad... 
*/ unsigned long Gfloat_reg_cvt(input) unsigned long input; { unsigned long a, b, c, d; unsigned long result; a = (input & 0x000000000000ffff) >> 0; b = (input & 0x00000000ffff0000) >> 16; c = (input & 0x0000ffff00000000) >> 32; d = (input & 0xffff000000000000) >> 48; result = (a << 48) | (b << 32) | (c << 16) | (d << 0); return (result); } #endif /* FIX_UNALIGNED_VAX_FP */ extern int alpha_unaligned_print, alpha_unaligned_fix; extern int alpha_unaligned_sigbus; int unaligned_fixup(va, opcode, reg, p) unsigned long va, opcode, reg; struct proc *p; { int doprint, dofix, dosigbus; int signal, size; const char *type; unsigned long *regptr, longdata, uac; int intdata; /* signed to get extension when storing */ struct { const char *type; /* opcode name */ int size; /* size, 0 if fixup not supported */ } tab[0x10] = { #ifdef FIX_UNALIGNED_VAX_FP { "ldf", 4 }, { "ldg", 8 }, #else { "ldf", 0 }, { "ldg", 0 }, #endif { "lds", 4 }, { "ldt", 8 }, #ifdef FIX_UNALIGNED_VAX_FP { "stf", 4 }, { "stg", 8 }, #else { "stf", 0 }, { "stg", 0 }, #endif { "sts", 4 }, { "stt", 8 }, { "ldl", 4 }, { "ldq", 8 }, { "ldl_l", 0 }, { "ldq_l", 0 }, /* can't fix */ { "stl", 4 }, { "stq", 8 }, { "stl_c", 0 }, { "stq_c", 0 }, /* can't fix */ }; /* * Figure out what actions to take. * */ if (p) uac = p->p_md.md_flags & MDP_UAC_MASK; else uac = 0; doprint = alpha_unaligned_print && !(uac & MDP_UAC_NOPRINT); dofix = alpha_unaligned_fix && !(uac & MDP_UAC_NOFIX); dosigbus = alpha_unaligned_sigbus | (uac & MDP_UAC_SIGBUS); /* * Find out which opcode it is. Arrange to have the opcode * printed if it's an unknown opcode. */ if (opcode >= 0x20 && opcode <= 0x2f) { type = tab[opcode - 0x20].type; size = tab[opcode - 0x20].size; } else { type = "0x%lx"; size = 0; } /* * See if the user can access the memory in question. * Even if it's an unknown opcode, SEGV if the access * should have failed. */ if (!useracc((caddr_t)va, size ? 
size : 1, VM_PROT_WRITE)) { signal = SIGSEGV; goto out; } /* * If we're supposed to be noisy, squawk now. */ if (doprint) { uprintf( "pid %d (%s): unaligned access: va=0x%lx pc=0x%lx ra=0x%lx op=", p->p_pid, p->p_comm, va, p->p_md.md_tf->tf_regs[FRAME_PC], p->p_md.md_tf->tf_regs[FRAME_RA]); uprintf(type,opcode); uprintf("\n"); } /* * If we should try to fix it and know how, give it a shot. * * We never allow bad data to be unknowingly used by the * user process. That is, if we decide not to fix up an * access we cause a SIGBUS rather than letting the user * process go on without warning. * * If we're trying to do a fixup, we assume that things * will be botched. If everything works out OK, * unaligned_{load,store}_* clears the signal flag. */ signal = SIGBUS; if (dofix && size != 0) { switch (opcode) { #ifdef FIX_UNALIGNED_VAX_FP case 0x20: /* ldf */ unaligned_load_floating(intdata, Ffloat_to_reg); break; case 0x21: /* ldg */ unaligned_load_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x22: /* lds */ unaligned_load_floating(intdata, Sfloat_to_reg); break; case 0x23: /* ldt */ unaligned_load_floating(longdata, Tfloat_reg_cvt); break; #ifdef FIX_UNALIGNED_VAX_FP case 0x24: /* stf */ unaligned_store_floating(intdata, reg_to_Ffloat); break; case 0x25: /* stg */ unaligned_store_floating(longdata, Gfloat_reg_cvt); break; #endif case 0x26: /* sts */ unaligned_store_floating(intdata, reg_to_Sfloat); break; case 0x27: /* stt */ unaligned_store_floating(longdata, Tfloat_reg_cvt); break; case 0x28: /* ldl */ unaligned_load_integer(intdata); break; case 0x29: /* ldq */ unaligned_load_integer(longdata); break; case 0x2c: /* stl */ unaligned_store_integer(intdata); break; case 0x2d: /* stq */ unaligned_store_integer(longdata); break; #ifdef DIAGNOSTIC default: panic("unaligned_fixup: can't get here"); #endif } } /* * Force SIGBUS if requested. 
*/ if (dosigbus) signal = SIGBUS; out: return (signal); } Index: head/sys/amd64/amd64/cpu_switch.S =================================================================== --- head/sys/amd64/amd64/cpu_switch.S (revision 72375) +++ head/sys/amd64/amd64/cpu_switch.S (revision 72376) @@ -1,393 +1,379 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_user_ldt.h" -#include - #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .data .globl _panic #if defined(SWTCH_OPTIM_STATS) .globl _swtch_optim_stats, _tlb_flush_count _swtch_optim_stats: .long 0 /* number of _swtch_optims */ _tlb_flush_count: .long 0 #endif .text /* * cpu_throw() */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl PCPU(CURPROC),%ecx /* if no process to save, don't bother */ testl %ecx,%ecx jz sw1 -#ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ -#endif /* SMP */ + movl P_VMSPACE(%ecx), %edx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif /* SMP */ btrl %eax, VM_PMAP+PM_ACTIVE(%edx) movl P_ADDR(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000ff00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* save sched_lock recursion count */ movl _sched_lock+MTX_RECURSECNT,%eax movl %eax,PCB_SCHEDNEST(%edx) #ifdef SMP /* XXX FIXME: we should be saving the local APIC TPR */ #endif /* SMP */ #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(NPXPROC) jne 1f addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ pushl %edx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* DEV_NPX */ /* save is done, now choose a new process */ sw1: #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,PCPU(CPUID) je 1f movl PCPU(IDLEPROC), %eax jmp sw1b 1: #endif /* * Choose a new process to schedule. chooseproc() returns idleproc * if it cannot find another process to run. */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ #ifdef INVARIANTS testl %eax,%eax /* no process? 
*/ jz badsw3 /* no, panic */ #endif sw1b: movl %eax,%ecx #ifdef INVARIANTS cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl P_ADDR(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) incl _swtch_optim_stats #endif /* switch address space */ movl %cr3,%ebx cmpl PCB_CR3(%edx),%ebx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif movl PCB_CR3(%edx),%ebx movl %ebx,%cr3 4: -#ifdef SMP movl PCPU(CPUID), %esi -#else - xorl %esi, %esi -#endif cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: /* update common_tss.tss_esp0 pointer */ movl %edx, %ebx /* pcb */ addl $(UPAGES * PAGE_SIZE - 16), %ebx movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 btrl %esi, _private_tss jae 3f PCPU_ADDR(COMMON_TSSD, %edi) 2: /* move correct tss descriptor into GDT slot, then reload tr */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: movl P_VMSPACE(%ecx), %ebx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, _lapic+LA_TPR #else andl $~APIC_TPR_PRIO, _lapic+LA_TPR #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ +#endif /* SMP */ movl PCPU(CPUID),%eax movb %al, P_ONCPU(%ecx) -#endif /* SMP */ + movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURPROC) /* into next process */ #ifdef SMP /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt __default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: 
pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs /* test if debug regisers should be restored */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl PCB_DR6(%edx),%eax /* yes, do the restore */ movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl PCB_DR7(%edx),%eax movl %eax,%dr7 1: /* * restore sched_lock recursion count and transfer ownership to * new process */ movl PCB_SCHEDNEST(%edx),%eax movl %eax,_sched_lock+MTX_RECURSECNT movl PCPU(CURPROC),%eax movl %eax,_sched_lock+MTX_LOCK ret CROSSJUMPTARGET(sw1a) #ifdef INVARIANTS badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" badsw3: pushl $sw0_3 call _panic sw0_3: .asciz "cpu_switch: chooseproc returned NULL" #endif /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %gs,PCB_GS(%ecx) #ifdef DEV_NPX /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl PCPU(NPXPROC),%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* DEV_NPX */ 1: ret Index: head/sys/amd64/amd64/genassym.c =================================================================== --- head/sys/amd64/amd64/genassym.c (revision 72375) +++ head/sys/amd64/amd64/genassym.c (revision 72376) @@ -1,222 +1,220 @@ /*- * Copyright (c) 1982, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD$ */ #include "opt_user_ldt.h" #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_INTR_NESTING_LEVEL, offsetof(struct proc, p_intr_nesting_level)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(P_STAT, offsetof(struct proc, p_stat)); ASSYM(P_WCHAN, offsetof(struct proc, p_wchan)); ASSYM(PS_ASTPENDING, PS_ASTPENDING); ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED); -#ifdef SMP ASSYM(P_ONCPU, offsetof(struct proc, p_oncpu)); ASSYM(P_LASTCPU, offsetof(struct proc, p_lastcpu)); -#endif ASSYM(SSLEEP, SSLEEP); ASSYM(SRUN, SRUN); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); ASSYM(UPAGES, UPAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(PDESIZE, PDESIZE); ASSYM(PTESIZE, PTESIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); 
ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); #ifdef USER_LDT ASSYM(PCB_USERLDT, offsetof(struct pcb, pcb_ldt)); #endif ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP ASSYM(PCB_SIZE, sizeof(struct pcb)); #endif ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); ASSYM(UC_EFLAGS, 
offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_PRVSPACE, offsetof(struct globaldata, gd_prvspace)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); /* XXX */ #ifdef KTR_PERCPU ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); ASSYM(GD_KTR_BUF, offsetof(struct globaldata, gd_ktr_buf)); ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); #endif -#ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); +#ifdef SMP ASSYM(LA_VER, offsetof(struct LAPIC, version)); 
ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(MTX_SAVEINTR, offsetof(struct mtx, mtx_saveintr)); Index: head/sys/amd64/amd64/swtch.s =================================================================== --- head/sys/amd64/amd64/swtch.s (revision 72375) +++ head/sys/amd64/amd64/swtch.s (revision 72376) @@ -1,393 +1,379 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_user_ldt.h" -#include - #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .data .globl _panic #if defined(SWTCH_OPTIM_STATS) .globl _swtch_optim_stats, _tlb_flush_count _swtch_optim_stats: .long 0 /* number of _swtch_optims */ _tlb_flush_count: .long 0 #endif .text /* * cpu_throw() */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl PCPU(CURPROC),%ecx /* if no process to save, don't bother */ testl %ecx,%ecx jz sw1 -#ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ -#endif /* SMP */ + movl P_VMSPACE(%ecx), %edx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif /* SMP */ btrl %eax, VM_PMAP+PM_ACTIVE(%edx) movl P_ADDR(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000ff00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* save sched_lock recursion count */ movl _sched_lock+MTX_RECURSECNT,%eax movl %eax,PCB_SCHEDNEST(%edx) #ifdef SMP /* XXX FIXME: we should be saving the local APIC TPR */ #endif /* SMP */ #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(NPXPROC) jne 1f addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ pushl %edx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* DEV_NPX */ /* save is done, now choose a new process */ sw1: #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,PCPU(CPUID) je 1f movl PCPU(IDLEPROC), %eax jmp sw1b 1: #endif /* * Choose a new process to schedule. chooseproc() returns idleproc * if it cannot find another process to run. */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ #ifdef INVARIANTS testl %eax,%eax /* no process? 
*/ jz badsw3 /* no, panic */ #endif sw1b: movl %eax,%ecx #ifdef INVARIANTS cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl P_ADDR(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) incl _swtch_optim_stats #endif /* switch address space */ movl %cr3,%ebx cmpl PCB_CR3(%edx),%ebx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif movl PCB_CR3(%edx),%ebx movl %ebx,%cr3 4: -#ifdef SMP movl PCPU(CPUID), %esi -#else - xorl %esi, %esi -#endif cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: /* update common_tss.tss_esp0 pointer */ movl %edx, %ebx /* pcb */ addl $(UPAGES * PAGE_SIZE - 16), %ebx movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 btrl %esi, _private_tss jae 3f PCPU_ADDR(COMMON_TSSD, %edi) 2: /* move correct tss descriptor into GDT slot, then reload tr */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: movl P_VMSPACE(%ecx), %ebx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, _lapic+LA_TPR #else andl $~APIC_TPR_PRIO, _lapic+LA_TPR #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ +#endif /* SMP */ movl PCPU(CPUID),%eax movb %al, P_ONCPU(%ecx) -#endif /* SMP */ + movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURPROC) /* into next process */ #ifdef SMP /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt __default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: 
pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs /* test if debug regisers should be restored */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl PCB_DR6(%edx),%eax /* yes, do the restore */ movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl PCB_DR7(%edx),%eax movl %eax,%dr7 1: /* * restore sched_lock recursion count and transfer ownership to * new process */ movl PCB_SCHEDNEST(%edx),%eax movl %eax,_sched_lock+MTX_RECURSECNT movl PCPU(CURPROC),%eax movl %eax,_sched_lock+MTX_LOCK ret CROSSJUMPTARGET(sw1a) #ifdef INVARIANTS badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" badsw3: pushl $sw0_3 call _panic sw0_3: .asciz "cpu_switch: chooseproc returned NULL" #endif /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %gs,PCB_GS(%ecx) #ifdef DEV_NPX /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl PCPU(NPXPROC),%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* DEV_NPX */ 1: ret Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 72375) +++ head/sys/amd64/amd64/trap.c (revision 72376) @@ -1,1328 +1,1327 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace 
trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * Charge system time if profiling. 
*/ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); /* XXX - do we need Giant? */ if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, TRAPF_PC(frame), (u_int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. XXX This is really bad if we trap * while holding a spin lock. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We should walk p_heldmtx here and see if any are * spin mutexes, and not do this if so. 
*/ enable_intr(); } } eva = 0; #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif type = frame.tf_trapno; code = frame.tf_err; if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i == 0) goto user; break; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. */ eva = rcr2(); enable_intr(); mtx_lock(&Giant); i = trap_pfault(&frame, TRUE, eva); mtx_unlock(&Giant); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * f00f hack workaround has triggered, treat * as illegal instruction not page fault. 
*/ frame.tf_trapno = T_PRIVINFLT; goto restart; } #endif if (i == -1) goto out; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto out; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); mtx_lock(&Giant); (void) trap_pfault(&frame, FALSE, eva); mtx_unlock(&Giant); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (in_vm86call) break; if (p->p_intr_nesting_level != 0) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; mtx_lock(&Giant); psignal(p, SIGBUS); mtx_unlock(&Giant); goto out; } /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
*/ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. 
* Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } mtx_lock(&Giant); trap_fatal(&frame, eva); mtx_unlock(&Giant); goto out; } mtx_lock(&Giant); /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(p, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif mtx_unlock(&Giant); user: userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); out: return; } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. 
*/ static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; va = trunc_page(eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t mpte; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (p->p_intr_nesting_level != 0 || PCPU_GET(curpcb) == NULL || PCPU_GET(curpcb)->pcb_onfault == NULL))) { trap_fatal(frame, eva); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. 
*/ rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. 
*/ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. 
*/
void
dblfault_handler()
{
	/*
	 * Dump the register state saved in the double-fault TSS and
	 * panic; there is no recovery once the CPU could not even
	 * push a trap frame.
	 */
	printf("\nFatal double fault:\n");
	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
	/* two separate prints in case of a trap on an unmapped page */
	printf("cpuid = %d; ", PCPU_GET(cpuid));
	printf("lapic.id = %08x\n", lapic.id);
#endif
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 *
 * Returns 0 if the write fault at 'addr' was resolved, 1 on failure
 * (address out of range, stack growth failed, or vm_fault() error).
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	/* Bump p_lock to keep swapout from messing with us (cf. trap_pfault()). */
	PROC_LOCK(p);
	++p->p_lock;
	PROC_UNLOCK(p);

	/* Grow the stack if the fault falls in a growable stack region. */
	if (!grow_stack (p, va)) {
		PROC_LOCK(p);
		--p->p_lock;
		PROC_UNLOCK(p);
		return (1);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);

	PROC_LOCK(p);
	--p->p_lock;
	PROC_UNLOCK(p);

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * syscall -	MP aware system call request C handler
 *
 * A system call is essentially treated as a trap except that the
 * MP lock is not held on entry or return.  We are responsible for
 * obtaining the MP lock if necessary and for handling ASTs
 * (e.g. a task switch) prior to return.
 *
 * In general, only simple access and manipulation of curproc and
 * the current stack is allowed without having to hold MP lock.
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int narg; int args[8]; u_int code; atomic_add_int(&cnt.v_syscall, 1); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); panic("syscall"); /* NOT REACHED */ } #endif mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is not MP aware. */ mtx_lock(&Giant); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); mtx_unlock(&Giant); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin is MP aware, but the tracing code is not */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); #endif goto bad; } /* * Try to run the syscall without the MP lock if the syscall * is MP safe. 
We have to obtain the MP lock no matter what if * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsyscall(p->p_tracep, code, narg, args); } #endif p->p_retval[0] = 0; p->p_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(p, args); /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: frame.tf_eax = p->p_retval[0]; frame.tf_edx = p->p_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); /* * Release Giant if we had to get it */ if (mtx_owned(&Giant)) mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } void ast(frame) struct trapframe frame; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(&frame), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; astoff(); mtx_intr_enable(&sched_lock); atomic_add_int(&cnt.v_soft, 1); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } Index: head/sys/dev/acpica/Osd/OsdSchedule.c =================================================================== --- head/sys/dev/acpica/Osd/OsdSchedule.c (revision 72375) +++ head/sys/dev/acpica/Osd/OsdSchedule.c (revision 72376) @@ -1,150 +1,150 @@ /*- * Copyright (c) 2000 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * 6.3 : Scheduling services */ #include "acpi.h" #include #include #include #include #define _COMPONENT OS_DEPENDENT MODULE_NAME("SCHEDULE") /* * This is a little complicated due to the fact that we need to build and then * free a 'struct task' for each task we enqueue. * * We use the default taskqueue_swi queue, since it really doesn't matter what * else we're queued along with. 
*/

MALLOC_DEFINE(M_ACPITASK, "acpitask", "ACPI deferred task");

static void	AcpiOsExecuteQueue(void *arg, int pending);

/*
 * Wrapper that embeds a taskqueue(9) 'struct task' (it must come
 * first so the whole object can be cast to struct task *) together
 * with the ACPI callback and its argument.
 */
struct acpi_task {
    struct task			at_task;	/* must be first member */
    OSD_EXECUTION_CALLBACK	at_function;	/* ACPI callback to run */
    void			*at_context;	/* argument for callback */
};

/*
 * Queue (Function, Context) for deferred execution on the SWI
 * taskqueue.  Allocates one acpi_task per request with M_NOWAIT
 * (may be called from interrupt context); the wrapper is freed by
 * AcpiOsExecuteQueue() when the task runs.
 *
 * Returns AE_OK, AE_BAD_PARAMETER (NULL Function or unknown
 * Priority) or AE_NO_MEMORY.
 */
ACPI_STATUS
AcpiOsQueueForExecution(UINT32 Priority, OSD_EXECUTION_CALLBACK Function, void *Context)
{
    struct acpi_task	*at;

    FUNCTION_TRACE(__FUNCTION__);

    if (Function == NULL)
	return_ACPI_STATUS(AE_BAD_PARAMETER);

    at = malloc(sizeof(*at), M_ACPITASK, M_NOWAIT);	/* Interrupt Context */
    if (at == NULL)
	return_ACPI_STATUS(AE_NO_MEMORY);
    bzero(at, sizeof(*at));

    at->at_function = Function;
    at->at_context = Context;
    at->at_task.ta_func = AcpiOsExecuteQueue;
    at->at_task.ta_context = at;

    /* Map the ACPI priority classes onto taskqueue priorities 4..1. */
    switch (Priority) {
    case OSD_PRIORITY_GPE:
	at->at_task.ta_priority = 4;
	break;
    case OSD_PRIORITY_HIGH:
	at->at_task.ta_priority = 3;
	break;
    case OSD_PRIORITY_MED:
	at->at_task.ta_priority = 2;
	break;
    case OSD_PRIORITY_LO:
	at->at_task.ta_priority = 1;
	break;
    default:
	free(at, M_ACPITASK);
	return_ACPI_STATUS(AE_BAD_PARAMETER);
    }

    taskqueue_enqueue(taskqueue_swi, (struct task *)at);
    return_ACPI_STATUS(AE_OK);
}

/*
 * Taskqueue trampoline: unpack the acpi_task, free the wrapper
 * (only the saved function/context are needed past this point),
 * then invoke the ACPI callback.
 */
static void
AcpiOsExecuteQueue(void *arg, int pending)
{
    struct acpi_task		*at = (struct acpi_task *)arg;
    OSD_EXECUTION_CALLBACK	Function;
    void			*Context;

    FUNCTION_TRACE(__FUNCTION__);

    Function = (OSD_EXECUTION_CALLBACK)at->at_function;
    Context = at->at_context;

    free(at, M_ACPITASK);

    Function(Context);

    return_VOID;
}

/*
 * We don't have any sleep granularity better than hz, so
 * make do with that.
*/
void
AcpiOsSleep (UINT32 Seconds, UINT32 Milliseconds)
{
    int		timo;

    FUNCTION_TRACE(__FUNCTION__);

    /*
     * NOTE(review): the millisecond term looks wrong -- dividing by
     * (1000 * hz) shrinks as hz grows, so sub-second sleeps nearly
     * always collapse to the 1-tick minimum below.  Converting ms to
     * ticks is conventionally Milliseconds * hz / 1000; confirm
     * against tsleep(9) before relying on this timing.
     */
    timo = (Seconds * hz) + Milliseconds / (1000 * hz);
    /* Never pass 0: tsleep(timo == 0) would sleep forever. */
    if (timo == 0)
	timo = 1;
-    tsleep(NULL, 0, "acpislp", timo);
+    tsleep(NULL, PZERO, "acpislp", timo);
    return_VOID;
}

/*
 * Sleep for Microseconds; busy-wait with DELAY() when the request is
 * too short (<= 1ms) to be worth a context switch.
 */
void
AcpiOsSleepUsec (UINT32 Microseconds)
{
    FUNCTION_TRACE(__FUNCTION__);

    if (Microseconds > 1000) {	/* long enough to be worth the overhead of sleeping */
	AcpiOsSleep(0, Microseconds / 1000);
    } else {
	DELAY(Microseconds);
    }
    return_VOID;
}
Index: head/sys/i386/i386/genassym.c
===================================================================
--- head/sys/i386/i386/genassym.c	(revision 72375)
+++ head/sys/i386/i386/genassym.c	(revision 72376)
@@ -1,222 +1,220 @@
/*-
 * Copyright (c) 1982, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)genassym.c 5.11 (Berkeley) 5/10/91 * $FreeBSD$ */ #include "opt_user_ldt.h" #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #ifdef KTR_PERCPU #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace)); ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap)); ASSYM(PM_ACTIVE, offsetof(struct pmap, pm_active)); ASSYM(P_ADDR, offsetof(struct proc, p_addr)); ASSYM(P_INTR_NESTING_LEVEL, offsetof(struct proc, p_intr_nesting_level)); ASSYM(P_SFLAG, offsetof(struct proc, p_sflag)); ASSYM(P_STAT, offsetof(struct proc, p_stat)); ASSYM(P_WCHAN, offsetof(struct proc, p_wchan)); ASSYM(PS_ASTPENDING, PS_ASTPENDING); ASSYM(PS_NEEDRESCHED, PS_NEEDRESCHED); -#ifdef SMP ASSYM(P_ONCPU, offsetof(struct proc, p_oncpu)); ASSYM(P_LASTCPU, offsetof(struct proc, p_lastcpu)); -#endif ASSYM(SSLEEP, SSLEEP); ASSYM(SRUN, SRUN); ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap)); ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall)); ASSYM(V_INTR, offsetof(struct vmmeter, v_intr)); 
ASSYM(UPAGES, UPAGES); ASSYM(PAGE_SIZE, PAGE_SIZE); ASSYM(NPTEPG, NPTEPG); ASSYM(NPDEPG, NPDEPG); ASSYM(PDESIZE, PDESIZE); ASSYM(PTESIZE, PTESIZE); ASSYM(PAGE_SHIFT, PAGE_SHIFT); ASSYM(PAGE_MASK, PAGE_MASK); ASSYM(PDRSHIFT, PDRSHIFT); ASSYM(USRSTACK, USRSTACK); ASSYM(VM_MAXUSER_ADDRESS, VM_MAXUSER_ADDRESS); ASSYM(KERNBASE, KERNBASE); ASSYM(MCLBYTES, MCLBYTES); ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3)); ASSYM(PCB_EDI, offsetof(struct pcb, pcb_edi)); ASSYM(PCB_ESI, offsetof(struct pcb, pcb_esi)); ASSYM(PCB_EBP, offsetof(struct pcb, pcb_ebp)); ASSYM(PCB_ESP, offsetof(struct pcb, pcb_esp)); ASSYM(PCB_EBX, offsetof(struct pcb, pcb_ebx)); ASSYM(PCB_EIP, offsetof(struct pcb, pcb_eip)); ASSYM(TSS_ESP0, offsetof(struct i386tss, tss_esp0)); #ifdef USER_LDT ASSYM(PCB_USERLDT, offsetof(struct pcb, pcb_ldt)); #endif ASSYM(PCB_GS, offsetof(struct pcb, pcb_gs)); ASSYM(PCB_DR0, offsetof(struct pcb, pcb_dr0)); ASSYM(PCB_DR1, offsetof(struct pcb, pcb_dr1)); ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2)); ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3)); ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6)); ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7)); ASSYM(PCB_DBREGS, PCB_DBREGS); ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext)); ASSYM(PCB_SCHEDNEST, offsetof(struct pcb, pcb_schednest)); ASSYM(PCB_SPARE, offsetof(struct pcb, __pcb_spare)); ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags)); ASSYM(PCB_SAVEFPU, offsetof(struct pcb, pcb_savefpu)); ASSYM(PCB_SAVEFPU_SIZE, sizeof(struct save87)); ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault)); #ifdef SMP ASSYM(PCB_SIZE, sizeof(struct pcb)); #endif ASSYM(TF_TRAPNO, offsetof(struct trapframe, tf_trapno)); ASSYM(TF_ERR, offsetof(struct trapframe, tf_err)); ASSYM(TF_CS, offsetof(struct trapframe, tf_cs)); ASSYM(TF_EFLAGS, offsetof(struct trapframe, tf_eflags)); ASSYM(SIGF_HANDLER, offsetof(struct sigframe, sf_ahu.sf_handler)); ASSYM(SIGF_SC, offsetof(struct osigframe, sf_siginfo.si_sc)); ASSYM(SIGF_UC, offsetof(struct sigframe, sf_uc)); 
ASSYM(SC_PS, offsetof(struct osigcontext, sc_ps)); ASSYM(SC_FS, offsetof(struct osigcontext, sc_fs)); ASSYM(SC_GS, offsetof(struct osigcontext, sc_gs)); ASSYM(SC_TRAPNO, offsetof(struct osigcontext, sc_trapno)); ASSYM(UC_EFLAGS, offsetof(ucontext_t, uc_mcontext.mc_eflags)); ASSYM(UC_GS, offsetof(ucontext_t, uc_mcontext.mc_gs)); ASSYM(ENOENT, ENOENT); ASSYM(EFAULT, EFAULT); ASSYM(ENAMETOOLONG, ENAMETOOLONG); ASSYM(MAXPATHLEN, MAXPATHLEN); ASSYM(BOOTINFO_SIZE, sizeof(struct bootinfo)); ASSYM(BI_VERSION, offsetof(struct bootinfo, bi_version)); ASSYM(BI_KERNELNAME, offsetof(struct bootinfo, bi_kernelname)); ASSYM(BI_NFS_DISKLESS, offsetof(struct bootinfo, bi_nfs_diskless)); ASSYM(BI_ENDCOMMON, offsetof(struct bootinfo, bi_endcommon)); ASSYM(NFSDISKLESS_SIZE, sizeof(struct nfs_diskless)); ASSYM(BI_SIZE, offsetof(struct bootinfo, bi_size)); ASSYM(BI_SYMTAB, offsetof(struct bootinfo, bi_symtab)); ASSYM(BI_ESYMTAB, offsetof(struct bootinfo, bi_esymtab)); ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend)); ASSYM(GD_SIZEOF, sizeof(struct globaldata)); ASSYM(GD_PRVSPACE, offsetof(struct globaldata, gd_prvspace)); ASSYM(GD_CURPROC, offsetof(struct globaldata, gd_curproc)); ASSYM(GD_NPXPROC, offsetof(struct globaldata, gd_npxproc)); ASSYM(GD_IDLEPROC, offsetof(struct globaldata, gd_idleproc)); ASSYM(GD_CURPCB, offsetof(struct globaldata, gd_curpcb)); ASSYM(GD_COMMON_TSS, offsetof(struct globaldata, gd_common_tss)); ASSYM(GD_SWITCHTIME, offsetof(struct globaldata, gd_switchtime)); ASSYM(GD_SWITCHTICKS, offsetof(struct globaldata, gd_switchticks)); ASSYM(GD_COMMON_TSSD, offsetof(struct globaldata, gd_common_tssd)); ASSYM(GD_TSS_GDT, offsetof(struct globaldata, gd_tss_gdt)); #ifdef USER_LDT ASSYM(GD_CURRENTLDT, offsetof(struct globaldata, gd_currentldt)); #endif ASSYM(GD_WITNESS_SPIN_CHECK, offsetof(struct globaldata, gd_witness_spin_check)); /* XXX */ #ifdef KTR_PERCPU ASSYM(GD_KTR_IDX, offsetof(struct globaldata, gd_ktr_idx)); ASSYM(GD_KTR_BUF, offsetof(struct globaldata, 
gd_ktr_buf)); ASSYM(GD_KTR_BUF_DATA, offsetof(struct globaldata, gd_ktr_buf_data)); #endif -#ifdef SMP ASSYM(GD_CPUID, offsetof(struct globaldata, gd_cpuid)); +#ifdef SMP ASSYM(LA_VER, offsetof(struct LAPIC, version)); ASSYM(LA_TPR, offsetof(struct LAPIC, tpr)); ASSYM(LA_EOI, offsetof(struct LAPIC, eoi)); ASSYM(LA_SVR, offsetof(struct LAPIC, svr)); ASSYM(LA_ICR_LO, offsetof(struct LAPIC, icr_lo)); ASSYM(LA_ICR_HI, offsetof(struct LAPIC, icr_hi)); #endif ASSYM(KCSEL, GSEL(GCODE_SEL, SEL_KPL)); ASSYM(KDSEL, GSEL(GDATA_SEL, SEL_KPL)); ASSYM(KPSEL, GSEL(GPRIV_SEL, SEL_KPL)); ASSYM(BC32SEL, GSEL(GBIOSCODE32_SEL, SEL_KPL)); ASSYM(GPROC0_SEL, GPROC0_SEL); ASSYM(VM86_FRAMESIZE, sizeof(struct vm86frame)); ASSYM(MTX_LOCK, offsetof(struct mtx, mtx_lock)); ASSYM(MTX_RECURSECNT, offsetof(struct mtx, mtx_recurse)); ASSYM(MTX_SAVEINTR, offsetof(struct mtx, mtx_saveintr)); Index: head/sys/i386/i386/swtch.s =================================================================== --- head/sys/i386/i386/swtch.s (revision 72375) +++ head/sys/i386/i386/swtch.s (revision 72376) @@ -1,393 +1,379 @@ /*- * Copyright (c) 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_npx.h" #include "opt_user_ldt.h" -#include - #include #include #ifdef SMP #include #include #include /** GRAB_LOPRIO */ #endif /* SMP */ #include "assym.s" /*****************************************************************************/ /* Scheduling */ /*****************************************************************************/ .data .globl _panic #if defined(SWTCH_OPTIM_STATS) .globl _swtch_optim_stats, _tlb_flush_count _swtch_optim_stats: .long 0 /* number of _swtch_optims */ _tlb_flush_count: .long 0 #endif .text /* * cpu_throw() */ ENTRY(cpu_throw) jmp sw1 /* * cpu_switch() */ ENTRY(cpu_switch) /* switch to new process. 
first, save context as needed */ movl PCPU(CURPROC),%ecx /* if no process to save, don't bother */ testl %ecx,%ecx jz sw1 -#ifdef SMP movb P_ONCPU(%ecx), %al /* save "last" cpu */ movb %al, P_LASTCPU(%ecx) movb $0xff, P_ONCPU(%ecx) /* "leave" the cpu */ -#endif /* SMP */ + movl P_VMSPACE(%ecx), %edx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif /* SMP */ btrl %eax, VM_PMAP+PM_ACTIVE(%edx) movl P_ADDR(%ecx),%edx movl (%esp),%eax /* Hardware registers */ movl %eax,PCB_EIP(%edx) movl %ebx,PCB_EBX(%edx) movl %esp,PCB_ESP(%edx) movl %ebp,PCB_EBP(%edx) movl %esi,PCB_ESI(%edx) movl %edi,PCB_EDI(%edx) movl %gs,PCB_GS(%edx) /* test if debug registers should be saved */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl %dr7,%eax /* yes, do the save */ movl %eax,PCB_DR7(%edx) andl $0x0000ff00, %eax /* disable all watchpoints */ movl %eax,%dr7 movl %dr6,%eax movl %eax,PCB_DR6(%edx) movl %dr3,%eax movl %eax,PCB_DR3(%edx) movl %dr2,%eax movl %eax,PCB_DR2(%edx) movl %dr1,%eax movl %eax,PCB_DR1(%edx) movl %dr0,%eax movl %eax,PCB_DR0(%edx) 1: /* save sched_lock recursion count */ movl _sched_lock+MTX_RECURSECNT,%eax movl %eax,PCB_SCHEDNEST(%edx) #ifdef SMP /* XXX FIXME: we should be saving the local APIC TPR */ #endif /* SMP */ #ifdef DEV_NPX /* have we used fp, and need a save? */ cmpl %ecx,PCPU(NPXPROC) jne 1f addl $PCB_SAVEFPU,%edx /* h/w bugs make saving complicated */ pushl %edx call _npxsave /* do it in a big C function */ popl %eax 1: #endif /* DEV_NPX */ /* save is done, now choose a new process */ sw1: #ifdef SMP /* Stop scheduling if smp_active goes zero and we are not BSP */ cmpl $0,_smp_active jne 1f cmpl $0,PCPU(CPUID) je 1f movl PCPU(IDLEPROC), %eax jmp sw1b 1: #endif /* * Choose a new process to schedule. chooseproc() returns idleproc * if it cannot find another process to run. */ sw1a: call _chooseproc /* trash ecx, edx, ret eax*/ #ifdef INVARIANTS testl %eax,%eax /* no process? 
*/ jz badsw3 /* no, panic */ #endif sw1b: movl %eax,%ecx #ifdef INVARIANTS cmpb $SRUN,P_STAT(%ecx) jne badsw2 #endif movl P_ADDR(%ecx),%edx #if defined(SWTCH_OPTIM_STATS) incl _swtch_optim_stats #endif /* switch address space */ movl %cr3,%ebx cmpl PCB_CR3(%edx),%ebx je 4f #if defined(SWTCH_OPTIM_STATS) decl _swtch_optim_stats incl _tlb_flush_count #endif movl PCB_CR3(%edx),%ebx movl %ebx,%cr3 4: -#ifdef SMP movl PCPU(CPUID), %esi -#else - xorl %esi, %esi -#endif cmpl $0, PCB_EXT(%edx) /* has pcb extension? */ je 1f btsl %esi, _private_tss /* mark use of private tss */ movl PCB_EXT(%edx), %edi /* new tss descriptor */ jmp 2f 1: /* update common_tss.tss_esp0 pointer */ movl %edx, %ebx /* pcb */ addl $(UPAGES * PAGE_SIZE - 16), %ebx movl %ebx, PCPU(COMMON_TSS) + TSS_ESP0 btrl %esi, _private_tss jae 3f PCPU_ADDR(COMMON_TSSD, %edi) 2: /* move correct tss descriptor into GDT slot, then reload tr */ movl PCPU(TSS_GDT), %ebx /* entry in GDT */ movl 0(%edi), %eax movl %eax, 0(%ebx) movl 4(%edi), %eax movl %eax, 4(%ebx) movl $GPROC0_SEL*8, %esi /* GSEL(entry, SEL_KPL) */ ltr %si 3: movl P_VMSPACE(%ecx), %ebx -#ifdef SMP movl PCPU(CPUID), %eax -#else - xorl %eax, %eax -#endif btsl %eax, VM_PMAP+PM_ACTIVE(%ebx) /* restore context */ movl PCB_EBX(%edx),%ebx movl PCB_ESP(%edx),%esp movl PCB_EBP(%edx),%ebp movl PCB_ESI(%edx),%esi movl PCB_EDI(%edx),%edi movl PCB_EIP(%edx),%eax movl %eax,(%esp) #ifdef SMP #ifdef GRAB_LOPRIO /* hold LOPRIO for INTs */ #ifdef CHEAP_TPR movl $0, _lapic+LA_TPR #else andl $~APIC_TPR_PRIO, _lapic+LA_TPR #endif /** CHEAP_TPR */ #endif /** GRAB_LOPRIO */ +#endif /* SMP */ movl PCPU(CPUID),%eax movb %al, P_ONCPU(%ecx) -#endif /* SMP */ + movl %edx, PCPU(CURPCB) movl %ecx, PCPU(CURPROC) /* into next process */ #ifdef SMP /* XXX FIXME: we should be restoring the local APIC TPR */ #endif /* SMP */ #ifdef USER_LDT cmpl $0, PCB_USERLDT(%edx) jnz 1f movl __default_ldt,%eax cmpl PCPU(CURRENTLDT),%eax je 2f lldt __default_ldt movl %eax,PCPU(CURRENTLDT) jmp 2f 1: 
pushl %edx call _set_user_ldt popl %edx 2: #endif /* This must be done after loading the user LDT. */ .globl cpu_switch_load_gs cpu_switch_load_gs: movl PCB_GS(%edx),%gs /* test if debug regisers should be restored */ movb PCB_FLAGS(%edx),%al andb $PCB_DBREGS,%al jz 1f /* no, skip over */ movl PCB_DR6(%edx),%eax /* yes, do the restore */ movl %eax,%dr6 movl PCB_DR3(%edx),%eax movl %eax,%dr3 movl PCB_DR2(%edx),%eax movl %eax,%dr2 movl PCB_DR1(%edx),%eax movl %eax,%dr1 movl PCB_DR0(%edx),%eax movl %eax,%dr0 movl PCB_DR7(%edx),%eax movl %eax,%dr7 1: /* * restore sched_lock recursion count and transfer ownership to * new process */ movl PCB_SCHEDNEST(%edx),%eax movl %eax,_sched_lock+MTX_RECURSECNT movl PCPU(CURPROC),%eax movl %eax,_sched_lock+MTX_LOCK ret CROSSJUMPTARGET(sw1a) #ifdef INVARIANTS badsw2: pushl $sw0_2 call _panic sw0_2: .asciz "cpu_switch: not SRUN" badsw3: pushl $sw0_3 call _panic sw0_3: .asciz "cpu_switch: chooseproc returned NULL" #endif /* * savectx(pcb) * Update pcb, saving current processor state. */ ENTRY(savectx) /* fetch PCB */ movl 4(%esp),%ecx /* caller's return address - child won't execute this routine */ movl (%esp),%eax movl %eax,PCB_EIP(%ecx) movl %cr3,%eax movl %eax,PCB_CR3(%ecx) movl %ebx,PCB_EBX(%ecx) movl %esp,PCB_ESP(%ecx) movl %ebp,PCB_EBP(%ecx) movl %esi,PCB_ESI(%ecx) movl %edi,PCB_EDI(%ecx) movl %gs,PCB_GS(%ecx) #ifdef DEV_NPX /* * If npxproc == NULL, then the npx h/w state is irrelevant and the * state had better already be in the pcb. This is true for forks * but not for dumps (the old book-keeping with FP flags in the pcb * always lost for dumps because the dump pcb has 0 flags). * * If npxproc != NULL, then we have to save the npx h/w state to * npxproc's pcb and copy it to the requested pcb, or save to the * requested pcb and reload. Copying is easier because we would * have to handle h/w bugs for reloading. We used to lose the * parent's npx state for forks by forgetting to reload. 
*/ movl PCPU(NPXPROC),%eax testl %eax,%eax je 1f pushl %ecx movl P_ADDR(%eax),%eax leal PCB_SAVEFPU(%eax),%eax pushl %eax pushl %eax call _npxsave addl $4,%esp popl %eax popl %ecx pushl $PCB_SAVEFPU_SIZE leal PCB_SAVEFPU(%ecx),%ecx pushl %ecx pushl %eax call _bcopy addl $12,%esp #endif /* DEV_NPX */ 1: ret Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 72375) +++ head/sys/i386/i386/trap.c (revision 72376) @@ -1,1328 +1,1327 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace 
trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * Charge system time if profiling. 
*/ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); /* XXX - do we need Giant? */ if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, TRAPF_PC(frame), (u_int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. XXX This is really bad if we trap * while holding a spin lock. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. */ printf("kernel trap %d with interrupts disabled\n", type); /* * We should walk p_heldmtx here and see if any are * spin mutexes, and not do this if so. 
*/ enable_intr(); } } eva = 0; #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif type = frame.tf_trapno; code = frame.tf_err; if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i == 0) goto user; break; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. */ eva = rcr2(); enable_intr(); mtx_lock(&Giant); i = trap_pfault(&frame, TRUE, eva); mtx_unlock(&Giant); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * f00f hack workaround has triggered, treat * as illegal instruction not page fault. 
*/ frame.tf_trapno = T_PRIVINFLT; goto restart; } #endif if (i == -1) goto out; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto out; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); mtx_lock(&Giant); (void) trap_pfault(&frame, FALSE, eva); mtx_unlock(&Giant); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (in_vm86call) break; if (p->p_intr_nesting_level != 0) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; mtx_lock(&Giant); psignal(p, SIGBUS); mtx_unlock(&Giant); goto out; } /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
*/ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. 
* Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } mtx_lock(&Giant); trap_fatal(&frame, eva); mtx_unlock(&Giant); goto out; } mtx_lock(&Giant); /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(p, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif mtx_unlock(&Giant); user: userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); out: return; } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. 
*/ static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; va = trunc_page(eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t mpte; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (p->p_intr_nesting_level != 0 || PCPU_GET(curpcb) == NULL || PCPU_GET(curpcb)->pcb_onfault == NULL))) { trap_fatal(frame, eva); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. 
*/ rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. 
*/ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. 
*/ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * it the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); if (!grow_stack (p, va)) { PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); return (1); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); if (rv != KERN_SUCCESS) return 1; return (0); } /* * syscall - MP aware system call request C handler * * A system call is essentially treated as a trap except that the * MP lock is not held on entry or return. We are responsible for * obtaining the MP lock if necessary and for handling ASTs * (e.g. a task switch) prior to return. * * In general, only simple access and manipulation of curproc and * the current stack is allowed without having to hold MP lock. 
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int narg; int args[8]; u_int code; atomic_add_int(&cnt.v_syscall, 1); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); panic("syscall"); /* NOT REACHED */ } #endif mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is not MP aware. */ mtx_lock(&Giant); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); mtx_unlock(&Giant); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin is MP aware, but the tracing code is not */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); #endif goto bad; } /* * Try to run the syscall without the MP lock if the syscall * is MP safe. 
We have to obtain the MP lock no matter what if * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsyscall(p->p_tracep, code, narg, args); } #endif p->p_retval[0] = 0; p->p_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(p, args); /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: frame.tf_eax = p->p_retval[0]; frame.tf_edx = p->p_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); /* * Release Giant if we had to get it */ if (mtx_owned(&Giant)) mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } void ast(frame) struct trapframe frame; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(&frame), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; astoff(); mtx_intr_enable(&sched_lock); atomic_add_int(&cnt.v_soft, 1); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } Index: head/sys/ia64/ia64/trap.c =================================================================== --- head/sys/ia64/ia64/trap.c (revision 72375) +++ head/sys/ia64/ia64/trap.c (revision 72376) @@ -1,782 +1,781 @@ /* $FreeBSD$ */ /* From: src/sys/alpha/alpha/trap.c,v 1.33 */ /* $NetBSD: trap.c,v 1.31 1998/03/26 02:21:46 thorpej Exp $ */ /* * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. 
Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #ifdef DDB #include #endif u_int32_t want_resched; static int unaligned_fixup(struct trapframe *framep, struct proc *p); #ifdef WITNESS extern char *syscallnames[]; #endif /* * Define the code needed before returning to user mode, for * trap and syscall. */ void userret(register struct proc *p, struct trapframe *frame, u_quad_t oticks) { int sig, s; /* take pending signals */ while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (want_resched) { /* * Since we are curproc, a clock interrupt could * change our priority without changing run queues * (the running process is not kept on a run queue). 
* If this happened after we setrunqueue ourselves but * before we switch()'ed, we might not be on the queue * indicated by our priority. */ s = splstatclock(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); splx(s); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * If profiling, charge recent system time to the trapped pc. */ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, frame->tf_cr_iip, (int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } static const char *ia64_vector_names[] = { "VHPT Translation", /* 0 */ "Instruction TLB", /* 1 */ "Data TLB", /* 2 */ "Alternate Instruction TLB", /* 3 */ "Alternate Data TLB", /* 4 */ "Data Nested TLB", /* 5 */ "Instruction Key Miss", /* 6 */ "Data Key Miss", /* 7 */ "Dirty-Bit", /* 8 */ "Instruction Access-Bit", /* 9 */ "Data Access-Bit", /* 10 */ "Break Instruction", /* 11 */ "External Interrupt", /* 12 */ "Reserved 13", /* 13 */ "Reserved 14", /* 14 */ "Reserved 15", /* 15 */ "Reserved 16", /* 16 */ "Reserved 17", /* 17 */ "Reserved 18", /* 18 */ "Reserved 19", /* 19 */ "Page Not Present", /* 20 */ "Key Permission", /* 21 */ "Instruction Access Rights", /* 22 */ "Data Access Rights", /* 23 */ "General Exception", /* 24 */ "Disabled FP-Register", /* 25 */ "NaT Consumption", /* 26 */ "Speculation", /* 27 */ "Reserved 28", /* 28 */ "Debug", /* 29 */ "Unaligned Reference", /* 30 */ "Unsupported Data Reference", /* 31 */ "Floating-point Fault", /* 32 */ "Floating-point Trap", /* 33 */ "Lower-Privilege Transfer Trap", /* 34 */ "Taken Branch Trap", /* 35 */ "Single Step Trap", /* 36 */ "Reserved 37", /* 37 */ "Reserved 38", /* 38 */ "Reserved 39", /* 39 */ "Reserved 40", /* 40 */ "Reserved 41", /* 41 */ "Reserved 42", /* 42 */ 
"Reserved 43", /* 43 */ "Reserved 44", /* 44 */ "IA-32 Exception", /* 45 */ "IA-32 Intercept", /* 46 */ "IA-32 Interrupt", /* 47 */ "Reserved 48", /* 48 */ "Reserved 49", /* 49 */ "Reserved 50", /* 50 */ "Reserved 51", /* 51 */ "Reserved 52", /* 52 */ "Reserved 53", /* 53 */ "Reserved 54", /* 54 */ "Reserved 55", /* 55 */ "Reserved 56", /* 56 */ "Reserved 57", /* 57 */ "Reserved 58", /* 58 */ "Reserved 59", /* 59 */ "Reserved 60", /* 60 */ "Reserved 61", /* 61 */ "Reserved 62", /* 62 */ "Reserved 63", /* 63 */ "Reserved 64", /* 64 */ "Reserved 65", /* 65 */ "Reserved 66", /* 66 */ "Reserved 67", /* 67 */ }; static void printtrap(int vector, int imm, struct trapframe *framep, int isfatal, int user) { printf("\n"); printf("%s %s trap:\n", isfatal? "fatal" : "handled", user ? "user" : "kernel"); printf("\n"); printf(" trap vector = 0x%x (%s)\n", vector, ia64_vector_names[vector]); printf(" cr.iip = 0x%lx\n", framep->tf_cr_iip); printf(" cr.ipsr = 0x%lx\n", framep->tf_cr_ipsr); printf(" cr.isr = 0x%lx\n", framep->tf_cr_isr); printf(" cr.ifa = 0x%lx\n", framep->tf_cr_ifa); printf(" cr.iim = 0x%x\n", imm); printf(" curproc = %p\n", curproc); if (curproc != NULL) printf(" pid = %d, comm = %s\n", curproc->p_pid, curproc->p_comm); printf("\n"); } /* * Trap is called from exception.s to handle most types of processor traps. * System calls are broken out for efficiency and ASTs are broken out * to make the code a bit cleaner and more representative of the * architecture. 
*/ /*ARGSUSED*/ void trap(int vector, int imm, struct trapframe *framep) { struct proc *p; int i; u_int64_t ucode; u_quad_t sticks; int user; cnt.v_trap++; p = curproc; ucode = 0; user = ((framep->tf_cr_ipsr & IA64_PSR_CPL) == IA64_PSR_CPL_USER); if (user) { mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_tf = framep; } else { sticks = 0; /* XXX bogus -Wuninitialized warning */ } switch (vector) { case IA64_VEC_UNALIGNED_REFERENCE: /* * If user-land, do whatever fixups, printing, and * signalling is appropriate (based on system-wide * and per-process unaligned-access-handling flags). */ if (user) { mtx_lock(&Giant); if ((i = unaligned_fixup(framep, p)) == 0) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); ucode = framep->tf_cr_ifa; /* VA */ break; } /* * Unaligned access from kernel mode is always an error, * EVEN IF A COPY FAULT HANDLER IS SET! * * It's an error if a copy fault handler is set because * the various routines which do user-initiated copies * do so in a bcopy-like manner. In other words, the * kernel never assumes that pointers provided by the * user are properly aligned, and so if the kernel * does cause an unaligned access it's a kernel bug. */ goto dopanic; case IA64_VEC_FLOATING_POINT_FAULT: case IA64_VEC_FLOATING_POINT_TRAP: /* * If user-land, give a SIGFPE if software completion * is not requested or if the completion fails. */ if (user) { i = SIGFPE; ucode = /*a0*/ 0; /* exception summary */ break; } /* Always fatal in kernel. Should never happen. */ goto dopanic; case IA64_VEC_BREAK: goto dopanic; case IA64_VEC_DISABLED_FP: /* * on exit from the kernel, if proc == fpcurproc, * FP is enabled. 
*/ if (PCPU_GET(fpcurproc) == p) { printf("trap: fp disabled for fpcurproc == %p", p); goto dopanic; } ia64_fpstate_switch(p); goto out; break; case IA64_VEC_PAGE_NOT_PRESENT: case IA64_VEC_INST_ACCESS_RIGHTS: case IA64_VEC_DATA_ACCESS_RIGHTS: { vm_offset_t va = framep->tf_cr_ifa; struct vmspace *vm = NULL; vm_map_t map; vm_prot_t ftype = 0; int rv; mtx_lock(&Giant); /* * If it was caused by fuswintr or suswintr, * just punt. Note that we check the faulting * address against the address accessed by * [fs]uswintr, in case another fault happens * when they are running. */ if (!user && p != NULL && p->p_addr->u_pcb.pcb_onfault == (unsigned long)fswintrberr && p->p_addr->u_pcb.pcb_accessaddr == va) { framep->tf_cr_iip = p->p_addr->u_pcb.pcb_onfault; p->p_addr->u_pcb.pcb_onfault = 0; mtx_unlock(&Giant); goto out; } /* * It is only a kernel address space fault iff: * 1. !user and * 2. pcb_onfault not set or * 3. pcb_onfault set but kernel space data fault * The last can occur during an exec() copyin where the * argument space is lazy-allocated. * * For the purposes of the Linux emulator, we allow * kernel accesses to a small region of the * user stack which the emulator uses to * translate syscall arguments. */ if (!user && ((va >= VM_MIN_KERNEL_ADDRESS) || (p == NULL) || (p->p_addr->u_pcb.pcb_onfault == 0))) { if (va >= trunc_page(PS_STRINGS - szsigcode - SPARE_USRSPACE) && va < round_page(PS_STRINGS - szsigcode)) { vm = p->p_vmspace; map = &vm->vm_map; } else { map = kernel_map; } } else { vm = p->p_vmspace; map = &vm->vm_map; } if (framep->tf_cr_isr & IA64_ISR_X) ftype = VM_PROT_EXECUTE; else if (framep->tf_cr_isr & IA64_ISR_R) ftype = VM_PROT_READ; else ftype = VM_PROT_WRITE; va = trunc_page((vm_offset_t)va); if (map != kernel_map) { /* * Keep swapout from messing with us * during this critical time. 
*/ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process * locking or stacks in the kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } nogo:; /* * If this was a stack access we keep track of the * maximum accessed stack size. Also, if vm_fault * gets a protection failure it is due to accessing * the stack region outside the current limit and * we need to reflect that as an access error. */ if (map != kernel_map && (caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (rv == KERN_SUCCESS) { unsigned nss; nss = ia64_btop(round_page(USRSTACK - va)); if (nss > vm->vm_ssize) vm->vm_ssize = nss; } else if (rv == KERN_PROTECTION_FAILURE) rv = KERN_INVALID_ADDRESS; } if (rv == KERN_SUCCESS) { mtx_unlock(&Giant); goto out; } mtx_unlock(&Giant); ucode = va; i = SIGSEGV; #ifdef DEBUG printtrap(vector, imm, framep, 1, user); #endif break; } default: goto dopanic; } #ifdef DEBUG printtrap(vector, imm, framep, 1, user); #endif trapsignal(p, i, ucode); out: if (user) { userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } return; dopanic: printtrap(vector, imm, framep, 1, user); /* XXX dump registers */ #ifdef DDB kdb_trap(vector, framep); #endif panic("trap"); } /* * Process a system call. * * System calls are strange beasts. They are passed the syscall number * in r15, and the arguments in the registers (as normal). 
They return * an error flag in r10 (if r10 != 0 on return, the syscall had an error), * and the return value (if any) in r8 and r9. * * The assembly stub takes care of moving the call number into a register * we can get to, and moves all of the argument registers into a stack * buffer. On return, it restores r8-r10 from the frame before * returning to the user process. */ void syscall(int code, u_int64_t *args, struct trapframe *framep) { struct sysent *callp; struct proc *p; int error = 0; u_int64_t oldip, oldri; u_quad_t sticks; cnt.v_syscall++; p = curproc; p->p_md.md_tf = framep; mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); /* * Skip past the break instruction. Remember old address in case * we have to restart. */ oldip = framep->tf_cr_iip; oldri = framep->tf_cr_ipsr & IA64_PSR_RI; framep->tf_cr_ipsr += IA64_PSR_RI_1; if ((framep->tf_cr_ipsr & IA64_PSR_RI) > IA64_PSR_RI_2) { framep->tf_cr_ipsr &= ~IA64_PSR_RI; framep->tf_cr_iip += 16; } #ifdef DIAGNOSTIC ia64_fpstate_check(p); #endif if (p->p_sysent->sv_prepsyscall) { /* (*p->p_sysent->sv_prepsyscall)(framep, args, &code, ¶ms); */ panic("prepsyscall"); } else { /* * syscall() and __syscall() are handled the same on * the ia64, as everything is 64-bit aligned, anyway. */ if (code == SYS_syscall || code == SYS___syscall) { /* * Code is first argument, followed by actual args. 
*/ code = args[0]; args++; } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, (callp->sy_narg & SYF_ARGMASK), args); #endif if (error == 0) { p->p_retval[0] = 0; p->p_retval[1] = 0; STOPEVENT(p, S_SCE, (callp->sy_narg & SYF_ARGMASK)); error = (*callp->sy_call)(p, args); } switch (error) { case 0: framep->tf_r[FRAME_R8] = p->p_retval[0]; framep->tf_r[FRAME_R9] = p->p_retval[1]; framep->tf_r[FRAME_R10] = 0; break; case ERESTART: framep->tf_cr_iip = oldip; framep->tf_cr_ipsr = (framep->tf_cr_ipsr & ~IA64_PSR_RI) | oldri; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } framep->tf_r[FRAME_R8] = error; framep->tf_r[FRAME_R10] = 1; break; } userret(p, framep, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, p->p_retval[0]); #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. */ STOPEVENT(p, S_SCX, code); mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } /* * Process the tail end of a fork() for the child. */ void child_return(p) struct proc *p; { /* * Return values in the frame set by cpu_fork(). */ userret(p, p->p_md.md_tf, 0); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, SYS_fork, 0, 0); } #endif if (mtx_owned(&Giant)) mtx_unlock(&Giant); } /* * Process an asynchronous software trap. * This is relatively easy. 
*/ void ast(framep) struct trapframe *framep; { register struct proc *p; u_quad_t sticks; p = curproc; mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_tf = framep; if ((framep->tf_cr_ipsr & IA64_PSR_CPL) != IA64_PSR_CPL_USER) panic("ast and not user"); cnt.v_soft++; PCPU_SET(astpending, 0); mtx_lock_spin(&sched_lock); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, framep, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } extern int ia64_unaligned_print, ia64_unaligned_fix; extern int ia64_unaligned_sigbus; static int unaligned_fixup(struct trapframe *framep, struct proc *p) { vm_offset_t va = framep->tf_cr_ifa; int doprint, dofix, dosigbus; int signal, size = 0; unsigned long uac; /* * Figure out what actions to take. */ if (p) uac = p->p_md.md_flags & MDP_UAC_MASK; else uac = 0; doprint = ia64_unaligned_print && !(uac & MDP_UAC_NOPRINT); dofix = ia64_unaligned_fix && !(uac & MDP_UAC_NOFIX); dosigbus = ia64_unaligned_sigbus | (uac & MDP_UAC_SIGBUS); /* * See if the user can access the memory in question. * Even if it's an unknown opcode, SEGV if the access * should have failed. */ if (!useracc((caddr_t)va, size ? size : 1, VM_PROT_WRITE)) { signal = SIGSEGV; goto out; } /* * If we're supposed to be noisy, squawk now. 
*/ if (doprint) { uprintf("pid %d (%s): unaligned access: va=0x%lx pc=0x%lx\n", p->p_pid, p->p_comm, va, p->p_md.md_tf->tf_cr_iip); } /* * If we should try to fix it and know how, give it a shot. * * We never allow bad data to be unknowingly used by the * user process. That is, if we decide not to fix up an * access we cause a SIGBUS rather than letting the user * process go on without warning. * * If we're trying to do a fixup, we assume that things * will be botched. If everything works out OK, * unaligned_{load,store}_* clears the signal flag. */ signal = SIGBUS; if (dofix && size != 0) { /* * XXX not done yet. */ } /* * Force SIGBUS if requested. */ if (dosigbus) signal = SIGBUS; out: return (signal); } Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 72375) +++ head/sys/kern/init_main.c (revision 72376) @@ -1,605 +1,606 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_init_path.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct linker_set sysinit_set; /* XXX */ void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. 
*/ static struct session session0; static struct pgrp pgrp0; struct proc proc0; static struct pcred cred0; static struct procsig procsig0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; struct proc *initproc; int cmask = CMASK; extern struct user *proc0paddr; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ struct sysinit **sysinit = (struct sysinit **)sysinit_set.ls_items; struct sysinit **newsysinit; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count = 0; if (newsysinit) for (sipp = newsysinit; *sipp; sipp++) count++; else for (sipp = sysinit; *sipp; sipp++) count++; for (sipp = set; *sipp; sipp++) count++; count++; /* Trailing NULL */ newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; *sipp; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; *sipp; sipp++) *xipp++ = *sipp; for (sipp = set; *sipp; sipp++) *xipp++ = *sipp; *xipp = NULL; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; } /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. 
Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; *sipp; sipp++) { for (xipp = sipp + 1; *xipp; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for (sipp = sysinit; *sipp; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; /* Call function */ (*((*sipp)->func))((*sipp)->udata); /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != (struct sysinit **)sysinit_set.ls_items) free(sysinit, M_TEMP); sysinit = newsysinit; newsysinit = NULL; goto restart; } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. 
**** *************************************************************************** */ static void print_caddr_t(void *data __unused) { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { register struct proc *p; register struct filedesc0 *fdp; register unsigned i; p = &proc0; /* * Initialize magic number. */ p->p_magic = P_MAGIC; /* * Initialize process and pgrp structures. */ procinit(); /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; #ifdef __ELF__ p->p_sysent = &elf_freebsd_sysvec; #else p->p_sysent = &aout_sysvec; #endif p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; p->p_stat = SRUN; p->p_nice = NZERO; - p->p_rtprio.type = RTP_PRIO_NORMAL; - p->p_rtprio.prio = 0; + p->p_pri.pri_class = PRI_TIMESHARE; + p->p_pri.pri_level = PVM; + p->p_pri.pri_user = PUSER; p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); callout_init(&p->p_itcallout, 0); callout_init(&p->p_slpcallout, 1); /* Create credentials. 
*/ cred0.p_refcnt = 1; cred0.p_uidinfo = uifind(0); p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ p->p_ucred->cr_uidinfo = uifind(0); /* Don't jail it */ p->p_prison = 0; /* Create procsig. */ p->p_procsig = &procsig0; p->p_procsig->ps_refcnt = 1; /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_cpulimit = RLIM_INFINITY; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ pmap_pinit0(vmspace_pmap(&vmspace0)); p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS)); vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); p->p_addr = proc0paddr; /* XXX */ /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. */ p->p_stats = &p->p_addr->u_stats; p->p_sigacts = &p->p_addr->u_sigacts; /* * Charge root for one process. 
*/ (void)chgproccnt(cred0.p_uidinfo, 1, 0); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; /* * Now we can look at the time, having had a chance to verify the * time from the file system. Pretend that proc0 started now. */ ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { microtime(&p->p_stats->p_start); p->p_runtime = 0; } ALLPROC_LOCK(AP_RELEASE); microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, ""); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct proc *p; mtx_lock(&Giant); p = curproc; /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
*/ if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode)) panic("cannot find root vnode"); p->p_fd->fd_cdir = rootvnode; VREF(p->p_fd->fd_cdir); p->p_fd->fd_rdir = rootvnode; VOP_UNLOCK(rootvnode, 0, p); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(USRSTACK - PAGE_SIZE); if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = getenv("init_path")) != NULL) { strncpy(init_path, var, sizeof init_path); init_path[sizeof init_path - 1] = 0; } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (devfs_present) { (void)subyte(--ucp, 'd'); options = 1; } if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. 
If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = execve(p, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kthread_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { int error; error = fork1(&proc0, RFFDG | RFPROC | RFSTOPPED, &initproc); if (error) panic("cannot fork init: %d\n", error); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM; PROC_UNLOCK(initproc); mtx_lock_spin(&sched_lock); initproc->p_sflag |= PS_INMEM; mtx_unlock_spin(&sched_lock); cpu_set_fork_handler(initproc, start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { mtx_lock_spin(&sched_lock); initproc->p_stat = SRUN; setrunqueue(initproc); mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: head/sys/kern/kern_condvar.c =================================================================== --- head/sys/kern/kern_condvar.c (revision 72375) +++ head/sys/kern/kern_condvar.c (revision 72376) @@ -1,546 +1,542 @@ /*- * Copyright (c) 2000 Jake Burkholder . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif /* * Common sanity checks for cv_wait* functions. */ #define CV_ASSERT(cvp, mp, p) do { \ KASSERT((p) != NULL, ("%s: curproc NULL", __FUNCTION__)); \ KASSERT((p)->p_stat == SRUN, ("%s: not SRUN", __FUNCTION__)); \ KASSERT((cvp) != NULL, ("%s: cvp NULL", __FUNCTION__)); \ KASSERT((mp) != NULL, ("%s: mp NULL", __FUNCTION__)); \ mtx_assert((mp), MA_OWNED | MA_NOTRECURSED); \ } while (0) #ifdef CV_DEBUG #define CV_WAIT_VALIDATE(cvp, mp) do { \ if (TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ /* Only waiter. */ \ (cvp)->cv_mtx = (mp); \ } else { \ /* \ * Other waiter; assert that we're using the \ * same mutex. 
\ */ \ KASSERT((cvp)->cv_mtx == (mp), \ ("%s: Multiple mutexes", __FUNCTION__)); \ } \ } while (0) #define CV_SIGNAL_VALIDATE(cvp) do { \ if (!TAILQ_EMPTY(&(cvp)->cv_waitq)) { \ KASSERT(mtx_owned((cvp)->cv_mtx), \ ("%s: Mutex not owned", __FUNCTION__)); \ } \ } while (0) #else #define CV_WAIT_VALIDATE(cvp, mp) #define CV_SIGNAL_VALIDATE(cvp) #endif static void cv_timedwait_end(void *arg); /* * Initialize a condition variable. Must be called before use. */ void cv_init(struct cv *cvp, const char *desc) { TAILQ_INIT(&cvp->cv_waitq); cvp->cv_mtx = NULL; cvp->cv_description = desc; } /* * Destroy a condition variable. The condition variable must be re-initialized * in order to be re-used. */ void cv_destroy(struct cv *cvp) { KASSERT(cv_waitq_empty(cvp), ("%s: cv_waitq non-empty", __FUNCTION__)); } /* * Common code for cv_wait* functions. All require sched_lock. */ /* * Switch context. */ static __inline void cv_switch(struct proc *p) { p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); CTR3(KTR_PROC, "cv_switch: resume proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); } /* * Switch context, catching signals. */ static __inline int cv_switch_catch(struct proc *p) { int sig; /* * We put ourselves on the sleep queue and start our timeout before * calling CURSIG, as we could stop there, and a wakeup or a SIGCONT (or * both) could occur while we were stopped. A SIGCONT would cause us to * be marked as SSLEEP without resuming us, thus we must be ready for * sleep when CURSIG is called. If the wakeup happens while we're * stopped, p->p_wchan will be 0 upon return from CURSIG. */ p->p_sflag |= PS_SINTR; mtx_unlock_spin(&sched_lock); sig = CURSIG(p); mtx_lock_spin(&sched_lock); if (sig != 0) { if (p->p_wchan != NULL) cv_waitq_remove(p); p->p_stat = SRUN; } else if (p->p_wchan != NULL) { cv_switch(p); } p->p_sflag &= ~PS_SINTR; return sig; } /* * Add a process to the wait queue of a condition variable. 
*/ static __inline void cv_waitq_add(struct cv *cvp, struct proc *p) { /* * Process may be sitting on a slpque if asleep() was called, remove it * before re-adding. */ if (p->p_wchan != NULL) unsleep(p); p->p_sflag |= PS_CVWAITQ; p->p_wchan = cvp; p->p_wmesg = cvp->cv_description; p->p_slptime = 0; - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; CTR3(KTR_PROC, "cv_waitq_add: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); TAILQ_INSERT_TAIL(&cvp->cv_waitq, p, p_slpq); } /* * Wait on a condition variable. The current process is placed on the condition * variable's wait queue and suspended. A cv_signal or cv_broadcast on the same * condition variable will resume the process. The mutex is released before * sleeping and will be held on return. It is recommended that the mutex be * held when cv_signal or cv_broadcast are called. */ void cv_wait(struct cv *cvp, struct mtx *mp) { struct proc *p; WITNESS_SAVE_DECL(mp); p = CURPROC; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. */ mtx_unlock_spin(&sched_lock); return; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); cv_switch(p); - curpriority = p->p_usrpri; mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif PICKUP_GIANT(); mtx_lock(mp); WITNESS_RESTORE(mp, mp); } /* * Wait on a condition variable, allowing interruption by signals. Return 0 if * the process was resumed with cv_signal or cv_broadcast, EINTR or ERESTART if * a signal was caught. If ERESTART is returned the system call should be * restarted if possible. 
*/ int cv_wait_sig(struct cv *cvp, struct mtx *mp) { struct proc *p; int rval; int sig; WITNESS_SAVE_DECL(mp); p = CURPROC; rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. */ mtx_unlock_spin(&sched_lock); return 0; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); sig = cv_switch_catch(p); - curpriority = p->p_usrpri; mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); /* proc_lock(p); */ if (sig == 0) sig = CURSIG(p); if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } /* proc_unlock(p); */ #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif mtx_lock(mp); WITNESS_RESTORE(mp, mp); return (rval); } /* * Wait on a condition variable for at most timo/hz seconds. Returns 0 if the * process was resumed by cv_signal or cv_broadcast, EWOULDBLOCK if the timeout * expires. */ int cv_timedwait(struct cv *cvp, struct mtx *mp, int timo) { struct proc *p; int rval; WITNESS_SAVE_DECL(mp); p = CURPROC; rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. 
*/ mtx_unlock_spin(&sched_lock); return 0; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); callout_reset(&p->p_slpcallout, timo, cv_timedwait_end, p); cv_switch(p); - curpriority = p->p_usrpri; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; rval = EWOULDBLOCK; } else callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif PICKUP_GIANT(); mtx_lock(mp); WITNESS_RESTORE(mp, mp); return (rval); } /* * Wait on a condition variable for at most timo/hz seconds, allowing * interruption by signals. Returns 0 if the process was resumed by cv_signal * or cv_broadcast, EWOULDBLOCK if the timeout expires, and EINTR or ERESTART if * a signal was caught. */ int cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo) { struct proc *p; int rval; int sig; WITNESS_SAVE_DECL(mp); p = CURPROC; rval = 0; #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif CV_ASSERT(cvp, mp, p); WITNESS_SLEEP(0, mp); WITNESS_SAVE(mp, mp); mtx_lock_spin(&sched_lock); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, just give * interrupts a chance, then just return; don't run any other * procs or panic below, in case this is the idle process and * already asleep. 
*/ mtx_unlock_spin(&sched_lock); return 0; } CV_WAIT_VALIDATE(cvp, mp); DROP_GIANT_NOSWITCH(); mtx_unlock_flags(mp, MTX_NOSWITCH); cv_waitq_add(cvp, p); callout_reset(&p->p_slpcallout, timo, cv_timedwait_end, p); sig = cv_switch_catch(p); - curpriority = p->p_usrpri; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; rval = EWOULDBLOCK; } else callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); /* proc_lock(p); */ if (sig == 0) sig = CURSIG(p); if (sig != 0) { if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; } /* proc_unlock(p); */ #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif mtx_lock(mp); WITNESS_RESTORE(mp, mp); return (rval); } /* * Common code for signal and broadcast. Assumes waitq is not empty. Must be * called with sched_lock held. */ static __inline void cv_wakeup(struct cv *cvp) { struct proc *p; mtx_assert(&sched_lock, MA_OWNED); p = TAILQ_FIRST(&cvp->cv_waitq); KASSERT(p->p_wchan == cvp, ("%s: bogus wchan", __FUNCTION__)); KASSERT(p->p_sflag & PS_CVWAITQ, ("%s: not on waitq", __FUNCTION__)); TAILQ_REMOVE(&cvp->cv_waitq, p, p_slpq); p->p_sflag &= ~PS_CVWAITQ; p->p_wchan = 0; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR3(KTR_PROC, "cv_signal: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(p); maybe_resched(p); } else { p->p_sflag |= PS_SWAPINREQ; wakeup(&proc0); } /* END INLINE EXPANSION */ } } /* * Signal a condition variable, wakes up one waiting process. Will also wakeup * the swapper if the process is not in memory, so that it can bring the * sleeping process in. Note that this may also result in additional processes * being made runnable. Should be called with the same mutex as was passed to * cv_wait held. 
*/ void cv_signal(struct cv *cvp) { KASSERT(cvp != NULL, ("%s: cvp NULL", __FUNCTION__)); mtx_lock_spin(&sched_lock); if (!TAILQ_EMPTY(&cvp->cv_waitq)) { CV_SIGNAL_VALIDATE(cvp); cv_wakeup(cvp); } mtx_unlock_spin(&sched_lock); } /* * Broadcast a signal to a condition variable. Wakes up all waiting processes. * Should be called with the same mutex as was passed to cv_wait held. */ void cv_broadcast(struct cv *cvp) { KASSERT(cvp != NULL, ("%s: cvp NULL", __FUNCTION__)); mtx_lock_spin(&sched_lock); CV_SIGNAL_VALIDATE(cvp); while (!TAILQ_EMPTY(&cvp->cv_waitq)) cv_wakeup(cvp); mtx_unlock_spin(&sched_lock); } /* * Remove a process from the wait queue of its condition variable. This may be * called externally. */ void cv_waitq_remove(struct proc *p) { struct cv *cvp; mtx_lock_spin(&sched_lock); if ((cvp = p->p_wchan) != NULL && p->p_sflag & PS_CVWAITQ) { TAILQ_REMOVE(&cvp->cv_waitq, p, p_slpq); p->p_sflag &= ~PS_CVWAITQ; p->p_wchan = NULL; } mtx_unlock_spin(&sched_lock); } /* * Timeout function for cv_timedwait. Put the process on the runqueue and set * its timeout flag. */ static void cv_timedwait_end(void *arg) { struct proc *p; p = arg; CTR3(KTR_PROC, "cv_timedwait_end: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); mtx_lock_spin(&sched_lock); if (p->p_wchan != NULL) { if (p->p_stat == SSLEEP) setrunnable(p); else cv_waitq_remove(p); p->p_sflag |= PS_TIMEOUT; } mtx_unlock_spin(&sched_lock); } Index: head/sys/kern/kern_idle.c =================================================================== --- head/sys/kern/kern_idle.c (revision 72375) +++ head/sys/kern/kern_idle.c (revision 72376) @@ -1,117 +1,119 @@ /*- * Copyright (c) 2000, All rights reserved. 
See /usr/src/COPYRIGHT * * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include #include #include static void idle_setup(void *dummy); SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL) static void idle_proc(void *dummy); /* * Setup per-cpu idle process contexts. The AP's shouldn't be running or * accessing their idle processes at this point, so don't bother with * locking. */ static void idle_setup(void *dummy) { struct globaldata *gd; int error; SLIST_FOREACH(gd, &cpuhead, gd_allcpu) { #ifdef SMP error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, RFSTOPPED|RFHIGHPID, "idle: cpu%d", gd->gd_cpuid); #else error = kthread_create(idle_proc, NULL, &gd->gd_idleproc, RFSTOPPED|RFHIGHPID, "idle"); #endif if (error) panic("idle_setup: kthread_create error %d\n", error); gd->gd_idleproc->p_flag |= P_NOLOAD; gd->gd_idleproc->p_stat = SRUN; if (gd->gd_curproc == NULL) gd->gd_curproc = gd->gd_idleproc; } } /* * idle process context */ static void idle_proc(void *dummy) { #ifdef DIAGNOSTIC int count; #endif for (;;) { mtx_assert(&Giant, MA_NOTOWNED); #ifdef DIAGNOSTIC count = 0; while (count >= 0 && procrunnable() == 0) { #else while (procrunnable() == 0) { #endif /* * This is a good place to put things to be done in * the background, including sanity checks. 
*/ #ifdef DIAGNOSTIC if (count++ < 0) CTR0(KTR_PROC, "idle_proc: timed out waiting" " for a process"); #endif +#if 0 if (vm_page_zero_idle() != 0) continue; +#endif #ifdef __i386__ cpu_idle(); #endif } mtx_lock_spin(&sched_lock); curproc->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); } } Index: head/sys/kern/kern_intr.c =================================================================== --- head/sys/kern/kern_intr.c (revision 72375) +++ head/sys/kern/kern_intr.c (revision 72376) @@ -1,537 +1,537 @@ /* * Copyright (c) 1997, Stefan Esser * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* prototype for legacy_setsoftnet */ void *net_ih; void *vm_ih; void *softclock_ih; struct ithd *clk_ithd; struct ithd *tty_ithd; static struct mtx ithread_list_lock; static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); static void ithread_update(struct ithd *); static void ithread_loop(void *); static void ithread_init(void *); static void start_softintr(void *); static void swi_net(void *); u_char ithread_priority(enum intr_type flags) { u_char pri; flags &= (INTR_TYPE_TTY | INTR_TYPE_BIO | INTR_TYPE_NET | INTR_TYPE_CAM | INTR_TYPE_MISC | INTR_TYPE_CLK); switch (flags) { case INTR_TYPE_TTY: pri = PI_TTYLOW; break; case INTR_TYPE_BIO: /* * XXX We need to refine this. BSD/OS distinguishes * between tape and disk priorities. */ pri = PI_DISK; break; case INTR_TYPE_NET: pri = PI_NET; break; case INTR_TYPE_CAM: pri = PI_DISK; /* XXX or PI_CAM? */ break; case INTR_TYPE_CLK: pri = PI_REALTIME; break; case INTR_TYPE_MISC: pri = PI_DULL; /* don't care */ break; default: /* We didn't specify an interrupt level. */ panic("ithread_priority: no interrupt type in flags"); } return pri; } /* * Regenerate the name (p_comm) and priority for a threaded interrupt thread. 
*/ static void ithread_update(struct ithd *ithd) { struct intrhand *ih; struct proc *p; int entropy; p = ithd->it_proc; if (p == NULL) return; strncpy(p->p_comm, ithd->it_name, sizeof(ithd->it_name)); ih = TAILQ_FIRST(&ithd->it_handlers); if (ih == NULL) { - p->p_rtprio.prio = RTP_PRIO_MAX; + p->p_pri.pri_level = PRI_MAX_ITHD; ithd->it_flags &= ~IT_ENTROPY; return; } entropy = 0; - p->p_rtprio.prio = ih->ih_pri; + p->p_pri.pri_level = ih->ih_pri; TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { if (strlen(p->p_comm) + strlen(ih->ih_name) + 1 < sizeof(p->p_comm)) { strcat(p->p_comm, " "); strcat(p->p_comm, ih->ih_name); } else if (strlen(p->p_comm) + 1 == sizeof(p->p_comm)) { if (p->p_comm[sizeof(p->p_comm) - 2] == '+') p->p_comm[sizeof(p->p_comm) - 2] = '*'; else p->p_comm[sizeof(p->p_comm) - 2] = '+'; } else strcat(p->p_comm, "+"); if (ih->ih_flags & IH_ENTROPY) entropy++; } if (entropy) { printf("Warning, ithread (%d, %s) is an entropy source.\n", p->p_pid, p->p_comm); ithd->it_flags |= IT_ENTROPY; } else ithd->it_flags &= ~IT_ENTROPY; } int ithread_create(struct ithd **ithread, int vector, int flags, void (*disable)(int), void (*enable)(int), const char *fmt, ...) 
{ struct ithd *ithd; struct proc *p; int error; va_list ap; ithd = malloc(sizeof(struct ithd), M_ITHREAD, M_WAITOK | M_ZERO); ithd->it_vector = vector; ithd->it_disable = disable; ithd->it_enable = enable; ithd->it_flags = flags; TAILQ_INIT(&ithd->it_handlers); va_start(ap, fmt); vsnprintf(ithd->it_name, sizeof(ithd->it_name), fmt, ap); va_end(ap); error = kthread_create(ithread_loop, ithd, &p, RFSTOPPED | RFHIGHPID, ithd->it_name); if (error) { free(ithd, M_ITHREAD); return (error); } - p->p_rtprio.type = RTP_PRIO_ITHREAD; - p->p_rtprio.prio = RTP_PRIO_MAX; + p->p_pri.pri_class = PRI_ITHD; + p->p_pri.pri_level = PRI_MAX_ITHD; p->p_stat = SWAIT; ithd->it_proc = p; p->p_ithd = ithd; if (ithread != NULL) *ithread = ithd; return (0); } int ithread_destroy(struct ithd *ithread) { if (ithread == NULL || !TAILQ_EMPTY(&ithread->it_handlers)) return (EINVAL); mtx_lock_spin(&sched_lock); ithread->it_flags |= IT_DEAD; if (ithread->it_proc->p_stat == SWAIT) { ithread->it_proc->p_stat = SRUN; setrunqueue(ithread->it_proc); } mtx_unlock_spin(&sched_lock); return (0); } int ithread_add_handler(struct ithd* ithread, const char *name, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intrhand *ih, *temp_ih; if (ithread == NULL || name == NULL || handler == NULL) return (EINVAL); if ((flags & INTR_FAST) !=0) flags |= INTR_EXCL; ih = malloc(sizeof(struct intrhand), M_ITHREAD, M_WAITOK | M_ZERO); ih->ih_handler = handler; ih->ih_argument = arg; ih->ih_name = name; ih->ih_ithread = ithread; ih->ih_pri = pri; if (flags & INTR_FAST) ih->ih_flags = IH_FAST | IH_EXCLUSIVE; else if (flags & INTR_EXCL) ih->ih_flags = IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; mtx_lock_spin(&ithread_list_lock); if ((flags & INTR_EXCL) !=0 && !TAILQ_EMPTY(&ithread->it_handlers)) goto fail; if (!TAILQ_EMPTY(&ithread->it_handlers) && (TAILQ_FIRST(&ithread->it_handlers)->ih_flags & 
IH_EXCLUSIVE) != 0) goto fail; TAILQ_FOREACH(temp_ih, &ithread->it_handlers, ih_next) if (temp_ih->ih_pri > ih->ih_pri) break; if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ithread->it_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); ithread_update(ithread); mtx_unlock_spin(&ithread_list_lock); if (cookiep != NULL) *cookiep = ih; return (0); fail: mtx_unlock_spin(&ithread_list_lock); free(ih, M_ITHREAD); return (EINVAL); } int ithread_remove_handler(void *cookie) { struct intrhand *handler = (struct intrhand *)cookie; struct ithd *ithread; #ifdef INVARIANTS struct intrhand *ih; int found; #endif if (handler == NULL || (ithread = handler->ih_ithread) == NULL) return (EINVAL); mtx_lock_spin(&ithread_list_lock); #ifdef INVARIANTS found = 0; TAILQ_FOREACH(ih, &ithread->it_handlers, ih_next) if (ih == handler) { found++; break; } if (found == 0) { mtx_unlock_spin(&ithread_list_lock); return (EINVAL); } #endif TAILQ_REMOVE(&ithread->it_handlers, handler, ih_next); ithread_update(ithread); mtx_unlock_spin(&ithread_list_lock); free(handler, M_ITHREAD); return (0); } int swi_add(struct ithd **ithdp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep) { struct proc *p; struct ithd *ithd; int error; ithd = (ithdp != NULL) ? *ithdp : NULL; if (ithd == NULL) { error = ithread_create(&ithd, pri, IT_SOFT, NULL, NULL, "swi%d:", pri); if (error) return (error); /* XXX - some hacks are _really_ gross */ p = ithd->it_proc; PROC_LOCK(p); if (pri == SWI_CLOCK) p->p_flag |= P_NOLOAD; PROC_UNLOCK(p); if (ithdp != NULL) *ithdp = ithd; } - return (ithread_add_handler(ithd, name, handler, arg, pri + PI_SOFT, - flags, cookiep)); + return (ithread_add_handler(ithd, name, handler, arg, + (pri * RQ_PPQ) + PI_SOFT, flags, cookiep)); } /* * Schedule a heavyweight software interrupt process. 
*/ void swi_sched(void *cookie, int flags) { struct intrhand *ih = (struct intrhand *)cookie; struct ithd *it = ih->ih_ithread; struct proc *p = it->it_proc; atomic_add_int(&cnt.v_intr, 1); /* one more global interrupt */ CTR3(KTR_INTR, "swi_sched pid %d(%s) need=%d", p->p_pid, p->p_comm, it->it_need); /* * Set it_need so that if the thread is already running but close * to done, it will do another go-round. Then get the sched lock * and see if the thread is on whichkqs yet. If not, put it on * there. In any case, kick everyone so that if the new thread * is higher priority than their current thread, it gets run now. */ atomic_store_rel_int(&ih->ih_need, 1); if (!(flags & SWI_DELAY)) { it->it_need = 1; mtx_lock_spin(&sched_lock); if (p->p_stat == SWAIT) { /* not on run queue */ CTR1(KTR_INTR, "swi_sched: setrunqueue %d", p->p_pid); p->p_stat = SRUN; setrunqueue(p); if (!cold && flags & SWI_SWITCH) { if (curproc != PCPU_GET(idleproc)) setrunqueue(curproc); curproc->p_stats->p_ru.ru_nvcsw++; mi_switch(); } else need_resched(); } else { CTR3(KTR_INTR, "swi_sched %d: it_need %d, state %d", p->p_pid, it->it_need, p->p_stat ); } mtx_unlock_spin(&sched_lock); } } /* * This is the main code for interrupt threads. */ void ithread_loop(void *arg) { struct ithd *ithd; /* our thread context */ struct intrhand *ih; /* and our interrupt handler chain */ struct proc *p; p = curproc; ithd = (struct ithd *)arg; /* point to myself */ KASSERT(ithd->it_proc == p && p->p_ithd == ithd, (__func__ ": ithread and proc linkage out of sync")); /* * As long as we have interrupts outstanding, go through the * list of handlers, giving each one a go at it. */ for (;;) { /* * If we are an orphaned thread, then just die. 
*/ if (ithd->it_flags & IT_DEAD) { CTR2(KTR_INTR, __func__ ": pid %d: (%s) exiting", p->p_pid, p->p_comm); p->p_ithd = NULL; mtx_lock(&Giant); free(ithd, M_ITHREAD); kthread_exit(0); } CTR3(KTR_INTR, __func__ ": pid %d: (%s) need=%d", p->p_pid, p->p_comm, ithd->it_need); while (ithd->it_need) { /* * Service interrupts. If another interrupt * arrives while we are running, they will set * it_need to denote that we should make * another pass. */ atomic_store_rel_int(&ithd->it_need, 0); TAILQ_FOREACH(ih, &ithd->it_handlers, ih_next) { if (ithd->it_flags & IT_SOFT && !ih->ih_need) continue; atomic_store_rel_int(&ih->ih_need, 0); CTR5(KTR_INTR, __func__ ": pid %d ih=%p: %p(%p) flg=%x", p->p_pid, (void *)ih, (void *)ih->ih_handler, ih->ih_argument, ih->ih_flags); if ((ih->ih_flags & IH_MPSAFE) == 0) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if ((ih->ih_flags & IH_MPSAFE) == 0) mtx_unlock(&Giant); } } /* * Processed all our interrupts. Now get the sched * lock. This may take a while and it_need may get * set again, so we have to check it again. */ mtx_assert(&Giant, MA_NOTOWNED); mtx_lock_spin(&sched_lock); if (!ithd->it_need) { /* * Should we call this earlier in the loop above? */ if (ithd->it_enable != NULL) ithd->it_enable(ithd->it_vector); p->p_stat = SWAIT; /* we're idle */ CTR1(KTR_INTR, __func__ ": pid %d: done", p->p_pid); mi_switch(); CTR1(KTR_INTR, __func__ ": pid %d: resumed", p->p_pid); } mtx_unlock_spin(&sched_lock); } } /* * Initialize mutex used to protect ithread handler lists. 
*/ static void ithread_init(void *dummy) { mtx_init(&ithread_list_lock, "ithread list lock", MTX_SPIN); } SYSINIT(ithread_init, SI_SUB_INTR, SI_ORDER_FIRST, ithread_init, NULL); /* * Start standard software interrupt threads */ static void start_softintr(void *dummy) { if (swi_add(NULL, "net", swi_net, NULL, SWI_NET, 0, &net_ih) || swi_add(&clk_ithd, "clock", softclock, NULL, SWI_CLOCK, INTR_MPSAFE, &softclock_ih) || swi_add(NULL, "vm", swi_vm, NULL, SWI_VM, 0, &vm_ih)) panic("died while creating standard software ithreads"); } SYSINIT(start_softintr, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softintr, NULL) void legacy_setsoftnet(void) { swi_sched(net_ih, SWI_NOSWITCH); } /* * XXX: This should really be in the network code somewhere and installed * via a SI_SUB_SOFINTR, SI_ORDER_MIDDLE sysinit. */ void (*netisrs[32]) __P((void)); u_int netisr; int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } int unregister_netisr(num) int num; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("unregister_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = NULL; return (0); } static void swi_net(void *dummy) { u_int bits; int i; bits = atomic_readandclear_int(&netisr); while ((i = ffs(bits)) != 0) { i--; if (netisrs[i] != NULL) netisrs[i](); else printf("swi_net: unregistered isr number: %d.\n", i); bits &= ~(1 << i); } } Index: head/sys/kern/kern_mib.c =================================================================== --- head/sys/kern/kern_mib.c (revision 72375) +++ head/sys/kern/kern_mib.c (revision 72376) @@ -1,255 +1,259 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. 
* * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 * $FreeBSD$ */ #include #include #include #include #include #include #include SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, "High kernel, proc, limits &c"); SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, "Virtual memory"); SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, "File system"); SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, "Network, (see socket.h)"); SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, "Debugging"); SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, "Sizeof various things"); SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, "hardware"); SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, "machine dependent"); SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, "user-level"); SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, "p1003_1b, (see p1003_1b.h)"); SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0, "Compatibility code"); SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, "Operating system type"); SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, "Operating system revision"); SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, "Kernel version"); SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, "Operating system type"); extern int osreldate; SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, "Operating system release date"); SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, &maxproc, 0, "Maximum number of processes"); SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, &maxprocperuid, 0, "Maximum processes allowed per userid"); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _KPOSIX_VERSION, "Version of POSIX attempting to comply to"); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, "Maximum number of groups a user can 
belong to"); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, "Whether saved set-group/user ID is available"); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, "Whether saved set-group/user ID is available"); #endif char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, kernelname, sizeof kernelname, "Name of kernel file booted"); #ifdef SMP SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, "Number of active CPUs"); #else SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, "Number of active CPUs"); #endif SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, "System byte order"); SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, "System memory page size"); static char machine_arch[] = MACHINE_ARCH; SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, machine_arch, 0, "System architecture"); char hostname[MAXHOSTNAMELEN]; static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { int error; if (req->p->p_prison) { if (!jail_set_hostname_allowed && req->newptr) return(EPERM); error = sysctl_handle_string(oidp, req->p->p_prison->pr_host, sizeof req->p->p_prison->pr_host, req); } else error = sysctl_handle_string(oidp, hostname, sizeof hostname, req); return (error); } SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_hostname, "A", "Hostname"); int securelevel = -1; static int sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) { int error, level; level = securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); if (level < securelevel) return (EPERM); securelevel = level; return (error); } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_securelvl, "I", "Current secure level"); char 
domainname[MAXHOSTNAMELEN]; SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); long hostid; /* Some trouble here, if sizeof (int) != sizeof (long) */ SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); /* * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. */ SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, "Max ibase/obase values in bc(1)"); SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, "Max array size in bc(1)"); SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, "Max scale value in bc(1)"); SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, "Max string length in bc(1)"); SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, "Max length (bytes) of a text-processing utility's input line"); SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, "Maximum number of repeats of a regexp permitted"); SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, "The version of POSIX 1003.2 with which the system attempts to comply"); SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, "Whether C development supports the C bindings option"); SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, "Whether system supports the C development utilities option"); SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, 
USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN development utilities"); SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN runtime utilities"); SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, "Whether system supports creation of locales"); SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, "Whether system supports software development utilities"); SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, "Whether system supports the user portability utilities"); SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, "Min Maximum number of streams a process may have open at one time"); SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, "Min Maximum number of types supported for timezone names"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, 0, sizeof(struct vnode), "sizeof(struct vnode)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, 0, sizeof(struct proc), "sizeof(struct proc)"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD, 0, sizeof(struct specinfo), "sizeof(struct specinfo)"); #include #include SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, 0, sizeof(struct bio), "sizeof(struct bio)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), "sizeof(struct buf)"); + +#include +SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, + 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); Index: head/sys/kern/kern_mutex.c =================================================================== --- head/sys/kern/kern_mutex.c (revision 72375) +++ head/sys/kern/kern_mutex.c (revision 72376) @@ -1,1705 +1,1680 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ * $FreeBSD$ */ /* * Machine independent bits of mutex implementation and implementation of * `witness' structure & related debugging routines. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ #include "opt_ddb.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The WITNESS-enabled mutex debug structure. */ #ifdef WITNESS struct mtx_debug { struct witness *mtxd_witness; LIST_ENTRY(mtx) mtxd_held; const char *mtxd_file; int mtxd_line; }; #define mtx_held mtx_debug->mtxd_held #define mtx_file mtx_debug->mtxd_file #define mtx_line mtx_debug->mtxd_line #define mtx_witness mtx_debug->mtxd_witness #endif /* WITNESS */ /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) #define RETIP(x) *(((uintptr_t *)(&x)) - 1) -#define SET_PRIO(p, pri) (p)->p_priority = (pri) +#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri) /* * Early WITNESS-enabled declarations. */ #ifdef WITNESS /* * Internal WITNESS routines which must be prototyped early. * * XXX: When/if witness code is cleaned up, it would be wise to place all * witness prototyping early in this file. 
*/ static void witness_init(struct mtx *, int flag); static void witness_destroy(struct mtx *); static void witness_display(void(*)(const char *fmt, ...)); MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure"); /* All mutexes in system (used for debug/panic) */ static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; #else /* WITNESS */ /* XXX XXX XXX * flag++ is sleazoid way of shuting up warning */ #define witness_init(m, flag) flag++ #define witness_destroy(m) #define witness_try_enter(m, t, f, l) #endif /* WITNESS */ /* * All mutex locks in system are kept on the all_mtx list. */ static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), { NULL, NULL }, &all_mtx, &all_mtx, #ifdef WITNESS &all_mtx_debug #else NULL #endif }; /* * Global variables for book keeping. */ static int mtx_cur_cnt; static int mtx_max_cnt; /* * Couple of strings for KTR_LOCK tracing in order to avoid duplicates. */ char STR_mtx_lock_slp[] = "GOT (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_slp[] = "REL (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_lock_spn[] = "GOT (spin) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_spn[] = "REL (spin) %s [%p] r=%d at %s:%d"; /* * Prototypes for non-exported routines. * * NOTE: Prototypes for witness routines are placed at the bottom of the file. */ static void propagate_priority(struct proc *); static void propagate_priority(struct proc *p) { - int pri = p->p_priority; + int pri = p->p_pri.pri_level; struct mtx *m = p->p_blocked; mtx_assert(&sched_lock, MA_OWNED); for (;;) { struct proc *p1; p = mtx_owner(m); if (p == NULL) { /* * This really isn't quite right. Really * ought to bump priority of process that * next acquires the mutex. 
*/ MPASS(m->mtx_lock == MTX_CONTESTED); return; } MPASS(p->p_magic == P_MAGIC); KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex")); - if (p->p_priority <= pri) + if (p->p_pri.pri_level <= pri) return; /* * Bump this process' priority. */ SET_PRIO(p, pri); /* * If lock holder is actually running, just bump priority. */ -#ifdef SMP - /* - * For SMP, we can check the p_oncpu field to see if we are - * running. - */ if (p->p_oncpu != 0xff) { MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); return; } -#else + /* - * For UP, we check to see if p is curproc (this shouldn't - * ever happen however as it would mean we are in a deadlock.) - */ - if (p == curproc) { - panic("Deadlock detected"); - return; - } -#endif - /* * If on run queue move to new run queue, and * quit. */ if (p->p_stat == SRUN) { - printf("XXX: moving proc %d(%s) to a new run queue\n", - p->p_pid, p->p_comm); MPASS(p->p_blocked == NULL); remrunqueue(p); setrunqueue(p); return; } /* * If we aren't blocked on a mutex, we should be. */ KASSERT(p->p_stat == SMTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", p->p_pid, p->p_comm, p->p_stat, m->mtx_description)); /* * Pick up the mutex that p is blocked on. */ m = p->p_blocked; MPASS(m != NULL); - printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, - p->p_comm, m->mtx_description); - /* * Check if the proc needs to be moved up on * the blocked chain */ if (p == TAILQ_FIRST(&m->mtx_blocked)) { - printf("XXX: process at head of run queue\n"); continue; } - p1 = TAILQ_PREV(p, rq, p_procq); - if (p1->p_priority <= pri) { - printf( - "XXX: previous process %d(%s) has higher priority\n", - p->p_pid, p->p_comm); + p1 = TAILQ_PREV(p, procqueue, p_procq); + if (p1->p_pri.pri_level <= pri) { continue; } /* * Remove proc from blocked chain and determine where * it should be moved up to. 
Since we know that p1 has * a lower priority than p, we know that at least one * process in the chain has a lower priority and that * p1 will thus not be NULL after the loop. */ TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { MPASS(p1->p_magic == P_MAGIC); - if (p1->p_priority > pri) + if (p1->p_pri.pri_level > pri) break; } MPASS(p1 != NULL); TAILQ_INSERT_BEFORE(p1, p, p_procq); CTR4(KTR_LOCK, "propagate_priority: p %p moved before %p on [%p] %s", p, p1, m, m->mtx_description); } } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that * if we're called, it's because we know we don't already own this lock. */ int _mtx_trylock(struct mtx *m, int opts, const char *file, int line) { int rval; MPASS(CURPROC != NULL); /* * _mtx_trylock does not accept MTX_NOSWITCH option. */ KASSERT((opts & MTX_NOSWITCH) == 0, ("mtx_trylock() called with invalid option flag(s) %d", opts)); rval = _obtain_lock(m, CURTHD); #ifdef WITNESS if (rval && m->mtx_witness != NULL) { /* * We do not handle recursion in _mtx_trylock; see the * note at the top of the routine. */ KASSERT(!mtx_recursed(m), ("mtx_trylock() called on a recursed mutex")); witness_try_enter(m, (opts | m->mtx_flags), file, line); } #endif /* WITNESS */ if ((opts & MTX_QUIET) == 0) CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d", m->mtx_description, m, rval, file, line); return rval; } /* * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
*/ void _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p = CURPROC; if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: %p contested (lock=%p) [%p]", m, (void *)m->mtx_lock, (void *)RETIP(m)); /* * Save our priority. Even though p_nativepri is protected by * sched_lock, we don't obtain it here as it can be expensive. * Since this is the only place p_nativepri is set, and since two * CPUs will not be executing the same process concurrently, we know * that no other CPU is going to be messing with this. Also, * p_nativepri is only read when we are blocked on a mutex, so that * can't be happening right now either. */ - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; while (!_obtain_lock(m, p)) { uintptr_t v; struct proc *p1; mtx_lock_spin(&sched_lock); /* * Check if the lock has been released while spinning for * the sched_lock. */ if ((v = m->mtx_lock) == MTX_UNOWNED) { mtx_unlock_spin(&sched_lock); continue; } /* * The mutex was marked contested on release. This means that * there are processes blocked on it. */ if (v == MTX_CONTESTED) { p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p1 != NULL); m->mtx_lock = (uintptr_t)p | MTX_CONTESTED; - if (p1->p_priority < p->p_priority) - SET_PRIO(p, p1->p_priority); + if (p1->p_pri.pri_level < p->p_pri.pri_level) + SET_PRIO(p, p1->p_pri.pri_level); mtx_unlock_spin(&sched_lock); return; } /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, (void *)(v | MTX_CONTESTED))) { mtx_unlock_spin(&sched_lock); continue; } /* * We deffinately must sleep for this lock. 
*/ mtx_assert(m, MA_NOTOWNED); #ifdef notyet /* * If we're borrowing an interrupted thread's VM context, we * must clean up before going to sleep. */ if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_lock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif /* * Put us on the list of threads blocked on this mutex. */ if (TAILQ_EMPTY(&m->mtx_blocked)) { p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK); LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested); TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } else { TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) - if (p1->p_priority > p->p_priority) + if (p1->p_pri.pri_level > p->p_pri.pri_level) break; if (p1) TAILQ_INSERT_BEFORE(p1, p, p_procq); else TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } /* * Save who we're blocked on. */ p->p_blocked = m; p->p_mtxname = m->mtx_description; p->p_stat = SMTX; -#if 0 propagate_priority(p); -#endif if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m, m->mtx_description); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p free from blocked on [%p] %s", p, m, m->mtx_description); mtx_unlock_spin(&sched_lock); } return; } /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. 
*/ void _mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file, int line) { int i = 0; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); for (;;) { if (_obtain_lock(m, CURPROC)) break; while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 1000000) continue; if (i++ < 6000000) DELAY(1); #ifdef DDB else if (!db_active) #else else #endif panic("spin lock %s held by %p for > 5 seconds", m->mtx_description, (void *)m->mtx_lock); } } m->mtx_saveintr = mtx_intr; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); return; } /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). */ void _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p, *p1; struct mtx *m1; int pri; p = CURPROC; MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } mtx_lock_spin(&sched_lock); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p->p_magic == P_MAGIC); MPASS(p1->p_magic == P_MAGIC); TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); if (TAILQ_EMPTY(&m->mtx_blocked)) { LIST_REMOVE(m, mtx_contested); _release_lock_quick(m); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); } else atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); - pri = MAXPRI; + pri = PRI_MAX; LIST_FOREACH(m1, &p->p_contested, mtx_contested) { - int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level; if (cp < pri) pri = cp; } - if (pri > p->p_nativepri) - pri = p->p_nativepri; + if (pri > p->p_pri.pri_native) + pri = p->p_pri.pri_native; SET_PRIO(p, pri); 
if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", m, p1); p1->p_blocked = NULL; p1->p_mtxname = NULL; p1->p_stat = SRUN; setrunqueue(p1); - if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { + if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) { #ifdef notyet if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif setrunqueue(p); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, (void *)m->mtx_lock); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } mtx_unlock_spin(&sched_lock); return; } /* * All the unlocking of MTX_SPIN locks is done inline. * See the _rel_spin_lock() macro for the details. */ /* * The INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANTS void _mtx_assert(struct mtx *m, int what, const char *file, int line) { switch ((what)) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned((m))) panic("mutex %s not owned at %s:%d", (m)->mtx_description, file, line); if (mtx_recursed((m))) { if (((what) & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", (m)->mtx_description, file, line); } else if (((what) & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", (m)->mtx_description, file, line); } break; case MA_NOTOWNED: if (mtx_owned((m))) panic("mutex %s owned at %s:%d", (m)->mtx_description, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * The MUTEX_DEBUG-enabled mtx_validate() */ #define MV_DESTROY 0 /* validate before destory */ #define MV_INIT 1 /* validate before init */ #ifdef MUTEX_DEBUG int mtx_validate __P((struct mtx *, int)); int mtx_validate(struct mtx *m, int when) { struct mtx *mp; int i; int 
retval = 0; #ifdef WITNESS if (witness_cold) return 0; #endif if (m == &all_mtx || cold) return 0; mtx_lock(&all_mtx); /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. */ #ifndef __alpha__ MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t), VM_PROT_READ) == 1); #endif MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx); for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { #ifndef __alpha__ if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t), VM_PROT_READ) != 1) { panic("mtx_validate: mp=%p mp->mtx_next=%p", mp, mp->mtx_next); } #endif i++; if (i > mtx_cur_cnt) { panic("mtx_validate: too many in chain, known=%d\n", mtx_cur_cnt); } } MPASS(i == mtx_cur_cnt); switch (when) { case MV_DESTROY: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) break; MPASS(mp == m); break; case MV_INIT: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) { /* * Not good. This mutex already exists. */ printf("re-initing existing mutex %s\n", m->mtx_description); MPASS(m->mtx_lock == MTX_UNOWNED); retval = 1; } } mtx_unlock(&all_mtx); return (retval); } #endif /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and description `description.' * Place on "all_mtx" queue. 
*/ void mtx_init(struct mtx *m, const char *description, int opts) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ if (mtx_validate(m, MV_INIT)) return; #endif bzero((void *)m, sizeof *m); TAILQ_INIT(&m->mtx_blocked); #ifdef WITNESS if (!witness_cold) { m->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(m->mtx_debug != NULL); } #endif m->mtx_description = description; m->mtx_flags = opts; m->mtx_lock = MTX_UNOWNED; /* Put on all mutex queue */ mtx_lock(&all_mtx); m->mtx_next = &all_mtx; m->mtx_prev = all_mtx.mtx_prev; m->mtx_prev->mtx_next = m; all_mtx.mtx_prev = m; if (++mtx_cur_cnt > mtx_max_cnt) mtx_max_cnt = mtx_cur_cnt; mtx_unlock(&all_mtx); #ifdef WITNESS if (!witness_cold) witness_init(m, opts); #endif } /* * Remove lock `m' from all_mtx queue. */ void mtx_destroy(struct mtx *m) { #ifdef WITNESS KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n", __FUNCTION__)); #endif CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description); #ifdef MUTEX_DEBUG if (m->mtx_next == NULL) panic("mtx_destroy: %p (%s) already destroyed", m, m->mtx_description); if (!mtx_owned(m)) { MPASS(m->mtx_lock == MTX_UNOWNED); } else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); } /* diagnostic */ mtx_validate(m, MV_DESTROY); #endif #ifdef WITNESS if (m->mtx_witness) witness_destroy(m); #endif /* WITNESS */ /* Remove from the all mutex queue */ mtx_lock(&all_mtx); m->mtx_next->mtx_prev = m->mtx_prev; m->mtx_prev->mtx_next = m->mtx_next; #ifdef MUTEX_DEBUG m->mtx_next = m->mtx_prev = NULL; #endif #ifdef WITNESS free(m->mtx_debug, M_WITNESS); m->mtx_debug = NULL; #endif mtx_cur_cnt--; mtx_unlock(&all_mtx); } /* * The WITNESS-enabled diagnostic code. 
*/ #ifdef WITNESS static void witness_fixup(void *dummy __unused) { struct mtx *mp; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock(&all_mtx); /* Iterate through all mutexes and finish up mutex initialization. */ for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { mp->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(mp->mtx_debug != NULL); witness_init(mp, mp->mtx_flags); } mtx_unlock(&all_mtx); /* Mark the witness code as being ready for use. */ atomic_store_rel_int(&witness_cold, 0); mtx_lock(&Giant); } SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL) #define WITNESS_COUNT 200 #define WITNESS_NCHILDREN 2 int witness_watch = 1; struct witness { struct witness *w_next; const char *w_description; const char *w_file; int w_line; struct witness *w_morechildren; u_char w_childcnt; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_spin:1; /* MTX_SPIN type mutex. */ u_int w_level; struct witness *w_children[WITNESS_NCHILDREN]; }; struct witness_blessed { char *b_lock1; char *b_lock2; }; #ifdef DDB /* * When DDB is enabled and witness_ddb is set to 1, it will cause the system to * drop into kdebug() when: * - a lock heirarchy violation occurs * - locks are held when going to sleep. 
*/ int witness_ddb; #ifdef WITNESS_DDB TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb); #else TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb); #endif SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); #endif /* DDB */ int witness_skipspin; #ifdef WITNESS_SKIPSPIN TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin); #else TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin); #endif SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, ""); /* * Witness-enabled globals */ static struct mtx w_mtx; static struct witness *w_free; static struct witness *w_all; static int w_inited; static int witness_dead; /* fatal error, probably no memory */ static struct witness w_data[WITNESS_COUNT]; /* * Internal witness routine prototypes */ static struct witness *enroll(const char *description, int flag); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int dup_ok(struct witness *); static int blessed(struct witness *, struct witness *); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *); static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static struct witness * witness_get(void); static void witness_free(struct witness *m); static char *ignore_list[] = { "witness lock", NULL }; static char *spin_order_list[] = { #if defined(__i386__) && defined (SMP) "com", #endif "sio", #ifdef __i386__ "cy", #endif "sched lock", #ifdef __i386__ "clk", #endif "callout", /* * leaf locks */ "ithread table lock", "ithread list lock", #ifdef SMP #ifdef __i386__ "ap boot", "imen", #endif "smp rendezvous", #endif NULL }; static char *order_list[] = { "Giant", "proctree", "allproc", 
"process lock", "uidinfo hash", "uidinfo struct", NULL, NULL }; static char *dup_list[] = { NULL }; static char *sleep_list[] = { "Giant", NULL }; /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); static void witness_init(struct mtx *m, int flag) { m->mtx_witness = enroll(m->mtx_description, flag); } static void witness_destroy(struct mtx *m) { struct mtx *m1; struct proc *p; p = CURPROC; LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) { if (m1 == m) { LIST_REMOVE(m, mtx_held); break; } } return; } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w, *w1; int level, found; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); witness_levelall(); /* * First, handle sleep mutexes which have been acquired at least * once. */ prnt("Sleep mutexes:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w); } /* * Now do spin mutexes which have been acquired at least once. */ prnt("\nSpin mutexes:\n"); level = 0; while (level < sizeof(spin_order_list) / sizeof(char *)) { found = 0; for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || !w->w_spin) continue; if (w->w_level == 1 << level) { witness_displaydescendants(prnt, w); level++; found = 1; } } if (found == 0) level++; } /* * Finally, any mutexes which have not been acquired yet. 
*/ prnt("\nMutexes which were never acquired:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file != NULL) continue; prnt("%s\n", w->w_description); } } void witness_enter(struct mtx *m, int flags, const char *file, int line) { struct witness *w, *w1; struct mtx *m1; struct proc *p; int i; #ifdef DDB int go_into_ddb = 0; #endif /* DDB */ if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; p = CURPROC; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); i = PCPU_GET(witness_spin_check); if (i != 0 && w->w_level < i) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); panic("mutex_enter(%s:%x, MTX_SPIN) out of order @" " %s:%d already holding %s:%x", m->mtx_description, w->w_level, file, line, spin_order_list[ffs(i)-1], i); } PCPU_SET(witness_spin_check, i | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if (witness_dead) goto out; if (cold) goto out; if (!mtx_legal2block()) panic("blockable mtx_lock() of %s when not legal @ %s:%d", m->mtx_description, file, line); /* * Is this the first mutex acquired */ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) goto out; if ((w1 = m1->mtx_witness) == w) { if (w->w_same_squawked || dup_ok(w)) goto out; w->w_same_squawked = 1; printf("acquring duplicate lock of same type: \"%s\"\n", m->mtx_description); 
printf(" 1st @ %s:%d\n", w->w_file, w->w_line); printf(" 2nd @ %s:%d\n", file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); /* * If we have a known higher number just say ok */ if (witness_watch > 1 && w->w_level > w1->w_level) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } if (isitmydescendant(m1->mtx_witness, w)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { MPASS(i < 200); w1 = m1->mtx_witness; if (isitmydescendant(w, w1)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (blessed(w, w1)) goto out; if (m1 == &Giant) { if (w1->w_Giant_squawked) goto out; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) goto out; else w1->w_other_squawked = 1; } printf("lock order reversal\n"); printf(" 1st %s last acquired @ %s:%d\n", w->w_description, w->w_file, w->w_line); printf(" 2nd %p %s @ %s:%d\n", m1, w1->w_description, w1->w_file, w1->w_line); printf(" 3rd %p %s @ %s:%d\n", m, w->w_description, file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } } m1 = LIST_FIRST(&p->p_heldmtx); if (!itismychild(m1->mtx_witness, w)) mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); out: #ifdef DDB if (witness_ddb && go_into_ddb) Debugger("witness_enter"); #endif /* DDB */ w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; /* * If this pays off it likely means that a mutex being witnessed * is acquired in hardclock. Put it in the ignore list. It is * likely not the mutex this assert fails on. 
*/ MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_try_enter(struct mtx *m, int flags, const char *file, int line) { struct proc *p; struct witness *w = m->mtx_witness; if (witness_cold) return; if (panicstr) return; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_try_enter: " "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on" " non-recursive mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; p = CURPROC; MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_exit(struct mtx *m, int flags, const char *file, int line) { struct witness *w; if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) & ~w->w_level); 
mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) panic("switchable mtx_unlock() of %s when not legal @ %s:%d", m->mtx_description, file, line); LIST_REMOVE(m, mtx_held); m->mtx_held.le_prev = NULL; } int witness_sleep(int check_only, struct mtx *mtx, const char *file, int line) { struct mtx *m; struct proc *p; char **sleep; int n = 0; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); p = CURPROC; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { if (m == mtx) continue; for (sleep = sleep_list; *sleep!= NULL; sleep++) if (strcmp(m->mtx_description, *sleep) == 0) goto next; if (n == 0) printf("Whee!\n"); printf("%s:%d: %s with \"%s\" locked from %s:%d\n", file, line, check_only ? 
"could sleep" : "sleeping", m->mtx_description, m->mtx_witness->w_file, m->mtx_witness->w_line); n++; next: } #ifdef DDB if (witness_ddb && n) Debugger("witness_sleep"); #endif /* DDB */ return (n); } static struct witness * enroll(const char *description, int flag) { int i; struct witness *w, *w1; char **ignore; char **order; if (!witness_watch) return (NULL); for (ignore = ignore_list; *ignore != NULL; ignore++) if (strcmp(description, *ignore) == 0) return (NULL); if (w_inited == 0) { mtx_init(&w_mtx, "witness lock", MTX_SPIN); for (i = 0; i < WITNESS_COUNT; i++) { w = &w_data[i]; witness_free(w); } w_inited = 1; for (order = order_list; *order != NULL; order++) { w = enroll(*order, MTX_DEF); w->w_file = "order list"; for (order++; *order != NULL; order++) { w1 = enroll(*order, MTX_DEF); w1->w_file = "order list"; itismychild(w, w1); w = w1; } } } if ((flag & MTX_SPIN) && witness_skipspin) return (NULL); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); for (w = w_all; w; w = w->w_next) { if (strcmp(description, w->w_description) == 0) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return (w); } } if ((w = witness_get()) == NULL) return (NULL); w->w_next = w_all; w_all = w; w->w_description = description; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (flag & MTX_SPIN) { w->w_spin = 1; i = 1; for (order = spin_order_list; *order != NULL; order++) { if (strcmp(description, *order) == 0) break; i <<= 1; } if (*order == NULL) panic("spin lock %s not in order list", description); w->w_level = i; } return (w); } static int itismychild(struct witness *parent, struct witness *child) { static int recursed; /* * Insert "child" after "parent" */ while (parent->w_morechildren) parent = parent->w_morechildren; if (parent->w_childcnt == WITNESS_NCHILDREN) { if ((parent->w_morechildren = witness_get()) == NULL) return (1); parent = parent->w_morechildren; } MPASS(child != NULL); parent->w_children[parent->w_childcnt++] = child; /* * now prune whole tree */ if (recursed) return (0); recursed 
= 1; for (child = w_all; child != NULL; child = child->w_next) { for (parent = w_all; parent != NULL; parent = parent->w_next) { if (!isitmychild(parent, child)) continue; removechild(parent, child); if (isitmydescendant(parent, child)) continue; itismychild(parent, child); } } recursed = 0; witness_levelall(); return (0); } static void removechild(struct witness *parent, struct witness *child) { struct witness *w, *w1; int i; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) if (w->w_children[i] == child) goto found; return; found: for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) continue; w->w_children[i] = w1->w_children[--w1->w_childcnt]; MPASS(w->w_children[i] != NULL); if (w1->w_childcnt != 0) return; if (w1 == parent) return; for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) continue; w->w_morechildren = 0; witness_free(w1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness *w; int i; for (w = parent; w != NULL; w = w->w_morechildren) { for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness *w; int i; int j; for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { MPASS(j < 1000); for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } for (i = 0; i < w->w_childcnt; i++) { if (isitmydescendant(w->w_children[i], child)) return (1); } } return (0); } void witness_levelall (void) { struct witness *w, *w1; for (w = w_all; w; w = w->w_next) if (!(w->w_spin)) w->w_level = 0; for (w = w_all; w; w = w->w_next) { if (w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; witness_leveldescendents(w, 0); } } static void witness_leveldescendents(struct witness *parent, int level) { int i; struct witness *w; if (parent->w_level < 
level) parent->w_level = level; level++; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_leveldescendents(w->w_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent) { struct witness *w; int i; int level; level = parent->w_spin ? ffs(parent->w_level) : parent->w_level; prnt("%d", level); if (level < 10) prnt(" "); for (i = 0; i < level; i++) prnt(" "); prnt("%s", parent->w_description); if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d\n", parent->w_file, parent->w_line); for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_displaydescendants(prnt, w->w_children[i]); } static int dup_ok(struct witness *w) { char **dup; for (dup = dup_list; *dup!= NULL; dup++) if (strcmp(w->w_description, *dup) == 0) return (1); return (0); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_description, b->b_lock1) == 0) { if (strcmp(w2->w_description, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_description, b->b_lock2) == 0) if (strcmp(w2->w_description, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get() { struct witness *w; if ((w = w_free) == NULL) { witness_dead = 1; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); printf("witness exhausted\n"); return (NULL); } w_free = w->w_next; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { w->w_next = w_free; w_free = w; } int witness_list(struct proc *p) { struct mtx *m; int nheld; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); nheld = 0; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { printf("\t\"%s\" (%p) locked at %s:%d\n", m->mtx_description, m, m->mtx_witness->w_file, m->mtx_witness->w_line); nheld++; } return (nheld); } #ifdef DDB DB_SHOW_COMMAND(mutexes, 
db_witness_list) { witness_list(CURPROC); } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif void witness_save(struct mtx *m, const char **filep, int *linep) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; *filep = m->mtx_witness->w_file; *linep = m->mtx_witness->w_line; } void witness_restore(struct mtx *m, const char *file, int line) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; m->mtx_witness->w_file = file; m->mtx_witness->w_line = line; } #endif /* WITNESS */ Index: head/sys/kern/kern_proc.c =================================================================== --- head/sys/kern/kern_proc.c (revision 72375) +++ head/sys/kern/kern_proc.c (revision 72376) @@ -1,699 +1,696 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); MALLOC_DEFINE(M_SESSION, "session", "session header"); static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); int ps_showallprocs = 1; SYSCTL_INT(_kern, OID_AUTO, ps_showallprocs, CTLFLAG_RW, &ps_showallprocs, 0, ""); static void pgdelete __P((struct pgrp *)); static void orphanpg __P((struct pgrp *pg)); /* * Other process lists */ struct pidhashhead *pidhashtbl; u_long pidhash; struct pgrphashhead *pgrphashtbl; u_long pgrphash; struct proclist allproc; struct proclist zombproc; struct lock allproc_lock; struct lock proctree_lock; vm_zone_t proc_zone; vm_zone_t ithread_zone; /* * Initialize global process hashing structures. 
*/ void procinit() { lockinit(&allproc_lock, PZERO, "allproc", 0, 0); lockinit(&proctree_lock, PZERO, "proctree", 0, 0); LIST_INIT(&allproc); LIST_INIT(&zombproc); pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); proc_zone = zinit("PROC", sizeof (struct proc), 0, 0, 5); uihashinit(); /* * This should really be a compile time warning, but I do * not know of any way to do that... */ if (sizeof(struct kinfo_proc) != KINFO_PROC_SIZE) printf("WARNING: size of kinfo_proc (%ld) should be %d!!!\n", (long)sizeof(struct kinfo_proc), KINFO_PROC_SIZE); } /* * Is p an inferior of the current process? */ int inferior(p) register struct proc *p; { int rval = 1; PROCTREE_LOCK(PT_SHARED); for (; p != curproc; p = p->p_pptr) if (p->p_pid == 0) { rval = 0; break; } PROCTREE_LOCK(PT_RELEASE); return (rval); } /* * Locate a process by number */ struct proc * pfind(pid) register pid_t pid; { register struct proc *p; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, PIDHASH(pid), p_hash) if (p->p_pid == pid) break; ALLPROC_LOCK(AP_RELEASE); return (p); } /* * Locate a process group by number */ struct pgrp * pgfind(pgid) register pid_t pgid; { register struct pgrp *pgrp; LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) if (pgrp->pg_id == pgid) return (pgrp); return (NULL); } /* * Move p to a new or existing process group (and session) */ int enterpgrp(p, pgid, mksess) register struct proc *p; pid_t pgid; int mksess; { register struct pgrp *pgrp = pgfind(pgid); KASSERT(pgrp == NULL || !mksess, ("enterpgrp: setsid into non-empty pgrp")); KASSERT(!SESS_LEADER(p), ("enterpgrp: session leader attempted setpgrp")); if (pgrp == NULL) { pid_t savepid = p->p_pid; struct proc *np; /* * new process group */ KASSERT(p->p_pid == pgid, ("enterpgrp: new pgrp and pid != pgid")); MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, M_WAITOK); if ((np = pfind(savepid)) == NULL || np != p) return (ESRCH); if (mksess) { register struct session *sess; /* * 
new session */ MALLOC(sess, struct session *, sizeof(struct session), M_SESSION, M_WAITOK); sess->s_leader = p; sess->s_sid = p->p_pid; sess->s_count = 1; sess->s_ttyvp = NULL; sess->s_ttyp = NULL; bcopy(p->p_session->s_login, sess->s_login, sizeof(sess->s_login)); p->p_flag &= ~P_CONTROLT; pgrp->pg_session = sess; KASSERT(p == curproc, ("enterpgrp: mksession and p != curproc")); } else { pgrp->pg_session = p->p_session; pgrp->pg_session->s_count++; } pgrp->pg_id = pgid; LIST_INIT(&pgrp->pg_members); LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); pgrp->pg_jobc = 0; SLIST_INIT(&pgrp->pg_sigiolst); } else if (pgrp == p->p_pgrp) return (0); /* * Adjust eligibility of affected pgrps to participate in job control. * Increment eligibility counts before decrementing, otherwise we * could reach 0 spuriously during the first call. */ fixjobc(p, pgrp, 1); fixjobc(p, p->p_pgrp, 0); LIST_REMOVE(p, p_pglist); if (LIST_EMPTY(&p->p_pgrp->pg_members)) pgdelete(p->p_pgrp); p->p_pgrp = pgrp; LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); return (0); } /* * remove process from process group */ int leavepgrp(p) register struct proc *p; { LIST_REMOVE(p, p_pglist); if (LIST_EMPTY(&p->p_pgrp->pg_members)) pgdelete(p->p_pgrp); p->p_pgrp = 0; return (0); } /* * delete a process group */ static void pgdelete(pgrp) register struct pgrp *pgrp; { /* * Reset any sigio structures pointing to us as a result of * F_SETOWN with our pgid. */ funsetownlst(&pgrp->pg_sigiolst); if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); if (--pgrp->pg_session->s_count == 0) FREE(pgrp->pg_session, M_SESSION); FREE(pgrp, M_PGRP); } /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" * the group for terminal job control (those with a parent in a different * process group of the same session). 
If that count reaches zero, the * process group becomes orphaned. Check both the specified process' * process group and that of its children. * entering == 0 => p is leaving specified group. * entering == 1 => p is entering specified group. */ void fixjobc(p, pgrp, entering) register struct proc *p; register struct pgrp *pgrp; int entering; { register struct pgrp *hispgrp; register struct session *mysession = pgrp->pg_session; /* * Check p's parent to see whether p qualifies its own process * group; if so, adjust count for p's process group. */ PROCTREE_LOCK(PT_SHARED); if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && hispgrp->pg_session == mysession) { if (entering) pgrp->pg_jobc++; else if (--pgrp->pg_jobc == 0) orphanpg(pgrp); } /* * Check this process' children to see whether they qualify * their process groups; if so, adjust counts for children's * process groups. */ LIST_FOREACH(p, &p->p_children, p_sibling) if ((hispgrp = p->p_pgrp) != pgrp && hispgrp->pg_session == mysession && p->p_stat != SZOMB) { if (entering) hispgrp->pg_jobc++; else if (--hispgrp->pg_jobc == 0) orphanpg(hispgrp); } PROCTREE_LOCK(PT_RELEASE); } /* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. 
*/ static void orphanpg(pg) struct pgrp *pg; { register struct proc *p; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (p->p_stat == SSTOP) { LIST_FOREACH(p, &pg->pg_members, p_pglist) { psignal(p, SIGHUP); psignal(p, SIGCONT); } return; } } } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; register int i; for (i = 0; i <= pgrphash; i++) { if (!LIST_EMPTY(&pgrphashtbl[i])) { printf("\tindx %d\n", i); LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) { printf( "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", (void *)pgrp, (long)pgrp->pg_id, (void *)pgrp->pg_session, pgrp->pg_session->s_count, (void *)LIST_FIRST(&pgrp->pg_members)); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { printf("\t\tpid %ld addr %p pgrp %p\n", (long)p->p_pid, (void *)p, (void *)p->p_pgrp); } } } } } #endif /* DDB */ /* * Fill in an kinfo_proc structure for the specified process. */ void fill_kinfo_proc(p, kp) struct proc *p; struct kinfo_proc *kp; { struct tty *tp; struct session *sp; bzero(kp, sizeof(*kp)); kp->ki_structsize = sizeof(*kp); kp->ki_paddr = p; PROC_LOCK(p); kp->ki_addr = p->p_addr; kp->ki_args = p->p_args; kp->ki_tracep = p->p_tracep; kp->ki_textvp = p->p_textvp; kp->ki_fd = p->p_fd; kp->ki_vmspace = p->p_vmspace; if (p->p_cred) { kp->ki_uid = p->p_cred->pc_ucred->cr_uid; kp->ki_ruid = p->p_cred->p_ruid; kp->ki_svuid = p->p_cred->p_svuid; kp->ki_ngroups = p->p_cred->pc_ucred->cr_ngroups; bcopy(p->p_cred->pc_ucred->cr_groups, kp->ki_groups, NGROUPS * sizeof(gid_t)); kp->ki_rgid = p->p_cred->p_rgid; kp->ki_svgid = p->p_cred->p_svgid; } if (p->p_procsig) { kp->ki_sigignore = p->p_procsig->ps_sigignore; kp->ki_sigcatch = p->p_procsig->ps_sigcatch; } mtx_lock_spin(&sched_lock); if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { struct vmspace *vm = p->p_vmspace; kp->ki_size = vm->vm_map.size; kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/ kp->ki_swrss = vm->vm_swrss; 
kp->ki_tsize = vm->vm_tsize; kp->ki_dsize = vm->vm_dsize; kp->ki_ssize = vm->vm_ssize; } if ((p->p_sflag & PS_INMEM) && p->p_stats) { kp->ki_start = p->p_stats->p_start; kp->ki_rusage = p->p_stats->p_ru; kp->ki_childtime.tv_sec = p->p_stats->p_cru.ru_utime.tv_sec + p->p_stats->p_cru.ru_stime.tv_sec; kp->ki_childtime.tv_usec = p->p_stats->p_cru.ru_utime.tv_usec + p->p_stats->p_cru.ru_stime.tv_usec; } if (p->p_wmesg) { strncpy(kp->ki_wmesg, p->p_wmesg, WMESGLEN); kp->ki_wmesg[WMESGLEN] = 0; } if (p->p_stat == SMTX) { kp->ki_kiflag |= KI_MTXBLOCK; strncpy(kp->ki_mtxname, p->p_mtxname, MTXNAMELEN); kp->ki_mtxname[MTXNAMELEN] = 0; } kp->ki_stat = p->p_stat; kp->ki_sflag = p->p_sflag; kp->ki_pctcpu = p->p_pctcpu; kp->ki_estcpu = p->p_estcpu; kp->ki_slptime = p->p_slptime; kp->ki_swtime = p->p_swtime; kp->ki_wchan = p->p_wchan; kp->ki_traceflag = p->p_traceflag; - kp->ki_priority = p->p_priority; - kp->ki_usrpri = p->p_usrpri; - kp->ki_nativepri = p->p_nativepri; + kp->ki_pri = p->p_pri; kp->ki_nice = p->p_nice; - kp->ki_rtprio = p->p_rtprio; kp->ki_runtime = p->p_runtime; kp->ki_pid = p->p_pid; kp->ki_rqindex = p->p_rqindex; kp->ki_oncpu = p->p_oncpu; kp->ki_lastcpu = p->p_lastcpu; mtx_unlock_spin(&sched_lock); sp = NULL; if (p->p_pgrp) { kp->ki_pgid = p->p_pgrp->pg_id; kp->ki_jobc = p->p_pgrp->pg_jobc; sp = p->p_pgrp->pg_session; if (sp != NULL) { kp->ki_sid = sp->s_sid; bcopy(sp->s_login, kp->ki_login, sizeof(kp->ki_login)); if (sp->s_ttyvp) kp->ki_kiflag = KI_CTTY; if (SESS_LEADER(p)) kp->ki_kiflag |= KI_SLEADER; } } if ((p->p_flag & P_CONTROLT) && sp && ((tp = sp->s_ttyp) != NULL)) { kp->ki_tdev = dev2udev(tp->t_dev); kp->ki_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; if (tp->t_session) kp->ki_tsid = tp->t_session->s_sid; } else kp->ki_tdev = NOUDEV; if (p->p_comm[0] != 0) { strncpy(kp->ki_comm, p->p_comm, MAXCOMLEN); kp->ki_comm[MAXCOMLEN] = 0; } kp->ki_siglist = p->p_siglist; kp->ki_sigmask = p->p_sigmask; kp->ki_xstat = p->p_xstat; kp->ki_acflag = p->p_acflag; kp->ki_flag = p->p_flag; kp->ki_lock = p->p_lock; PROC_UNLOCK(p); PROCTREE_LOCK(PT_SHARED); if (p->p_pptr) kp->ki_ppid = p->p_pptr->p_pid; PROCTREE_LOCK(PT_RELEASE); } /* * Locate a zombie process by number */ struct proc * zpfind(pid_t pid) { struct proc *p; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &zombproc, p_list) if (p->p_pid == pid) break; ALLPROC_LOCK(AP_RELEASE); return (p); } static int sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) { struct kinfo_proc kinfo_proc; int error; pid_t pid = p->p_pid; fill_kinfo_proc(p, &kinfo_proc); error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc, sizeof(kinfo_proc)); if (error) return (error); if (!doingzomb && pid && (pfind(pid) != p)) return EAGAIN; if (doingzomb && zpfind(pid) != p) return EAGAIN; return (0); } static int sysctl_kern_proc(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; int doingzomb; int error = 0; if (oidp->oid_number == KERN_PROC_PID) { if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (0); if (p_can(curproc, p, P_CAN_SEE, NULL)) return (0); error = sysctl_out_proc(p, req, 0); return (error); } if (oidp->oid_number == KERN_PROC_ALL && !namelen) ; else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) ; else return (EINVAL); if (!req->oldptr) { /* overestimate by 5 procs */ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); if (error) return (error); } ALLPROC_LOCK(AP_SHARED); for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { if (!doingzomb) p = LIST_FIRST(&allproc); else p = LIST_FIRST(&zombproc); for (; p != 0; p = LIST_NEXT(p, p_list)) { /* * Show a user only appropriate processes. 
*/ if (p_can(curproc, p, P_CAN_SEE, NULL)) continue; /* * Skip embryonic processes. */ if (p->p_stat == SIDL) continue; /* * TODO - make more efficient (see notes below). * do by session. */ switch (oidp->oid_number) { case KERN_PROC_PGRP: /* could do this by traversing pgrp */ if (p->p_pgrp == NULL || p->p_pgrp->pg_id != (pid_t)name[0]) continue; break; case KERN_PROC_TTY: if ((p->p_flag & P_CONTROLT) == 0 || p->p_session == NULL || p->p_session->s_ttyp == NULL || dev2udev(p->p_session->s_ttyp->t_dev) != (udev_t)name[0]) continue; break; case KERN_PROC_UID: if (p->p_ucred == NULL || p->p_ucred->cr_uid != (uid_t)name[0]) continue; break; case KERN_PROC_RUID: if (p->p_ucred == NULL || p->p_cred->p_ruid != (uid_t)name[0]) continue; break; } if (p_can(curproc, p, P_CAN_SEE, NULL)) continue; error = sysctl_out_proc(p, req, doingzomb); if (error) { ALLPROC_LOCK(AP_RELEASE); return (error); } } } ALLPROC_LOCK(AP_RELEASE); return (0); } /* * This sysctl allows a process to retrieve the argument list or process * title for another process without groping around in the address space * of the other process. It also allow a process to set its own "process * title to a string of its own choice. 
*/ static int sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS) { int *name = (int*) arg1; u_int namelen = arg2; struct proc *p; struct pargs *pa; int error = 0; if (namelen != 1) return (EINVAL); p = pfind((pid_t)name[0]); if (!p) return (0); if ((!ps_argsopen) && p_can(curproc, p, P_CAN_SEE, NULL)) return (0); if (req->newptr && curproc != p) return (EPERM); if (req->oldptr && p->p_args != NULL) error = SYSCTL_OUT(req, p->p_args->ar_args, p->p_args->ar_length); if (req->newptr == NULL) return (error); if (p->p_args && --p->p_args->ar_ref == 0) FREE(p->p_args, M_PARGS); p->p_args = NULL; if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit) return (error); MALLOC(pa, struct pargs *, sizeof(struct pargs) + req->newlen, M_PARGS, M_WAITOK); pa->ar_ref = 1; pa->ar_length = req->newlen; error = SYSCTL_IN(req, pa->ar_args, req->newlen); if (!error) p->p_args = pa; else FREE(pa, M_PARGS); return (error); } SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_proc, "S,proc", "Return entire process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, sysctl_kern_proc, "Process table"); SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args, CTLFLAG_RW | CTLFLAG_ANYBODY, sysctl_kern_proc_args, "Process argument list"); Index: head/sys/kern/kern_resource.c =================================================================== --- head/sys/kern/kern_resource.c (revision 72375) +++ head/sys/kern/kern_resource.c (revision 72376) @@ -1,861 +1,895 @@ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the 
University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_rlimit.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int donice __P((struct proc *curp, struct proc *chgp, int n)); /* dosetrlimit non-static: Needed by SysVR4 emulator */ int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures"); #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) static struct mtx uihashtbl_mtx; static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; static u_long uihash; /* size of hash table - 1 */ static struct uidinfo *uicreate __P((uid_t uid)); static struct uidinfo *uilookup __P((uid_t uid)); /* * Resource controls and accounting. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getpriority_args { int which; int who; }; #endif int getpriority(curp, uap) struct proc *curp; register struct getpriority_args *uap; { register struct proc *p; register int low = PRIO_MAX + 1; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) p = curp; else p = pfind(uap->who); if (p == 0) break; if (p_can(curp, p, P_CAN_SEE, NULL)) break; low = p->p_nice; break; case PRIO_PGRP: { register struct pgrp *pg; if (uap->who == 0) pg = curp->p_pgrp; else if ((pg = pgfind(uap->who)) == NULL) break; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (!p_can(curp, p, P_CAN_SEE, NULL) && p->p_nice < low) low = p->p_nice; } break; } case PRIO_USER: if (uap->who == 0) uap->who = curp->p_ucred->cr_uid; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) if (!p_can(curp, p, P_CAN_SEE, NULL) && p->p_ucred->cr_uid == uap->who && p->p_nice < low) low = p->p_nice; ALLPROC_LOCK(AP_RELEASE); break; default: return (EINVAL); } if (low == PRIO_MAX + 1) return (ESRCH); curp->p_retval[0] = low; return (0); } #ifndef _SYS_SYSPROTO_H_ struct setpriority_args { int which; int who; int prio; }; #endif /* ARGSUSED */ int setpriority(curp, uap) struct proc *curp; register struct setpriority_args *uap; { register struct proc *p; int found = 0, error = 0; switch (uap->which) { case PRIO_PROCESS: if (uap->who == 0) p = curp; else p = pfind(uap->who); if (p == 0) break; if (p_can(curp, p, P_CAN_SEE, NULL)) break; error = donice(curp, p, uap->prio); found++; break; case PRIO_PGRP: { register struct pgrp *pg; if (uap->who == 0) pg = curp->p_pgrp; else if ((pg = pgfind(uap->who)) == NULL) break; LIST_FOREACH(p, &pg->pg_members, p_pglist) { if (!p_can(curp, p, P_CAN_SEE, NULL)) { error = donice(curp, p, uap->prio); found++; } } break; } case PRIO_USER: if (uap->who == 0) uap->who = curp->p_ucred->cr_uid; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) if (p->p_ucred->cr_uid == uap->who && !p_can(curp, p, P_CAN_SEE, NULL)) { error = 
donice(curp, p, uap->prio); found++; } ALLPROC_LOCK(AP_RELEASE); break; default: return (EINVAL); } if (found == 0) return (ESRCH); return (error); } static int donice(curp, chgp, n) register struct proc *curp, *chgp; register int n; { int error; if ((error = p_can(curp, chgp, P_CAN_SCHED, NULL))) return (error); if (n > PRIO_MAX) n = PRIO_MAX; if (n < PRIO_MIN) n = PRIO_MIN; if (n < chgp->p_nice && suser(curp)) return (EACCES); chgp->p_nice = n; (void)resetpriority(chgp); return (0); } /* rtprio system call */ #ifndef _SYS_SYSPROTO_H_ struct rtprio_args { int function; pid_t pid; struct rtprio *rtp; }; #endif /* * Set realtime priority */ /* ARGSUSED */ int rtprio(curp, uap) struct proc *curp; register struct rtprio_args *uap; { register struct proc *p; struct rtprio rtp; int error; error = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); if (error) return (error); if (uap->pid == 0) p = curp; else p = pfind(uap->pid); if (p == 0) return (ESRCH); switch (uap->function) { case RTP_LOOKUP: - return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio))); + pri_to_rtp(&p->p_pri, &rtp); + return (copyout(&rtp, uap->rtp, sizeof(struct rtprio))); case RTP_SET: if ((error = p_can(curp, p, P_CAN_SCHED, NULL))) return (error); /* disallow setting rtprio in most cases if not superuser */ if (suser(curp) != 0) { /* can't set someone else's */ if (uap->pid) return (EPERM); /* can't set realtime priority */ /* * Realtime priority has to be restricted for reasons which should be * obvious. However, for idle priority, there is a potential for * system deadlock if an idleprio process gains a lock on a resource * that other processes need (and the idleprio process can't run * due to a CPU-bound normal process). Fix me! 
XXX */ #if 0 if (RTP_PRIO_IS_REALTIME(rtp.type)) #endif if (rtp.type != RTP_PRIO_NORMAL) return (EPERM); } - switch (rtp.type) { -#ifdef RTP_PRIO_FIFO - case RTP_PRIO_FIFO: -#endif - case RTP_PRIO_REALTIME: - case RTP_PRIO_NORMAL: - case RTP_PRIO_IDLE: - if (rtp.prio > RTP_PRIO_MAX) - return (EINVAL); - p->p_rtprio = rtp; + if (rtp_to_pri(&rtp, &p->p_pri) == 0) return (0); - default: - return (EINVAL); - } - + return (EINVAL); default: return (EINVAL); } +} + +int +rtp_to_pri(struct rtprio *rtp, struct priority *pri) +{ + + if (rtp->prio > RTP_PRIO_MAX) + return (-1); + switch (RTP_PRIO_BASE(rtp->type)) { + case RTP_PRIO_REALTIME: + pri->pri_level = PRI_MIN_REALTIME + rtp->prio; + break; + case RTP_PRIO_NORMAL: + pri->pri_level = PRI_MIN_TIMESHARE + rtp->prio; + break; + case RTP_PRIO_IDLE: + pri->pri_level = PRI_MIN_IDLE + rtp->prio; + break; + default: + return (-1); + } + pri->pri_class = rtp->type; + pri->pri_native = pri->pri_level; + pri->pri_user = pri->pri_level; + return (0); +} + +void +pri_to_rtp(struct priority *pri, struct rtprio *rtp) +{ + + switch (PRI_BASE(pri->pri_class)) { + case PRI_REALTIME: + rtp->prio = pri->pri_level - PRI_MIN_REALTIME; + break; + case PRI_TIMESHARE: + rtp->prio = pri->pri_level - PRI_MIN_TIMESHARE; + break; + case PRI_IDLE: + rtp->prio = pri->pri_level - PRI_MIN_IDLE; + break; + default: + break; + } + rtp->type = pri->pri_class; } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct osetrlimit_args { u_int which; struct orlimit *rlp; }; #endif /* ARGSUSED */ int osetrlimit(p, uap) struct proc *p; register struct osetrlimit_args *uap; { struct orlimit olim; struct rlimit lim; int error; if ((error = copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; return (dosetrlimit(p, uap->which, &lim)); } #ifndef _SYS_SYSPROTO_H_ struct ogetrlimit_args { u_int which; struct orlimit *rlp; }; #endif /* ARGSUSED */ int 
ogetrlimit(p, uap) struct proc *p; register struct ogetrlimit_args *uap; { struct orlimit olim; if (uap->which >= RLIM_NLIMITS) return (EINVAL); olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; if (olim.rlim_cur == -1) olim.rlim_cur = 0x7fffffff; olim.rlim_max = p->p_rlimit[uap->which].rlim_max; if (olim.rlim_max == -1) olim.rlim_max = 0x7fffffff; return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct __setrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int setrlimit(p, uap) struct proc *p; register struct __setrlimit_args *uap; { struct rlimit alim; int error; if ((error = copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) return (error); return (dosetrlimit(p, uap->which, &alim)); } int dosetrlimit(p, which, limp) struct proc *p; u_int which; struct rlimit *limp; { register struct rlimit *alimp; int error; if (which >= RLIM_NLIMITS) return (EINVAL); alimp = &p->p_rlimit[which]; /* * Preserve historical bugs by treating negative limits as unsigned. 
*/ if (limp->rlim_cur < 0) limp->rlim_cur = RLIM_INFINITY; if (limp->rlim_max < 0) limp->rlim_max = RLIM_INFINITY; if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) if ((error = suser_xxx(0, p, PRISON_ROOT))) return (error); if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; if (p->p_limit->p_refcnt > 1 && (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { p->p_limit->p_refcnt--; p->p_limit = limcopy(p->p_limit); alimp = &p->p_rlimit[which]; } switch (which) { case RLIMIT_CPU: if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000) p->p_limit->p_cpulimit = RLIM_INFINITY; else p->p_limit->p_cpulimit = (rlim_t)1000000 * limp->rlim_cur; break; case RLIMIT_DATA: if (limp->rlim_cur > MAXDSIZ) limp->rlim_cur = MAXDSIZ; if (limp->rlim_max > MAXDSIZ) limp->rlim_max = MAXDSIZ; break; case RLIMIT_STACK: if (limp->rlim_cur > MAXSSIZ) limp->rlim_cur = MAXSSIZ; if (limp->rlim_max > MAXSSIZ) limp->rlim_max = MAXSSIZ; /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. If stack limit is going * up make more accessible, if going down make inaccessible. 
*/ if (limp->rlim_cur != alimp->rlim_cur) { vm_offset_t addr; vm_size_t size; vm_prot_t prot; if (limp->rlim_cur > alimp->rlim_cur) { prot = VM_PROT_ALL; size = limp->rlim_cur - alimp->rlim_cur; addr = USRSTACK - limp->rlim_cur; } else { prot = VM_PROT_NONE; size = alimp->rlim_cur - limp->rlim_cur; addr = USRSTACK - alimp->rlim_cur; } addr = trunc_page(addr); size = round_page(size); (void) vm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, prot, FALSE); } break; case RLIMIT_NOFILE: if (limp->rlim_cur > maxfilesperproc) limp->rlim_cur = maxfilesperproc; if (limp->rlim_max > maxfilesperproc) limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: if (limp->rlim_cur > maxprocperuid) limp->rlim_cur = maxprocperuid; if (limp->rlim_max > maxprocperuid) limp->rlim_max = maxprocperuid; break; } *alimp = *limp; return (0); } #ifndef _SYS_SYSPROTO_H_ struct __getrlimit_args { u_int which; struct rlimit *rlp; }; #endif /* ARGSUSED */ int getrlimit(p, uap) struct proc *p; register struct __getrlimit_args *uap; { if (uap->which >= RLIM_NLIMITS) return (EINVAL); return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, sizeof (struct rlimit))); } /* * Transform the running time and tick information in proc p into user, * system, and interrupt time usage. */ void calcru(p, up, sp, ip) struct proc *p; struct timeval *up; struct timeval *sp; struct timeval *ip; { /* {user, system, interrupt, total} {ticks, usec}; previous tu: */ u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu; int s; struct timeval tv; mtx_assert(&sched_lock, MA_OWNED); /* XXX: why spl-protect ? worst case is an off-by-one report */ s = splstatclock(); ut = p->p_uticks; st = p->p_sticks; it = p->p_iticks; splx(s); tt = ut + st + it; if (tt == 0) { st = 1; tt = 1; } tu = p->p_runtime; if (p == curproc) { /* * Adjust for the current time slice. This is actually fairly * important since the error here is on the order of a time * quantum, which is much greater than the sampling error. 
*/ microuptime(&tv); if (timevalcmp(&tv, PCPU_PTR(switchtime), <)) printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec), tv.tv_sec, tv.tv_usec); else tu += (tv.tv_usec - PCPU_GET(switchtime.tv_usec)) + (tv.tv_sec - PCPU_GET(switchtime.tv_sec)) * (int64_t)1000000; } ptu = p->p_uu + p->p_su + p->p_iu; if (tu < ptu || (int64_t)tu < 0) { /* XXX no %qd in kernel. Truncate. */ printf("calcru: negative time of %ld usec for pid %d (%s)\n", (long)tu, p->p_pid, p->p_comm); tu = ptu; } /* Subdivide tu. */ uu = (tu * ut) / tt; su = (tu * st) / tt; iu = tu - uu - su; /* Enforce monotonicity. */ if (uu < p->p_uu || su < p->p_su || iu < p->p_iu) { if (uu < p->p_uu) uu = p->p_uu; else if (uu + p->p_su + p->p_iu > tu) uu = tu - p->p_su - p->p_iu; if (st == 0) su = p->p_su; else { su = ((tu - uu) * st) / (st + it); if (su < p->p_su) su = p->p_su; else if (uu + su + p->p_iu > tu) su = tu - uu - p->p_iu; } KASSERT(uu + su + p->p_iu <= tu, ("calcru: monotonisation botch 1")); iu = tu - uu - su; KASSERT(iu >= p->p_iu, ("calcru: monotonisation botch 2")); } p->p_uu = uu; p->p_su = su; p->p_iu = iu; up->tv_sec = uu / 1000000; up->tv_usec = uu % 1000000; sp->tv_sec = su / 1000000; sp->tv_usec = su % 1000000; if (ip != NULL) { ip->tv_sec = iu / 1000000; ip->tv_usec = iu % 1000000; } } #ifndef _SYS_SYSPROTO_H_ struct getrusage_args { int who; struct rusage *rusage; }; #endif /* ARGSUSED */ int getrusage(p, uap) register struct proc *p; register struct getrusage_args *uap; { register struct rusage *rup; switch (uap->who) { case RUSAGE_SELF: rup = &p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); mtx_unlock_spin(&sched_lock); break; case RUSAGE_CHILDREN: rup = &p->p_stats->p_cru; break; default: return (EINVAL); } return (copyout((caddr_t)rup, (caddr_t)uap->rusage, sizeof (struct rusage))); } void ruadd(ru, ru2) register struct rusage *ru, *ru2; { register long *ip, *ip2; 
	register int i;

	/* Times add component-wise. */
	timevaladd(&ru->ru_utime, &ru2->ru_utime);
	timevaladd(&ru->ru_stime, &ru2->ru_stime);
	/* Maximum RSS is a high-water mark, not additive: keep the larger. */
	if (ru->ru_maxrss < ru2->ru_maxrss)
		ru->ru_maxrss = ru2->ru_maxrss;
	/*
	 * The remaining fields (ru_first .. ru_last) are treated as a flat
	 * array of longs and summed pairwise.
	 */
	ip = &ru->ru_first; ip2 = &ru2->ru_first;
	for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
		*ip++ += *ip2++;
}

/*
 * Make a copy of the plimit structure.
 * We share these structures copy-on-write after fork,
 * and copy when a limit is changed.
 */
struct plimit *
limcopy(lim)
	struct plimit *lim;
{
	register struct plimit *copy;

	MALLOC(copy, struct plimit *, sizeof(struct plimit),
	    M_SUBPROC, M_WAITOK);
	/*
	 * NOTE(review): the copy length is sizeof(struct plimit), not
	 * sizeof(copy->pl_rlimit); this presumably relies on pl_rlimit
	 * being the first member, and the trailing fields are overwritten
	 * immediately below — confirm against sys/resourcevar.h.
	 */
	bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit));
	copy->p_lflags = 0;
	/* A fresh copy starts out private with a single reference. */
	copy->p_refcnt = 1;
	return (copy);
}

/*
 * Find the uidinfo structure for a uid.  This structure is used to
 * track the total resource consumption (process count, socket buffer
 * size, etc.) for the uid and impose limits.
 */
void
uihashinit()
{

	/* One hash bucket per 16 allowed processes. */
	uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
	mtx_init(&uihashtbl_mtx, "uidinfo hash", MTX_DEF);
}

/*
 * lookup a uidinfo struct for the parameter uid.
 * uihashtbl_mtx must be locked.
 */
static struct uidinfo *
uilookup(uid)
	uid_t uid;
{
	struct uihashhead *uipp;
	struct uidinfo *uip;

	mtx_assert(&uihashtbl_mtx, MA_OWNED);
	uipp = UIHASH(uid);
	/* Returns NULL (loop ran off the list) when the uid is not hashed. */
	LIST_FOREACH(uip, uipp, ui_hash)
		if (uip->ui_uid == uid)
			break;

	return (uip);
}

/*
 * Create a uidinfo struct for the parameter uid.
 * uihashtbl_mtx must be locked.
 */
static struct uidinfo *
uicreate(uid)
	uid_t uid;
{
	struct uidinfo *uip;

	mtx_assert(&uihashtbl_mtx, MA_OWNED);
	/* M_ZERO leaves ui_ref, ui_proccnt and ui_sbsize all zero. */
	MALLOC(uip, struct uidinfo *, sizeof(*uip), M_UIDINFO,
	    M_WAITOK | M_ZERO);
	LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
	uip->ui_uid = uid;
	mtx_init(&uip->ui_mtx, "uidinfo struct", MTX_DEF);
	return (uip);
}

/*
 * Find or allocate a struct uidinfo for a particular uid.
 * Increase refcount on uidinfo struct returned.
 * uifree() should be called on a struct uidinfo when released.
 */
struct uidinfo *
uifind(uid)
	uid_t uid;
{
	struct uidinfo *uip;

	mtx_lock(&uihashtbl_mtx);
	uip = uilookup(uid);
	if (uip == NULL)
		/* Not present yet: allocate while still holding the hash lock. */
		uip = uicreate(uid);
	uihold(uip);
	mtx_unlock(&uihashtbl_mtx);
	return (uip);
}

/*
 * Place another refcount on a uidinfo struct.
 */
void
uihold(uip)
	struct uidinfo *uip;
{

	mtx_lock(&uip->ui_mtx);
	uip->ui_ref++;
	mtx_unlock(&uip->ui_mtx);
}

/*-
 * Since uidinfo structs have a long lifetime, we use an
 * opportunistic refcounting scheme to avoid locking the lookup hash
 * for each release.
 *
 * If the refcount hits 0, we need to free the structure,
 * which means we need to lock the hash.
 * Optimal case:
 *   After locking the struct and lowering the refcount, if we find
 *   that we don't need to free, simply unlock and return.
 * Suboptimal case:
 *   If refcount lowering results in need to free, bump the count
 *   back up, lose the lock and acquire the locks in the proper
 *   order to try again.
 */
void
uifree(uip)
	struct uidinfo *uip;
{

	/* Prepare for optimal case. */
	mtx_lock(&uip->ui_mtx);

	if (--uip->ui_ref != 0) {
		mtx_unlock(&uip->ui_mtx);
		return;
	}

	/* Prepare for suboptimal case. */
	uip->ui_ref++;
	mtx_unlock(&uip->ui_mtx);
	/* Hash lock before struct lock: the required lock order. */
	mtx_lock(&uihashtbl_mtx);
	mtx_lock(&uip->ui_mtx);

	/*
	 * We must subtract one from the count again because we backed out
	 * our initial subtraction before dropping the lock.
	 * Since another thread may have added a reference after we dropped the
	 * initial lock we have to test for zero again.
	 */
	if (--uip->ui_ref == 0) {
		LIST_REMOVE(uip, ui_hash);
		mtx_unlock(&uihashtbl_mtx);
		/* Non-zero counters here indicate leaked accounting. */
		if (uip->ui_sbsize != 0)
			/* XXX no %qd in kernel.  Truncate. */
			printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
			    uip->ui_uid, (long)uip->ui_sbsize);
		if (uip->ui_proccnt != 0)
			printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
			    uip->ui_uid, uip->ui_proccnt);
		mtx_destroy(&uip->ui_mtx);
		FREE(uip, M_UIDINFO);
		return;
	}

	mtx_unlock(&uihashtbl_mtx);
	mtx_unlock(&uip->ui_mtx);
}

/*
 * Change the count associated with number of processes
 * a given user is using.
 * When 'max' is 0, don't enforce a limit
 */
int
chgproccnt(uip, diff, max)
	struct uidinfo *uip;
	int diff;
	int max;
{

	mtx_lock(&uip->ui_mtx);
	/* don't allow them to exceed max, but allow subtraction */
	if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
		mtx_unlock(&uip->ui_mtx);
		return (0);	/* limit would be exceeded: refuse */
	}
	uip->ui_proccnt += diff;
	if (uip->ui_proccnt < 0)
		/* Accounting should never go negative; report the bug. */
		printf("negative proccnt for uid = %d\n", uip->ui_uid);
	mtx_unlock(&uip->ui_mtx);
	return (1);	/* change applied */
}

/*
 * Change the total socket buffer size a user has used.
 */
int
chgsbsize(uip, hiwat, to, max)
	struct uidinfo *uip;
	u_long *hiwat;
	u_long to;
	rlim_t max;
{
	rlim_t new;
	int s;

	s = splnet();
	mtx_lock(&uip->ui_mtx);
	/* New per-uid total if *hiwat were changed to 'to'. */
	new = uip->ui_sbsize + to - *hiwat;
	/* don't allow them to exceed max, but allow subtraction */
	if (to > *hiwat && new > max) {
		splx(s);
		mtx_unlock(&uip->ui_mtx);
		return (0);	/* limit would be exceeded: refuse */
	}
	uip->ui_sbsize = new;
	*hiwat = to;
	if (uip->ui_sbsize < 0)
		printf("negative sbsize for uid = %d\n", uip->ui_uid);
	splx(s);
	mtx_unlock(&uip->ui_mtx);
	return (1);	/* change applied */
}
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 72375)
+++ head/sys/kern/kern_sig.c	(revision 72376)
@@ -1,1856 +1,1856 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. 
*/ static int coredump __P((struct proc *)); static int do_sigaction __P((struct proc *p, int sig, struct sigaction *act, struct sigaction *oact, int old)); static int do_sigprocmask __P((struct proc *p, int how, sigset_t *set, sigset_t *oset, int old)); static char *expand_name __P((const char *, uid_t, pid_t)); static int killpg1 __P((struct proc *cp, int sig, int pgid, int all)); static int sig_ffs __P((sigset_t *set)); static int sigprop __P((int sig)); static void stop __P((struct proc *)); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); struct filterops sig_filtops = { 0, filt_sigattach, filt_sigdetach, filt_signal }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); /* * Can process p, with pcred pc, send the signal sig to process q? */ #define CANSIGNAL(p, q, sig) \ (!p_can(p, q, P_CAN_KILL, NULL) || \ ((sig) == SIGCONT && (q)->p_session == (p)->p_session)) /* * Policy -- Can real uid ruid with ucred uc send a signal to process q? */ #define CANSIGIO(ruid, uc, q) \ ((uc)->cr_uid == 0 || \ (ruid) == (q)->p_cred->p_ruid || \ (uc)->cr_uid == (q)->p_cred->p_ruid || \ (ruid) == (q)->p_ucred->cr_uid || \ (uc)->cr_uid == (q)->p_ucred->cr_uid) int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, "Enable coredumping set user/group ID processes"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); /* * Signal properties and actions. 
* The array below categorizes the signals and their default actions * according to the following properties: */ #define SA_KILL 0x01 /* terminates process by default */ #define SA_CORE 0x02 /* ditto and coredumps */ #define SA_STOP 0x04 /* suspend process */ #define SA_TTYSTOP 0x08 /* ditto, from tty */ #define SA_IGNORE 0x10 /* ignore by default */ #define SA_CONT 0x20 /* continue if suspended */ #define SA_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { SA_KILL, /* SIGHUP */ SA_KILL, /* SIGINT */ SA_KILL|SA_CORE, /* SIGQUIT */ SA_KILL|SA_CORE, /* SIGILL */ SA_KILL|SA_CORE, /* SIGTRAP */ SA_KILL|SA_CORE, /* SIGABRT */ SA_KILL|SA_CORE, /* SIGEMT */ SA_KILL|SA_CORE, /* SIGFPE */ SA_KILL, /* SIGKILL */ SA_KILL|SA_CORE, /* SIGBUS */ SA_KILL|SA_CORE, /* SIGSEGV */ SA_KILL|SA_CORE, /* SIGSYS */ SA_KILL, /* SIGPIPE */ SA_KILL, /* SIGALRM */ SA_KILL, /* SIGTERM */ SA_IGNORE, /* SIGURG */ SA_STOP, /* SIGSTOP */ SA_STOP|SA_TTYSTOP, /* SIGTSTP */ SA_IGNORE|SA_CONT, /* SIGCONT */ SA_IGNORE, /* SIGCHLD */ SA_STOP|SA_TTYSTOP, /* SIGTTIN */ SA_STOP|SA_TTYSTOP, /* SIGTTOU */ SA_IGNORE, /* SIGIO */ SA_KILL, /* SIGXCPU */ SA_KILL, /* SIGXFSZ */ SA_KILL, /* SIGVTALRM */ SA_KILL, /* SIGPROF */ SA_IGNORE, /* SIGWINCH */ SA_IGNORE, /* SIGINFO */ SA_KILL, /* SIGUSR1 */ SA_KILL, /* SIGUSR2 */ }; /* * Determine signal that should be delivered to process p, the current * process, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). * * MP SAFE. 
 */
int
CURSIG(struct proc *p)
{
	sigset_t tmpset;
	int r;

	/* Fast path: no signals pending at all. */
	if (SIGISEMPTY(p->p_siglist))
		return (0);
	tmpset = p->p_siglist;
	SIGSETNAND(tmpset, p->p_sigmask);
	/* Everything pending is blocked and we are not being traced. */
	if (SIGISEMPTY(tmpset) && (p->p_flag & P_TRACED) == 0)
		return (0);
	/* issignal() still needs Giant; take it only on the slow path. */
	mtx_lock(&Giant);
	r = issignal(p);
	mtx_unlock(&Giant);
	return (r);
}

/*
 * Return the property bits (SA_KILL, SA_CORE, ...) from sigproptbl
 * for a signal, or 0 for an out-of-range signal number.
 */
static __inline int
sigprop(int sig)
{

	if (sig > 0 && sig < NSIG)
		return (sigproptbl[_SIG_IDX(sig)]);
	return (0);
}

/*
 * Return the lowest signal number set in *set (ffs() is 1-based,
 * offset by 32 per word), or 0 if the set is empty.
 */
static __inline int
sig_ffs(sigset_t *set)
{
	int i;

	for (i = 0; i < _SIG_WORDS; i++)
		if (set->__bits[i])
			return (ffs(set->__bits[i]) + (i * 32));
	return (0);
}

/*
 * do_sigaction
 *	sigaction
 *	osigaction
 *
 * Common backend for the sigaction() family: report the old
 * disposition through 'oact' (if non-NULL) and install 'act'
 * (if non-NULL).  'old' selects osigset-compatible behaviour.
 */
static int
do_sigaction(p, sig, act, oact, old)
	struct proc *p;
	register int sig;
	struct sigaction *act, *oact;
	int old;
{
	register struct sigacts *ps = p->p_sigacts;

	if (sig <= 0 || sig > _SIG_MAXSIG)
		return (EINVAL);
	if (oact) {
		/* Reconstruct sa_flags from the per-signal bit sets. */
		oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
		oact->sa_flags = 0;
		if (SIGISMEMBER(ps->ps_sigonstack, sig))
			oact->sa_flags |= SA_ONSTACK;
		if (!SIGISMEMBER(ps->ps_sigintr, sig))
			oact->sa_flags |= SA_RESTART;
		if (SIGISMEMBER(ps->ps_sigreset, sig))
			oact->sa_flags |= SA_RESETHAND;
		if (SIGISMEMBER(ps->ps_signodefer, sig))
			oact->sa_flags |= SA_NODEFER;
		if (SIGISMEMBER(ps->ps_siginfo, sig))
			oact->sa_flags |= SA_SIGINFO;
		if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDSTOP)
			oact->sa_flags |= SA_NOCLDSTOP;
		if (sig == SIGCHLD && p->p_procsig->ps_flag & PS_NOCLDWAIT)
			oact->sa_flags |= SA_NOCLDWAIT;
	}
	if (act) {
		/* SIGKILL and SIGSTOP may never be caught or ignored. */
		if ((sig == SIGKILL || sig == SIGSTOP) &&
		    act->sa_handler != SIG_DFL)
			return (EINVAL);

		/*
		 * Change setting atomically.
*/ (void) splhigh(); ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (act->sa_flags & SA_SIGINFO) { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGDELSET(ps->ps_siginfo, sig); } if (!(act->sa_flags & SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (act->sa_flags & SA_ONSTACK) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (act->sa_flags & SA_RESETHAND) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (act->sa_flags & SA_NODEFER) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); #ifdef COMPAT_SUNOS if (act->sa_flags & SA_USERTRAMP) SIGADDSET(ps->ps_usertramp, sig); else SIGDELSET(ps->ps_usertramp, seg); #endif if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) p->p_procsig->ps_flag |= PS_NOCLDSTOP; else p->p_procsig->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; else p->p_procsig->ps_flag |= PS_NOCLDWAIT; } else p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; } /* * Set bit in p_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in p_sigignore, as we * have to restart the process. 
*/ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SA_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ SIGDELSET(p->p_siglist, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(p->p_sigignore, sig); SIGDELSET(p->p_sigcatch, sig); } else { SIGDELSET(p->p_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(p->p_sigcatch, sig); else SIGADDSET(p->p_sigcatch, sig); } if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || !old) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); (void) spl0(); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif /* ARGSUSED */ int sigaction(p, uap) struct proc *p; register struct sigaction_args *uap; { struct sigaction act, oact; register struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = do_sigaction(p, uap->sig, actp, oactp, 0); if (oactp && !error) { error = copyout(oactp, uap->oact, sizeof(oact)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif /* ARGSUSED */ int osigaction(p, uap) struct proc *p; register struct osigaction_args *uap; { struct osigaction sa; struct sigaction nsa, osa; register struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? 
&osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = do_sigaction(p, uap->signum, nsap, osap, 1); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(p) struct proc *p; { register int i; for (i = 1; i <= NSIG; i++) if (sigprop(i) & SA_IGNORE && i != SIGCONT) SIGADDSET(p->p_sigignore, i); } /* * Reset signals for an exec of the specified process. */ void execsigs(p) register struct proc *p; { register struct sigacts *ps = p->p_sigacts; register int sig; /* * Reset caught signals. Held signals remain held * through p_sigmask (unless they were caught, * and are now ignored by default). */ while (SIGNOTEMPTY(p->p_sigcatch)) { sig = sig_ffs(&p->p_sigcatch); SIGDELSET(p->p_sigcatch, sig); if (sigprop(sig) & SA_IGNORE) { if (sig != SIGCONT) SIGADDSET(p->p_sigignore, sig); SIGDELSET(p->p_siglist, sig); } ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ p->p_sigstk.ss_flags = SS_DISABLE; p->p_sigstk.ss_size = 0; p->p_sigstk.ss_sp = 0; /* * Reset no zombies if child dies flag as Solaris does. */ p->p_procsig->ps_flag &= ~PS_NOCLDWAIT; } /* * do_sigprocmask() - MP SAFE ONLY IF p == curproc * * Manipulate signal mask. This routine is MP SAFE *ONLY* if * p == curproc. Also remember that in order to remain MP SAFE * no spl*() calls may be made. 
 */
static int
do_sigprocmask(p, how, set, oset, old)
	struct proc *p;
	int how;
	sigset_t *set, *oset;
	int old;
{
	int error;

	/* Return the previous mask when the caller asked for it. */
	if (oset != NULL)
		*oset = p->p_sigmask;

	error = 0;
	if (set != NULL) {
		switch (how) {
		case SIG_BLOCK:
			/* SIG_CANTMASK strips the never-maskable signals. */
			SIG_CANTMASK(*set);
			SIGSETOR(p->p_sigmask, *set);
			break;
		case SIG_UNBLOCK:
			SIGSETNAND(p->p_sigmask, *set);
			break;
		case SIG_SETMASK:
			SIG_CANTMASK(*set);
			if (old)
				/*
				 * Old-style (osigset) callers: SIGSETLO
				 * presumably replaces only the low word
				 * of the mask — confirm in signalvar.h.
				 */
				SIGSETLO(p->p_sigmask, *set);
			else
				p->p_sigmask = *set;
			break;
		default:
			error = EINVAL;
			break;
		}
	}

	return (error);
}

/*
 * sigprocmask() - MP SAFE
 */
#ifndef _SYS_SYSPROTO_H_
struct sigprocmask_args {
	int	how;
	const sigset_t	*set;
	sigset_t	*oset;
};
#endif
int
sigprocmask(p, uap)
	register struct proc *p;
	struct sigprocmask_args *uap;
{
	sigset_t set, oset;
	sigset_t *setp, *osetp;
	int error;

	/* Either pointer may be NULL; copy in/out only what was supplied. */
	setp = (uap->set != NULL) ? &set : NULL;
	osetp = (uap->oset != NULL) ? &oset : NULL;
	if (setp) {
		error = copyin(uap->set, setp, sizeof(set));
		if (error)
			return (error);
	}
	error = do_sigprocmask(p, uap->how, setp, osetp, 0);
	if (osetp && !error) {
		error = copyout(osetp, uap->oset, sizeof(oset));
	}
	return (error);
}

#ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
/*
 * osigprocmask() - MP SAFE
 */
#ifndef _SYS_SYSPROTO_H_
struct osigprocmask_args {
	int	how;
	osigset_t mask;
};
#endif
int
osigprocmask(p, uap)
	register struct proc *p;
	struct osigprocmask_args *uap;
{
	sigset_t set, oset;
	int error;

	/* Old ABI passes the mask by value; widen, apply, narrow back. */
	OSIG2SIG(uap->mask, set);
	error = do_sigprocmask(p, uap->how, &set, &oset, 1);
	SIG2OSIG(oset, p->p_retval[0]);
	return (error);
}
#endif /* COMPAT_43 */

#ifndef _SYS_SYSPROTO_H_
struct sigpending_args {
	sigset_t	*set;
};
#endif
/* ARGSUSED */
int
sigpending(p, uap)
	struct proc *p;
	struct sigpending_args *uap;
{

	/* Copy the pending-signal set straight out to the caller. */
	return (copyout(&p->p_siglist, uap->set, sizeof(sigset_t)));
}

#ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigpending_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
osigpending(p, uap)
	struct proc *p;
	struct osigpending_args *uap;
{

	/* Old ABI returns the (truncated) pending set in the return value. */
	SIG2OSIG(p->p_siglist, p->p_retval[0]);
	return (0);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
/*
 * Generalized interface signal handler, 4.3-compatible.
 */
#ifndef _SYS_SYSPROTO_H_
struct osigvec_args {
	int	signum;
	struct sigvec	*nsv;
	struct sigvec	*osv;
};
#endif
/* ARGSUSED */
/*
 * osigvec: translate struct sigvec to/from struct sigaction and let
 * do_sigaction() do the real work.  SV_INTERRUPT has the opposite sense
 * of SA_RESTART, hence the XOR of SA_RESTART on both directions.
 */
int
osigvec(p, uap)
	struct proc *p;
	register struct osigvec_args *uap;
{
	struct sigvec vec;
	struct sigaction nsa, osa;
	register struct sigaction *nsap, *osap;
	int error;

	if (uap->signum <= 0 || uap->signum >= ONSIG)
		return (EINVAL);
	nsap = (uap->nsv != NULL) ? &nsa : NULL;
	osap = (uap->osv != NULL) ? &osa : NULL;
	if (nsap) {
		error = copyin(uap->nsv, &vec, sizeof(vec));
		if (error)
			return (error);
		nsap->sa_handler = vec.sv_handler;
		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
		nsap->sa_flags = vec.sv_flags;
		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
#ifdef COMPAT_SUNOS
		nsap->sa_flags |= SA_USERTRAMP;
#endif
	}
	error = do_sigaction(p, uap->signum, nsap, osap, 1);
	if (osap && !error) {
		vec.sv_handler = osap->sa_handler;
		SIG2OSIG(osap->sa_mask, vec.sv_mask);
		vec.sv_flags = osap->sa_flags;
		vec.sv_flags &= ~SA_NOCLDWAIT;
		vec.sv_flags ^= SA_RESTART;
#ifdef COMPAT_SUNOS
		vec.sv_flags &= ~SA_NOCLDSTOP;
#endif
		error = copyout(&vec, uap->osv, sizeof(vec));
	}
	return (error);
}

#ifndef _SYS_SYSPROTO_H_
struct osigblock_args {
	int	mask;
};
#endif
/*
 * osigblock: old-style sigblock; OR 'mask' into the process signal mask
 * and return the previous (old-style) mask as the syscall result.
 */
int
osigblock(p, uap)
	register struct proc *p;
	struct osigblock_args *uap;
{
	sigset_t set;

	OSIG2SIG(uap->mask, set);
	SIG_CANTMASK(set);
	(void) splhigh();
	SIG2OSIG(p->p_sigmask, p->p_retval[0]);
	SIGSETOR(p->p_sigmask, set);
	(void) spl0();
	return (0);
}

#ifndef _SYS_SYSPROTO_H_
struct osigsetmask_args {
	int	mask;
};
#endif
/*
 * osigsetmask: old-style sigsetmask; replace the low part of the signal
 * mask and return the previous (old-style) mask as the syscall result.
 */
int
osigsetmask(p, uap)
	struct proc *p;
	struct osigsetmask_args *uap;
{
	sigset_t set;

	OSIG2SIG(uap->mask, set);
	SIG_CANTMASK(set);
	(void) splhigh();
	SIG2OSIG(p->p_sigmask, p->p_retval[0]);
	SIGSETLO(p->p_sigmask, set);
	(void) spl0();
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * Suspend process until signal, providing mask to be set
 * in the
 meantime.  Note nonstandard calling convention:
 * libc stub passes mask, not pointer, to save a copyin.
 */
#ifndef _SYS_SYSPROTO_H_
struct sigsuspend_args {
	const sigset_t	*sigmask;
};
#endif
/* ARGSUSED */
/*
 * sigsuspend: install a temporary signal mask and sleep until a signal
 * arrives; P_OLDMASK tells signal delivery to restore the saved mask
 * after the handler runs.
 */
int
sigsuspend(p, uap)
	register struct proc *p;
	struct sigsuspend_args *uap;
{
	sigset_t mask;
	register struct sigacts *ps = p->p_sigacts;
	int error;

	error = copyin(uap->sigmask, &mask, sizeof(mask));
	if (error)
		return (error);

	/*
	 * When returning from sigsuspend, we want
	 * the old mask to be restored after the
	 * signal handler has finished.  Thus, we
	 * save it here and mark the sigacts structure
	 * to indicate this.
	 */
	p->p_oldsigmask = p->p_sigmask;
	p->p_flag |= P_OLDMASK;
	SIG_CANTMASK(mask);
	p->p_sigmask = mask;
	while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0)
		/* void */;
	/* always return EINTR rather than ERESTART... */
	return (EINTR);
}

#ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigsuspend_args {
	osigset_t	mask;
};
#endif
/* ARGSUSED */
/*
 * osigsuspend: old-style sigsuspend; mask passed by value, and only the
 * low part of the process mask is replaced (SIGSETLO).
 */
int
osigsuspend(p, uap)
	register struct proc *p;
	struct osigsuspend_args *uap;
{
	sigset_t mask;
	register struct sigacts *ps = p->p_sigacts;

	p->p_oldsigmask = p->p_sigmask;
	p->p_flag |= P_OLDMASK;
	OSIG2SIG(uap->mask, mask);
	SIG_CANTMASK(mask);
	SIGSETLO(p->p_sigmask, mask);
	while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "opause", 0) == 0)
		/* void */;
	/* always return EINTR rather than ERESTART...
 */
	return (EINTR);
}
#endif /* COMPAT_43 */

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct osigstack_args {
	struct sigstack	*nss;
	struct sigstack	*oss;
};
#endif
/* ARGSUSED */
/*
 * osigstack: old-style alternate signal stack; get and/or set the
 * per-process sigstack from a struct sigstack.
 */
int
osigstack(p, uap)
	struct proc *p;
	register struct osigstack_args *uap;
{
	struct sigstack ss;
	int error;

	if (uap->oss != NULL) {
		ss.ss_sp = p->p_sigstk.ss_sp;
		ss.ss_onstack = sigonstack(cpu_getstack(p));
		error = copyout(&ss, uap->oss, sizeof(struct sigstack));
		if (error)
			return (error);
	}

	if (uap->nss != NULL) {
		if ((error = copyin(uap->nss, &ss, sizeof(ss))) != 0)
			return (error);
		p->p_sigstk.ss_sp = ss.ss_sp;
		/* Old interface carries no size information. */
		p->p_sigstk.ss_size = 0;
		p->p_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK;
		p->p_flag |= P_ALTSTACK;
	}
	return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

#ifndef _SYS_SYSPROTO_H_
struct sigaltstack_args {
	stack_t	*ss;
	stack_t	*oss;
};
#endif
/* ARGSUSED */
/*
 * sigaltstack: get and/or set the alternate signal stack.  Changing the
 * stack while currently running on it is refused with EPERM; a new
 * enabled stack must be at least sv_minsigstksz bytes.
 */
int
sigaltstack(p, uap)
	struct proc *p;
	register struct sigaltstack_args *uap;
{
	stack_t ss;
	int error, oonstack;

	oonstack = sigonstack(cpu_getstack(p));

	if (uap->oss != NULL) {
		ss = p->p_sigstk;
		ss.ss_flags = (p->p_flag & P_ALTSTACK)
		    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
		if ((error = copyout(&ss, uap->oss, sizeof(stack_t))) != 0)
			return (error);
	}

	if (uap->ss != NULL) {
		if (oonstack)
			return (EPERM);
		if ((error = copyin(uap->ss, &ss, sizeof(ss))) != 0)
			return (error);
		/* SS_DISABLE is the only flag callers may set. */
		if ((ss.ss_flags & ~SS_DISABLE) != 0)
			return (EINVAL);
		if (!(ss.ss_flags & SS_DISABLE)) {
			if (ss.ss_size < p->p_sysent->sv_minsigstksz)
				return (ENOMEM);
			p->p_sigstk = ss;
			p->p_flag |= P_ALTSTACK;
		} else
			p->p_flag &= ~P_ALTSTACK;
	}
	return (0);
}

/*
 * Common code for kill process group/broadcast kill.
 * cp is calling process.
 */
/*
 * killpg1: deliver 'sig' to every eligible process, either system-wide
 * ('all' nonzero, skipping system processes and the caller) or within
 * process group 'pgid' (0 meaning the caller's own group).  A sig of 0
 * performs only the permission/existence check.  Returns ESRCH when no
 * eligible target was found.
 */
int
killpg1(cp, sig, pgid, all)
	register struct proc *cp;
	int sig, pgid, all;
{
	register struct proc *p;
	struct pgrp *pgrp;
	int nfound = 0;

	if (all) {
		/*
		 * broadcast
		 */
		ALLPROC_LOCK(AP_SHARED);
		LIST_FOREACH(p, &allproc, p_list) {
			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
			    p == cp || !CANSIGNAL(cp, p, sig))
				continue;
			nfound++;
			if (sig)
				psignal(p, sig);
		}
		ALLPROC_LOCK(AP_RELEASE);
	} else {
		if (pgid == 0)
			/*
			 * zero pgid means send to my process group.
			 */
			pgrp = cp->p_pgrp;
		else {
			pgrp = pgfind(pgid);
			if (pgrp == NULL)
				return (ESRCH);
		}
		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
			    p->p_stat == SZOMB ||
			    !CANSIGNAL(cp, p, sig))
				continue;
			nfound++;
			if (sig)
				psignal(p, sig);
		}
	}
	return (nfound ? 0 : ESRCH);
}

#ifndef _SYS_SYSPROTO_H_
struct kill_args {
	int	pid;
	int	signum;
};
#endif
/* ARGSUSED */
/*
 * kill system call: signal a single process (pid > 0), or dispatch to
 * killpg1() for broadcast (pid == -1), the caller's own process group
 * (pid == 0), or an explicit group (pid < -1, group -pid).
 */
int
kill(cp, uap)
	register struct proc *cp;
	register struct kill_args *uap;
{
	register struct proc *p;

	if ((u_int)uap->signum > _SIG_MAXSIG)
		return (EINVAL);
	if (uap->pid > 0) {
		/* kill single process */
		if ((p = pfind(uap->pid)) == NULL)
			return (ESRCH);
		if (!CANSIGNAL(cp, p, uap->signum))
			return (EPERM);
		if (uap->signum)
			psignal(p, uap->signum);
		return (0);
	}
	switch (uap->pid) {
	case -1:		/* broadcast signal */
		return (killpg1(cp, uap->signum, 0, 1));
	case 0:			/* signal own process group */
		return (killpg1(cp, uap->signum, 0, 0));
	default:		/* negative explicit process group */
		return (killpg1(cp, uap->signum, -uap->pid, 0));
	}
	/* NOTREACHED */
}

#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#ifndef _SYS_SYSPROTO_H_
struct okillpg_args {
	int	pgid;
	int	signum;
};
#endif
/* ARGSUSED */
/*
 * okillpg: old-style killpg; signal the given process group.
 */
int
okillpg(p, uap)
	struct proc *p;
	register struct okillpg_args *uap;
{

	if ((u_int)uap->signum > _SIG_MAXSIG)
		return (EINVAL);
	return (killpg1(p, uap->signum, uap->pgid, 0));
}
#endif /* COMPAT_43 || COMPAT_SUNOS */

/*
 * Send a signal to a process group.
 */
/*
 * gsignal: look up process group 'pgid' and signal all of its members;
 * silently does nothing for pgid 0 or an unknown group.
 */
void
gsignal(pgid, sig)
	int pgid, sig;
{
	struct pgrp *pgrp;

	if (pgid && (pgrp = pgfind(pgid)))
		pgsignal(pgrp, sig, 0);
}

/*
 * Send a signal to a process group.  If checktty is 1,
 * limit to members which have a controlling terminal.
 */
void
pgsignal(pgrp, sig, checkctty)
	struct pgrp *pgrp;
	int sig, checkctty;
{
	register struct proc *p;

	if (pgrp)
		LIST_FOREACH(p, &pgrp->pg_members, p_pglist)
			if (checkctty == 0 || p->p_flag & P_CONTROLT)
				psignal(p, sig);
}

/*
 * Send a signal caused by a trap to the current process.
 * If it will be caught immediately, deliver it with correct code.
 * Otherwise, post it normally.
 */
void
trapsignal(p, sig, code)
	struct proc *p;
	register int sig;
	u_long code;
{
	register struct sigacts *ps = p->p_sigacts;

	/*
	 * Fast path: untraced, caught, and unblocked -- deliver directly
	 * via the ABI's sendsig and update the mask/handler state inline.
	 */
	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(p->p_sigcatch, sig) &&
	    !SIGISMEMBER(p->p_sigmask, sig)) {
		p->p_stats->p_ru.ru_nsignals++;
#ifdef KTRACE
		if (KTRPOINT(p, KTR_PSIG))
			ktrpsig(p->p_tracep, sig,
			    ps->ps_sigact[_SIG_IDX(sig)], &p->p_sigmask,
			    code);
#endif
		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], sig,
		    &p->p_sigmask, code);
		SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
		if (!SIGISMEMBER(ps->ps_signodefer, sig))
			SIGADDSET(p->p_sigmask, sig);
		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
			/*
			 * See do_sigaction() for origin of this code.
			 */
			SIGDELSET(p->p_sigcatch, sig);
			if (sig != SIGCONT &&
			    sigprop(sig) & SA_IGNORE)
				SIGADDSET(p->p_sigignore, sig);
			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
		}
	} else {
		p->p_code = code;	/* XXX for core dump/debugger */
		p->p_sig = sig;		/* XXX to verify code */
		psignal(p, sig);
	}
}

/*
 * Send the signal to the process.  If the signal has an action, the action
 * is usually performed by the target process rather than the caller; we add
 * the signal to the set of pending signals for the process.
 *
 * Exceptions:
 *   o When a stop signal is sent to a sleeping process that takes the
 *     default action, the process is stopped without awakening it.
 *   o SIGCONT restarts stopped processes (or puts them back to sleep)
 *     regardless of the signal action (eg, blocked or ignored).
 *
 * Other ignored signals are discarded immediately.
 */
void
psignal(p, sig)
	register struct proc *p;
	register int sig;
{
	register int prop;
	register sig_t action;

	if (sig > _SIG_MAXSIG || sig <= 0) {
		printf("psignal: signal %d\n", sig);
		panic("psignal signal number");
	}

	PROC_LOCK(p);
	KNOTE(&p->p_klist, NOTE_SIGNAL | sig);

	prop = sigprop(sig);
	/*
	 * If proc is traced, always give parent a chance;
	 * if signal event is tracked by procfs, give *that*
	 * a chance, as well.
	 */
	if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG))
		action = SIG_DFL;
	else {
		/*
		 * If the signal is being ignored,
		 * then we forget about it immediately.
		 * (Note: we don't set SIGCONT in p_sigignore,
		 * and if it is set to SIG_IGN,
		 * action will be SIG_DFL here.)
		 */
		if (SIGISMEMBER(p->p_sigignore, sig) || (p->p_flag & P_WEXIT)) {
			PROC_UNLOCK(p);
			return;
		}
		if (SIGISMEMBER(p->p_sigmask, sig))
			action = SIG_HOLD;
		else if (SIGISMEMBER(p->p_sigcatch, sig))
			action = SIG_CATCH;
		else
			action = SIG_DFL;
	}

	/* Undo positive niceness so default-fatal signals aren't delayed. */
	mtx_lock_spin(&sched_lock);
	if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) &&
	    (p->p_flag & P_TRACED) == 0)
		p->p_nice = NZERO;
	mtx_unlock_spin(&sched_lock);

	/* A continue signal cancels pending stops, and vice versa. */
	if (prop & SA_CONT)
		SIG_STOPSIGMASK(p->p_siglist);

	if (prop & SA_STOP) {
		/*
		 * If sending a tty stop signal to a member of an orphaned
		 * process group, discard the signal here if the action
		 * is default; don't stop the process below if sleeping,
		 * and don't clear any pending SIGCONT.
		 */
		if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
		    action == SIG_DFL) {
			PROC_UNLOCK(p);
			return;
		}
		SIG_CONTSIGMASK(p->p_siglist);
	}
	SIGADDSET(p->p_siglist, sig);

	/*
	 * Defer further processing for signals which are held,
	 * except that stopped processes must be continued by SIGCONT.
	 */
	mtx_lock_spin(&sched_lock);
	if (action == SIG_HOLD && (!(prop & SA_CONT) || p->p_stat != SSTOP)) {
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(p);
		return;
	}

	switch (p->p_stat) {

	case SSLEEP:
		/*
		 * If process is sleeping uninterruptibly
		 * we can't interrupt the sleep... the signal will
		 * be noticed when the process returns through
		 * trap() or syscall().
		 */
		if ((p->p_sflag & PS_SINTR) == 0) {
			mtx_unlock_spin(&sched_lock);
			goto out;
		}
		/*
		 * Process is sleeping and traced... make it runnable
		 * so it can discover the signal in issignal() and stop
		 * for the parent.
		 */
		if (p->p_flag & P_TRACED)
			goto run;
		mtx_unlock_spin(&sched_lock);
		/*
		 * If SIGCONT is default (or ignored) and process is
		 * asleep, we are finished; the process should not
		 * be awakened.
		 */
		if ((prop & SA_CONT) && action == SIG_DFL) {
			SIGDELSET(p->p_siglist, sig);
			goto out;
		}
		/*
		 * When a sleeping process receives a stop
		 * signal, process immediately if possible.
		 * All other (caught or default) signals
		 * cause the process to run.
		 */
		if (prop & SA_STOP) {
			if (action != SIG_DFL)
				goto runfast;
			/*
			 * If a child holding parent blocked,
			 * stopping could cause deadlock.
			 */
			if (p->p_flag & P_PPWAIT)
				goto out;
			SIGDELSET(p->p_siglist, sig);
			p->p_xstat = sig;
			PROC_UNLOCK(p);
			PROCTREE_LOCK(PT_SHARED);
			if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
				psignal(p->p_pptr, SIGCHLD);
			stop(p);
			PROCTREE_LOCK(PT_RELEASE);
			PROC_LOCK(p);
			goto out;
		} else
			goto runfast;
		/* NOTREACHED */

	case SSTOP:
		mtx_unlock_spin(&sched_lock);
		/*
		 * If traced process is already stopped,
		 * then no further action is necessary.
		 */
		if (p->p_flag & P_TRACED)
			goto out;

		/*
		 * Kill signal always sets processes running.
		 */
		if (sig == SIGKILL)
			goto runfast;

		if (prop & SA_CONT) {
			/*
			 * If SIGCONT is default (or ignored), we continue the
			 * process but don't leave the signal in p_siglist, as
			 * it has no further action.  If SIGCONT is held, we
			 * continue the process and leave the signal in
			 * p_siglist.  If the process catches SIGCONT, let it
			 * handle the signal itself.  If it isn't waiting on
			 * an event, then it goes back to run state.
			 * Otherwise, process goes back to sleep state.
			 */
			if (action == SIG_DFL)
				SIGDELSET(p->p_siglist, sig);
			if (action == SIG_CATCH)
				goto runfast;
			mtx_lock_spin(&sched_lock);
			if (p->p_wchan == NULL)
				goto run;
			p->p_stat = SSLEEP;
			mtx_unlock_spin(&sched_lock);
			goto out;
		}

		if (prop & SA_STOP) {
			/*
			 * Already stopped, don't need to stop again.
			 * (If we did the shell could get confused.)
			 */
			SIGDELSET(p->p_siglist, sig);
			goto out;
		}

		/*
		 * If process is sleeping interruptibly, then simulate a
		 * wakeup so that when it is continued, it will be made
		 * runnable and can look at the signal.  But don't make
		 * the process runnable, leave it stopped.
		 */
		mtx_lock_spin(&sched_lock);
		if (p->p_wchan && p->p_sflag & PS_SINTR) {
			if (p->p_sflag & PS_CVWAITQ)
				cv_waitq_remove(p);
			else
				unsleep(p);
		}
		mtx_unlock_spin(&sched_lock);
		goto out;

	default:
		/*
		 * SRUN, SIDL, SZOMB do nothing with the signal,
		 * other than kicking ourselves if we are running.
		 * It will either never be noticed, or noticed very soon.
		 */
		if (p == curproc) {
			signotify(p);
			mtx_unlock_spin(&sched_lock);
		}
#ifdef SMP
		else if (p->p_stat == SRUN) {
			mtx_unlock_spin(&sched_lock);
			forward_signal(p);
		}
#endif
		else
			mtx_unlock_spin(&sched_lock);
		goto out;
	}
	/*NOTREACHED*/

runfast:
	/*
	 * Raise priority to at least PUSER.
	 */
	mtx_lock_spin(&sched_lock);
-	if (p->p_priority > PUSER)
-		p->p_priority = PUSER;
+	if (p->p_pri.pri_level > PUSER)
+		p->p_pri.pri_level = PUSER;
run:
	/* If we jump here, sched_lock has to be owned. */
	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
	setrunnable(p);
	mtx_unlock_spin(&sched_lock);
out:
	/* If we jump here, sched_lock should not be owned. */
	mtx_assert(&sched_lock, MA_NOTOWNED);
	PROC_UNLOCK(p);
}

/*
 * If the current process has received a signal (should be caught or cause
 * termination, should interrupt current syscall), return the signal number.
 * Stop signals with default action are processed immediately, then cleared;
 * they aren't returned.  This is checked after each entry to the system for
 * a syscall or trap (though this can usually be done without calling issignal
 * by checking the pending signal masks in the CURSIG macro.)  The normal call
 * sequence is
 *
 *	while (sig = CURSIG(curproc))
 *		postsig(sig);
 */
int
issignal(p)
	register struct proc *p;
{
	sigset_t mask;
	register int sig, prop;

	for (;;) {
		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);

		/* Pending and not blocked; stops are deferred during PPWAIT. */
		mask = p->p_siglist;
		SIGSETNAND(mask, p->p_sigmask);
		if (p->p_flag & P_PPWAIT)
			SIG_STOPSIGMASK(mask);
		if (!SIGNOTEMPTY(mask))		/* no signal to send */
			return (0);
		sig = sig_ffs(&mask);
		prop = sigprop(sig);

		STOPEVENT(p, S_SIG, sig);

		/*
		 * We should see pending but ignored signals
		 * only if P_TRACED was on when they were posted.
		 */
		if (SIGISMEMBER(p->p_sigignore, sig) && (traced == 0)) {
			SIGDELSET(p->p_siglist, sig);
			continue;
		}
		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
			/*
			 * If traced, always stop, and stay
			 * stopped until released by the parent.
			 */
			p->p_xstat = sig;
			PROCTREE_LOCK(PT_SHARED);
			psignal(p->p_pptr, SIGCHLD);
			do {
				stop(p);
				PROCTREE_LOCK(PT_RELEASE);
				mtx_lock_spin(&sched_lock);
				DROP_GIANT_NOSWITCH();
				mi_switch();
				mtx_unlock_spin(&sched_lock);
				PICKUP_GIANT();
				PROCTREE_LOCK(PT_SHARED);
			} while (!trace_req(p)
				 && p->p_flag & P_TRACED);
			PROCTREE_LOCK(PT_RELEASE);

			/*
			 * If the traced bit got turned off, go back up
			 * to the top to rescan signals.  This ensures
			 * that p_sig* and ps_sigact are consistent.
			 */
			if ((p->p_flag & P_TRACED) == 0)
				continue;

			/*
			 * If parent wants us to take the signal,
			 * then it will leave it in p->p_xstat;
			 * otherwise we just look for signals again.
			 */
			SIGDELSET(p->p_siglist, sig);	/* clear old signal */
			sig = p->p_xstat;
			if (sig == 0)
				continue;

			/*
			 * Put the new signal into p_siglist.  If the
			 * signal is being masked, look for other signals.
			 */
			SIGADDSET(p->p_siglist, sig);
			if (SIGISMEMBER(p->p_sigmask, sig))
				continue;
		}

		/*
		 * Decide whether the signal should be returned.
		 * Return the signal's number, or fall through
		 * to clear it from the pending mask.
		 */
		switch ((int)(intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {

		case (int)SIG_DFL:
			/*
			 * Don't take default actions on system processes.
			 */
			if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
				/*
				 * Are you sure you want to ignore SIGSEGV
				 * in init? XXX
				 */
				printf("Process (pid %lu) got signal %d\n",
				    (u_long)p->p_pid, sig);
#endif
				break;		/* == ignore */
			}
			/*
			 * If there is a pending stop signal to process
			 * with default action, stop here,
			 * then clear the signal.  However,
			 * if process is member of an orphaned
			 * process group, ignore tty stop signals.
			 */
			if (prop & SA_STOP) {
				if (p->p_flag & P_TRACED ||
				    (p->p_pgrp->pg_jobc == 0 &&
				    prop & SA_TTYSTOP))
					break;	/* == ignore */
				p->p_xstat = sig;
				PROCTREE_LOCK(PT_SHARED);
				stop(p);
				if ((p->p_pptr->p_procsig->ps_flag & PS_NOCLDSTOP) == 0)
					psignal(p->p_pptr, SIGCHLD);
				PROCTREE_LOCK(PT_RELEASE);
				mtx_lock_spin(&sched_lock);
				DROP_GIANT_NOSWITCH();
				mi_switch();
				mtx_unlock_spin(&sched_lock);
				PICKUP_GIANT();
				break;
			} else if (prop & SA_IGNORE) {
				/*
				 * Except for SIGCONT, shouldn't get here.
				 * Default action is to ignore; drop it.
				 */
				break;		/* == ignore */
			} else
				return (sig);
			/*NOTREACHED*/

		case (int)SIG_IGN:
			/*
			 * Masking above should prevent us ever trying
			 * to take action on an ignored signal other
			 * than SIGCONT, unless process is traced.
			 */
			if ((prop & SA_CONT) == 0 &&
			    (p->p_flag & P_TRACED) == 0)
				printf("issignal\n");
			break;		/* == ignore */

		default:
			/*
			 * This signal has an action, let
			 * postsig() process it.
			 */
			return (sig);
		}
		SIGDELSET(p->p_siglist, sig);		/* take the signal! */
	}
	/* NOTREACHED */
}

/*
 * Put the argument process into the stopped state and notify the parent
 * via wakeup.  Signals are handled elsewhere.  The process must not be
 * on the run queue.  Must be called with at least a shared hold of the
 * proctree lock.
 */
void
stop(p)
	register struct proc *p;
{

	PROCTREE_ASSERT(PT_SHARED);
	mtx_lock_spin(&sched_lock);
	p->p_stat = SSTOP;
	p->p_flag &= ~P_WAITED;
	/* Wake a parent sleeping in wait(). */
	wakeup((caddr_t)p->p_pptr);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Take the action for the specified signal
 * from the current set of pending signals.
 */
void
postsig(sig)
	register int sig;
{
	register struct proc *p = curproc;
	struct sigacts *ps = p->p_sigacts;
	sig_t action;
	sigset_t returnmask;
	int code;

	KASSERT(sig != 0, ("postsig"));

	SIGDELSET(p->p_siglist, sig);
	action = ps->ps_sigact[_SIG_IDX(sig)];
#ifdef KTRACE
	if (KTRPOINT(p, KTR_PSIG))
		ktrpsig(p->p_tracep, sig, action, p->p_flag & P_OLDMASK ?
		    &p->p_oldsigmask : &p->p_sigmask, 0);
#endif
	STOPEVENT(p, S_SIG, sig);

	if (action == SIG_DFL) {
		/*
		 * Default action, where the default is to kill
		 * the process.  (Other cases were ignored above.)
		 */
		sigexit(p, sig);
		/* NOTREACHED */
	} else {
		/*
		 * If we get here, the signal must be caught.
		 */
		KASSERT(action != SIG_IGN && !SIGISMEMBER(p->p_sigmask, sig),
		    ("postsig action"));
		/*
		 * Set the new mask value and also defer further
		 * occurrences of this signal.
		 *
		 * Special case: user has done a sigsuspend.  Here the
		 * current mask is not of interest, but rather the
		 * mask from before the sigsuspend is what we want
		 * restored after the signal processing is completed.
		 */
		(void) splhigh();
		if (p->p_flag & P_OLDMASK) {
			returnmask = p->p_oldsigmask;
			p->p_flag &= ~P_OLDMASK;
		} else
			returnmask = p->p_sigmask;

		SIGSETOR(p->p_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
		if (!SIGISMEMBER(ps->ps_signodefer, sig))
			SIGADDSET(p->p_sigmask, sig);

		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
			/*
			 * See do_sigaction() for origin of this code.
			 */
			SIGDELSET(p->p_sigcatch, sig);
			if (sig != SIGCONT &&
			    sigprop(sig) & SA_IGNORE)
				SIGADDSET(p->p_sigignore, sig);
			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
		}
		(void) spl0();
		p->p_stats->p_ru.ru_nsignals++;
		/* p_sig/p_code carry trap info only for the matching signal. */
		if (p->p_sig != sig) {
			code = 0;
		} else {
			code = p->p_code;
			p->p_code = 0;
			p->p_sig = 0;
		}
		(*p->p_sysent->sv_sendsig)(action, sig, &returnmask, code);
	}
}

/*
 * Kill the current process for stated reason.
 */
void
killproc(p, why)
	struct proc *p;
	char *why;
{

	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
		p, p->p_pid, p->p_comm);
	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
		p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why);
	psignal(p, SIGKILL);
}

/*
 * Force the current process to exit with the specified signal, dumping core
 * if appropriate.  We bypass the normal tests for masked and caught signals,
 * allowing unrecoverable failures to terminate the process without changing
 * signal state.  Mark the accounting record with the signal termination.
 * If dumping core, save the signal number for the debugger.  Calls exit and
 * does not return.
 */
void
sigexit(p, sig)
	register struct proc *p;
	int sig;
{

	p->p_acflag |= AXSIG;
	if (sigprop(sig) & SA_CORE) {
		p->p_sig = sig;
		/*
		 * Log signals which would cause core dumps
		 * (Log as LOG_INFO to appease those who don't want
		 * these messages.)
		 * XXX : Todo, as well as euid, write out ruid too
		 */
		if (coredump(p) == 0)
			sig |= WCOREFLAG;
		if (kern_logsigexit)
			log(LOG_INFO,
			    "pid %d (%s), uid %d: exited on signal %d%s\n",
			    p->p_pid, p->p_comm,
			    p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1,
			    sig &~ WCOREFLAG,
			    sig & WCOREFLAG ? " (core dumped)" : "");
	}
	exit1(p, W_EXITCODE(0, sig));
	/* NOTREACHED */
}

/* Format template for core file names; tunable via kern.corefile. */
static char corefilename[MAXPATHLEN+1] = {"%N.core"};
SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
	sizeof(corefilename), "process corefile name format string");

/*
 * expand_name(name, uid, pid)
 * Expand the name described in corefilename, using name, uid, and pid.
 * corefilename is a printf-like string, with three format specifiers:
 *	%N	name of process ("name")
 *	%P	process id (pid)
 *	%U	user id (uid)
 * For example, "%N.core" is the default; they can be disabled completely
 * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
 * This is controlled by the sysctl variable kern.corefile (see above).
 *
 * Returns a malloc'd (M_TEMP) path of at most MAXPATHLEN characters that
 * the caller must free, or NULL on allocation failure or overlong result.
 */
static char *
expand_name(name, uid, pid)
	const char *name;
	uid_t uid;
	pid_t pid;
{
	char *temp;
	char buf[11];		/* Buffer for pid/uid -- max 4B */
	int i, n;
	char *format = corefilename;
	size_t namelen;

	temp = malloc(MAXPATHLEN + 1, M_TEMP, M_NOWAIT);
	if (temp == NULL)
		return NULL;
	namelen = strlen(name);
	/* i walks the format, n tracks the output length. */
	for (i = 0, n = 0; n < MAXPATHLEN && format[i]; i++) {
		int l;

		switch (format[i]) {
		case '%':	/* Format character */
			i++;
			switch (format[i]) {
			case '%':
				temp[n++] = '%';
				break;
			case 'N':	/* process name */
				if ((n + namelen) > MAXPATHLEN) {
					log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
					    pid, name, uid, temp, name);
					free(temp, M_TEMP);
					return NULL;
				}
				memcpy(temp+n, name, namelen);
				n += namelen;
				break;
			case 'P':	/* process id */
				l = sprintf(buf, "%u", pid);
				if ((n + l) > MAXPATHLEN) {
					log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
					    pid, name, uid, temp, name);
					free(temp, M_TEMP);
					return NULL;
				}
				memcpy(temp+n, buf, l);
				n += l;
				break;
			case 'U':	/* user id */
				l = sprintf(buf, "%u", uid);
				if ((n + l) > MAXPATHLEN) {
					log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
					    pid, name, uid, temp, name);
					free(temp, M_TEMP);
					return NULL;
				}
				memcpy(temp+n, buf, l);
				n += l;
				break;
			default:
				log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format);
			}
			break;
		default:
			temp[n++] = format[i];
		}
	}
	temp[n] = '\0';
	return temp;
}

/*
 * Dump a process' core.
 The main routine does some
 * policy checking, and creates the name of the coredump;
 * then it passes on a vnode and a size limit to the process-specific
 * coredump routine if there is one; if there _is not_ one, it returns
 * ENOSYS; otherwise it returns the error from the process-specific routine.
 */
static int
coredump(p)
	register struct proc *p;
{
	register struct vnode *vp;
	register struct ucred *cred = p->p_ucred;
	struct nameidata nd;
	struct vattr vattr;
	int error, error1, flags;
	struct mount *mp;
	char *name;			/* name of corefile */
	off_t limit;

	STOPEVENT(p, S_CORE, 0);

	/* Refuse set-id processes unless sugid_coredump allows them. */
	if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0)
		return (EFAULT);

	/*
	 * Note that the bulk of limit checking is done after
	 * the corefile is created.  The exception is if the limit
	 * for corefiles is 0, in which case we don't bother
	 * creating the corefile at all.  This layout means that
	 * a corefile is truncated instead of not being created,
	 * if it is larger than the limit.
	 */
	limit = p->p_rlimit[RLIMIT_CORE].rlim_cur;
	if (limit == 0)
		return 0;

restart:
	name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid);
	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, p);
	flags = O_CREAT | FWRITE | O_NOFOLLOW;
	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR);
	free(name, M_TEMP);
	if (error)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vp = nd.ni_vp;

	/* If the mount point is suspended, close and retry after it wakes. */
	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
		VOP_UNLOCK(vp, 0, p);
		if ((error = vn_close(vp, FWRITE, cred, p)) != 0)
			return (error);
		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}

	/* Don't dump to non-regular files or files with links. */
	if (vp->v_type != VREG ||
	    VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) {
		error = EFAULT;
		goto out;
	}
	/* Truncate any existing file before writing the dump. */
	VATTR_NULL(&vattr);
	vattr.va_size = 0;
	VOP_LEASE(vp, p, cred, LEASE_WRITE);
	VOP_SETATTR(vp, &vattr, cred, p);
	p->p_acflag |= ACORE;

	error = p->p_sysent->sv_coredump ?
	  p->p_sysent->sv_coredump(p, vp, limit) :
	  ENOSYS;

out:
	VOP_UNLOCK(vp, 0, p);
	vn_finished_write(mp);
	error1 = vn_close(vp, FWRITE, cred, p);
	if (error == 0)
		error = error1;
	return (error);
}

/*
 * Nonexistent system call-- signal process (may want to handle it).
 * Flag error in case process won't see signal immediately (blocked or ignored).
 */
#ifndef _SYS_SYSPROTO_H_
struct nosys_args {
	int	dummy;
};
#endif
/* ARGSUSED */
int
nosys(p, args)
	struct proc *p;
	struct nosys_args *args;
{

	psignal(p, SIGSYS);
	return (EINVAL);
}

/*
 * Send a signal to a SIGIO or SIGURG to a process or process group using
 * stored credentials rather than those of the current process.
 */
void
pgsigio(sigio, sig, checkctty)
	struct sigio *sigio;
	int sig, checkctty;
{

	if (sigio == NULL)
		return;

	if (sigio->sio_pgid > 0) {
		/* Positive sio_pgid names a single process. */
		if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred,
		    sigio->sio_proc))
			psignal(sigio->sio_proc, sig);
	} else if (sigio->sio_pgid < 0) {
		/* Negative sio_pgid names a process group. */
		struct proc *p;

		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist)
			if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, p) &&
			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
				psignal(p, sig);
	}
}

/*
 * filt_sigattach: hook a signal knote onto the current process's klist.
 */
static int
filt_sigattach(struct knote *kn)
{
	struct proc *p = curproc;

	kn->kn_ptr.p_proc = p;
	kn->kn_flags |= EV_CLEAR;		/* automatically set */

	/* XXX lock the proc here while adding to the list? */
	SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext);

	return (0);
}

/*
 * filt_sigdetach: remove a signal knote from its process's klist.
 */
static void
filt_sigdetach(struct knote *kn)
{
	struct proc *p = kn->kn_ptr.p_proc;

	SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext);
}

/*
 * signal knotes are shared with proc knotes, so we apply a mask to
 * the hint in order to differentiate them from process hints.  This
 * could be avoided by using a signal-specific knote list, but probably
 * isn't worth the trouble.
/*
 * filt_signal: kevent filter; count NOTE_SIGNAL hints whose signal
 * number matches this knote's id, and report activity while the count
 * is nonzero.
 */
static int
filt_signal(struct knote *kn, long hint)
{

	if (hint & NOTE_SIGNAL) {
		hint &= ~NOTE_SIGNAL;

		if (kn->kn_id == hint)
			kn->kn_data++;
	}

	return (kn->kn_data != 0);
}
Index: head/sys/kern/kern_subr.c
===================================================================
--- head/sys/kern/kern_subr.c	(revision 72375)
+++ head/sys/kern/kern_subr.c	(revision 72376)
@@ -1,389 +1,389 @@
/*
 * Copyright (c) 1982, 1986, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_subr.c	8.3 (Berkeley) 1/21/94
 * $FreeBSD$
 */

/*
 * NOTE(review): the header names of the following #include directives were
 * lost in extraction (the angle-bracketed names were stripped); restore
 * them from the original kern_subr.c before compiling.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static void	uio_yield __P((void));

/*
 * uiomove: move 'n' bytes between the kernel buffer 'cp' and the user or
 * system space described by 'uio', advancing the uio (iov, resid, offset)
 * as it goes.  Returns 0 or an errno from copyin/copyout.
 */
int
uiomove(cp, n, uio)
	register caddr_t cp;
	register int n;
	register struct uio *uio;
{
	register struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc,
	    ("uiomove proc"));

	/* Mark the process deadlock-tolerant for the duration of the copy. */
	if (curproc) {
		save = curproc->p_flag & P_DEADLKTREAT;
		curproc->p_flag |= P_DEADLKTREAT;
	}

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
		case UIO_USERISPACE:
			/* Yield periodically so long copies don't hog the CPU. */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				break;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy((caddr_t)cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, (caddr_t)cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base += cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
	if (curproc)
		curproc->p_flag = (curproc->p_flag & ~P_DEADLKTREAT) | save;
	return (error);
}

/*
 * uiomoveco: like uiomove(), but on user-space reads may use
 * vm_uiomove() against 'obj' when ENABLE_VFS_IOOPT is configured,
 * vfs_ioopt is set and buffer, base and offset are all page-aligned.
 */
int
uiomoveco(cp, n, uio, obj)
	caddr_t cp;
	int n;
	struct uio *uio;
	struct vm_object *obj;
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
		case UIO_USERISPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ) {
#ifdef ENABLE_VFS_IOOPT
				if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) &&
					((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
					((uio->uio_offset & PAGE_MASK) == 0) &&
					((((intptr_t) cp) & PAGE_MASK) == 0)) {
					error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
						uio->uio_offset, cnt,
						(vm_offset_t) iov->iov_base, NULL);
				} else
#endif
				{
					error = copyout(cp, iov->iov_base, cnt);
				}
			} else {
				error = copyin(iov->iov_base, cp, cnt);
			}
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy((caddr_t)cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, (caddr_t)cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base += cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp += cnt;
		n -= cnt;
	}
	return (0);
}

#ifdef ENABLE_VFS_IOOPT

/*
 * uioread: page-flipping read path; moves whole pages from 'obj' into
 * the user address space via vm_uiomove() when alignment permits, and
 * reports the number of bytes moved through '*nread'.  Only active
 * when vfs_ioopt >= 2; otherwise returns 0 having moved nothing.
 */
int
uioread(n, uio, obj, nread)
	int n;
	struct uio *uio;
	struct vm_object *obj;
	int *nread;
{
	int npagesmoved;
	struct iovec *iov;
	u_int cnt, tcnt;
	int error;

	*nread = 0;
	if (vfs_ioopt < 2)
		return 0;

	error = 0;

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		if ((uio->uio_segflg == UIO_USERSPACE) &&
			((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
				 ((uio->uio_offset & PAGE_MASK) == 0) ) {

			/* Need at least one whole page to flip. */
			if (cnt < PAGE_SIZE)
				break;

			cnt &= ~PAGE_MASK;

			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			error = vm_uiomove(&curproc->p_vmspace->vm_map, obj,
				uio->uio_offset, cnt,
				(vm_offset_t) iov->iov_base, &npagesmoved);

			if (npagesmoved == 0)
				break;

			/* Advance by what was actually moved, not requested. */
			tcnt = npagesmoved * PAGE_SIZE;
			cnt = tcnt;

			if (error)
				break;

			iov->iov_base += cnt;
			iov->iov_len -= cnt;
			uio->uio_resid -= cnt;
			uio->uio_offset += cnt;
			*nread += cnt;
			n -= cnt;
		} else {
			break;
		}
	}
	return error;
}
#endif

/*
 * Give next character to user as result of read.
 */
int
ureadc(c, uio)
	register int c;
	register struct uio *uio;
{
	register struct iovec *iov;

again:
	if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
		panic("ureadc");
	iov = uio->uio_iov;
	if (iov->iov_len == 0) {
		uio->uio_iovcnt--;
		uio->uio_iov++;
		goto again;
	}
	switch (uio->uio_segflg) {

	case UIO_USERSPACE:
		if (subyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;

	case UIO_SYSSPACE:
		*iov->iov_base = c;
		break;

	case UIO_USERISPACE:
		if (suibyte(iov->iov_base, c) < 0)
			return (EFAULT);
		break;
	case UIO_NOCOPY:
		break;
	}
	iov->iov_base++;
	iov->iov_len--;
	uio->uio_resid--;
	uio->uio_offset++;
	return (0);
}

/*
 * General routine to allocate a hash table.
 */
void *
hashinit(elements, type, hashmask)
	int elements;
	struct malloc_type *type;
	u_long *hashmask;
{
	long hashsize;
	LIST_HEAD(generic, generic) *hashtbl;
	int i;

	if (elements <= 0)
		panic("hashinit: bad elements");
	/* Round down to the largest power of two <= elements. */
	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
		continue;
	hashsize >>= 1;
	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
	for (i = 0; i < hashsize; i++)
		LIST_INIT(&hashtbl[i]);
	*hashmask = hashsize - 1;
	return (hashtbl);
}

static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
			2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
			7159, 7673, 8191, 12281, 16381, 24571, 32749 };
#define	NPRIMES (sizeof(primes) / sizeof(primes[0]))

/*
 * General routine to allocate a prime number sized hash table.
*/ void * phashinit(elements, type, nentries) int elements; struct malloc_type *type; u_long *nentries; { long hashsize; LIST_HEAD(generic, generic) *hashtbl; int i; if (elements <= 0) panic("phashinit: bad elements"); for (i = 1, hashsize = primes[1]; hashsize <= elements;) { i++; if (i == NPRIMES) break; hashsize = primes[i]; } hashsize = primes[i - 1]; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); for (i = 0; i < hashsize; i++) LIST_INIT(&hashtbl[i]); *nentries = hashsize; return (hashtbl); } static void uio_yield() { struct proc *p; int s; p = curproc; s = splhigh(); mtx_lock_spin(&sched_lock); DROP_GIANT_NOSWITCH(); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); splx(s); } Index: head/sys/kern/kern_switch.c =================================================================== --- head/sys/kern/kern_switch.c (revision 72375) +++ head/sys/kern/kern_switch.c (revision 72376) @@ -1,256 +1,238 @@ /* * Copyright (c) 1999 Peter Wemm * All rights reserved. + * Copyright (c) 2001 Jake Burkholder + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include -#include #include /* - * We have NQS (32) run queues per scheduling class. For the normal - * class, there are 128 priorities scaled onto these 32 queues. New - * processes are added to the last entry in each queue, and processes - * are selected for running by taking them from the head and maintaining - * a simple FIFO arrangement. - * - * Interrupt, real time and idle priority processes have and explicit - * 0-31 priority which maps directly onto their class queue index. - * When a queue has something in it, the corresponding bit is set in - * the queuebits variable, allowing a single read to determine the - * state of all 32 queues and then a ffs() to find the first busy - * queue. - * - * XXX This needs fixing. First, we only have one idle process, so we - * hardly need 32 queues for it. Secondly, the number of classes - * makes things unwieldy. We should be able to merge them into a - * single 96 or 128 entry queue. + * Global run queue. */ -struct rq itqueues[NQS]; /* interrupt threads */ -struct rq rtqueues[NQS]; /* real time processes */ -struct rq queues[NQS]; /* time sharing processes */ -struct rq idqueues[NQS]; /* idle process */ -u_int32_t itqueuebits; -u_int32_t rtqueuebits; -u_int32_t queuebits; -u_int32_t idqueuebits; +static struct runq runq; +SYSINIT(runq, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, runq_init, &runq) /* - * Initialize the run queues at boot time. 
+ * Wrappers which implement old interface; act on global run queue. */ -static void -rqinit(void *dummy) + +struct proc * +chooseproc(void) { - int i; + return runq_choose(&runq); +} - for (i = 0; i < NQS; i++) { - TAILQ_INIT(&itqueues[i]); - TAILQ_INIT(&rtqueues[i]); - TAILQ_INIT(&queues[i]); - TAILQ_INIT(&idqueues[i]); - } +int +procrunnable(void) +{ + return runq_check(&runq); } -SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) -/* - * setrunqueue() examines a process priority and class and inserts it on - * the tail of it's appropriate run queue (based on class and priority). - * This sets the queue busy bit. - * The process must be runnable. - * This must be called at splhigh(). - */ void +remrunqueue(struct proc *p) +{ + runq_remove(&runq, p); +} + +void setrunqueue(struct proc *p) { - struct rq *q; - u_int8_t pri; + runq_add(&runq, p); +} - mtx_assert(&sched_lock, MA_OWNED); - KASSERT(p->p_stat == SRUN, ("setrunqueue: proc %p (%s) not SRUN", p, \ - p->p_comm)); +/* + * Clear the status bit of the queue corresponding to priority level pri, + * indicating that it is empty. + */ +static __inline void +runq_clrbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; - /* - * Decide which class we want to run. We now have four - * queues, and this is becoming ugly. We should be able to - * collapse the first three classes into a single contiguous - * queue. XXX FIXME. 
- */ - CTR4(KTR_PROC, "setrunqueue: proc %p (pid %d, %s), schedlock %lx", - p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock); - if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { /* interrupt thread */ - pri = p->p_rtprio.prio; - q = &itqueues[pri]; - itqueuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || /* real time */ - p->p_rtprio.type == RTP_PRIO_FIFO) { - pri = p->p_rtprio.prio; - q = &rtqueues[pri]; - rtqueuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { /* time sharing */ - pri = p->p_priority >> 2; - q = &queues[pri]; - queuebits |= 1 << pri; - } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { /* idle proc */ - pri = p->p_rtprio.prio; - q = &idqueues[pri]; - idqueuebits |= 1 << pri; - } else { - panic("setrunqueue: invalid rtprio type %d", p->p_rtprio.type); - } - p->p_rqindex = pri; /* remember the queue index */ - TAILQ_INSERT_TAIL(q, p, p_procq); + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri); } /* - * remrunqueue() removes a given process from the run queue that it is on, - * clearing the queue busy bit if it becomes empty. - * This must be called at splhigh(). + * Find the index of the first non-empty run queue. This is done by + * scanning the status bits, a set bit indicates a non-empty queue. */ +static __inline int +runq_findbit(struct runq *rq) +{ + struct rqbits *rqb; + int pri; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + pri = (RQB_FFS(rqb->rqb_bits[i]) - 1) + + (i << RQB_L2BPW); + CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d", + rqb->rqb_bits[i], i, pri); + return (pri); + } + + return (-1); +} + +/* + * Set the status bit of the queue corresponding to priority level pri, + * indicating that it is non-empty. 
+ */ +static __inline void +runq_setbit(struct runq *rq, int pri) +{ + struct rqbits *rqb; + + rqb = &rq->rq_status; + CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d", + rqb->rqb_bits[RQB_WORD(pri)], + rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri), + RQB_BIT(pri), RQB_WORD(pri)); + rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri); +} + +/* + * Add the process to the queue specified by its priority, and set the + * corresponding status bit. + */ void -remrunqueue(struct proc *p) +runq_add(struct runq *rq, struct proc *p) { - struct rq *q; - u_int32_t *which; - u_int8_t pri; + struct rqhead *rqh; + int pri; - CTR4(KTR_PROC, "remrunqueue: proc %p (pid %d, %s), schedlock %lx", - p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock); mtx_assert(&sched_lock, MA_OWNED); - pri = p->p_rqindex; - if (p->p_rtprio.type == RTP_PRIO_ITHREAD) { - q = &itqueues[pri]; - which = &itqueuebits; - } else if (p->p_rtprio.type == RTP_PRIO_REALTIME || - p->p_rtprio.type == RTP_PRIO_FIFO) { - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (p->p_rtprio.type == RTP_PRIO_NORMAL) { - q = &queues[pri]; - which = &queuebits; - } else if (p->p_rtprio.type == RTP_PRIO_IDLE) { - q = &idqueues[pri]; - which = &idqueuebits; - } else { - panic("remrunqueue: invalid rtprio type"); - } - TAILQ_REMOVE(q, p, p_procq); - if (TAILQ_EMPTY(q)) { - KASSERT((*which & (1 << pri)) != 0, - ("remrunqueue: remove from empty queue")); - *which &= ~(1 << pri); - } + KASSERT(p->p_stat == SRUN, ("runq_add: proc %p (%s) not SRUN", + p, p->p_comm)); + pri = p->p_pri.pri_level / RQ_PPQ; + p->p_rqindex = pri; + runq_setbit(rq, pri); + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_add: p=%p pri=%d %d rqh=%p", + p, p->p_pri.pri_level, pri, rqh); + TAILQ_INSERT_TAIL(rqh, p, p_procq); } /* - * procrunnable() returns a boolean true (non-zero) value if there are - * any runnable processes. This is intended to be called from the idle - * loop to avoid the more expensive (and destructive) chooseproc(). 
- * - * MP SAFE. CALLED WITHOUT THE MP LOCK - * - * XXX I doubt this. It's possibly fail-safe, but there's obviously - * the case here where one of the bits words gets loaded, the - * processor gets preempted, and by the time it returns from this - * function, some other processor has picked the runnable process. - * What am I missing? (grog, 23 July 2000). + * Return true if there are runnable processes of any priority on the run + * queue, false otherwise. Has no side effects, does not modify the run + * queue structure. */ -u_int32_t -procrunnable(void) +int +runq_check(struct runq *rq) { - return (itqueuebits || rtqueuebits || queuebits || idqueuebits); + struct rqbits *rqb; + int i; + + rqb = &rq->rq_status; + for (i = 0; i < RQB_LEN; i++) + if (rqb->rqb_bits[i]) { + CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d", + rqb->rqb_bits[i], i); + return (1); + } + CTR0(KTR_RUNQ, "runq_check: empty"); + + return (0); } /* - * chooseproc() selects the next process to run. Ideally, cpu_switch() - * would have determined that there is a process available before calling - * this, but it is not a requirement. The selected process is removed - * from it's queue, and the queue busy bit is cleared if it becomes empty. - * This must be called at splhigh(). - * - * For SMP, trivial affinity is implemented by locating the first process - * on the queue that has a matching lastcpu id. Since normal priorities - * are mapped four priority levels per queue, this may allow the cpu to - * choose a slightly lower priority process in order to preserve the cpu - * caches. + * Find and remove the highest priority process from the run queue. + * If there are no runnable processes, the per-cpu idle process is + * returned. Will not return NULL under any circumstances. 
*/ struct proc * -chooseproc(void) +runq_choose(struct runq *rq) { + struct rqhead *rqh; struct proc *p; - struct rq *q; - u_int32_t *which; - u_int32_t pri; -#ifdef SMP - u_char id; -#endif + int pri; mtx_assert(&sched_lock, MA_OWNED); - if (itqueuebits) { - pri = ffs(itqueuebits) - 1; - q = &itqueues[pri]; - which = &itqueuebits; - } else if (rtqueuebits) { - pri = ffs(rtqueuebits) - 1; - q = &rtqueues[pri]; - which = &rtqueuebits; - } else if (queuebits) { - pri = ffs(queuebits) - 1; - q = &queues[pri]; - which = &queuebits; - } else if (idqueuebits) { - pri = ffs(idqueuebits) - 1; - q = &idqueues[pri]; - which = &idqueuebits; - } else { - CTR1(KTR_PROC, "chooseproc: idleproc, schedlock %lx", - (long)sched_lock.mtx_lock); - return PCPU_GET(idleproc); - } - p = TAILQ_FIRST(q); -#ifdef SMP - /* wander down the current run queue for this pri level for a match */ - id = PCPU_GET(cpuid); - while (p->p_lastcpu != id) { - p = TAILQ_NEXT(p, p_procq); - if (p == NULL) { - p = TAILQ_FIRST(q); - break; + if ((pri = runq_findbit(rq)) != -1) { + rqh = &rq->rq_queues[pri]; + p = TAILQ_FIRST(rqh); + CTR3(KTR_RUNQ, "runq_choose: pri=%d p=%p rqh=%p", pri, p, rqh); + TAILQ_REMOVE(rqh, p, p_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_choose: empty"); + runq_clrbit(rq, pri); } + return (p); } -#endif - CTR4(KTR_PROC, "chooseproc: proc %p (pid %d, %s), schedlock %lx", - p, p->p_pid, p->p_comm, (long)sched_lock.mtx_lock); - KASSERT(p, ("chooseproc: no proc on busy queue")); - TAILQ_REMOVE(q, p, p_procq); - if (TAILQ_EMPTY(q)) - *which &= ~(1 << pri); - return p; + CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri); + + return (PCPU_GET(idleproc)); +} + +/* + * Initialize a run structure. + */ +void +runq_init(struct runq *rq) +{ + int i; + + for (i = 0; i < RQ_NQS; i++) + TAILQ_INIT(&rq->rq_queues[i]); +} + +/* + * Remove the process from the queue specified by its priority, and clear the + * corresponding status bit if the queue becomes empty. 
+ */ +void +runq_remove(struct runq *rq, struct proc *p) +{ + struct rqhead *rqh; + int pri; + + mtx_assert(&sched_lock, MA_OWNED); + pri = p->p_rqindex; + rqh = &rq->rq_queues[pri]; + CTR4(KTR_RUNQ, "runq_remove: p=%p pri=%d %d rqh=%p", + p, p->p_pri.pri_level, pri, rqh); + KASSERT(p != NULL, ("runq_remove: no proc on busy queue")); + TAILQ_REMOVE(rqh, p, p_procq); + if (TAILQ_EMPTY(rqh)) { + CTR0(KTR_RUNQ, "runq_remove: empty"); + runq_clrbit(rq, pri); + } } Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 72375) +++ head/sys/kern/kern_synch.c (revision 72376) @@ -1,1110 +1,1068 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include static void sched_setup __P((void *dummy)); SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) -u_char curpriority; int hogticks; int lbolt; int sched_quantum; /* Roundrobin scheduling quantum in ticks. 
*/ static struct callout schedcpu_callout; static struct callout roundrobin_callout; -static int curpriority_cmp __P((struct proc *p)); static void endtsleep __P((void *)); static void roundrobin __P((void *arg)); static void schedcpu __P((void *arg)); static int sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) { int error, new_val; new_val = sched_quantum * tick; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < tick) return (EINVAL); sched_quantum = new_val / tick; hogticks = 2 * sched_quantum; return (0); } SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof sched_quantum, sysctl_kern_quantum, "I", ""); -/*- - * Compare priorities. Return: - * <0: priority of p < current priority - * 0: priority of p == current priority - * >0: priority of p > current priority - * The priorities are the normal priorities or the normal realtime priorities - * if p is on the same scheduler as curproc. Otherwise the process on the - * more realtimeish scheduler has lowest priority. As usual, a higher - * priority really means a lower priority. - */ -static int -curpriority_cmp(p) - struct proc *p; -{ - int c_class, p_class; - - c_class = RTP_PRIO_BASE(curproc->p_rtprio.type); - p_class = RTP_PRIO_BASE(p->p_rtprio.type); - if (p_class != c_class) - return (p_class - c_class); - if (p_class == RTP_PRIO_NORMAL) - return (((int)p->p_priority - (int)curpriority) / PPQ); - return ((int)p->p_rtprio.prio - (int)curproc->p_rtprio.prio); -} - /* * Arrange to reschedule if necessary, taking the priorities and * schedulers into account. */ void -maybe_resched(chk) - struct proc *chk; +maybe_resched(p) + struct proc *p; { - struct proc *p = curproc; /* XXX */ - /* - * XXX idle scheduler still broken because proccess stays on idle - * scheduler during waits (such as when getting FS locks). 
If a - * standard process becomes runaway cpu-bound, the system can lockup - * due to idle-scheduler processes in wakeup never getting any cpu. - */ - if (p == PCPU_GET(idleproc)) { -#if 0 + if (p->p_pri.pri_level < curproc->p_pri.pri_level) need_resched(); -#endif - } else if (chk == p) { - /* We may need to yield if our priority has been raised. */ - if (curpriority_cmp(chk) > 0) - need_resched(); - } else if (curpriority_cmp(chk) < 0) - need_resched(); } int roundrobin_interval(void) { return (sched_quantum); } /* * Force switch among equal priority processes every 100ms. */ /* ARGSUSED */ static void roundrobin(arg) void *arg; { mtx_lock_spin(&sched_lock); need_resched(); mtx_unlock_spin(&sched_lock); #ifdef SMP forward_roundrobin(); #endif callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); } /* * Constants for digital decay and forget: * 90% of (p_estcpu) usage in 5 * loadav time * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * * Note that schedclock() updates p_estcpu and p_cpticks asynchronously. * * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. 
* That is, the system wants to compute a value of decay such * that the following for loop: * for (i = 0; i < (5 * loadavg); i++) * p_estcpu *= decay; * will compute * p_estcpu *= 0.1; * for all values of loadavg: * * Mathematically this loop can be expressed by saying: * decay ** (5 * loadavg) ~= .1 * * The system computes decay as: * decay = (2 * loadavg) / (2 * loadavg + 1) * * We wish to prove that the system's computation of decay * will always fulfill the equation: * decay ** (5 * loadavg) ~= .1 * * If we compute b as: * b = 2 * loadavg * then * decay = b / (b + 1) * * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. * For x close to zero, ln(1+x) =~ x, since * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). * ln(.1) =~ -2.30 * * Proof of (1): * Solve (factor)**(power) =~ .1 given power (5*loadav): * solving for factor, * ln(factor) =~ (-2.30/5*loadav), or * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED * * Proof of (2): * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): * solving for power, * power*ln(b/(b+1)) =~ -2.30, or * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. 
QED * * Actual power values for the implemented algorithm are as follows: * loadav: 1 2 3 4 * power: 5.68 10.32 14.94 19.55 */ /* calculations for digital decay to forget 90% of usage in 5*loadav sec */ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); /* * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). * * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). * * If you don't want to bother with the faster/more-accurate formula, you * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate * (more general) method of calculating the %age of CPU used by a process. */ #define CCPU_SHIFT 11 /* * Recompute process priorities, every hz ticks. * MP-safe, called without the Giant mutex. */ /* ARGSUSED */ static void schedcpu(arg) void *arg; { register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); register struct proc *p; register int realstathz, s; realstathz = stathz ? stathz : hz; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { /* * Increment time in/out of memory and sleep time * (if sleeping). We ignore overflow; with 16-bit int's * (remember them?) overflow takes 45 days. 
if (p->p_stat == SWAIT) continue; */ mtx_lock_spin(&sched_lock); p->p_swtime++; if (p->p_stat == SSLEEP || p->p_stat == SSTOP) p->p_slptime++; p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; /* * If the process has slept the entire second, * stop recalculating its priority until it wakes up. */ if (p->p_slptime > 1) { mtx_unlock_spin(&sched_lock); continue; } /* * prevent state changes and protect run queue */ s = splhigh(); /* * p_pctcpu is only for ps. */ #if (FSHIFT >= CCPU_SHIFT) p->p_pctcpu += (realstathz == 100)? ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT): 100 * (((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT)) / realstathz; #else p->p_pctcpu += ((FSCALE - ccpu) * (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT; #endif p->p_cpticks = 0; p->p_estcpu = decay_cpu(loadfac, p->p_estcpu); resetpriority(p); - if (p->p_priority >= PUSER) { + if (p->p_pri.pri_level >= PUSER) { if ((p != curproc) && #ifdef SMP p->p_oncpu == 0xff && /* idle */ #endif p->p_stat == SRUN && (p->p_sflag & PS_INMEM) && - (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) { + (p->p_pri.pri_level / RQ_PPQ) != + (p->p_pri.pri_user / RQ_PPQ)) { remrunqueue(p); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; setrunqueue(p); } else - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; } mtx_unlock_spin(&sched_lock); splx(s); } ALLPROC_LOCK(AP_RELEASE); vmmeter(); wakeup((caddr_t)&lbolt); callout_reset(&schedcpu_callout, hz, schedcpu, NULL); } /* * Recalculate the priority of a process after it has slept for a while. * For all load averages >= 1 and max p_estcpu of 255, sleeping for at * least six times the loadfactor will decay p_estcpu to zero. 
*/ void updatepri(p) register struct proc *p; { register unsigned int newcpu = p->p_estcpu; register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); if (p->p_slptime > 5 * loadfac) p->p_estcpu = 0; else { p->p_slptime--; /* the first time was done in schedcpu */ while (newcpu && --p->p_slptime) newcpu = decay_cpu(loadfac, newcpu); p->p_estcpu = newcpu; } resetpriority(p); } /* * We're only looking at 7 bits of the address; everything is * aligned to 4, lots of things are aligned to greater powers * of 2. Shift right by 8, i.e. drop the bottom 256 worth. */ #define TABLESIZE 128 static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; #define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) void sleepinit(void) { int i; sched_quantum = hz/10; hogticks = 2 * sched_quantum; for (i = 0; i < TABLESIZE; i++) TAILQ_INIT(&slpque[i]); } /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The mutex argument is exited before the caller is suspended, and * entered before msleep returns. If priority includes the PDROP * flag the mutex is not entered before returning. 
*/ int msleep(ident, mtx, priority, wmesg, timo) void *ident; struct mtx *mtx; int priority, timo; const char *wmesg; { struct proc *p = curproc; int s, sig, catch = priority & PCATCH; int rval = 0; WITNESS_SAVE_DECL(mtx); #ifdef KTRACE if (p && KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 1, 0); #endif WITNESS_SLEEP(0, mtx); mtx_lock_spin(&sched_lock); s = splhigh(); if (cold || panicstr) { /* * After a panic, or during autoconfiguration, * just give interrupts a chance, then just return; * don't run any other procs or panic below, * in case this is the idle process and already asleep. */ if (mtx != NULL && priority & PDROP) mtx_unlock_flags(mtx, MTX_NOSWITCH); mtx_unlock_spin(&sched_lock); splx(s); return (0); } DROP_GIANT_NOSWITCH(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(mtx, mtx); mtx_unlock_flags(mtx, MTX_NOSWITCH); if (priority & PDROP) mtx = NULL; } KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && p->p_stat == SRUN, ("msleep")); /* * Process may be sitting on a slpque if asleep() was called, remove * it before re-adding. */ if (p->p_wchan != NULL) unsleep(p); p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; - p->p_priority = priority & PRIMASK; + p->p_pri.pri_level = priority & PRIMASK; CTR4(KTR_PROC, "msleep: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq); if (timo) callout_reset(&p->p_slpcallout, timo, endtsleep, p); /* * We put ourselves on the sleep queue and start our timeout * before calling CURSIG, as we could stop there, and a wakeup * or a SIGCONT (or both) could occur while we were stopped. * A SIGCONT would cause us to be marked as SSLEEP * without resuming us, thus we must be ready for sleep * when CURSIG is called. If the wakeup happens while we're * stopped, p->p_wchan will be 0 upon return from CURSIG. 
*/ if (catch) { CTR4(KTR_PROC, "msleep caught: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); p->p_sflag |= PS_SINTR; mtx_unlock_spin(&sched_lock); if ((sig = CURSIG(p))) { mtx_lock_spin(&sched_lock); if (p->p_wchan) unsleep(p); p->p_stat = SRUN; goto resume; } mtx_lock_spin(&sched_lock); if (p->p_wchan == NULL) { catch = 0; goto resume; } } else sig = 0; p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); CTR4(KTR_PROC, "msleep resume: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); resume: - curpriority = p->p_usrpri; splx(s); p->p_sflag &= ~PS_SINTR; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; if (sig == 0) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif rval = EWOULDBLOCK; mtx_unlock_spin(&sched_lock); goto out; } } else if (timo) callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); if (catch && (sig != 0 || (sig = CURSIG(p)))) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; goto out; } out: #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(mtx, mtx); } return (rval); } /* * asleep() - async sleep call. Place process on wait queue and return * immediately without blocking. The process stays runnable until mawait() * is called. If ident is NULL, remove process from wait queue if it is still * on one. * * Only the most recent sleep condition is effective when making successive * calls to asleep() or when calling msleep(). * * The timeout, if any, is not initiated until mawait() is called. The sleep * priority, signal, and timeout is specified in the asleep() call but may be * overriden in the mawait() call. 
* * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> */ int asleep(void *ident, int priority, const char *wmesg, int timo) { struct proc *p = curproc; int s; /* * obtain sched_lock while manipulating sleep structures and slpque. * * Remove preexisting wait condition (if any) and place process * on appropriate slpque, but do not put process to sleep. */ s = splhigh(); mtx_lock_spin(&sched_lock); if (p->p_wchan != NULL) unsleep(p); if (ident) { p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; p->p_asleep.as_priority = priority; p->p_asleep.as_timo = timo; TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq); } mtx_unlock_spin(&sched_lock); splx(s); return(0); } /* * mawait() - wait for async condition to occur. The process blocks until * wakeup() is called on the most recent asleep() address. If wakeup is called * prior to mawait(), mawait() winds up being a NOP. * * If mawait() is called more than once (without an intervening asleep() call), * mawait() is still effectively a NOP but it calls mi_switch() to give other * processes some cpu before returning. The process is left runnable. * * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> */ int mawait(struct mtx *mtx, int priority, int timo) { struct proc *p = curproc; int rval = 0; int s; WITNESS_SAVE_DECL(mtx); WITNESS_SLEEP(0, mtx); mtx_lock_spin(&sched_lock); DROP_GIANT_NOSWITCH(); if (mtx != NULL) { mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(mtx, mtx); mtx_unlock_flags(mtx, MTX_NOSWITCH); if (priority & PDROP) mtx = NULL; } s = splhigh(); if (p->p_wchan != NULL) { int sig; int catch; /* * The call to mawait() can override defaults specified in * the original asleep().
*/ if (priority < 0) priority = p->p_asleep.as_priority; if (timo < 0) timo = p->p_asleep.as_timo; /* * Install timeout */ if (timo) callout_reset(&p->p_slpcallout, timo, endtsleep, p); sig = 0; catch = priority & PCATCH; if (catch) { p->p_sflag |= PS_SINTR; mtx_unlock_spin(&sched_lock); if ((sig = CURSIG(p))) { mtx_lock_spin(&sched_lock); if (p->p_wchan) unsleep(p); p->p_stat = SRUN; goto resume; } mtx_lock_spin(&sched_lock); if (p->p_wchan == NULL) { catch = 0; goto resume; } } p->p_stat = SSLEEP; p->p_stats->p_ru.ru_nvcsw++; mi_switch(); resume: - curpriority = p->p_usrpri; splx(s); p->p_sflag &= ~PS_SINTR; if (p->p_sflag & PS_TIMEOUT) { p->p_sflag &= ~PS_TIMEOUT; if (sig == 0) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif rval = EWOULDBLOCK; mtx_unlock_spin(&sched_lock); goto out; } } else if (timo) callout_stop(&p->p_slpcallout); mtx_unlock_spin(&sched_lock); if (catch && (sig != 0 || (sig = CURSIG(p)))) { #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig)) rval = EINTR; else rval = ERESTART; goto out; } #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) ktrcsw(p->p_tracep, 0, 0); #endif } else { /* * If as_priority is 0, mawait() has been called without an * intervening asleep(). We are still effectively a NOP, * but we call mi_switch() for safety. */ if (p->p_asleep.as_priority == 0) { p->p_stats->p_ru.ru_nvcsw++; mi_switch(); } mtx_unlock_spin(&sched_lock); splx(s); } /* * clear p_asleep.as_priority as an indication that mawait() has been * called. If mawait() is called again without an intervening asleep(), * mawait() is still effectively a NOP but the above mi_switch() code * is triggered as a safety. 
*/ p->p_asleep.as_priority = 0; out: PICKUP_GIANT(); if (mtx != NULL) { mtx_lock(mtx); WITNESS_RESTORE(mtx, mtx); } return (rval); } /* * Implement timeout for msleep or asleep()/mawait() * * If process hasn't been awakened (wchan non-zero), * set timeout flag and undo the sleep. If proc * is stopped, just unsleep so it will remain stopped. * MP-safe, called without the Giant mutex. */ static void endtsleep(arg) void *arg; { register struct proc *p; int s; p = (struct proc *)arg; CTR4(KTR_PROC, "endtsleep: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); s = splhigh(); mtx_lock_spin(&sched_lock); if (p->p_wchan) { if (p->p_stat == SSLEEP) setrunnable(p); else unsleep(p); p->p_sflag |= PS_TIMEOUT; } mtx_unlock_spin(&sched_lock); splx(s); } /* * Remove a process from its wait queue */ void unsleep(p) register struct proc *p; { int s; s = splhigh(); mtx_lock_spin(&sched_lock); if (p->p_wchan) { TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq); p->p_wchan = NULL; } mtx_unlock_spin(&sched_lock); splx(s); } /* * Make all processes sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { register struct slpquehead *qp; register struct proc *p; int s; s = splhigh(); mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; restart: TAILQ_FOREACH(p, qp, p_slpq) { if (p->p_wchan == ident) { TAILQ_REMOVE(qp, p, p_slpq); p->p_wchan = NULL; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR4(KTR_PROC, "wakeup: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(p); maybe_resched(p); } else { p->p_sflag |= PS_SWAPINREQ; wakeup((caddr_t)&proc0); } /* END INLINE EXPANSION */ goto restart; } } } mtx_unlock_spin(&sched_lock); splx(s); } /* * Make a process sleeping on the specified identifier runnable. 
* May wake more than one process if a target process is currently * swapped out. */ void wakeup_one(ident) register void *ident; { register struct slpquehead *qp; register struct proc *p; int s; s = splhigh(); mtx_lock_spin(&sched_lock); qp = &slpque[LOOKUP(ident)]; TAILQ_FOREACH(p, qp, p_slpq) { if (p->p_wchan == ident) { TAILQ_REMOVE(qp, p, p_slpq); p->p_wchan = NULL; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ CTR4(KTR_PROC, "wakeup1: proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) { setrunqueue(p); maybe_resched(p); break; } else { p->p_sflag |= PS_SWAPINREQ; wakeup((caddr_t)&proc0); } /* END INLINE EXPANSION */ } } } mtx_unlock_spin(&sched_lock); splx(s); } /* * The machine independent parts of mi_switch(). * Must be called at splstatclock() or higher. */ void mi_switch() { struct timeval new_switchtime; register struct proc *p = curproc; /* XXX */ #if 0 register struct rlimit *rlim; #endif int x; /* * XXX this spl is almost unnecessary. It is partly to allow for * sloppy callers that don't do it (issignal() via CURSIG() is the * main offender). It is partly to work around a bug in the i386 * cpu_switch() (the ipl is not preserved). We ran for years * without it. I think there was only a interrupt latency problem. * The main caller, msleep(), does an splx() a couple of instructions * after calling here. The buggy caller, issignal(), usually calls * here at spl0() and sometimes returns at splhigh(). The process * then runs for a little too long at splhigh(). The ipl gets fixed * when the process returns to user mode (or earlier). * * It would probably be better to always call here at spl0(). Callers * are prepared to give up control to another process, so they must * be prepared to be interrupted. The clock stuff here may not * actually need splstatclock(). 
*/ x = splstatclock(); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ microuptime(&new_switchtime); if (timevalcmp(&new_switchtime, PCPU_PTR(switchtime), <)) { #if 0 /* XXX: This doesn't play well with sched_lock right now. */ printf("microuptime() went backwards (%ld.%06ld -> %ld.%06ld)\n", PCPU_GET(switchtime.tv_sec), PCPU_GET(switchtime.tv_usec), new_switchtime.tv_sec, new_switchtime.tv_usec); #endif new_switchtime = PCPU_GET(switchtime); } else { p->p_runtime += (new_switchtime.tv_usec - PCPU_GET(switchtime.tv_usec)) + (new_switchtime.tv_sec - PCPU_GET(switchtime.tv_sec)) * (int64_t)1000000; } #if 0 /* * Check if the process exceeds its cpu resource allocation. * If over max, kill it. * * XXX drop sched_lock, pickup Giant */ if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && p->p_runtime > p->p_limit->p_cpulimit) { rlim = &p->p_rlimit[RLIMIT_CPU]; if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { mtx_unlock_spin(&sched_lock); killproc(p, "exceeded maximum CPU limit"); mtx_lock_spin(&sched_lock); } else { mtx_unlock_spin(&sched_lock); psignal(p, SIGXCPU); mtx_lock_spin(&sched_lock); if (rlim->rlim_cur < rlim->rlim_max) { /* XXX: we should make a private copy */ rlim->rlim_cur += 5; } } } #endif /* * Pick a new current process and record its start time. */ cnt.v_swtch++; PCPU_SET(switchtime, new_switchtime); CTR4(KTR_PROC, "mi_switch: old proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); cpu_switch(); CTR4(KTR_PROC, "mi_switch: new proc %p (pid %d, %s), schedlock %p", p, p->p_pid, p->p_comm, (void *) sched_lock.mtx_lock); if (PCPU_GET(switchtime.tv_sec) == 0) microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); splx(x); } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. 
*/ void setrunnable(p) register struct proc *p; { register int s; s = splhigh(); mtx_lock_spin(&sched_lock); switch (p->p_stat) { case 0: case SRUN: case SZOMB: case SWAIT: default: panic("setrunnable"); case SSTOP: case SSLEEP: /* e.g. when sending signals */ if (p->p_sflag & PS_CVWAITQ) cv_waitq_remove(p); else unsleep(p); break; case SIDL: break; } p->p_stat = SRUN; if (p->p_sflag & PS_INMEM) setrunqueue(p); splx(s); if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; if ((p->p_sflag & PS_INMEM) == 0) { p->p_sflag |= PS_SWAPINREQ; wakeup((caddr_t)&proc0); } else maybe_resched(p); mtx_unlock_spin(&sched_lock); } /* * Compute the priority of a process when running in user mode. * Arrange to reschedule if the resulting priority is better * than that of the current process. */ void resetpriority(p) register struct proc *p; { register unsigned int newpriority; mtx_lock_spin(&sched_lock); - if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + if (p->p_pri.pri_class == PRI_TIMESHARE) { newpriority = PUSER + p->p_estcpu / INVERSE_ESTCPU_WEIGHT + NICE_WEIGHT * (p->p_nice - PRIO_MIN); - newpriority = min(newpriority, MAXPRI); - p->p_usrpri = newpriority; + newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), + PRI_MAX_TIMESHARE); + p->p_pri.pri_user = newpriority; } maybe_resched(p); mtx_unlock_spin(&sched_lock); } /* ARGSUSED */ static void sched_setup(dummy) void *dummy; { callout_init(&schedcpu_callout, 1); callout_init(&roundrobin_callout, 0); /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); } /* * We adjust the priority of the current process. The priority of * a process gets worse as it accumulates CPU time. The cpu usage * estimator (p_estcpu) is increased here. resetpriority() will * compute a different priority each time p_estcpu increases by * INVERSE_ESTCPU_WEIGHT * (until MAXPRI is reached). 
The cpu usage estimator ramps up * quite quickly when the process is running (linearly), and decays * away exponentially, at a rate which is proportionally slower when * the system is busy. The basic principle is that the system will * 90% forget that the process used a lot of CPU time in 5 * loadav * seconds. This causes the system to favor processes which haven't * run much recently, and to round-robin among other processes. */ void schedclock(p) struct proc *p; { p->p_cpticks++; p->p_estcpu = ESTCPULIM(p->p_estcpu + 1); if ((p->p_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(p); - if (p->p_priority >= PUSER) - p->p_priority = p->p_usrpri; + if (p->p_pri.pri_level >= PUSER) + p->p_pri.pri_level = p->p_pri.pri_user; } } /* * General purpose yield system call */ int yield(struct proc *p, struct yield_args *uap) { int s; p->p_retval[0] = 0; s = splhigh(); mtx_lock_spin(&sched_lock); DROP_GIANT_NOSWITCH(); - p->p_priority = MAXPRI; + p->p_pri.pri_level = PRI_MAX_TIMESHARE; setrunqueue(p); p->p_stats->p_ru.ru_nvcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); splx(s); return (0); } Index: head/sys/kern/ksched.c =================================================================== --- head/sys/kern/ksched.c (revision 72375) +++ head/sys/kern/ksched.c (revision 72376) @@ -1,264 +1,269 @@ /* * Copyright (c) 1996, 1997 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* ksched: Soft real time scheduling based on "rtprio". */ #include #include #include #include #include /* For need_resched */ #include /* For need_resched */ #include /* ksched: Real-time extension to support POSIX priority scheduling. */ struct ksched { struct timespec rr_interval; }; int ksched_attach(struct ksched **p) { struct ksched *ksched= p31b_malloc(sizeof(*ksched)); ksched->rr_interval.tv_sec = 0; ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); *p = ksched; return 0; } int ksched_detach(struct ksched *p) { p31b_free(p); return 0; } /* * XXX About priorities * * POSIX 1003.1b requires that numerically higher priorities be of * higher priority. It also permits sched_setparam to be * implementation defined for SCHED_OTHER. 
I don't like * the notion of inverted priorities for normal processes when * you can use "setpriority" for that. * * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. */ /* Macros to convert between the unix (lower numerically is higher priority) * and POSIX 1003.1b (higher numerically is higher priority) */ #define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) #define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) /* These improve readability a bit for me: */ #define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) #define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) static __inline int getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { + struct rtprio rtp; int e = 0; - switch (p->p_rtprio.type) + pri_to_rtp(&p->p_pri, &rtp); + switch (rtp.type) { case RTP_PRIO_FIFO: *ret = SCHED_FIFO; break; case RTP_PRIO_REALTIME: *ret = SCHED_RR; break; default: *ret = SCHED_OTHER; break; } return e; } int ksched_setparam(register_t *ret, struct ksched *ksched, struct proc *p, const struct sched_param *param) { register_t policy; int e; e = getscheduler(&policy, ksched, p); if (e == 0) { if (policy == SCHED_OTHER) e = EINVAL; else e = ksched_setscheduler(ret, ksched, p, policy, param); } return e; } int ksched_getparam(register_t *ret, struct ksched *ksched, struct proc *p, struct sched_param *param) { - if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) - param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio); + struct rtprio rtp; + pri_to_rtp(&p->p_pri, &rtp); + if (RTP_PRIO_IS_REALTIME(rtp.type)) + param->sched_priority = rtpprio_to_p4prio(rtp.prio); + return 0; } /* * XXX The priority and scheduler modifications should * be moved into published interfaces in kern/kern_sync. * * The permissions to modify process p were checked in "p31b_proc()".
* */ int ksched_setscheduler(register_t *ret, struct ksched *ksched, struct proc *p, int policy, const struct sched_param *param) { int e = 0; struct rtprio rtp; switch(policy) { case SCHED_RR: case SCHED_FIFO: if (param->sched_priority >= P1B_PRIO_MIN && param->sched_priority <= P1B_PRIO_MAX) { rtp.prio = p4prio_to_rtpprio(param->sched_priority); rtp.type = (policy == SCHED_FIFO) ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); need_resched(); } else e = EPERM; break; case SCHED_OTHER: { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); /* XXX Simply revert to whatever we had for last * normal scheduler priorities. * This puts a requirement * on the scheduling code: You must leave the * scheduling info alone. */ need_resched(); } break; } return e; } int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { return getscheduler(ret, ksched, p); } /* ksched_yield: Yield the CPU. 
*/ int ksched_yield(register_t *ret, struct ksched *ksched) { need_resched(); return 0; } int ksched_get_priority_max(register_t*ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = RTP_PRIO_MAX; break; case SCHED_OTHER: *ret = PRIO_MAX; break; default: e = EINVAL; } return e; } int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = P1B_PRIO_MIN; break; case SCHED_OTHER: *ret = PRIO_MIN; break; default: e = EINVAL; } return e; } int ksched_rr_get_interval(register_t *ret, struct ksched *ksched, struct proc *p, struct timespec *timespec) { *timespec = ksched->rr_interval; return 0; } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 72375) +++ head/sys/kern/subr_trap.c (revision 72376) @@ -1,1328 +1,1327 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 * $FreeBSD$ */ /* * 386 Trap and System call handling */ #include "opt_clock.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_isa.h" #include "opt_ktrace.h" #include "opt_npx.h" #include "opt_trap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #ifdef POWERFAIL_NMI #include #include #endif #include #include #include int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void ast __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); static void trap_fatal __P((struct trapframe *, vm_offset_t)); void dblfault_handler 
__P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 28 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ "machine check trap", /* 28 T_MCHK */ }; #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif #ifdef DDB static int ddb_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, &ddb_on_nmi, 0, "Go to DDB on NMI"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, &panic_on_nmi, 0, "Panic on NMI"); #ifdef WITNESS extern char *syscallnames[]; #endif void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig; while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); - p->p_priority = p->p_usrpri; + p->p_pri.pri_level = p->p_pri.pri_user; if (resched_wanted()) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) 
* If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ clear_resched(); DROP_GIANT_NOSWITCH(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); while ((sig = CURSIG(p)) != 0) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); postsig(sig); } mtx_lock_spin(&sched_lock); } /* * Charge system time if profiling. */ if (p->p_sflag & PS_PROFIL) { mtx_unlock_spin(&sched_lock); /* XXX - do we need Giant? */ if (!mtx_owned(&Giant)) mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, TRAPF_PC(frame), (u_int)(p->p_sticks - oticks) * psratio); } - curpriority = p->p_priority; mtx_unlock_spin(&sched_lock); } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; vm_offset_t eva; #ifdef POWERFAIL_NMI static int lastalert = 0; #endif atomic_add_int(&cnt.v_trap, 1); if ((frame.tf_eflags & PSL_I) == 0) { /* * Buggy application or kernel code has disabled * interrupts and then trapped. Enabling interrupts * now is wrong, but it is better than running with * interrupts disabled until they are accidentally * enabled later. XXX This is really bad if we trap * while holding a spin lock. */ type = frame.tf_trapno; if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) printf( "pid %ld (%s): trap %d with interrupts disabled\n", (long)curproc->p_pid, curproc->p_comm, type); else if (type != T_BPTFLT && type != T_TRCTRAP) { /* * XXX not quite right, since this may be for a * multiple fault in user mode. 
*/ printf("kernel trap %d with interrupts disabled\n", type); /* * We should walk p_heldmtx here and see if any are * spin mutexes, and not do this if so. */ enable_intr(); } } eva = 0; #if defined(I586_CPU) && !defined(NO_F00F_HACK) restart: #endif type = frame.tf_trapno; code = frame.tf_err; if ((ISPL(frame.tf_cs) == SEL_UPL) || ((frame.tf_eflags & PSL_VM) && !in_vm86call)) { /* user trap */ mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i == 0) goto user; break; } /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. */ eva = rcr2(); enable_intr(); mtx_lock(&Giant); i = trap_pfault(&frame, TRUE, eva); mtx_unlock(&Giant); #if defined(I586_CPU) && !defined(NO_F00F_HACK) if (i == -2) { /* * f00f hack workaround has triggered, treat * as illegal instruction not page fault. 
*/ frame.tf_trapno = T_PRIVINFLT; goto restart; } #endif if (i == -1) goto out; if (i == 0) goto user; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV; i = SIGFPE; break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* machine/parity/power fail/"kitchen sink" faults */ /* XXX Giant */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi) panic("NMI indicates hardware failure"); break; #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_FLTSUB; i = SIGFPE; break; case T_DNA: #ifdef DEV_NPX /* transparent fault (due to context switch "late") */ if (npxdna()) goto out; #endif if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } mtx_lock(&Giant); i = (*pmath_emulate)(&frame); mtx_unlock(&Giant); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) goto out; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ /* * For some Cyrix CPUs, %cr2 is clobbered by * interrupts. This problem is worked around by using * an interrupt gate for the pagefault handler. We * are finally ready to read %cr2 and then must * reenable interrupts. 
*/ eva = rcr2(); enable_intr(); mtx_lock(&Giant); (void) trap_pfault(&frame, FALSE, eva); mtx_unlock(&Giant); goto out; case T_DNA: #ifdef DEV_NPX /* * The kernel is apparently using npx for copying. * XXX this should be fatal unless the kernel has * registered such use. */ if (npxdna()) goto out; #endif break; /* * The following two traps can happen in * vm86 mode, and, if so, we want to handle * them specially. */ case T_PROTFLT: /* general protection fault */ case T_STKFLT: /* stack fault */ if (frame.tf_eflags & PSL_VM) { mtx_lock(&Giant); i = vm86_emulate((struct vm86frame *)&frame); mtx_unlock(&Giant); if (i != 0) /* * returns to original process */ vm86_trap((struct vm86frame *)&frame); goto out; } if (type == T_STKFLT) break; /* FALL THROUGH */ case T_SEGNPFLT: /* segment not present fault */ if (in_vm86call) break; if (p->p_intr_nesting_level != 0) break; /* * Invalid %fs's and %gs's can be created using * procfs or PT_SETREGS or by invalidating the * underlying LDT entry. This causes a fault * in kernel mode when the kernel attempts to * switch contexts. Lose the bad context * (XXX) so that we can continue, and generate * a signal. */ if (frame.tf_eip == (int)cpu_switch_load_gs) { PCPU_GET(curpcb)->pcb_gs = 0; mtx_lock(&Giant); psignal(p, SIGBUS); mtx_unlock(&Giant); goto out; } /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. 
*/ if (frame.tf_eip == (int)doreti_iret) { frame.tf_eip = (int)doreti_iret_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_ds) { frame.tf_eip = (int)doreti_popl_ds_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_es) { frame.tf_eip = (int)doreti_popl_es_fault; goto out; } if (frame.tf_eip == (int)doreti_popl_fs) { frame.tf_eip = (int)doreti_popl_fs_fault; goto out; } if (PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame.tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; goto out; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; goto out; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ goto out; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; goto out; } /* * Ignore debug register trace traps due to * accesses in the user's address space, which * can happen under several conditions such as * if a user sets a watchpoint on a buffer and * then passes that buffer to a system call. * We still want to get TRCTRAPS for addresses * in kernel space because that is useful when * debugging the kernel. */ /* XXX Giant */ if (user_dbreg_trap() && !in_vm86call) { /* * Reset breakpoint bits because the * processor doesn't */ load_dr6(rdr6() & 0xfffffff0); goto out; } /* * Fall through (TRCTRAP kernel mode, kernel address) */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. 
* Otherwise, debugger traps "can't happen". */ #ifdef DDB /* XXX Giant */ if (kdb_trap (type, 0, &frame)) goto out; #endif break; #ifdef DEV_ISA case T_NMI: #ifdef POWERFAIL_NMI mtx_lock(&Giant); if (time_second - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time_second; } mtx_unlock(&Giant); goto out; #else /* !POWERFAIL_NMI */ /* XXX Giant */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) { #ifdef DDB /* * NMI can be hooked up to a pushbutton * for debugging. */ if (ddb_on_nmi) { printf ("NMI ... going to debugger\n"); kdb_trap (type, 0, &frame); } #endif /* DDB */ goto out; } else if (panic_on_nmi == 0) goto out; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* DEV_ISA */ } mtx_lock(&Giant); trap_fatal(&frame, eva); mtx_unlock(&Giant); goto out; } mtx_lock(&Giant); /* Translate fault for emulators (e.g. Linux) */ if (*p->p_sysent->sv_transtrap) i = (*p->p_sysent->sv_transtrap)(i, type); trapsignal(p, i, ucode); #ifdef DEBUG if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%lx", (u_long)eva); uprintf("\n"); } #endif mtx_unlock(&Giant); user: userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); out: return; } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. 
*/ static int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; va = trunc_page(eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t mpte; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (p->p_intr_nesting_level != 0 || PCPU_GET(curpcb) == NULL || PCPU_GET(curpcb)->pcb_onfault == NULL))) { trap_fatal(frame, eva); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. */ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. 
*/ rv = vm_fault(kernel_map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode, eva) struct trapframe *frame; int usermode; vm_offset_t eva; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; struct proc *p = curproc; va = trunc_page(eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. * An exception: if the faulting address is the invalid * instruction entry in the IDT, then the Intel Pentium * F00F bug workaround was triggered, and we need to * treat it is as an illegal instruction, and not a page * fault. */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) return -2; #endif if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { /* * Keep swapout from messing with us during this * critical time. */ PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); /* * Grow the stack if necessary */ /* grow_stack returns false only if va falls into * a growable stack region and the stack growth * fails. It returns true if va was not within * a growable stack region, or if the stack * growth succeeded. 
*/ if (!grow_stack (p, va)) { rv = KERN_FAILURE; PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); goto nogo; } /* Fault in the user page: */ rv = vm_fault(map, va, ftype, (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); } else { /* * Don't have to worry about process locking or stacks in the * kernel. */ rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (p->p_intr_nesting_level == 0 && PCPU_GET(curpcb) != NULL && PCPU_GET(curpcb)->pcb_onfault != NULL) { frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault; return (0); } trap_fatal(frame, eva); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame, eva) struct trapframe *frame; vm_offset_t eva; { int code, type, ss, esp; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], frame->tf_eflags & PSL_VM ? "vm86" : ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? 
"protection violation" : "page not present"); } printf("instruction pointer	= 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { ss = frame->tf_ss & 0xffff; esp = frame->tf_esp; } else { ss = GSEL(GDATA_SEL, SEL_KPL); esp = (int)&frame->tf_esp; } printf("stack pointer = 0x%x:0x%x\n", ss, esp); printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if ((debugger_on_panic || db_active) && kdb_trap(type, 0, frame)) return; #endif printf("trap number = %d\n", type); if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at <kstack> unless * the machine was idle when the double fault occurred. The downside * of this is that "trace <ebp>" in ddb won't work. 
*/ void dblfault_handler() { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip)); printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp)); printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp)); #ifdef SMP /* two separate prints in case of a trap on an unmapped page */ printf("cpuid = %d; ", PCPU_GET(cpuid)); printf("lapic.id = %08x\n", lapic.id); #endif panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; PROC_LOCK(p); ++p->p_lock; PROC_UNLOCK(p); if (!grow_stack (p, va)) { PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); return (1); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_WRITE, VM_FAULT_DIRTY); PROC_LOCK(p); --p->p_lock; PROC_UNLOCK(p); if (rv != KERN_SUCCESS) return 1; return (0); } /* * syscall - MP aware system call request C handler * * A system call is essentially treated as a trap except that the * MP lock is not held on entry or return. We are responsible for * obtaining the MP lock if necessary and for handling ASTs * (e.g. a task switch) prior to return. * * In general, only simple access and manipulation of curproc and * the current stack is allowed without having to hold MP lock. 
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int narg; int args[8]; u_int code; atomic_add_int(&cnt.v_syscall, 1); #ifdef DIAGNOSTIC if (ISPL(frame.tf_cs) != SEL_UPL) { mtx_lock(&Giant); panic("syscall"); /* NOT REACHED */ } #endif mtx_lock_spin(&sched_lock); sticks = p->p_sticks; mtx_unlock_spin(&sched_lock); p->p_md.md_regs = &frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; if (p->p_sysent->sv_prepsyscall) { /* * The prep code is not MP aware. */ mtx_lock(&Giant); (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); mtx_unlock(&Giant); } else { /* * Need to check if this is a 32 bit or 64 bit syscall. * fuword is MP aware. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; narg = callp->sy_narg & SYF_ARGMASK; /* * copyin is MP aware, but the tracing code is not */ if (params && (i = narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { mtx_lock(&Giant); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, narg, args); #endif goto bad; } /* * Try to run the syscall without the MP lock if the syscall * is MP safe. 
We have to obtain the MP lock no matter what if * we are ktracing */ if ((callp->sy_narg & SYF_MPSAFE) == 0) { mtx_lock(&Giant); } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsyscall(p->p_tracep, code, narg, args); } #endif p->p_retval[0] = 0; p->p_retval[1] = frame.tf_edx; STOPEVENT(p, S_SCE, narg); /* MP aware */ error = (*callp->sy_call)(p, args); /* * MP SAFE (we may or may not have the MP lock at this point) */ switch (error) { case 0: frame.tf_eax = p->p_retval[0]; frame.tf_edx = p->p_retval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes, * int 0x80 is 2 bytes. We saved this in tf_err. */ frame.tf_eip -= frame.tf_err; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) { if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; } frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } /* * Traced syscall. trapsignal() is not MP aware. */ if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } /* * Handle reschedule and other end-of-syscall issues */ userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) { if (!mtx_owned(&Giant)) mtx_lock(&Giant); ktrsysret(p->p_tracep, code, error, p->p_retval[0]); } #endif /* * This works because errno is findable through the * register set. If we ever support an emulation where this * is not the case, this code will need to be revisited. 
*/ STOPEVENT(p, S_SCX, code); /* * Release Giant if we had to get it */ if (mtx_owned(&Giant)) mtx_unlock(&Giant); #ifdef WITNESS if (witness_list(p)) { panic("system call %s returning with mutex(s) held\n", syscallnames[code]); } #endif mtx_assert(&sched_lock, MA_NOTOWNED); mtx_assert(&Giant, MA_NOTOWNED); } void ast(frame) struct trapframe frame; { struct proc *p = CURPROC; u_quad_t sticks; KASSERT(TRAPF_USERMODE(&frame), ("ast in kernel mode")); /* * We check for a pending AST here rather than in the assembly as * acquiring and releasing mutexes in assembly is not fun. */ mtx_lock_spin(&sched_lock); if (!(astpending() || resched_wanted())) { mtx_unlock_spin(&sched_lock); return; } sticks = p->p_sticks; astoff(); mtx_intr_enable(&sched_lock); atomic_add_int(&cnt.v_soft, 1); if (p->p_sflag & PS_OWEUPC) { p->p_sflag &= ~PS_OWEUPC; mtx_unlock_spin(&sched_lock); mtx_lock(&Giant); mtx_lock_spin(&sched_lock); addupc_task(p, p->p_stats->p_prof.pr_addr, p->p_stats->p_prof.pr_ticks); } if (p->p_sflag & PS_ALRMPEND) { p->p_sflag &= ~PS_ALRMPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGVTALRM); mtx_lock_spin(&sched_lock); } if (p->p_sflag & PS_PROFPEND) { p->p_sflag &= ~PS_PROFPEND; mtx_unlock_spin(&sched_lock); if (!mtx_owned(&Giant)) mtx_lock(&Giant); psignal(p, SIGPROF); } else mtx_unlock_spin(&sched_lock); userret(p, &frame, sticks); if (mtx_owned(&Giant)) mtx_unlock(&Giant); } Index: head/sys/kern/subr_turnstile.c =================================================================== --- head/sys/kern/subr_turnstile.c (revision 72375) +++ head/sys/kern/subr_turnstile.c (revision 72376) @@ -1,1705 +1,1680 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ * $FreeBSD$ */ /* * Machine independent bits of mutex implementation and implementation of * `witness' structure & related debugging routines. 
*/ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ #include "opt_ddb.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The WITNESS-enabled mutex debug structure. */ #ifdef WITNESS struct mtx_debug { struct witness *mtxd_witness; LIST_ENTRY(mtx) mtxd_held; const char *mtxd_file; int mtxd_line; }; #define mtx_held mtx_debug->mtxd_held #define mtx_file mtx_debug->mtxd_file #define mtx_line mtx_debug->mtxd_line #define mtx_witness mtx_debug->mtxd_witness #endif /* WITNESS */ /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) #define RETIP(x) *(((uintptr_t *)(&x)) - 1) -#define SET_PRIO(p, pri) (p)->p_priority = (pri) +#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri) /* * Early WITNESS-enabled declarations. */ #ifdef WITNESS /* * Internal WITNESS routines which must be prototyped early. * * XXX: When/if witness code is cleaned up, it would be wise to place all * witness prototyping early in this file. 
static void witness_init(struct mtx *, int flag); static void witness_destroy(struct mtx *); static void witness_display(void(*)(const char *fmt, ...)); MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure"); /* All mutexes in system (used for debug/panic) */ static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; #else /* WITNESS */ /* XXX XXX XXX * flag++ is sleazoid way of shutting up warning */ #define witness_init(m, flag) flag++ #define witness_destroy(m) #define witness_try_enter(m, t, f, l) #endif /* WITNESS */ /* * All mutex locks in system are kept on the all_mtx list. */ static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), { NULL, NULL }, &all_mtx, &all_mtx, #ifdef WITNESS &all_mtx_debug #else NULL #endif }; /* * Global variables for book keeping. */ static int mtx_cur_cnt; static int mtx_max_cnt; /* * Couple of strings for KTR_LOCK tracing in order to avoid duplicates. */ char STR_mtx_lock_slp[] = "GOT (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_slp[] = "REL (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_lock_spn[] = "GOT (spin) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_spn[] = "REL (spin) %s [%p] r=%d at %s:%d"; /* * Prototypes for non-exported routines. * * NOTE: Prototypes for witness routines are placed at the bottom of the file. */ static void propagate_priority(struct proc *); static void propagate_priority(struct proc *p) { - int pri = p->p_priority; + int pri = p->p_pri.pri_level; struct mtx *m = p->p_blocked; mtx_assert(&sched_lock, MA_OWNED); for (;;) { struct proc *p1; p = mtx_owner(m); if (p == NULL) { /* * This really isn't quite right. Really * ought to bump priority of process that * next acquires the mutex. 
*/ MPASS(m->mtx_lock == MTX_CONTESTED); return; } MPASS(p->p_magic == P_MAGIC); KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex")); - if (p->p_priority <= pri) + if (p->p_pri.pri_level <= pri) return; /* * Bump this process' priority. */ SET_PRIO(p, pri); /* * If lock holder is actually running, just bump priority. */ -#ifdef SMP - /* - * For SMP, we can check the p_oncpu field to see if we are - * running. - */ if (p->p_oncpu != 0xff) { MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); return; } -#else + /* - * For UP, we check to see if p is curproc (this shouldn't - * ever happen however as it would mean we are in a deadlock.) - */ - if (p == curproc) { - panic("Deadlock detected"); - return; - } -#endif - /* * If on run queue move to new run queue, and * quit. */ if (p->p_stat == SRUN) { - printf("XXX: moving proc %d(%s) to a new run queue\n", - p->p_pid, p->p_comm); MPASS(p->p_blocked == NULL); remrunqueue(p); setrunqueue(p); return; } /* * If we aren't blocked on a mutex, we should be. */ KASSERT(p->p_stat == SMTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", p->p_pid, p->p_comm, p->p_stat, m->mtx_description)); /* * Pick up the mutex that p is blocked on. */ m = p->p_blocked; MPASS(m != NULL); - printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, - p->p_comm, m->mtx_description); - /* * Check if the proc needs to be moved up on * the blocked chain */ if (p == TAILQ_FIRST(&m->mtx_blocked)) { - printf("XXX: process at head of run queue\n"); continue; } - p1 = TAILQ_PREV(p, rq, p_procq); - if (p1->p_priority <= pri) { - printf( - "XXX: previous process %d(%s) has higher priority\n", - p->p_pid, p->p_comm); + p1 = TAILQ_PREV(p, procqueue, p_procq); + if (p1->p_pri.pri_level <= pri) { continue; } /* * Remove proc from blocked chain and determine where * it should be moved up to. 
Since we know that p1 has * a lower priority than p, we know that at least one * process in the chain has a lower priority and that * p1 will thus not be NULL after the loop. */ TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { MPASS(p1->p_magic == P_MAGIC); - if (p1->p_priority > pri) + if (p1->p_pri.pri_level > pri) break; } MPASS(p1 != NULL); TAILQ_INSERT_BEFORE(p1, p, p_procq); CTR4(KTR_LOCK, "propagate_priority: p %p moved before %p on [%p] %s", p, p1, m, m->mtx_description); } } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that * if we're called, it's because we know we don't already own this lock. */ int _mtx_trylock(struct mtx *m, int opts, const char *file, int line) { int rval; MPASS(CURPROC != NULL); /* * _mtx_trylock does not accept MTX_NOSWITCH option. */ KASSERT((opts & MTX_NOSWITCH) == 0, ("mtx_trylock() called with invalid option flag(s) %d", opts)); rval = _obtain_lock(m, CURTHD); #ifdef WITNESS if (rval && m->mtx_witness != NULL) { /* * We do not handle recursion in _mtx_trylock; see the * note at the top of the routine. */ KASSERT(!mtx_recursed(m), ("mtx_trylock() called on a recursed mutex")); witness_try_enter(m, (opts | m->mtx_flags), file, line); } #endif /* WITNESS */ if ((opts & MTX_QUIET) == 0) CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d", m->mtx_description, m, rval, file, line); return rval; } /* * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. 
*/ void _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p = CURPROC; if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: %p contested (lock=%p) [%p]", m, (void *)m->mtx_lock, (void *)RETIP(m)); /* * Save our priority. Even though p_nativepri is protected by * sched_lock, we don't obtain it here as it can be expensive. * Since this is the only place p_nativepri is set, and since two * CPUs will not be executing the same process concurrently, we know * that no other CPU is going to be messing with this. Also, * p_nativepri is only read when we are blocked on a mutex, so that * can't be happening right now either. */ - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; while (!_obtain_lock(m, p)) { uintptr_t v; struct proc *p1; mtx_lock_spin(&sched_lock); /* * Check if the lock has been released while spinning for * the sched_lock. */ if ((v = m->mtx_lock) == MTX_UNOWNED) { mtx_unlock_spin(&sched_lock); continue; } /* * The mutex was marked contested on release. This means that * there are processes blocked on it. */ if (v == MTX_CONTESTED) { p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p1 != NULL); m->mtx_lock = (uintptr_t)p | MTX_CONTESTED; - if (p1->p_priority < p->p_priority) - SET_PRIO(p, p1->p_priority); + if (p1->p_pri.pri_level < p->p_pri.pri_level) + SET_PRIO(p, p1->p_pri.pri_level); mtx_unlock_spin(&sched_lock); return; } /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, (void *)(v | MTX_CONTESTED))) { mtx_unlock_spin(&sched_lock); continue; } /* * We deffinately must sleep for this lock. 
*/ mtx_assert(m, MA_NOTOWNED); #ifdef notyet /* * If we're borrowing an interrupted thread's VM context, we * must clean up before going to sleep. */ if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_lock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif /* * Put us on the list of threads blocked on this mutex. */ if (TAILQ_EMPTY(&m->mtx_blocked)) { p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK); LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested); TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } else { TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) - if (p1->p_priority > p->p_priority) + if (p1->p_pri.pri_level > p->p_pri.pri_level) break; if (p1) TAILQ_INSERT_BEFORE(p1, p, p_procq); else TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } /* * Save who we're blocked on. */ p->p_blocked = m; p->p_mtxname = m->mtx_description; p->p_stat = SMTX; -#if 0 propagate_priority(p); -#endif if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m, m->mtx_description); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p free from blocked on [%p] %s", p, m, m->mtx_description); mtx_unlock_spin(&sched_lock); } return; } /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. 
*/ void _mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file, int line) { int i = 0; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); for (;;) { if (_obtain_lock(m, CURPROC)) break; while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 1000000) continue; if (i++ < 6000000) DELAY(1); #ifdef DDB else if (!db_active) #else else #endif panic("spin lock %s held by %p for > 5 seconds", m->mtx_description, (void *)m->mtx_lock); } } m->mtx_saveintr = mtx_intr; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); return; } /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). */ void _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p, *p1; struct mtx *m1; int pri; p = CURPROC; MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } mtx_lock_spin(&sched_lock); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p->p_magic == P_MAGIC); MPASS(p1->p_magic == P_MAGIC); TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); if (TAILQ_EMPTY(&m->mtx_blocked)) { LIST_REMOVE(m, mtx_contested); _release_lock_quick(m); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); } else atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); - pri = MAXPRI; + pri = PRI_MAX; LIST_FOREACH(m1, &p->p_contested, mtx_contested) { - int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level; if (cp < pri) pri = cp; } - if (pri > p->p_nativepri) - pri = p->p_nativepri; + if (pri > p->p_pri.pri_native) + pri = p->p_pri.pri_native; SET_PRIO(p, pri); 
if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", m, p1); p1->p_blocked = NULL; p1->p_mtxname = NULL; p1->p_stat = SRUN; setrunqueue(p1); - if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { + if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) { #ifdef notyet if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif setrunqueue(p); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, (void *)m->mtx_lock); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p", m, (void *)m->mtx_lock); } mtx_unlock_spin(&sched_lock); return; } /* * All the unlocking of MTX_SPIN locks is done inline. * See the _rel_spin_lock() macro for the details. */ /* * The INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANTS void _mtx_assert(struct mtx *m, int what, const char *file, int line) { switch ((what)) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned((m))) panic("mutex %s not owned at %s:%d", (m)->mtx_description, file, line); if (mtx_recursed((m))) { if (((what) & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", (m)->mtx_description, file, line); } else if (((what) & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", (m)->mtx_description, file, line); } break; case MA_NOTOWNED: if (mtx_owned((m))) panic("mutex %s owned at %s:%d", (m)->mtx_description, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * The MUTEX_DEBUG-enabled mtx_validate() */ #define MV_DESTROY 0 /* validate before destory */ #define MV_INIT 1 /* validate before init */ #ifdef MUTEX_DEBUG int mtx_validate __P((struct mtx *, int)); int mtx_validate(struct mtx *m, int when) { struct mtx *mp; int i; int 
retval = 0; #ifdef WITNESS if (witness_cold) return 0; #endif if (m == &all_mtx || cold) return 0; mtx_lock(&all_mtx); /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. */ #ifndef __alpha__ MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t), VM_PROT_READ) == 1); #endif MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx); for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { #ifndef __alpha__ if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t), VM_PROT_READ) != 1) { panic("mtx_validate: mp=%p mp->mtx_next=%p", mp, mp->mtx_next); } #endif i++; if (i > mtx_cur_cnt) { panic("mtx_validate: too many in chain, known=%d\n", mtx_cur_cnt); } } MPASS(i == mtx_cur_cnt); switch (when) { case MV_DESTROY: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) break; MPASS(mp == m); break; case MV_INIT: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) { /* * Not good. This mutex already exists. */ printf("re-initing existing mutex %s\n", m->mtx_description); MPASS(m->mtx_lock == MTX_UNOWNED); retval = 1; } } mtx_unlock(&all_mtx); return (retval); } #endif /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and description `description.' * Place on "all_mtx" queue. 
*/ void mtx_init(struct mtx *m, const char *description, int opts) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ if (mtx_validate(m, MV_INIT)) return; #endif bzero((void *)m, sizeof *m); TAILQ_INIT(&m->mtx_blocked); #ifdef WITNESS if (!witness_cold) { m->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(m->mtx_debug != NULL); } #endif m->mtx_description = description; m->mtx_flags = opts; m->mtx_lock = MTX_UNOWNED; /* Put on all mutex queue */ mtx_lock(&all_mtx); m->mtx_next = &all_mtx; m->mtx_prev = all_mtx.mtx_prev; m->mtx_prev->mtx_next = m; all_mtx.mtx_prev = m; if (++mtx_cur_cnt > mtx_max_cnt) mtx_max_cnt = mtx_cur_cnt; mtx_unlock(&all_mtx); #ifdef WITNESS if (!witness_cold) witness_init(m, opts); #endif } /* * Remove lock `m' from all_mtx queue. */ void mtx_destroy(struct mtx *m) { #ifdef WITNESS KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n", __FUNCTION__)); #endif CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description); #ifdef MUTEX_DEBUG if (m->mtx_next == NULL) panic("mtx_destroy: %p (%s) already destroyed", m, m->mtx_description); if (!mtx_owned(m)) { MPASS(m->mtx_lock == MTX_UNOWNED); } else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); } /* diagnostic */ mtx_validate(m, MV_DESTROY); #endif #ifdef WITNESS if (m->mtx_witness) witness_destroy(m); #endif /* WITNESS */ /* Remove from the all mutex queue */ mtx_lock(&all_mtx); m->mtx_next->mtx_prev = m->mtx_prev; m->mtx_prev->mtx_next = m->mtx_next; #ifdef MUTEX_DEBUG m->mtx_next = m->mtx_prev = NULL; #endif #ifdef WITNESS free(m->mtx_debug, M_WITNESS); m->mtx_debug = NULL; #endif mtx_cur_cnt--; mtx_unlock(&all_mtx); } /* * The WITNESS-enabled diagnostic code. 
*/ #ifdef WITNESS static void witness_fixup(void *dummy __unused) { struct mtx *mp; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock(&all_mtx); /* Iterate through all mutexes and finish up mutex initialization. */ for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { mp->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(mp->mtx_debug != NULL); witness_init(mp, mp->mtx_flags); } mtx_unlock(&all_mtx); /* Mark the witness code as being ready for use. */ atomic_store_rel_int(&witness_cold, 0); mtx_lock(&Giant); } SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL) #define WITNESS_COUNT 200 #define WITNESS_NCHILDREN 2 int witness_watch = 1; struct witness { struct witness *w_next; const char *w_description; const char *w_file; int w_line; struct witness *w_morechildren; u_char w_childcnt; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_spin:1; /* MTX_SPIN type mutex. */ u_int w_level; struct witness *w_children[WITNESS_NCHILDREN]; }; struct witness_blessed { char *b_lock1; char *b_lock2; }; #ifdef DDB /* * When DDB is enabled and witness_ddb is set to 1, it will cause the system to * drop into kdebug() when: * - a lock heirarchy violation occurs * - locks are held when going to sleep. 
*/ int witness_ddb; #ifdef WITNESS_DDB TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb); #else TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb); #endif SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); #endif /* DDB */ int witness_skipspin; #ifdef WITNESS_SKIPSPIN TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin); #else TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin); #endif SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, ""); /* * Witness-enabled globals */ static struct mtx w_mtx; static struct witness *w_free; static struct witness *w_all; static int w_inited; static int witness_dead; /* fatal error, probably no memory */ static struct witness w_data[WITNESS_COUNT]; /* * Internal witness routine prototypes */ static struct witness *enroll(const char *description, int flag); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int dup_ok(struct witness *); static int blessed(struct witness *, struct witness *); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *); static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static struct witness * witness_get(void); static void witness_free(struct witness *m); static char *ignore_list[] = { "witness lock", NULL }; static char *spin_order_list[] = { #if defined(__i386__) && defined (SMP) "com", #endif "sio", #ifdef __i386__ "cy", #endif "sched lock", #ifdef __i386__ "clk", #endif "callout", /* * leaf locks */ "ithread table lock", "ithread list lock", #ifdef SMP #ifdef __i386__ "ap boot", "imen", #endif "smp rendezvous", #endif NULL }; static char *order_list[] = { "Giant", "proctree", "allproc", 
"process lock", "uidinfo hash", "uidinfo struct", NULL, NULL }; static char *dup_list[] = { NULL }; static char *sleep_list[] = { "Giant", NULL }; /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); static void witness_init(struct mtx *m, int flag) { m->mtx_witness = enroll(m->mtx_description, flag); } static void witness_destroy(struct mtx *m) { struct mtx *m1; struct proc *p; p = CURPROC; LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) { if (m1 == m) { LIST_REMOVE(m, mtx_held); break; } } return; } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w, *w1; int level, found; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); witness_levelall(); /* * First, handle sleep mutexes which have been acquired at least * once. */ prnt("Sleep mutexes:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w); } /* * Now do spin mutexes which have been acquired at least once. */ prnt("\nSpin mutexes:\n"); level = 0; while (level < sizeof(spin_order_list) / sizeof(char *)) { found = 0; for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || !w->w_spin) continue; if (w->w_level == 1 << level) { witness_displaydescendants(prnt, w); level++; found = 1; } } if (found == 0) level++; } /* * Finally, any mutexes which have not been acquired yet. 
*/ prnt("\nMutexes which were never acquired:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file != NULL) continue; prnt("%s\n", w->w_description); } } void witness_enter(struct mtx *m, int flags, const char *file, int line) { struct witness *w, *w1; struct mtx *m1; struct proc *p; int i; #ifdef DDB int go_into_ddb = 0; #endif /* DDB */ if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; p = CURPROC; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); i = PCPU_GET(witness_spin_check); if (i != 0 && w->w_level < i) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); panic("mutex_enter(%s:%x, MTX_SPIN) out of order @" " %s:%d already holding %s:%x", m->mtx_description, w->w_level, file, line, spin_order_list[ffs(i)-1], i); } PCPU_SET(witness_spin_check, i | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if (witness_dead) goto out; if (cold) goto out; if (!mtx_legal2block()) panic("blockable mtx_lock() of %s when not legal @ %s:%d", m->mtx_description, file, line); /* * Is this the first mutex acquired */ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) goto out; if ((w1 = m1->mtx_witness) == w) { if (w->w_same_squawked || dup_ok(w)) goto out; w->w_same_squawked = 1; printf("acquring duplicate lock of same type: \"%s\"\n", m->mtx_description); 
printf(" 1st @ %s:%d\n", w->w_file, w->w_line); printf(" 2nd @ %s:%d\n", file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); /* * If we have a known higher number just say ok */ if (witness_watch > 1 && w->w_level > w1->w_level) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } if (isitmydescendant(m1->mtx_witness, w)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { MPASS(i < 200); w1 = m1->mtx_witness; if (isitmydescendant(w, w1)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (blessed(w, w1)) goto out; if (m1 == &Giant) { if (w1->w_Giant_squawked) goto out; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) goto out; else w1->w_other_squawked = 1; } printf("lock order reversal\n"); printf(" 1st %s last acquired @ %s:%d\n", w->w_description, w->w_file, w->w_line); printf(" 2nd %p %s @ %s:%d\n", m1, w1->w_description, w1->w_file, w1->w_line); printf(" 3rd %p %s @ %s:%d\n", m, w->w_description, file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } } m1 = LIST_FIRST(&p->p_heldmtx); if (!itismychild(m1->mtx_witness, w)) mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); out: #ifdef DDB if (witness_ddb && go_into_ddb) Debugger("witness_enter"); #endif /* DDB */ w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; /* * If this pays off it likely means that a mutex being witnessed * is acquired in hardclock. Put it in the ignore list. It is * likely not the mutex this assert fails on. 
*/ MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_try_enter(struct mtx *m, int flags, const char *file, int line) { struct proc *p; struct witness *w = m->mtx_witness; if (witness_cold) return; if (panicstr) return; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_try_enter: " "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on" " non-recursive mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; p = CURPROC; MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_exit(struct mtx *m, int flags, const char *file, int line) { struct witness *w; if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) & ~w->w_level); 
mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) panic("switchable mtx_unlock() of %s when not legal @ %s:%d", m->mtx_description, file, line); LIST_REMOVE(m, mtx_held); m->mtx_held.le_prev = NULL; } int witness_sleep(int check_only, struct mtx *mtx, const char *file, int line) { struct mtx *m; struct proc *p; char **sleep; int n = 0; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); p = CURPROC; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { if (m == mtx) continue; for (sleep = sleep_list; *sleep!= NULL; sleep++) if (strcmp(m->mtx_description, *sleep) == 0) goto next; if (n == 0) printf("Whee!\n"); printf("%s:%d: %s with \"%s\" locked from %s:%d\n", file, line, check_only ? 
"could sleep" : "sleeping", m->mtx_description, m->mtx_witness->w_file, m->mtx_witness->w_line); n++; next: } #ifdef DDB if (witness_ddb && n) Debugger("witness_sleep"); #endif /* DDB */ return (n); } static struct witness * enroll(const char *description, int flag) { int i; struct witness *w, *w1; char **ignore; char **order; if (!witness_watch) return (NULL); for (ignore = ignore_list; *ignore != NULL; ignore++) if (strcmp(description, *ignore) == 0) return (NULL); if (w_inited == 0) { mtx_init(&w_mtx, "witness lock", MTX_SPIN); for (i = 0; i < WITNESS_COUNT; i++) { w = &w_data[i]; witness_free(w); } w_inited = 1; for (order = order_list; *order != NULL; order++) { w = enroll(*order, MTX_DEF); w->w_file = "order list"; for (order++; *order != NULL; order++) { w1 = enroll(*order, MTX_DEF); w1->w_file = "order list"; itismychild(w, w1); w = w1; } } } if ((flag & MTX_SPIN) && witness_skipspin) return (NULL); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); for (w = w_all; w; w = w->w_next) { if (strcmp(description, w->w_description) == 0) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return (w); } } if ((w = witness_get()) == NULL) return (NULL); w->w_next = w_all; w_all = w; w->w_description = description; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (flag & MTX_SPIN) { w->w_spin = 1; i = 1; for (order = spin_order_list; *order != NULL; order++) { if (strcmp(description, *order) == 0) break; i <<= 1; } if (*order == NULL) panic("spin lock %s not in order list", description); w->w_level = i; } return (w); } static int itismychild(struct witness *parent, struct witness *child) { static int recursed; /* * Insert "child" after "parent" */ while (parent->w_morechildren) parent = parent->w_morechildren; if (parent->w_childcnt == WITNESS_NCHILDREN) { if ((parent->w_morechildren = witness_get()) == NULL) return (1); parent = parent->w_morechildren; } MPASS(child != NULL); parent->w_children[parent->w_childcnt++] = child; /* * now prune whole tree */ if (recursed) return (0); recursed 
= 1; for (child = w_all; child != NULL; child = child->w_next) { for (parent = w_all; parent != NULL; parent = parent->w_next) { if (!isitmychild(parent, child)) continue; removechild(parent, child); if (isitmydescendant(parent, child)) continue; itismychild(parent, child); } } recursed = 0; witness_levelall(); return (0); } static void removechild(struct witness *parent, struct witness *child) { struct witness *w, *w1; int i; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) if (w->w_children[i] == child) goto found; return; found: for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) continue; w->w_children[i] = w1->w_children[--w1->w_childcnt]; MPASS(w->w_children[i] != NULL); if (w1->w_childcnt != 0) return; if (w1 == parent) return; for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) continue; w->w_morechildren = 0; witness_free(w1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness *w; int i; for (w = parent; w != NULL; w = w->w_morechildren) { for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness *w; int i; int j; for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { MPASS(j < 1000); for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } for (i = 0; i < w->w_childcnt; i++) { if (isitmydescendant(w->w_children[i], child)) return (1); } } return (0); } void witness_levelall (void) { struct witness *w, *w1; for (w = w_all; w; w = w->w_next) if (!(w->w_spin)) w->w_level = 0; for (w = w_all; w; w = w->w_next) { if (w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; witness_leveldescendents(w, 0); } } static void witness_leveldescendents(struct witness *parent, int level) { int i; struct witness *w; if (parent->w_level < 
level) parent->w_level = level; level++; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_leveldescendents(w->w_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent) { struct witness *w; int i; int level; level = parent->w_spin ? ffs(parent->w_level) : parent->w_level; prnt("%d", level); if (level < 10) prnt(" "); for (i = 0; i < level; i++) prnt(" "); prnt("%s", parent->w_description); if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d\n", parent->w_file, parent->w_line); for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_displaydescendants(prnt, w->w_children[i]); } static int dup_ok(struct witness *w) { char **dup; for (dup = dup_list; *dup!= NULL; dup++) if (strcmp(w->w_description, *dup) == 0) return (1); return (0); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_description, b->b_lock1) == 0) { if (strcmp(w2->w_description, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_description, b->b_lock2) == 0) if (strcmp(w2->w_description, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get() { struct witness *w; if ((w = w_free) == NULL) { witness_dead = 1; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); printf("witness exhausted\n"); return (NULL); } w_free = w->w_next; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { w->w_next = w_free; w_free = w; } int witness_list(struct proc *p) { struct mtx *m; int nheld; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); nheld = 0; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { printf("\t\"%s\" (%p) locked at %s:%d\n", m->mtx_description, m, m->mtx_witness->w_file, m->mtx_witness->w_line); nheld++; } return (nheld); } #ifdef DDB DB_SHOW_COMMAND(mutexes, 
db_witness_list) { witness_list(CURPROC); } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif void witness_save(struct mtx *m, const char **filep, int *linep) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; *filep = m->mtx_witness->w_file; *linep = m->mtx_witness->w_line; } void witness_restore(struct mtx *m, const char *file, int line) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; m->mtx_witness->w_file = file; m->mtx_witness->w_line = line; } #endif /* WITNESS */ Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c (revision 72375) +++ head/sys/kern/subr_witness.c (revision 72376) @@ -1,1705 +1,1680 @@ /*- * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $ * and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $ * $FreeBSD$ */ /* * Machine independent bits of mutex implementation and implementation of * `witness' structure & related debugging routines. */ /* * Main Entry: witness * Pronunciation: 'wit-n&s * Function: noun * Etymology: Middle English witnesse, from Old English witnes knowledge, * testimony, witness, from 2wit * Date: before 12th century * 1 : attestation of a fact or event : TESTIMONY * 2 : one that gives evidence; specifically : one who testifies in * a cause or before a judicial tribunal * 3 : one asked to be present at a transaction so as to be able to * testify to its having taken place * 4 : one who has personal knowledge of something * 5 a : something serving as evidence or proof : SIGN * b : public affirmation by word or example of usually * religious faith or conviction * 6 capitalized : a member of the Jehovah's Witnesses */ #include "opt_ddb.h" #include "opt_witness.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The WITNESS-enabled mutex debug structure. 
*/ #ifdef WITNESS struct mtx_debug { struct witness *mtxd_witness; LIST_ENTRY(mtx) mtxd_held; const char *mtxd_file; int mtxd_line; }; #define mtx_held mtx_debug->mtxd_held #define mtx_file mtx_debug->mtxd_file #define mtx_line mtx_debug->mtxd_line #define mtx_witness mtx_debug->mtxd_witness #endif /* WITNESS */ /* * Internal utility macros. */ #define mtx_unowned(m) ((m)->mtx_lock == MTX_UNOWNED) #define mtx_owner(m) (mtx_unowned((m)) ? NULL \ : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK)) #define RETIP(x) *(((uintptr_t *)(&x)) - 1) -#define SET_PRIO(p, pri) (p)->p_priority = (pri) +#define SET_PRIO(p, pri) (p)->p_pri.pri_level = (pri) /* * Early WITNESS-enabled declarations. */ #ifdef WITNESS /* * Internal WITNESS routines which must be prototyped early. * * XXX: When/if witness code is cleaned up, it would be wise to place all * witness prototyping early in this file. */ static void witness_init(struct mtx *, int flag); static void witness_destroy(struct mtx *); static void witness_display(void(*)(const char *fmt, ...)); MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure"); /* All mutexes in system (used for debug/panic) */ static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 }; /* * This global is set to 0 once it becomes safe to use the witness code. */ static int witness_cold = 1; #else /* WITNESS */ /* XXX XXX XXX * flag++ is sleazoid way of shuting up warning */ #define witness_init(m, flag) flag++ #define witness_destroy(m) #define witness_try_enter(m, t, f, l) #endif /* WITNESS */ /* * All mutex locks in system are kept on the all_mtx list. */ static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head", TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked), { NULL, NULL }, &all_mtx, &all_mtx, #ifdef WITNESS &all_mtx_debug #else NULL #endif }; /* * Global variables for book keeping. */ static int mtx_cur_cnt; static int mtx_max_cnt; /* * Couple of strings for KTR_LOCK tracing in order to avoid duplicates. 
*/ char STR_mtx_lock_slp[] = "GOT (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_slp[] = "REL (sleep) %s [%p] r=%d at %s:%d"; char STR_mtx_lock_spn[] = "GOT (spin) %s [%p] r=%d at %s:%d"; char STR_mtx_unlock_spn[] = "REL (spin) %s [%p] r=%d at %s:%d"; /* * Prototypes for non-exported routines. * * NOTE: Prototypes for witness routines are placed at the bottom of the file. */ static void propagate_priority(struct proc *); static void propagate_priority(struct proc *p) { - int pri = p->p_priority; + int pri = p->p_pri.pri_level; struct mtx *m = p->p_blocked; mtx_assert(&sched_lock, MA_OWNED); for (;;) { struct proc *p1; p = mtx_owner(m); if (p == NULL) { /* * This really isn't quite right. Really * ought to bump priority of process that * next acquires the mutex. */ MPASS(m->mtx_lock == MTX_CONTESTED); return; } MPASS(p->p_magic == P_MAGIC); KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex")); - if (p->p_priority <= pri) + if (p->p_pri.pri_level <= pri) return; /* * Bump this process' priority. */ SET_PRIO(p, pri); /* * If lock holder is actually running, just bump priority. */ -#ifdef SMP - /* - * For SMP, we can check the p_oncpu field to see if we are - * running. - */ if (p->p_oncpu != 0xff) { MPASS(p->p_stat == SRUN || p->p_stat == SZOMB); return; } -#else + /* - * For UP, we check to see if p is curproc (this shouldn't - * ever happen however as it would mean we are in a deadlock.) - */ - if (p == curproc) { - panic("Deadlock detected"); - return; - } -#endif - /* * If on run queue move to new run queue, and * quit. */ if (p->p_stat == SRUN) { - printf("XXX: moving proc %d(%s) to a new run queue\n", - p->p_pid, p->p_comm); MPASS(p->p_blocked == NULL); remrunqueue(p); setrunqueue(p); return; } /* * If we aren't blocked on a mutex, we should be. */ KASSERT(p->p_stat == SMTX, ( "process %d(%s):%d holds %s but isn't blocked on a mutex\n", p->p_pid, p->p_comm, p->p_stat, m->mtx_description)); /* * Pick up the mutex that p is blocked on. 
*/ m = p->p_blocked; MPASS(m != NULL); - printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid, - p->p_comm, m->mtx_description); - /* * Check if the proc needs to be moved up on * the blocked chain */ if (p == TAILQ_FIRST(&m->mtx_blocked)) { - printf("XXX: process at head of run queue\n"); continue; } - p1 = TAILQ_PREV(p, rq, p_procq); - if (p1->p_priority <= pri) { - printf( - "XXX: previous process %d(%s) has higher priority\n", - p->p_pid, p->p_comm); + p1 = TAILQ_PREV(p, procqueue, p_procq); + if (p1->p_pri.pri_level <= pri) { continue; } /* * Remove proc from blocked chain and determine where * it should be moved up to. Since we know that p1 has * a lower priority than p, we know that at least one * process in the chain has a lower priority and that * p1 will thus not be NULL after the loop. */ TAILQ_REMOVE(&m->mtx_blocked, p, p_procq); TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) { MPASS(p1->p_magic == P_MAGIC); - if (p1->p_priority > pri) + if (p1->p_pri.pri_level > pri) break; } MPASS(p1 != NULL); TAILQ_INSERT_BEFORE(p1, p, p_procq); CTR4(KTR_LOCK, "propagate_priority: p %p moved before %p on [%p] %s", p, p1, m, m->mtx_description); } } /* * The important part of mtx_trylock{,_flags}() * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that * if we're called, it's because we know we don't already own this lock. */ int _mtx_trylock(struct mtx *m, int opts, const char *file, int line) { int rval; MPASS(CURPROC != NULL); /* * _mtx_trylock does not accept MTX_NOSWITCH option. */ KASSERT((opts & MTX_NOSWITCH) == 0, ("mtx_trylock() called with invalid option flag(s) %d", opts)); rval = _obtain_lock(m, CURTHD); #ifdef WITNESS if (rval && m->mtx_witness != NULL) { /* * We do not handle recursion in _mtx_trylock; see the * note at the top of the routine. 
*/ KASSERT(!mtx_recursed(m), ("mtx_trylock() called on a recursed mutex")); witness_try_enter(m, (opts | m->mtx_flags), file, line); } #endif /* WITNESS */ if ((opts & MTX_QUIET) == 0) CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d", m->mtx_description, m, rval, file, line); return rval; } /* * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock. * * We call this if the lock is either contested (i.e. we need to go to * sleep waiting for it), or if we need to recurse on it. */ void _mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p = CURPROC; if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) { m->mtx_recurse++; atomic_set_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m); return; } if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: %p contested (lock=%p) [%p]", m, (void *)m->mtx_lock, (void *)RETIP(m)); /* * Save our priority. Even though p_nativepri is protected by * sched_lock, we don't obtain it here as it can be expensive. * Since this is the only place p_nativepri is set, and since two * CPUs will not be executing the same process concurrently, we know * that no other CPU is going to be messing with this. Also, * p_nativepri is only read when we are blocked on a mutex, so that * can't be happening right now either. */ - p->p_nativepri = p->p_priority; + p->p_pri.pri_native = p->p_pri.pri_level; while (!_obtain_lock(m, p)) { uintptr_t v; struct proc *p1; mtx_lock_spin(&sched_lock); /* * Check if the lock has been released while spinning for * the sched_lock. */ if ((v = m->mtx_lock) == MTX_UNOWNED) { mtx_unlock_spin(&sched_lock); continue; } /* * The mutex was marked contested on release. This means that * there are processes blocked on it. 
*/ if (v == MTX_CONTESTED) { p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p1 != NULL); m->mtx_lock = (uintptr_t)p | MTX_CONTESTED; - if (p1->p_priority < p->p_priority) - SET_PRIO(p, p1->p_priority); + if (p1->p_pri.pri_level < p->p_pri.pri_level) + SET_PRIO(p, p1->p_pri.pri_level); mtx_unlock_spin(&sched_lock); return; } /* * If the mutex isn't already contested and a failure occurs * setting the contested bit, the mutex was either released * or the state of the MTX_RECURSED bit changed. */ if ((v & MTX_CONTESTED) == 0 && !atomic_cmpset_ptr(&m->mtx_lock, (void *)v, (void *)(v | MTX_CONTESTED))) { mtx_unlock_spin(&sched_lock); continue; } /* * We deffinately must sleep for this lock. */ mtx_assert(m, MA_NOTOWNED); #ifdef notyet /* * If we're borrowing an interrupted thread's VM context, we * must clean up before going to sleep. */ if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_lock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif /* * Put us on the list of threads blocked on this mutex. */ if (TAILQ_EMPTY(&m->mtx_blocked)) { p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK); LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested); TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } else { TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) - if (p1->p_priority > p->p_priority) + if (p1->p_pri.pri_level > p->p_pri.pri_level) break; if (p1) TAILQ_INSERT_BEFORE(p1, p, p_procq); else TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq); } /* * Save who we're blocked on. 
*/ p->p_blocked = m; p->p_mtxname = m->mtx_description; p->p_stat = SMTX; -#if 0 propagate_priority(p); -#endif if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m, m->mtx_description); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR3(KTR_LOCK, "_mtx_lock_sleep: p %p free from blocked on [%p] %s", p, m, m->mtx_description); mtx_unlock_spin(&sched_lock); } return; } /* * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock. * * This is only called if we need to actually spin for the lock. Recursion * is handled inline. */ void _mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file, int line) { int i = 0; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m); for (;;) { if (_obtain_lock(m, CURPROC)) break; while (m->mtx_lock != MTX_UNOWNED) { if (i++ < 1000000) continue; if (i++ < 6000000) DELAY(1); #ifdef DDB else if (!db_active) #else else #endif panic("spin lock %s held by %p for > 5 seconds", m->mtx_description, (void *)m->mtx_lock); } } m->mtx_saveintr = mtx_intr; if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m); return; } /* * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock. * * We are only called here if the lock is recursed or contested (i.e. we * need to wake up a blocked thread). 
*/ void _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line) { struct proc *p, *p1; struct mtx *m1; int pri; p = CURPROC; MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line); if (mtx_recursed(m)) { if (--(m->mtx_recurse) == 0) atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m); return; } mtx_lock_spin(&sched_lock); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m); p1 = TAILQ_FIRST(&m->mtx_blocked); MPASS(p->p_magic == P_MAGIC); MPASS(p1->p_magic == P_MAGIC); TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq); if (TAILQ_EMPTY(&m->mtx_blocked)) { LIST_REMOVE(m, mtx_contested); _release_lock_quick(m); if ((opts & MTX_QUIET) == 0) CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m); } else atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED); - pri = MAXPRI; + pri = PRI_MAX; LIST_FOREACH(m1, &p->p_contested, mtx_contested) { - int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority; + int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_pri.pri_level; if (cp < pri) pri = cp; } - if (pri > p->p_nativepri) - pri = p->p_nativepri; + if (pri > p->p_pri.pri_native) + pri = p->p_pri.pri_native; SET_PRIO(p, pri); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p", m, p1); p1->p_blocked = NULL; p1->p_mtxname = NULL; p1->p_stat = SRUN; setrunqueue(p1); - if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) { + if ((opts & MTX_NOSWITCH) == 0 && p1->p_pri.pri_level < pri) { #ifdef notyet if (p->p_flag & (P_ITHD | P_SITHD)) { ithd_t *it = (ithd_t *)p; if (it->it_interrupted) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: 0x%x interrupted 0x%x", it, it->it_interrupted); intr_thd_fixup(it); } } #endif setrunqueue(p); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p switching out lock=%p", m, (void *)m->mtx_lock); mi_switch(); if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p 
resuming lock=%p", m, (void *)m->mtx_lock); } mtx_unlock_spin(&sched_lock); return; } /* * All the unlocking of MTX_SPIN locks is done inline. * See the _rel_spin_lock() macro for the details. */ /* * The INVARIANTS-enabled mtx_assert() */ #ifdef INVARIANTS void _mtx_assert(struct mtx *m, int what, const char *file, int line) { switch ((what)) { case MA_OWNED: case MA_OWNED | MA_RECURSED: case MA_OWNED | MA_NOTRECURSED: if (!mtx_owned((m))) panic("mutex %s not owned at %s:%d", (m)->mtx_description, file, line); if (mtx_recursed((m))) { if (((what) & MA_NOTRECURSED) != 0) panic("mutex %s recursed at %s:%d", (m)->mtx_description, file, line); } else if (((what) & MA_RECURSED) != 0) { panic("mutex %s unrecursed at %s:%d", (m)->mtx_description, file, line); } break; case MA_NOTOWNED: if (mtx_owned((m))) panic("mutex %s owned at %s:%d", (m)->mtx_description, file, line); break; default: panic("unknown mtx_assert at %s:%d", file, line); } } #endif /* * The MUTEX_DEBUG-enabled mtx_validate() */ #define MV_DESTROY 0 /* validate before destory */ #define MV_INIT 1 /* validate before init */ #ifdef MUTEX_DEBUG int mtx_validate __P((struct mtx *, int)); int mtx_validate(struct mtx *m, int when) { struct mtx *mp; int i; int retval = 0; #ifdef WITNESS if (witness_cold) return 0; #endif if (m == &all_mtx || cold) return 0; mtx_lock(&all_mtx); /* * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly * we can re-enable the kernacc() checks. 
*/ #ifndef __alpha__ MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t), VM_PROT_READ) == 1); #endif MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx); for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { #ifndef __alpha__ if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t), VM_PROT_READ) != 1) { panic("mtx_validate: mp=%p mp->mtx_next=%p", mp, mp->mtx_next); } #endif i++; if (i > mtx_cur_cnt) { panic("mtx_validate: too many in chain, known=%d\n", mtx_cur_cnt); } } MPASS(i == mtx_cur_cnt); switch (when) { case MV_DESTROY: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) break; MPASS(mp == m); break; case MV_INIT: for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) if (mp == m) { /* * Not good. This mutex already exists. */ printf("re-initing existing mutex %s\n", m->mtx_description); MPASS(m->mtx_lock == MTX_UNOWNED); retval = 1; } } mtx_unlock(&all_mtx); return (retval); } #endif /* * Mutex initialization routine; initialize lock `m' of type contained in * `opts' with options contained in `opts' and description `description.' * Place on "all_mtx" queue. 
*/ void mtx_init(struct mtx *m, const char *description, int opts) { if ((opts & MTX_QUIET) == 0) CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description); #ifdef MUTEX_DEBUG /* Diagnostic and error correction */ if (mtx_validate(m, MV_INIT)) return; #endif bzero((void *)m, sizeof *m); TAILQ_INIT(&m->mtx_blocked); #ifdef WITNESS if (!witness_cold) { m->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(m->mtx_debug != NULL); } #endif m->mtx_description = description; m->mtx_flags = opts; m->mtx_lock = MTX_UNOWNED; /* Put on all mutex queue */ mtx_lock(&all_mtx); m->mtx_next = &all_mtx; m->mtx_prev = all_mtx.mtx_prev; m->mtx_prev->mtx_next = m; all_mtx.mtx_prev = m; if (++mtx_cur_cnt > mtx_max_cnt) mtx_max_cnt = mtx_cur_cnt; mtx_unlock(&all_mtx); #ifdef WITNESS if (!witness_cold) witness_init(m, opts); #endif } /* * Remove lock `m' from all_mtx queue. */ void mtx_destroy(struct mtx *m) { #ifdef WITNESS KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n", __FUNCTION__)); #endif CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description); #ifdef MUTEX_DEBUG if (m->mtx_next == NULL) panic("mtx_destroy: %p (%s) already destroyed", m, m->mtx_description); if (!mtx_owned(m)) { MPASS(m->mtx_lock == MTX_UNOWNED); } else { MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0); } /* diagnostic */ mtx_validate(m, MV_DESTROY); #endif #ifdef WITNESS if (m->mtx_witness) witness_destroy(m); #endif /* WITNESS */ /* Remove from the all mutex queue */ mtx_lock(&all_mtx); m->mtx_next->mtx_prev = m->mtx_prev; m->mtx_prev->mtx_next = m->mtx_next; #ifdef MUTEX_DEBUG m->mtx_next = m->mtx_prev = NULL; #endif #ifdef WITNESS free(m->mtx_debug, M_WITNESS); m->mtx_debug = NULL; #endif mtx_cur_cnt--; mtx_unlock(&all_mtx); } /* * The WITNESS-enabled diagnostic code. 
*/ #ifdef WITNESS static void witness_fixup(void *dummy __unused) { struct mtx *mp; /* * We have to release Giant before initializing its witness * structure so that WITNESS doesn't get confused. */ mtx_unlock(&Giant); mtx_assert(&Giant, MA_NOTOWNED); mtx_lock(&all_mtx); /* Iterate through all mutexes and finish up mutex initialization. */ for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) { mp->mtx_debug = malloc(sizeof(struct mtx_debug), M_WITNESS, M_NOWAIT | M_ZERO); MPASS(mp->mtx_debug != NULL); witness_init(mp, mp->mtx_flags); } mtx_unlock(&all_mtx); /* Mark the witness code as being ready for use. */ atomic_store_rel_int(&witness_cold, 0); mtx_lock(&Giant); } SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL) #define WITNESS_COUNT 200 #define WITNESS_NCHILDREN 2 int witness_watch = 1; struct witness { struct witness *w_next; const char *w_description; const char *w_file; int w_line; struct witness *w_morechildren; u_char w_childcnt; u_char w_Giant_squawked:1; u_char w_other_squawked:1; u_char w_same_squawked:1; u_char w_spin:1; /* MTX_SPIN type mutex. */ u_int w_level; struct witness *w_children[WITNESS_NCHILDREN]; }; struct witness_blessed { char *b_lock1; char *b_lock2; }; #ifdef DDB /* * When DDB is enabled and witness_ddb is set to 1, it will cause the system to * drop into kdebug() when: * - a lock heirarchy violation occurs * - locks are held when going to sleep. 
*/ int witness_ddb; #ifdef WITNESS_DDB TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb); #else TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb); #endif SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, ""); #endif /* DDB */ int witness_skipspin; #ifdef WITNESS_SKIPSPIN TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin); #else TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin); #endif SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0, ""); /* * Witness-enabled globals */ static struct mtx w_mtx; static struct witness *w_free; static struct witness *w_all; static int w_inited; static int witness_dead; /* fatal error, probably no memory */ static struct witness w_data[WITNESS_COUNT]; /* * Internal witness routine prototypes */ static struct witness *enroll(const char *description, int flag); static int itismychild(struct witness *parent, struct witness *child); static void removechild(struct witness *parent, struct witness *child); static int isitmychild(struct witness *parent, struct witness *child); static int isitmydescendant(struct witness *parent, struct witness *child); static int dup_ok(struct witness *); static int blessed(struct witness *, struct witness *); static void witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *); static void witness_leveldescendents(struct witness *parent, int level); static void witness_levelall(void); static struct witness * witness_get(void); static void witness_free(struct witness *m); static char *ignore_list[] = { "witness lock", NULL }; static char *spin_order_list[] = { #if defined(__i386__) && defined (SMP) "com", #endif "sio", #ifdef __i386__ "cy", #endif "sched lock", #ifdef __i386__ "clk", #endif "callout", /* * leaf locks */ "ithread table lock", "ithread list lock", #ifdef SMP #ifdef __i386__ "ap boot", "imen", #endif "smp rendezvous", #endif NULL }; static char *order_list[] = { "Giant", "proctree", "allproc", 
"process lock", "uidinfo hash", "uidinfo struct", NULL, NULL }; static char *dup_list[] = { NULL }; static char *sleep_list[] = { "Giant", NULL }; /* * Pairs of locks which have been blessed * Don't complain about order problems with blessed locks */ static struct witness_blessed blessed_list[] = { }; static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed); static void witness_init(struct mtx *m, int flag) { m->mtx_witness = enroll(m->mtx_description, flag); } static void witness_destroy(struct mtx *m) { struct mtx *m1; struct proc *p; p = CURPROC; LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) { if (m1 == m) { LIST_REMOVE(m, mtx_held); break; } } return; } static void witness_display(void(*prnt)(const char *fmt, ...)) { struct witness *w, *w1; int level, found; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); witness_levelall(); /* * First, handle sleep mutexes which have been acquired at least * once. */ prnt("Sleep mutexes:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; /* * This lock has no anscestors, display its descendants. */ witness_displaydescendants(prnt, w); } /* * Now do spin mutexes which have been acquired at least once. */ prnt("\nSpin mutexes:\n"); level = 0; while (level < sizeof(spin_order_list) / sizeof(char *)) { found = 0; for (w = w_all; w; w = w->w_next) { if (w->w_file == NULL || !w->w_spin) continue; if (w->w_level == 1 << level) { witness_displaydescendants(prnt, w); level++; found = 1; } } if (found == 0) level++; } /* * Finally, any mutexes which have not been acquired yet. 
*/ prnt("\nMutexes which were never acquired:\n"); for (w = w_all; w; w = w->w_next) { if (w->w_file != NULL) continue; prnt("%s\n", w->w_description); } } void witness_enter(struct mtx *m, int flags, const char *file, int line) { struct witness *w, *w1; struct mtx *m1; struct proc *p; int i; #ifdef DDB int go_into_ddb = 0; #endif /* DDB */ if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; p = CURPROC; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); i = PCPU_GET(witness_spin_check); if (i != 0 && w->w_level < i) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); panic("mutex_enter(%s:%x, MTX_SPIN) out of order @" " %s:%d already holding %s:%x", m->mtx_description, w->w_level, file, line, spin_order_list[ffs(i)-1], i); } PCPU_SET(witness_spin_check, i | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if (witness_dead) goto out; if (cold) goto out; if (!mtx_legal2block()) panic("blockable mtx_lock() of %s when not legal @ %s:%d", m->mtx_description, file, line); /* * Is this the first mutex acquired */ if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL) goto out; if ((w1 = m1->mtx_witness) == w) { if (w->w_same_squawked || dup_ok(w)) goto out; w->w_same_squawked = 1; printf("acquring duplicate lock of same type: \"%s\"\n", m->mtx_description); 
printf(" 1st @ %s:%d\n", w->w_file, w->w_line); printf(" 2nd @ %s:%d\n", file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } MPASS(!mtx_owned(&w_mtx)); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); /* * If we have a known higher number just say ok */ if (witness_watch > 1 && w->w_level > w1->w_level) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } if (isitmydescendant(m1->mtx_witness, w)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); goto out; } for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) { MPASS(i < 200); w1 = m1->mtx_witness; if (isitmydescendant(w, w1)) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (blessed(w, w1)) goto out; if (m1 == &Giant) { if (w1->w_Giant_squawked) goto out; else w1->w_Giant_squawked = 1; } else { if (w1->w_other_squawked) goto out; else w1->w_other_squawked = 1; } printf("lock order reversal\n"); printf(" 1st %s last acquired @ %s:%d\n", w->w_description, w->w_file, w->w_line); printf(" 2nd %p %s @ %s:%d\n", m1, w1->w_description, w1->w_file, w1->w_line); printf(" 3rd %p %s @ %s:%d\n", m, w->w_description, file, line); #ifdef DDB go_into_ddb = 1; #endif /* DDB */ goto out; } } m1 = LIST_FIRST(&p->p_heldmtx); if (!itismychild(m1->mtx_witness, w)) mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); out: #ifdef DDB if (witness_ddb && go_into_ddb) Debugger("witness_enter"); #endif /* DDB */ w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; /* * If this pays off it likely means that a mutex being witnessed * is acquired in hardclock. Put it in the ignore list. It is * likely not the mutex this assert fails on. 
*/ MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_try_enter(struct mtx *m, int flags, const char *file, int line) { struct proc *p; struct witness *w = m->mtx_witness; if (witness_cold) return; if (panicstr) return; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_try_enter: " "MTX_SPIN on MTX_DEF mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on" " non-recursive mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) | w->w_level); mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_try_enter: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } w->w_file = file; w->w_line = line; m->mtx_line = line; m->mtx_file = file; p = CURPROC; MPASS(m->mtx_held.le_prev == NULL); LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held); } void witness_exit(struct mtx *m, int flags, const char *file, int line) { struct witness *w; if (witness_cold || m->mtx_witness == NULL || panicstr) return; w = m->mtx_witness; if (flags & MTX_SPIN) { if ((m->mtx_flags & MTX_SPIN) == 0) panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @" " %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } mtx_lock_spin_flags(&w_mtx, MTX_QUIET); PCPU_SET(witness_spin_check, PCPU_GET(witness_spin_check) & ~w->w_level); 
mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return; } if ((m->mtx_flags & MTX_SPIN) != 0) panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d", m->mtx_description, file, line); if (mtx_recursed(m)) { if ((m->mtx_flags & MTX_RECURSE) == 0) panic("mutex_exit: recursion on non-recursive" " mutex %s @ %s:%d", m->mtx_description, file, line); return; } if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold) panic("switchable mtx_unlock() of %s when not legal @ %s:%d", m->mtx_description, file, line); LIST_REMOVE(m, mtx_held); m->mtx_held.le_prev = NULL; } int witness_sleep(int check_only, struct mtx *mtx, const char *file, int line) { struct mtx *m; struct proc *p; char **sleep; int n = 0; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); p = CURPROC; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { if (m == mtx) continue; for (sleep = sleep_list; *sleep!= NULL; sleep++) if (strcmp(m->mtx_description, *sleep) == 0) goto next; if (n == 0) printf("Whee!\n"); printf("%s:%d: %s with \"%s\" locked from %s:%d\n", file, line, check_only ? 
"could sleep" : "sleeping", m->mtx_description, m->mtx_witness->w_file, m->mtx_witness->w_line); n++; next: } #ifdef DDB if (witness_ddb && n) Debugger("witness_sleep"); #endif /* DDB */ return (n); } static struct witness * enroll(const char *description, int flag) { int i; struct witness *w, *w1; char **ignore; char **order; if (!witness_watch) return (NULL); for (ignore = ignore_list; *ignore != NULL; ignore++) if (strcmp(description, *ignore) == 0) return (NULL); if (w_inited == 0) { mtx_init(&w_mtx, "witness lock", MTX_SPIN); for (i = 0; i < WITNESS_COUNT; i++) { w = &w_data[i]; witness_free(w); } w_inited = 1; for (order = order_list; *order != NULL; order++) { w = enroll(*order, MTX_DEF); w->w_file = "order list"; for (order++; *order != NULL; order++) { w1 = enroll(*order, MTX_DEF); w1->w_file = "order list"; itismychild(w, w1); w = w1; } } } if ((flag & MTX_SPIN) && witness_skipspin) return (NULL); mtx_lock_spin_flags(&w_mtx, MTX_QUIET); for (w = w_all; w; w = w->w_next) { if (strcmp(description, w->w_description) == 0) { mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); return (w); } } if ((w = witness_get()) == NULL) return (NULL); w->w_next = w_all; w_all = w; w->w_description = description; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); if (flag & MTX_SPIN) { w->w_spin = 1; i = 1; for (order = spin_order_list; *order != NULL; order++) { if (strcmp(description, *order) == 0) break; i <<= 1; } if (*order == NULL) panic("spin lock %s not in order list", description); w->w_level = i; } return (w); } static int itismychild(struct witness *parent, struct witness *child) { static int recursed; /* * Insert "child" after "parent" */ while (parent->w_morechildren) parent = parent->w_morechildren; if (parent->w_childcnt == WITNESS_NCHILDREN) { if ((parent->w_morechildren = witness_get()) == NULL) return (1); parent = parent->w_morechildren; } MPASS(child != NULL); parent->w_children[parent->w_childcnt++] = child; /* * now prune whole tree */ if (recursed) return (0); recursed 
= 1; for (child = w_all; child != NULL; child = child->w_next) { for (parent = w_all; parent != NULL; parent = parent->w_next) { if (!isitmychild(parent, child)) continue; removechild(parent, child); if (isitmydescendant(parent, child)) continue; itismychild(parent, child); } } recursed = 0; witness_levelall(); return (0); } static void removechild(struct witness *parent, struct witness *child) { struct witness *w, *w1; int i; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) if (w->w_children[i] == child) goto found; return; found: for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren) continue; w->w_children[i] = w1->w_children[--w1->w_childcnt]; MPASS(w->w_children[i] != NULL); if (w1->w_childcnt != 0) return; if (w1 == parent) return; for (w = parent; w->w_morechildren != w1; w = w->w_morechildren) continue; w->w_morechildren = 0; witness_free(w1); } static int isitmychild(struct witness *parent, struct witness *child) { struct witness *w; int i; for (w = parent; w != NULL; w = w->w_morechildren) { for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } } return (0); } static int isitmydescendant(struct witness *parent, struct witness *child) { struct witness *w; int i; int j; for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) { MPASS(j < 1000); for (i = 0; i < w->w_childcnt; i++) { if (w->w_children[i] == child) return (1); } for (i = 0; i < w->w_childcnt; i++) { if (isitmydescendant(w->w_children[i], child)) return (1); } } return (0); } void witness_levelall (void) { struct witness *w, *w1; for (w = w_all; w; w = w->w_next) if (!(w->w_spin)) w->w_level = 0; for (w = w_all; w; w = w->w_next) { if (w->w_spin) continue; for (w1 = w_all; w1; w1 = w1->w_next) { if (isitmychild(w1, w)) break; } if (w1 != NULL) continue; witness_leveldescendents(w, 0); } } static void witness_leveldescendents(struct witness *parent, int level) { int i; struct witness *w; if (parent->w_level < 
level) parent->w_level = level; level++; for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_leveldescendents(w->w_children[i], level); } static void witness_displaydescendants(void(*prnt)(const char *fmt, ...), struct witness *parent) { struct witness *w; int i; int level; level = parent->w_spin ? ffs(parent->w_level) : parent->w_level; prnt("%d", level); if (level < 10) prnt(" "); for (i = 0; i < level; i++) prnt(" "); prnt("%s", parent->w_description); if (parent->w_file != NULL) prnt(" -- last acquired @ %s:%d\n", parent->w_file, parent->w_line); for (w = parent; w != NULL; w = w->w_morechildren) for (i = 0; i < w->w_childcnt; i++) witness_displaydescendants(prnt, w->w_children[i]); } static int dup_ok(struct witness *w) { char **dup; for (dup = dup_list; *dup!= NULL; dup++) if (strcmp(w->w_description, *dup) == 0) return (1); return (0); } static int blessed(struct witness *w1, struct witness *w2) { int i; struct witness_blessed *b; for (i = 0; i < blessed_count; i++) { b = &blessed_list[i]; if (strcmp(w1->w_description, b->b_lock1) == 0) { if (strcmp(w2->w_description, b->b_lock2) == 0) return (1); continue; } if (strcmp(w1->w_description, b->b_lock2) == 0) if (strcmp(w2->w_description, b->b_lock1) == 0) return (1); } return (0); } static struct witness * witness_get() { struct witness *w; if ((w = w_free) == NULL) { witness_dead = 1; mtx_unlock_spin_flags(&w_mtx, MTX_QUIET); printf("witness exhausted\n"); return (NULL); } w_free = w->w_next; bzero(w, sizeof(*w)); return (w); } static void witness_free(struct witness *w) { w->w_next = w_free; w_free = w; } int witness_list(struct proc *p) { struct mtx *m; int nheld; KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); nheld = 0; LIST_FOREACH(m, &p->p_heldmtx, mtx_held) { printf("\t\"%s\" (%p) locked at %s:%d\n", m->mtx_description, m, m->mtx_witness->w_file, m->mtx_witness->w_line); nheld++; } return (nheld); } #ifdef DDB DB_SHOW_COMMAND(mutexes, 
db_witness_list) { witness_list(CURPROC); } DB_SHOW_COMMAND(witness, db_witness_display) { witness_display(db_printf); } #endif void witness_save(struct mtx *m, const char **filep, int *linep) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; *filep = m->mtx_witness->w_file; *linep = m->mtx_witness->w_line; } void witness_restore(struct mtx *m, const char *file, int line) { KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__)); if (m->mtx_witness == NULL) return; m->mtx_witness->w_file = file; m->mtx_witness->w_line = line; } #endif /* WITNESS */ Index: head/sys/posix4/ksched.c =================================================================== --- head/sys/posix4/ksched.c (revision 72375) +++ head/sys/posix4/ksched.c (revision 72376) @@ -1,264 +1,269 @@ /* * Copyright (c) 1996, 1997 * HD Associates, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by HD Associates, Inc * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* ksched: Soft real time scheduling based on "rtprio". */ #include #include #include #include #include /* For need_resched */ #include /* For need_resched */ #include /* ksched: Real-time extension to support POSIX priority scheduling. */ struct ksched { struct timespec rr_interval; }; int ksched_attach(struct ksched **p) { struct ksched *ksched= p31b_malloc(sizeof(*ksched)); ksched->rr_interval.tv_sec = 0; ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); *p = ksched; return 0; } int ksched_detach(struct ksched *p) { p31b_free(p); return 0; } /* * XXX About priorities * * POSIX 1003.1b requires that numerically higher priorities be of * higher priority. It also permits sched_setparam to be * implementation defined for SCHED_OTHER. I don't like * the notion of inverted priorites for normal processes when * you can use "setpriority" for that. * * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. 
*/ /* Macros to convert between the unix (lower numerically is higher priority) * and POSIX 1003.1b (higher numerically is higher priority) */ #define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) #define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) /* These improve readability a bit for me: */ #define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) #define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) static __inline int getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { + struct rtprio rtp; int e = 0; - switch (p->p_rtprio.type) + pri_to_rtp(&p->p_pri, &rtp); + switch (rtp.type) { case RTP_PRIO_FIFO: *ret = SCHED_FIFO; break; case RTP_PRIO_REALTIME: *ret = SCHED_RR; break; default: *ret = SCHED_OTHER; break; } return e; } int ksched_setparam(register_t *ret, struct ksched *ksched, struct proc *p, const struct sched_param *param) { register_t policy; int e; e = getscheduler(&policy, ksched, p); if (e == 0) { if (policy == SCHED_OTHER) e = EINVAL; else e = ksched_setscheduler(ret, ksched, p, policy, param); } return e; } int ksched_getparam(register_t *ret, struct ksched *ksched, struct proc *p, struct sched_param *param) { - if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) - param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio); + struct rtprio rtp; + pri_to_rtp(&p->p_pri, &rtp); + if (RTP_PRIO_IS_REALTIME(rtp.type)) + param->sched_priority = rtpprio_to_p4prio(rtp.prio); + return 0; } /* * XXX The priority and scheduler modifications should * be moved into published interfaces in kern/kern_sync. * * The permissions to modify process p were checked in "p31b_proc()". * */ int ksched_setscheduler(register_t *ret, struct ksched *ksched, struct proc *p, int policy, const struct sched_param *param) { int e = 0; struct rtprio rtp; switch(policy) { case SCHED_RR: case SCHED_FIFO: if (param->sched_priority >= P1B_PRIO_MIN && param->sched_priority <= P1B_PRIO_MAX) { rtp.prio = p4prio_to_rtpprio(param->sched_priority); rtp.type = (policy == SCHED_FIFO) ? 
RTP_PRIO_FIFO : RTP_PRIO_REALTIME; - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); need_resched(); } else e = EPERM; break; case SCHED_OTHER: { rtp.type = RTP_PRIO_NORMAL; rtp.prio = p4prio_to_rtpprio(param->sched_priority); - p->p_rtprio = rtp; + rtp_to_pri(&rtp, &p->p_pri); /* XXX Simply revert to whatever we had for last * normal scheduler priorities. * This puts a requirement * on the scheduling code: You must leave the * scheduling info alone. */ need_resched(); } break; } return e; } int ksched_getscheduler(register_t *ret, struct ksched *ksched, struct proc *p) { return getscheduler(ret, ksched, p); } /* ksched_yield: Yield the CPU. */ int ksched_yield(register_t *ret, struct ksched *ksched) { need_resched(); return 0; } int ksched_get_priority_max(register_t*ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = RTP_PRIO_MAX; break; case SCHED_OTHER: *ret = PRIO_MAX; break; default: e = EINVAL; } return e; } int ksched_get_priority_min(register_t *ret, struct ksched *ksched, int policy) { int e = 0; switch (policy) { case SCHED_FIFO: case SCHED_RR: *ret = P1B_PRIO_MIN; break; case SCHED_OTHER: *ret = PRIO_MIN; break; default: e = EINVAL; } return e; } int ksched_rr_get_interval(register_t *ret, struct ksched *ksched, struct proc *p, struct timespec *timespec) { *timespec = ksched->rr_interval; return 0; } Index: head/sys/sys/ktr.h =================================================================== --- head/sys/sys/ktr.h (revision 72375) +++ head/sys/sys/ktr.h (revision 72376) @@ -1,212 +1,213 @@ /*- * Copyright (c) 1996 Berkeley Software Design, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Berkeley Software Design Inc's name may not be used to endorse or * promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from BSDI $Id: ktr.h,v 1.10.2.7 2000/03/16 21:44:42 cp Exp $ * $FreeBSD$ */ /* * Wraparound kernel trace buffer support. 
*/ #ifndef _SYS_KTR_H_ #define _SYS_KTR_H_ /* Requires sys/types.h, sys/time.h, machine/atomic.h, and machine/cpufunc.h */ #include #include /* * Trace classes */ #define KTR_GEN 0x00000001 /* General (TR) */ #define KTR_NET 0x00000002 /* Network */ #define KTR_DEV 0x00000004 /* Device driver */ #define KTR_LOCK 0x00000008 /* MP locking */ #define KTR_SMP 0x00000010 /* MP general */ #define KTR_FS 0x00000020 /* Filesystem */ #define KTR_PMAP 0x00000040 /* Pmap tracing */ #define KTR_MALLOC 0x00000080 /* Malloc tracing */ #define KTR_TRAP 0x00000100 /* Trap processing */ #define KTR_INTR 0x00000200 /* Interrupt tracing */ #define KTR_SIG 0x00000400 /* Signal processing */ #define KTR_CLK 0x00000800 /* hardclock verbose */ #define KTR_PROC 0x00001000 /* Process scheduling */ #define KTR_SYSC 0x00002000 /* System call */ #define KTR_INIT 0x00004000 /* System initialization */ #define KTR_KGDB 0x00008000 /* Trace kgdb internals */ #define KTR_IO 0x00010000 /* Upper I/O */ #define KTR_LOCKMGR 0x00020000 #define KTR_NFS 0x00040000 /* The obvious */ #define KTR_VOP 0x00080000 /* The obvious */ #define KTR_VM 0x00100000 /* The virtual memory system */ #define KTR_IDLELOOP 0x00200000 /* checks done in the idle process */ +#define KTR_RUNQ 0x00400000 /* Run queue */ /* * Trace classes which can be assigned to particular use at compile time * These must remain in high 22 as some assembly code counts on it */ #define KTR_CT1 0x010000000 #define KTR_CT2 0x020000000 #define KTR_CT3 0x040000000 #define KTR_CT4 0x080000000 #define KTR_CT5 0x100000000 #define KTR_CT6 0x200000000 #define KTR_CT7 0x400000000 #define KTR_CT8 0x800000000 /* Trace classes to compile in */ #ifndef KTR_COMPILE #define KTR_COMPILE (KTR_GEN) #endif #ifndef LOCORE #include struct ktr_entry { struct timespec ktr_tv; #ifdef KTR_EXTEND #ifndef KTRDESCSIZE #define KTRDESCSIZE 80 #endif #ifndef KTRFILENAMESIZE #define KTRFILENAMESIZE 32 #endif char ktr_desc [KTRDESCSIZE]; char ktr_filename [KTRFILENAMESIZE]; int 
ktr_line; int ktr_cpu; #else char *ktr_desc; u_long ktr_parm1; u_long ktr_parm2; u_long ktr_parm3; u_long ktr_parm4; u_long ktr_parm5; #endif }; /* These variables are used by gdb to analyse the output */ extern int ktr_extend; extern int ktr_cpumask; extern int ktr_mask; extern int ktr_entries; extern int ktr_verbose; extern volatile int ktr_idx; extern struct ktr_entry ktr_buf[]; #endif /* !LOCORE */ #ifdef KTR #ifndef KTR_ENTRIES #define KTR_ENTRIES 1024 #endif #ifdef KTR_EXTEND void ktr_tracepoint(u_int mask, char *filename, u_int line, char *format, ...); #else void ktr_tracepoint(u_int mask, char *format, u_long arg1, u_long arg2, u_long arg3, u_long arg4, u_long arg5); #endif #ifdef KTR_EXTEND #define CTR(m, format, args...) do { \ if (KTR_COMPILE & (m)) \ ktr_tracepoint((m), __FILE__, __LINE__, format , ##args); \ } while(0) #define CTR0(m, format) CTR(m, format) #define CTR1(m, format, p1) CTR(m, format, p1) #define CTR2(m, format, p1, p2) CTR(m, format, p1, p2) #define CTR3(m, format, p1, p2, p3) CTR(m, format, p1, p2, p3) #define CTR4(m, format, p1, p2, p3, p4) CTR(m, format, p1, p2, p3, p4) #define CTR5(m, format, p1, p2, p3, p4, p5) \ CTR(m, format, p1, p2, p3, p4, p5) #else /* not extended */ #define CTR5(m, format, p1, p2, p3, p4, p5) do { \ if (KTR_COMPILE & (m)) \ ktr_tracepoint((m), format, (u_long)p1, (u_long)p2, \ (u_long)p3, (u_long)p4, (u_long)p5); \ } while(0) #define CTR0(m, format) CTR5(m, format, 0, 0, 0, 0, 0) #define CTR1(m, format, p1) CTR5(m, format, p1, 0, 0, 0, 0) #define CTR2(m, format, p1, p2) CTR5(m, format, p1, p2, 0, 0, 0) #define CTR3(m, format, p1, p2, p3) CTR5(m, format, p1, p2, p3, 0, 0) #define CTR4(m, format, p1, p2, p3, p4) CTR5(m, format, p1, p2, p3, p4, 0) #endif /* KTR_EXTEND */ #else /* KTR */ #undef KTR_COMPILE #define KTR_COMPILE 0 #define CTR0(m, d) #define CTR1(m, d, p1) #define CTR2(m, d, p1, p2) #define CTR3(m, d, p1, p2, p3) #define CTR4(m, d, p1, p2, p3, p4) #define CTR5(m, d, p1, p2, p3, p4, p5) /* XXX 
vvvvvvvv ??? */ #define SEG_ATR(d,s) #define SEG_ATR_DESC(d,s) #define ATR(d) #define CATR(f,d,n) #define CATRD(f,d,n) #endif /* KTR */ #define TR0(d) CTR0(KTR_GEN, d) #define TR1(d, p1) CTR1(KTR_GEN, d, p1) #define TR2(d, p1, p2) CTR2(KTR_GEN, d, p1, p2) #define TR3(d, p1, p2, p3) CTR3(KTR_GEN, d, p1, p2, p3) #define TR4(d, p1, p2, p3, p4) CTR4(KTR_GEN, d, p1, p2, p3, p4) #define TR5(d, p1, p2, p3, p4, p5) CTR5(KTR_GEN, d, p1, p2, p3, p4, p5) /* * Trace initialization events, similar to CTR with KTR_INIT, but * completely ifdef'ed out if KTR_INIT isn't in KTR_COMPILE (to * save string space, the compiler doesn't optimize out strings * for the conditional ones above). */ #if (KTR_COMPILE & KTR_INIT) != 0 #define ITR0(d) CTR0(KTR_INIT, d) #define ITR1(d, p1) CTR1(KTR_INIT, d, p1) #define ITR2(d, p1, p2) CTR2(KTR_INIT, d, p1, p2) #define ITR3(d, p1, p2, p3) CTR3(KTR_INIT, d, p1, p2, p3) #define ITR4(d, p1, p2, p3, p4) CTR4(KTR_INIT, d, p1, p2, p3, p4) #define ITR5(d, p1, p2, p3, p4, p5) CTR5(KTR_INIT, d, p1, p2, p3, p4, p5) #else #define ITR0(d) #define ITR1(d, p1) #define ITR2(d, p1, p2) #define ITR3(d, p1, p2, p3) #define ITR4(d, p1, p2, p3, p4) #define ITR5(d, p1, p2, p3, p4, p5) #endif #endif /* !_SYS_KTR_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 72375) +++ head/sys/sys/param.h (revision 72376) @@ -1,252 +1,235 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 #undef __FreeBSD_version #define __FreeBSD_version 500016 /* Master, propagated to newvers */ #ifndef NULL #define NULL 0 #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). 
* Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) * MAXLOGNAME should be == UT_NAMESIZE+1 (see ) */ #include #define MAXCOMLEN 16 /* max command name remembered */ #define MAXINTERP 32 /* max interpreter file name length */ #define MAXLOGNAME 17 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS NGROUPS_MAX /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 15 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #include +#include #define FALSE 0 #define TRUE 1 #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif - -/* - * Priorities. Note that with 32 run queues, differences less than 4 are - * insignificant. - */ -#define PSWP 0 -#define PVM 4 -#define PINOD 8 -#define PRIBIO 16 -#define PVFS 20 -#define PZERO 22 /* No longer magic, shouldn't be here. XXX */ -#define PSOCK 24 -#define PWAIT 32 -#define PCONFIG 32 -#define PLOCK 36 -#define PPAUSE 40 -#define PUSER 48 -#define MAXPRI 127 /* Priorities range from 0 through MAXPRI. 
*/ #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #ifdef _KERNEL #define NODEV (dev_t)(-1) /* non-existent device */ #define NOUDEV (udev_t)(-1) /* non-existent device */ #define NOMAJ 256 /* non-existent device */ #else #define NODEV (dev_t)(-1) /* non-existent device */ #endif #define CBLOCK 128 /* Clist block size, must be a power of 2. */ #define CBQSIZE (CBLOCK/NBBY) /* Quote bytes/cblock - can do better. */ /* Data chars/clist. */ #define CBSIZE (CBLOCK - sizeof(struct cblock *) - CBQSIZE) #define CROUND (CBLOCK - 1) /* Clist rounding. */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. 
*/ #define MAXBSIZE 65536 /* must be power of 2 */ #define BKVASIZE 16384 /* must be power of 2 */ #define BKVAMASK (BKVASIZE-1) #define MAXFRAG 8 /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) ((a)[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) ((a)[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) ((a)[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) (((a)[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define rounddown(x, y) (((x)/(y))*(y)) #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #ifndef _KERNEL #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #endif /* * Constants for setting the parameters of the kernel memory allocator. * * 2 ** MINBUCKET is the smallest unit of memory that will be * allocated. It must be at least large enough to hold a pointer. * * Units of memory less or equal to MAXALLOCSAVE will permanently * allocate physical memory; requests for these size pieces of * memory are quite fast. Allocations greater than MAXALLOCSAVE must * always allocate and free physical memory; requests for these * size allocations should be done infrequently as they will be slow. 
* * Constraints: PAGE_SIZE <= MAXALLOCSAVE <= 2 ** (MINBUCKET + 14), and * MAXALLOCSIZE must be a power of two. */ #if defined(__alpha__) || defined(__ia64__) #define MINBUCKET 5 /* 5 => min allocation of 32 bytes */ #else #define MINBUCKET 4 /* 4 => min allocation of 16 bytes */ #endif #define MAXALLOCSAVE (2 * PAGE_SIZE) /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Make this available for most of the kernel. There were too many * things that included sys/systm.h just for panic(). */ #ifdef _KERNEL void panic __P((const char *, ...)) __dead2 __printflike(1, 2); #endif #endif /* _SYS_PARAM_H_ */ Index: head/sys/sys/priority.h =================================================================== --- head/sys/sys/priority.h (nonexistent) +++ head/sys/sys/priority.h (revision 72376) @@ -0,0 +1,130 @@ +/* + * Copyright (c) 1994, Henrik Vestergaard Draboel + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by (name). + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_PRIORITY_H_ +#define _SYS_PRIORITY_H_ + +/* + * Process priority specifications. + */ + +/* + * Priority classes. + */ + +#define PRI_ITHD 1 /* Interrupt thread. */ +#define PRI_REALTIME 2 /* Real time process. */ +#define PRI_TIMESHARE 3 /* Time sharing process. */ +#define PRI_IDLE 4 /* Idle process. */ + +/* + * PRI_FIFO is POSIX.1B SCHED_FIFO. + */ + +#define PRI_FIFO_BIT 8 +#define PRI_FIFO (PRI_FIFO_BIT | PRI_REALTIME) + +#define PRI_BASE(P) ((P) & ~PRI_FIFO_BIT) +#define PRI_IS_REALTIME(P) (PRI_BASE(P) == PRI_REALTIME) +#define PRI_NEED_RR(P) ((P) != PRI_FIFO) + +/* + * Priorities. 
Note that with 64 run queues, differences less than 4 are + insignificant. */ + +/* + * Priorities range from 0 to 255, but differences of less than 4 (RQ_PPQ) + * are insignificant. Ranges are as follows: + * + * Interrupt threads: 0 - 63 + * Top half kernel threads: 64 - 127 + * Realtime user threads: 128 - 159 + * Time sharing user threads: 160 - 223 + * Idle user threads: 224 - 255 + * + * XXX If/When the specific interrupt thread and top half thread ranges + * disappear, a larger range can be used for user processes. + */ + +#define PRI_MIN (0) /* Highest priority. */ +#define PRI_MAX (255) /* Lowest priority. */ + +#define PRI_MIN_ITHD (PRI_MIN) +#define PRI_MAX_ITHD (PRI_MIN_KERN - 1) + +#define PI_REALTIME (PRI_MIN_ITHD + 0) +#define PI_AV (PRI_MIN_ITHD + 4) +#define PI_TTYHIGH (PRI_MIN_ITHD + 8) +#define PI_TAPE (PRI_MIN_ITHD + 12) +#define PI_NET (PRI_MIN_ITHD + 16) +#define PI_DISK (PRI_MIN_ITHD + 20) +#define PI_TTYLOW (PRI_MIN_ITHD + 24) +#define PI_DISKLOW (PRI_MIN_ITHD + 28) +#define PI_DULL (PRI_MIN_ITHD + 32) +#define PI_SOFT (PRI_MIN_ITHD + 36) + +#define PRI_MIN_KERN (64) +#define PRI_MAX_KERN (PRI_MIN_REALTIME - 1) + +#define PSWP (PRI_MIN_KERN + 0) +#define PVM (PRI_MIN_KERN + 4) +#define PINOD (PRI_MIN_KERN + 8) +#define PRIBIO (PRI_MIN_KERN + 12) +#define PVFS (PRI_MIN_KERN + 16) +#define PZERO (PRI_MIN_KERN + 20) +#define PSOCK (PRI_MIN_KERN + 24) +#define PWAIT (PRI_MIN_KERN + 28) +#define PCONFIG (PRI_MIN_KERN + 32) +#define PLOCK (PRI_MIN_KERN + 36) +#define PPAUSE (PRI_MIN_KERN + 40) + +#define PRI_MIN_REALTIME (128) +#define PRI_MAX_REALTIME (PRI_MIN_TIMESHARE - 1) + +#define PRI_MIN_TIMESHARE (160) +#define PRI_MAX_TIMESHARE (PRI_MIN_IDLE - 1) + +#define PUSER (PRI_MIN_TIMESHARE) + +#define PRI_MIN_IDLE (224) +#define PRI_MAX_IDLE (PRI_MAX) + +struct priority { + u_char pri_class; /* Scheduling class. */ + u_char pri_level; /* Normal priority level. */ + u_char pri_native; /* Priority before propagation. 
*/ + u_char pri_user; /* User priority based on p_cpu and p_nice. */ +}; + +#endif /* !_SYS_PRIORITY_H_ */ Property changes on: head/sys/sys/priority.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/sys/proc.h =================================================================== --- head/sys/sys/proc.h (revision 72375) +++ head/sys/sys/proc.h (revision 72376) @@ -1,577 +1,569 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 * $FreeBSD$ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* For struct callout. */ #include /* For struct klist. */ #include #include -#include /* For struct rtprio. */ +#include +#include /* XXX */ +#include #include #ifndef _KERNEL #include /* For structs itimerval, timeval. */ #endif #include #include /* Machine-dependent proc substruct. */ /* * One structure allocated per session. */ struct session { int s_count; /* Ref cnt; pgrps in session. */ struct proc *s_leader; /* Session leader. */ struct vnode *s_ttyvp; /* Vnode of controlling terminal. */ struct tty *s_ttyp; /* Controlling terminal. */ pid_t s_sid; /* Session ID. */ /* Setlogin() name: */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; }; /* * One structure allocated per process group. */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */ LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members. */ struct session *pg_session; /* Pointer to session. */ struct sigiolst pg_sigiolst; /* List of sigio sources. */ pid_t pg_id; /* Pgrp id. */ int pg_jobc; /* # procs qualifying pgrp for job control */ }; struct procsig { sigset_t ps_sigignore; /* Signals being ignored. 
*/ sigset_t ps_sigcatch; /* Signals being caught by user. */ int ps_flag; struct sigacts *ps_sigacts; /* Signal actions, state. */ int ps_refcnt; }; #define PS_NOCLDWAIT 0x0001 /* No zombies if child dies */ #define PS_NOCLDSTOP 0x0002 /* No SIGCHLD when children stop. */ /* * pasleep structure, used by asleep() syscall to hold requested priority * and timeout values for await(). */ struct pasleep { int as_priority; /* Async priority. */ int as_timo; /* Async timeout. */ }; /* * pargs, used to hold a copy of the command line, if it had a sane length. */ struct pargs { u_int ar_ref; /* Reference count. */ u_int ar_length; /* Length. */ u_char ar_args[0]; /* Arguments. */ }; /*- * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(CPU)" below, * which might be addressable only on a processor on which the process * is running. * * Below is a key of locks used to protect each member of struct proc. The * lock is indicated by a reference to a specific character in parens in the * associated comment. 
* * - not yet protected * a - only touched by curproc or parent during fork/wait * b - created at fork, never changes * c - locked by proc mtx * d - locked by allproc_lock lock * e - locked by proctree_lock lock * f - session mtx * g - process group mtx * h - callout_lock mtx * i - by curproc or the master session mtx * j - locked by sched_lock mtx * k - either by curproc or a lock which prevents the lock from * going away, such as (d,e) * l - the attaching proc or attaching proc parent * m - Giant * n - not locked, lazy * * If the locking identifier is followed by a plus '+', then the specified * member follows these special rules: * - It is only written to by the current process. * - It can be read by the current process and other processes. * Thus, the locking rules for it are slightly different, and allow us to * optimize the case where a process reads its own such value: * - Writes to this member are locked. * - Reads of this value by other processes are locked. * - Reads of this value by the current process need not be locked. */ struct ithd; struct proc { TAILQ_ENTRY(proc) p_procq; /* (j) Run/mutex queue. */ TAILQ_ENTRY(proc) p_slpq; /* (j) Sleep queue. */ LIST_ENTRY(proc) p_list; /* (d) List of all processes. */ /* substructures: */ struct pcred *p_cred; /* (c+) Process owner's identity. */ struct filedesc *p_fd; /* (b) Ptr to open files structure. */ struct pstats *p_stats; /* (b) Accounting/statistics (CPU). */ struct plimit *p_limit; /* (m) Process limits. */ struct vm_object *p_upages_obj;/* (a) Upages object. */ struct procsig *p_procsig; /* (c) Signal actions, state (CPU). */ #define p_sigacts p_procsig->ps_sigacts #define p_sigignore p_procsig->ps_sigignore #define p_sigcatch p_procsig->ps_sigcatch #define p_ucred p_cred->pc_ucred #define p_rlimit p_limit->pl_rlimit int p_flag; /* (c) P_* flags. */ int p_sflag; /* (j) PS_* flags. */ int p_intr_nesting_level; /* (k) Interrupt recursion. */ char p_stat; /* (j) S* process status. 
*/ char p_pad1[3]; pid_t p_pid; /* (b) Process identifier. */ LIST_ENTRY(proc) p_hash; /* (d) Hash chain. */ LIST_ENTRY(proc) p_pglist; /* (c) List of processes in pgrp. */ struct proc *p_pptr; /* (e) Pointer to parent process. */ LIST_ENTRY(proc) p_sibling; /* (e) List of sibling processes. */ LIST_HEAD(, proc) p_children; /* (e) Pointer to list of children. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* (c) Save parent pid during ptrace. XXX */ int p_dupfd; /* (c) Sideways ret value from fdopen. XXX */ struct vmspace *p_vmspace; /* (b) Address space. */ /* scheduling */ u_int p_estcpu; /* (j) Time averaged value of p_cpticks. */ int p_cpticks; /* (j) Ticks of cpu time. */ fixpt_t p_pctcpu; /* (j) %cpu during p_swtime. */ struct callout p_slpcallout; /* (h) Callout for sleep. */ void *p_wchan; /* (j) Sleep address. */ const char *p_wmesg; /* (j) Reason for sleep. */ u_int p_swtime; /* (j) Time swapped in or out. */ u_int p_slptime; /* (j) Time since last blocked. */ struct callout p_itcallout; /* (h) Interval timer callout. */ struct itimerval p_realtimer; /* (h?/k?) Alarm timer. */ u_int64_t p_runtime; /* (j) Real time in microsec. */ u_int64_t p_uu; /* (j) Previous user time in microsec. */ u_int64_t p_su; /* (j) Previous system time in microsec. */ u_int64_t p_iu; /* (j) Previous interrupt time in microsec. */ u_int64_t p_uticks; /* (j) Statclock hits in user mode. */ u_int64_t p_sticks; /* (j) Statclock hits in system mode. */ u_int64_t p_iticks; /* (j) Statclock hits processing intr. */ int p_traceflag; /* (j?) Kernel trace points. */ struct vnode *p_tracep; /* (j?) Trace to vnode. */ sigset_t p_siglist; /* (c) Signals arrived but not delivered. */ struct vnode *p_textvp; /* (b) Vnode of executable. */ char p_lock; /* (c) Process lock (prevent swap) count. */ struct mtx p_mtx; /* (k) Lock for this struct. */ u_char p_oncpu; /* (j) Which cpu we are on. 
*/ u_char p_lastcpu; /* (j) Last cpu we were on. */ char p_rqindex; /* (j) Run queue index. */ short p_locks; /* (*) DEBUG: lockmgr count of held locks */ u_int p_stops; /* (c) Procfs event bitmask. */ u_int p_stype; /* (c) Procfs stop event type. */ char p_step; /* (c) Procfs stop *once* flag. */ u_char p_pfsflags; /* (c) Procfs flags. */ char p_pad3[2]; /* Alignment. */ register_t p_retval[2]; /* (k) Syscall aux returns. */ struct sigiolst p_sigiolst; /* (c) List of sigio sources. */ int p_sigparent; /* (c) Signal to parent on exit. */ sigset_t p_oldsigmask; /* (c) Saved mask from before sigpause. */ int p_sig; /* (n) For core dump/debugger XXX. */ u_long p_code; /* (n) For core dump/debugger XXX. */ struct klist p_klist; /* (c) Knotes attached to this process. */ LIST_HEAD(, mtx) p_heldmtx; /* (j) For debugging code. */ struct mtx *p_blocked; /* (j) Mutex process is blocked on. */ const char *p_mtxname; /* (j) Name of mutex blocked on. */ LIST_HEAD(, mtx) p_contested; /* (j) Contested locks. */ /* End area that is zeroed on creation. */ #define p_endzero p_startcopy /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_sigmask sigset_t p_sigmask; /* (c) Current signal mask. */ stack_t p_sigstk; /* (c) Stack pointer and on-stack flag. */ int p_magic; /* (b) Magic number. */ - u_char p_priority; /* (j) Process priority. */ - u_char p_usrpri; /* (j) User priority based on p_cpu and p_nice. */ - u_char p_nativepri; /* (j) Priority before propagation. */ + struct priority p_pri; /* (j) Process priority. */ char p_nice; /* (j?/k?) Process "nice" value. */ char p_comm[MAXCOMLEN + 1]; /* (b) Process name. */ struct pgrp *p_pgrp; /* (e?/c?) Pointer to process group. */ struct sysentvec *p_sysent; /* (b) System call dispatch information. */ - struct rtprio p_rtprio; /* (j) Realtime priority. */ struct prison *p_prison; /* (b?) jail(4). */ struct pargs *p_args; /* (b?) Process arguments. */ /* End area that is copied on creation. 
*/ #define p_endcopy p_addr struct user *p_addr; /* (k) Kernel virtual addr of u-area (CPU). */ struct mdproc p_md; /* (k) Any machine-dependent fields. */ u_short p_xstat; /* (c) Exit status for wait; also stop sig. */ u_short p_acflag; /* (c) Accounting flags. */ struct rusage *p_ru; /* (a) Exit information. XXX */ void *p_aioinfo; /* (c) ASYNC I/O info. */ struct proc *p_peers; /* (c) */ struct proc *p_leader; /* (c) */ struct pasleep p_asleep; /* (k) Used by asleep()/await(). */ void *p_emuldata; /* (c) Emulator state data. */ struct ithd *p_ithd; /* (b) For interrupt threads only. */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id /* Status values (p_stat). */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ #define SWAIT 6 /* Waiting for interrupt. */ #define SMTX 7 /* Blocked on a mutex. */ /* These flags are kept in p_flag. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_KTHREAD 0x00004 /* Kernel thread. */ #define P_NOLOAD 0x00008 /* Ignore during load avg calculations. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_SELECT 0x00040 /* Selecting; wakeup/waiting danger. */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Debugging process has waited for child. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ /* Should be moved to machine-dependent areas. */ #define P_BUFEXHAUST 0x100000 /* Dirty buffers flush is in progress. 
*/ #define P_COWINPROGRESS 0x400000 /* Snapshot copy-on-write in progress. */ #define P_DEADLKTREAT 0x800000 /* Lock aquisition - deadlock treatment. */ #define P_JAILED 0x1000000 /* Process is in jail. */ #define P_OLDMASK 0x2000000 /* Need to restore mask after suspend. */ #define P_ALTSTACK 0x4000000 /* Have alternate signal stack. */ /* These flags are kept in p_sflag and are protected with sched_lock. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ #define PS_OWEUPC 0x00002 /* Owe process an addupc() call at next ast. */ #define PS_PROFIL 0x00004 /* Has started profiling. */ #define PS_SINTR 0x00008 /* Sleep is interruptible. */ #define PS_TIMEOUT 0x00010 /* Timing out during sleep. */ #define PS_ALRMPEND 0x00020 /* Pending SIGVTALRM needs to be posted. */ #define PS_PROFPEND 0x00040 /* Pending SIGPROF needs to be posted. */ #define PS_CVWAITQ 0x00080 /* Proces is on a cv_waitq (not slpq). */ #define PS_SWAPINREQ 0x00100 /* Swapin request due to wakeup. */ #define PS_SWAPPING 0x00200 /* Process is being swapped. */ #define PS_ASTPENDING 0x00400 /* Process has a pending ast. */ #define PS_NEEDRESCHED 0x00800 /* Process needs to yield. */ #define P_MAGIC 0xbeefface #define P_CAN_SEE 1 #define P_CAN_KILL 2 #define P_CAN_SCHED 3 #define P_CAN_DEBUG 4 /* * MOVE TO ucred.h? * * Shareable process credentials (always resident). This includes a reference * to the current user credentials as well as real and saved ids that may be * used to change ids. */ struct pcred { struct ucred *pc_ucred; /* Current credentials. */ uid_t p_ruid; /* Real user id. */ uid_t p_svuid; /* Saved effective user id. */ gid_t p_rgid; /* Real group id. */ gid_t p_svgid; /* Saved effective group id. */ int p_refcnt; /* Number of references. */ struct uidinfo *p_uidinfo; /* Per uid resource consumption. 
*/ }; #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_PARGS); MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif static __inline int sigonstack(size_t sp) { register struct proc *p = curproc; return ((p->p_flag & P_ALTSTACK) ? #if defined(COMPAT_43) || defined(COMPAT_SUNOS) ((p->p_sigstk.ss_size == 0) ? (p->p_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size)) #else ((sp - (size_t)p->p_sigstk.ss_sp) < p->p_sigstk.ss_size) #endif : 0); } /* * Preempt the current process if in interrupt from user mode, * or after the current trap/syscall if in system mode. */ #define need_resched() do { \ mtx_assert(&sched_lock, MA_OWNED); \ curproc->p_sflag |= PS_NEEDRESCHED; \ } while (0) #define resched_wanted() (curproc->p_sflag & PS_NEEDRESCHED) #define clear_resched() do { \ mtx_assert(&sched_lock, MA_OWNED); \ curproc->p_sflag &= ~PS_NEEDRESCHED; \ } while (0) /* * Notify the current process (p) that it has a signal pending, * process as soon as possible. */ #define aston() signotify(CURPROC) #define signotify(p) do { \ mtx_assert(&sched_lock, MA_OWNED); \ (p)->p_sflag |= PS_ASTPENDING; \ } while (0) #define astpending() (curproc->p_sflag & PS_ASTPENDING) #define astoff() do { \ mtx_assert(&sched_lock, MA_OWNED); \ CURPROC->p_sflag &= ~PS_ASTPENDING; \ } while (0) /* Handy macro to determine if p1 can mangle p2. */ #define PRISON_CHECK(p1, p2) \ ((p1)->p_prison == NULL || (p1)->p_prison == (p2)->p_prison) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". */ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } /* STOPEVENT() is MP safe. 
*/ #define STOPEVENT(p, e, v) do { \ PROC_LOCK(p); \ if ((p)->p_stops & (e)) { \ stopevent((p), (e), (v)); \ } \ PROC_UNLOCK(p); \ } while (0) /* Lock and unlock a process. */ #define PROC_LOCK(p) mtx_lock(&(p)->p_mtx) #define PROC_UNLOCK(p) mtx_unlock(&(p)->p_mtx) /* Lock and unlock the proc lists. */ #define ALLPROC_LOCK(how) \ lockmgr(&allproc_lock, (how), NULL, CURPROC) #define AP_SHARED LK_SHARED #define AP_EXCLUSIVE LK_EXCLUSIVE #define AP_RELEASE LK_RELEASE /* Lock and unlock the proc child and sibling lists. */ #define PROCTREE_LOCK(how) \ lockmgr(&proctree_lock, (how), NULL, CURPROC) #define PROCTREE_ASSERT(what) \ LOCKMGR_ASSERT(&proctree_lock, (what), CURPROC) #define PT_SHARED LK_SHARED #define PT_EXCLUSIVE LK_EXCLUSIVE #define PT_RELEASE LK_RELEASE /* Hold process U-area in memory, normally for ptrace/procfs work. */ #define PHOLD(p) do { \ PROC_LOCK(p); \ if ((p)->p_lock++ == 0) \ faultin(p); \ PROC_UNLOCK(p); \ } while (0) #define PRELE(p) do { \ PROC_LOCK(p); \ (--(p)->p_lock); \ PROC_UNLOCK(p); \ } while (0) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; extern struct lock allproc_lock; extern struct lock proctree_lock; extern struct proc proc0; /* Process slot for swapper. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern u_long ps_arg_cache_limit; extern int ps_argsopen; extern int ps_showallprocs; extern int sched_quantum; /* Scheduling quantum in ticks. */ LIST_HEAD(proclist, proc); +TAILQ_HEAD(procqueue, proc); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc; /* Process slots for init, pager. 
*/ extern struct proc *updateproc; /* Process slot for syncer (sic). */ -#define NQS 32 /* 32 run queues. */ - -TAILQ_HEAD(rq, proc); -extern struct rq itqueues[]; -extern struct rq rtqueues[]; -extern struct rq queues[]; -extern struct rq idqueues[]; extern struct vm_zone *proc_zone; /* * XXX macros for scheduler. Shouldn't be here, but currently needed for * bounding the dubious p_estcpu inheritance in wait1(). * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in * the range 100-256 Hz (approximately). */ #define ESTCPULIM(e) \ min((e), INVERSE_ESTCPU_WEIGHT * (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) - \ - PPQ) + INVERSE_ESTCPU_WEIGHT - 1) + RQ_PPQ) + INVERSE_ESTCPU_WEIGHT - 1) #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */ #define NICE_WEIGHT 1 /* Priorities per nice level. */ -#define PPQ (128 / NQS) /* Priorities per queue. */ struct mtx; struct trapframe; struct proc *pfind __P((pid_t)); /* Find process by id. */ struct pgrp *pgfind __P((pid_t)); /* Find process group by id. */ struct proc *zpfind __P((pid_t)); /* Find zombie process by id. 
*/ struct proc *chooseproc __P((void)); int enterpgrp __P((struct proc *p, pid_t pgid, int mksess)); void faultin __P((struct proc *p)); void fixjobc __P((struct proc *p, struct pgrp *pgrp, int entering)); int fork1 __P((struct proc *, int, struct proc **)); void fork_exit __P((void (*)(void *, struct trapframe *), void *, struct trapframe *)); void fork_return __P((struct proc *, struct trapframe *)); int inferior __P((struct proc *p)); int leavepgrp __P((struct proc *p)); void mi_switch __P((void)); int p_can __P((const struct proc *p1, const struct proc *p2, int operation, int *privused)); int p_trespass __P((struct proc *p1, struct proc *p2)); void procinit __P((void)); void proc_reparent __P((struct proc *child, struct proc *newparent)); -u_int32_t procrunnable __P((void)); +int procrunnable __P((void)); void remrunqueue __P((struct proc *)); void resetpriority __P((struct proc *)); int roundrobin_interval __P((void)); void schedclock __P((struct proc *)); void setrunnable __P((struct proc *)); void setrunqueue __P((struct proc *)); void setsugid __P((struct proc *p)); void sleepinit __P((void)); void stopevent __P((struct proc *, u_int, u_int)); void cpu_idle __P((void)); void cpu_switch __P((void)); void cpu_throw __P((void)) __dead2; void unsleep __P((struct proc *)); void updatepri __P((struct proc *)); void userret __P((struct proc *, struct trapframe *, u_quad_t)); void maybe_resched __P((struct proc *)); void cpu_exit __P((struct proc *)) __dead2; void exit1 __P((struct proc *, int)) __dead2; void cpu_fork __P((struct proc *, struct proc *, int)); void cpu_set_fork_handler __P((struct proc *, void (*)(void *), void *)); int trace_req __P((struct proc *)); void cpu_wait __P((struct proc *)); int cpu_coredump __P((struct proc *, struct vnode *, struct ucred *)); #endif /* _KERNEL */ #endif /* !_SYS_PROC_H_ */ Index: head/sys/sys/rtprio.h =================================================================== --- head/sys/sys/rtprio.h (revision 72375) +++ 
head/sys/sys/rtprio.h (revision 72376) @@ -1,103 +1,90 @@ /* * Copyright (c) 1994, Henrik Vestergaard Draboel * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by (name). * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_RTPRIO_H_ #define _SYS_RTPRIO_H_ +#include + /* * Process realtime-priority specifications to rtprio. */ /* priority types. Start at 1 to catch uninitialized fields. 
*/ -#define RTP_PRIO_ITHREAD 1 /* interrupt thread */ -#define RTP_PRIO_REALTIME 2 /* real time process */ -#define RTP_PRIO_NORMAL 3 /* time sharing process */ -#define RTP_PRIO_IDLE 4 /* idle process */ +#define RTP_PRIO_REALTIME PRI_REALTIME /* real time process */ +#define RTP_PRIO_NORMAL PRI_TIMESHARE /* time sharing process */ +#define RTP_PRIO_IDLE PRI_IDLE /* idle process */ /* RTP_PRIO_FIFO is POSIX.1B SCHED_FIFO. */ -#define RTP_PRIO_FIFO_BIT 4 -#define RTP_PRIO_FIFO (RTP_PRIO_REALTIME | RTP_PRIO_FIFO_BIT) -#define RTP_PRIO_BASE(P) ((P) & ~RTP_PRIO_FIFO_BIT) -#define RTP_PRIO_IS_REALTIME(P) (RTP_PRIO_BASE(P) == RTP_PRIO_REALTIME) -#define RTP_PRIO_NEED_RR(P) ((P) != RTP_PRIO_FIFO) +#define RTP_PRIO_FIFO_BIT PRI_FIFO_BIT +#define RTP_PRIO_FIFO PRI_FIFO +#define RTP_PRIO_BASE(P) PRI_BASE(P) +#define RTP_PRIO_IS_REALTIME(P) PRI_IS_REALTIME(P) +#define RTP_PRIO_NEED_RR(P) PRI_NEED_RR(P) /* priority range */ #define RTP_PRIO_MIN 0 /* Highest priority */ #define RTP_PRIO_MAX 31 /* Lowest priority */ /* * rtprio() syscall functions */ #define RTP_LOOKUP 0 #define RTP_SET 1 #ifndef LOCORE /* - * Scheduling class information. This is strictly speaking not only - * for real-time processes. We should replace it with two variables: - * class and priority. At the moment we use prio here for real-time - * and interrupt processes, and for others we use proc.p_pri. FIXME. + * Scheduling class information. */ struct rtprio { u_short type; /* scheduling class */ u_short prio; }; -#endif -/* - * Interrupt thread priorities, after BSD/OS. 
- */ -#define PI_REALTIME 1 /* very high priority (clock) */ -#define PI_AV 2 /* Audio/video devices */ -#define PI_TTYHIGH 3 /* High priority tty's (small FIFOs) */ -#define PI_TAPE 4 /* Tape devices (high for streaming) */ -#define PI_NET 5 /* Network interfaces */ -#define PI_DISK 6 /* Disks and SCSI */ -#define PI_TTYLOW 7 /* Ttys with big buffers */ -#define PI_DISKLOW 8 /* Disks that do programmed I/O */ -#define PI_DULL 9 /* We don't know or care */ - -/* Soft interrupt threads */ -#define PI_SOFT 15 /* All soft interrupts */ +#ifdef _KERNEL +int rtp_to_pri(struct rtprio *, struct priority *); +void pri_to_rtp(struct priority *, struct rtprio *); +#endif +#endif #ifndef _KERNEL #include __BEGIN_DECLS int rtprio __P((int, pid_t, struct rtprio *)); __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_RTPRIO_H_ */ Index: head/sys/sys/runq.h =================================================================== --- head/sys/sys/runq.h (nonexistent) +++ head/sys/sys/runq.h (revision 72376) @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2001 Jake Burkholder + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _RUNQ_H_ +#define _RUNQ_H_ + +/* + * Run queue parameters. + */ + +#define RQ_NQS (64) /* Number of run queues. */ +#define RQ_PPQ (4) /* Priorities per queue. */ + +#define RQB_LEN (2) /* Number of priority status words. */ +#define RQB_L2BPW (5) /* Log2(sizeof(rqb_word_t) * NBBY)). */ +#define RQB_BPW (1<> RQB_L2BPW) +#define RQB_FFS(word) (ffs(word)) + +/* + * Type of run queue status word. + */ +typedef u_int32_t rqb_word_t; + +/* + * Head of run queues. + */ +TAILQ_HEAD(rqhead, proc); + +/* + * Bit array which maintains the status of a run queue. When a queue is + * non-empty the bit corresponding to the queue number will be set. + */ +struct rqbits { + rqb_word_t rqb_bits[RQB_LEN]; +}; + +/* + * Run queue structure. Contains an array of run queues on which processes + * are placed, and a structure to maintain the status of each queue. 
+ */ +struct runq { + struct rqbits rq_status; + struct rqhead rq_queues[RQ_NQS]; +}; + +void runq_add(struct runq *, struct proc *); +int runq_check(struct runq *); +struct proc *runq_choose(struct runq *); +void runq_init(struct runq *); +void runq_remove(struct runq *, struct proc *); + +#endif Property changes on: head/sys/sys/runq.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: head/sys/sys/systm.h =================================================================== --- head/sys/sys/systm.h (revision 72375) +++ head/sys/sys/systm.h (revision 72376) @@ -1,284 +1,282 @@ /*- * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 * $FreeBSD$ */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include extern int securelevel; /* system security level (see init(8)) */ extern int cold; /* nonzero if we are doing a cold boot */ extern const char *panicstr; /* panic message */ extern int dumping; /* system is dumping */ extern char version[]; /* system version */ extern char copyright[]; /* system copyright */ extern int nswap; /* size of swap space */ extern int selwait; /* select timeout address */ -extern u_char curpriority; /* priority of current process */ - extern int physmem; /* physical memory */ extern dev_t dumpdev; /* dump device */ extern long dumplo; /* offset into dumpdev */ extern dev_t rootdev; /* root device */ extern dev_t rootdevs[2]; /* possible root devices */ extern char *rootdevnames[2]; /* names of possible root devices */ extern struct vnode *rootvp; /* vnode equivalent to above */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print 
verbose messages */ #ifdef INVARIANTS /* The option is always available */ #define KASSERT(exp,msg) do { if (!(exp)) panic msg; } while (0) #else #define KASSERT(exp,msg) #endif /* * General function declarations. */ struct clockframe; struct malloc_type; struct mtx; struct proc; struct timeval; struct tty; struct ucred; struct uio; void Debugger __P((const char *msg)); int nullop __P((void)); int eopnotsupp __P((void)); int einval __P((void)); int seltrue __P((dev_t dev, int which, struct proc *p)); int ureadc __P((int, struct uio *)); void *hashinit __P((int count, struct malloc_type *type, u_long *hashmask)); void *phashinit __P((int count, struct malloc_type *type, u_long *nentries)); void cpu_boot __P((int)); void cpu_rootconf __P((void)); void tablefull __P((const char *)); int kvprintf __P((char const *, void (*)(int, void*), void *, int, _BSD_VA_LIST_)) __printflike(1, 0); void log __P((int, const char *, ...)) __printflike(2, 3); void log_console __P((struct uio *)); int printf __P((const char *, ...)) __printflike(1, 2); int snprintf __P((char *, size_t, const char *, ...)) __printflike(3, 4); int sprintf __P((char *buf, const char *, ...)) __printflike(2, 3); int uprintf __P((const char *, ...)) __printflike(1, 2); int vprintf __P((const char *, _BSD_VA_LIST_)) __printflike(1, 0); int vsnprintf __P((char *, size_t, const char *, _BSD_VA_LIST_)) __printflike(3, 0); int vsprintf __P((char *buf, const char *, _BSD_VA_LIST_)) __printflike(2, 0); int ttyprintf __P((struct tty *, const char *, ...)) __printflike(2, 3); int sscanf __P((const char *, char const *, ...)); int vsscanf __P((const char *, char const *, _BSD_VA_LIST_)); long strtol __P((const char *, char **, int)); u_long strtoul __P((const char *, char **, int)); quad_t strtoq __P((const char *, char **, int)); u_quad_t strtouq __P((const char *, char **, int)); void tprintf __P((struct proc *p, int pri, const char *, ...)) __printflike(3, 4); void bcopy __P((const void *from, void *to, size_t 
len)); void ovbcopy __P((const void *from, void *to, size_t len)); #ifdef __i386__ extern void (*bzero) __P((void *buf, size_t len)); #else void bzero __P((void *buf, size_t len)); #endif void *memcpy __P((void *to, const void *from, size_t len)); int copystr __P((const void *kfaddr, void *kdaddr, size_t len, size_t *lencopied)); int copyinstr __P((const void *udaddr, void *kaddr, size_t len, size_t *lencopied)); int copyin __P((const void *udaddr, void *kaddr, size_t len)); int copyout __P((const void *kaddr, void *udaddr, size_t len)); int fubyte __P((const void *base)); int subyte __P((void *base, int byte)); int suibyte __P((void *base, int byte)); long fuword __P((const void *base)); int suword __P((void *base, long word)); int fusword __P((void *base)); int susword __P((void *base, int word)); void realitexpire __P((void *)); void hardclock __P((struct clockframe *frame)); void softclock __P((void *)); void statclock __P((struct clockframe *frame)); void startprofclock __P((struct proc *)); void stopprofclock __P((struct proc *)); void setstatclockrate __P((int hzrate)); /* flags for suser_xxx() */ #define PRISON_ROOT 1 int suser __P((const struct proc *)); int suser_xxx __P((const struct ucred *cred, const struct proc *proc, int flag)); char *getenv __P((char *name)); int getenv_int __P((char *name, int *data)); quad_t getenv_quad __P((char *name, quad_t *data)); extern char *kern_envp; #ifdef APM_FIXUP_CALLTODO void adjust_timeout_calltodo __P((struct timeval *time_change)); #endif /* APM_FIXUP_CALLTODO */ #include /* Initialize the world */ void consinit __P((void)); void cpu_initclocks __P((void)); void usrinfoinit __P((void)); /* Finalize the world. */ void shutdown_nice __P((int)); /* * Kernel to clock driver interface. 
*/ void inittodr __P((time_t base)); void resettodr __P((void)); void startrtclock __P((void)); /* Timeouts */ typedef void timeout_t __P((void *)); /* timeout function type */ #define CALLOUT_HANDLE_INITIALIZER(handle) \ { NULL } void callout_handle_init __P((struct callout_handle *)); struct callout_handle timeout __P((timeout_t *, void *, int)); void untimeout __P((timeout_t *, void *, struct callout_handle)); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline void spl0(void) { return; } static __inline intrmask_t splbio(void) { return 0; } static __inline intrmask_t splcam(void) { return 0; } static __inline intrmask_t splclock(void) { return 0; } static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t splsoftcam(void) { return 0; } static __inline intrmask_t splsoftclock(void) { return 0; } static __inline intrmask_t splsofttty(void) { return 0; } static __inline intrmask_t splsoftvm(void) { return 0; } static __inline intrmask_t splsofttq(void) { return 0; } static __inline intrmask_t splstatclock(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } static __inline intrmask_t splvm(void) { return 0; } static __inline void splx(intrmask_t ipl) { return; } #if defined(__ia64__) #include #endif /* * Various callout lists. */ /* Exit callout list declarations. */ typedef void (*exitlist_fn) __P((struct proc *procp)); int at_exit __P((exitlist_fn function)); int rm_at_exit __P((exitlist_fn function)); /* Fork callout list declarations. */ typedef void (*forklist_fn) __P((struct proc *parent, struct proc *child, int flags)); int at_fork __P((forklist_fn function)); int rm_at_fork __P((forklist_fn function)); /* * Not exactly a callout LIST, but a callout entry. * Allow an external module to define a hardware watchdog tickler. 
* Normally a process would do this, but there are times when the * kernel needs to be able to hold off the watchdog, when the process * is not active, e.g., when dumping core. */ typedef void (*watchdog_tickle_fn) __P((void)); extern watchdog_tickle_fn wdog_tickler; /* * Common `proc' functions are declared here so that proc.h can be included * less often. */ int msleep __P((void *chan, struct mtx *mtx, int pri, const char *wmesg, int timo)); #define tsleep(chan, pri, wmesg, timo) msleep(chan, NULL, pri, wmesg, timo) int asleep __P((void *chan, int pri, const char *wmesg, int timo)); #define await(pri, timo) mawait(NULL, pri, timo) int mawait __P((struct mtx *mtx, int pri, int timo)); void wakeup __P((void *chan)); void wakeup_one __P((void *chan)); /* * Common `dev_t' stuff are declared here to avoid #include poisoning */ int major(dev_t x); int minor(dev_t x); dev_t makedev(int x, int y); udev_t dev2udev(dev_t x); dev_t udev2dev(udev_t x, int b); int uminor(udev_t dev); int umajor(udev_t dev); udev_t makeudev(int x, int y); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY __P((int usec)); #endif /* !_SYS_SYSTM_H_ */ Index: head/sys/sys/tty.h =================================================================== --- head/sys/sys/tty.h (revision 72375) +++ head/sys/sys/tty.h (revision 72376) @@ -1,276 +1,276 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tty.h 8.6 (Berkeley) 1/21/94 * $FreeBSD$ */ #ifndef _SYS_TTY_H_ #define _SYS_TTY_H_ #include #include #include /* * Clists are character lists, which is a variable length linked list * of cblocks, with a count of the number of characters in the list. */ struct clist { int c_cc; /* Number of characters in the clist. */ int c_cbcount; /* Number of cblocks. */ int c_cbmax; /* Max # cblocks allowed for this clist. 
*/ int c_cbreserved; /* # cblocks reserved for this clist. */ char *c_cf; /* Pointer to the first cblock. */ char *c_cl; /* Pointer to the last cblock. */ }; /* * Per-tty structure. * * Should be split in two, into device and tty drivers. * Glue could be masks of what to echo and circular buffer * (low, high, timeout). */ struct tty { struct clist t_rawq; /* Device raw input queue. */ long t_rawcc; /* Raw input queue statistics. */ struct clist t_canq; /* Device canonical queue. */ long t_cancc; /* Canonical queue statistics. */ struct clist t_outq; /* Device output queue. */ long t_outcc; /* Output queue statistics. */ int t_line; /* Interface to device drivers. */ dev_t t_dev; /* Device. */ int t_state; /* Device and driver (TS*) state. */ int t_flags; /* Tty flags. */ int t_timeout; /* Timeout for ttywait() */ struct pgrp *t_pgrp; /* Foreground process group. */ struct session *t_session; /* Enclosing session. */ struct sigio *t_sigio; /* Information for async I/O. */ struct selinfo t_rsel; /* Tty read/oob select. */ struct selinfo t_wsel; /* Tty write select. */ struct termios t_termios; /* Termios state. */ struct winsize t_winsize; /* Window size. */ /* Start output. */ void (*t_oproc) __P((struct tty *)); /* Stop output. */ void (*t_stop) __P((struct tty *, int)); /* Set hardware state. */ int (*t_param) __P((struct tty *, struct termios *)); void *t_sc; /* XXX: net/if_sl.c:sl_softc. */ int t_column; /* Tty output column. */ int t_rocount, t_rocol; /* Tty. */ int t_ififosize; /* Total size of upstream fifos. */ int t_ihiwat; /* High water mark for input. */ int t_ilowat; /* Low water mark for input. */ speed_t t_ispeedwat; /* t_ispeed override for watermarks. */ int t_ohiwat; /* High water mark for output. */ int t_olowat; /* Low water mark for output. */ speed_t t_ospeedwat; /* t_ospeed override for watermarks. */ int t_gen; /* Generation number. 
*/ SLIST_ENTRY(tty) t_list; /* Global chain of ttys for pstat(8) */ }; #define t_cc t_termios.c_cc #define t_cflag t_termios.c_cflag #define t_iflag t_termios.c_iflag #define t_ispeed t_termios.c_ispeed #define t_lflag t_termios.c_lflag #define t_min t_termios.c_min #define t_oflag t_termios.c_oflag #define t_ospeed t_termios.c_ospeed #define t_time t_termios.c_time -#define TTIPRI 25 /* Sleep priority for tty reads. */ -#define TTOPRI 26 /* Sleep priority for tty writes. */ +#define TTIPRI (PSOCK + 1) /* Sleep priority for tty reads. */ +#define TTOPRI (PSOCK + 2) /* Sleep priority for tty writes. */ /* * User data unfortunately has to be copied through buffers on the way to * and from clists. The buffers are on the stack so their sizes must be * fairly small. */ #define IBUFSIZ 384 /* Should be >= max value of MIN. */ #define OBUFSIZ 100 #ifndef TTYHOG #define TTYHOG 1024 #endif #ifdef _KERNEL #define TTMAXHIWAT roundup(2048, CBSIZE) #define TTMINHIWAT roundup(100, CBSIZE) #define TTMAXLOWAT 256 #define TTMINLOWAT 32 #endif /* These flags are kept in t_state. */ #define TS_SO_OLOWAT 0x00001 /* Wake up when output <= low water. */ #define TS_ASYNC 0x00002 /* Tty in async I/O mode. */ #define TS_BUSY 0x00004 /* Draining output. */ #define TS_CARR_ON 0x00008 /* Carrier is present. */ #define TS_FLUSH 0x00010 /* Outq has been flushed during DMA. */ #define TS_ISOPEN 0x00020 /* Open has completed. */ #define TS_TBLOCK 0x00040 /* Further input blocked. */ #define TS_TIMEOUT 0x00080 /* Wait for output char processing. */ #define TS_TTSTOP 0x00100 /* Output paused. */ #ifdef notyet #define TS_WOPEN 0x00200 /* Open in progress. */ #endif #define TS_XCLUDE 0x00400 /* Tty requires exclusivity. */ /* State for intra-line fancy editing work. */ #define TS_BKSL 0x00800 /* State for lowercase \ work. */ #define TS_CNTTB 0x01000 /* Counting tab width, ignore FLUSHO. */ #define TS_ERASE 0x02000 /* Within a \.../ for PRTRUB. */ #define TS_LNCH 0x04000 /* Next character is literal. 
*/ #define TS_TYPEN 0x08000 /* Retyping suspended input (PENDIN). */ #define TS_LOCAL (TS_BKSL | TS_CNTTB | TS_ERASE | TS_LNCH | TS_TYPEN) /* Extras. */ #define TS_CAN_BYPASS_L_RINT 0x010000 /* Device in "raw" mode. */ #define TS_CONNECTED 0x020000 /* Connection open. */ #define TS_SNOOP 0x040000 /* Device is being snooped on. */ #define TS_SO_OCOMPLETE 0x080000 /* Wake up when output completes. */ #define TS_ZOMBIE 0x100000 /* Connection lost. */ /* Hardware flow-control-invoked bits. */ #define TS_CAR_OFLOW 0x200000 /* For MDMBUF (XXX handle in driver). */ #ifdef notyet #define TS_CTS_OFLOW 0x400000 /* For CCTS_OFLOW. */ #define TS_DSR_OFLOW 0x800000 /* For CDSR_OFLOW. */ #endif /* Character type information. */ #define ORDINARY 0 #define CONTROL 1 #define BACKSPACE 2 #define NEWLINE 3 #define TAB 4 #define VTAB 5 #define RETURN 6 struct speedtab { int sp_speed; /* Speed. */ int sp_code; /* Code. */ }; /* Modem control commands (driver). */ #define DMSET 0 #define DMBIS 1 #define DMBIC 2 #define DMGET 3 /* Flags on a character passed to ttyinput. */ #define TTY_CHARMASK 0x000000ff /* Character mask */ #define TTY_QUOTE 0x00000100 /* Character quoted */ #define TTY_ERRORMASK 0xff000000 /* Error mask */ #define TTY_FE 0x01000000 /* Framing error */ #define TTY_PE 0x02000000 /* Parity error */ #define TTY_OE 0x04000000 /* Overrun error */ #define TTY_BI 0x08000000 /* Break condition */ /* Is tp controlling terminal for p? */ #define isctty(p, tp) \ ((p)->p_session == (tp)->t_session && (p)->p_flag & P_CONTROLT) /* Is p in background of tp? */ #define isbackground(p, tp) \ (isctty((p), (tp)) && (p)->p_pgrp != (tp)->t_pgrp) /* Unique sleep addresses. 
*/ #define TSA_CARR_ON(tp) ((void *)&(tp)->t_rawq) #define TSA_HUP_OR_INPUT(tp) ((void *)&(tp)->t_rawq.c_cf) #define TSA_OCOMPLETE(tp) ((void *)&(tp)->t_outq.c_cl) #define TSA_OLOWAT(tp) ((void *)&(tp)->t_outq) #define TSA_PTC_READ(tp) ((void *)&(tp)->t_outq.c_cf) #define TSA_PTC_WRITE(tp) ((void *)&(tp)->t_rawq.c_cl) #define TSA_PTS_READ(tp) ((void *)&(tp)->t_canq) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_TTYS); #endif extern struct tty *constty; /* Temporary virtual console. */ int b_to_q __P((char *cp, int cc, struct clist *q)); void catq __P((struct clist *from, struct clist *to)); void clist_alloc_cblocks __P((struct clist *q, int ccmax, int ccres)); void clist_free_cblocks __P((struct clist *q)); int getc __P((struct clist *q)); void ndflush __P((struct clist *q, int cc)); char *nextc __P((struct clist *q, char *cp, int *c)); void nottystop __P((struct tty *tp, int rw)); int putc __P((int c, struct clist *q)); int q_to_b __P((struct clist *q, char *cp, int cc)); void termioschars __P((struct termios *t)); int tputchar __P((int c, struct tty *tp)); int ttcompat __P((struct tty *tp, u_long com, caddr_t data, int flag)); int ttioctl __P((struct tty *tp, u_long com, void *data, int flag)); int ttread __P((struct tty *tp, struct uio *uio, int flag)); void ttrstrt __P((void *tp)); int ttsetcompat __P((struct tty *tp, u_long *com, caddr_t data, struct termios *term)); void ttsetwater __P((struct tty *tp)); int ttspeedtab __P((int speed, struct speedtab *table)); int ttstart __P((struct tty *tp)); void ttwakeup __P((struct tty *tp)); int ttwrite __P((struct tty *tp, struct uio *uio, int flag)); void ttwwakeup __P((struct tty *tp)); void ttyblock __P((struct tty *tp)); void ttychars __P((struct tty *tp)); int ttycheckoutq __P((struct tty *tp, int wait)); int ttyclose __P((struct tty *tp)); void ttyflush __P((struct tty *tp, int rw)); void ttyfree __P((struct tty *tp)); void ttyinfo __P((struct tty *tp)); int ttyinput __P((int c, struct tty *tp)); int 
ttylclose __P((struct tty *tp, int flag)); struct tty *ttymalloc __P((struct tty *tp)); int ttymodem __P((struct tty *tp, int flag)); int ttyopen __P((dev_t device, struct tty *tp)); int ttypoll __P((dev_t dev, int events, struct proc *p)); int ttyread __P((dev_t dev, struct uio *uio, int flag)); void ttyregister __P((struct tty *tp)); int ttysleep __P((struct tty *tp, void *chan, int pri, char *wmesg, int timeout)); int ttywait __P((struct tty *tp)); int ttywrite __P((dev_t dev, struct uio *uio, int flag)); int unputc __P((struct clist *q)); #endif /* _KERNEL */ #endif /* !_SYS_TTY_H_ */ Index: head/sys/sys/user.h =================================================================== --- head/sys/sys/user.h (revision 72375) +++ head/sys/sys/user.h (revision 72376) @@ -1,175 +1,172 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)user.h 8.2 (Berkeley) 9/23/93 * $FreeBSD$ */ #ifndef _SYS_USER_H_ #define _SYS_USER_H_ #include #ifndef _KERNEL /* stuff that *used* to be included by user.h, or is now needed */ #include #include #include #include #include #include #include /* XXX */ #include /* XXX */ #include /* XXX */ #include /* XXX */ #include /* XXX */ #endif /* !_KERNEL */ #ifndef _SYS_RESOURCEVAR_H_ #include #endif #ifndef _SYS_SIGNALVAR_H_ #include #endif /* * KERN_PROC subtype ops return arrays of selected proc structure entries: * * When adding new fields to this structure, ALWAYS add them at the end * and decrease the size of the spare field by the amount of space that * you are adding. Byte aligned data should be added to the ki_sparestring * space; other entries should be added to the ki_spare space. Always * verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE when you are * done. If you change the size of this structure, many programs will stop * working! Once you have added the new field, you will need to add code * to initialize it in two places: kern/kern_proc.c in the function * fill_kinfo_proc and in lib/libkvm/kvm_proc.c in the function kvm_proclist. 
*/ #ifdef __alpha__ -#define KINFO_PROC_SIZE 904 /* the correct size for kinfo_proc */ +#define KINFO_PROC_SIZE 912 /* the correct size for kinfo_proc */ #else -#define KINFO_PROC_SIZE 640 /* the correct size for kinfo_proc */ +#define KINFO_PROC_SIZE 644 /* the correct size for kinfo_proc */ #endif #define WMESGLEN 8 /* size of returned wchan message */ #define MTXNAMELEN 8 /* size of returned mutex name */ struct kinfo_proc { int ki_structsize; /* size of this structure */ struct pargs *ki_args; /* address of command arguments */ struct proc *ki_paddr; /* address of proc */ struct user *ki_addr; /* kernel virtual addr of u-area */ struct vnode *ki_tracep; /* pointer to trace file */ struct vnode *ki_textvp; /* pointer to executable file */ struct filedesc *ki_fd; /* pointer to open file info */ struct vmspace *ki_vmspace; /* pointer to kernel vmspace struct */ void *ki_wchan; /* sleep address */ pid_t ki_pid; /* Process identifier */ pid_t ki_ppid; /* parent process id */ pid_t ki_pgid; /* process group id */ pid_t ki_tpgid; /* tty process group id */ pid_t ki_sid; /* Process session ID */ pid_t ki_tsid; /* Terminal session ID */ short ki_jobc; /* job control counter */ udev_t ki_tdev; /* controlling tty dev */ sigset_t ki_siglist; /* Signals arrived but not delivered */ sigset_t ki_sigmask; /* Current signal mask */ sigset_t ki_sigignore; /* Signals being ignored */ sigset_t ki_sigcatch; /* Signals being caught by user */ uid_t ki_uid; /* effective user id */ uid_t ki_ruid; /* Real user id */ uid_t ki_svuid; /* Saved effective user id */ gid_t ki_rgid; /* Real group id */ gid_t ki_svgid; /* Saved effective group id */ short ki_ngroups; /* number of groups */ gid_t ki_groups[NGROUPS]; /* groups */ vm_size_t ki_size; /* virtual size */ segsz_t ki_rssize; /* current resident set size in pages */ segsz_t ki_swrss; /* resident set size before last swap */ segsz_t ki_tsize; /* text size (pages) XXX */ segsz_t ki_dsize; /* data size (pages) XXX */ segsz_t ki_ssize; /* 
stack size (pages) */ u_short ki_xstat; /* Exit status for wait & stop signal */ u_short ki_acflag; /* Accounting flags */ fixpt_t ki_pctcpu; /* %cpu for process during ki_swtime */ u_int ki_estcpu; /* Time averaged value of ki_cpticks */ u_int ki_slptime; /* Time since last blocked */ u_int ki_swtime; /* Time swapped in or out */ u_int64_t ki_runtime; /* Real time in microsec */ struct timeval ki_start; /* starting time */ struct timeval ki_childtime; /* time used by process children */ long ki_flag; /* P_* flags */ long ki_kiflag; /* KI_* flags (below) */ int ki_traceflag; /* Kernel trace points */ - u_char ki_priority; /* Process priority */ - u_char ki_usrpri; /* User-priority based on p_cpu */ - u_char ki_nativepri; /* Priority before propogation */ char ki_stat; /* S* process status */ char ki_nice; /* Process "nice" value */ char ki_lock; /* Process lock (prevent swap) count */ char ki_rqindex; /* Run queue index */ u_char ki_oncpu; /* Which cpu we are on */ u_char ki_lastcpu; /* Last cpu we were on */ char ki_comm[MAXCOMLEN+1]; /* command name */ char ki_wmesg[WMESGLEN+1]; /* wchan message */ char ki_login[MAXLOGNAME+1]; /* setlogin name */ char ki_mtxname[MTXNAMELEN+1]; /* mutex name */ char ki_sparestrings[102]; /* spare string space */ - struct rtprio ki_rtprio; /* Realtime priority */ struct rusage ki_rusage; /* process rusage statistics */ long ki_sflag; /* PS_* flags */ - long ki_spare[24]; /* spare constants */ + struct priority ki_pri; /* process priority */ + long ki_spare[25]; /* spare constants */ }; void fill_kinfo_proc __P((struct proc *, struct kinfo_proc *)); /* ki_sessflag values */ #define KI_CTTY 0x00000001 /* controlling tty vnode active */ #define KI_SLEADER 0x00000002 /* session leader */ #define KI_MTXBLOCK 0x00000004 /* proc blocked on mutex ki_mtxname */ /* * Per process structure containing data that isn't needed in core * when the process isn't running (esp. when swapped out). 
* This structure may or may not be at the same kernel address * in all processes. */ struct user { struct pcb u_pcb; struct sigacts u_sigacts; /* p_sigacts points here (use it!) */ struct pstats u_stats; /* p_stats points here (use it!) */ /* * Remaining fields only for core dump and/or ptrace-- * not valid at other times! */ struct kinfo_proc u_kproc; /* proc + eproc */ struct md_coredump u_md; /* machine dependent glop */ }; #endif Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 72375) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 72376) @@ -1,1076 +1,1076 @@ /* * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED proc0.p_ucred #define DEBUG 1 static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t, int, int, int, int)); static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *)); static int readblock __P((struct buf *, daddr_t)); #ifdef DEBUG #include int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs_daddr_t rlbn; ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP]; int error, cg, snaploc, indiroff, numblks; int i, size, base, len, loc, inoblkcnt; int blksperindir, flag = mp->mnt_flag; void *space; struct fs *copy_fs, *fs = VFSTOUFS(mp)->um_fs; struct proc *p = CURPROC; struct inode *devip, *ip, *xp; struct buf *bp, *nbp, *ibp; struct vnode *vp, *devvp; struct nameidata nd; struct mount *wrtmp; struct dinode *dip; struct vattr vat; struct cg *cgp; /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. 
*/ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE); error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); vput(nd.ni_dvp); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); return (error); } vp = nd.ni_vp; ip = VTOI(vp); devvp = ip->i_devvp; devip = VTOI(devvp); /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, B_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); ip->i_flag |= IN_CHANGE | IN_UPDATE; if ((error = readblock(bp, numblks - 1)) != 0) goto out; bawrite(bp); /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. 
We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks. Also allocate shadow copies * for each of the indirect blocks. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); if (error) goto out; copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); bdwrite(ibp); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), fs->fs_bsize, p->p_ucred, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate shadow blocks to copy all of the other snapshot inodes * so that we will be able to expunge them from this snapshot. */ for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) { blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc])); for (i = 0; i < inoblkcnt; i++) if (inoblks[i] == blkno) break; if (i == inoblkcnt) { inoblks[inoblkcnt++] = blkno; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate copies for the superblock and its summary information. */ if ((error = VOP_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp))) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. 
*/ if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0) goto out; /* * All allocations are done, so we can now snapshot the system. * * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); vfs_write_suspend(vp->v_mount); if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } /* * First, copy all the cylinder group maps. All the unallocated * blocks are marked BLK_NOCOPY so that the snapshot knows that * it need not copy them if they are later written. */ len = howmany(fs->fs_fpg, fs->fs_frag); for (cg = 0; cg < fs->fs_ncg; cg++) { error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, KERNCRED, &bp); if (error) { brelse(bp); goto out1; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); error = EIO; goto out1; } error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(bp); brelse(nbp); goto out1; } bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); base = cg * fs->fs_fpg / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < NDADDR) { for ( ; loc < NDADDR; loc++) { if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) continue; ip->i_db[loc] = BLK_NOCOPY; } } error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) { brelse(bp); goto out1; } indiroff = (base + loc - NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) { brelse(bp); goto out1; } indiroff = 0; } if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) continue; if (((ufs_daddr_t *)(ibp->b_data))[indiroff] != 0) panic("ffs_snapshot: lost block"); ((ufs_daddr_t 
*)(ibp->b_data))[indiroff] = BLK_NOCOPY; } bqrelse(bp); ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); } /* * Snapshot the superblock and its summary information. */ if ((error = VOP_BALLOC(vp, SBOFF, SBSIZE, KERNCRED, 0, &nbp)) != 0) goto out1; copy_fs = (struct fs *)(nbp->b_data + blkoff(fs, SBOFF)); bcopy(fs, copy_fs, fs->fs_sbsize); copy_fs->fs_clean = 1; if (fs->fs_sbsize < SBSIZE) bzero(&nbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize], SBSIZE - fs->fs_sbsize); nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize) - 1; size = fs->fs_bsize; space = fs->fs_csp; for (loc = 0; loc <= len; loc++) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; if (loc == len) { readblock(nbp, blkno + loc); size = fs->fs_cssize - loc * fs->fs_bsize; } bcopy(space, nbp->b_data, size); space = (char *)space + size; nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); } /* * Copy the shadow blocks for the snapshot inodes so that * the copies can can be expunged. */ for (loc = 0; loc < inoblkcnt; loc++) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; readblock(nbp, inoblks[loc]); nbp->b_flags |= B_VALIDSUSPWRT; bdwrite(nbp); } /* * Copy allocation information from other snapshots and then * expunge them from the view of the current snapshot. */ for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) { /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. 
*/ if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0) goto out1; blksperindir = 1; lbn = -NDADDR; len = numblks - NDADDR; rlbn = NDADDR; for (i = 0; len > 0 && i < NIADDR; i++) { error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn, rlbn, len, blksperindir); if (error) goto out1; blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } /* * Set copied snapshot inode to be a zero length file. */ blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number)); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; dip = (struct dinode *)nbp->b_data + ino_to_fsbo(fs, xp->i_number); dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t)); nbp->b_flags |= B_VALIDSUSPWRT; bdwrite(nbp); } /* * Copy all indirect blocks to their shadows (allocated above) * to avoid deadlock in ffs_copyonwrite. */ for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); if (error) goto out1; copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno)); bqrelse(ibp); error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno), fs->fs_bsize, p->p_ucred, 0, &nbp); if (error) goto out1; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); if (error) { brelse(nbp); goto out1; } bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize); bqrelse(ibp); nbp->b_flags |= B_VALIDSUSPWRT; bawrite(nbp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. 
*/ fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_copyonwrite != 0) panic("ffs_snapshot: %d already on list", ip->i_number); if (devip->i_copyonwrite == 0) { devvp->v_flag |= VCOPYONWRITE; devip->i_copyonwrite = ip; } else { for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; ) xp = xp->i_copyonwrite; xp->i_copyonwrite = ip; } vp->v_flag |= VSYSTEM; /* * Resume operation on filesystem. */ out1: vfs_write_resume(vp->v_mount); vn_start_write(NULL, &wrtmp, V_WAIT); out: mp->mnt_flag = flag; (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); if (error) vput(vp); else VOP_UNLOCK(vp, 0, p); vn_finished_write(wrtmp); return (error); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs_daddr_t blkno; int lbn; int rlbn; int remblks; int blksperindir; { int subblksperindir, error, last, num, i; struct indir indirs[NIADDR + 2]; ufs_daddr_t *bap; struct buf *bp; struct fs *fs; if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) panic("indiracct: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ fs = VTOI(cancelvp)->i_fs; bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. 
*/ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); if (snapvp != cancelvp) { bap = (ufs_daddr_t *)bp->b_data; } else { MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); } error = snapacct(snapvp, &bap[0], &bap[last]); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: if (snapvp != cancelvp) bqrelse(bp); else FREE(bap, M_DEVBUF); return (error); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int snapacct(vp, oldblkp, lastblkp) struct vnode *vp; ufs_daddr_t *oldblkp, *lastblkp; { struct inode *ip = VTOI(vp); struct fs *fs = ip->i_fs; ufs_daddr_t lbn, blkno, *blkp; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < NDADDR) { blkp = &ip->i_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) return (error); blkp = &((ufs_daddr_t *)(ibp->b_data)) [(lbn - NDADDR) % NINDIR(fs)]; } if (*blkp != 0) panic("snapacct: bad block"); *blkp = BLK_SNAP; if (lbn >= NDADDR) { ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); } } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; /* * Find snapshot in incore list. 
*/ for (xp = VTOI(ip->i_devvp); xp; xp = xp->i_copyonwrite) if (xp->i_copyonwrite == ip) break; if (xp == 0) printf("ffs_snapgone: lost snapshot vnode %d\n", ip->i_number); else vrele(ITOV(ip)); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip, *xp; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs_daddr_t blkno, dblk; int error, snaploc, loc, last; ip = VTOI(vp); fs = ip->i_fs; /* * Delete snapshot inode from superblock. Keep list dense. */ for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } /* * Delete from incore list. * Clear copy-on-write flag if last snapshot. */ devvp = ip->i_devvp; for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) { if (xp->i_copyonwrite != ip) continue; xp->i_copyonwrite = ip->i_copyonwrite; ip->i_copyonwrite = 0; break; } if (xp == 0) printf("ffs_snapremove: lost snapshot vnode %d\n", ip->i_number); if (VTOI(devvp)->i_copyonwrite == 0) devvp->v_flag &= ~VCOPYONWRITE; /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). 
*/ for (blkno = 1; blkno < NDADDR; blkno++) { dblk = ip->i_db[blkno]; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || (dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(ip, dblk, fs->fs_bsize))) ip->i_db[blkno] = 0; } for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) { error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); if (error) continue; if ((last = fs->fs_size - blkno) > NINDIR(fs)) last = NINDIR(fs); for (loc = 0; loc < last; loc++) { dblk = ((ufs_daddr_t *)(ibp->b_data))[loc]; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP || (dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(ip, dblk, fs->fs_bsize))) ((ufs_daddr_t *)(ibp->b_data))[loc] = 0; } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. 
*/ int ffs_snapblkfree(freeip, bno, size) struct inode *freeip; ufs_daddr_t bno; long size; { struct buf *ibp, *cbp, *savedcbp = 0; struct fs *fs = freeip->i_fs; struct proc *p = CURPROC; struct inode *ip; struct vnode *vp; ufs_daddr_t lbn, blkno; int indiroff = 0, error = 0, claimedblk = 0; lbn = fragstoblks(fs, bno); for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) { vp = ITOV(ip); /* * Lookup block being written. */ if (lbn < NDADDR) { blkno = ip->i_db[lbn]; } else { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); p->p_flag &= ~P_COWINPROGRESS; VOP_UNLOCK(vp, 0, p); if (error) break; indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ switch (blkno) { /* * If the snapshot has already copied the block (default), * or does not care about the block, it is not needed. */ default: case BLK_NOCOPY: if (lbn >= NDADDR) bqrelse(ibp); continue; /* * No previous snapshot claimed the block, so it will be * freed and become a BLK_NOCOPY (don't care) for us. */ case BLK_SNAP: if (claimedblk) panic("snapblkfree: inconsistent block type"); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (lbn < NDADDR) { ip->i_db[lbn] = BLK_NOCOPY; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { ((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } VOP_UNLOCK(vp, 0, p); continue; /* * A block that we map is being freed. If it has not been * claimed yet, we will claim or copy it (below). */ case 0: claimedblk = 1; break; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. 
*/ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %d lbn %d from inum %d\n", "Grabonremove: snapino", ip->i_number, lbn, freeip->i_number); #endif vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); if (lbn < NDADDR) { ip->i_db[lbn] = bno; } else { ((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } ip->i_blocks += btodb(size); ip->i_flag |= IN_CHANGE | IN_UPDATE; VOP_UNLOCK(vp, 0, p); return (1); } if (lbn >= NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); p->p_flag &= ~P_COWINPROGRESS; VOP_UNLOCK(vp, 0, p); if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%d lbn %d for inum %d size %ld to blkno %d\n", "Copyonremove: snapino ", ip->i_number, lbn, freeip->i_number, size, cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(cbp, lbn)) != 0) break; savedcbp = cbp; } if (savedcbp) bawrite(savedcbp); /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ return (error); } /* * Associate snapshot files when mounting. 
*/ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs = ump->um_fs; struct proc *p = CURPROC; struct inode *ip, **listtailp; struct vnode *vp; int error, snaploc, loc; listtailp = &VTOI(ump->um_devvp)->i_copyonwrite; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) return; if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if ((ip->i_flags & SF_SNAPSHOT) == 0) { printf("ffs_snapshot_mount: non-snapshot inode %d\n", fs->fs_snapinum[snaploc]); vput(vp); for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } if (ip->i_copyonwrite != 0) panic("ffs_snapshot_mount: %d already on list", ip->i_number); *listtailp = ip; listtailp = &ip->i_copyonwrite; vp->v_flag |= VSYSTEM; VOP_UNLOCK(vp, 0, p); ump->um_devvp->v_flag |= VCOPYONWRITE; } } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct inode *devip = VTOI(ump->um_devvp); struct inode *xp; while ((xp = devip->i_copyonwrite) != 0) { devip->i_copyonwrite = xp->i_copyonwrite; xp->i_copyonwrite = 0; if (xp->i_effnlink > 0) vrele(ITOV(xp)); } ump->um_devvp->v_flag &= ~VCOPYONWRITE; } /* * Check for need to copy block that is about to be written, * copying the block if necessary. 
*/ int ffs_copyonwrite(ap) struct vop_copyonwrite_args /* { struct vnode *a_vp; struct buf *a_bp; } */ *ap; { struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp; struct fs *fs = VTOI(bp->b_vp)->i_fs; struct proc *p = CURPROC; struct inode *ip; struct vnode *vp; ufs_daddr_t lbn, blkno; int indiroff, error = 0; lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); if (p->p_flag & P_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) { vp = ITOV(ip); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in VOP_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We have to * be able to do the VOP_BALLOC without blocking, otherwise * we may get in a deadlock with another process also * trying to allocate. If we find outselves unable to * get the buffer lock, we unlock the snapshot vnode, * sleep briefly, and try again. */ retry: vn_lock(vp, LK_SHARED | LK_RETRY, p); if (lbn < NDADDR) { blkno = ip->i_db[lbn]; } else { p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp); p->p_flag &= ~P_COWINPROGRESS; if (error) { VOP_UNLOCK(vp, 0, p); if (error != EWOULDBLOCK) break; - tsleep(vp, p->p_usrpri, "nap", 1); + tsleep(vp, p->p_pri.pri_user, "nap", 1); goto retry; } indiroff = (lbn - NDADDR) % NINDIR(fs); blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef DIAGNOSTIC if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) { VOP_UNLOCK(vp, 0, p); continue; } /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. 
*/ p->p_flag |= P_COWINPROGRESS; error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp); p->p_flag &= ~P_COWINPROGRESS; VOP_UNLOCK(vp, 0, p); if (error) { if (error != EWOULDBLOCK) break; - tsleep(vp, p->p_usrpri, "nap", 1); + tsleep(vp, p->p_pri.pri_user, "nap", 1); goto retry; } #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %d lbn %d for ", ip->i_number, lbn); if (bp->b_vp == ap->a_vp) printf("fs metadata"); else printf("inum %d", VTOI(bp->b_vp)->i_number); printf(" lblkno %d to blkno %d\n", bp->b_lblkno, cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. */ if (savedcbp != 0) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(cbp, lbn)) != 0) break; savedcbp = cbp; } if (savedcbp) bawrite(savedcbp); return (error); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(bp, lbn) struct buf *bp; daddr_t lbn; { struct uio auio; struct iovec aiov; struct proc *p = CURPROC; struct inode *ip = VTOI(bp->b_vp); aiov.iov_base = bp->b_data; aiov.iov_len = bp->b_bcount; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); auio.uio_resid = bp->b_bcount; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = p; return (physio(ip->i_devvp->v_rdev, &auio, 0)); } Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 72375) +++ head/sys/vm/vm_glue.c (revision 72376) @@ -1,577 +1,577 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ #include "opt_rlimit.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits __P((void *)); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. 
*/ static void scheduler __P((void *)); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL) static void swapout __P((struct proc *)); int kernacc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & (~VM_PROT_ALL)) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } int useracc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; vm_map_entry_t save_hint; KASSERT((rw & (~VM_PROT_ALL)) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; /* * XXX - check separately to disallow access to user area and user * page tables - they are in the map. * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once * only used (as an end address) in trap.c. Use it as an end address * here too. This bogusness has spread. I just fixed where it was * used as a max in vm_mmap.c. */ if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS || (vm_offset_t) addr + len < (vm_offset_t) addr) { return (FALSE); } map = &curproc->p_vmspace->vm_map; vm_map_lock_read(map); /* * We save the map hint, and restore it. Useracc appears to distort * the map hint unnecessarily. 
*/ save_hint = map->hint; rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); map->hint = save_hint; vm_map_unlock_read(map); return (rv == TRUE); } void vslock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), FALSE); } void vsunlock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), TRUE); } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ void vm_fork(p1, p2, flags) register struct proc *p1, *p2; int flags; { register struct user *up; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { vmspace_unshare(p1); } } cpu_fork(p1, p2, flags); return; } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; p1->p_vmspace->vm_refcnt++; } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); pmap_pinit2(vmspace_pmap(p2->p_vmspace)); if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } pmap_new_proc(p2); up = p2->p_addr; /* * p_stats currently points at fields in the user struct * but not at &u, instead at p_addr. Copy parts of * p_stats; zero the rest of p_stats (statistics). * * If procsig->ps_refcnt is 1 and p2->p_sigacts is NULL we dont' need * to share sigacts, so we use the up->u_sigacts. 
*/ p2->p_stats = &up->u_stats; if (p2->p_sigacts == NULL) { if (p2->p_procsig->ps_refcnt != 1) printf ("PID:%d NULL sigacts with refcnt not 1!\n",p2->p_pid); p2->p_sigacts = &up->u_sigacts; up->u_sigacts = *p1->p_sigacts; } bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - (caddr_t) &up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t) &up->u_stats.pstat_endcopy - (caddr_t) &up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(p1, p2, flags); } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { register struct proc *p = udata; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } /* * Must be called with the proc struc mutex held. 
*/ void faultin(p) struct proc *p; { mtx_assert(&p->p_mtx, MA_OWNED); mtx_lock_spin(&sched_lock); if ((p->p_sflag & PS_INMEM) == 0) { ++p->p_lock; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); mtx_assert(&Giant, MA_OWNED); pmap_swapin_proc(p); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (p->p_stat == SRUN) { setrunqueue(p); } p->p_sflag |= PS_INMEM; /* undo the effect of setting SLOCK above */ --p->p_lock; } mtx_unlock_spin(&sched_lock); } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * Giant is still held at this point, to be released in tsleep. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { register struct proc *p; register int pri; struct proc *pp; int ppri; mtx_assert(&Giant, MA_OWNED); loop: if (vm_page_count_min()) { VM_WAIT; goto loop; } pp = NULL; ppri = INT_MIN; ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { mtx_lock_spin(&sched_lock); if (p->p_stat == SRUN && (p->p_sflag & (PS_INMEM | PS_SWAPPING)) == 0) { pri = p->p_swtime + p->p_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { pri -= p->p_nice * 8; } /* * if this process is higher priority and there is * enough space, then select this process instead of * the previous selection. */ if (pri > ppri) { pp = p; ppri = pri; } } mtx_unlock_spin(&sched_lock); } ALLPROC_LOCK(AP_RELEASE); /* * Nothing to do, back to sleep. */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "sched", 0); goto loop; } mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPINREQ; mtx_unlock_spin(&sched_lock); /* * We would like to bring someone in. (only if there is space). 
*/ PROC_LOCK(p); faultin(p); PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); p->p_swtime = 0; mtx_unlock_spin(&sched_lock); goto loop; } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, ""); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, ""); /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_procs(action) int action; { register struct proc *p; struct proc *outp, *outp2; int outpri, outpri2; int didswap = 0; outp = outp2 = NULL; outpri = outpri2 = INT_MIN; ALLPROC_LOCK(AP_SHARED); retry: LIST_FOREACH(p, &allproc, p_list) { struct vmspace *vm; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_TRACED|P_SYSTEM|P_WEXIT)) != 0) { PROC_UNLOCK(p); continue; } vm = p->p_vmspace; PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); if ((p->p_sflag & (PS_INMEM|PS_SWAPPING)) != PS_INMEM) { mtx_unlock_spin(&sched_lock); continue; } switch (p->p_stat) { default: mtx_unlock_spin(&sched_lock); continue; case SSLEEP: case SSTOP: /* * do not swapout a realtime process */ - if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) { + if (PRI_IS_REALTIME(p->p_pri.pri_class)) { mtx_unlock_spin(&sched_lock); continue; } /* * Do not swapout a process waiting on a critical * event of some kind. Also guarantee swap_idle_threshold1 * time in memory. 
*/ - if (((p->p_priority & 0x7f) < PSOCK) || + if (((p->p_pri.pri_level) < PSOCK) || (p->p_slptime < swap_idle_threshold1)) { mtx_unlock_spin(&sched_lock); continue; } /* * If the system is under memory stress, or if we are swapping * idle processes >= swap_idle_threshold2, then swap the process * out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (p->p_slptime < swap_idle_threshold2))) { mtx_unlock_spin(&sched_lock); continue; } mtx_unlock_spin(&sched_lock); ++vm->vm_refcnt; /* * do not swapout a process that is waiting for VM * data structures there is a possible deadlock. */ if (lockmgr(&vm->vm_map.lock, LK_EXCLUSIVE | LK_NOWAIT, (void *)0, curproc)) { vmspace_free(vm); continue; } vm_map_unlock(&vm->vm_map); /* * If the process has been asleep for awhile and had * most of its pages taken away already, swap it out. */ mtx_lock_spin(&sched_lock); if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (p->p_slptime > swap_idle_threshold2))) { mtx_unlock_spin(&sched_lock); swapout(p); vmspace_free(vm); didswap++; goto retry; } else mtx_unlock_spin(&sched_lock); } } ALLPROC_LOCK(AP_RELEASE); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. 
*/ if (didswap) wakeup(&proc0); } static void swapout(p) register struct proc *p; { #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); (void) splhigh(); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPING; if (p->p_stat == SRUN) remrunqueue(p); mtx_unlock_spin(&sched_lock); (void) spl0(); pmap_swapout_proc(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPING; p->p_swtime = 0; mtx_unlock_spin(&sched_lock); } #endif /* !NO_SWAPPING */ Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c (revision 72375) +++ head/sys/vm/vm_meter.c (revision 72376) @@ -1,360 +1,361 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ struct vmmeter cnt; static int maxslp = MAXSLP; /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. 
*/ static void loadav(struct loadavg *avg) { register int i, nrun; register struct proc *p; ALLPROC_LOCK(AP_SHARED); for (nrun = 0, p = LIST_FIRST(&allproc); p != 0; p = LIST_NEXT(p, p_list)) { switch (p->p_stat) { case SSLEEP: - if (p->p_priority > PZERO || p->p_slptime != 0) + if (p->p_pri.pri_level > PZERO || + p->p_slptime != 0) continue; /* FALLTHROUGH */ case SRUN: if ((p->p_flag & P_NOLOAD) != 0) continue; /* FALLTHROUGH */ case SIDL: nrun++; } } ALLPROC_LOCK(AP_RELEASE); for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; } void vmmeter() { if (time_second % 5 == 0) loadav(&averunnable); if (proc0.p_slptime > maxslp / 2) wakeup(&proc0); } SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, CTLFLAG_RW, &cnt.v_free_min, 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, CTLFLAG_RW, &cnt.v_free_target, 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, CTLFLAG_RW, &cnt.v_free_reserved, 0, ""); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, CTLFLAG_RW, &cnt.v_inactive_target, 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min, CTLFLAG_RW, &cnt.v_cache_min, 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max, CTLFLAG_RW, &cnt.v_cache_max, 0, ""); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, CTLFLAG_RW, &cnt.v_pageout_free_min, 0, ""); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, CTLFLAG_RW, &cnt.v_free_severe, 0, ""); SYSCTL_STRUCT(_vm, VM_LOADAVG, loadavg, CTLFLAG_RD, &averunnable, loadavg, "Machine loadaverage history"); static int vmtotal(SYSCTL_HANDLER_ARGS) { struct proc *p; struct vmtotal total, *totalp; vm_map_entry_t entry; vm_object_t object; vm_map_t map; int paging; totalp = &total; bzero(totalp, sizeof *totalp); /* * Mark all objects as inactive. */ TAILQ_FOREACH(object, &vm_object_list, object_list) vm_object_clear_flag(object, OBJ_ACTIVE); /* * Calculate process statistics. 
*/ ALLPROC_LOCK(AP_SHARED); LIST_FOREACH(p, &allproc, p_list) { if (p->p_flag & P_SYSTEM) continue; mtx_lock_spin(&sched_lock); switch (p->p_stat) { case 0: mtx_unlock_spin(&sched_lock); continue; case SMTX: case SSLEEP: case SSTOP: if (p->p_sflag & PS_INMEM) { - if (p->p_priority <= PZERO) + if (p->p_pri.pri_level <= PZERO) totalp->t_dw++; else if (p->p_slptime < maxslp) totalp->t_sl++; } else if (p->p_slptime < maxslp) totalp->t_sw++; if (p->p_slptime >= maxslp) { mtx_unlock_spin(&sched_lock); continue; } break; case SWAIT: totalp->t_sl++; continue; case SRUN: case SIDL: if (p->p_sflag & PS_INMEM) totalp->t_rq++; else totalp->t_sw++; if (p->p_stat == SIDL) { mtx_unlock_spin(&sched_lock); continue; } break; } mtx_unlock_spin(&sched_lock); /* * Note active objects. */ paging = 0; for (map = &p->p_vmspace->vm_map, entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || entry->object.vm_object == NULL) continue; vm_object_set_flag(entry->object.vm_object, OBJ_ACTIVE); paging |= entry->object.vm_object->paging_in_progress; } if (paging) totalp->t_pw++; } ALLPROC_LOCK(AP_RELEASE); /* * Calculate object memory usage statistics. 
*/ for (object = TAILQ_FIRST(&vm_object_list); object != NULL; object = TAILQ_NEXT(object, object_list)) { /* * devices, like /dev/mem, will badly skew our totals */ if (object->type == OBJT_DEVICE) continue; totalp->t_vm += object->size; totalp->t_rm += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { totalp->t_avm += object->size; totalp->t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ totalp->t_vmshr += object->size; totalp->t_rmshr += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { totalp->t_avmshr += object->size; totalp->t_armshr += object->resident_page_count; } } } totalp->t_free = cnt.v_free_count + cnt.v_cache_count; return (sysctl_handle_opaque(oidp, totalp, sizeof total, req)); } SYSCTL_PROC(_vm, VM_METER, vmmeter, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_swtch, CTLFLAG_RD, &cnt.v_swtch, 0, "Context switches"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_trap, CTLFLAG_RD, &cnt.v_trap, 0, "Traps"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_syscall, CTLFLAG_RD, &cnt.v_syscall, 0, "Syscalls"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_intr, CTLFLAG_RD, &cnt.v_intr, 0, "Hardware interrupts"); SYSCTL_UINT(_vm_stats_sys, OID_AUTO, v_soft, CTLFLAG_RD, &cnt.v_soft, 0, "Software interrupts"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vm_faults, CTLFLAG_RD, &cnt.v_vm_faults, 0, "VM faults"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cow_faults, CTLFLAG_RD, &cnt.v_cow_faults, 0, "COW faults"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cow_optim, CTLFLAG_RD, &cnt.v_cow_optim, 0, "Optimized COW faults"); 
SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_zfod, CTLFLAG_RD, &cnt.v_zfod, 0, "Zero fill"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_ozfod, CTLFLAG_RD, &cnt.v_ozfod, 0, "Optimized zero fill"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swapin, CTLFLAG_RD, &cnt.v_swapin, 0, "Swapin operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swapout, CTLFLAG_RD, &cnt.v_swapout, 0, "Swapout operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swappgsin, CTLFLAG_RD, &cnt.v_swappgsin, 0, "Swapin pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_swappgsout, CTLFLAG_RD, &cnt.v_swappgsout, 0, "Swapout pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodein, CTLFLAG_RD, &cnt.v_vnodein, 0, "Vnodein operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodeout, CTLFLAG_RD, &cnt.v_vnodeout, 0, "Vnodeout operations"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodepgsin, CTLFLAG_RD, &cnt.v_vnodepgsin, 0, "Vnodein pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vnodepgsout, CTLFLAG_RD, &cnt.v_vnodepgsout, 0, "Vnodeout pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_intrans, CTLFLAG_RD, &cnt.v_intrans, 0, "In transit page blocking"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_reactivated, CTLFLAG_RD, &cnt.v_reactivated, 0, "Reactivated pages"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLFLAG_RD, &cnt.v_pdwakeups, 0, "Pagedaemon wakeups"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pdpages, CTLFLAG_RD, &cnt.v_pdpages, 0, "Pagedaemon page scans"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_dfree, CTLFLAG_RD, &cnt.v_dfree, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pfree, CTLFLAG_RD, &cnt.v_pfree, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_tfree, CTLFLAG_RD, &cnt.v_tfree, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_page_size, CTLFLAG_RD, &cnt.v_page_size, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_page_count, CTLFLAG_RD, &cnt.v_page_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_reserved, CTLFLAG_RD, &cnt.v_free_reserved, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_target, CTLFLAG_RD, 
&cnt.v_free_target, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_min, CTLFLAG_RD, &cnt.v_free_min, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_free_count, CTLFLAG_RD, &cnt.v_free_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_wire_count, CTLFLAG_RD, &cnt.v_wire_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_active_count, CTLFLAG_RD, &cnt.v_active_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_inactive_target, CTLFLAG_RD, &cnt.v_inactive_target, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_inactive_count, CTLFLAG_RD, &cnt.v_inactive_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_count, CTLFLAG_RD, &cnt.v_cache_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_min, CTLFLAG_RD, &cnt.v_cache_min, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_cache_max, CTLFLAG_RD, &cnt.v_cache_max, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_pageout_free_min, CTLFLAG_RD, &cnt.v_pageout_free_min, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLFLAG_RD, &cnt.v_interrupt_free_min, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, ""); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_forks, CTLFLAG_RD, &cnt.v_forks, 0, "Number of fork() calls"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vforks, CTLFLAG_RD, &cnt.v_vforks, 0, "Number of vfork() calls"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_rforks, CTLFLAG_RD, &cnt.v_rforks, 0, "Number of rfork() calls"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_kthreads, CTLFLAG_RD, &cnt.v_kthreads, 0, "Number of fork() calls by kernel"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_forkpages, CTLFLAG_RD, &cnt.v_forkpages, 0, "VM pages affected by fork()"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_vforkpages, CTLFLAG_RD, &cnt.v_vforkpages, 0, "VM pages affected by vfork()"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_rforkpages, CTLFLAG_RD, &cnt.v_rforkpages, 0, "VM pages affected by rfork()"); SYSCTL_UINT(_vm_stats_vm, OID_AUTO, v_kthreadpages, CTLFLAG_RD, 
&cnt.v_kthreadpages, 0, "VM pages affected by fork() by kernel"); #if 0 SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_mask, CTLFLAG_RD, &page_mask, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_shift, CTLFLAG_RD, &page_shift, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, first_page, CTLFLAG_RD, &first_page, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, last_page, CTLFLAG_RD, &last_page, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, vm_page_bucket_count, CTLFLAG_RD, &vm_page_bucket_count, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, vm_page_hash_mask, CTLFLAG_RD, &vm_page_hash_mask, 0, ""); #endif