Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c	(revision 173360)
+++ head/sys/amd64/amd64/machdep.c	(revision 173361)
@@ -1,1912 +1,1912 @@
 /*-
  * Copyright (c) 2003 Peter Wemm.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atalk.h"
 #include "opt_atpic.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ipx.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_msgbuf.h"
 #include "opt_perfmon.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/clock.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #endif
 #include <ddb/ddb.h>
 
 #include <net/netisr.h>
 
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #include <machine/tss.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 
 #ifdef DEV_ATPIC
 #include <amd64/isa/icu.h>
 #else
 #include <machine/apicvar.h>
 #endif
 
 #include <isa/isareg.h>
 #include <isa/rtc.h>
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
 extern void dblfault_handler(void);
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void identify_cpu(void);
 extern void panicifcpuunsupported(void);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 static void cpu_startup(void *);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /*
  * User-mode segment selectors.  exec_setregs() loads _udatasel into the
  * data segment registers and the trapframe %ss, and _ucodesel into the
  * trapframe %cs; sendsig() also uses _ucodesel.  They are assigned
  * outside the part of the file visible here.
  */
 int	_udatasel, _ucodesel, _ucode32sel;
 
 /*
  * Starts out non-zero; cpu_est_clockrate() treats a non-zero value as
  * "still booting" and skips re-calibration.
  */
 int cold = 1;
 
 /*
  * Maxmem is one larger than the highest page of the physical address
  * space in use (computed by getmemsize()).  realmem is a copy of Maxmem
  * taken by cpu_startup().
  */
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 /*
  * Arrays of (start, end) physical address pairs filled in by
  * getmemsize().  A pair whose end is 0 terminates the list, which is why
  * two extra slots are reserved.
  */
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
 
 /* Handed to vm_ksubmap_init() by cpu_startup(). */
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 /* Descriptor-table register images; r_gdt is loaded with lgdt() at boot. */
 struct region_descriptor r_gdt, r_idt;
 
 /* Per-CPU data, one slot per possible CPU; slot 0 is set up by hammer_time(). */
 struct pcpu __pcpu[MAXCPU];
 
 /* Spin mutex, initialised in hammer_time() with MTX_SPIN | MTX_NOWITNESS. */
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	printf("usable memory = %ju (%ju MB)\n", ptoa((uintmax_t)physmem),
 	    ptoa((uintmax_t)physmem) / 1048576);
 	realmem = Maxmem;
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory  = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)cnt.v_free_count),
 	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	/*
 	 * Entered with both the process lock and the sigacts mutex held
 	 * (asserted below); both are dropped around the copyout() and
 	 * re-acquired before returning.
 	 */
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	/* Remember whether the interrupted code was on the signal stack. */
 	oonstack = sigonstack(regs->tf_rsp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	/* The trapframe is copied wholesale into the mcontext from mc_rdi on. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
 	fpstate_drop(td);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Place the frame at the top of the alternate signal stack. */
 		sp = td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/*
 		 * Stay on the current stack, leaving 128 bytes below %rsp
 		 * untouched (presumably the amd64 ABI red zone).
 		 */
 		sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks before touching user memory. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		/* NOTE(review): sigexit() is presumed not to return. */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Redirect the thread: stack at the new frame, %rip at PS_STRINGS
 	 * minus the size of the signal trampoline code, single-step flag
 	 * cleared, user code selector.
 	 */
 	regs->tf_rsp = (long)sfp;
 	regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
 	regs->tf_rflags &= ~PSL_T;
 	regs->tf_cs = _ucodesel;
 	/* Re-acquire the locks in the order the caller held them. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 int
 sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct proc *p = td->td_proc;
 	struct trapframe *regs;
 	const ucontext_t *ucp;
 	long rflags;
 	int cs, error, ret;
 	ksiginfo_t ksi;
 
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	rflags = ucp->uc_mcontext.mc_rflags;
 	/*
 	 * Don't allow users to change privileged or reserved flags.
 	 */
 	/*
 	 * XXX do allow users to change the privileged flag PSL_RF.
 	 * The cpu sets PSL_RF in tf_rflags for faults.  Debuggers
 	 * should sometimes set it there too.  tf_rflags is kept in
 	 * the signal context during signal handling and there is no
 	 * other place to remember it, so the PSL_RF bit may be
 	 * corrupted by the signal handler without us knowing.
 	 * Corruption of the PSL_RF bit at worst causes one more or
 	 * one less debugger trap, so allowing it is fairly harmless.
 	 */
 	if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
 		printf("sigreturn: rflags = 0x%lx\n", rflags);
 		return (EINVAL);
 	}
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 	cs = ucp->uc_mcontext.mc_cs;
 	if (!CS_SECURE(cs)) {
 		printf("sigreturn: cs = 0x%x\n", cs);
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGBUS;
 		ksi.ksi_code = BUS_OBJERR;
 		ksi.ksi_trapno = T_PROTFLT;
 		ksi.ksi_addr = (void *)regs->tf_rip;
 		trapsignal(td, &ksi);
 		return (EINVAL);
 	}
 
 	ret = set_fpcontext(td, &ucp->uc_mcontext);
 	if (ret != 0)
 		return (ret);
 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
 
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	td->td_sigmask = ucp->uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	td->td_pcb->pcb_flags |= PCB_FULLCTX;
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: reinterprets its argument block as
  * a struct sigreturn_args and forwards to sigreturn().
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args;
 
 	args = (struct sigreturn_args *)uap;
 	return (sigreturn(td, args));
 }
 #endif
 
 
 /*
  * Machine dependent boot() routine
  *
  * I haven't seen anything to put here yet
  * Possibly some stuff might be grafted back here from boot()
  */
 void
 cpu_boot(int howto)
 {
 	/* Intentionally empty: howto is ignored and nothing is done. */
 }
 
 /* Get current clock frequency for the given cpu id. */
 /*
  * Estimate the clock rate of CPU cpu_id from the TSC and store it in
  * *rate.
  *
  * Returns EINVAL when cpu_id has no pcpu entry or rate is NULL, and 0
  * otherwise.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t start, delta;
 	register_t saved_intr;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 
 	/* If we're booting, trust the rate calibrated moments ago. */
 	if (cold) {
 		*rate = tsc_freq;
 		return (0);
 	}
 
 #ifdef SMP
 	/* Schedule ourselves on the indicated cpu. */
 	thread_lock(curthread);
 	sched_bind(curthread, cpu_id);
 	thread_unlock(curthread);
 #endif
 
 	/* Sample the TSC across a short DELAY() with interrupts off. */
 	saved_intr = intr_disable();
 	start = rdtsc();
 	DELAY(1000);
 	delta = rdtsc() - start;
 	intr_restore(saved_intr);
 
 #ifdef SMP
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 #endif
 
 	/*
 	 * Scale the sample by 1000 (to Hz, presuming DELAY(1000) is a 1 ms
 	 * wait) and subtract 0.5% of the total.  Empirical testing has
 	 * shown that overhead in DELAY() works out to approximately this
 	 * value.  delta * 1000 - delta * 5 == delta * 995.
 	 */
 	*rate = delta * 995;
 	return (0);
 }
 
 /*
  * Shutdown the CPU as much as possible
  */
 void
 cpu_halt(void)
 {
 	/* Never returns: keep re-issuing hlt forever. */
 	while (1)
 		__asm__ ("hlt");
 }
 
 /*
  * Hook to idle the CPU when possible.  In the SMP case we default to
  * off because a halted cpu will not currently pick up a new thread in the
  * run queue until the next timer tick.  If turned on this will result in
  * approximately a 4.2% loss in real time performance in buildworld tests
  * (but improves user and sys times oddly enough), and saves approximately
  * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
  *
  * XXX we need to have a cpu mask of idle cpus and generate an IPI or
  * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
  * Then we can have our cake and eat it too.
  *
  * XXX I'm turning it on for SMP as well by default for now.  It seems to
  * help lock contention somewhat, and this is critical for HTT. -Peter
  */
 /*
  * Non-zero lets cpu_idle() take the halting path.  Settable as a loader
  * tunable and as the read/write sysctl machdep.cpu_idle_hlt.
  */
 static int	cpu_idle_hlt = 1;
 TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
     &cpu_idle_hlt, 0, "Idle loop HLT enable");
 
 /*
  * Default value of cpu_idle_hook.  cpu_idle() calls the hook after
  * disable_intr(), so this is the point where interrupts are re-enabled.
  */
 static void
 cpu_idle_default(void)
 {
 	/*
 	 * we must absolutely guarantee that hlt is the
 	 * absolute next instruction after sti or we
 	 * introduce a timing window.
 	 */
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Note that we have to be careful here to avoid a race between checking
  * sched_runnable() and actually halting.  If we don't do this, we may waste
  * the time between calling hlt and the next interrupt even though there
  * is a runnable process.
  */
 void
 cpu_idle(void)
 {
 
 #ifdef SMP
 	if (mp_grab_cpu_hlt())
 		return;
 #endif
 	/* Halting is disabled: return without idling. */
 	if (!cpu_idle_hlt)
 		return;
 
 	/*
 	 * Check for runnable work with interrupts off so that no wakeup
 	 * can slip in between the check and the halt.  The idle hook is
 	 * entered with interrupts still disabled.
 	 */
 	disable_intr();
 	if (!sched_runnable()) {
 		(*cpu_idle_hook)();
 		return;
 	}
 	enable_intr();
 }
 
 /* Other subsystems (e.g., ACPI) can hook this later. */
 void (*cpu_idle_hook)(void) = cpu_idle_default;
 
 /*
  * Clear registers on exec
  */
 /*
  * Reset the thread's user register state for a freshly exec'd image.
  *
  * td		thread being exec'd
  * entry	user address execution starts at (becomes %rip)
  * stack	initial user stack address (becomes %rdi; %rsp is derived
  *		from it)
  * ps_strings	not used here
  *
  * The FS/GS bases and the data segment registers are reset, the
  * trapframe is rebuilt from scratch, hardware debug registers are
  * cleared if they were in use, and any FPU state is dropped.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 	register_t saved_rflags;
 
 	/* Zero the user FS/GS bases in the MSRs and in the pcb together. */
 	critical_enter();
 	wrmsr(MSR_FSBASE, 0);
 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	critical_exit();
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_udatasel);
 	load_gs(_udatasel);
 	pcb->pcb_ds = _udatasel;
 	pcb->pcb_es = _udatasel;
 	pcb->pcb_fs = _udatasel;
 	pcb->pcb_gs = _udatasel;
 
 	/*
 	 * Capture the trace flag before the trapframe is wiped.  Reading
 	 * tf_rflags after the bzero() always yields 0, which silently
 	 * dropped PSL_T even though the assignment below tries to keep it.
 	 */
 	saved_rflags = regs->tf_rflags & PSL_T;
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = entry;
 	/* 16-byte align, then offset by 8 as at a function entry point. */
 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
 	regs->tf_rdi = stack;		/* argv */
 	regs->tf_rflags = PSL_USER | saved_rflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucodesel;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == PCPU_GET(curpcb)) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		pcb->pcb_flags &= ~PCB_DBREGS;
 	}
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 }
 
 /*
  * Turn on the CR0 bits the kernel relies on, leaving every other bit of
  * the current CR0 value as it is.
  */
 void
 cpu_setregs(void)
 {
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
 	 * BSP.  See the comments there about why we set them.
 	 */
 	load_cr0(rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM);
 }
 
 /*
  * Initialize amd64 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor table */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
 
 /* One TSS per possible CPU; hammer_time() points GPROC0_SEL at entry 0. */
 struct amd64tss common_tss[MAXCPU];
 
 /*
  * software prototypes -- in more palatable form
  *
  * Each entry is a positional initializer; the per-field comments give
  * the meaning of every slot.  The table is indexed by the G*_SEL
  * constants named before each entry, and hammer_time() converts the
  * entries into hardware descriptors in gdt[].
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor priority level */
 	0,			/* segment descriptor present */
 	0,			/* long */
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GCODE_SEL	1 Code Descriptor for kernel */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_KPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	1,			/* long */
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GDATA_SEL	2 Data Descriptor for kernel */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_KPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	1,			/* long */
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUCODE32_SEL	3 32 bit Code Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_UPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	0,			/* long */
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUDATA_SEL	4 32/64 bit Data Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	0,			/* long */
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUCODE_SEL	5 64 bit Code Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_UPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	1,			/* long */
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GPROC0_SEL	6 Proc 0 Tss Descriptor */
 {
 	0x0,			/* segment base address (set in hammer_time) */
 	sizeof(struct amd64tss)-1,/* length - size of the TSS, minus one */
 	SDT_SYSTSS,		/* segment type */
 	SEL_KPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	0,			/* long */
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* Actually, the TSS is a system descriptor which is double size */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor priority level */
 	0,			/* segment descriptor present */
 	0,			/* long */
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GUGS32_SEL	8 32 bit GS Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor priority level */
 	1,			/* segment descriptor present */
 	0,			/* long */
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 };
 
 /*
  * Fill in interrupt descriptor table slot idx as a present gate that
  * enters func through the kernel code selector.
  *
  * typ, dpl and ist are stored into the gate's type, privilege-level and
  * IST fields unchanged.
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
 {
 	struct gate_descriptor *gate = &idt[idx];
 	uintptr_t handler = (uintptr_t)func;
 
 	/* The handler address is split across the low and high offsets. */
 	gate->gd_looffset = handler;
 	gate->gd_hioffset = handler >> 16;
 
 	gate->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
 	gate->gd_ist = ist;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 }
 
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(dblfault),
 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
 
 void
 sdtossd(sd, ssd)
 	struct user_segment_descriptor *sd;
 	struct soft_segment_descriptor *ssd;
 {
 
 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 	ssd->ssd_type  = sd->sd_type;
 	ssd->ssd_dpl   = sd->sd_dpl;
 	ssd->ssd_p     = sd->sd_p;
 	ssd->ssd_long  = sd->sd_long;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran  = sd->sd_gran;
 }
 
 /*
  * Pack the software descriptor *ssd into the hardware user segment
  * descriptor *sd.  Base and limit are split into their low and high
  * parts; the attribute fields are copied across one for one.
  */
 void
 ssdtosd(struct soft_segment_descriptor *ssd,
     struct user_segment_descriptor *sd)
 {
 
 	/* Limit: low 16 bits, then the next 4. */
 	sd->sd_lolimit = ssd->ssd_limit & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 
 	/* Base: low 24 bits, then the next 8. */
 	sd->sd_lobase = ssd->ssd_base & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
 
 	/* Attributes. */
 	sd->sd_type = ssd->ssd_type;
 	sd->sd_dpl = ssd->ssd_dpl;
 	sd->sd_p = ssd->ssd_p;
 	sd->sd_long = ssd->ssd_long;
 	sd->sd_def32 = ssd->ssd_def32;
 	sd->sd_gran = ssd->ssd_gran;
 }
 
 /*
  * Pack the software descriptor *ssd into the hardware system segment
  * descriptor *sd.  Unlike ssdtosd(), the high part of the base keeps 40
  * bits, and the long and def32 attributes are not copied.
  */
 void
 ssdtosyssd(struct soft_segment_descriptor *ssd,
     struct system_segment_descriptor *sd)
 {
 
 	/* Limit: low 16 bits, then the next 4. */
 	sd->sd_lolimit = ssd->ssd_limit & 0xffff;
 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
 
 	/* Base: low 24 bits, then the next 40. */
 	sd->sd_lobase = ssd->ssd_base & 0xffffff;
 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
 
 	/* Attributes. */
 	sd->sd_type = ssd->ssd_type;
 	sd->sd_dpl = ssd->ssd_dpl;
 	sd->sd_p = ssd->ssd_p;
 	sd->sd_gran = ssd->ssd_gran;
 }
 
 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
 #include <isa/isavar.h>
 /*
  * Stub built only when DEV_ISA is defined and DEV_ATPIC is not: always
  * reports that no ISA interrupts are pending.
  */
 u_int
 isa_irq_pending(void)
 {
 
 	return (0);
 }
 #endif
 
 u_int basemem;
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * If we cannot accurately determine the physical memory map, then use
  * value from the 0xE801 call, and failing that, the RTC.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * XXX first should be vm_paddr_t.
  */
 /*
  * kmdp	preloaded kernel metadata, searched for the BIOS SMAP
  * first	first physical address past the kernel; pages in
  *		[0x100000, first) are kept out of phys_avail[]
  *
  * On return phys_avail[], dump_avail[], physmem, basemem and Maxmem are
  * set and the message buffer has been mapped at the top of the last
  * phys_avail[] chunk.
  */
 static void
 getmemsize(caddr_t kmdp, u_int64_t first)
 {
 	int i, off, physmap_idx, pa_indx, da_indx;
 	int overlap;
 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 	u_long physmem_tunable;
 	pt_entry_t *pte;
 	struct bios_smap *smapbase, *smap, *smapend;
 	u_int32_t smapsize;
 	quad_t dcons_addr, dcons_size;
 
 	bzero(physmap, sizeof(physmap));
 	basemem = 0;
 	physmap_idx = 0;
 
 	/*
 	 * get memory map from INT 15:E820, kindly supplied by the loader.
 	 *
 	 * subr_module.c says:
 	 * "Consumer may safely assume that size value precedes data."
 	 * ie: an int32_t immediately precedes smap.
 	 */
 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
 	    MODINFO_METADATA | MODINFOMD_SMAP);
 	if (smapbase == NULL)
 		panic("No BIOS smap info from loader!");
 
 	smapsize = *((u_int32_t *)smapbase - 1);
 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
 
 	for (smap = smapbase; smap < smapend; smap++) {
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
 			    smap->type, smap->base, smap->length);
 
 		if (smap->type != SMAP_TYPE_MEMORY)
 			continue;
 
 		if (smap->length == 0)
 			continue;
 
 		/*
 		 * Skip a region that starts below the end of any region
 		 * already recorded.  The skip has to apply to the SMAP
 		 * loop, so record the hit and continue from outside the
 		 * scan; a bare "continue" here would only advance the scan
 		 * itself and the region would still be merged below.
 		 */
 		overlap = 0;
 		for (i = 0; i <= physmap_idx; i += 2) {
 			if (smap->base < physmap[i + 1]) {
 				if (boothowto & RB_VERBOSE)
 					printf(
 	"Overlapping or non-monotonic memory region, ignoring second region\n");
 				overlap = 1;
 				break;
 			}
 		}
 		if (overlap)
 			continue;
 
 		/* Contiguous with the current region: just extend it. */
 		if (smap->base == physmap[physmap_idx + 1]) {
 			physmap[physmap_idx + 1] += smap->length;
 			continue;
 		}
 
 		/*
 		 * Check for room before advancing, so that physmap_idx
 		 * always names a valid pair; every use of
 		 * physmap[physmap_idx + 1] below depends on that.
 		 */
 		if (physmap_idx + 2 == PHYSMAP_SIZE) {
 			printf(
 		"Too many segments in the physical address map, giving up\n");
 			break;
 		}
 		physmap_idx += 2;
 		physmap[physmap_idx] = smap->base;
 		physmap[physmap_idx + 1] = smap->base + smap->length;
 	}
 
 	/*
 	 * Find the 'base memory' segment for SMP
 	 */
 	basemem = 0;
 	for (i = 0; i <= physmap_idx; i += 2) {
 		if (physmap[i] == 0x00000000) {
 			basemem = physmap[i + 1] / 1024;
 			break;
 		}
 	}
 	if (basemem == 0)
 		panic("BIOS smap did not include a basemem segment!");
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1] / 1024);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
 	 * in the system.
 	 */
 	if (Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 */
 	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= 0x100000 && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_N;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/*
 			 * dump_avail[] also records the pages blocked out
 			 * above (kernel, dcons); only pages that failed the
 			 * test are left out of it.
 			 */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa; /* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
 
 	/* Map the message buffer. */
 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 		    off);
 }
 
 /*
  * Early machine-dependent bootstrap; sets up per-CPU data for CPU 0.
  *
  * modulep:  address of the preload metadata; KERNBASE is added to it
  *           before it is used.
  * physfree: address (before KERNBASE is added) of memory not yet in use.
  *           thread0's kernel stack is carved from it here and the
  *           advanced value is passed on to getmemsize().
  *
  * Returns the address of thread0's pcb, which locore uses as the
  * location of the kernel stack.
  */
 u_int64_t
 hammer_time(u_int64_t modulep, u_int64_t physfree)
 {
 	caddr_t kmdp;
 	int gsel_tss, x;
 	struct pcpu *pc;
 	u_int64_t msr;
 	char *env;
 
 	/* Carve thread0's kernel stack out of physfree and zero it. */
 	thread0.td_kstack = physfree + KERNBASE;
 	bzero((void *)thread0.td_kstack, KSTACK_PAGES * PAGE_SIZE);
 	physfree += KSTACK_PAGES * PAGE_SIZE;
 	/* The pcb occupies the very top of that kernel stack. */
 	thread0.td_pcb = (struct pcb *)
 	   (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 
 	/* Locate the preloaded kernel's metadata and pull boot settings. */
 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
 	preload_bootstrap_relocate(KERNBASE);
 	kmdp = preload_search_by_type("elf kernel");
 	if (kmdp == NULL)
 		kmdp = preload_search_by_type("elf64 kernel");
 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 	kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
 #ifdef DDB
 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 #endif
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * make gdt memory segments
 	 */
 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
 
 	/*
 	 * GPROC0_SEL and the slot after it are skipped here and filled in
 	 * below as one system segment descriptor (presumably because that
 	 * descriptor is twice the size of an ordinary one).
 	 */
 	for (x = 0; x < NGDT; x++) {
 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
 			ssdtosd(&gdt_segs[x], &gdt[x]);
 	}
 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (long) gdt;
 	lgdt(&r_gdt);
 	pc = &__pcpu[0];
 
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
 	/* Initialize CPU 0's pcpu area and point it at thread0. */
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 	PCPU_SET(tssp, &common_tss[0]);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
 
 	/* exceptions */
 	/* Default every vector to the reserved handler, then override. */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (long) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 #ifdef DEV_ISA
 #ifdef DEV_ATPIC
 	elcr_probe();
 	atpic_startup();
 #else
 	/* Reset and mask the atpics and leave them shut down. */
 	atpic_reset();
 
 	/*
 	 * Point the ICU spurious interrupt vectors at the APIC spurious
 	 * interrupt handler.
 	 */
 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
 #endif
 #else
 #error "have you forgotten the isa device?";
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 
 	identify_cpu();		/* Final stage of CPU initialization */
 	initializecpu();	/* Initialize CPU registers */
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	common_tss[0].tss_rsp0 = thread0.td_kstack + \
 	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb);
 	/* Ensure the stack is aligned to 16 bytes */
 	common_tss[0].tss_rsp0 &= ~0xFul;
 	PCPU_SET(rsp0, common_tss[0].tss_rsp0);
 
 	/* doublefault stack space, runs on ist1 */
 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
 
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss);
 
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	ltr(gsel_tss);
 
 	/* Set up the fast syscall stuff */
 	msr = rdmsr(MSR_EFER) | EFER_SCE;
 	wrmsr(MSR_EFER, msr);
 	wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
 	      ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
 	wrmsr(MSR_STAR, msr);
 	wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
 
 	/* Size physical memory, then finish the memory-dependent tunables. */
 	getmemsize(kmdp, physfree);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 	fpuinit();
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0; /* XXXKSE */
 	thread0.td_pcb->pcb_cr3 = KPML4phys;
 	thread0.td_frame = &proc0_tf;
 
         env = getenv("kernelname");
 	if (env != NULL)
 		strlcpy(kernelname, env, sizeof(kernelname));
 
 	/* Location of kernel stack for locore */
 	return ((u_int64_t)thread0.td_pcb);
 }
 
 /*
  * Machine-dependent part of per-CPU data initialization: the ACPI id is
  * preset to all ones.  cpuid and size are not used here.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffffU;
 }
 
 /*
  * Enter a spinlock section for the current thread.  On the outermost
  * entry interrupts are disabled and the previous flags are remembered;
  * every entry bumps the nesting count and enters a critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self = curthread;
 
 	if (self->td_md.md_spinlock_count == 0) {
 		/* Outermost level: remember the flags we are replacing. */
 		self->td_md.md_saved_flags = intr_disable();
 	}
 	self->td_md.md_spinlock_count += 1;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread.  The critical section
  * is exited first; when the nesting count drops to zero the interrupt
  * flags saved by spinlock_enter() are restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self = curthread;
 
 	critical_exit();
 	if (--self->td_md.md_spinlock_count == 0) {
 		/* Outermost level released: put the saved flags back. */
 		intr_restore(self->td_md.md_saved_flags);
 	}
 }
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() calls this so that a
  * backtrace can start at the function that caused entry into the
  * debugger: the context lives in the trapframe, but the trace is driven
  * from the PCB.  Only what a backtrace needs is filled in.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Where execution was and where its frame chain starts. */
 	pcb->pcb_rip = tf->tf_rip;
 	pcb->pcb_rbp = tf->tf_rbp;
 	pcb->pcb_rbx = tf->tf_rbx;
 
 	pcb->pcb_r12 = tf->tf_r12;
 	pcb->pcb_r13 = tf->tf_r13;
 	pcb->pcb_r14 = tf->tf_r14;
 	pcb->pcb_r15 = tf->tf_r15;
 
 	/*
 	 * With a non-zero privilege level in %cs the saved %rsp is used
 	 * as is; otherwise the stack pointer is derived from the address
 	 * just past the trapframe.
 	 */
 	if (ISPL(tf->tf_cs))
 		pcb->pcb_rsp = tf->tf_rsp;
 	else
 		pcb->pcb_rsp = (long)(tf + 1) - 8;
 }
 
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 	td->td_frame->tf_rip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_rflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_rflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Copy the register values saved in td's trapframe out into *regs.
  * Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 
 	/* Instruction pointer, flags, stack pointer and selectors. */
 	regs->r_rip = frame->tf_rip;
 	regs->r_cs = frame->tf_cs;
 	regs->r_rflags = frame->tf_rflags;
 	regs->r_rsp = frame->tf_rsp;
 	regs->r_ss = frame->tf_ss;
 
 	/* Legacy general-purpose registers. */
 	regs->r_rax = frame->tf_rax;
 	regs->r_rbx = frame->tf_rbx;
 	regs->r_rcx = frame->tf_rcx;
 	regs->r_rdx = frame->tf_rdx;
 	regs->r_rsi = frame->tf_rsi;
 	regs->r_rdi = frame->tf_rdi;
 	regs->r_rbp = frame->tf_rbp;
 
 	/* r8 through r15. */
 	regs->r_r8  = frame->tf_r8;
 	regs->r_r9  = frame->tf_r9;
 	regs->r_r10 = frame->tf_r10;
 	regs->r_r11 = frame->tf_r11;
 	regs->r_r12 = frame->tf_r12;
 	regs->r_r13 = frame->tf_r13;
 	regs->r_r14 = frame->tf_r14;
 	regs->r_r15 = frame->tf_r15;
 
 	return (0);
 }
 
 /*
  * Load td's trapframe from *regs.
  *
  * The requested rflags (low 32 bits only) must pass EFL_SECURE against
  * the current rflags and the requested %cs must pass CS_SECURE;
  * otherwise EINVAL is returned and nothing is modified.  On success
  * PCB_FULLCTX is set in the pcb flags and 0 is returned.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	register_t new_rflags = regs->r_rflags & 0xffffffff;
 
 	/* Validate before touching the trapframe. */
 	if (!EFL_SECURE(new_rflags, frame->tf_rflags))
 		return (EINVAL);
 	if (!CS_SECURE(regs->r_cs))
 		return (EINVAL);
 
 	/* Instruction pointer, flags, stack pointer and selectors. */
 	frame->tf_rip = regs->r_rip;
 	frame->tf_cs = regs->r_cs;
 	frame->tf_rflags = new_rflags;
 	frame->tf_rsp = regs->r_rsp;
 	frame->tf_ss = regs->r_ss;
 
 	/* Legacy general-purpose registers. */
 	frame->tf_rax = regs->r_rax;
 	frame->tf_rbx = regs->r_rbx;
 	frame->tf_rcx = regs->r_rcx;
 	frame->tf_rdx = regs->r_rdx;
 	frame->tf_rsi = regs->r_rsi;
 	frame->tf_rdi = regs->r_rdi;
 	frame->tf_rbp = regs->r_rbp;
 
 	/* r8 through r15. */
 	frame->tf_r8  = regs->r_r8;
 	frame->tf_r9  = regs->r_r9;
 	frame->tf_r10 = regs->r_r10;
 	frame->tf_r11 = regs->r_r11;
 	frame->tf_r12 = regs->r_r12;
 	frame->tf_r13 = regs->r_r13;
 	frame->tf_r14 = regs->r_r14;
 	frame->tf_r15 = regs->r_r15;
 
 	td->td_pcb->pcb_flags |= PCB_FULLCTX;
 	return (0);
 }
 
 /* XXX check all this stuff! */
 /* externalize from sv_xmm */
 /*
  * Convert a saved FPU image (*sv_xmm) into the exported struct fpreg
  * layout.  *fpregs is zeroed first, so anything not copied reads as 0.
  */
 static void
 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
 {
 	struct envxmm *out_env, *in_env;
 	int reg;
 
 	out_env = (struct envxmm *)&fpregs->fpr_env;
 	in_env = &sv_xmm->sv_env;
 
 	/* Start from a clean destination. */
 	bzero(fpregs, sizeof(*fpregs));
 
 	/* Environment: control, status, tag, opcode, pointers, MXCSR. */
 	out_env->en_cw = in_env->en_cw;
 	out_env->en_sw = in_env->en_sw;
 	out_env->en_tw = in_env->en_tw;
 	out_env->en_opcode = in_env->en_opcode;
 	out_env->en_rip = in_env->en_rip;
 	out_env->en_rdp = in_env->en_rdp;
 	out_env->en_mxcsr = in_env->en_mxcsr;
 	out_env->en_mxcsr_mask = in_env->en_mxcsr_mask;
 
 	/* Eight FPU registers, 10 bytes copied for each. */
 	for (reg = 0; reg < 8; reg++) {
 		bcopy(sv_xmm->sv_fp[reg].fp_acc.fp_bytes,
 		    fpregs->fpr_acc[reg], 10);
 	}
 
 	/* Sixteen SSE registers, 16 bytes each. */
 	for (reg = 0; reg < 16; reg++) {
 		bcopy(sv_xmm->sv_xmm[reg].xmm_bytes,
 		    fpregs->fpr_xacc[reg], 16);
 	}
 }
 
 /* internalize from fpregs into sv_xmm */
 /*
  * Convert an exported struct fpreg (*fpregs) into the saved FPU image
  * layout (*sv_xmm).
  *
  * Both the MXCSR value and the MXCSR mask are limited to the bits in
  * cpu_mxcsr_mask, so a caller-supplied image cannot carry MXCSR bits
  * outside that mask into the saved state.
  */
 static void
 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
 {
 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
 	int i;
 
 	/* fpregs -> pcb */
 	/* FPU control/status */
 	penv_xmm->en_cw = penv_fpreg->en_cw;
 	penv_xmm->en_sw = penv_fpreg->en_sw;
 	penv_xmm->en_tw = penv_fpreg->en_tw;
 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
 	penv_xmm->en_rip = penv_fpreg->en_rip;
 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
 	/*
 	 * Strip MXCSR bits outside cpu_mxcsr_mask, exactly as
 	 * set_fpcontext() does: restoring an image with reserved MXCSR
 	 * bits set raises a general protection fault.
 	 */
 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr & cpu_mxcsr_mask;
 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
 
 	/* FPU registers */
 	for (i = 0; i < 8; ++i)
 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
 
 	/* SSE registers */
 	for (i = 0; i < 16; ++i)
 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
 }
 
 /* externalize from td->pcb */
 /*
  * Export the FPU state saved in td's pcb into *fpregs.
  * Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct savefpu *saved = &td->td_pcb->pcb_save;
 
 	fill_fpregs_xmm(saved, fpregs);
 	return (0);
 }
 
 /* internalize to td->pcb */
 /*
  * Import *fpregs into the FPU state saved in td's pcb.
  * Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct savefpu *saved = &td->td_pcb->pcb_save;
 
 	set_fpregs_xmm(fpregs, saved);
 	return (0);
 }
 
 /*
  * Get machine context.
  */
 /*
  * Fill *mcp from td's trapframe and FPU state.
  *
  * With GET_MC_CLEAR_RET set in flags, rax and rdx are reported as 0 and
  * PSL_C is cleared in the reported rflags; otherwise they are copied
  * from the trapframe.  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct trapframe *frame = td->td_frame;
 
 	/* sigonstack() is evaluated with the current process locked. */
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_rsp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	mcp->mc_r15 = frame->tf_r15;
 	mcp->mc_r14 = frame->tf_r14;
 	mcp->mc_r13 = frame->tf_r13;
 	mcp->mc_r12 = frame->tf_r12;
 	mcp->mc_r11 = frame->tf_r11;
 	mcp->mc_r10 = frame->tf_r10;
 	mcp->mc_r9  = frame->tf_r9;
 	mcp->mc_r8  = frame->tf_r8;
 	mcp->mc_rdi = frame->tf_rdi;
 	mcp->mc_rsi = frame->tf_rsi;
 	mcp->mc_rbp = frame->tf_rbp;
 	mcp->mc_rbx = frame->tf_rbx;
 	mcp->mc_rcx = frame->tf_rcx;
 
 	if ((flags & GET_MC_CLEAR_RET) != 0) {
 		/* Report a cleared return value and carry flag. */
 		mcp->mc_rflags = frame->tf_rflags & ~PSL_C;
 		mcp->mc_rax = 0;
 		mcp->mc_rdx = 0;
 	} else {
 		mcp->mc_rflags = frame->tf_rflags;
 		mcp->mc_rax = frame->tf_rax;
 		mcp->mc_rdx = frame->tf_rdx;
 	}
 
 	mcp->mc_rip = frame->tf_rip;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_rsp = frame->tf_rsp;
 	mcp->mc_ss = frame->tf_ss;
 	mcp->mc_len = sizeof(*mcp);
 
 	get_fpcontext(td, mcp);
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *tp;
 	long rflags;
 	int ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
 	    (tp->tf_rflags & ~PSL_USERCHANGE);
 	ret = set_fpcontext(td, mcp);
 	if (ret != 0)
 		return (ret);
 	tp->tf_r15 = mcp->mc_r15;
 	tp->tf_r14 = mcp->mc_r14;
 	tp->tf_r13 = mcp->mc_r13;
 	tp->tf_r12 = mcp->mc_r12;
 	tp->tf_r11 = mcp->mc_r11;
 	tp->tf_r10 = mcp->mc_r10;
 	tp->tf_r9  = mcp->mc_r9;
 	tp->tf_r8  = mcp->mc_r8;
 	tp->tf_rdi = mcp->mc_rdi;
 	tp->tf_rsi = mcp->mc_rsi;
 	tp->tf_rbp = mcp->mc_rbp;
 	tp->tf_rbx = mcp->mc_rbx;
 	tp->tf_rdx = mcp->mc_rdx;
 	tp->tf_rcx = mcp->mc_rcx;
 	tp->tf_rax = mcp->mc_rax;
 	tp->tf_rip = mcp->mc_rip;
 	tp->tf_rflags = rflags;
 	tp->tf_rsp = mcp->mc_rsp;
 	tp->tf_ss = mcp->mc_ss;
 	td->td_pcb->pcb_flags |= PCB_FULLCTX;
 	return (0);
 }
 
 /*
  * Record td's FPU state in *mcp: fpugetregs() stores the registers into
  * mc_fpstate and its return value becomes mc_ownedfp; mc_fpformat is
  * whatever fpuformat() reports.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 	struct savefpu *dest = (struct savefpu *)&mcp->mc_fpstate;
 
 	mcp->mc_ownedfp = fpugetregs(td, dest);
 	mcp->mc_fpformat = fpuformat();
 }
 
 /*
  * Install the FPU state described by *mcp for td.
  *
  * Returns 0 without doing anything for _MC_FPFMT_NODEV, EINVAL for any
  * format other than _MC_FPFMT_XMM or for an unknown mc_ownedfp value,
  * and 0 after dropping or loading the state otherwise.
  */
 static int
 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct savefpu *image;
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	if (mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 
 	switch (mcp->mc_ownedfp) {
 	case _MC_FPOWNED_NONE:
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 		break;
 	case _MC_FPOWNED_FPU:
 	case _MC_FPOWNED_PCB:
 		/*
 		 * XXX we violate the dubious requirement that fpusetregs()
 		 * be called with interrupts disabled.
 		 * XXX obsolete on trap-16 systems?
 		 */
 		image = (struct savefpu *)&mcp->mc_fpstate;
 		/* Keep only the MXCSR bits in cpu_mxcsr_mask. */
 		image->sv_env.en_mxcsr &= cpu_mxcsr_mask;
 		fpusetregs(td, image);
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Discard td's FPU state.  Runs with interrupts disabled: fpudrop() is
  * called when td is the pcpu fpcurthread, and PCB_FPUINITDONE is then
  * cleared in the current thread's pcb flags.
  */
 void
 fpstate_drop(struct thread *td)
 {
 	register_t saved_intr;
 
 	saved_intr = intr_disable();
 	if (PCPU_GET(fpcurthread) == td) {
 		fpudrop();
 	}
 	/*
 	 * XXX force a full drop of the fpu.  The call above only drops
 	 * it if we owned it.
 	 *
 	 * XXX fpugetregs()'s semantics of doing a full drop are not
 	 * great.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of fpugetregs()... perhaps we
 	 * just have too many layers.
 	 */
 	curthread->td_pcb->pcb_flags &= ~PCB_FPUINITDONE;
 	intr_restore(saved_intr);
 }
 
 /*
  * Report debug registers in *dbregs.  With td == NULL the values are
  * read from the hardware; otherwise they come from td's pcb.  Slots
  * 4, 5 and 8 through 15 are always reported as 0.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *src;
 	int slot;
 
 	if (td != NULL) {
 		src = td->td_pcb;
 		dbregs->dr[0] = src->pcb_dr0;
 		dbregs->dr[1] = src->pcb_dr1;
 		dbregs->dr[2] = src->pcb_dr2;
 		dbregs->dr[3] = src->pcb_dr3;
 		dbregs->dr[6] = src->pcb_dr6;
 		dbregs->dr[7] = src->pcb_dr7;
 	} else {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 	}
 
 	/* Slots with no value to report. */
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	for (slot = 8; slot <= 15; slot++)
 		dbregs->dr[slot] = 0;
 
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.  Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP or a general protection fault right here.
 		 * Upper bits of dr6 and dr7 must not be set
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (td->td_frame->tf_cs == _ucode32sel &&
 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
 				return (EINVAL);
 		}
 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
 			return (EINVAL);
 
 		pcb = td->td_pcb;
 
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		pcb->pcb_flags |= PCB_DBREGS;
 	}
 
 	return (0);
 }
 
 /*
  * Load zero into the hardware debug registers dr0-dr3, dr6 and dr7.
  * dr7 is written first so that the control bits are off before the
  * address registers change.
  */
 void
 reset_dbregs(void)
 {
 
 	load_dr7(0);	/* Turn off the control bits first */
 	load_dr0(0);
 	load_dr1(0);
 	load_dr2(0);
 	load_dr3(0);
 	load_dr6(0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only
  * available as macros calling inlined functions, thus cannot be
  * called from the debugger.
  *
  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
  */
 
 #undef inb
 #undef outb
 
 /* silence compiler warnings */
 u_char inb(u_int);
 void outb(u_int, u_char);
 
 /*
  * Read one byte from I/O port `port' and return it.  Out-of-line version
  * so that it can be called from the debugger.
  */
 u_char
 inb(u_int port)
 {
 	u_char	data;
 	/*
 	 * We use %%dx and not %1 here because i/o is done at %dx and not at
 	 * %edx, while gcc generates inferior code (movw instead of movl)
 	 * if we tell it to load (u_short) port.
 	 */
 	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
 	return (data);
 }
 
 /*
  * Write the byte `data' to I/O port `port'.  Out-of-line version so that
  * it can be called from the debugger.
  */
 void
 outb(u_int port, u_char data)
 {
 	u_char	al;
 	/*
 	 * Use an unnecessary assignment to help gcc's register allocator.
 	 * This makes a large difference for gcc-1.40 and a tiny difference
 	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
 	 * best results.  gcc-2.6.0 can't handle this.
 	 */
 	al = data;
 	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
 }
 
 #endif /* KDB */
Index: head/sys/amd64/amd64/pmap.c
===================================================================
--- head/sys/amd64/amd64/pmap.c	(revision 173360)
+++ head/sys/amd64/amd64/pmap.c	(revision 173361)
@@ -1,3463 +1,3465 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	In addition to hardware address maps, this
  *	module is called upon to provide software-use-only
  *	maps which may or may not be stored in the same
  *	form as hardware maps.  These pseudo-maps are
  *	used to store intermediate results from copy
  *	operations to and from address spaces.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_msgbuf.h"
 #include "opt_pmap.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #if defined(DIAGNOSTIC)
 #define PMAP_DIAGNOSTIC
 #endif
 
 #if !defined(PMAP_DIAGNOSTIC)
 #define PMAP_INLINE __inline
 #else
 #define PMAP_INLINE
 #endif
 
 #define PV_STATS
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 
 static int nkpt;
 static int ndmpdp;
 static vm_paddr_t dmaplimit;
 vm_offset_t kernel_vm_end;
 pt_entry_t pg_nx;
 
 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
 
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 pt_entry_t *CMAP1 = 0;
 caddr_t CADDR1 = 0;
 struct msgbuf *msgbufp = 0;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
 
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq,
 		vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free);
 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     vm_page_t *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 		vm_offset_t va);
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m);
 
 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags);
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags);
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
                 vm_page_t* free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
 /*
  * Move the kernel virtual free pointer to the next
  * 2MB.  This is used to help improve performance
  * by using a large (2MB) page for much of the kernel
  * (.text, .data, .bss)
  */
 static vm_offset_t
 pmap_kmem_choose(vm_offset_t addr)
 {
 	/* Round addr up to the next multiple of NBPDR. */
 	const vm_offset_t low_bits = NBPDR - 1;
 
 	return ((addr + low_bits) & ~low_bits);
 }
 
 /********************/
 /* Inline functions */
 /********************/
 
 /* Return a non-clipped PD index for a given VA */
 static __inline vm_pindex_t
 pmap_pde_pindex(vm_offset_t va)
 {
 	/* No masking: every address bit above PDRSHIFT is kept. */
 	vm_pindex_t pindex = va >> PDRSHIFT;
 
 	return (pindex);
 }
 
 
 /* Return various clipped indexes for a given VA */
 /* Index of va's entry within a page table (NPTEPGSHIFT bits wide). */
 static __inline vm_pindex_t
 pmap_pte_index(vm_offset_t va)
 {
 	const u_long slot_mask = (1ul << NPTEPGSHIFT) - 1;
 
 	return ((va >> PAGE_SHIFT) & slot_mask);
 }
 
 /* Index of va's entry within a page directory (NPDEPGSHIFT bits wide). */
 static __inline vm_pindex_t
 pmap_pde_index(vm_offset_t va)
 {
 	const u_long slot_mask = (1ul << NPDEPGSHIFT) - 1;
 
 	return ((va >> PDRSHIFT) & slot_mask);
 }
 
 /* Index of va's entry within a PDP page (NPDPEPGSHIFT bits wide). */
 static __inline vm_pindex_t
 pmap_pdpe_index(vm_offset_t va)
 {
 	const u_long slot_mask = (1ul << NPDPEPGSHIFT) - 1;
 
 	return ((va >> PDPSHIFT) & slot_mask);
 }
 
 /* Index of va's entry within the PML4 page (NPML4EPGSHIFT bits wide). */
 static __inline vm_pindex_t
 pmap_pml4e_index(vm_offset_t va)
 {
 	const u_long slot_mask = (1ul << NPML4EPGSHIFT) - 1;
 
 	return ((va >> PML4SHIFT) & slot_mask);
 }
 
 /* Return a pointer to the PML4 slot that corresponds to a VA */
 static __inline pml4_entry_t *
 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 {
 
 	/* A null pmap has no PML4 to index. */
 	if (pmap == NULL)
 		return (NULL);
 	return (pmap->pm_pml4 + pmap_pml4e_index(va));
 }
 
 /* Return a pointer to the PDP slot that corresponds to a VA */
 static __inline pdp_entry_t *
 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
 {
 	/* Frame held in the PML4 entry, reached through PHYS_TO_DMAP. */
 	pdp_entry_t *pdp_page = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
 
 	return (pdp_page + pmap_pdpe_index(va));
 }
 
 /* Return a pointer to the PDP slot that corresponds to a VA */
 static __inline pdp_entry_t *
 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 {
 	pml4_entry_t *upper = pmap_pml4e(pmap, va);
 
 	/* No PML4 slot, or the slot is not valid: nothing below it. */
 	if (upper == NULL)
 		return (NULL);
 	if ((*upper & PG_V) == 0)
 		return (NULL);
 	return (pmap_pml4e_to_pdpe(upper, va));
 }
 
 /* Return a pointer to the PD slot that corresponds to a VA */
 static __inline pd_entry_t *
 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
 {
 	/* Frame held in the PDP entry, reached through PHYS_TO_DMAP. */
 	pd_entry_t *pd_page = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
 
 	return (pd_page + pmap_pde_index(va));
 }
 
 /*
  * Return a pointer to the PD entry in pmap that covers va, or NULL if
  * any higher level (pmap, PML4 entry, PDP entry) is missing or invalid.
  */
 static __inline pd_entry_t *
 pmap_pde(pmap_t pmap, vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe != NULL && (*pdpe & PG_V) != 0)
 		return (pmap_pdpe_to_pde(pdpe, va));
 	return (NULL);
 }
 
 /*
  * Given a PD entry, return a pointer to the PT entry for va inside the
  * page table page that entry references.  The page is reached through
  * the direct map; the caller must know that *pde is valid and is not a
  * large-page (PG_PS) mapping.
  */
 static __inline pt_entry_t *
 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
 {
 	pt_entry_t *pt_page;
 
 	pt_page = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 	return (pt_page + pmap_pte_index(va));
 }
 
 /*
  * Return a pointer to the PT entry in pmap that maps va, or NULL if
  * the walk hits a missing or invalid level.  When the PD entry is a
  * large-page (PG_PS) mapping, the PD entry itself is returned cast to
  * a PT entry pointer (compat with i386 pmap_pte()).
  */
 static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (pde != NULL && (*pde & PG_V) != 0) {
 		if ((*pde & PG_PS) == 0)
 			return (pmap_pde_to_pte(pde, va));
 		return ((pt_entry_t *)pde);
 	}
 	return (NULL);
 }
 
 
 /*
  * Like pmap_pte(), but additionally copies the PD entry that was walked
  * into *ptepde.  *ptepde is written only when the PD entry exists and
  * is valid; on a NULL return it is left untouched.
  */
 static __inline pt_entry_t *
 pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde)
 {
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (pde != NULL && (*pde & PG_V) != 0) {
 		*ptepde = *pde;
 		if ((*pde & PG_PS) == 0)
 			return (pmap_pde_to_pte(pde, va));
 		/* Large page: hand back the PDE itself (i386 compat). */
 		return ((pt_entry_t *)pde);
 	}
 	return (NULL);
 }
 
 
 /*
  * Return the address, within the PTmap window, of the PT entry for va.
  * The page number of va is clipped to the combined width of all four
  * page-table index fields before being used as an index.
  */
 PMAP_INLINE pt_entry_t *
 vtopte(vm_offset_t va)
 {
 	u_int64_t idx, mask;
 
 	mask = (1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
 	    NPML4EPGSHIFT)) - 1;
 	idx = (va >> PAGE_SHIFT) & mask;
 	return (PTmap + idx);
 }
 
 /*
  * Return the address, within the PDmap window, of the PD entry for va.
  * The 2MB-page number of va is clipped to the combined width of the
  * PD, PDP and PML4 index fields before being used as an index.
  */
 static __inline pd_entry_t *
 vtopde(vm_offset_t va)
 {
 	u_int64_t idx, mask;
 
 	mask = (1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1;
 	idx = (va >> PDRSHIFT) & mask;
 	return (PDmap + idx);
 }
 
 /*
  * Carve n pages off the front of the region starting at *firstaddr:
  * zero them, advance *firstaddr past them, and return the address of
  * the first page.  The address is used directly as a pointer for the
  * bzero().
  */
 static u_int64_t
 allocpages(vm_paddr_t *firstaddr, int n)
 {
 	u_int64_t base;
 
 	base = *firstaddr;
 	*firstaddr += n * PAGE_SIZE;
 	bzero((void *)base, n * PAGE_SIZE);
 	return (base);
 }
 
 /*
  * Build the initial kernel page tables out of pages taken from
  * *firstaddr (via allocpages(), which also advances *firstaddr).
  *
  * Sets KPTphys, KPML4phys, KPDPphys, KPDphys, DMPDPphys, DMPDphys,
  * ndmpdp and dmaplimit.  The statement order below matters: later
  * loops deliberately overwrite some entries written by earlier ones.
  */
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
 	int i;
 
 	/* Allocate pages */
 	KPTphys = allocpages(firstaddr, NKPT);
 	KPML4phys = allocpages(firstaddr, 1);
 	KPDPphys = allocpages(firstaddr, NKPML4E);
 	KPDphys = allocpages(firstaddr, NKPDPE);
 
 	/*
 	 * Size the direct map: one PDP entry per NBPDP of physical memory
 	 * (rounded up), with a floor of four entries.
 	 */
 	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
 	DMPDPphys = allocpages(firstaddr, NDMPML4E);
 	DMPDphys = allocpages(firstaddr, ndmpdp);
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 
 	/* Fill in the underlying page table pages */
 	/*
 	 * Map from zero to physfree (the current *firstaddr) with 4K
 	 * entries.  Note: the entries are created with PG_RW set, so this
 	 * range is mapped read/write, not read-only.
 	 */
 	/* XXX not fully used, underneath 2M pages */
 	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
 		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
 		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
 	}
 
 	/* Now map the page tables at their location within PTmap */
 	for (i = 0; i < NKPT; i++) {
 		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
 		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
 	}
 
 	/* Map from zero to end of allocations under 2M pages */
 	/* This replaces some of the KPTphys entries above */
 	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
 		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
 		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
 	}
 
 	/* And connect up the PD to the PDP */
 	for (i = 0; i < NKPDPE; i++) {
 		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT);
 		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
 	}
 
 
 	/*
 	 * Now set up the direct map space using 2MB pages.  Entry i maps
 	 * physical address i << PDRSHIFT; the cast widens i before the
 	 * shift.
 	 */
 	for (i = 0; i < NPDEPG * ndmpdp; i++) {
 		((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT;
 		((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
 	}
 
 	/* And the direct map space's PDP */
 	for (i = 0; i < ndmpdp; i++) {
 		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT);
 		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
 	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
 	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
 
 	/* Connect the Direct Map slot up to the PML4 */
 	((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys;
 	((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U;
 
 	/* Connect the KVA slot up to the PML4 */
 	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
 	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On amd64 this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  *
  *	firstaddr points at the first free physical address; it is
  *	advanced by create_pagetables() as page table pages are taken.
  *	On return virtual_avail/virtual_end bound the free kernel VA, the
  *	new page tables are loaded into %cr3, kernel_pmap is initialized,
  *	and CMAP1/CADDR1, crashdumpmap and msgbufp have VA reserved.
  */
 void
 pmap_bootstrap(vm_paddr_t *firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
 
 	/*
 	 * Create an initial set of page tables to run the kernel in.
 	 */
 	create_pagetables(firstaddr);
 
 	/*
 	 * Free kernel VA starts just past the boot-time allocations,
 	 * rounded up by pmap_kmem_choose().
 	 */
 	virtual_avail = (vm_offset_t) KERNBASE + *firstaddr;
 	virtual_avail = pmap_kmem_choose(virtual_avail);
 
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 
 	/* XXX do %cr0 as well */
 	/* Enable global pages and page-size extensions, then switch tables. */
 	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
 	load_cr3(KPML4phys);
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys);
 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	nkpt = NKPT;
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 *
 	 * SYSMAP(c, p, v, n): set v to the current va (cast to type c) and
 	 * p to the current pte pointer, then advance va by n pages and pte
 	 * by n entries.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 	/*
 	 * CMAP1 is only used for the memory test.
 	 */
 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 
 	/*
 	 * Crashdump maps.
 	 */
 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 
 	/*
 	 * msgbufp is used to map the system message buffer.
 	 */
 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
 
 	/* Everything reserved above is no longer available. */
 	virtual_avail = va;
 
 	*CMAP1 = 0;
 
 	invltlb();
 
 	/* Initialize the PAT MSR. */
 	pmap_init_pat();
 }
 
 /*
  * Program the PAT MSR with the memory types this pmap relies on.
  * Panics if the CPU does not advertise PAT.
  */
 void
 pmap_init_pat(void)
 {
 	uint64_t pat;
 
 	/* A CPU without PAT is not supported here. */
 	if ((cpu_feature & CPUID_PAT) == 0)
 		panic("no PAT??");
 
 	pat = rdmsr(MSR_PAT);
 #ifdef PAT_WORKS
 	/*
 	 * Indices 0-3 keep their defaults (WB, WT, UC, UC-), index 4
 	 * becomes WP and index 5 becomes WC; 6 and 7 stay UC and UC-.
 	 */
 	pat = (pat & ~(PAT_MASK(4) | PAT_MASK(5))) |
 	    PAT_VALUE(4, PAT_WRITE_PROTECTED) |
 	    PAT_VALUE(5, PAT_WRITE_COMBINING);
 #else
 	/*
 	 * Intel errata make the upper four PAT entries unsafe to use, so
 	 * only index 2 is changed: it becomes WC instead of UC-.
 	 *
 	 *   Intel Pentium III Processor Specification Update
 	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
 	 * or Mode C Paging)
 	 *
 	 *   Intel Pentium IV  Processor Specification Update
 	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
 	 */
 	pat = (pat & ~PAT_MASK(2)) | PAT_VALUE(2, PAT_WRITE_COMBINING);
 #endif
 	wrmsr(MSR_PAT, pat);
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: an empty pv list
  *	with a zero entry count.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pv_list_count = 0;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init to set the limits on pv entries.
  */
 void
 pmap_init(void)
 {
 
 	/*
 	 * The default pv entry limit is derived from shpgperproc (itself
 	 * tunable), maxproc and the page count.  An explicit
 	 * vm.pmap.pv_entries tunable, when present, overrides that
 	 * default.  The high water mark is nine tenths of the final limit.
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = cnt.v_page_count + shpgperproc * maxproc;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	pv_entry_high_water = (pv_entry_max / 10) * 9;
 }
 
 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 
 /*
  * Handler for vm.pmap.pv_entry_max.  After a successful write, derive
  * shpgperproc back from the new limit and recompute the high water
  * mark.
  */
 static int
 pmap_pventry_proc(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
 	/* Nothing to recompute on failure or on a plain read. */
 	if (error != 0 || !req->newptr)
 		return (error);
 
 	shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc;
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 	return (error);
 }
 SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW, 
     &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries");
 
 /*
  * Handler for vm.pmap.shpgperproc.  After a successful write, recompute
  * the pv entry limit from the new factor and refresh the high water
  * mark.
  */
 static int
 pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
 	/* Nothing to recompute on failure or on a plain read. */
 	if (error != 0 || !req->newptr)
 		return (error);
 
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 	return (error);
 }
 SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, 
     &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc");
 
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 /*
  * Translate a caching mode into the page-table bits that select it.
  * is_pde chooses which PAT bit position applies (PDE vs. PTE).
  * Panics on a mode that is not one of the known PAT_* values.
  */
 static int
 pmap_cache_bits(int mode, boolean_t is_pde)
 {
 	int bits, idx, patbit;
 
 	/*
 	 * Without PAT, the extended modes degrade to uncacheable; the
 	 * three classic modes pass through unchanged.
 	 */
 	if ((cpu_feature & CPUID_PAT) == 0 &&
 	    (mode == PAT_UNCACHED || mode == PAT_WRITE_COMBINING ||
 	    mode == PAT_WRITE_PROTECTED))
 		mode = PAT_UNCACHEABLE;
 
 	/* Pick the PAT index that holds the requested mode. */
 	switch (mode) {
 #ifdef PAT_WORKS
 	case PAT_WRITE_BACK:
 		idx = 0;
 		break;
 	case PAT_WRITE_THROUGH:
 		idx = 1;
 		break;
 	case PAT_UNCACHED:
 		idx = 2;
 		break;
 	case PAT_UNCACHEABLE:
 		idx = 3;
 		break;
 	case PAT_WRITE_PROTECTED:
 		idx = 4;
 		break;
 	case PAT_WRITE_COMBINING:
 		idx = 5;
 		break;
 #else
 	case PAT_WRITE_BACK:
 		idx = 0;
 		break;
 	case PAT_WRITE_THROUGH:
 		idx = 1;
 		break;
 	case PAT_WRITE_COMBINING:
 		idx = 2;
 		break;
 	case PAT_UNCACHED:
 	case PAT_UNCACHEABLE:
 	case PAT_WRITE_PROTECTED:
 		idx = 3;
 		break;
 #endif
 	default:
 		panic("Unknown caching mode %d\n", mode);
 	}
 
 	/* The PAT bit sits in a different position in PTEs and PDEs. */
 	patbit = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
 
 	/* Spread the 3-bit index over the PAT, PCD and PWT bits. */
 	bits = 0;
 	if ((idx & 0x1) != 0)
 		bits |= PG_NC_PWT;
 	if ((idx & 0x2) != 0)
 		bits |= PG_NC_PCD;
 	if ((idx & 0x4) != 0)
 		bits |= patbit;
 	return (bits);
 }
 #ifdef SMP
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  */
 /*
  * SMP: invalidate the TLB entry for va on every CPU where pmap is
  * active.  The thread is pinned so the local/remote split stays
  * meaningful for the duration.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	u_int self_mask;
 	u_int peer_mask;
 
 	sched_pin();
 	if (pmap != kernel_pmap && pmap->pm_active != all_cpus) {
 		/* Only some CPUs use this pmap: target just those. */
 		self_mask = PCPU_GET(cpumask);
 		peer_mask = PCPU_GET(other_cpus);
 		if (pmap->pm_active & self_mask)
 			invlpg(va);
 		if (pmap->pm_active & peer_mask)
 			smp_masked_invlpg(pmap->pm_active & peer_mask, va);
 	} else {
 		/* Kernel pmap, or active everywhere: hit all CPUs. */
 		invlpg(va);
 		smp_invlpg(va);
 	}
 	sched_unpin();
 }
 
 /*
  * SMP: invalidate the TLB entries for every page in [sva, eva) on
  * every CPU where pmap is active.  Local entries are flushed one page
  * at a time; remote CPUs get a ranged request.
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	u_int self_mask;
 	u_int peer_mask;
 	vm_offset_t va;
 
 	sched_pin();
 	if (pmap != kernel_pmap && pmap->pm_active != all_cpus) {
 		/* Only some CPUs use this pmap: target just those. */
 		self_mask = PCPU_GET(cpumask);
 		peer_mask = PCPU_GET(other_cpus);
 		if (pmap->pm_active & self_mask) {
 			va = sva;
 			while (va < eva) {
 				invlpg(va);
 				va += PAGE_SIZE;
 			}
 		}
 		if (pmap->pm_active & peer_mask)
 			smp_masked_invlpg_range(pmap->pm_active & peer_mask,
 			    sva, eva);
 	} else {
 		/* Kernel pmap, or active everywhere: hit all CPUs. */
 		va = sva;
 		while (va < eva) {
 			invlpg(va);
 			va += PAGE_SIZE;
 		}
 		smp_invlpg_range(sva, eva);
 	}
 	sched_unpin();
 }
 
 /*
  * SMP: flush the whole TLB on every CPU where pmap is active.
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	u_int self_mask;
 	u_int peer_mask;
 
 	sched_pin();
 	if (pmap != kernel_pmap && pmap->pm_active != all_cpus) {
 		/* Only some CPUs use this pmap: target just those. */
 		self_mask = PCPU_GET(cpumask);
 		peer_mask = PCPU_GET(other_cpus);
 		if (pmap->pm_active & self_mask)
 			invltlb();
 		if (pmap->pm_active & peer_mask)
 			smp_masked_invltlb(pmap->pm_active & peer_mask);
 	} else {
 		/* Kernel pmap, or active everywhere: hit all CPUs. */
 		invltlb();
 		smp_invltlb();
 	}
 	sched_unpin();
 }
 
 /*
  * SMP: write back and invalidate the cache on this CPU (wbinvd) and
  * then call smp_cache_flush(), with the thread pinned in between so
  * the local part runs on a single CPU.
  */
 void
 pmap_invalidate_cache(void)
 {
 
 	sched_pin();
 	wbinvd();
 	smp_cache_flush();
 	sched_unpin();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, invalidation functions.
  * We inline these within pmap.c for speed.
  */
 /*
  * UP: invalidate the TLB entry for va, but only when the pmap is the
  * kernel pmap or is marked active.
  */
 PMAP_INLINE void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	if (pmap != kernel_pmap && !pmap->pm_active)
 		return;
 	invlpg(va);
 }
 
 /*
  * UP: invalidate the TLB entries for every page in [sva, eva), but
  * only when the pmap is the kernel pmap or is marked active.
  */
 PMAP_INLINE void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va;
 
 	if (pmap != kernel_pmap && !pmap->pm_active)
 		return;
 	va = sva;
 	while (va < eva) {
 		invlpg(va);
 		va += PAGE_SIZE;
 	}
 }
 
 /*
  * UP: flush the whole TLB, but only when the pmap is the kernel pmap
  * or is marked active.
  */
 PMAP_INLINE void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	if (pmap != kernel_pmap && !pmap->pm_active)
 		return;
 	invltlb();
 }
 
 /*
  * UP: write back and invalidate the CPU cache with wbinvd.
  */
 PMAP_INLINE void
 pmap_invalidate_cache(void)
 {
 
 	wbinvd();
 }
 #endif /* !SMP */
 
 /*
  * Return non-zero if pmap is the kernel pmap, or if its recursive PML4
  * slot refers to the same frame as the PML4 currently visible through
  * PML4pml4e (i.e. it is the address space now loaded).
  */
 static __inline int
 pmap_is_current(pmap_t pmap)
 {
 
 	if (pmap == kernel_pmap)
 		return (1);
 	return ((pmap->pm_pml4[PML4PML4I] & PG_FRAME) ==
 	    (PML4pml4e[0] & PG_FRAME));
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Look up va in pmap, under the pmap lock, and return the
  *		physical address it maps to.  Returns 0 when there is no
  *		page directory entry for va or that entry is zero.
  */
 vm_paddr_t 
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t pa;
 	pt_entry_t *pte;
 	pd_entry_t pde, *pdep;
 
 	pa = 0;
 	PMAP_LOCK(pmap);
 	pdep = pmap_pde(pmap, va);
 	if (pdep != NULL && (pde = *pdep) != 0) {
 		if ((pde & PG_PS) != 0) {
 			/* 2MB mapping: frame from the PDE, offset from va. */
 			pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 		} else {
 			/* 4KB mapping: descend to the page table entry. */
 			pte = pmap_pde_to_pte(pdep, va);
 			pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 		}
 	}
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		With the page queues and the pmap locked, find the page
  *		mapped at va and take a hold on it, provided the mapping
  *		is writable or write access was not requested in prot.
  *		Returns the held page, or NULL if nothing qualified.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde, *pdep;
 	pt_entry_t pte;
 	vm_page_t m;
 	int want_write;
 
 	m = NULL;
 	want_write = (prot & VM_PROT_WRITE) != 0;
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	pdep = pmap_pde(pmap, va);
 	if (pdep != NULL && (pde = *pdep) != 0) {
 		if ((pde & PG_PS) == 0) {
 			/* 4KB mapping: the PTE must also be valid. */
 			pte = *pmap_pde_to_pte(pdep, va);
 			if ((pte & PG_V) != 0 &&
 			    ((pte & PG_RW) != 0 || !want_write)) {
 				m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 				vm_page_hold(m);
 			}
 		} else if ((pde & PG_RW) != 0 || !want_write) {
 			/* 2MB mapping: pick the 4KB page va falls in. */
 			m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 			    (va & PDRMASK));
 			vm_page_hold(m);
 		}
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Translate a kernel virtual address to a physical address.  Direct
  * map addresses are converted arithmetically; anything else is looked
  * up through the PDmap/PTmap windows, honouring 2MB mappings.  No
  * validity check is made on the entries that are read.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pd_entry_t *pde;
 	vm_paddr_t pa;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return (DMAP_TO_PHYS(va));
 
 	pde = vtopde(va);
 	if ((*pde & PG_PS) == 0) {
 		pa = *vtopte(va);
 		pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 	} else
 		pa = (*pde & PG_PS_FRAME) | (va & PDRMASK);
 	return (pa);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Enter a read/write, valid, global mapping of pa at kernel address va.
  * No TLB invalidation is done here.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void 
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 	pt_entry_t *ptep;
 	pt_entry_t newpte;
 
 	newpte = pa | PG_RW | PG_V | PG_G;
 	ptep = vtopte(va);
 	pte_store(ptep, newpte);
 }
 
 /*
  * Like pmap_kenter(), but also ORs in the PTE cache-control bits that
  * pmap_cache_bits() returns for the given caching mode.
  */
 PMAP_INLINE void 
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 	pt_entry_t *ptep;
 	pt_entry_t newpte;
 
 	/* Second argument 0: these are PTE, not PDE, cache bits. */
 	newpte = pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0);
 	ptep = vtopte(va);
 	pte_store(ptep, newpte);
 }
 
 /*
  * Clear the kernel page table entry that maps va.  No TLB invalidation
  * is done here.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *ptep;
 
 	ptep = vtopte(va);
 	pte_clear(ptep);
 }
 
 /*
  *	Map a range of physical addresses into kernel virtual address
  *	space.
  *
  *	The generic contract lets an architecture either map the pages at
  *	the suggested address '*virt' and advance it, or return an address
  *	inside a direct-mapped region and leave '*virt' alone.  This
  *	implementation takes the second route: it returns the direct map
  *	address of 'start' and does not use 'virt', 'end' or 'prot'.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 
 	return (PHYS_TO_DMAP(start));
 }
 
 
 /*
  * Map 'count' pages from the array 'ma' at consecutive kernel virtual
  * addresses starting at sva.  Intended for temporary kernel mappings
  * that need no modified/referenced tracking; the pages *must* be
  * wired.  Existing mappings are simply overwritten, and the range is
  * invalidated only if at least one overwritten entry was valid.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *pte, seen;
 	int i;
 
 	/* Accumulate the old entries so one test covers them all. */
 	seen = 0;
 	pte = vtopte(sva);
 	for (i = 0; i < count; i++) {
 		seen |= pte[i];
 		pte_store(pte + i, VM_PAGE_TO_PHYS(ma[i]) | PG_G | PG_RW | PG_V);
 	}
 	if ((seen & PG_V) != 0)
 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
 
 /*
  * Remove 'count' consecutive kernel page mappings starting at sva and
  * invalidate the range that was covered.  Meant only for temporary
  * mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 	int i;
 
 	va = sva;
 	for (i = 0; i < count; i++) {
 		pmap_kremove(va);
 		va += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Free every page on the list headed by 'free' (linked through the
  * pages' 'right' pointers) with vm_page_free_zero().
  */
 static PMAP_INLINE void
 pmap_free_zero_pages(vm_page_t free)
 {
 	vm_page_t m, next;
 
 	for (m = free; m != NULL; m = next) {
 		/* Fetch the link before the page is handed back. */
 		next = m->right;
 		vm_page_free_zero(m);
 	}
 }
 
 /*
  * Drop one wiring of page table page m.  When the wire count reaches
  * zero the page is unmapped and queued for freeing by
  * _pmap_unwire_pte_hold(), whose result is returned; otherwise 0 is
  * returned.
  */
 static PMAP_INLINE int
 pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free)
 {
 
 	if (--m->wire_count != 0)
 		return (0);
 	return (_pmap_unwire_pte_hold(pmap, va, m, free));
 }
 
 /*
  * Unmap page table page m (whose wire count has dropped to zero) from
  * pmap and queue it on *free.  The page's level is deduced from its
  * pindex: >= NUPDE + NUPDPE is a PDP page, >= NUPDE is a PD page,
  * anything lower is a PT page.  Releasing a PT or PD page also drops
  * one wiring on the page one level up, which may recurse.
  * Always returns 1.
  */
 static int 
 _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, 
     vm_page_t *free)
 {
 	vm_offset_t pteva;
 
 	/*
 	 * unmap the page table page
 	 *
 	 * pteva is the address at which the page being removed appears in
 	 * the recursive mapping window; it is invalidated further down.
 	 */
 	if (m->pindex >= (NUPDE + NUPDPE)) {
 		/* PDP page */
 		pml4_entry_t *pml4;
 		pml4 = pmap_pml4e(pmap, va);
 		pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE));
 		*pml4 = 0;
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
 		pdp_entry_t *pdp;
 		pdp = pmap_pdpe(pmap, va);
 		pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE);
 		*pdp = 0;
 	} else {
 		/* PTE page */
 		pd_entry_t *pd;
 		pd = pmap_pde(pmap, va);
 		pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex);
 		*pd = 0;
 	}
 	--pmap->pm_stats.resident_count;
 	if (m->pindex < NUPDE) {
 		/* We just released a PT, unhold the matching PD */
 		vm_page_t pdpg;
 
 		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 		pmap_unwire_pte_hold(pmap, va, pdpg, free);
 	}
 	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 		/* We just released a PD, unhold the matching PDP */
 		vm_page_t pdppg;
 
 		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 		pmap_unwire_pte_hold(pmap, va, pdppg, free);
 	}
 
 	/*
 	 * Invalidate the single TLB entry for pteva so that the removed
 	 * mapping takes effect immediately.
 	 */
 	pmap_invalidate_page(pmap, pteva);
 
 	/* 
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	m->right = *free;
 	*free = m;
 	
 	/* The page is no longer wired; keep the global count in step. */
 	atomic_subtract_int(&cnt.v_wire_count, 1);
 	return 1;
 }
 
 /*
  * Called after a page table entry has been removed: drop one wiring on
  * the page table page named by ptepde, which may free it.  Addresses
  * at or above VM_MAXUSER_ADDRESS are ignored and yield 0; otherwise the
  * result of pmap_unwire_pte_hold() is returned.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free)
 {
 	vm_page_t ptpage;
 
 	if (va < VM_MAXUSER_ADDRESS) {
 		KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 		ptpage = PHYS_TO_VM_PAGE(ptepde & PG_FRAME);
 		return (pmap_unwire_pte_hold(pmap, va, ptpage, free));
 	}
 	return (0);
 }
 
 /*
  * Initialize a pmap that shares the boot-time kernel PML4 (reached at
  * KERNBASE + KPML4phys) rather than owning a PML4 page of its own.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	pmap->pm_active = 0;
 	pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * Allocates and wires one page for the PML4, installs the shared
  * kernel and direct map entries plus the recursive self-mapping, and
  * resets the pmap's bookkeeping.  The allocation is retried (sleeping
  * in VM_WAIT) until it succeeds, so this always returns 1.
  */
-void
+int
 pmap_pinit(pmap_t pmap)
 {
 	vm_page_t pml4pg;
 	static vm_pindex_t color;
 
 	PMAP_LOCK_INIT(pmap);
 
 	/*
 	 * allocate the page directory page
 	 */
 	while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
 		VM_WAIT;
 
 	/* The PML4 page is accessed through the direct map. */
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 
 	/* VM_ALLOC_ZERO is only a hint: zero the page if it was not. */
 	if ((pml4pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_pml4);
 
 	/* Wire in kernel global address entries. */
 	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
 	pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
 
 	/* install self-referential address mapping entry(s) */
 	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M;
 
 	pmap->pm_active = 0;
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+
+	return (1);
 }
 
 /*
  * Allocate the page table page identified by ptepindex and link it
  * into pmap.  Called when that page is not currently mapped.
  *
  * ptepindex selects the level: values >= NUPDE + NUPDPE name a PDP
  * page, values >= NUPDE name a PD page, and lower values name a PT
  * page.  Missing parent levels are allocated by recursion; existing
  * parents gain one wiring.
  *
  * flags must contain exactly one of M_NOWAIT and M_WAITOK.  With
  * M_WAITOK a failed allocation drops the pmap and page queue locks,
  * sleeps in VM_WAIT and reacquires them before returning.
  *
  * Returns the new page, or NULL if any allocation failed.  NULL means
  * "retry": while the locks were dropped the page may have been
  * allocated by someone else.  On failure nothing remains accounted to
  * the pmap: a page already obtained at this level is unwired and freed,
  * the global wired-page count is rolled back, and the pmap's resident
  * count is left untouched.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags)
 {
 	vm_page_t m, pdppg, pdpg;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			PMAP_UNLOCK(pmap);
 			vm_page_unlock_queues();
 			VM_WAIT;
 			vm_page_lock_queues();
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= (NUPDE + NUPDPE)) {
 		pml4_entry_t *pml4;
 		vm_pindex_t pml4index;
 
 		/* Wire up a new PDPE page */
 		pml4index = ptepindex - (NUPDE + NUPDPE);
 		pml4 = &pmap->pm_pml4[pml4index];
 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 
 	} else if (ptepindex >= NUPDE) {
 		vm_pindex_t pml4index;
 		vm_pindex_t pdpindex;
 		pml4_entry_t *pml4;
 		pdp_entry_t *pdp;
 
 		/* Wire up a new PDE page */
 		pdpindex = ptepindex - NUPDE;
 		pml4index = pdpindex >> NPML4EPGSHIFT;
 
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 			    flags) == NULL) {
 				/*
 				 * Undo the wiring that VM_ALLOC_WIRED
 				 * applied, including the global count,
 				 * before giving the page back.
 				 */
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 				vm_page_free(m);
 				return (NULL);
 			}
 		} else {
 			/* Add reference to pdp page */
 			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 			pdppg->wire_count++;
 		}
 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 
 		/* Now find the pdp page */
 		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 
 	} else {
 		vm_pindex_t pml4index;
 		vm_pindex_t pdpindex;
 		pml4_entry_t *pml4;
 		pdp_entry_t *pdp;
 		pd_entry_t *pd;
 
 		/* Wire up a new PTE page */
 		pdpindex = ptepindex >> NPDPEPGSHIFT;
 		pml4index = pdpindex >> NPML4EPGSHIFT;
 
 		/* First, find the pdp and check that its valid. */
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 			    flags) == NULL) {
 				/* Roll back the wiring of the unused page. */
 				--m->wire_count;
 				atomic_subtract_int(&cnt.v_wire_count, 1);
 				vm_page_free(m);
 				return (NULL);
 			}
 			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 		} else {
 			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 			if ((*pdp & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
 				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 				    flags) == NULL) {
 					/* Roll back the unused page. */
 					--m->wire_count;
 					atomic_subtract_int(&cnt.v_wire_count,
 					    1);
 					vm_page_free(m);
 					return (NULL);
 				}
 			} else {
 				/* Add reference to the pd page */
 				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 				pdpg->wire_count++;
 			}
 		}
 		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 
 		/* Now we know where the page directory page is */
 		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 	}
 
 	/*
 	 * Count the page as resident only once it is actually linked in,
 	 * so that the failure paths above leave the count unchanged.
 	 */
 	pmap->pm_stats.resident_count++;
 
 	return m;
 }
 
 /*
  * Return the page directory page covering va with one more wiring on
  * it, allocating that page if the PDP entry is missing or invalid.
  * flags must contain exactly one of M_NOWAIT and M_WAITOK.  With
  * M_WAITOK a failed allocation is retried until it succeeds; with
  * M_NOWAIT it makes this function return NULL.
  */
 static vm_page_t
 pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags)
 {
 	vm_pindex_t pdpindex;
 	pdp_entry_t *pdpe;
 	vm_page_t pdpg;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	for (;;) {
 		/* Look the entry up afresh on every pass. */
 		pdpe = pmap_pdpe(pmap, va);
 		if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 			/* Add a reference to the pd page. */
 			pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 			pdpg->wire_count++;
 			break;
 		}
 
 		/* Allocate a pd page. */
 		pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
 		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags);
 		if (pdpg != NULL || (flags & M_WAITOK) == 0)
 			break;
 	}
 	return (pdpg);
 }
 
 /*
  * Return the page table page covering va with one more wiring on it,
  * allocating that page if it is not mapped.  A valid 2MB mapping found
  * at va is torn down first so that a 4K page table can take its place.
  * flags must contain exactly one of M_NOWAIT and M_WAITOK.  With
  * M_WAITOK a failed allocation is retried from the top; with M_NOWAIT
  * it makes this function return NULL.
  */
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *pd;
 	vm_page_t m, free;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	/*
 	 * Calculate pagetable page index
 	 */
 	ptepindex = pmap_pde_pindex(va);
 retry:
 	/*
 	 * Get the page directory entry
 	 */
 	pd = pmap_pde(pmap, va);
 
 	/*
 	 * This supports switching from a 2MB page to a
 	 * normal 4K page.
 	 *
 	 * The PDE is cleared and pd is reset so that the code below takes
 	 * the "not mapped" path; the resident count drops by the number of
 	 * 4K pages the large mapping covered, and the wiring it held on
 	 * the page directory page is released.
 	 */
 	if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) {
 		*pd = 0;
 		pd = 0;
 		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 		free = NULL;
 		pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va), &free);
 		pmap_invalidate_all(kernel_pmap);
 		pmap_free_zero_pages(free);
 	}
 
 	/*
 	 * If the page table page is mapped, we just increment the
 	 * hold count, and activate it.
 	 */
 	if (pd != 0 && (*pd & PG_V) != 0) {
 		m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 		m->wire_count++;
 	} else {
 		/*
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
 		m = _pmap_allocpte(pmap, ptepindex, flags);
 		/* NULL means "try again"; only M_WAITOK callers loop. */
 		if (m == NULL && (flags & M_WAITOK))
 			goto retry;
 	}
 	return (m);
 }
 
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release the resources held by a pmap that was set up by pmap_pinit:
  * its PML4 page and its lock.  The pmap must no longer contain any
  * valid mappings (its resident count is asserted to be zero).
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t pml4pg;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 
 	/* The recursive slot records which page holds the PML4. */
 	pml4pg = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 
 	/*
 	 * Clear the three entries pmap_pinit installed so the page is all
 	 * zeroes again before it is freed as a zeroed page.
 	 */
 	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
 	pmap->pm_pml4[DMPML4I] = 0;	/* Direct Map */
 	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
 
 	/* Drop the wiring, both on the page and in the global count. */
 	atomic_subtract_int(&cnt.v_wire_count, 1);
 	pml4pg->wire_count--;
 	vm_page_free_zero(pml4pg);
 	PMAP_LOCK_DESTROY(pmap);
 }
 
 /*
  * Handler for vm.kvm_size: reports VM_MAX_KERNEL_ADDRESS - KERNBASE.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long ksize;
 
 	ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE;
 	return (sysctl_handle_long(oidp, &ksize, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_size, "LU", "Size of KVM");
 
 /*
  * Handler for vm.kvm_free: reports VM_MAX_KERNEL_ADDRESS -
  * kernel_vm_end.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long kfree;
 
 	kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &kfree, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_free, "LU", "Amount of KVM free");
 
 /*
  * Grow the kernel page tables, if needed, so that they cover kernel
  * virtual addresses up to addr (rounded up to a whole page table's
  * worth of address space and clipped to kernel_map->max_offset).
  *
  * kernel_vm_end tracks the end of the covered range.  On the first
  * call it is discovered by scanning forward from KERNBASE over the page
  * directory entries that are already valid, counting them in nkpt.
  * The caller must hold kernel_map's system mutex.  Panics if a page
  * table page cannot be allocated.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *pde, newpdir;
 	pdp_entry_t newpdp;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 	if (kernel_vm_end == 0) {
 		kernel_vm_end = KERNBASE;
 		nkpt = 0;
 		for (;;) {
 			/*
 			 * pmap_pde() returns NULL when no page directory
 			 * page covers the address; treat that like an
 			 * invalid entry instead of dereferencing it.
 			 */
 			pde = pmap_pde(kernel_pmap, kernel_vm_end);
 			if (pde == NULL || (*pde & PG_V) == 0)
 				break;
 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 			nkpt++;
 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 				kernel_vm_end = kernel_map->max_offset;
 				break;
 			}
 		}
 	}
 	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
 	/* The "- 1" also catches addr having wrapped around to zero. */
 	if (addr - 1 >= kernel_map->max_offset)
 		addr = kernel_map->max_offset;
 	while (kernel_vm_end < addr) {
 		pde = pmap_pde(kernel_pmap, kernel_vm_end);
 		if (pde == NULL) {
 			/* We need a new PDP entry */
 			nkpg = vm_page_alloc(NULL, nkpt,
 			    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
 			if (!nkpg)
 				panic("pmap_growkernel: no memory to grow kernel");
 			pmap_zero_page(nkpg);
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 			newpdp = (pdp_entry_t)
 				(paddr | PG_V | PG_RW | PG_A | PG_M);
 			*pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp;
 			continue; /* try again */
 		}
 		if ((*pde & PG_V) != 0) {
 			/* Already covered: step to the next page table. */
 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 				kernel_vm_end = kernel_map->max_offset;
 				break;
 			}
 			continue;
 		}
 
 		/*
 		 * This index is bogus, but out of the way
 		 */
 		nkpg = vm_page_alloc(NULL, nkpt,
 		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
 		if (!nkpg)
 			panic("pmap_growkernel: no memory to grow kernel");
 
 		nkpt++;
 
 		/* Install the zeroed page as a new page table. */
 		pmap_zero_page(nkpg);
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M);
 		*pmap_pde(kernel_pmap, kernel_vm_end) = newpdir;
 
 		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 			kernel_vm_end = kernel_map->max_offset;
 			break;
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time checks on the pv chunk layout: a chunk is exactly one
  * page, has _NPCM (3) free-map words and holds _NPCPV (168) entries.
  * pv_to_chunk() and the PC_FREE* masks depend on these values.
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Return the pv chunk that contains pv, found by clearing the
  * within-page bits of pv's address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t base;
 
 	base = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)base);
 }
 
 /* The pmap that owns a pv entry, taken from its enclosing chunk. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Free-map words with every usable bit set.  The first two words have
  * all 64 bits set and the third has the low 40 set, 168 bits in total,
  * matching the _NPCPV value asserted for this file.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 /* The PC_FREE* masks as an array with one element per free-map word. */
 static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 /* Read-only sysctl exporting the global count of allocated pv entries. */
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 
 /*
  * Optional statistics, compiled in only when PV_STATS is defined.  The
  * counters below are updated through the PV_STAT() wrapper in the pv
  * entry allocation and free paths and are exported read-only.
  */
 #ifdef PV_STATS
 /* pv chunk (page) level counters. */
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 /* Individual pv entry counters. */
 static long pv_entry_frees, pv_entry_allocs;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 
 /* Number of reclaim passes made by get_pv_entry() over each page queue. */
 static int pmap_collect_inactive, pmap_collect_active;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
 	"Current number times pmap_collect called on inactive queue");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
 	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.  This is normally called to
  * unmap inactive pages, and if necessary, active pages.
  *
  * Every mapping of every page on the queue "vpq" that is neither held
  * nor busy is destroyed and its pv entry is handed to free_pv_entry(),
  * which frees the backing chunk page once all of the chunk's entries
  * are free.  "locked_pmap" is the pmap whose lock the caller already
  * holds; it is neither acquired nor released here.  Mappings whose
  * pmap lock cannot be taken safely are left in place.
  *
  * free_pv_entry() asserts that the page queues mutex is owned, so the
  * caller must hold it.
  */
 static void
 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
 {
 	pd_entry_t ptepde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t next_pv, pv;
 	vm_offset_t va;
 	vm_page_t m, free;
 
 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
 		/* Leave held and busy pages alone. */
 		if (m->hold_count || m->busy)
 			continue;
 		/* The _SAFE variant is needed: pv is unlinked in the body. */
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 			va = pv->pv_va;
 			pmap = PV_PMAP(pv);
 			/*
 			 * Avoid deadlock and lock recursion.  A pmap at a
 			 * higher address than locked_pmap is locked
 			 * unconditionally; locked_pmap itself is already
 			 * locked; any other pmap is only try-locked and
 			 * its mapping is skipped on failure.
 			 */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 				continue;
 			pmap->pm_stats.resident_count--;
 			pte = pmap_pte_pde(pmap, va, &ptepde);
 			if (pte == NULL) {
 				panic("null pte in pmap_collect");
 			}
 			/* Tear down the mapping and keep its old contents. */
 			tpte = pte_load_clear(pte);
 			KASSERT((tpte & PG_W) == 0,
 			    ("pmap_collect: wired pte %#lx", tpte));
 			/* Propagate referenced/modified state to the page. */
 			if (tpte & PG_A)
 				vm_page_flag_set(m, PG_REFERENCED);
 			if (tpte & PG_M) {
 				KASSERT((tpte & PG_RW),
 	("pmap_collect: modified page not writable: va: %#lx, pte: %#lx",
 				    va, tpte));
 				vm_page_dirty(m);
 			}
 			/*
 			 * Drop the page table page reference, flush the TLB
 			 * entry, and only then release any page table pages
 			 * that pmap_unuse_pt() collected on "free".
 			 */
 			free = NULL;
 			pmap_unuse_pt(pmap, va, ptepde, &free);
 			pmap_invalidate_page(pmap, va);
 			pmap_free_zero_pages(free);
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			/* No mappings left, so none of them can be writeable. */
 			if (TAILQ_EMPTY(&m->md.pv_list))
 				vm_page_flag_clear(m, PG_WRITEABLE);
 			m->md.pv_list_count--;
 			free_pv_entry(pmap, pv);
 			if (pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 		}
 	}
 }
 
 
 /*
  * Release a pv entry back to its chunk.  The entry's bit in the chunk's
  * free bitmap is set and the chunk is moved to the front of the pmap's
  * chunk list.  When every entry of the chunk is free, the chunk is
  * unlinked and its page is given back to the VM system.
  *
  * Both the page queues mutex and the pmap lock must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *chunk;
 	vm_page_t chunk_page;
 	int slot, word, shift;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
 
 	/* Mark the entry's slot as free in the chunk's bitmap. */
 	chunk = pv_to_chunk(pv);
 	slot = pv - &chunk->pc_pventry[0];
 	word = slot / 64;
 	shift = slot % 64;
 	chunk->pc_map[word] |= 1ul << shift;
 
 	/* A chunk with free space belongs at the head of the list. */
 	TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, chunk, pc_list);
 
 	if (chunk->pc_map[0] == PC_FREE0 && chunk->pc_map[1] == PC_FREE1 &&
 	    chunk->pc_map[2] == PC_FREE2) {
 		/* The whole chunk is unused: give its page back. */
 		PV_STAT(pv_entry_spare -= _NPCPV);
 		PV_STAT(pc_chunk_count--);
 		PV_STAT(pc_chunk_frees++);
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		chunk_page = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)chunk));
 		dump_drop_page(chunk_page->phys_addr);
 		vm_page_free(chunk_page);
 	}
 }
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  *
  * A free entry is taken from the first chunk on the pmap's chunk list
  * if it has one; otherwise a new page is allocated and turned into a
  * chunk.  When no page is available and "try" is non-zero, NULL is
  * returned.  When "try" is zero, mappings are reclaimed with
  * pmap_collect() (inactive queue first, then active queue) and the
  * allocation is retried; if it still fails the kernel panics.
  *
  * Both the pmap lock and the page queues mutex must be held.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, int try)
 {
 	/* Rate limit for the high-water warning below: one per 60 seconds. */
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	/* Colour passed to vm_page_alloc(); advanced for every new chunk. */
 	static vm_pindex_t colour;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PV_STAT(pv_entry_allocs++);
 	/* Counted up front; undone on the "try" failure path below. */
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
 		if (ratecheck(&lastprint, &printinterval))
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max sysctl.\n");
 	/*
 	 * Only the first chunk is examined: free_pv_entry() moves a chunk
 	 * to the head whenever one of its entries is freed, and a chunk
 	 * that becomes full is moved to the tail below.
 	 */
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find a bitmap word with a set bit, i.e. a free entry. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = bsfq(pc->pc_map[field]);
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 64 + bit];
 			/* Mark the entry as in use. */
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 			}
 			PV_STAT(pv_entry_spare--);
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
 	if (m == NULL) {
 		if (try) {
 			/* Opportunistic caller: fail instead of reclaiming. */
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		/*
 		 * Reclaim pv entries: At first, destroy mappings to inactive
 		 * pages.  After that, if a pv chunk entry is still needed,
 		 * destroy mappings to active pages.
 		 */
 		PV_STAT(pmap_collect_inactive++);
 		pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
 		m = vm_page_alloc(NULL, colour,
 		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
 		if (m == NULL) {
 			PV_STAT(pmap_collect_active++);
 			pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
 			/* The final attempt asks with VM_ALLOC_SYSTEM. */
 			m = vm_page_alloc(NULL, colour,
 			    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ);
 			if (m == NULL)
 				panic("get_pv_entry: increase vm.pmap.shpgperproc");
 		}
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
 	colour++;
 	dump_add_page(m->phys_addr);
 	/* The chunk is accessed through the direct map. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	/* Entry 0 of the new chunk is the one handed to the caller. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
 	return (pv);
 }
 
 /*
  * Find the pv entry that records the mapping of page "m" at "va" in
  * "pmap", unlink it from the page's pv list and free it.  The page's
  * PG_WRITEABLE flag is cleared when its last mapping goes away.
  *
  * Both the pmap lock and the page queues mutex must be held.
  */
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/* Walk the list until the (pmap, va) pair matches or it ends. */
 	entry = TAILQ_FIRST(&m->md.pv_list);
 	while (entry != NULL && (pmap != PV_PMAP(entry) || va != entry->pv_va))
 		entry = TAILQ_NEXT(entry, pv_list);
 	KASSERT(entry != NULL, ("pmap_remove_entry: pv not found"));
 
 	TAILQ_REMOVE(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		vm_page_flag_clear(m, PG_WRITEABLE);
 	free_pv_entry(pmap, entry);
 }
 
 /*
  * Record a new mapping of page "m" at "va" in "pmap" by appending a
  * freshly allocated pv entry to the page's pv list.  The entry is
  * obtained with get_pv_entry() in its non-"try" mode.
  *
  * Both the pmap lock and the page queues mutex must be held.
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	entry = get_pv_entry(pmap, FALSE);
 	entry->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count++;
 }
 
 /*
  * Opportunistically record a mapping of page "m" at "va" in "pmap".
  * Nothing is done, and FALSE is returned, when the pv entry count has
  * reached the high water mark or when get_pv_entry() in "try" mode
  * cannot supply an entry.  Returns TRUE once the entry is on the
  * page's pv list.
  *
  * Both the pmap lock and the page queues mutex must be held.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	if (pv_entry_count >= pv_entry_high_water)
 		return (FALSE);
 	entry = get_pv_entry(pmap, TRUE);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count++;
 	return (TRUE);
 }
 
 /*
  * pmap_remove_pte: tear down the single mapping described by the page
  * table entry "ptq", which maps "va" in "pmap".
  *
  * The entry is cleared, the pmap's wired and resident counts are
  * adjusted, and for a managed page the modified/referenced state is
  * passed on to the vm_page and its pv entry is removed.  A global
  * (PG_G) mapping is invalidated right here; any other TLB invalidation
  * is left to the caller.  The return value is that of pmap_unuse_pt(),
  * which is given "ptepde" and the "free" list.
  *
  * The pmap lock must be held.
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, 
     pd_entry_t ptepde, vm_page_t *free)
 {
 	pt_entry_t prev;
 	vm_page_t pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Clear the entry and keep what it contained. */
 	prev = pte_load_clear(ptq);
 	if ((prev & PG_W) != 0)
 		pmap->pm_stats.wired_count--;
 
 	/*
 	 * Machines that don't support invlpg, also don't support
 	 * PG_G.
 	 */
 	if ((prev & PG_G) != 0)
 		pmap_invalidate_page(kernel_pmap, va);
 	pmap->pm_stats.resident_count--;
 
 	if ((prev & PG_MANAGED) != 0) {
 		pg = PHYS_TO_VM_PAGE(prev & PG_FRAME);
 		if ((prev & PG_M) != 0) {
 			KASSERT((prev & PG_RW),
 	("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx",
 			    va, prev));
 			vm_page_dirty(pg);
 		}
 		if ((prev & PG_A) != 0)
 			vm_page_flag_set(pg, PG_REFERENCED);
 		pmap_remove_entry(pmap, pg, va);
 	}
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
 
 /*
  * Remove the mapping of the single page at "va" from "pmap", given the
  * page directory entry "pde" that covers it.  Nothing happens when
  * either the directory entry or the page table entry is not valid.
  * Page table pages released along the way are collected on "free".
  *
  * The pmap lock must be held.
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, vm_page_t *free)
 {
 	pt_entry_t *ptep;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((*pde & PG_V) != 0) {
 		ptep = pmap_pde_to_pte(pde, va);
 		if ((*ptep & PG_V) != 0) {
 			pmap_remove_pte(pmap, ptep, va, *pde, free);
 			pmap_invalidate_page(pmap, va);
 		}
 	}
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The page queues lock and the pmap lock are taken for the
  *	duration of the operation.  Page table pages that become unused
  *	are gathered on a local list and released only after both locks
  *	have been dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t *pte;
 	vm_page_t free = NULL;
 	int anyvalid;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	/* Set when a non-global mapping is removed; triggers a full flush. */
 	anyvalid = 0;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if (sva + PAGE_SIZE == eva) {
 		pde = pmap_pde(pmap, sva);
 		/* A 2MB mapping falls through to the general loop below. */
 		if (pde && (*pde & PG_PS) == 0) {
 			pmap_remove_page(pmap, sva, pde, &free);
 			goto out;
 		}
 	}
 
 	for (; sva < eva; sva = va_next) {
 
 		/* Nothing is mapped any more; stop early. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		/* Skip ahead to the next boundary when a level is invalid. */
 		pml4e = pmap_pml4e(pmap, sva);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (sva + NBPDP) & ~PDPMASK;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + NBPDR) & ~PDRMASK;
 
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		ptpaddr = *pde;
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 *
 		 * NOTE(review): the whole 2MB mapping is dropped here
 		 * without checking that [sva, eva) covers all of it.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			*pde = 0;
 			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 			pmap_unuse_pt(pmap, sva, *pdpe, &free);
 			anyvalid = 1;
 			continue;
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			if (*pte == 0)
 				continue;
 
 			/*
 			 * The TLB entry for a PG_G mapping is invalidated
 			 * by pmap_remove_pte().
 			 */
 			if ((*pte & PG_G) == 0)
 				anyvalid = 1;
 			/*
 			 * A non-zero return ends the scan of this page
 			 * table page (presumably because the page table
 			 * page itself was released -- confirm against
 			 * pmap_unuse_pt()).
 			 */
 			if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free))
 				break;
 		}
 	}
 out:
 	/* One TLB flush covers every non-global mapping removed above. */
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	vm_page_unlock_queues();	
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The caller must hold the page queues mutex.  Each pmap
  *		that maps the page is locked in turn while its mapping
  *		is torn down.  On return the page has no pv entries and
  *		its PG_WRITEABLE flag is clear.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pd_entry_t ptepde;
 	vm_page_t free;
 
 #if defined(PMAP_DIAGNOSTIC)
 	/*
 	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
 	 */
 	if (m->flags & PG_FICTITIOUS) {
 		/* VM_PAGE_TO_PHYS() yields a physical address: label it "pa". */
 		panic("pmap_remove_all: illegal for unmanaged page, pa: 0x%lx",
 		    VM_PAGE_TO_PHYS(m));
 	}
 #endif
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/* Each pass removes the first pv entry, until the list is empty. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap->pm_stats.resident_count--;
 		pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde);
 		if (pte == NULL) {
 			panic("null pte in pmap_remove_all");
 		}
 		/* Tear down the mapping and keep its old contents. */
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (tpte & PG_M) {
 			KASSERT((tpte & PG_RW),
 	("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx",
 			    pv->pv_va, tpte));
 			vm_page_dirty(m);
 		}
 		/*
 		 * Drop the page table page reference, flush the TLB entry,
 		 * and only then release any page table pages collected on
 		 * "free".
 		 */
 		free = NULL;
 		pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		pmap_free_zero_pages(free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		m->md.pv_list_count--;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* With no mappings left the page cannot be mapped writeable. */
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	A protection without read access removes the range altogether.
  *	A protection that includes both write and execute access leaves
  *	the mappings untouched.  Otherwise write permission is revoked
  *	when "prot" lacks VM_PROT_WRITE and pg_nx is set when it lacks
  *	VM_PROT_EXECUTE.  Permissions are only ever reduced here.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t *pte;
 	int anychanged;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	/* Nothing to take away. */
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 
 	/* Set when a non-global mapping changes; triggers a full flush. */
 	anychanged = 0;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 
 		/* Skip ahead to the next boundary when a level is invalid. */
 		pml4e = pmap_pml4e(pmap, sva);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (sva + NBPDP) & ~PDPMASK;
 			continue;
 		}
 
 		va_next = (sva + NBPDR) & ~PDRMASK;
 
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		ptpaddr = *pde;
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 *
 		 * The directory entry is modified in place and applies to
 		 * the whole 2MB mapping.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			if ((prot & VM_PROT_WRITE) == 0)
 				*pde &= ~(PG_M|PG_RW);
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				*pde |= pg_nx;
 			anychanged = 1;
 			continue;
 		}
 
 		/* Do not run past the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			pt_entry_t obits, pbits;
 			vm_page_t m;
 
 retry:
 			/* obits: value read; pbits: value to be installed. */
 			obits = pbits = *pte;
 			if ((pbits & PG_V) == 0)
 				continue;
 			/*
 			 * For managed pages, hand the accessed and modified
 			 * state over to the vm_page before the bits are
 			 * cleared from the entry.
 			 */
 			if (pbits & PG_MANAGED) {
 				m = NULL;
 				if (pbits & PG_A) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_flag_set(m, PG_REFERENCED);
 					pbits &= ~PG_A;
 				}
 				if ((pbits & PG_M) != 0) {
 					if (m == NULL)
 						m = PHYS_TO_VM_PAGE(pbits &
 						    PG_FRAME);
 					vm_page_dirty(m);
 				}
 			}
 
 			if ((prot & VM_PROT_WRITE) == 0)
 				pbits &= ~(PG_RW | PG_M);
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
 
 			if (pbits != obits) {
 				/*
 				 * Install the new value only if the entry
 				 * still holds what was read; otherwise start
 				 * over with the current contents.
 				 */
 				if (!atomic_cmpset_long(pte, obits, pbits))
 					goto retry;
 				/*
 				 * Global mappings are invalidated one by one;
 				 * everything else is covered by the single
 				 * pmap_invalidate_all() at the end.
 				 */
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
 					anychanged = 1;
 			}
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	The page queues lock and the pmap lock are taken for the
  *	duration of the operation.
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	   boolean_t wired)
 {
 	vm_paddr_t pa;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_paddr_t opa;
 	pt_entry_t origpte, newpte;
 	vm_page_t mpte, om;
 	boolean_t invlva;
 
 	va = trunc_page(va);
 #ifdef PMAP_DIAGNOSTIC
 	if (va > VM_MAX_KERNEL_ADDRESS)
 		panic("pmap_enter: toobig");
 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
 		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va);
 #endif
 
 	/* Page table page backing va; stays NULL for kernel addresses. */
 	mpte = NULL;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		mpte = pmap_allocpte(pmap, va, M_WAITOK);
 	}
 #if 0 && defined(PMAP_DIAGNOSTIC)
 	else {
 		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
 		origpte = *pdeaddr;
 		if ((origpte & PG_V) == 0) { 
 			panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n",
 				origpte, va);
 		}
 	}
 #endif
 
 	pde = pmap_pde(pmap, va);
 	if (pde != NULL) {
 		if ((*pde & PG_PS) != 0)
 			panic("pmap_enter: attempted pmap_enter on 2MB page");
 		pte = pmap_pde_to_pte(pde, va);
 	} else
 		pte = NULL;
 
 	/*
 	 * No page directory entry could be found for va.  Nothing is
 	 * allocated at this point; this condition is fatal.
 	 */
 	if (pte == NULL)
 		panic("pmap_enter: invalid page directory va=%#lx\n", va);
 
 	pa = VM_PAGE_TO_PHYS(m);
 	/* om: the managed page whose old mapping state must be reflected. */
 	om = NULL;
 	origpte = *pte;
 	opa = origpte & PG_FRAME;
 
 	/*
 	 * Mapping has not changed, must be protection or wiring change.
 	 */
 	if (origpte && (opa == pa)) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if (wired && ((origpte & PG_W) == 0))
 			pmap->pm_stats.wired_count++;
 		else if (!wired && (origpte & PG_W))
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove extra pte reference
 		 */
 		if (mpte)
 			mpte->wire_count--;
 
 		/*
 		 * We might be turning off write access to the page,
 		 * so we go ahead and sense modify status.
 		 */
 		if (origpte & PG_MANAGED) {
 			om = m;
 			pa |= PG_MANAGED;
 		}
 		goto validate;
 	} 
 	/*
 	 * Mapping has changed, invalidate old range and fall through to
 	 * handle validating new mapping.
 	 */
 	if (opa) {
 		if (origpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (origpte & PG_MANAGED) {
 			om = PHYS_TO_VM_PAGE(opa);
 			pmap_remove_entry(pmap, om, va);
 		}
 		/*
 		 * The old mapping already held a reference on the page
 		 * table page, so the one taken above is surplus.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 	} else
 		pmap->pm_stats.resident_count++;
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 		    ("pmap_enter: managed mapping within the clean submap"));
 		pmap_insert_entry(pmap, va, m);
 		pa |= PG_MANAGED;
 	}
 
 	/*
 	 * Increment counters
 	 */
 	if (wired)
 		pmap->pm_stats.wired_count++;
 
 validate:
 	/*
 	 * Now validate mapping with desired protection/wiring.
 	 */
 	newpte = (pt_entry_t)(pa | PG_V);
 	if ((prot & VM_PROT_WRITE) != 0) {
 		newpte |= PG_RW;
 		vm_page_flag_set(m, PG_WRITEABLE);
 	}
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (wired)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U;
 	if (pmap == kernel_pmap)
 		newpte |= PG_G;
 
 	/*
 	 * if the mapping or permission bits are different, we need
 	 * to update the pte.
 	 */
 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
 		if (origpte & PG_V) {
 			invlva = FALSE;
 			/*
 			 * Swap in the new entry and re-read the old one, so
 			 * that the accessed/modified bits examined below
 			 * are those present at the time of the exchange.
 			 */
 			origpte = pte_load_store(pte, newpte | PG_A);
 			if (origpte & PG_A) {
 				if (origpte & PG_MANAGED)
 					vm_page_flag_set(om, PG_REFERENCED);
 				/*
 				 * An accessed entry needs a TLB flush when
 				 * the frame changed or execute permission
 				 * was taken away.
 				 */
 				if (opa != VM_PAGE_TO_PHYS(m) || ((origpte &
 				    PG_NX) == 0 && (newpte & PG_NX)))
 					invlva = TRUE;
 			}
 			if (origpte & PG_M) {
 				KASSERT((origpte & PG_RW),
 	("pmap_enter: modified page not writable: va: %#lx, pte: %#lx",
 				    va, origpte));
 				if ((origpte & PG_MANAGED) != 0)
 					vm_page_dirty(om);
 				/* Write permission was taken away. */
 				if ((newpte & PG_RW) == 0)
 					invlva = TRUE;
 			}
 			if (invlva)
 				pmap_invalidate_page(pmap, va);
 		} else
 			pte_store(pte, newpte | PG_A);
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Map a run of resident pages that belong to one object.  The run
  * starts with m_start, which is mapped at the virtual address start.
  * Every following page on the object's list is mapped at start plus
  * the distance, in pages, between its pindex and that of m_start.
  * The run ends with the last page whose address is still below end.
  * Virtual pages for which the object holds no resident page are left
  * unmapped.
  *
  * The object must be locked.  The pmap is locked here for the whole
  * run.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t m, mpte;
 	vm_pindex_t diff, psize;
 
 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 	psize = atop(end - start);
 	mpte = NULL;
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		diff = m->pindex - m_start->pindex;
 		if (diff >= psize)
 			break;
 		/* mpte carries the last page table page to the next call. */
 		mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m,
 		    prot, mpte);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Fast-path variant of pmap_enter() for a single page.  It is *MUCH*
  * cheaper than pmap_enter(), but relies on some *MAJOR* assumptions:
  * 1. The pmap exists and is the current pmap.
  * 2. The mapping is not wired.
  * 3. Only read access is required.
  * 4. The address is not that of a page table page.
  *
  * The pmap is locked here around pmap_enter_quick_locked(), which
  * does the work; its return value is not needed.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Worker for pmap_enter_quick() and pmap_enter_object(): create a
  * read-only, unwired mapping of page "m" at "va" if none exists yet.
  *
  * "mpte" is a hint: the page table page used by a previous call, or
  * NULL.  The return value is the page table page that now backs "va"
  * and may be passed to the next call.  NULL is returned for kernel
  * addresses and whenever no mapping was created (page table page
  * allocation failed, a mapping already existed, or no pv entry was
  * available).
  *
  * Nothing here sleeps for memory: the page table page is requested
  * with M_NOWAIT and the pv entry through pmap_try_insert_pv_entry().
  *
  * Both the page queues mutex and the pmap lock must be held.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte)
 {
 	vm_page_t free;
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t ptepindex;
 		pd_entry_t *ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = pmap_pde_pindex(va);
 		/* The hinted page table page covers va: just reference it. */
 		if (mpte && (mpte->pindex == ptepindex)) {
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap_pde(pmap, va);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.
 			 */
 			if (ptepa && (*ptepa & PG_V) != 0) {
 				if (*ptepa & PG_PS)
 					panic("pmap_enter_quick: unexpected mapping into 2MB page");
 				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
 				/* Give up rather than wait for memory. */
 				mpte = _pmap_allocpte(pmap, ptepindex,
 				    M_NOWAIT);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 	} else {
 		mpte = NULL;
 	}
 
 	/*
 	 * This call to vtopte makes the assumption that we are
 	 * entering the page into the current pmap.  In order to support
 	 * quick entry into any pmap, one would likely use pmap_pte.
 	 * But that isn't as quick as vtopte.
 	 */
 	pte = vtopte(va);
 	/* Something is already mapped here: undo the reference and leave. */
 	if (*pte) {
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
 		/*
 		 * No pv entry could be had.  Release the page table page
 		 * reference taken above; if that releases the page table
 		 * page, flush the TLB before the page is freed.
 		 */
 		if (mpte != NULL) {
 			free = NULL;
 			if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				pmap_free_zero_pages(free);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap->pm_stats.resident_count++;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		pa |= pg_nx;
 
 	/*
 	 * Now validate mapping with RO protection
 	 */
 	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
 		pte_store(pte, pa | PG_V | PG_U);
 	else
 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 	return mpte;
 }
 
 /*
  * Create a temporary kernel mapping of the physical address "pa" in
  * page slot "i" of the crashdumpmap window and flush the TLB entry for
  * that slot.  This is only intended to be used for panic dumps.
  *
  * The value returned is always the base of crashdumpmap, not the
  * address of slot "i".
  */
 void *
 pmap_kenter_temporary(vm_paddr_t pa, int i)
 {
 	vm_offset_t slot_va;
 
 	slot_va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(slot_va, pa);
 	invlpg(slot_va);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  *
  * Only device objects are accepted.  Nothing is done unless both
  * "addr" and "size" are multiples of NBPDR, no valid page directory
  * entry exists yet at "addr", and the physical address of the page at
  * "pindex" is NBPDR-aligned.  In that case the range is mapped with
  * 2MB (PG_PS) page directory entries that cover consecutive physical
  * memory starting at that page.
  *
  * The object must be locked; the lock is dropped temporarily while
  * waiting for memory.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
 		    vm_object_t object, vm_pindex_t pindex,
 		    vm_size_t size)
 {
 	vm_offset_t va;
 	vm_page_t p, pdpg;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 	if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
 		vm_page_t m[1];
 		pd_entry_t ptepa, *pde;
 
 		/* Leave an existing mapping at addr alone. */
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, addr);
 		if (pde != 0 && (*pde & PG_V) != 0)
 			goto out;
 		PMAP_UNLOCK(pmap);
 retry:
 		/*
 		 * Find the first page of the object range, waiting for it
 		 * if it is busy, or bring it in through the pager.
 		 */
 		p = vm_page_lookup(object, pindex);
 		if (p != NULL) {
 			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
 				goto retry;
 		} else {
 			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				return;
 			m[0] = p;
 
 			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
 				vm_page_lock_queues();
 				vm_page_free(p);
 				vm_page_unlock_queues();
 				return;
 			}
 
 			/* Look the page up again after the pager has run. */
 			p = vm_page_lookup(object, pindex);
 			vm_page_lock_queues();
 			vm_page_wakeup(p);
 			vm_page_unlock_queues();
 		}
 
 		/* A 2MB mapping needs a 2MB-aligned physical address. */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;
 
 		p->valid = VM_PAGE_BITS_ALL;
 
 		PMAP_LOCK(pmap);
 		for (va = addr; va < addr + size; va += NBPDR) {
 			/*
 			 * Get the page directory page without sleeping.  On
 			 * failure, drop the pmap and object locks, wait for
 			 * memory with the page marked busy, and try again.
 			 */
 			while ((pdpg =
 			    pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) {
 				PMAP_UNLOCK(pmap);
 				vm_page_lock_queues();
 				vm_page_busy(p);
 				vm_page_unlock_queues();
 				VM_OBJECT_UNLOCK(object);
 				VM_WAIT;
 				VM_OBJECT_LOCK(object);
 				vm_page_lock_queues();
 				vm_page_wakeup(p);
 				vm_page_unlock_queues();
 				PMAP_LOCK(pmap);
 			}
 			/* Locate the directory entry through the direct map. */
 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 			pde = &pde[pmap_pde_index(va)];
 			if ((*pde & PG_V) == 0) {
 				pde_store(pde, ptepa | PG_PS | PG_M | PG_A |
 				    PG_U | PG_RW | PG_V);
 				pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 			} else {
 				/*
 				 * Already mapped: give back the reference
 				 * that pmap_allocpde() took.
 				 */
 				pdpg->wire_count--;
 				KASSERT(pdpg->wire_count > 0,
 				    ("pmap_object_init_pt: missing reference "
 				     "to page directory page, va: 0x%lx", va));
 			}
 			/* The next 2MB of virtual space maps the next 2MB. */
 			ptepa += NBPDR;
 		}
 		pmap_invalidate_all(pmap);
 out:
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Set or clear the wired attribute of the mapping
  *			at the given virtual address and keep the
  *			pmap's wired count in step.
  *	In/out conditions:
  *			The mapping must already exist in the pmap.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	pt_entry_t *pte;
 	boolean_t is_wired;
 
 	/*
 	 * Wiring is not a hardware characteristic so there is no need to
 	 * invalidate TLB.
 	 */
 	PMAP_LOCK(pmap);
 	pte = pmap_pte(pmap, va);
 	is_wired = (*pte & PG_W) != 0;
 	if (wired) {
 		if (!is_wired) {
 			pmap->pm_stats.wired_count++;
 			atomic_set_long(pte, PG_W);
 		}
 	} else if (is_wired) {
 		pmap->pm_stats.wired_count--;
 		atomic_clear_long(pte, PG_W);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *
  *	Nothing is copied unless source and destination addresses are
  *	equal and the source pmap is the current one.  Only 2MB mappings
  *	and mappings of managed pages are copied, and the copy stops
  *	quietly as soon as memory for it cannot be had without waiting.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 	  vm_offset_t src_addr)
 {
 	vm_page_t   free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
 	vm_offset_t va_next;
 
 	if (dst_addr != src_addr)
 		return;
 
 	/* The source page tables are read through vtopte() below. */
 	if (!pmap_is_current(src_pmap))
 		return;
 
 	vm_page_lock_queues();
 	/* Take the two pmap locks in address order. */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	for (addr = src_addr; addr < end_addr; addr = va_next) {
 		pt_entry_t *src_pte, *dst_pte;
 		vm_page_t dstmpde, dstmpte, srcmpte;
 		pml4_entry_t *pml4e;
 		pdp_entry_t *pdpe;
 		pd_entry_t srcptepaddr, *pde;
 
 		if (addr >= UPT_MIN_ADDRESS)
 			panic("pmap_copy: invalid to pmap_copy page tables");
 
 		/* Skip ahead to the next boundary when a level is invalid. */
 		pml4e = pmap_pml4e(src_pmap, addr);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (addr + NBPML4) & ~PML4MASK;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (addr + NBPDP) & ~PDPMASK;
 			continue;
 		}
 
 		va_next = (addr + NBPDR) & ~PDRMASK;
 
 		pde = pmap_pdpe_to_pde(pdpe, addr);
 		srcptepaddr = *pde;
 		if (srcptepaddr == 0)
 			continue;
 			
 		/* 2MB mapping: copy the directory entry, minus PG_W. */
 		if (srcptepaddr & PG_PS) {
 			dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT);
 			if (dstmpde == NULL)
 				break;
 			pde = (pd_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde));
 			pde = &pde[pmap_pde_index(addr)];
 			if (*pde == 0) {
 				*pde = srcptepaddr & ~PG_W;
 				dst_pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 			} else
 				dstmpde->wire_count--;
 			continue;
 		}
 
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
 		if (srcmpte->wire_count == 0)
 			panic("pmap_copy: source page table page is unused");
 
 		/* Do not run past the end of the requested range. */
 		if (va_next > end_addr)
 			va_next = end_addr;
 
 		src_pte = vtopte(addr);
 		while (addr < va_next) {
 			pt_entry_t ptetemp;
 			ptetemp = *src_pte;
 			/*
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
 				dstmpte = pmap_allocpte(dst_pmap, addr,
 				    M_NOWAIT);
 				if (dstmpte == NULL)
 					break;
 				dst_pte = (pt_entry_t *)
 				    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 				dst_pte = &dst_pte[pmap_pte_index(addr)];
 				if (*dst_pte == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
 					*dst_pte = ptetemp & ~(PG_W | PG_M |
 					    PG_A);
 					dst_pmap->pm_stats.resident_count++;
 	 			} else {
 					/*
 					 * Slot in use or no pv entry: give
 					 * back the page table page reference
 					 * taken by pmap_allocpte().
 					 */
 					free = NULL;
 					if (pmap_unwire_pte_hold(dst_pmap,
 					    addr, dstmpte, &free)) {
 					    	pmap_invalidate_page(dst_pmap,
 					 	    addr);
 				    	    	pmap_free_zero_pages(free);
 					}
 				}
 				/*
 				 * Stop scanning this page table page once
 				 * the destination holds as many references
 				 * as the source.
 				 */
 				if (dstmpte->wire_count >= srcmpte->wire_count)
 					break;
 			}
 			addr += PAGE_SIZE;
 			src_pte++;
 		}
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
 
 /*
  *	pmap_zero_page zeros the specified hardware page.  The page is
  *	addressed through the direct map and cleared with pagezero().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 
 	pagezero((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
 }
 
 /*
  *	pmap_zero_page_area zeros "size" bytes, starting "off" bytes
  *	into the specified hardware page.  The page is addressed
  *	through the direct map.  A request for the whole page is
  *	handed to pagezero(); anything else is cleared with bzero().
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	char *base;
 
 	base = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE)
 		bzero(base + off, size);
 	else
 		pagezero((void *)base);
 }
 
 /*
  *	pmap_zero_page_idle:
  *
  *	Zero an entire hardware page through its direct-map alias using
  *	pagezero().  This is intended to be called from the vm_pagezero
  *	process only and outside of Giant.
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 	void *page;
 
 	page = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	pagezero(page);
 }
 
 /*
  *	pmap_copy_page:
  *
  *	Copy the contents of page "msrc" into page "mdst".  Both pages
  *	are addressed through their direct-map aliases and the copy is
  *	performed by pagecopy().
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * Report whether "pmap" owns one of the first 16 pv entries on the
  * page's pv list.  The limit is arbitrary and may change; callers only
  * need TRUE to be returned for a small subset of pmaps for proper page
  * aging.  Fictitious pages always yield FALSE.
  *
  * The page queues lock must be held (asserted below for non-fictitious
  * pages).
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	int remaining;
 
 	if (m->flags & PG_FICTITIOUS)
 		return FALSE;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	remaining = 16;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (PV_PMAP(pv) == pmap)
 			return (TRUE);
 		/* Give up once the inspection budget is exhausted. */
 		if (--remaining == 0)
 			break;
 	}
 	return (FALSE);
 }
 
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
  * is special cased for current process only, but
  * can have the more generic (and slightly slower)
  * mode enabled.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  *
  * The routine walks the pmap's pv chunks rather than the page tables:
  * every in-use pv entry names a managed mapping, whose PTE is cleared
  * and whose pv entry is returned to the chunk's free bitmap.  Wired
  * mappings (PG_W) are left in place.  If called with a pmap other than
  * the current process's, a warning is printed and nothing is removed.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pt_entry_t *pte, tpte;
 	vm_page_t m, free = NULL;
 	pv_entry_t pv;
 	struct pv_chunk *pc, *npc;
 	int field, idx;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree;
 
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		/* Cleared below if any wired mapping keeps an entry in use. */
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
 			/* A clear bit in pc_map means "in use"; invert and mask. */
 			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
 			while (inuse != 0) {
 				/* Pick the lowest in-use entry and retire its bit. */
 				bit = bsfq(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				pte = vtopte(pv->pv_va);
 				tpte = *pte;
 
 				/* A pv entry must always have a live PTE behind it. */
 				if (tpte == 0) {
 					printf(
 					    "TPTE at %p  IS ZERO @ VA %08lx\n",
 					    pte, pv->pv_va);
 					panic("bad pte");
 				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time.
  * The pv entry stays allocated, so the chunk cannot be freed either.
  */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT(m < &vm_page_array[vm_page_array_size],
 					("pmap_remove_pages: bad tpte %#jx",
 					(uintmax_t)tpte));
 
 				pmap->pm_stats.resident_count--;
 
 				pte_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if (tpte & PG_M)
 					vm_page_dirty(m);
 
 				/* Mark free */
 				PV_STAT(pv_entry_frees++);
 				PV_STAT(pv_entry_spare++);
 				pv_entry_count--;
 				pc->pc_map[field] |= bitmask;
 				m->md.pv_list_count--;
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 				/* No mappings left, so the page is no longer writeable. */
 				if (TAILQ_EMPTY(&m->md.pv_list))
 					vm_page_flag_clear(m, PG_WRITEABLE);
 				/* Drop the page table page reference; may queue it on "free". */
 				pmap_unuse_pt(pmap, pv->pv_va,
 				    *vtopde(pv->pv_va), &free);
 			}
 		}
 		/* Every entry in the chunk was released: give the chunk page back. */
 		if (allfree) {
 			PV_STAT(pv_entry_spare -= _NPCPV);
 			PV_STAT(pc_chunk_count--);
 			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m->phys_addr);
 			vm_page_free(m);
 		}
 	}
 	/* One global invalidation instead of one per removed mapping. */
 	pmap_invalidate_all(pmap);
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 	/* Release the page table pages collected on "free" after unlocking. */
 	pmap_free_zero_pages(free);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Scan the page's pv list and report TRUE as soon as one mapping
  *	has PG_M set in its PTE.  Fictitious pages are never reported
  *	as modified.  The page queues lock must be held; each owning
  *	pmap is locked only while its PTE is examined.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pv_pmap;
 	boolean_t modified;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return (FALSE);
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	modified = FALSE;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pv_pmap = PV_PMAP(pv);
 		PMAP_LOCK(pv_pmap);
 		modified = (*pmap_pte(pv_pmap, pv->pv_va) & PG_M) != 0;
 		PMAP_UNLOCK(pv_pmap);
 		if (modified)
 			break;
 	}
 	return (modified);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault: the page directory entry covering it must exist and
  *	be valid, while the page table entry itself must not yet be valid.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pde;
 	boolean_t prefaultable;
 
 	prefaultable = FALSE;
 	PMAP_LOCK(pmap);
 	pde = pmap_pde(pmap, addr);
 	if (pde != NULL && (*pde & PG_V) != 0)
 		prefaultable = (*vtopte(addr) & PG_V) == 0;
 	PMAP_UNLOCK(pmap);
 	return (prefaultable);
 }
 
 /*
  * Revoke write access to the page in every pmap that maps it: PG_RW and
  * PG_M are cleared in each mapping's PTE.  If a mapping had PG_M set, the
  * modification is recorded on the page with vm_page_dirty() first.
  * Finally PG_WRITEABLE is cleared on the page.
  *
  * Fictitious pages and pages without PG_WRITEABLE are ignored.  The page
  * queues lock must be held.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pv_pmap;
 	pt_entry_t *pte, old;
 
 	if ((m->flags & PG_FICTITIOUS) != 0 ||
 	    (m->flags & PG_WRITEABLE) == 0)
 		return;
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pv_pmap = PV_PMAP(pv);
 		PMAP_LOCK(pv_pmap);
 		pte = pmap_pte(pv_pmap, pv->pv_va);
 		for (;;) {
 			old = *pte;
 			if ((old & PG_RW) == 0)
 				break;
 			/*
 			 * The PTE can change underneath us (the compare-and-set
 			 * may fail); re-read and try again until it sticks.
 			 */
 			if (!atomic_cmpset_long(pte, old, old &
 			    ~(PG_RW | PG_M)))
 				continue;
 			if ((old & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pv_pmap, pv->pv_va);
 			break;
 		}
 		PMAP_UNLOCK(pv_pmap);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	Each pv entry that is examined is moved to the tail of the page's
  *	pv list, so successive calls start with different mappings.  The
  *	scan stops once five reference bits have been cleared or when it
  *	comes back around to the entry it started with.
  *
  *	Fictitious pages always return 0.  The page queues lock must be
  *	held.
  *
  *	XXX: The exact number of bits to check and clear is a matter that
  *	should be tested and standardized at some point in the future for
  *	optimal aging of shared pages.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	pv_entry_t pv, pvf, pvn;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	int rtval = 0;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (rtval);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		/* Remember the starting entry so the rotation ends after one lap. */
 		pvf = pv;
 		do {
 			/* Grab the successor before pv is moved to the tail. */
 			pvn = TAILQ_NEXT(pv, pv_list);
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 			pmap = PV_PMAP(pv);
 			PMAP_LOCK(pmap);
 			pte = pmap_pte(pmap, pv->pv_va);
 			if ((*pte & PG_A) != 0) {
 				atomic_clear_long(pte, PG_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				rtval++;
 				/* Enough bits cleared; terminate the scan. */
 				if (rtval > 4)
 					pvn = NULL;
 			}
 			PMAP_UNLOCK(pmap);
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 	return (rtval);
 }
 
 /*
  *	pmap_clear_modify:
  *
  *	Clear PG_M in every mapping of the specified physical page,
  *	invalidating the TLB entry of each mapping that had it set.
  *	Fictitious pages are ignored.  The page queues lock must be held.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pv_pmap;
 	pt_entry_t *pte;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pv_pmap = PV_PMAP(pv);
 		PMAP_LOCK(pv_pmap);
 		pte = pmap_pte(pv_pmap, pv->pv_va);
 		if ((*pte & PG_M) != 0) {
 			atomic_clear_long(pte, PG_M);
 			pmap_invalidate_page(pv_pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pv_pmap);
 	}
 }
 
 /*
  *	pmap_clear_reference:
  *
  *	Clear PG_A in every mapping of the specified physical page,
  *	invalidating the TLB entry of each mapping that had it set.
  *	Fictitious pages are ignored.  The page queues lock must be held.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pv_pmap;
 	pt_entry_t *pte;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pv_pmap = PV_PMAP(pv);
 		PMAP_LOCK(pv_pmap);
 		pte = pmap_pte(pv_pmap, pv->pv_va);
 		if ((*pte & PG_A) != 0) {
 			atomic_clear_long(pte, PG_A);
 			pmap_invalidate_page(pv_pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pv_pmap);
 	}
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /*
  * Adjust the cache mode for a 4KB page mapped via a PTE.
  *
  * The PAT/PCD/PWT bits of the PTE for "va" are replaced with the bits
  * pmap_cache_bits() yields for "mode".  Nothing is written when the
  * PTE already carries the requested bits.
  */
 static __inline void
 pmap_pte_attr(vm_offset_t va, int mode)
 {
 	u_int *lo;
 	u_int cur, want;
 
 	/*
 	 * The cache mode bits are all in the low 32-bits of the
 	 * PTE, so we can just spin on updating the low 32-bits.
 	 */
 	lo = (u_int *)vtopte(va);
 	for (;;) {
 		cur = *lo;
 		want = cur & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
 		want |= pmap_cache_bits(mode, 0);
 		if (want == cur || atomic_cmpset_int(lo, cur, want))
 			break;
 	}
 }
 
 /*
  * Adjust the cache mode for a 2MB page mapped via a PDE.
  *
  * The PAT/PCD/PWT bits of the kernel PDE for "va" are replaced with the
  * bits pmap_cache_bits() yields for "mode".  Nothing is written when the
  * PDE already carries the requested bits.
  */
 static __inline void
 pmap_pde_attr(vm_offset_t va, int mode)
 {
 	u_int *lo;
 	u_int cur, want;
 
 	/*
 	 * The cache mode bits are all in the low 32-bits of the
 	 * PDE, so we can just spin on updating the low 32-bits.
 	 */
 	lo = (u_int *)pmap_pde(kernel_pmap, va);
 	for (;;) {
 		cur = *lo;
 		want = cur & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT);
 		want |= pmap_cache_bits(mode, 1);
 		if (want == cur || atomic_cmpset_int(lo, cur, want))
 			break;
 	}
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space with the given cache mode and return a pointer to
  * where "pa" is mapped.  This routine is intended to be used for
  * mapping device memory, NOT real memory.
  *
  * Write-back requests that lie below dmaplimit are satisfied from the
  * direct map without creating any new mapping; everything else gets
  * freshly allocated kernel virtual address space.  Panics if that
  * allocation fails.
  */
 void *
 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	vm_offset_t va, cur, pgoff;
 
 	/*
 	 * If this fits within the direct map window and use WB caching
 	 * mode, use the direct map.
 	 */
 	if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK)
 		return ((void *)PHYS_TO_DMAP(pa));
 
 	/* Widen the request to whole pages, remembering the sub-page offset. */
 	pgoff = pa & PAGE_MASK;
 	size = roundup(pgoff + size, PAGE_SIZE);
 	va = kmem_alloc_nofault(kernel_map, size);
 	if (va == 0)
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 
 	pa = trunc_page(pa);
 	cur = va;
 	while (size > 0) {
 		pmap_kenter_attr(cur, pa, mode);
 		cur += PAGE_SIZE;
 		pa += PAGE_SIZE;
 		size -= PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, va, cur);
 	pmap_invalidate_cache();
 	return ((void *)(va + pgoff));
 }
 
 /*
  * Map device memory into kernel virtual address space using the
  * uncacheable memory type; see pmap_mapdev_attr() for the details.
  */
 void *
 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 {
 
 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 }
 
 /*
  * Map a physical range into kernel virtual address space using the
  * write-back memory type; see pmap_mapdev_attr() for the details.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 
 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 }
 
 /*
  * Undo a mapping established by pmap_mapdev_attr(): remove the kernel
  * mappings covering [va, va + size), invalidate them and return the
  * virtual address range to kernel_map.  Addresses inside the direct
  * map were never separately mapped and are left alone.
  */
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	vm_offset_t first, limit, cur;
 
 	/* If we gave a direct map region in pmap_mapdev, do nothing */
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return;
 
 	first = trunc_page(va);
 	size = roundup((va & PAGE_MASK) + size, PAGE_SIZE);
 	limit = first + size;
 	cur = first;
 	while (cur < limit) {
 		pmap_kremove(cur);
 		cur += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, va, cur);
 	kmem_free(kernel_map, first, size);
 }
 
 /*
  * Change the cache mode of the kernel virtual address range
  * [va, va + size) to "mode".
  *
  * The range is widened to whole pages.  Every page in it must already
  * be mapped, either by a 4KB PTE or by a 2MB PDE that lies entirely
  * inside the range; splitting a 2MB page is not supported.
  *
  * Returns 0 on success, or EINVAL if the range is not a kernel range,
  * contains an unmapped page, or only partially covers a 2MB page.  The
  * range is validated in full before anything is modified, so a failure
  * leaves all mappings untouched.
  */
 int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, end, offset, tmpva;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = roundup(offset + size, PAGE_SIZE);
 	end = base + size;
 
 	/* Only supported on kernel virtual addresses. */
 	if (base <= VM_MAXUSER_ADDRESS)
 		return (EINVAL);
 
 	/*
 	 * XXX: We have to support tearing 2MB pages down into 4k pages if
 	 * needed here.
 	 */
 	/* Pages that aren't mapped aren't supported. */
 	for (tmpva = base; tmpva < end; ) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (pde == NULL || *pde == 0)
 			return (EINVAL);
 		if (*pde & PG_PS) {
 			/*
 			 * Handle 2MB pages that are completely contained:
 			 * the page must start at tmpva and end within the
 			 * range.  Otherwise the update pass below would
 			 * change memory outside the request and could wrap
 			 * its unsigned byte counter.
 			 */
 			if ((tmpva & (NBPDR - 1)) == 0 &&
 			    end - tmpva >= NBPDR) {
 				tmpva += NBPDR;
 				continue;
 			}
 			return (EINVAL);
 		}
 		/* Check the PTE of the page being visited, not of "va". */
 		pte = vtopte(tmpva);
 		if (*pte == 0)
 			return (EINVAL);
 		tmpva += PAGE_SIZE;
 	}
 
 	/*
 	 * Ok, all the pages exist, so run through them updating their
 	 * cache mode.
 	 */
 	for (tmpva = base; size > 0; ) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (*pde & PG_PS) {
 			pmap_pde_attr(tmpva, mode);
 			tmpva += NBPDR;
 			size -= NBPDR;
 		} else {
 			pmap_pte_attr(tmpva, mode);
 			tmpva += PAGE_SIZE;
 			size -= PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Flush CPU caches to make sure any data isn't cached that shouldn't
 	 * be, etc.
 	 */
 	pmap_invalidate_range(kernel_pmap, base, tmpva);
 	pmap_invalidate_cache();
 	return (0);
 }
 
 /*
  * Perform the pmap work for mincore: build the MINCORE_* flag set that
  * describes the page mapped at "addr" in "pmap".
  *
  * Returns 0 when nothing is mapped there and just MINCORE_INCORE for an
  * unmanaged mapping.  For managed pages the modified/referenced state is
  * taken from this mapping's PTE first and, failing that, from the page
  * and its other mappings.  Note that the fallback reference check uses
  * pmap_ts_referenced(), which clears the reference bits it counts.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	pt_entry_t *ptep, pte;
 	vm_page_t m;
 	int val;
 
 	/* Snapshot the PTE; the rest works on the copy. */
 	PMAP_LOCK(pmap);
 	ptep = pmap_pte(pmap, addr);
 	pte = (ptep != NULL) ? *ptep : 0;
 	PMAP_UNLOCK(pmap);
 
 	if (pte == 0)
 		return (0);
 
 	val = MINCORE_INCORE;
 	if ((pte & PG_MANAGED) == 0)
 		return (val);
 
 	m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 
 	if ((pte & PG_M) != 0) {
 		/*
 		 * Modified by us
 		 */
 		val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
 	} else {
 		/*
 		 * Modified by someone else
 		 */
 		vm_page_lock_queues();
 		if (m->dirty || pmap_is_modified(m))
 			val |= MINCORE_MODIFIED_OTHER;
 		vm_page_unlock_queues();
 	}
 
 	if ((pte & PG_A) != 0) {
 		/*
 		 * Referenced by us
 		 */
 		val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 	} else {
 		/*
 		 * Referenced by someone else
 		 */
 		vm_page_lock_queues();
 		if ((m->flags & PG_REFERENCED) ||
 		    pmap_ts_referenced(m)) {
 			val |= MINCORE_REFERENCED_OTHER;
 			vm_page_flag_set(m, PG_REFERENCED);
 		}
 		vm_page_unlock_queues();
 	}
 	return (val);
 }
 
 /*
  * Make the address space of the thread's process the active one on the
  * current CPU: clear this CPU's bit in the previously active pmap (if
  * there was one), set it in the new pmap, record the new page table
  * root in the thread's PCB and load it into %cr3.  The whole sequence
  * runs inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t next, prev;
 	u_int64_t root;
 
 	critical_enter();
 	next = vmspace_pmap(td->td_proc->p_vmspace);
 	prev = PCPU_GET(curpmap);
 #ifdef SMP
 	/* Only the clear is conditional; the set always happens. */
 	if (prev) {	/* XXX FIXME */
 		atomic_clear_int(&prev->pm_active, PCPU_GET(cpumask));
 	}
 	atomic_set_int(&next->pm_active, PCPU_GET(cpumask));
 #else
 	/* Only the clear is conditional; the set always happens. */
 	if (prev) {	/* XXX FIXME */
 		prev->pm_active &= ~PCPU_GET(cpumask);
 	}
 	next->pm_active |= PCPU_GET(cpumask);
 #endif
 	root = vtophys(next->pm_pml4);
 	td->td_pcb->pcb_cr3 = root;
 	load_cr3(root);
 	critical_exit();
 }
 
 /*
  * Suggest a virtual address for mapping "obj".  For device objects of
  * at least NBPDR bytes the address is rounded up to the next NBPDR
  * boundary; in every other case "addr" is returned unchanged.
  */
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {
 
 	if (obj != NULL && size >= NBPDR && obj->type == OBJT_DEVICE)
 		addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
 	return (addr);
 }
Index: head/sys/arm/arm/pmap.c
===================================================================
--- head/sys/arm/arm/pmap.c	(revision 173360)
+++ head/sys/arm/arm/pmap.c	(revision 173361)
@@ -1,4905 +1,4906 @@
 /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */
 /*-
  * Copyright 2004 Olivier Houchard.
  * Copyright 2003 Wasabi Systems, Inc.
  * All rights reserved.
  *
  * Written by Steve C. Woodford for Wasabi Systems, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed for the NetBSD Project by
  *      Wasabi Systems, Inc.
  * 4. The name of Wasabi Systems, Inc. may not be used to endorse
  *    or promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 2002-2003 Wasabi Systems, Inc.
  * Copyright (c) 2001 Richard Earnshaw
  * Copyright (c) 2001-2002 Christopher Gilbert
  * All rights reserved.
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*-
  * Copyright (c) 1999 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Charles M. Hannum.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *        This product includes software developed by the NetBSD
  *        Foundation, Inc. and its contributors.
  * 4. Neither the name of The NetBSD Foundation nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Mark Brinicombe.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  *
  * RiscBSD kernel project
  *
  * pmap.c
  *
  * Machine dependent vm stuff
  *
  * Created      : 20/09/94
  */
 
 /*
  * Special compilation symbols
  * PMAP_DEBUG           - Build in pmap_debug_level code
  */
 /* Include header files */
 
 #include "opt_vm.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/msgbuf.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 
 #include <vm/vm.h>
 #include <vm/uma.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <machine/md_var.h>
 #include <machine/vmparam.h>
 #include <machine/cpu.h>
 #include <machine/cpufunc.h>
 #include <machine/pcb.h>
 
 /*
  * Debug support.  With PMAP_DEBUG defined, PDEBUG(level, stmt) evaluates
  * "stmt" when pmap_debug_level is at least "level", dprintf is printf and
  * PMAP_INLINE expands to nothing.  Without it, PDEBUG and dprintf expand
  * to nothing and PMAP_INLINE is __inline.
  *
  * NOTE(review): the debug PDEBUG expands to a bare "if" without braces,
  * so using it directly before an "else" would capture that else.
  */
 #ifdef PMAP_DEBUG
 #define PDEBUG(_lev_,_stat_) \
         if (pmap_debug_level >= (_lev_)) \
                 ((_stat_))
 #define dprintf printf
 
 /* Current verbosity threshold compared against PDEBUG's level. */
 int pmap_debug_level = 0;
 #define PMAP_INLINE 
 #else   /* PMAP_DEBUG */
 #define PDEBUG(_lev_,_stat_) /* Nothing */
 #define dprintf(x, arg...)
 #define PMAP_INLINE __inline
 #endif  /* PMAP_DEBUG */
 
 /* Defined outside this file. */
 extern struct pv_addr systempage;
 /*
  * Internal function prototypes
  */
 static void pmap_free_pv_entry (pv_entry_t);
 static pv_entry_t pmap_get_pv_entry(void);
 
 static void		pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t,
     vm_prot_t, boolean_t, int);
 static void		pmap_vac_me_harder(struct vm_page *, pmap_t,
     vm_offset_t);
 static void		pmap_vac_me_kpmap(struct vm_page *, pmap_t, 
     vm_offset_t);
 static void		pmap_vac_me_user(struct vm_page *, pmap_t, vm_offset_t);
 static void		pmap_alloc_l1(pmap_t);
 static void		pmap_free_l1(pmap_t);
 static void		pmap_use_l1(pmap_t);
 
 static int		pmap_clearbit(struct vm_page *, u_int);
 
 static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t);
 static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t);
 static void		pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int);
 /* Maps a kernel page table's physical address back to its VA. */
 static vm_offset_t	kernel_pt_lookup(vm_paddr_t);
 
 /* malloc(9) type for this file, described as "PMAP L1". */
 static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1");
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 vm_offset_t pmap_curmaxkvaddr;
 vm_paddr_t kernel_l1pa;
 
 extern void *end;
 vm_offset_t kernel_vm_end = 0;
 
 /* Storage for the kernel's pmap and the pointer used to refer to it. */
 struct pmap kernel_pmap_store;
 pmap_t kernel_pmap;
 
 /*
  * NOTE(review): presumably the PTEs and virtual addresses of the
  * source/destination windows used when copying or zeroing pages, with
  * cmtx serialising their use -- confirm against the users further down
  * the file.
  */
 static pt_entry_t *csrc_pte, *cdst_pte;
 static vm_offset_t csrcp, cdstp;
 static struct mtx cmtx;
 
 /* Initialises L1 table metadata; defined below. */
 static void		pmap_init_l1(struct l1_ttable *, pd_entry_t *);
 /*
  * These routines are called when the CPU type is identified to set up
  * the PTE prototypes, cache modes, etc.
  *
  * The variables are always here, just in case LKMs need to reference
  * them (though, they shouldn't).
  *
  * The "_pt" variants are the cache modes applied to mappings of the
  * page tables themselves; they are filled in by the pmap_pte_init_*()
  * routines.
  */
 
 /* L1 section descriptors. */
 pt_entry_t	pte_l1_s_cache_mode;
 pt_entry_t	pte_l1_s_cache_mode_pt;
 pt_entry_t	pte_l1_s_cache_mask;
 
 /* L2 large-page descriptors. */
 pt_entry_t	pte_l2_l_cache_mode;
 pt_entry_t	pte_l2_l_cache_mode_pt;
 pt_entry_t	pte_l2_l_cache_mask;
 
 /* L2 small-page descriptors. */
 pt_entry_t	pte_l2_s_cache_mode;
 pt_entry_t	pte_l2_s_cache_mode_pt;
 pt_entry_t	pte_l2_s_cache_mask;
 
 /* L2 small-page protection bits: user, write, and the covering mask. */
 pt_entry_t	pte_l2_s_prot_u;
 pt_entry_t	pte_l2_s_prot_w;
 pt_entry_t	pte_l2_s_prot_mask;
 
 /* Descriptor prototypes. */
 pt_entry_t	pte_l1_s_proto;
 pt_entry_t	pte_l1_c_proto;
 pt_entry_t	pte_l2_s_proto;
 
 /* CPU-specific page copy/zero hooks, installed by pmap_pte_init_*(). */
 void		(*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t);
 void		(*pmap_zero_page_func)(vm_paddr_t, int, int);
 /*
  * Which pmap is currently 'live' in the cache
  *
  * XXXSCW: Fix for SMP ...
  */
 union pmap_cache_state *pmap_cache_state;
 
 struct msgbuf *msgbufp = 0;
 
 /* Page copy/zero primitives implemented outside this file. */
 extern void bcopy_page(vm_offset_t, vm_offset_t);
 extern void bzero_page(vm_offset_t);
 
 extern vm_offset_t alloc_firstaddr;
 
 char *_tmppt;
 
 /*
  * Metadata for L1 translation tables.
  *
  * One of these describes each L1 table; pmap_init_l1() fills it in and
  * links it onto both the list of all L1 tables and the L1 LRU list.
  */
 struct l1_ttable {
 	/* Entry on the L1 Table list */
 	SLIST_ENTRY(l1_ttable) l1_link;
 
 	/* Entry on the L1 Least Recently Used list */
 	TAILQ_ENTRY(l1_ttable) l1_lru;
 
 	/* Track how many domains are allocated from this L1 */
 	volatile u_int l1_domain_use_count;
 
 	/*
 	 * A free-list of domain numbers for this L1.
 	 * We avoid using ffs() and a bitmap to track domains since ffs()
 	 * is slow on ARM.
 	 *
 	 * l1_domain_first is the head of the list; pmap_init_l1() starts
 	 * it at 1 and sets l1_domain_free[i] to i + 2.
 	 */
 	u_int8_t l1_domain_first;
 	u_int8_t l1_domain_free[PMAP_DOMAINS];
 
 	/* Physical address of this L1 page table */
 	vm_paddr_t l1_physaddr;
 
 	/* KVA of this L1 page table */
 	pd_entry_t *l1_kva;
 };
 
 /*
  * Convert a virtual address into its L1 table index. That is, the
  * index used to locate the L2 descriptor table pointer in an L1 table.
  * This is basically used to index l1->l1_kva[].
  *
  * Each L2 descriptor table represents 1MB of VA space.
  */
 #define	L1_IDX(va)		(((vm_offset_t)(va)) >> L1_S_SHIFT)
 
 /*
  * L1 Page Tables are tracked using a Least Recently Used list.
  *  - New L1s are allocated from the HEAD.
  *  - Freed L1s are added to the TAIL.
  *  - Recently accessed L1s (where an 'access' is some change to one of
  *    the userland pmaps which owns this L1) are moved to the TAIL.
  */
 static TAILQ_HEAD(, l1_ttable) l1_lru_list;
 /*
  * A list of all L1 tables
  */
 static SLIST_HEAD(, l1_ttable) l1_list;
 /* NOTE(review): presumably protects l1_lru_list -- confirm at its users. */
 static struct mtx l1_lru_lock;
 
 /*
  * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots.
  *
  * This is normally 16MB worth L2 page descriptors for any given pmap.
  * Reference counts are maintained for L2 descriptors so they can be
  * freed when empty.
  *
  * Each bucket describes one L2 descriptor table: where it lives
  * (virtual and physical address), which L1 slot points at it, and how
  * many of its descriptors are in use.
  */
 struct l2_dtable {
 	/* The number of L2 page descriptors allocated to this l2_dtable */
 	u_int l2_occupancy;
 
 	/* List of L2 page descriptors */
 	struct l2_bucket {
 		pt_entry_t *l2b_kva;	/* KVA of L2 Descriptor Table */
 		vm_paddr_t l2b_phys;	/* Physical address of same */
 		u_short l2b_l1idx;	/* This L2 table's L1 index */
 		u_short l2b_occupancy;	/* How many active descriptors */
 	} l2_bucket[L2_BUCKET_SIZE];
 };
 
 /* pmap_kenter_internal flags (bit values, may be OR'ed together) */
 #define KENTER_CACHE	0x1
 #define KENTER_USER	0x2
 
 /*
  * Given an L1 table index, calculate the corresponding l2_dtable index
  * and bucket index within the l2_dtable.
  *
  * L2_BUCKET() keeps the low bits of the L1 index (the slot inside one
  * l2_dtable); L2_IDX() uses the bits above those to select the l2_dtable.
  */
 #define	L2_IDX(l1idx)		(((l1idx) >> L2_BUCKET_LOG2) & \
 				 (L2_SIZE - 1))
 #define	L2_BUCKET(l1idx)	((l1idx) & (L2_BUCKET_SIZE - 1))
 
 /*
  * Given a virtual address, this macro returns the
  * virtual address required to drop into the next L2 bucket.
  * That is the start of the following L1_S_SIZE-aligned region.
  */
 #define	L2_NEXT_BUCKET(va)	(((va) & L1_S_FRAME) + L1_S_SIZE)
 
 /*
  * L2 allocation.
  *
  * l2_dtable structures come from the l2table_zone UMA zone.  Allocation
  * uses M_NOWAIT, so it can return NULL.
  */
 #define	pmap_alloc_l2_dtable()		\
 		(void*)uma_zalloc(l2table_zone, M_NOWAIT|M_USE_RESERVE)
 #define	pmap_free_l2_dtable(l2)		\
 		uma_zfree(l2table_zone, l2)
 
 /*
  * We try to map the page tables write-through, if possible.  However, not
  * all CPUs have a write-through cache mode, so on those we have to sync
  * the cache when we frob page tables.
  *
  * We try to evaluate this at compile time, if possible.  However, it's
  * not always possible to do that, hence this run-time var.
  */
 int	pmap_needs_pte_sync;
 
 /*
  * Macro to determine if a mapping might be resident in the
  * instruction cache and/or TLB
  * (true only when both PVF_REF and PVF_EXEC are set in the flags)
  */
 #define	PV_BEEN_EXECD(f)  (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC))
 
 /*
  * Macro to determine if a mapping might be resident in the
  * data cache and/or TLB
  * (true when PVF_REF is set in the flags)
  */
 #define	PV_BEEN_REFD(f)   (((f) & PVF_REF) != 0)
 
 /* Default, overridable at build time. */
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 /*
  * A pmap is "current" if it is the kernel pmap or the pmap of the
  * current process's address space.
  */
 #define pmap_is_current(pm)	((pm) == pmap_kernel() || \
             curproc->p_vmspace->vm_map.pmap == (pm))
 /* UMA zones; l2table_zone backs pmap_alloc_l2_dtable(). */
 static uma_zone_t pvzone;
 uma_zone_t l2zone;
 static uma_zone_t l2table_zone;
 static vm_offset_t pmap_kernel_l2dtable_kva;
 static vm_offset_t pmap_kernel_l2ptp_kva;
 static vm_paddr_t pmap_kernel_l2ptp_phys;
 static struct vm_object pvzone_obj;
 /* pv entry accounting: current count, hard limit and high-water mark. */
 static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
 
 /*
  * This list exists for the benefit of pmap_map_chunk().  It keeps track
  * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can
  * find them as necessary.
  *
  * Note that the data on this list MUST remain valid after initarm() returns,
  * as pmap_bootstrap() uses it to construct L2 table metadata.
  *
  * kernel_pt_lookup() searches this list by physical address.
  */
 SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list);
 
 /*
  * Initialise the metadata "l1" for the L1 translation table at kernel
  * virtual address "l1pt": reset the domain bookkeeping, seed the table
  * with the kernel's L1 entries (unless it is the kernel's own table),
  * record its physical address and link it onto the global L1 lists.
  * Panics if the table's physical address cannot be determined.
  */
 static void
 pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt)
 {
 	pd_entry_t *kernel_l1pt;
 	int slot;
 
 	l1->l1_kva = l1pt;
 	l1->l1_domain_use_count = 0;
 	l1->l1_domain_first = 1;
 
 	/* Chain the domain free-list: each slot names its successor. */
 	for (slot = 0; slot < PMAP_DOMAINS; slot++)
 		l1->l1_domain_free[slot] = slot + 2;
 
 	/*
 	 * Copy the kernel's L1 entries to each new L1.
 	 */
 	kernel_l1pt = pmap_kernel()->pm_l1->l1_kva;
 	if (l1pt != kernel_l1pt)
 		memcpy(l1pt, kernel_l1pt, L1_TABLE_SIZE);
 
 	l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt);
 	if (l1->l1_physaddr == 0)
 		panic("pmap_init_l1: can't get PA of L1 at %p", l1pt);
 
 	SLIST_INSERT_HEAD(&l1_list, l1, l1_link);
 	TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 }
 
 /*
  * Translate the physical address of a kernel page table into its
  * virtual address by searching kernel_pt_list.  Returns 0 when no
  * entry on the list has that physical address.
  */
 static vm_offset_t
 kernel_pt_lookup(vm_paddr_t pa)
 {
 	struct pv_addr *entry;
 	vm_offset_t va;
 
 	va = 0;
 	SLIST_FOREACH(entry, &kernel_pt_list, pv_list) {
 		if (entry->pv_pa == pa) {
 			va = entry->pv_va;
 			break;
 		}
 	}
 	return (va);
 }
 
 #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0
 /*
  * Install the generic PTE prototypes, protection bits, cache modes and
  * page copy/zero hooks.  CPU-specific init routines call this first and
  * then override individual settings.
  */
 void
 pmap_pte_init_generic(void)
 {
 	int no_wb_op;
 
 	/* Descriptor prototypes and protection bits. */
 	pte_l1_s_proto = L1_S_PROTO_generic;
 	pte_l1_c_proto = L1_C_PROTO_generic;
 	pte_l2_s_proto = L2_S_PROTO_generic;
 
 	pte_l2_s_prot_u = L2_S_PROT_U_generic;
 	pte_l2_s_prot_w = L2_S_PROT_W_generic;
 	pte_l2_s_prot_mask = L2_S_PROT_MASK_generic;
 
 	/* Cache masks and the cache mode for ordinary mappings. */
 	pte_l1_s_cache_mask = L1_S_CACHE_MASK_generic;
 	pte_l2_l_cache_mask = L2_L_CACHE_MASK_generic;
 	pte_l2_s_cache_mask = L2_S_CACHE_MASK_generic;
 
 	pte_l1_s_cache_mode = L1_S_B|L1_S_C;
 	pte_l2_l_cache_mode = L2_B|L2_C;
 	pte_l2_s_cache_mode = L2_B|L2_C;
 
 	/*
 	 * If we have a write-through cache, set B and C.  If
 	 * we have a write-back cache, then we assume setting
 	 * only C will make those pages write-through.
 	 *
 	 * A no-op D-cache write-back routine is what identifies the
 	 * write-through case.
 	 */
 	no_wb_op = (cpufuncs.cf_dcache_wb_range == (void *) cpufunc_nullop);
 	pte_l1_s_cache_mode_pt = no_wb_op ? (L1_S_B|L1_S_C) : L1_S_C;
 	pte_l2_l_cache_mode_pt = no_wb_op ? (L2_B|L2_C) : L2_C;
 	pte_l2_s_cache_mode_pt = no_wb_op ? (L2_B|L2_C) : L2_C;
 
 	pmap_copy_page_func = pmap_copy_page_generic;
 	pmap_zero_page_func = pmap_zero_page_generic;
 }
 
 #if defined(CPU_ARM8)
 /*
  * pmap_pte_init_arm8:
  *
  *	ARM8 takes the generic settings, except that page-table pages
  *	must be mapped uncached.
  */
 void
 pmap_pte_init_arm8(void)
 {
 
 	pmap_pte_init_generic();
 
 	/* Replace the page-table cache modes chosen above: no B, no C. */
 	pte_l2_s_cache_mode_pt = 0;
 	pte_l2_l_cache_mode_pt = 0;
 	pte_l1_s_cache_mode_pt = 0;
 }
 #endif /* CPU_ARM8 */
 
 #if defined(CPU_ARM9) && defined(ARM9_CACHE_WRITE_THROUGH)
 /*
  * pmap_pte_init_arm9:
  *
  *	ARM9 takes the generic settings, but with write-through caching
  *	(C only) for both ordinary mappings and page-table pages.
  */
 void
 pmap_pte_init_arm9(void)
 {
 
 	pmap_pte_init_generic();
 
 	/* L1 sections. */
 	pte_l1_s_cache_mode = L1_S_C;
 	pte_l1_s_cache_mode_pt = L1_S_C;
 
 	/* L2 large pages. */
 	pte_l2_l_cache_mode = L2_C;
 	pte_l2_l_cache_mode_pt = L2_C;
 
 	/* L2 small pages. */
 	pte_l2_s_cache_mode = L2_C;
 	pte_l2_s_cache_mode_pt = L2_C;
 }
 #endif /* CPU_ARM9 */
 #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
 
 #if defined(CPU_ARM10)
 /*
  * pmap_pte_init_arm10:
  *
  *	ARM10 takes the generic settings.  Ordinary mappings are then set
  *	to B|C and page-table pages to C only, regardless of what the
  *	generic routine selected.
  */
 void
 pmap_pte_init_arm10(void)
 {
 
 	pmap_pte_init_generic();
 
 	/* L1 sections. */
 	pte_l1_s_cache_mode = L1_S_B | L1_S_C;
 	pte_l1_s_cache_mode_pt = L1_S_C;
 
 	/* L2 large pages. */
 	pte_l2_l_cache_mode = L2_B | L2_C;
 	pte_l2_l_cache_mode_pt = L2_C;
 
 	/* L2 small pages. */
 	pte_l2_s_cache_mode = L2_B | L2_C;
 	pte_l2_s_cache_mode_pt = L2_C;
 }
 #endif /* CPU_ARM10 */
 
 #if  ARM_MMU_SA1 == 1
 /*
  * pmap_pte_init_sa1:
  *
  *	The StrongARM SA-1 cache has no write-through mode.  Start from
  *	the generic settings, force the page-table cache mode back to
  *	B=1,C=1, and record that PTEs need to be sync'd.
  */
 void
 pmap_pte_init_sa1(void)
 {
 
 	pmap_pte_init_generic();
 
 	pmap_needs_pte_sync = 1;
 
 	pte_l2_s_cache_mode_pt = L2_B|L2_C;
 	pte_l2_l_cache_mode_pt = L2_B|L2_C;
 	pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C;
 }
 #endif /* ARM_MMU_SA1 == 1*/
 
 #if ARM_MMU_XSCALE == 1
 #if (ARM_NMMUS > 1) || defined (CPU_XSCALE_CORE3)
 /*
  * Written by pmap_pte_init_xscale(): set to 1 when ARM_NMMUS > 1 and
  * forced to 0 when CPU_XSCALE_CORE3 is defined.
  */
 static u_int xscale_use_minidata;
 #endif
 
 /*
  * pmap_pte_init_xscale:
  *
  *	Load the PTE prototype, protection and cache-mode variables with
  *	their XScale values, select the page copy/zero routines, and clear
  *	the P bit in the auxiliary control register.
  *
  *	Ordinary mappings default to B|C (plus the X TEX bit when
  *	XSCALE_CACHE_READ_WRITE_ALLOCATE is defined).  They are demoted to
  *	C only when write-through is selected at build time or, for
  *	CPU_XSCALE_PXA2X0, when the CPU id reports an affected PXA210/250
  *	revision.
  */
 void
 pmap_pte_init_xscale(void)
 {
 	uint32_t auxctl;
 	int write_through = 0;
 
 	pte_l1_s_cache_mode = L1_S_B|L1_S_C|L1_S_XSCALE_P;
 	pte_l1_s_cache_mask = L1_S_CACHE_MASK_xscale;
 
 	pte_l2_l_cache_mode = L2_B|L2_C;
 	pte_l2_l_cache_mask = L2_L_CACHE_MASK_xscale;
 
 	pte_l2_s_cache_mode = L2_B|L2_C;
 	pte_l2_s_cache_mask = L2_S_CACHE_MASK_xscale;
 
 	pte_l1_s_cache_mode_pt = L1_S_C;
 	pte_l2_l_cache_mode_pt = L2_C;
 	pte_l2_s_cache_mode_pt = L2_C;
 #ifdef XSCALE_CACHE_READ_WRITE_ALLOCATE
 	/*
 	 * The XScale core has an enhanced mode where writes that
 	 * miss the cache cause a cache line to be allocated.  This
 	 * is significantly faster than the traditional, write-through
 	 * behavior of this case.
 	 */
 	pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_X);
 	pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_X);
 	pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_X);
 #endif /* XSCALE_CACHE_READ_WRITE_ALLOCATE */
 #ifdef XSCALE_CACHE_WRITE_THROUGH
 	/*
 	 * Some versions of the XScale core have various bugs in
 	 * their cache units, the work-around for which is to run
 	 * the cache in write-through mode.  Unfortunately, this
 	 * has a major (negative) impact on performance.  So, we
 	 * go ahead and run fast-and-loose, in the hopes that we
 	 * don't line up the planets in a way that will trip the
 	 * bugs.
 	 *
 	 * However, we give you the option to be slow-but-correct.
 	 */
 	write_through = 1;
 #elif defined(XSCALE_CACHE_WRITE_BACK)
 	/* force write back cache mode */
 	write_through = 0;
 #elif defined(CPU_XSCALE_PXA2X0)
 	/*
 	 * Intel PXA2[15]0 processors are known to have a bug in
 	 * write-back cache on revision 4 and earlier (stepping
 	 * A[01] and B[012]).  Fixed for C0 and later.
 	 */
 	{
 		uint32_t id, type;
 
 		id = cpufunc_id();
 		type = id & ~(CPU_ID_XSCALE_COREREV_MASK|CPU_ID_REVISION_MASK);
 
 		if (type == CPU_ID_PXA250 || type == CPU_ID_PXA210) {
 			if ((id & CPU_ID_REVISION_MASK) < 5) {
 				/* write through for stepping A0-1 and B0-2 */
 				write_through = 1;
 			}
 		}
 	}
 #endif /* XSCALE_CACHE_WRITE_THROUGH */
 
 	if (write_through) {
 		pte_l1_s_cache_mode = L1_S_C;
 		pte_l2_l_cache_mode = L2_C;
 		pte_l2_s_cache_mode = L2_C;
 	}
 
 #if (ARM_NMMUS > 1)
 	xscale_use_minidata = 1;
 #endif
 
 	pte_l2_s_prot_u = L2_S_PROT_U_xscale;
 	pte_l2_s_prot_w = L2_S_PROT_W_xscale;
 	pte_l2_s_prot_mask = L2_S_PROT_MASK_xscale;
 
 	pte_l1_s_proto = L1_S_PROTO_xscale;
 	pte_l1_c_proto = L1_C_PROTO_xscale;
 	pte_l2_s_proto = L2_S_PROTO_xscale;
 
 #ifdef CPU_XSCALE_CORE3
 	pmap_copy_page_func = pmap_copy_page_generic;
 	pmap_zero_page_func = pmap_zero_page_generic;
 	xscale_use_minidata = 0;
 	/*
 	 * Make sure it is L2-cachable.  Page-table pages take the same
 	 * mode as ordinary mappings of the same descriptor type; the L1
 	 * section variant additionally drops the P bit.
 	 */
 	pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_T);
 	pte_l1_s_cache_mode_pt = pte_l1_s_cache_mode &~ L1_S_XSCALE_P;
 	pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_T);
 	/*
 	 * This must come from the L2 large-page mode.  The L1 section
 	 * mode carries L1_S_XSCALE_P, which is an L1 section bit and is
 	 * not part of the L2 large-page cache mode built above.
 	 */
 	pte_l2_l_cache_mode_pt = pte_l2_l_cache_mode;
 	pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_T);
 	pte_l2_s_cache_mode_pt = pte_l2_s_cache_mode;
 
 #else
 	pmap_copy_page_func = pmap_copy_page_xscale;
 	pmap_zero_page_func = pmap_zero_page_xscale;
 #endif
 
 	/*
 	 * Disable ECC protection of page table access, for now.
 	 */
 	__asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl));
 	auxctl &= ~XSCALE_AUXCTL_P;
 	__asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl));
 }
 
 /*
  * xscale_setup_minidata:
  *
  *	Map the mini-data cache clean area and switch the mini-data cache
  *	to write-back with read/write-allocate.  `l1pt' is the L1 table
  *	to walk, `va'/`pa' the start of the area; its length comes from
  *	xscale_minidata_clean_size.  The caller must supply physically
  *	and virtually contiguous space of the right size.
  *
  *	Panics if kernel_pt_lookup() cannot find the L2 table for a page.
  */
 extern vm_offset_t xscale_minidata_clean_addr;
 extern vm_size_t xscale_minidata_clean_size; /* already initialized */
 void
 xscale_setup_minidata(vm_offset_t l1pt, vm_offset_t va, vm_paddr_t pa)
 {
 	pd_entry_t *l1 = (pd_entry_t *) l1pt;
 	pt_entry_t *l2;
 	vm_size_t remaining;
 	uint32_t auxctl;
 
 	xscale_minidata_clean_addr = va;
 
 	/* Round the area up to a whole number of small pages. */
 	remaining = (xscale_minidata_clean_size + L2_S_OFFSET) & L2_S_FRAME;
 
 	/* Enter one read-only kernel small-page mapping per page. */
 	while (remaining != 0) {
 		l2 = (pt_entry_t *) kernel_pt_lookup(
 		    l1[L1_IDX(va)] & L1_C_ADDR_MASK);
 		if (l2 == NULL)
 			panic("xscale_setup_minidata: can't find L2 table for "
 			    "VA 0x%08x", (u_int32_t) va);
 		l2[l2pte_index(va)] =
 		    L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) |
 		    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);
 
 		va += L2_S_SIZE;
 		pa += L2_S_SIZE;
 		remaining -= L2_S_SIZE;
 	}
 
 	/*
 	 * Reconfiguring the mini-data cache requires that it hold no
 	 * valid data, hence the global data cache invalidate first.
 	 *
 	 * THE CALLER MUST BE RUNNING UN-CACHED WHEN THIS ROUTINE IS
 	 * CALLED!  THIS IS VERY IMPORTANT!
 	 */
 
 	/* Invalidate data and mini-data. */
 	__asm __volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0));
 
 	/* Select write-back with read/write-allocate in the MD field. */
 	__asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl));
 	auxctl &= ~XSCALE_AUXCTL_MD_MASK;
 	auxctl |= XSCALE_AUXCTL_MD_WB_RWA;
 	__asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl));
 }
 #endif
 
 /*
  * Allocate an L1 translation table for the specified pmap.
  * This is called at pmap creation time.
  *
  * Takes the L1 at the head of the LRU list, pops the first free domain
  * number from it and records both in `pm'.
  *
  * Free-list layout: pmap_init_l1() starts l1_domain_first at 1 and
  * seeds l1_domain_free[0 .. PMAP_DOMAINS - 1] with i + 2.  Slot (d - 1)
  * therefore holds the domain that follows domain `d' on the free list,
  * so the array must be indexed with (domain - 1).  Indexing with the
  * domain number itself would skip every other domain and read past the
  * slots that pmap_init_l1() seeded.
  */
 static void
 pmap_alloc_l1(pmap_t pm)
 {
 	struct l1_ttable *l1;
 	u_int8_t domain;
 
 	/*
 	 * Remove the L1 at the head of the LRU list
 	 */
 	mtx_lock(&l1_lru_lock);
 	l1 = TAILQ_FIRST(&l1_lru_list);
 	TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 
 	/*
 	 * Pick the first available domain number, and update
 	 * the link to the next number.
 	 */
 	domain = l1->l1_domain_first;
 	l1->l1_domain_first = l1->l1_domain_free[domain - 1];
 
 	/*
 	 * If there are still free domain numbers in this L1,
 	 * put it back on the TAIL of the LRU list.
 	 */
 	if (++l1->l1_domain_use_count < PMAP_DOMAINS)
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 
 	mtx_unlock(&l1_lru_lock);
 
 	/*
 	 * Fix up the relevant bits in the pmap structure
 	 */
 	pm->pm_l1 = l1;
 	pm->pm_domain = domain;
 }
 
 /*
  * Free an L1 translation table.
  * This is called at pmap destruction time.
  *
  * Pushes pm->pm_domain back onto the free list of the pmap's L1 and
  * re-queues the L1 on the LRU list.  The free-list slot for a domain
  * is (domain - 1), matching pmap_alloc_l1().
  */
 static void
 pmap_free_l1(pmap_t pm)
 {
 	struct l1_ttable *l1 = pm->pm_l1;
 
 	mtx_lock(&l1_lru_lock);
 
 	/*
 	 * If this L1 is currently on the LRU list, remove it.
 	 */
 	if (l1->l1_domain_use_count < PMAP_DOMAINS)
 		TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 
 	/*
 	 * Free up the domain number which was allocated to the pmap
 	 */
 	l1->l1_domain_free[pm->pm_domain - 1] = l1->l1_domain_first;
 	l1->l1_domain_first = pm->pm_domain;
 	l1->l1_domain_use_count--;
 
 	/*
 	 * The L1 now must have at least 1 free domain, so add
 	 * it back to the LRU list. If the use count is zero,
 	 * put it at the head of the list, otherwise it goes
 	 * to the tail.
 	 */
 	if (l1->l1_domain_use_count == 0)
 		TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru);
 	else
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 
 	mtx_unlock(&l1_lru_lock);
 }
 
 /*
  * Note a use of the pmap's L1 by moving it to the tail of the LRU list.
  * Nothing is done for the kernel pmap, or when the L1's domain use
  * count equals PMAP_DOMAINS (the L1 is then not on the LRU list).
  */
 static PMAP_INLINE void
 pmap_use_l1(pmap_t pm)
 {
 	struct l1_ttable *l1;
 
 	/*
 	 * Do nothing if we're in interrupt context.
 	 * Access to an L1 by the kernel pmap must not affect
 	 * the LRU list.
 	 */
 	if (pm == pmap_kernel())
 		return;
 
 	l1 = pm->pm_l1;
 
 	/* Unlocked early-out: a fully used L1 is not on the LRU list. */
 	if (l1->l1_domain_use_count == PMAP_DOMAINS)
 		return;
 
 	mtx_lock(&l1_lru_lock);
 
 	/*
 	 * The use count may have changed before the lock was taken, so
 	 * test it again; only requeue the L1 if it is still on the list.
 	 */
 	if (l1->l1_domain_use_count != PMAP_DOMAINS) {
 		TAILQ_REMOVE(&l1_lru_list, l1, l1_lru);
 		TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru);
 	}
 
 	mtx_unlock(&l1_lru_lock);
 }
 
 
 /*
  * Look up the L2 bucket that covers `va' in pmap `pm'.
  *
  * Returns NULL when the pmap has no l2_dtable for the address, or when
  * the bucket exists but has no L2 page table attached (l2b_kva is NULL).
  */
 static PMAP_INLINE struct l2_bucket *
 pmap_get_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *dtable;
 	struct l2_bucket *bucket;
 	u_short l1idx;
 
 	l1idx = L1_IDX(va);
 
 	dtable = pm->pm_l2[L2_IDX(l1idx)];
 	if (dtable == NULL)
 		return (NULL);
 
 	bucket = &dtable->l2_bucket[L2_BUCKET(l1idx)];
 	if (bucket->l2b_kva == NULL)
 		return (NULL);
 
 	return (bucket);
 }
 
 /*
  * Returns a pointer to the L2 bucket associated with the specified pmap
  * and VA.
  *
  * If no L2 bucket exists, perform the necessary allocations to put an L2
  * bucket/page table in place.  Returns NULL if either allocation fails.
  *
  * The pmap lock and the page queues lock must be held on entry.  Both
  * are dropped around each allocation and re-taken afterwards, so the
  * state is re-checked after every allocation in case another thread
  * installed the same l2_dtable or L2 page table in the meantime.
  *
  * Note that if a new L2 bucket/page was allocated, the caller *must*
  * increment the bucket occupancy counter appropriately *before*
  * releasing the pmap's lock to ensure no other thread or cpu deallocates
  * the bucket/page in the meantime.
  */
 static struct l2_bucket *
 pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	u_short l1idx;
 
 	l1idx = L1_IDX(va);
 
 	PMAP_ASSERT_LOCKED(pm);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
 		/*
 		 * No mapping at this address, as there is
 		 * no entry in the L1 table.
 		 * Need to allocate a new l2_dtable.
 		 */
 again_l2table:
 		PMAP_UNLOCK(pm);
 		vm_page_unlock_queues();
 		if ((l2 = pmap_alloc_l2_dtable()) == NULL) {
 			vm_page_lock_queues();
 			PMAP_LOCK(pm);
 			return (NULL);
 		}
 		vm_page_lock_queues();
 		PMAP_LOCK(pm);
 		if (pm->pm_l2[L2_IDX(l1idx)] != NULL) {
 			PMAP_UNLOCK(pm);
 			vm_page_unlock_queues();
 			uma_zfree(l2table_zone, l2);
 			vm_page_lock_queues();
 			PMAP_LOCK(pm);
 			l2 = pm->pm_l2[L2_IDX(l1idx)];
 			if (l2 == NULL)
 				goto again_l2table;
 			/*
 			 * Someone already allocated the l2_dtable while
 			 * we were doing the same.
 			 */
 		} else {
 			bzero(l2, sizeof(*l2));
 			/*
 			 * Link it into the parent pmap
 			 */
 			pm->pm_l2[L2_IDX(l1idx)] = l2;
 		}
 	}
 
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 
 	/*
 	 * Fetch pointer to the L2 page table associated with the address.
 	 */
 	if (l2b->l2b_kva == NULL) {
 		pt_entry_t *ptep;
 
 		/*
 		 * No L2 page table has been allocated. Chances are, this
 		 * is because we just allocated the l2_dtable, above.
 		 */
 again_ptep:
 		PMAP_UNLOCK(pm);
 		vm_page_unlock_queues();
 		ptep = (void*)uma_zalloc(l2zone, M_NOWAIT|M_USE_RESERVE);
 		vm_page_lock_queues();
 		PMAP_LOCK(pm);
 		if (l2b->l2b_kva != 0) {
 			/*
 			 * We lost the race.  Our own allocation may also
 			 * have failed (M_NOWAIT), in which case there is
 			 * nothing to give back to the zone.
 			 */
 			if (ptep != NULL) {
 				PMAP_UNLOCK(pm);
 				vm_page_unlock_queues();
 				uma_zfree(l2zone, ptep);
 				vm_page_lock_queues();
 				PMAP_LOCK(pm);
 				if (l2b->l2b_kva == 0)
 					goto again_ptep;
 			}
 			return (l2b);
 		}
 		if (ptep == NULL) {
 			/*
 			 * Oops, no more L2 page tables available at this
 			 * time. We may need to deallocate the l2_dtable
 			 * if we allocated a new one above.
 			 */
 			if (l2->l2_occupancy == 0) {
 				pm->pm_l2[L2_IDX(l1idx)] = NULL;
 				pmap_free_l2_dtable(l2);
 			}
 			return (NULL);
 		}
 
 		/*
 		 * Only translate the table's address once the allocation is
 		 * known to have succeeded; vtophys() of NULL is meaningless.
 		 */
 		l2b->l2b_phys = vtophys(ptep);
 		l2->l2_occupancy++;
 		l2b->l2b_kva = ptep;
 		l2b->l2b_l1idx = l1idx;
 	}
 
 	return (l2b);
 }
 
 #ifdef PMAP_INCLUDE_PTE_SYNC
 /*
  * Return an L2 page table to l2zone.
  *
  * With a write-back cache the table may need to be sync'd before it is
  * re-used: if it belonged to a non-current pmap, the cache syncs were
  * skipped while its pages were being unmapped.  Were the table then
  * handed straight to the *current* pmap, stale mappings not yet written
  * back could still be visible to the mmu.  The caller requests the sync
  * through `need_sync'.
  */
 static PMAP_INLINE void
 pmap_free_l2_ptp(boolean_t need_sync, pt_entry_t *l2)
 {
 	if (need_sync)
 		PTE_SYNC_RANGE(l2, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 	uma_zfree(l2zone, l2);
 }
 #else
 /*
  * Return an L2 page table to l2zone.
  */
 static PMAP_INLINE void
 pmap_free_l2_ptp(pt_entry_t *l2)
 {
 	uma_zfree(l2zone, l2);
 }
 #endif
 /*
  * Account for `count' PTEs in bucket `l2b' of pmap `pm' having just
  * been invalidated, and garbage collect the L2 page table and the
  * owning l2_dtable once they become empty.
  *
  * The pmap lock must be acquired when this is called (not necessary
  * for the kernel pmap).
  */
 static void
 pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t *l1slot, l1bits;
 	pt_entry_t *l2table;
 	u_short l1idx;
 
 	/* Drop the references for the PTEs the caller invalidated. */
 	l2b->l2b_occupancy -= count;
 
 	/*
 	 * Level 2 page tables allocated to the kernel pmap are never
 	 * freed, as that would require checking all Level 1 page tables
 	 * and removing any references to the Level 2 page table.  See
 	 * also the comment elsewhere about never freeing bootstrap L2
 	 * descriptors.  The mapping in the L2 table has already been
 	 * invalidated, and that is all that is done for them.
 	 *
 	 * This isn't really a big deal in practice and, in fact, leads
 	 * to a performance win over time as we don't need to continually
 	 * alloc/free.
 	 */
 	if (pm == pmap_kernel() || l2b->l2b_occupancy > 0)
 		return;
 
 	/*
 	 * The L2 page table holds no valid mappings any more: detach it
 	 * from the bucket before releasing it.
 	 */
 	l2table = l2b->l2b_kva;
 	l1idx = l2b->l2b_l1idx;
 	l2b->l2b_kva = NULL;
 
 	/*
 	 * Invalidate the L1 slot, but only if it is a coarse descriptor
 	 * carrying this pmap's domain number.
 	 */
 	l1slot = &pm->pm_l1->l1_kva[l1idx];
 	l1bits = *l1slot & (L1_TYPE_MASK | L1_C_DOM_MASK);
 	if (l1bits == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) {
 		*l1slot = 0;
 		PTE_SYNC(l1slot);
 	}
 
 	/* Hand the L2 page table back to the pool cache. */
 #ifdef PMAP_INCLUDE_PTE_SYNC
 	pmap_free_l2_ptp(!pmap_is_current(pm), l2table);
 #else
 	pmap_free_l2_ptp(l2table);
 #endif
 
 	/* One bucket fewer in use in the owning l2_dtable. */
 	dtable = pm->pm_l2[L2_IDX(l1idx)];
 	dtable->l2_occupancy--;
 	if (dtable->l2_occupancy > 0)
 		return;
 
 	/*
 	 * None of the Level 1 slots managed by this l2_dtable has a
 	 * valid mapping left: unhook it from the pmap and free it.
 	 */
 	pm->pm_l2[L2_IDX(l1idx)] = NULL;
 	pmap_free_l2_dtable(dtable);
 }
 
 /*
  * Pool cache constructors for L2 descriptor tables, metadata and pmap
  * structures.
  */
 /*
  * Constructor for L2 page tables.
  *
  * Zeroes L2_TABLE_SIZE_REAL bytes at `mem' and syncs them.  When
  * PMAP_INCLUDE_PTE_SYNC is not defined, it first makes sure the kernel
  * small-page mapping of the page containing `mem' uses the page-table
  * cache mode.  `size', `arg' and `flags' are unused.  Always returns 0.
  */
 static int
 pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags)
 {
 #ifndef PMAP_INCLUDE_PTE_SYNC
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 #ifdef ARM_USE_SMALL_ALLOC
 	pd_entry_t *pde;
 #endif
 	/* Page-aligned address of the page that contains `mem'. */
 	vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK;
 
 	/*
 	 * The mappings for these page tables were initially made using
 	 * pmap_kenter() by the pool subsystem. Therefore, the cache-
 	 * mode will not be right for page table mappings. To avoid
 	 * polluting the pmap_kenter() code with a special case for
 	 * page tables, we simply fix up the cache-mode here if it's not
 	 * correct.
 	 */
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Section mappings have no L2 PTE to fix up; skip them. */
 	pde = &kernel_pmap->pm_l1->l1_kva[L1_IDX(va)];
 	if (!l1pte_section_p(*pde)) {
 #endif
 		/*
 		 * NOTE(review): l2b is dereferenced without a NULL check;
 		 * this presumably relies on the kernel L2 table for `va'
 		 * always existing -- confirm.
 		 */
 		l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 		ptep = &l2b->l2b_kva[l2pte_index(va)];
 		pte = *ptep;
 		
 		if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
 			/*
 			 * Page tables must have the cache-mode set to 
 			 * Write-Thru.
 			 */
 			*ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
 			PTE_SYNC(ptep);
 			cpu_tlb_flushD_SE(va);
 			cpu_cpwait();
 		}
 #ifdef ARM_USE_SMALL_ALLOC
 	}
 #endif
 #endif
 	/* Start from an empty table and make that visible in RAM. */
 	memset(mem, 0, L2_TABLE_SIZE_REAL);
 	PTE_SYNC_RANGE(mem, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 	return (0);
 }
 
 /*
  * The routines below flush the caches/TLB only when pmap_is_current()
  * reports that the specified pmap needs it at this time.
  */
 
 /* Flush the I+D TLB entry for `va' if `pm' is current. */
 static PMAP_INLINE void
 pmap_tlb_flushID_SE(pmap_t pm, vm_offset_t va)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushID_SE(va);
 }
 
 /* Flush the D TLB entry for `va' if `pm' is current. */
 static PMAP_INLINE void
 pmap_tlb_flushD_SE(pmap_t pm, vm_offset_t va)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushD_SE(va);
 }
 
 /* Flush the whole I+D TLB if `pm' is current. */
 static PMAP_INLINE void
 pmap_tlb_flushID(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushID();
 }
 /* Flush the whole D TLB if `pm' is current. */
 static PMAP_INLINE void
 pmap_tlb_flushD(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_tlb_flushD();
 }
 
 /* Write back and invalidate I+D cache for [va, va + len) if `pm' is current. */
 static PMAP_INLINE void
 pmap_idcache_wbinv_range(pmap_t pm, vm_offset_t va, vm_size_t len)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_idcache_wbinv_range(va, len);
 }
 
 /*
  * D-cache maintenance on [va, va + len), performed only if `pm' is
  * current:
  *
  *	do_inv && rd_only	invalidate
  *	do_inv && !rd_only	write back and invalidate
  *	!do_inv && !rd_only	write back
  *	!do_inv && rd_only	nothing
  */
 static PMAP_INLINE void
 pmap_dcache_wb_range(pmap_t pm, vm_offset_t va, vm_size_t len,
     boolean_t do_inv, boolean_t rd_only)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 
 	if (do_inv && rd_only)
 		cpu_dcache_inv_range(va, len);
 	else if (do_inv)
 		cpu_dcache_wbinv_range(va, len);
 	else if (!rd_only)
 		cpu_dcache_wb_range(va, len);
 }
 
 /* Write back and invalidate the entire I+D cache if `pm' is current. */
 static PMAP_INLINE void
 pmap_idcache_wbinv_all(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_idcache_wbinv_all();
 }
 
 /* Write back and invalidate the entire D cache if `pm' is current. */
 static PMAP_INLINE void
 pmap_dcache_wbinv_all(pmap_t pm)
 {
 
 	if (!pmap_is_current(pm))
 		return;
 	cpu_dcache_wbinv_all();
 }
 
 /*
  * PTE_SYNC_CURRENT:
  *
  *     Make sure the pte is written out to RAM.
  *     We need to do this in any of the following cases:
  *       - We're dealing with the kernel pmap
  *       - There is no pmap active in the cache/tlb.
  *       - The specified pmap is 'active' in the cache/tlb.
  *
  *     The macro itself calls PTE_SYNC(ptep) when PMAP_NEEDS_PTE_SYNC is
  *     true and pmap_is_current(pm) holds.  It expands to nothing when
  *     PMAP_INCLUDE_PTE_SYNC is not defined.
  */
 #ifdef PMAP_INCLUDE_PTE_SYNC
 #define	PTE_SYNC_CURRENT(pm, ptep)	\
 do {					\
 	if (PMAP_NEEDS_PTE_SYNC && 	\
 	    pmap_is_current(pm))	\
 		PTE_SYNC(ptep);		\
 } while (/*CONSTCOND*/0)
 #else
 #define	PTE_SYNC_CURRENT(pm, ptep)	/* nothing */
 #endif
 
 /*
  * Since we have a virtually indexed cache, we may need to inhibit caching if
  * there is more than one mapping and at least one of them is writable.
  * Since we purge the cache on every context switch, we only need to check for
  * other mappings within the same pmap, or kernel_pmap.
  * This function is also called when a page is unmapped, to possibly reenable
  * caching on any remaining mappings.
  *
  * The code implements the following logic, where:
  *
  * KW = # of kernel read/write pages
  * KR = # of kernel read only pages
  * UW = # of user read/write pages
  * UR = # of user read only pages
  * 
  * KC = kernel mapping is cacheable
  * UC = user mapping is cacheable
  *
  *               KW=0,KR=0  KW=0,KR>0  KW=1,KR=0  KW>1,KR>=0
  *             +---------------------------------------------
  * UW=0,UR=0   | ---        KC=1       KC=1       KC=0
  * UW=0,UR>0   | UC=1       KC=1,UC=1  KC=0,UC=0  KC=0,UC=0
  * UW=1,UR=0   | UC=1       KC=0,UC=0  KC=0,UC=0  KC=0,UC=0
  * UW>1,UR>=0  | UC=0       KC=0,UC=0  KC=0,UC=0  KC=0,UC=0
  *
  * The table below is indexed [user][kernel] by pmap_get_vac_flags().
  * Each index has bit 1 set when there is at least one read/write
  * mapping, and bit 0 set when there is a read-only mapping or more than
  * one read/write mapping.  A single read/write mapping combined with
  * read-only mappings therefore also selects the last row/column.
  * An entry of -1 marks the "no mappings at all" case.
  */
 
 static const int pmap_vac_flags[4][4] = {
 	{-1,		0,		0,		PVF_KNC},
 	{0,		0,		PVF_NC,		PVF_NC},
 	{0,		PVF_NC,		PVF_NC,		PVF_NC},
 	{PVF_UNC,	PVF_NC,		PVF_NC,		PVF_NC}
 };
 
 /*
  * Reduce the page's kernel and user mapping counts to a pair of 2-bit
  * indices and return the matching pmap_vac_flags[][] entry.
  *
  * For each of kernel and user: bit 1 is set when there is any
  * read/write mapping; bit 0 is set when there is any read-only mapping
  * or more than one read/write mapping.
  */
 static PMAP_INLINE int
 pmap_get_vac_flags(const struct vm_page *pg)
 {
 	int kidx, uidx;
 
 	kidx = (pg->md.krw_mappings ? 2 : 0) |
 	    ((pg->md.kro_mappings || pg->md.krw_mappings > 1) ? 1 : 0);
 
 	uidx = (pg->md.urw_mappings ? 2 : 0) |
 	    ((pg->md.uro_mappings || pg->md.urw_mappings > 1) ? 1 : 0);
 
 	return (pmap_vac_flags[uidx][kidx]);
 }
 
 /*
  * Re-evaluate the cacheability of the mappings of `pg' after the
  * mapping (pm, va) has changed, and record the outcome in the PVF_NC
  * bits of pg->md.pvh_attrs.
  *
  * The page queues lock must be held.
  */
 static __inline void
 pmap_vac_me_harder(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	int nattr;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	nattr = pmap_get_vac_flags(pg);
 	if (nattr < 0) {
 		/* Negative table entry: just clear the no-cache bits. */
 		pg->md.pvh_attrs &= ~PVF_NC;
 		return;
 	}
 
 	/*
 	 * Work is needed only if some mapping has to become non-cacheable
 	 * or the page is currently marked as having one.
 	 */
 	if (nattr != 0 || (pg->md.pvh_attrs & PVF_NC) != 0) {
 		if (pm == pmap_kernel())
 			pmap_vac_me_kpmap(pg, pm, va);
 		else
 			pmap_vac_me_user(pg, pm, va);
 
 		pg->md.pvh_attrs = (pg->md.pvh_attrs & ~PVF_NC) | nattr;
 	}
 }
 
 /*
  * Cacheability fix-up used by pmap_vac_me_harder() when the mapping that
  * changed belongs to `pm', which that caller passes as the kernel pmap.
  *
  * Mappings of `pg' in other pmaps may need their cacheability
  * recalculated as well, so pmap_vac_me_user() is called for each such
  * pmap that cannot be ruled out, and finally for `pm' itself.
  */
 static void
 pmap_vac_me_kpmap(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	u_int u_cacheable, u_entries;
 	struct pv_entry *pv;
 	/* Most recent pmap handed to pmap_vac_me_user(); starts as `pm'. */
 	pmap_t last_pmap = pm;
 
 	/* 
 	 * Pass one, see if there are both kernel and user pmaps for
 	 * this page.  Calculate whether there are user-writable or
 	 * kernel-writable pages.
 	 */
 	u_cacheable = 0;
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 		if (pv->pv_pmap != pm && (pv->pv_flags & PVF_NC) == 0)
 			u_cacheable++;
 	}
 
 	u_entries = pg->md.urw_mappings + pg->md.uro_mappings;
 
 	/* 
 	 * We know we have just been updating a kernel entry, so if
 	 * all user pages are already cacheable, then there is nothing
 	 * further to do.
 	 */
 	if (pg->md.k_mappings == 0 && u_cacheable == u_entries)
 		return;
 
 	if (u_entries) {
 		/* 
 		 * Scan over the list again, for each entry, if it
 		 * might not be set correctly, call pmap_vac_me_user
 		 * to recalculate the settings.
 		 */
 		TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 			/* 
 			 * We know kernel mappings will get set
 			 * correctly in other calls.  We also know
 			 * that if the pmap is the same as last_pmap
 			 * then we've just handled this entry.
 			 */
 			if (pv->pv_pmap == pm || pv->pv_pmap == last_pmap)
 				continue;
 
 			/* 
 			 * If there are kernel entries and this page
 			 * is writable but non-cacheable, then we can
 			 * skip this entry also.  
 			 */
 			if (pg->md.k_mappings &&
 			    (pv->pv_flags & (PVF_NC | PVF_WRITE)) ==
 			    (PVF_NC | PVF_WRITE))
 				continue;
 
 			/* 
 			 * Similarly if there are no kernel-writable 
 			 * entries and the page is already 
 			 * read-only/cacheable.
 			 */
 			if (pg->md.krw_mappings == 0 &&
 			    (pv->pv_flags & (PVF_NC | PVF_WRITE)) == 0)
 				continue;
 
 			/* 
 			 * For some of the remaining cases, we know
 			 * that we must recalculate, but for others we
 			 * can't tell if they are correct or not, so
 			 * we recalculate anyway.
 			 *
 			 * last_pmap is updated as part of the call so
 			 * that further entries of the same pmap are
 			 * skipped by the test at the top of the loop.
 			 */
 			pmap_vac_me_user(pg, (last_pmap = pv->pv_pmap), 0);
 		}
 
 		if (pg->md.k_mappings == 0)
 			return;
 	}
 
 	pmap_vac_me_user(pg, pm, va);
 }
 
 /*
  * Recalculate the cacheability of the mappings of `pg' that belong to
  * `pm' or to the kernel pmap.
  *
  * If there is more than one such mapping and at least one is writable
  * (or `pm' is the kernel pmap and another pmap holds a writable
  * mapping), every one that is still cacheable is made non-cacheable:
  * PVF_NC is set, the cache bits are cleared from its PTE, and the
  * cache/TLB are flushed for it unless it is the (pm, va) mapping
  * itself.
  *
  * Otherwise, if some of those mappings are non-cacheable, caching is
  * turned back on for them; kernel mappings seen on behalf of another
  * pmap are only re-enabled when no other pmap has a writable mapping.
  */
 static void
 pmap_vac_me_user(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	pmap_t kpmap = pmap_kernel();
 	struct pv_entry *pv, *npv;
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	u_int entries = 0;
 	u_int writable = 0;
 	u_int cacheable_entries = 0;
 	u_int kern_cacheable = 0;
 	u_int other_writable = 0;
 
 	/*
 	 * Count mappings and writable mappings in this pmap.
 	 * Include kernel mappings as part of our own.
 	 * Keep a pointer to the first one.
 	 */
 	npv = TAILQ_FIRST(&pg->md.pv_list);
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 		/* Count mappings in the same pmap */
 		if (pm == pv->pv_pmap || kpmap == pv->pv_pmap) {
 			if (entries++ == 0)
 				npv = pv;
 
 			/* Cacheable mappings */
 			if ((pv->pv_flags & PVF_NC) == 0) {
 				cacheable_entries++;
 				if (kpmap == pv->pv_pmap)
 					kern_cacheable++;
 			}
 
 			/* Writable mappings */
 			if (pv->pv_flags & PVF_WRITE)
 				++writable;
 		} else
 		if (pv->pv_flags & PVF_WRITE)
 			other_writable = 1;
 	}
 
 	/*
 	 * Enable or disable caching as necessary.
 	 * Note: the first entry might be part of the kernel pmap,
 	 * so we can't assume this is indicative of the state of the
 	 * other (maybe non-kpmap) entries.
 	 */
 	if ((entries > 1 && writable) ||
 	    (entries > 0 && pm == kpmap && other_writable)) {
 		/* Everything of interest is already non-cacheable. */
 		if (cacheable_entries == 0)
 			return;
 
 		for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) {
 			/* Skip other pmaps and already non-cacheable entries. */
 			if ((pm != pv->pv_pmap && kpmap != pv->pv_pmap) ||
 			    (pv->pv_flags & PVF_NC))
 				continue;
 
 			pv->pv_flags |= PVF_NC;
 
 			l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
 			ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 			pte = *ptep & ~L2_S_CACHE_MASK;
 
 			/*
 			 * Flush cache and TLB for every affected mapping
 			 * except the (pm, va) one passed in by the caller.
 			 */
 			if ((va != pv->pv_va || pm != pv->pv_pmap) &&
 			    l2pte_valid(pte)) {
 				if (PV_BEEN_EXECD(pv->pv_flags)) {
 					pmap_idcache_wbinv_range(pv->pv_pmap,
 					    pv->pv_va, PAGE_SIZE);
 					pmap_tlb_flushID_SE(pv->pv_pmap,
 					    pv->pv_va);
 				} else
 				if (PV_BEEN_REFD(pv->pv_flags)) {
 					pmap_dcache_wb_range(pv->pv_pmap,
 					    pv->pv_va, PAGE_SIZE, TRUE,
 					    (pv->pv_flags & PVF_WRITE) == 0);
 					pmap_tlb_flushD_SE(pv->pv_pmap,
 					    pv->pv_va);
 				}
 			}
 
 			*ptep = pte;
 			PTE_SYNC_CURRENT(pv->pv_pmap, ptep);
 		}
 		cpu_cpwait();
 	} else
 	if (entries > cacheable_entries) {
 		/*
 		 * Turn caching back on for some pages.  If it is a kernel
 		 * page, only do so if there are no other writable pages.
 		 */
 		for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) {
 			if (!(pv->pv_flags & PVF_NC) || (pm != pv->pv_pmap &&
 			    (kpmap != pv->pv_pmap || other_writable)))
 				continue;
 
 			pv->pv_flags &= ~PVF_NC;
 
 			l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
 			ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 			pte = (*ptep & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode;
 
 			/* Only the TLB is flushed on this path. */
 			if (l2pte_valid(pte)) {
 				if (PV_BEEN_EXECD(pv->pv_flags)) {
 					pmap_tlb_flushID_SE(pv->pv_pmap,
 					    pv->pv_va);
 				} else
 				if (PV_BEEN_REFD(pv->pv_flags)) {
 					pmap_tlb_flushD_SE(pv->pv_pmap,
 					    pv->pv_va);
 				}
 			}
 
 			*ptep = pte;
 			PTE_SYNC_CURRENT(pv->pv_pmap, ptep);
 		}
 	}
 }
 
 /*
  * Modify pte bits for all ptes corresponding to the given physical address.
  * We use `maskbits' rather than `clearbits' because we're always passing
  * constants and the latter would require an extra inversion at run-time.
  *
  * `maskbits' is a combination of PVF_* flags to clear.  PVF_WRITE or
  * PVF_MOD makes every mapping read-only; PVF_REF makes every PTE
  * invalid so that the next reference faults.  Returns the number of
  * PTEs that were actually changed.
  *
  * The page queues lock must be held; each mapping's pmap is locked
  * while its PTE is examined and updated.
  */
 static int 
 pmap_clearbit(struct vm_page *pg, u_int maskbits)
 {
 	struct l2_bucket *l2b;
 	struct pv_entry *pv;
 	pt_entry_t *ptep, npte, opte;
 	pmap_t pm;
 	vm_offset_t va;
 	u_int oflags;
 	int count = 0;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/*
 	 * Clear saved attributes (modify, reference)
 	 */
 	pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF));
 
 	if (TAILQ_EMPTY(&pg->md.pv_list)) {
 		return (0);
 	}
 
 	/*
 	 * Loop over all current mappings setting/clearing as appropriate
 	 */
 	TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) {
 		va = pv->pv_va;
 		pm = pv->pv_pmap;
 		/* Remember the flags as they were before being cleared. */
 		oflags = pv->pv_flags;
 		pv->pv_flags &= ~maskbits;
 
 		PMAP_LOCK(pm);
 
 		l2b = pmap_get_l2_bucket(pm, va);
 
 		ptep = &l2b->l2b_kva[l2pte_index(va)];
 		npte = opte = *ptep;
 
 		if (maskbits & (PVF_WRITE|PVF_MOD)) {
 			if ((pv->pv_flags & PVF_NC)) {
 				/* 
 				 * Entry is not cacheable:
 				 *
 				 * Don't turn caching on again if this is a 
 				 * modified emulation. This would be
 				 * inconsistent with the settings created by
 				 * pmap_vac_me_harder(). Otherwise, it's safe
 				 * to re-enable caching.
 				 *
 				 * There's no need to call pmap_vac_me_harder()
 				 * here: all pages are losing their write
 				 * permission.
 				 */
 				if (maskbits & PVF_WRITE) {
 					npte |= pte_l2_s_cache_mode;
 					pv->pv_flags &= ~PVF_NC;
 				}
 			} else
 			if (opte & L2_S_PROT_W) {
 				vm_page_dirty(pg);
 				/* 
 				 * Entry is writable/cacheable: check if pmap
 				 * is current if it is flush it, otherwise it
 				 * won't be in the cache
 				 */
 				if (PV_BEEN_EXECD(oflags))
 					pmap_idcache_wbinv_range(pm, pv->pv_va,
 					    PAGE_SIZE);
 				else
 				if (PV_BEEN_REFD(oflags))
 					pmap_dcache_wb_range(pm, pv->pv_va,
 					    PAGE_SIZE,
 					    (maskbits & PVF_REF) ? TRUE : FALSE,
 					    FALSE);
 			}
 
 			/* make the pte read only */
 			npte &= ~L2_S_PROT_W;
 
 			if (maskbits & PVF_WRITE) {
 				/*
 				 * Keep alias accounting up to date
 				 */
 				if (pv->pv_pmap == pmap_kernel()) {
 					if (oflags & PVF_WRITE) {
 						pg->md.krw_mappings--;
 						pg->md.kro_mappings++;
 					}
 				} else
 				if (oflags & PVF_WRITE) {
 					pg->md.urw_mappings--;
 					pg->md.uro_mappings++;
 				}
 			}
 		}
 
 		if (maskbits & PVF_REF) {
 			if ((pv->pv_flags & PVF_NC) == 0 &&
 			    (maskbits & (PVF_WRITE|PVF_MOD)) == 0) {
 				/*
 				 * Check npte here; we may have already
 				 * done the wbinv above, and the validity
 				 * of the PTE is the same for opte and
 				 * npte.
 				 */
 				if (npte & L2_S_PROT_W) {
 					if (PV_BEEN_EXECD(oflags))
 						pmap_idcache_wbinv_range(pm,
 						    pv->pv_va, PAGE_SIZE);
 					else
 					if (PV_BEEN_REFD(oflags))
 						pmap_dcache_wb_range(pm,
 						    pv->pv_va, PAGE_SIZE,
 						    TRUE, FALSE);
 				} else
 				if ((npte & L2_TYPE_MASK) != L2_TYPE_INV) {
 					/* XXXJRT need idcache_inv_range */
 					if (PV_BEEN_EXECD(oflags))
 						pmap_idcache_wbinv_range(pm,
 						    pv->pv_va, PAGE_SIZE);
 					else
 					if (PV_BEEN_REFD(oflags))
 						pmap_dcache_wb_range(pm,
 						    pv->pv_va, PAGE_SIZE,
 						    TRUE, TRUE);
 				}
 			}
 
 			/*
 			 * Make the PTE invalid so that we will take a
 			 * page fault the next time the mapping is
 			 * referenced.
 			 */
 			npte &= ~L2_TYPE_MASK;
 			npte |= L2_TYPE_INV;
 		}
 
 		/* Only touch the PTE and TLB if something really changed. */
 		if (npte != opte) {
 			count++;
 			*ptep = npte;
 			PTE_SYNC(ptep);
 			/* Flush the TLB entry if a current pmap. */
 			if (PV_BEEN_EXECD(oflags))
 				pmap_tlb_flushID_SE(pm, pv->pv_va);
 			else
 			if (PV_BEEN_REFD(oflags))
 				pmap_tlb_flushD_SE(pm, pv->pv_va);
 		}
 
 		PMAP_UNLOCK(pm);
 
 	}
 
 	/* No mapping of the page is writable any more. */
 	if (maskbits & PVF_WRITE)
 		vm_page_flag_clear(pg, PG_WRITEABLE);
 	return (count);
 }
 
 /*
  * main pv_entry manipulation functions:
  *   pmap_enter_pv: enter a mapping onto a vm_page list
  *   pmap_remove_pv: remove a mapping from a vm_page list
  *
  * NOTE: pmap_enter_pv expects to lock the pvh itself
  *       pmap_remove_pv expects the caller to lock the pvh before calling
  */
 
 /*
  * pmap_enter_pv: enter a mapping onto a vm_page list
  *
  * Fills in `pve' for the mapping (pm, va) with the given PVF_* `flags',
  * links it onto both the page's and the pmap's pv lists, and updates
  * the page's attribute bits and per-class mapping counters.
  *
  * => the page queues lock and the pmap lock must be held (asserted)
  * => the caller supplies the pv_entry in `pve'
  * => caller should adjust ptp's wire_count before calling
  * => caller should not adjust pmap's wire_count; it is bumped here
  *    when PVF_WIRED is set in `flags'
  */
 static void
 pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm,
     vm_offset_t va, u_int flags)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_ASSERT_LOCKED(pm);
 	pve->pv_pmap = pm;
 	pve->pv_va = va;
 	pve->pv_flags = flags;
 
 	TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list);
 	TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist);
 	pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD);
 
 	/*
 	 * Count the mapping in exactly one of the four classes
 	 * (kernel/user x read-write/read-only).  The alias logic in
 	 * pmap_get_vac_flags() depends on kernel mappings not being
 	 * counted as user mappings as well.
 	 */
 	if (pm == pmap_kernel()) {
 		if (flags & PVF_WRITE)
 			pg->md.krw_mappings++;
 		else
 			pg->md.kro_mappings++;
 	} else {
 		if (flags & PVF_WRITE)
 			pg->md.urw_mappings++;
 		else
 			pg->md.uro_mappings++;
 	}
 	pg->md.pv_list_count++;
 	if (pve->pv_flags & PVF_WIRED)
 		++pm->pm_stats.wired_count;
 	vm_page_flag_set(pg, PG_REFERENCED);
 }
 
 /*
  * pmap_find_pv: look up the pv entry for (pm, va) on a page
  *
  * => caller must hold vm_page_queue_mtx (asserted)
  * => returns the matching entry, or NULL when the page has no such mapping
  */
 static PMAP_INLINE struct pv_entry *
 pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	struct pv_entry *cur;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	for (cur = TAILQ_FIRST(&pg->md.pv_list); cur != NULL;
 	    cur = TAILQ_NEXT(cur, pv_list)) {
 		if (cur->pv_pmap == pm && cur->pv_va == va)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * vector_page_setprot:
  *
  *	Manipulate the protection of the vector page.
  */
 void
 vector_page_setprot(int prot)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep;
 
 	l2b = pmap_get_l2_bucket(pmap_kernel(), vector_page);
 
 	ptep = &l2b->l2b_kva[l2pte_index(vector_page)];
 
 	*ptep = (*ptep & ~L1_S_PROT_MASK) | L2_S_PROT(PTE_KERNEL, prot);
 	PTE_SYNC(ptep);
 	cpu_tlb_flushD_SE(vector_page);
 	cpu_cpwait();
 }
 
 /*
  * pmap_nuke_pv: unlink a known pv entry from its page and pmap
  *
  * This is the worker used by pmap_remove_pv() once the entry to remove
  * has been located.
  *
  * => caller should hold proper lock on pmap_main_lock
  * => pmap should be locked (asserted)
  * => caller should hold lock on vm_page [so that attrs can be adjusted];
  *    vm_page_queue_mtx ownership is asserted
  * => caller should adjust ptp's wire_count and free PTP if needed
  * => caller should NOT adjust pmap's wire_count
  * => the pv entry itself is not freed here
  */
 
 static void
 pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve)
 {
 	int writable, unmapped;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_ASSERT_LOCKED(pm);
 
 	writable = (pve->pv_flags & PVF_WRITE) != 0;
 
 	TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list);
 	TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist);
 	if (pve->pv_flags & PVF_WIRED)
 		--pm->pm_stats.wired_count;
 	pg->md.pv_list_count--;
 
 	/* Hand a pending modified bit over to the VM layer. */
 	if (pg->md.pvh_attrs & PVF_MOD)
 		vm_page_dirty(pg);
 
 	/* Drop the mapping from the matching kernel/user counter. */
 	if (pm == pmap_kernel()) {
 		if (writable)
 			pg->md.krw_mappings--;
 		else
 			pg->md.kro_mappings--;
 	} else {
 		if (writable)
 			pg->md.urw_mappings--;
 		else
 			pg->md.uro_mappings--;
 	}
 
 	unmapped = TAILQ_EMPTY(&pg->md.pv_list);
 	if (unmapped ||
 	    (pg->md.krw_mappings == 0 && pg->md.urw_mappings == 0)) {
 		/* No writable mapping is left for this page. */
 		pg->md.pvh_attrs &= ~PVF_MOD;
 		if (unmapped)
 			pg->md.pvh_attrs &= ~PVF_REF;
 		vm_page_flag_clear(pg, PG_WRITEABLE);
 	}
 	if (!unmapped)
 		vm_page_flag_set(pg, PG_REFERENCED);
 	if (writable)
 		pmap_vac_me_harder(pg, pm, 0);
 }
 
 /*
  * pmap_remove_pv: find the pv entry for (pm, va) on a page and unlink it
  *
  * => caller must hold vm_page_queue_mtx (asserted)
  * => returns the entry that was unlinked via pmap_nuke_pv(), or NULL if
  *    the page had no such mapping; the entry is not freed here
  */
 static struct pv_entry *
 pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va)
 {
 	struct pv_entry *cur;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	for (cur = TAILQ_FIRST(&pg->md.pv_list); cur != NULL;
 	    cur = TAILQ_NEXT(cur, pv_list)) {
 		if (cur->pv_pmap != pm || cur->pv_va != va)
 			continue;
 		/* Found it: unlink and stop before touching the list again. */
 		pmap_nuke_pv(pg, pm, cur);
 		break;
 	}
 
 	return (cur);
 }
 /*
  *
  * pmap_modify_pv: change the flags of an existing pv entry
  *
  * => caller should hold lock on vm_page [so that attrs can be adjusted];
  *    vm_page_queue_mtx and the pmap lock are asserted
  * => caller should NOT adjust pmap's wire_count
  * => caller must call pmap_vac_me_harder() if writable status of a page
  *    may have changed.
  * => returns the entry's previous flags, or 0 if (pm, va) has no entry
  *
  * The new flags are (old & ~clr_mask) | set_mask.
  */
 static u_int
 pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va,
     u_int clr_mask, u_int set_mask)
 {
 	struct pv_entry *pv;
 	u_int nflags, oflags, changed;
 
 	PMAP_ASSERT_LOCKED(pm);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	pv = pmap_find_pv(pg, pm, va);
 	if (pv == NULL)
 		return (0);
 
 	/*
 	 * There is at least one VA mapping this page.
 	 */
 
 	if (clr_mask & (PVF_REF | PVF_MOD))
 		pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD);
 
 	oflags = pv->pv_flags;
 	nflags = (oflags & ~clr_mask) | set_mask;
 	pv->pv_flags = nflags;
 	changed = nflags ^ oflags;
 
 	/* Keep the pmap's wired count in step with the wired flag. */
 	if (changed & PVF_WIRED) {
 		if (nflags & PVF_WIRED)
 			++pm->pm_stats.wired_count;
 		else
 			--pm->pm_stats.wired_count;
 	}
 
 	if ((changed & PVF_WRITE) == 0)
 		return (oflags);
 
 	/* Writability flipped: move the mapping between ro/rw counters. */
 	if (pm == pmap_kernel()) {
 		if (nflags & PVF_WRITE) {
 			pg->md.krw_mappings++;
 			pg->md.kro_mappings--;
 		} else {
 			pg->md.kro_mappings++;
 			pg->md.krw_mappings--;
 		}
 	} else {
 		if (nflags & PVF_WRITE) {
 			pg->md.urw_mappings++;
 			pg->md.uro_mappings--;
 		} else {
 			pg->md.uro_mappings++;
 			pg->md.urw_mappings--;
 		}
 	}
 	if (pg->md.krw_mappings == 0 && pg->md.urw_mappings == 0) {
 		pg->md.pvh_attrs &= ~PVF_MOD;
 		vm_page_flag_clear(pg, PG_WRITEABLE);
 	}
 	pmap_vac_me_harder(pg, pm, 0);
 
 	return (oflags);
 }
 
 /*
  * pmap_debug: set the debug level of the pmap code and report the new
  * value.  Only built when PMAP_DEBUG is defined.
  */
 #ifdef PMAP_DEBUG
 void
 pmap_debug(int level)
 {
 
 	pmap_debug_level = level;
 	dprintf("pmap_debug: level=%d\n", pmap_debug_level);
 }
 #endif  /* PMAP_DEBUG */
 
 /*
  * pmap_pinit0: initialise 'pmap' as a copy of the kernel pmap with its
  * own lock.
  */
 void
 pmap_pinit0(struct pmap *pmap)
 {
 
 	PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap));
 
 	dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n",
 		(u_int32_t) pmap, (u_int32_t) pmap->pm_pdir);
 
 	/* Clone the kernel pmap byte for byte ... */
 	bcopy(kernel_pmap, pmap, sizeof(struct pmap));
 
 	/* ... then wipe the copied lock storage and set the lock up anew. */
 	bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx));
 	PMAP_LOCK_INIT(pmap);
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: an empty pv list
  *	and a pv count of zero.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pv_list_count = 0;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *      Initialize the pmap module.
  *      Called by vm_init, to initialize any structures that the pmap
  *      system needs to map virtual memory.
  *
  *      Creates the UMA zones for pv entries, L2 page tables and L2
  *      descriptor tables, and sizes the pv entry pool from the
  *      "vm.pmap.shpgperproc" tunable (default PMAP_SHPGPERPROC).
  */
 void
 pmap_init(void)
 {
 	int shpgperproc = PMAP_SHPGPERPROC;
 
 	/*
 	 * The original message carried a "%08x" conversion without a
 	 * matching argument, which is undefined behaviour once PDEBUG is
 	 * enabled; no start address is available here to print.
 	 */
 	PDEBUG(1, printf("pmap_init: start\n"));
 
 	/*
 	 * init the pv free list
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, 
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	
 	/* Cap on pv entries, and the 90% mark of that cap. */
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	/*
 	 * NOTE(review): both zones below are created with the same name
 	 * "L2 Table"; kept as-is because the name is externally visible.
 	 */
 	l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 
 	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
 
 	/* Report completion only once everything above has been set up. */
 	PDEBUG(1, printf("pmap_init: done!\n"));
 }
 
 /*
  * pmap_fault_fixup: try to resolve a fault on 'va' in 'pm' purely by
  * repairing page-table state.
  *
  * Handles modified-bit emulation (write fault on a PTE without
  * L2_S_PROT_W whose pv entry has PVF_WRITE), referenced-bit emulation
  * (PTE of type L2_TYPE_INV) and a stale L1 descriptor.
  *
  *   pm     pmap in which the fault occurred
  *   va     faulting virtual address
  *   ftype  fault access type; only VM_PROT_WRITE is tested
  *   user   non-zero to require L2_S_PROT_U in the PTE
  *
  * Returns 0 when one of the early checks bails out to 'out', and 1 for
  * every fault that gets past them.
  *
  * The page queues lock and the pmap lock are taken on entry and dropped
  * at 'out'.
  */
 int
 pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	pd_entry_t *pl1pd, l1pd;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	u_int l1idx;
 	int rv = 0;
 
 	l1idx = L1_IDX(va);
 	vm_page_lock_queues();
 	PMAP_LOCK(pm);
 
 	/*
 	 * If there is no l2_dtable for this address, then the process
 	 * has no business accessing it.
 	 *
 	 * Note: This will catch userland processes trying to access
 	 * kernel addresses.
 	 */
 	l2 = pm->pm_l2[L2_IDX(l1idx)];
 	if (l2 == NULL)
 		goto out;
 
 	/*
 	 * Likewise if there is no L2 descriptor table
 	 */
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 	if (l2b->l2b_kva == NULL)
 		goto out;
 
 	/*
 	 * Check the PTE itself.
 	 */
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 	if (pte == 0)
 		goto out;
 
 	/*
 	 * Catch a userland access to the vector page mapped at 0x0
 	 */
 	if (user && (pte & L2_S_PROT_U) == 0)
 		goto out;
 	/* Faults on the vector page itself are never fixed up here. */
 	if (va == vector_page)
 		goto out;
 
 	pa = l2pte_pa(pte);
 
 	if ((ftype & VM_PROT_WRITE) && (pte & L2_S_PROT_W) == 0) {
 		/*
 		 * This looks like a good candidate for "page modified"
 		 * emulation...
 		 */
 		struct pv_entry *pv;
 		struct vm_page *pg;
 
 		/* Extract the physical address of the page */
 		if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) {
 			goto out;
 		}
 		/* Get the current flags for this page. */
 
 		pv = pmap_find_pv(pg, pm, va);
 		if (pv == NULL) {
 			goto out;
 		}
 
 		/*
 		 * Do the flags say this page is writable? If not then it
 		 * is a genuine write fault. If yes then the write fault is
 		 * our fault as we did not reflect the write access in the
 		 * PTE. Now we know a write has occurred we can correct this
 		 * and also set the modified bit
 		 */
 		if ((pv->pv_flags & PVF_WRITE) == 0) {
 			goto out;
 		}
 
 		/* Record referenced+modified on both the page and the entry. */
 		pg->md.pvh_attrs |= PVF_REF | PVF_MOD;
 		vm_page_dirty(pg);
 		pv->pv_flags |= PVF_REF | PVF_MOD;
 
 		/* 
 		 * Re-enable write permissions for the page.  No need to call
 		 * pmap_vac_me_harder(), since this is just a
 		 * modified-emulation fault, and the PVF_WRITE bit isn't
 		 * changing. We've already set the cacheable bits based on
 		 * the assumption that we can write to this page.
 		 */
 		*ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO | L2_S_PROT_W;
 		PTE_SYNC(ptep);
 		rv = 1;
 	} else
 	if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) {
 		/*
 		 * This looks like a good candidate for "page referenced"
 		 * emulation.
 		 */
 		struct pv_entry *pv;
 		struct vm_page *pg;
 
 		/* Extract the physical address of the page */
 		if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL)
 			goto out;
 		/* Get the current flags for this page. */
 
 		pv = pmap_find_pv(pg, pm, va);
 		if (pv == NULL)
 			goto out;
 
 		pg->md.pvh_attrs |= PVF_REF;
 		pv->pv_flags |= PVF_REF;
 
 
 		/* Turn the invalid PTE back into a small-page mapping. */
 		*ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO;
 		PTE_SYNC(ptep);
 		rv = 1;
 	}
 
 	/*
 	 * We know there is a valid mapping here, so simply
 	 * fix up the L1 if necessary.
 	 */
 	pl1pd = &pm->pm_l1->l1_kva[l1idx];
 	l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO;
 	if (*pl1pd != l1pd) {
 		*pl1pd = l1pd;
 		PTE_SYNC(pl1pd);
 		rv = 1;
 	}
 
 #ifdef CPU_SA110
 	/*
 	 * There are bugs in the rev K SA110.  This is a check for one
 	 * of them.
 	 */
 	if (rv == 0 && curcpu()->ci_arm_cputype == CPU_ID_SA110 &&
 	    curcpu()->ci_arm_cpurev < 3) {
 		/* Always current pmap */
 		if (l2pte_valid(pte)) {
 			extern int kernel_debug;
 			if (kernel_debug & 1) {
 				struct proc *p = curlwp->l_proc;
 				printf("prefetch_abort: page is already "
 				    "mapped - pte=%p *pte=%08x\n", ptep, pte);
 				printf("prefetch_abort: pc=%08lx proc=%p "
 				    "process=%s\n", va, p, p->p_comm);
 				printf("prefetch_abort: far=%08x fs=%x\n",
 				    cpu_faultaddress(), cpu_faultstatus());
 			}
 #ifdef DDB
 			if (kernel_debug & 2)
 				Debugger();
 #endif
 			rv = 1;
 		}
 	}
 #endif /* CPU_SA110 */
 
 #ifdef DEBUG
 	/*
 	 * If 'rv == 0' at this point, it generally indicates that there is a
 	 * stale TLB entry for the faulting address. This happens when two or
 	 * more processes are sharing an L1. Since we don't flush the TLB on
 	 * a context switch between such processes, we can take domain faults
 	 * for mappings which exist at the same VA in both processes. EVEN IF
 	 * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for
 	 * example.
 	 *
 	 * This is extremely likely to happen if pmap_enter() updated the L1
 	 * entry for a recently entered mapping. In this case, the TLB is
 	 * flushed for the new mapping, but there may still be TLB entries for
 	 * other mappings belonging to other processes in the 1MB range
 	 * covered by the L1 entry.
 	 *
 	 * Since 'rv == 0', we know that the L1 already contains the correct
 	 * value, so the fault must be due to a stale TLB entry.
 	 *
 	 * Since we always need to flush the TLB anyway in the case where we
 	 * fixed up the L1, or frobbed the L2 PTE, we effectively deal with
 	 * stale TLB entries dynamically.
 	 *
 	 * However, the above condition can ONLY happen if the current L1 is
 	 * being shared. If it happens when the L1 is unshared, it indicates
 	 * that other parts of the pmap are not doing their job WRT managing
 	 * the TLB.
 	 */
 	if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) {
 		extern int last_fault_code;
 		printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n",
 		    pm, va, ftype);
 		printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n",
 		    l2, l2b, ptep, pl1pd);
 		printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n",
 		    pte, l1pd, last_fault_code);
 #ifdef DDB
 		Debugger();
 #endif
 	}
 #endif
 
 	/* Drop any TLB entry for the faulting address. */
 	cpu_tlb_flushID_SE(va);
 	cpu_cpwait();
 
 	/*
 	 * Every fault that reaches this point is reported as handled,
 	 * whether or not anything was changed above.
 	 */
 	rv = 1;
 
 out:
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pm);
 	return (rv);
 }
 
 void
 pmap_postinit(void)
 {
 	struct l2_bucket *l2b;
 	struct l1_ttable *l1;
 	pd_entry_t *pl1pt;
 	pt_entry_t *ptep, pte;
 	vm_offset_t va, eva;
 	u_int loop, needed;
 	
 	needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0);
 	needed -= 1;
 	l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK);
 
 	for (loop = 0; loop < needed; loop++, l1++) {
 		/* Allocate a L1 page table */
 		va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0,
 		    0xffffffff, L1_TABLE_SIZE, 0);
 
 		if (va == 0)
 			panic("Cannot allocate L1 KVM");
 
 		eva = va + L1_TABLE_SIZE;
 		pl1pt = (pd_entry_t *)va;
 		
 		while (va < eva) {
 				l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 				ptep = &l2b->l2b_kva[l2pte_index(va)];
 				pte = *ptep;
 				pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt;
 				*ptep = pte;
 				PTE_SYNC(ptep);
 				cpu_tlb_flushD_SE(va);
 				
 				va += PAGE_SIZE;
 		}
 		pmap_init_l1(l1, pl1pt);
 	}
 
 
 #ifdef DEBUG
 	printf("pmap_postinit: Allocated %d static L1 descriptor tables\n",
 	    needed);
 #endif
 }
 
 /*
  * Cache the pmap's translation table base, domain access value and
  * vector-page L1 fixup data in the PCB, where cpu_switch() et al. can
  * get at them quickly.
  */
 void
 pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb)
 {
 	struct l2_bucket *vec_l2b;
 
 	pcb->pcb_pagedir = pm->pm_l1->l1_physaddr;
 	pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) |
 	    (DOMAIN_CLIENT << (pm->pm_domain * 2));
 
 	/* Vector page at or above KERNBASE: no per-pmap L1 fixup is set up. */
 	if (vector_page >= KERNBASE) {
 		pcb->pcb_pl1vec = NULL;
 		return;
 	}
 
 	pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)];
 	vec_l2b = pmap_get_l2_bucket(pm, vector_page);
 	pcb->pcb_l1vec = vec_l2b->l2b_phys | L1_C_PROTO |
 	    L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL);
 }
 
 /*
  * pmap_activate: make the pmap of td's process the one described by
  * td's PCB and, when td is the running thread, switch the MMU to it.
  *
  * Everything runs inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 	struct pcb *pcb;
 	pmap_t pm;
 	u_int hw_dacr, hw_ttb;
 
 	pm = vmspace_pmap(td->td_proc->p_vmspace);
 	pcb = td->td_pcb;
 
 	critical_enter();
 	pmap_set_pcb_pagedir(pm, pcb);
 
 	/* Only the running thread needs the hardware state changed. */
 	if (td != curthread)
 		goto done;
 
 	/* Read the live translation table base and domain access values. */
 	__asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(hw_ttb));
 	__asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(hw_dacr));
 
 	hw_ttb &= ~(L1_TABLE_SIZE - 1);
 
 	/*
 	 * No need to switch address spaces if the hardware already
 	 * matches what the PCB asks for.
 	 */
 	if (hw_ttb == (u_int)pcb->pcb_pagedir && hw_dacr == pcb->pcb_dacr)
 		goto done;
 
 	/*
 	 * We MUST, I repeat, MUST fix up the L1 entry corresponding
 	 * to 'vector_page' in the incoming L1 table before switching
 	 * to it otherwise subsequent interrupts/exceptions (including
 	 * domain faults!) will jump into hyperspace.
 	 *
 	 * Don't need to PTE_SYNC() at this point since cpu_setttb() is
 	 * about to flush both the cache and the TLB.
 	 */
 	if (pcb->pcb_pl1vec)
 		*pcb->pcb_pl1vec = pcb->pcb_l1vec;
 
 	cpu_domains(pcb->pcb_dacr);
 	cpu_setttb(pcb->pcb_pagedir);
 
 done:
 	critical_exit();
 }
 
 /*
  * pmap_set_pt_cache_mode: make sure the mapping of 'va' in the L1 table
  * 'kl1' uses the page-table cache mode.
  *
  * Works on both section mappings (the L1 descriptor is fixed) and
  * L2-mapped pages (the L2 PTE found via kernel_pt_lookup() is fixed).
  *
  * Returns 1 if a descriptor had to be corrected, 0 if it was already
  * right.  Panics when the L2 table covering 'va' cannot be found.
  */
 static int
 pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va)
 {
 	pd_entry_t *pdep, pde;
 	pt_entry_t *ptep, pte;
 	vm_paddr_t pa;
 	int rv = 0;
 
 	/*
 	 * Make sure the descriptor itself has the correct cache mode
 	 */
 	pdep = &kl1[L1_IDX(va)];
 	pde = *pdep;
 
 	if (l1pte_section_p(pde)) {
 		if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) {
 			*pdep = (pde & ~L1_S_CACHE_MASK) |
 			    pte_l1_s_cache_mode_pt;
 			PTE_SYNC(pdep);
 			cpu_dcache_wbinv_range((vm_offset_t)pdep,
 			    sizeof(*pdep));
 			rv = 1;
 		}
 	} else {
 		pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
 		ptep = (pt_entry_t *)kernel_pt_lookup(pa);
 		/*
 		 * Report the address that has no L2; ptep is NULL here, so
 		 * printing it would carry no information.
 		 */
 		if (ptep == NULL)
 			panic("pmap_bootstrap: No L2 for L2 @ va %p\n",
 			    (void *)va);
 
 		ptep = &ptep[l2pte_index(va)];
 		pte = *ptep;
 		if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) {
 			*ptep = (pte & ~L2_S_CACHE_MASK) |
 			    pte_l2_s_cache_mode_pt;
 			PTE_SYNC(ptep);
 			cpu_dcache_wbinv_range((vm_offset_t)ptep,
 			    sizeof(*ptep));
 			rv = 1;
 		}
 	}
 
 	return (rv);
 }
 
 /*
  * pmap_alloc_specials: carve 'pages' pages of KVA off the front of
  * '*availp'.
  *
  * The start of the reserved range is stored in '*vap' and '*availp' is
  * advanced past it.  When 'ptep' is non-NULL it receives a pointer to
  * the kernel PTE of the first reserved page; a missing L2 bucket is
  * fatal in that case.
  */
 static void
 pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap, 
     pt_entry_t **ptep)
 {
 	struct l2_bucket *bucket;
 	vm_offset_t base;
 
 	base = *availp;
 
 	if (ptep != NULL) {
 		bucket = pmap_get_l2_bucket(pmap_kernel(), base);
 		if (bucket == NULL)
 			panic("pmap_alloc_specials: no l2b for 0x%x", base);
 
 		*ptep = &bucket->l2b_kva[l2pte_index(base)];
 	}
 
 	*vap = base;
 	*availp = base + (PAGE_SIZE * pages);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the arm this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  */
 #define PMAP_STATIC_L2_SIZE 16
 #ifdef ARM_USE_SMALL_ALLOC
 extern struct mtx smallalloc_mtx;
 #endif
 
 /*
  * pmap_bootstrap: build the kernel pmap's metadata from the page tables
  * that are already live, then reserve the special KVA ranges.
  *
  *   firstaddr  becomes the initial virtual_avail
  *   lastaddr   becomes virtual_end and arm_nocache_startaddr
  *   l1pt       describes the kernel L1 table: pv_va is used as its KVA,
  *              pv_pa is recorded in kernel_l1pa
  */
 void
 pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt)
 {
 	static struct l1_ttable static_l1;
 	static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE];
 	struct l1_ttable *l1 = &static_l1;
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	pd_entry_t pde;
 	pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va;
 	pt_entry_t *ptep;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_size_t size;
 	int l1idx, l2idx, l2next = 0;
 
 	PDEBUG(1, printf("firstaddr = %08x, loadaddr = %08x\n",
 	    firstaddr, loadaddr));
 	
 	virtual_avail = firstaddr;
 	kernel_pmap = &kernel_pmap_store;
 	kernel_pmap->pm_l1 = l1;
 	kernel_l1pa = l1pt->pv_pa;
 	
 	/*
 	 * Scan the L1 translation table created by initarm() and create
 	 * the required metadata for all valid mappings found in it.
 	 */
 	for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) {
 		pde = kernel_l1pt[l1idx];
 
 		/*
 		 * We're only interested in Coarse mappings.
 		 * pmap_extract() can deal with section mappings without
 		 * recourse to checking L2 metadata.
 		 */
 		if ((pde & L1_TYPE_MASK) != L1_TYPE_C)
 			continue;
 
 		/*
 		 * Lookup the KVA of this L2 descriptor table
 		 */
 		pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK);
 		ptep = (pt_entry_t *)kernel_pt_lookup(pa);
 		
 		if (ptep == NULL) {
 			panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx",
 			    (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa);
 		}
 
 		/*
 		 * Fetch the associated L2 metadata structure.
 		 * Allocate a new one if necessary.
 		 */
 		if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) {
 			/* Only PMAP_STATIC_L2_SIZE static tables exist. */
 			if (l2next == PMAP_STATIC_L2_SIZE)
 				panic("pmap_bootstrap: out of static L2s");
 			kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 = 
 			    &static_l2[l2next++];
 		}
 
 		/*
 		 * One more L1 slot tracked...
 		 */
 		l2->l2_occupancy++;
 
 		/*
 		 * Fill in the details of the L2 descriptor in the
 		 * appropriate bucket.
 		 */
 		l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 		l2b->l2b_kva = ptep;
 		l2b->l2b_phys = pa;
 		l2b->l2b_l1idx = l1idx;
 
 		/*
 		 * Establish an initial occupancy count for this descriptor
 		 */
 		for (l2idx = 0;
 		    l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t));
 		    l2idx++) {
 			if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) {
 				l2b->l2b_occupancy++;
 			}
 		}
 
 		/*
 		 * Make sure the descriptor itself has the correct cache mode.
 		 * If not, fix it, but whine about the problem. Port-meisters
 		 * should consider this a clue to fix up their initarm()
 		 * function. :)
 		 */
 		if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) {
 			printf("pmap_bootstrap: WARNING! wrong cache mode for "
 			    "L2 pte @ %p\n", ptep);
 		}
 	}
 
 	
 	/*
 	 * Ensure the primary (kernel) L1 has the correct cache mode for
 	 * a page table. Bitch if it is not correctly set.
 	 */
 	for (va = (vm_offset_t)kernel_l1pt;
 	    va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) {
 		if (pmap_set_pt_cache_mode(kernel_l1pt, va))
 			printf("pmap_bootstrap: WARNING! wrong cache mode for "
 			    "primary L1 @ 0x%x\n", va);
 	}
 
 	/* Push out any descriptor changes made above. */
 	cpu_dcache_wbinv_all();
 	cpu_tlb_flushID();
 	cpu_cpwait();
 
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_active = -1;
 	kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL;
 	TAILQ_INIT(&kernel_pmap->pm_pvlist);
 	
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 */
 	/* NOTE(review): SYSMAP is defined here but not used in this function. */
 #define SYSMAP(c, p, v, n)						\
     v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
     
 	/* One page each for csrcp and cdstp, with their PTE pointers. */
 	pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte);
 	pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte);
 	pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte);
 	pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte);
 	/* Number of L1_S_SIZE slots between pmap_curmaxkvaddr and lastaddr. */
 	size = ((lastaddr - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE;
 	/* KVA for one L2 table of L2_TABLE_SIZE_REAL bytes per slot. */
 	pmap_alloc_specials(&virtual_avail,
 	    round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE,
 	    &pmap_kernel_l2ptp_kva, NULL);
 	
 	/* KVA for one l2_dtable per L2_BUCKET_SIZE slots, rounded up. */
 	size = (size + (L2_BUCKET_SIZE - 1)) / L2_BUCKET_SIZE;
 	pmap_alloc_specials(&virtual_avail,
 	    round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE,
 	    &pmap_kernel_l2dtable_kva, NULL);
 
 	pmap_alloc_specials(&virtual_avail,
 	    1, (vm_offset_t*)&_tmppt, NULL);
 	SLIST_INIT(&l1_list);
 	TAILQ_INIT(&l1_lru_list);
 	mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF);
 	pmap_init_l1(l1, kernel_l1pt);
 	cpu_dcache_wbinv_all();
 
 	/* Publish the resulting kernel VA layout. */
 	virtual_avail = round_page(virtual_avail);
 	virtual_end = lastaddr;
 	kernel_vm_end = pmap_curmaxkvaddr;
 	arm_nocache_startaddr = lastaddr;
 	mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF);
 
 #ifdef ARM_USE_SMALL_ALLOC
 	mtx_init(&smallalloc_mtx, "Small alloc page list", NULL, MTX_DEF);
 	arm_init_smallalloc();
 #endif
 	pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Give back the resources held by a physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	struct pcb *pcb0, *mypcb;
 	
 	pmap_idcache_wbinv_all(pmap);
 	pmap_tlb_flushID(pmap);
 	cpu_cpwait();
 
 	if (vector_page < KERNBASE) {
 		mypcb = PCPU_GET(curpcb);
 		pcb0 = thread0.td_pcb;
 
 		if (pmap_is_current(pmap)) {
 			/*
 			 * Frob the L1 entry corresponding to the vector
 			 * page so that it contains the kernel pmap's domain
 			 * number. This will ensure pmap_remove() does not
 			 * pull the current vector page out from under us.
 			 */
 			critical_enter();
 			*pcb0->pcb_pl1vec = pcb0->pcb_l1vec;
 			cpu_domains(pcb0->pcb_dacr);
 			cpu_setttb(pcb0->pcb_pagedir);
 			critical_exit();
 		}
 		pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE);
 
 		/*
 		 * Make sure cpu_switch(), et al, DTRT. This is safe to do
 		 * since this process has no remaining mappings of its own.
 		 * The current PCB takes over thread0's paging state.
 		 */
 		mypcb->pcb_pl1vec = pcb0->pcb_pl1vec;
 		mypcb->pcb_l1vec = pcb0->pcb_l1vec;
 		mypcb->pcb_dacr = pcb0->pcb_dacr;
 		mypcb->pcb_pagedir = pcb0->pcb_pagedir;
 	}
 
 	pmap_free_l1(pmap);
 	PMAP_LOCK_DESTROY(pmap);
 	
 	dprintf("pmap_release()\n");
 }
 
 
 
 /*
  * Helper for pmap_grow_l2_bucket(): back the kernel address 'va' with a
  * freshly allocated wired page.
  *
  * The page is entered kernel read/write with the given cache mode.  Its
  * physical address is stored through 'pap' when that is non-NULL.
  * Returns 0 on success and 1 when no page could be allocated.
  */
 static __inline int
 pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap)
 {
 	struct l2_bucket *bucket;
 	struct vm_page *m;
 	pt_entry_t *pte;
 	vm_paddr_t phys;
 	
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 	if (m == NULL)
 		return (1);
 
 	phys = VM_PAGE_TO_PHYS(m);
 	if (pap != NULL)
 		*pap = phys;
 
 	bucket = pmap_get_l2_bucket(pmap_kernel(), va);
 	pte = &bucket->l2b_kva[l2pte_index(va)];
 
 	*pte = L2_S_PROTO | phys | cache_mode |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE);
 	PTE_SYNC(pte);
 
 	return (0);
 }
 
 /*
  * This is the same as pmap_alloc_l2_bucket(), except that it is only
  * used by pmap_growkernel().
  *
  * Makes sure an l2_dtable and an L2 page table exist for 'va' in 'pm',
  * taking space from pmap_kernel_l2dtable_kva and pmap_kernel_l2ptp_kva
  * and backing it with pages via pmap_grow_map() as needed.  The
  * resulting L1 entry is then written into every L1 on l1_list.
  *
  * Returns the bucket for 'va', or NULL if a backing page could not be
  * allocated.
  */
 static __inline struct l2_bucket *
 pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *l2;
 	struct l2_bucket *l2b;
 	struct l1_ttable *l1;
 	pd_entry_t *pl1pd;
 	u_short l1idx;
 	vm_offset_t nva;
 
 	l1idx = L1_IDX(va);
 
 	if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) {
 		/*
 		 * No mapping at this address, as there is
 		 * no entry in the L1 table.
 		 * Need to allocate a new l2_dtable.
 		 */
 		nva = pmap_kernel_l2dtable_kva;
 		if ((nva & PAGE_MASK) == 0) {
 			/*
 			 * Need to allocate a backing page
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
 				return (NULL);
 		}
 
 		l2 = (struct l2_dtable *)nva;
 		nva += sizeof(struct l2_dtable);
 
 		/* A smaller page offset after the bump means we wrapped. */
 		if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva & 
 		    PAGE_MASK)) {
 			/*
 			 * The new l2_dtable straddles a page boundary.
 			 * Map in another page to cover it.
 			 */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL))
 				return (NULL);
 		}
 
 		pmap_kernel_l2dtable_kva = nva;
 
 		/*
 		 * Link it into the parent pmap
 		 */
 		pm->pm_l2[L2_IDX(l1idx)] = l2;
 		memset(l2, 0, sizeof(*l2));
 	}
 
 	l2b = &l2->l2_bucket[L2_BUCKET(l1idx)];
 
 	/*
 	 * Fetch pointer to the L2 page table associated with the address.
 	 */
 	if (l2b->l2b_kva == NULL) {
 		pt_entry_t *ptep;
 
 		/*
 		 * No L2 page table has been allocated. Chances are, this
 		 * is because we just allocated the l2_dtable, above.
 		 */
 		nva = pmap_kernel_l2ptp_kva;
 		ptep = (pt_entry_t *)nva;
 		if ((nva & PAGE_MASK) == 0) {
 			/*
 			 * Need to allocate a backing page
 			 */
 			/* pmap_kernel_l2ptp_phys receives the new page's PA. */
 			if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt,
 			    &pmap_kernel_l2ptp_phys))
 				return (NULL);
 			PTE_SYNC_RANGE(ptep, PAGE_SIZE / sizeof(pt_entry_t));
 		}
 		memset(ptep, 0, L2_TABLE_SIZE_REAL);
 		l2->l2_occupancy++;
 		l2b->l2b_kva = ptep;
 		l2b->l2b_l1idx = l1idx;
 		l2b->l2b_phys = pmap_kernel_l2ptp_phys;
 
 		/* Advance both cursors to the next free L2 table slot. */
 		pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL;
 		pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL;
 	}
 
 	/* Distribute new L1 entry to all other L1s */
 	SLIST_FOREACH(l1, &l1_list, l1_link) {
 			pl1pd = &l1->l1_kva[L1_IDX(va)];
 			*pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) |
 			    L1_C_PROTO;
 			PTE_SYNC(pl1pd);
 	}
 
 	return (l2b);
 }
 
 
 /*
  * Extend the kernel page tables, if needed, so that they cover
  * everything below 'addr'.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	pmap_t kpm = pmap_kernel();
 
 	/* Nothing to do if the tables already reach far enough. */
 	if (pmap_curmaxkvaddr >= addr)
 		return;
 
 	/* Add kernel PTPs, one 1MB L1 slot at a time. */
 	while (pmap_curmaxkvaddr < addr) {
 		pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr);
 		pmap_curmaxkvaddr += L1_S_SIZE;
 	}
 
 	/*
 	 * flush out the cache, expensive but growkernel will happen so
 	 * rarely
 	 */
 	cpu_dcache_wbinv_all();
 	cpu_tlb_flushD();
 	cpu_cpwait();
 	kernel_vm_end = pmap_curmaxkvaddr;
 
 }
 
 
 /*
  * Tear down every non-wired mapping of the given address space.
  * This speeds up process exit: walking the pmap's own pv list is much
  * faster than pmap_remove when an entire address space is being run
  * down.  Wired mappings are left in place.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	struct pv_entry *cur, *next;
 	struct l2_bucket *bucket = NULL;
 	pt_entry_t *ptep;
 	vm_page_t pg;
 	
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	for (cur = TAILQ_FIRST(&pmap->pm_pvlist); cur != NULL; cur = next) {
 		/* Grab the successor first: 'cur' may be unlinked below. */
 		next = TAILQ_NEXT(cur, pv_plist);
 
 		/* The page is wired, cannot remove it now. */
 		if (cur->pv_flags & PVF_WIRED)
 			continue;
 
 		pmap->pm_stats.resident_count--;
 		bucket = pmap_get_l2_bucket(pmap, cur->pv_va);
 		KASSERT(bucket != NULL, ("No L2 bucket in pmap_remove_pages"));
 		ptep = &bucket->l2b_kva[l2pte_index(cur->pv_va)];
 		pg = PHYS_TO_VM_PAGE(*ptep & L2_ADDR_MASK);
 #ifdef ARM_USE_SMALL_ALLOC
 		KASSERT((vm_offset_t)pg >= alloc_firstaddr, ("Trying to access non-existent page va %x pte %x", cur->pv_va, *ptep));
 #else
 		KASSERT((vm_offset_t)pg >= KERNBASE, ("Trying to access non-existent page va %x pte %x", cur->pv_va, *ptep));
 #endif
 		/* Zap the PTE, then drop the pv entry and its bucket ref. */
 		*ptep = 0;
 		PTE_SYNC(ptep);
 		pmap_nuke_pv(pg, pmap, cur);
 		if (TAILQ_EMPTY(&pg->md.pv_list))
 			vm_page_flag_clear(pg, PG_WRITEABLE);
 		pmap_free_pv_entry(cur);
 		pmap_free_l2_bucket(pmap, bucket, 1);
 	}
 	vm_page_unlock_queues();
 	cpu_idcache_wbinv_all();
 	cpu_tlb_flushID();
 	cpu_cpwait();
 	PMAP_UNLOCK(pmap);
 }
 
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 #ifdef ARM_HAVE_SUPERSECTIONS
 /*
  * Map a super section into the KVA.
  *
  * 'va' and 'pa' must both be super-section aligned.  The same
  * descriptor is written into every L1 slot the super section covers,
  * in every L1 table on l1_list.  SECTION_CACHE in 'flags' selects the
  * normal cache mode, otherwise SECTION_PT selects the page-table mode.
  */
 
 void
 pmap_kenter_supersection(vm_offset_t va, uint64_t pa, int flags)
 {
 	struct l1_ttable *tbl;
 	vm_offset_t first, last, cur;
 	pd_entry_t desc;
 
 	desc = L1_S_PROTO | L1_S_SUPERSEC | (pa & L1_SUP_FRAME) |
 	    (((pa >> 32) & 0xf) << 20) | L1_S_PROT(PTE_KERNEL,
 	    VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL);
 
 	KASSERT(((va | pa) & L1_SUP_OFFSET) == 0,
 	    ("Not a valid super section mapping"));
 
 	if (flags & SECTION_CACHE)
 		desc |= pte_l1_s_cache_mode;
 	else if (flags & SECTION_PT)
 		desc |= pte_l1_s_cache_mode_pt;
 
 	first = va & L1_SUP_FRAME;
 	last = va + L1_SUP_SIZE;
 	SLIST_FOREACH(tbl, &l1_list, l1_link) {
 		for (cur = first; cur < last; cur += L1_S_SIZE) {
 			tbl->l1_kva[L1_IDX(cur)] = desc;
 			PTE_SYNC(&tbl->l1_kva[L1_IDX(cur)]);
 		}
 	}
 }
 #endif
 
 /*
  * Map a section into the KVA.
  *
  * 'va' and 'pa' must both be section aligned.  The descriptor is
  * written into every L1 table on l1_list.  SECTION_CACHE in 'flags'
  * selects the normal cache mode, otherwise SECTION_PT selects the
  * page-table mode.
  */
 
 void
 pmap_kenter_section(vm_offset_t va, vm_offset_t pa, int flags)
 {
 	struct l1_ttable *tbl;
 	pd_entry_t desc;
 
 	desc = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL,
 	    VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL);
 
 	KASSERT(((va | pa) & L1_S_OFFSET) == 0,
 	    ("Not a valid section mapping"));
 
 	if (flags & SECTION_CACHE)
 		desc |= pte_l1_s_cache_mode;
 	else if (flags & SECTION_PT)
 		desc |= pte_l1_s_cache_mode_pt;
 
 	SLIST_FOREACH(tbl, &l1_list, l1_link) {
 		tbl->l1_kva[L1_IDX(va)] = desc;
 		PTE_SYNC(&tbl->l1_kva[L1_IDX(va)]);
 	}
 }
 
 /*
  * Enter a kernel read/write mapping of 'pa' at 'va' in the kernel pmap.
  * Note that in order for the mapping to take effect you should do an
  * invltlb after doing the pmap_kenter...
  *
  * KENTER_CACHE in 'flags' adds the normal cache mode and KENTER_USER
  * adds user access.  The L2 bucket is grown on demand.
  */
 static PMAP_INLINE void
 pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags)
 {
 	struct l2_bucket *bucket;
 	pt_entry_t *ptep;
 	pt_entry_t old;
 	PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n",
 	    (uint32_t) va, (uint32_t) pa));
 
 
 	bucket = pmap_get_l2_bucket(pmap_kernel(), va);
 	if (bucket == NULL)
 		bucket = pmap_grow_l2_bucket(pmap_kernel(), va);
 	KASSERT(bucket != NULL, ("No L2 Bucket"));
 
 	ptep = &bucket->l2b_kva[l2pte_index(va)];
 	old = *ptep;
 	PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n",
 	    (uint32_t) ptep, old, *ptep));
 
 	if (l2pte_valid(old)) {
 		/* Replacing a live mapping: flush its cache lines and TLB. */
 		cpu_dcache_wbinv_range(va, PAGE_SIZE);
 		cpu_tlb_flushD_SE(va);
 		cpu_cpwait();
 	} else if (old == 0) {
 		/* Slot was empty: one more entry in this bucket. */
 		bucket->l2b_occupancy++;
 	}
 
 	*ptep = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, 
 	    VM_PROT_READ | VM_PROT_WRITE);
 	if (flags & KENTER_CACHE)
 		*ptep |= pte_l2_s_cache_mode;
 	if (flags & KENTER_USER)
 		*ptep |= L2_S_PROT_U;
 	PTE_SYNC(ptep);
 }
 
 /*
  * Enter a cacheable kernel mapping of 'pa' at 'va'.
  */
 void
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pmap_kenter_internal(va, pa, KENTER_CACHE);
 }
 
 /*
  * Enter a kernel mapping of 'pa' at 'va' without any cache mode bits.
  */
 void
 pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa)
 {
 	const int no_flags = 0;
 
 	pmap_kenter_internal(va, pa, no_flags);
 }
 
 /*
  * Enter a cacheable, user-accessible mapping of 'pa' at 'va' in the
  * kernel pmap and pre-fault it.
  */
 void
 pmap_kenter_user(vm_offset_t va, vm_paddr_t pa)
 {
 	const int kflags = KENTER_CACHE|KENTER_USER;
 
 	pmap_kenter_internal(va, pa, kflags);
 
 	/*
 	 * Run pmap_fault_fixup right away so that the first use of the new
 	 * address cannot raise an exception; bad things would happen
 	 * otherwise, as one of these addresses is used in the exception
 	 * handlers.
 	 */
 	pmap_fault_fixup(pmap_kernel(), va, VM_PROT_READ|VM_PROT_WRITE, 1);
 }
 
 /*
  * remove a page from the kernel pagetables
  */
 void
 pmap_kremove(vm_offset_t va)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *pte, opte;
 
 	/*
 	 * If no L2 bucket covers va there is nothing mapped, so there is
 	 * nothing to remove.  (The assertion that used to follow this
 	 * check could never fire and has been dropped.)
 	 */
 	l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 	if (l2b == NULL)
 		return;
 
 	pte = &l2b->l2b_kva[l2pte_index(va)];
 	opte = *pte;
 	if (l2pte_valid(opte)) {
 		/*
 		 * Write back/invalidate the cache lines and drop the TLB
 		 * entry for the page before clearing the PTE.
 		 */
 		cpu_dcache_wbinv_range(va, PAGE_SIZE);
 		cpu_tlb_flushD_SE(va);
 		cpu_cpwait();
 		*pte = 0;
 	}
 }
 
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	/* With the small allocator, return arm_ptovirt()'s address for start. */
 	return (arm_ptovirt(start));
 #else
 	vm_offset_t first_va, cur_va, pa;
 
 	first_va = *virt;
 
 	PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, "
 	    "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end,
 	    prot));
 
 	/* Enter one kernel mapping per page of [start, end). */
 	cur_va = first_va;
 	for (pa = start; pa < end; pa += PAGE_SIZE, cur_va += PAGE_SIZE)
 		pmap_kenter(cur_va, pa);
 
 	/* Report the first address after the mapped region to the caller. */
 	*virt = cur_va;
 	return (first_va);
 #endif
 }
 
 static void
 pmap_wb_page(vm_page_t m)
 {
 	struct pv_entry *pve;
 	int not_writable;
 
 	/*
 	 * Walk every pv entry of the page and call pmap_dcache_wb_range()
 	 * on its mapping.  The fourth argument is always FALSE here; the
 	 * last one tells whether the mapping lacks PVF_WRITE.
 	 */
 	for (pve = TAILQ_FIRST(&m->md.pv_list); pve != NULL;
 	    pve = TAILQ_NEXT(pve, pv_list)) {
 		not_writable = ((pve->pv_flags & PVF_WRITE) == 0);
 		pmap_dcache_wb_range(pve->pv_pmap, pve->pv_va, PAGE_SIZE,
 		    FALSE, not_writable);
 	}
 }
 
 static void
 pmap_inv_page(vm_page_t m)
 {
 	struct pv_entry *pve;
 
 	/*
 	 * Same walk as pmap_wb_page(), but both trailing arguments of
 	 * pmap_dcache_wb_range() are TRUE for every mapping.
 	 */
 	for (pve = TAILQ_FIRST(&m->md.pv_list); pve != NULL;
 	    pve = TAILQ_NEXT(pve, pv_list)) {
 		pmap_dcache_wb_range(pve->pv_pmap, pve->pv_va, PAGE_SIZE,
 		    TRUE, TRUE);
 	}
 }
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  */
 void
 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
 {
 
 	/*
 	 * For each of the 'count' pages: write back its existing mappings,
 	 * then enter it (cached) at the next page-sized slot from va.
 	 */
 	for (; count > 0; count--, m++, va += PAGE_SIZE) {
 		pmap_wb_page(*m);
 		pmap_kenter_internal(va, VM_PAGE_TO_PHYS(*m), KENTER_CACHE);
 	}
 }
 
 
 /*
  * this routine jerks page mappings from the
  * kernel -- it is meant only for temporary mappings.
  */
 void
 pmap_qremove(vm_offset_t va, int count)
 {
 	vm_paddr_t phys;
 
 	/*
 	 * Visit 'count' consecutive pages from va.  Slots for which
 	 * vtophys() yields 0 are skipped; for the others the page's
 	 * mappings are invalidated and the kernel mapping is removed.
 	 */
 	for (; count > 0; count--, va += PAGE_SIZE) {
 		phys = vtophys(va);
 		if (phys == 0)
 			continue;
 		pmap_inv_page(PHYS_TO_VM_PAGE(phys));
 		pmap_kremove(va);
 	}
 }
 
 
 /*
  * pmap_object_init_pt preloads the ptes for a given object
  * into the specified pmap.  This eliminates the blast of soft
  * faults on process startup and immediately after an mmap.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	/*
 	 * This implementation preloads nothing: it only asserts that the
 	 * object lock is held and that the object is a device object.
 	 */
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pdep;
 	pt_entry_t *ptep;
 
 	/* No PDE/PTE pair for this address: not prefaultable. */
 	if (pmap_get_pde_pte(pmap, addr, &pdep, &ptep) == FALSE)
 		return (FALSE);
 	KASSERT(ptep != NULL, ("Valid mapping but no pte ?"));
 
 	/* Only an empty (zero) PTE slot is eligible. */
 	return ((*ptep == 0) ? TRUE : FALSE);
 }
 
 /*
  * Fetch pointers to the PDE/PTE for the given pmap/VA pair.
  * Returns TRUE if the mapping exists, else FALSE.
  *
  * NOTE: This function is only used by a couple of arm-specific modules.
  * It is not safe to take any pmap locks here, since we could be right
  * in the middle of debugging the pmap anyway...
  *
  * It is possible for this routine to return FALSE even though a valid
  * mapping does exist. This is because we don't lock, so the metadata
  * state may be inconsistent.
  *
  * NOTE: We can return a NULL *ptp in the case where the L1 pde is
  * a "section" mapping.
  */
 boolean_t
 pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t *pdep, pde;
 	pt_entry_t *ptbase;
 	u_short idx;
 
 	/* A pmap without an L1 table has no mappings to report. */
 	if (pm->pm_l1 == NULL)
 		return (FALSE);
 
 	/* Hand back the L1 slot for va and take a snapshot of its value. */
 	idx = L1_IDX(va);
 	pdep = &pm->pm_l1->l1_kva[idx];
 	*pdp = pdep;
 	pde = *pdep;
 
 	/* Section mappings have no L2 entry; report success with no PTE. */
 	if (l1pte_section_p(pde)) {
 		*ptp = NULL;
 		return (TRUE);
 	}
 
 	if (pm->pm_l2 == NULL)
 		return (FALSE);
 
 	/* Walk the L2 metadata: dtable first, then the bucket's PTE array. */
 	dtable = pm->pm_l2[L2_IDX(idx)];
 	if (dtable == NULL)
 		return (FALSE);
 
 	ptbase = dtable->l2_bucket[L2_BUCKET(idx)].l2b_kva;
 	if (ptbase == NULL)
 		return (FALSE);
 
 	*ptp = &ptbase[l2pte_index(va)];
 	return (TRUE);
 }
 
 /*
  *      Routine:        pmap_remove_all
  *      Function:
  *              Removes this physical page from
  *              all physical maps in which it resides.
  *              Reflects back modify bits to the pager.
  *
  *      Notes:
  *              Original versions of this routine were very
  *              inefficient because they iteratively called
  *              pmap_remove (slow...)
  */
 void
 pmap_remove_all(vm_page_t m)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep, pte;
 	struct l2_bucket *l2b;
 	boolean_t flush = FALSE;
 	pmap_t curpm;
 	int flags = 0;
 
 #if defined(PMAP_DEBUG)
 	/*
 	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
 	 * pages!
 	 */
 	if (m->flags & PG_FICTITIOUS) {
 		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m));
 	}
 #endif
 
 	/* Nothing maps this page: nothing to do. */
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		return;
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	curpm = vmspace_pmap(curproc->p_vmspace);
 	/* Pop pv entries off the page's list until it is empty. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		/*
 		 * A TLB flush is only requested if at least one mapping
 		 * belongs to the current process' pmap or the kernel pmap.
 		 */
 		if (flush == FALSE && (pv->pv_pmap == curpm ||
 		    pv->pv_pmap == pmap_kernel()))
 			flush = TRUE;
 		PMAP_LOCK(pv->pv_pmap);
 		l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va);
 		KASSERT(l2b != NULL, ("No l2 bucket"));
 		ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)];
 		/* NOTE(review): 'pte' is read here but not used afterwards. */
 		pte = *ptep;
 		*ptep = 0;
 		PTE_SYNC_CURRENT(pv->pv_pmap, ptep);
 		/* One mapping fewer in this bucket. */
 		pmap_free_l2_bucket(pv->pv_pmap, l2b, 1);
 		if (pv->pv_flags & PVF_WIRED)
 			pv->pv_pmap->pm_stats.wired_count--;
 		pv->pv_pmap->pm_stats.resident_count--;
 		/* Accumulate flags to choose between I+D and D-only flush. */
 		flags |= pv->pv_flags;
 		pmap_nuke_pv(m, pv->pv_pmap, pv);
 		PMAP_UNLOCK(pv->pv_pmap);
 		pmap_free_pv_entry(pv);
 	}
 
 	if (flush) {
 		if (PV_BEEN_EXECD(flags))
 			pmap_tlb_flushID(curpm);
 		else
 			pmap_tlb_flushD(curpm);
 	}
 	/* No mappings remain, so the page can no longer be written via one. */
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  */
 void
 pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_offset_t next_bucket;
 	u_int flags;
 	int flush;
 
 	/* Removing read access is handled as removing the mappings. */
 	if ((prot & VM_PROT_READ) == 0) {
 		pmap_remove(pm, sva, eva);
 		return;
 	}
 
 	if (prot & VM_PROT_WRITE) {
 		/*
 		 * If this is a read->write transition, just ignore it and let
 		 * vm_fault() take care of it later.
 		 */
 		return;
 	}
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pm);
 
 	/*
 	 * OK, at this point, we know we're doing write-protect operation.
 	 * If the pmap is active, write-back the range.
 	 */
 	pmap_dcache_wb_range(pm, sva, eva - sva, FALSE, FALSE);
 
 	/*
 	 * flush == -1: small range (< 4 pages), TLB entries are flushed one
 	 * page at a time inside the loop.
 	 * flush >= 0:  larger range, count changed pages and collect their
 	 * flags; a single whole-TLB flush is done after the loop.
 	 */
 	flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1;
 	flags = 0;
 
 	while (sva < eva) {
 		/* Process at most one L2 bucket's worth per outer iteration. */
 		next_bucket = L2_NEXT_BUCKET(sva);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pm, sva);
 		if (l2b == NULL) {
 			/* Nothing mapped in this bucket; skip it entirely. */
 			sva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(sva)];
 
 		while (sva < next_bucket) {
 			/* Only entries that are present and writable change. */
 			if ((pte = *ptep) != 0 && (pte & L2_S_PROT_W) != 0) {
 				struct vm_page *pg;
 				u_int f;
 
 				pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
 				pte &= ~L2_S_PROT_W;
 				*ptep = pte;
 				PTE_SYNC(ptep);
 
 				if (pg != NULL) {
 					/* Clear PVF_WRITE in the pv entry. */
 					f = pmap_modify_pv(pg, pm, sva,
 					    PVF_WRITE, 0);
 					pmap_vac_me_harder(pg, pm, sva);
 					vm_page_dirty(pg);
 				} else
 					/* No vm_page: assume ref'd and exec'd. */
 					f = PVF_REF | PVF_EXEC;
 
 				if (flush >= 0) {
 					flush++;
 					flags |= f;
 				} else
 				if (PV_BEEN_EXECD(f))
 					pmap_tlb_flushID_SE(pm, sva);
 				else
 				if (PV_BEEN_REFD(f))
 					pmap_tlb_flushD_SE(pm, sva);
 			}
 
 			sva += PAGE_SIZE;
 			ptep++;
 		}
 	}
 
 
 	/*
 	 * In per-page mode (flush == -1) 'flags' is still 0, so neither
 	 * branch below fires.
 	 */
 	if (flush) {
 		if (PV_BEEN_EXECD(flags))
 			pmap_tlb_flushID(pm);
 		else
 		if (PV_BEEN_REFD(flags))
 			pmap_tlb_flushD(pm);
 	}
 	vm_page_unlock_queues();
 
  	PMAP_UNLOCK(pm);
 }
 
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  */
 
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     boolean_t wired)
 {
 
 	/*
 	 * Take the page queues lock and the pmap lock, then let
 	 * pmap_enter_locked() do the work with M_WAITOK.
 	 */
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK);
 	vm_page_unlock_queues();
  	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	The page queues and pmap must be locked.
  */
 static void
 pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     boolean_t wired, int flags)
 {
 	/*
 	 * Install (or update) the mapping of page 'm' at 'va' in 'pmap'.
 	 *
 	 * 'prot' and 'wired' are translated into PVF_* flags and PTE bits.
 	 * 'flags' is only tested for M_WAITOK: when an L2 bucket cannot be
 	 * allocated for a user pmap, M_WAITOK makes us drop both locks,
 	 * wait and retry; otherwise we return without mapping anything.
 	 */
 	struct l2_bucket *l2b = NULL;
 	struct vm_page *opg;
 	struct pv_entry *pve = NULL;
 	pt_entry_t *ptep, npte, opte;
 	u_int nflags;
 	u_int oflags;
 	vm_paddr_t pa;
 
 	PMAP_ASSERT_LOCKED(pmap);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (va == vector_page) {
 		/* The vector page always maps the system page, unmanaged. */
 		pa = systempage.pv_pa;
 		m = NULL;
 	} else
 		pa = VM_PAGE_TO_PHYS(m);
 	nflags = 0;
 	if (prot & VM_PROT_WRITE)
 		nflags |= PVF_WRITE;
 	if (prot & VM_PROT_EXECUTE)
 		nflags |= PVF_EXEC;
 	if (wired)
 		nflags |= PVF_WIRED;
 	PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, "
 	    "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired));
 
 	if (pmap == pmap_kernel()) {
 		l2b = pmap_get_l2_bucket(pmap, va);
 		if (l2b == NULL)
 			l2b = pmap_grow_l2_bucket(pmap, va);
 	} else {
 do_l2b_alloc:
 		l2b = pmap_alloc_l2_bucket(pmap, va);
 		if (l2b == NULL) {
 			if (flags & M_WAITOK) {
 				PMAP_UNLOCK(pmap);
 				vm_page_unlock_queues();
 				VM_WAIT;
 				vm_page_lock_queues();
 				PMAP_LOCK(pmap);
 				goto do_l2b_alloc;
 			}
 			return;
 		}
 	}
 
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 
 	opte = *ptep;
 	npte = pa;
 	oflags = 0;
 	if (opte) {
 		/*
 		 * There is already a mapping at this address.
 		 * If the physical address is different, lookup the
 		 * vm_page.
 		 */
 		if (l2pte_pa(opte) != pa)
 			opg = PHYS_TO_VM_PAGE(l2pte_pa(opte));
 		else
 			opg = m;
 	} else
 		opg = NULL;
 
 	if ((prot & (VM_PROT_ALL)) ||
 	    (!m || m->md.pvh_attrs & PVF_REF)) {
 		/*
 		 * - The access type indicates that we don't need
 		 *   to do referenced emulation.
 		 * OR
 		 * - The physical page has already been referenced
 		 *   so no need to re-do referenced emulation here.
 		 */
 		npte |= L2_S_PROTO;
 
 		nflags |= PVF_REF;
 
 		if (m && ((prot & VM_PROT_WRITE) != 0 ||
 		    (m->md.pvh_attrs & PVF_MOD))) {
 			/*
 			 * This is a writable mapping, and the
 			 * page's mod state indicates it has
 			 * already been modified. Make it
 			 * writable from the outset.
 			 */
 			nflags |= PVF_MOD;
 			if (!(m->md.pvh_attrs & PVF_MOD))
 				vm_page_dirty(m);
 		}
 		if (m && opte)
 			vm_page_flag_set(m, PG_REFERENCED);
 	} else {
 		/*
 		 * Need to do page referenced emulation.
 		 */
 		npte |= L2_TYPE_INV;
 	}
 
 	if (prot & VM_PROT_WRITE) {
 		npte |= L2_S_PROT_W;
 		if (m != NULL)
 			vm_page_flag_set(m, PG_WRITEABLE);
 	}
 	npte |= pte_l2_s_cache_mode;
 	if (m && m == opg) {
 		/*
 		 * We're changing the attrs of an existing mapping.
 		 */
 		oflags = pmap_modify_pv(m, pmap, va,
 		    PVF_WRITE | PVF_EXEC | PVF_WIRED |
 		    PVF_MOD | PVF_REF, nflags);
 
 		/*
 		 * We may need to flush the cache if we're
 		 * doing rw-ro...
 		 */
 		if (pmap_is_current(pmap) &&
 		    (oflags & PVF_NC) == 0 &&
 			    (opte & L2_S_PROT_W) != 0 &&
 			    (prot & VM_PROT_WRITE) == 0)
 			cpu_dcache_wb_range(va, PAGE_SIZE);
 	} else {
 		/*
 		 * New mapping, or changing the backing page
 		 * of an existing mapping.
 		 */
 		if (opg) {
 			/*
 			 * Replacing an existing mapping with a new one.
 			 * It is part of our managed memory so we
 			 * must remove it from the PV list
 			 */
 			pve = pmap_remove_pv(opg, pmap, va);
 			if (pve != NULL) {
 				/*
 				 * Capture the old flags while the entry is
 				 * still ours; it may be freed just below.
 				 */
 				oflags = pve->pv_flags;
 
 				/*
 				 * If the old mapping was valid (ref/mod
 				 * emulation creates 'invalid' mappings
 				 * initially) then make sure to frob
 				 * the cache.
 				 */
 				if ((oflags & PVF_NC) == 0 &&
 				    l2pte_valid(opte)) {
 					if (PV_BEEN_EXECD(oflags)) {
 						pmap_idcache_wbinv_range(pmap,
 						    va, PAGE_SIZE);
 					} else
 						if (PV_BEEN_REFD(oflags)) {
 							pmap_dcache_wb_range(
 							    pmap, va,
 							    PAGE_SIZE, TRUE,
 							    (oflags &
 							    PVF_WRITE) == 0);
 						}
 				}
 
 				/*
 				 * The removed entry is only reused when the
 				 * new page is managed (see pmap_enter_pv()
 				 * below).  Otherwise release it and forget
 				 * the pointer so it cannot be used again.
 				 */
 				if (m == NULL || (m->flags &
 				    (PG_UNMANAGED | PG_FICTITIOUS))) {
 					pmap_free_pv_entry(pve);
 					pve = NULL;
 				}
 			} else if (m != NULL &&
 			    !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) {
 				/*
 				 * The old page had no pv entry for this
 				 * mapping but the new one needs one.  There
 				 * are no old flags to act upon, so oflags
 				 * stays 0.
 				 */
 				pve = pmap_get_pv_entry();
 			}
 			KASSERT(pve != NULL || m == NULL ||
 			    (m->flags & (PG_UNMANAGED | PG_FICTITIOUS)),
 			    ("No pv"));
 		} else if (m && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS)))
 			if ((pve = pmap_get_pv_entry()) == NULL) {
 				panic("pmap_enter: no pv entries");	
 			}
 		if (m && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) {
 			KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 			    ("pmap_enter: managed mapping within the clean submap"));
 			pmap_enter_pv(m, pve, pmap, va, nflags);
 		}
 	}
 	/*
 	 * Make sure userland mappings get the right permissions
 	 */
 	if (pmap != pmap_kernel() && va != vector_page) {
 		npte |= L2_S_PROT_U;
 	}
 
 	/*
 	 * Keep the stats up to date
 	 */
 	if (opte == 0) {
 		l2b->l2b_occupancy++;
 		pmap->pm_stats.resident_count++;
 	}
 
 
 	/*
 	 * If this is just a wiring change, the two PTEs will be
 	 * identical, so there's no need to update the page table.
 	 */
 	if (npte != opte) {
 		boolean_t is_cached = pmap_is_current(pmap);
 
 		*ptep = npte;
 		if (is_cached) {
 			/*
 			 * We only need to frob the cache/tlb if this pmap
 			 * is current
 			 */
 			PTE_SYNC(ptep);
 			if (L1_IDX(va) != L1_IDX(vector_page) &&
 			    l2pte_valid(npte)) {
 				/*
 				 * This mapping is likely to be accessed as
 				 * soon as we return to userland. Fix up the
 				 * L1 entry to avoid taking another
 				 * page/domain fault.
 				 */
 				pd_entry_t *pl1pd, l1pd;
 
 				pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)];
 				l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) |
 				    L1_C_PROTO;
 				if (*pl1pd != l1pd) {
 					*pl1pd = l1pd;
 					PTE_SYNC(pl1pd);
 				}
 			}
 		}
 
 		/* Flush the stale TLB entry according to the old flags. */
 		if (PV_BEEN_EXECD(oflags))
 			pmap_tlb_flushID_SE(pmap, va);
 		else if (PV_BEEN_REFD(oflags))
 			pmap_tlb_flushD_SE(pmap, va);
 
 
 		if (m)
 			pmap_vac_me_harder(m, pmap, va);
 	}
 }
 
 /*
  * Maps a sequence of resident pages belonging to the same object.
  * The sequence begins with the given page m_start.  This page is
  * mapped at the given virtual address start.  Each subsequent page is
  * mapped at a virtual address that is offset from start by the same
  * amount as the page is offset from m_start within the object.  The
  * last page in the sequence is the page with the largest offset from
  * m_start that can be mapped at a virtual address less than the given
  * virtual address end.  Not every virtual page between start and end
  * is mapped; only those for which a resident page exists with the
  * corresponding offset from m_start are mapped.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t pg;
 	vm_pindex_t npages, off;
 
 	npages = atop(end - start);
 
 	PMAP_LOCK(pmap);
 	/*
 	 * Follow the listq chain from m_start and stop at the first page
 	 * whose index offset from m_start falls outside the window.
 	 * Each page is entered unwired with write permission masked off,
 	 * and M_NOWAIT so the enter never sleeps.
 	 */
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		off = pg->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		pmap_enter_locked(pmap, start + ptoa(off), pg,
 		    prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	vm_prot_t masked_prot;
 
 	/* Only read and execute permission are passed on; never wired. */
 	masked_prot = prot & (VM_PROT_READ | VM_PROT_EXECUTE);
 
 	PMAP_LOCK(pmap);
 	pmap_enter_locked(pmap, va, m, masked_prot, FALSE, M_NOWAIT);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Change the wiring attribute for a map/virtual-address
  *			pair.
  *	In/out conditions:
  *			The mapping must already exist in the pmap.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, pte;
 	vm_page_t pg;
 
 	vm_page_lock_queues();
  	PMAP_LOCK(pmap);
 	l2b = pmap_get_l2_bucket(pmap, va);
 	KASSERT(l2b, ("No l2b bucket in pmap_change_wiring"));
 	ptep = &l2b->l2b_kva[l2pte_index(va)];
 	pte = *ptep;
 	pg = PHYS_TO_VM_PAGE(l2pte_pa(pte));
 	/*
 	 * pmap_modify_pv() takes a mask of PVF_* bits to clear and a mask
 	 * of PVF_* bits to set.  Always clear PVF_WIRED and set it again
 	 * only when wiring is requested; the boolean 'wired' itself is not
 	 * a PVF_* mask and must not be passed as one.
 	 */
 	if (pg != NULL)
 		pmap_modify_pv(pg, pmap, va, PVF_WIRED,
 		    wired ? PVF_WIRED : 0);
 	vm_page_unlock_queues();
  	PMAP_UNLOCK(pmap);
 }
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 	/* Intentionally empty: this implementation copies nothing. */
 }
 
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  */
 vm_paddr_t
 pmap_extract(pmap_t pm, vm_offset_t va)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t l1pd;
 	pt_entry_t *ptbase, pte;
 	vm_paddr_t pa;
 	u_int l1idx;
 
 	/* 0 is returned whenever no mapping is found. */
 	pa = 0;
 	l1idx = L1_IDX(va);
 
 	PMAP_LOCK(pm);
 	l1pd = pm->pm_l1->l1_kva[l1idx];
 	if (l1pte_section_p(l1pd)) {
 		/*
 		 * Section (or supersection) mapping: the address comes
 		 * straight from the L1 descriptor.  Only the kernel pmap
 		 * is expected to have these.
 		 */
 		KASSERT(pm == pmap_kernel(), ("huh"));
 		/* XXX: what to do about the bits > 32 ? */
 		if (l1pd & L1_S_SUPERSEC)
 			pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET);
 		else
 			pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET);
 		goto done;
 	}
 
 	/*
 	 * A valid L1 descriptor does not prove that a mapping exists,
 	 * so consult the L2 dtable and the bucket's PTE array.
 	 */
 	dtable = pm->pm_l2[L2_IDX(l1idx)];
 	if (dtable == NULL)
 		goto done;
 	ptbase = dtable->l2_bucket[L2_BUCKET(l1idx)].l2b_kva;
 	if (ptbase == NULL)
 		goto done;
 
 	pte = ptbase[l2pte_index(va)];
 	if (pte == 0)
 		goto done;
 
 	/* Large pages use a wider frame/offset split than small pages. */
 	if ((pte & L2_TYPE_MASK) == L2_TYPE_L)
 		pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET);
 	else
 		pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET);
 
 done:
 	PMAP_UNLOCK(pm);
 	return (pa);
 }
 
 /*
  * Atomically extract and hold the physical page with the given
  * pmap and virtual address pair if that mapping permits the given
  * protection.
  *
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	struct l2_dtable *dtable;
 	pd_entry_t l1pd;
 	pt_entry_t *ptbase, pte;
 	vm_paddr_t pa;
 	vm_page_t held;
 	u_int l1idx;
 
 	/* NULL is returned when nothing suitable is mapped at va. */
 	held = NULL;
 	l1idx = L1_IDX(va);
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	l1pd = pmap->pm_l1->l1_kva[l1idx];
 	if (l1pte_section_p(l1pd)) {
 		/*
 		 * Section mappings are only expected in the kernel pmap.
 		 */
 		KASSERT(pmap == pmap_kernel(), ("huh"));
 		/* XXX: what to do about the bits > 32 ? */
 		if (l1pd & L1_S_SUPERSEC)
 			pa = (l1pd & L1_SUP_FRAME) | (va & L1_SUP_OFFSET);
 		else
 			pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET);
 		/* Hold only if the mapping allows the requested access. */
 		if (l1pd & L1_S_PROT_W || (prot & VM_PROT_WRITE) == 0) {
 			held = PHYS_TO_VM_PAGE(pa);
 			vm_page_hold(held);
 		}
 		goto out;
 	}
 
 	/*
 	 * The L1 descriptor alone does not tell whether a mapping
 	 * exists; look it up through the L2 dtable.
 	 */
 	dtable = pmap->pm_l2[L2_IDX(l1idx)];
 	if (dtable == NULL)
 		goto out;
 	ptbase = dtable->l2_bucket[L2_BUCKET(l1idx)].l2b_kva;
 	if (ptbase == NULL)
 		goto out;
 
 	pte = ptbase[l2pte_index(va)];
 	if (pte == 0)
 		goto out;
 
 	/* Write requests are only honoured by writable mappings. */
 	if (pte & L2_S_PROT_W || (prot & VM_PROT_WRITE) == 0) {
 		if ((pte & L2_TYPE_MASK) == L2_TYPE_L)
 			pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET);
 		else
 			pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET);
 		held = PHYS_TO_VM_PAGE(pa);
 		vm_page_hold(held);
 	}
 
 out:
 	PMAP_UNLOCK(pmap);
 	vm_page_unlock_queues();
 	return (held);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  */
 
-void
+int
 pmap_pinit(pmap_t pmap)
 {
 	PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap));
 	
 	PMAP_LOCK_INIT(pmap);
 	/* Allocate the L1 table and start with an empty L2 lookup array. */
 	pmap_alloc_l1(pmap);
 	bzero(pmap->pm_l2, sizeof(pmap->pm_l2));
 
 	pmap->pm_count = 1;
 	pmap->pm_active = 0;
 		
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_stats.resident_count = 1;
 	/*
 	 * When the vector page lies below KERNBASE, enter it into this
 	 * pmap as a wired, read-only mapping of the system page.
 	 */
 	if (vector_page < KERNBASE) {
 		pmap_enter(pmap, vector_page, PHYS_TO_VM_PAGE(systempage.pv_pa),
 		    VM_PROT_READ, 1);
 	} 
 	/* This implementation always returns 1. */
+	return (1);
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 
 static void
 pmap_free_pv_entry(pv_entry_t pv)
 {
 	/* Account for the released entry, then hand it back to pvzone. */
 	pv_entry_count--;
 	uma_zfree(pvzone, pv);
 }
 
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  * the memory allocation is performed bypassing the malloc code
  * because of the possibility of allocations at interrupt time.
  */
 static pv_entry_t
 pmap_get_pv_entry(void)
 {
 	pv_entry_t ret_value;
 	
 	pv_entry_count++;
 	if (pv_entry_count > pv_entry_high_water)
 		pagedaemon_wakeup();
 	ret_value = uma_zalloc(pvzone, M_NOWAIT);
 	return ret_value;
 }
 
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  */
 #define  PMAP_REMOVE_CLEAN_LIST_SIZE     3
 void
 pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
 {
 	struct l2_bucket *l2b;
 	vm_offset_t next_bucket;
 	pt_entry_t *ptep;
 	u_int cleanlist_idx, total, cnt;
 	struct {
 		vm_offset_t va;
 		pt_entry_t *pte;
 	} cleanlist[PMAP_REMOVE_CLEAN_LIST_SIZE];
 	u_int mappings, is_exec, is_refd;
 	int flushall = 0;
 
 
 	/*
 	 * we lock in the pmap => pv_head direction
 	 */
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pm);
 	/*
 	 * cleanlist_idx doubles as a state variable:
 	 *   < SIZE   valid PTEs are queued in cleanlist[] and cleaned
 	 *            individually after each bucket;
 	 *   == SIZE  the list is full: the next valid PTE triggers a
 	 *            whole cache/TLB flush;
 	 *   > SIZE   PTEs are zeroed directly with a per-page TLB flush.
 	 * A pmap that is not current starts in the last state.
 	 */
 	if (!pmap_is_current(pm)) {
 		cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1;
 	} else
 		cleanlist_idx = 0;
 
 	total = 0;
 	while (sva < eva) {
 		/*
 		 * Do one L2 bucket's worth at a time.
 		 */
 		next_bucket = L2_NEXT_BUCKET(sva);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		l2b = pmap_get_l2_bucket(pm, sva);
 		if (l2b == NULL) {
 			sva = next_bucket;
 			continue;
 		}
 
 		ptep = &l2b->l2b_kva[l2pte_index(sva)];
 		/* Number of PTEs released from this bucket. */
 		mappings = 0;
 
 		while (sva < next_bucket) {
 			struct vm_page *pg;
 			pt_entry_t pte;
 			vm_paddr_t pa;
 
 			pte = *ptep;
 
 			if (pte == 0) {
 				/*
 				 * Nothing here, move along
 				 */
 				sva += PAGE_SIZE;
 				ptep++;
 				continue;
 			}
 
 			pm->pm_stats.resident_count--;
 			pa = l2pte_pa(pte);
 			/* Defaults used when the page has no pv entry. */
 			is_exec = 0;
 			is_refd = 1;
 
 			/*
 			 * Update flags. In a number of circumstances,
 			 * we could cluster a lot of these and do a
 			 * number of sequential pages in one go.
 			 */
 			if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) {
 				struct pv_entry *pve;
 
 				pve = pmap_remove_pv(pg, pm, sva);
 				if (pve) {
 					is_exec = PV_BEEN_EXECD(pve->pv_flags);
 					is_refd = PV_BEEN_REFD(pve->pv_flags);
 					pmap_free_pv_entry(pve);
 				}
 			}
 
 			/* Invalid PTE: clear it; no cache/TLB flush is done. */
 			if (!l2pte_valid(pte)) {
 				*ptep = 0;
 				PTE_SYNC_CURRENT(pm, ptep);
 				sva += PAGE_SIZE;
 				ptep++;
 				mappings++;
 				continue;
 			}
 
 			if (cleanlist_idx < PMAP_REMOVE_CLEAN_LIST_SIZE) {
 				/* Add to the clean list. */
 				cleanlist[cleanlist_idx].pte = ptep;
 				/* Bit 0 of the saved va records is_exec. */
 				cleanlist[cleanlist_idx].va =
 				    sva | (is_exec & 1);
 				cleanlist_idx++;
 			} else
 			if (cleanlist_idx == PMAP_REMOVE_CLEAN_LIST_SIZE) {
 				/* Nuke everything if needed. */
 				pmap_idcache_wbinv_all(pm);
 				pmap_tlb_flushID(pm);
 
 				/*
 				 * Roll back the previous PTE list,
 				 * and zero out the current PTE.
 				 */
 				for (cnt = 0;
 				     cnt < PMAP_REMOVE_CLEAN_LIST_SIZE; cnt++) {
 					*cleanlist[cnt].pte = 0;
 				}
 				*ptep = 0;
 				PTE_SYNC(ptep);
 				cleanlist_idx++;
 				flushall = 1;
 			} else {
 				*ptep = 0;
 				PTE_SYNC(ptep);
 					if (is_exec)
 						pmap_tlb_flushID_SE(pm, sva);
 					else
 					if (is_refd)
 						pmap_tlb_flushD_SE(pm, sva);
 			}
 
 			sva += PAGE_SIZE;
 			ptep++;
 			mappings++;
 		}
 
 		/*
 		 * Deal with any left overs
 		 */
 		if (cleanlist_idx <= PMAP_REMOVE_CLEAN_LIST_SIZE) {
 			total += cleanlist_idx;
 			for (cnt = 0; cnt < cleanlist_idx; cnt++) {
 				/* Strip the is_exec tag to recover the va. */
 				vm_offset_t clva =
 				    cleanlist[cnt].va & ~1;
 				if (cleanlist[cnt].va & 1) {
 					pmap_idcache_wbinv_range(pm,
 					    clva, PAGE_SIZE);
 					pmap_tlb_flushID_SE(pm, clva);
 				} else {
 					pmap_dcache_wb_range(pm,
 					    clva, PAGE_SIZE, TRUE,
 					    FALSE);
 					pmap_tlb_flushD_SE(pm, clva);
 				}
 				*cleanlist[cnt].pte = 0;
 				PTE_SYNC_CURRENT(pm, cleanlist[cnt].pte);
 			}
 
 			if (total <= PMAP_REMOVE_CLEAN_LIST_SIZE)
 				cleanlist_idx = 0;
 			else {
 				/*
 				 * We are removing so many entries it's just
 				 * easier to flush the whole cache.
 				 */
 				cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1;
 				pmap_idcache_wbinv_all(pm);
 				flushall = 1;
 			}
 		}
 
 		/* Tell the bucket how many of its PTEs were released. */
 		pmap_free_l2_bucket(pm, l2b, mappings);
 	}
 
 	vm_page_unlock_queues();
 	if (flushall)
 		cpu_tlb_flushID();
  	PMAP_UNLOCK(pm);
 }
 
 
 
 
 /*
  * pmap_zero_page()
  * 
  * Zero a given physical page by mapping it at a page hook point.
  * In doing the zero page op, the page we zero is mapped cachable, as with
  * StrongARM accesses to non-cached pages are non-burst making writing
  * _any_ bulk data very slow.
  */
 #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_CORE3)
 void
 pmap_zero_page_generic(vm_paddr_t phys, int off, int size)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	char *dstpg;
 #endif
 
 #ifdef DEBUG
 	struct vm_page *pg = PHYS_TO_VM_PAGE(phys);
 
 	if (pg->md.pvh_list != NULL)
 		panic("pmap_zero_page: page has mappings");
 #endif
 
 	/*
 	 * Fast path: if an _arm_bzero hook is set and the request is at
 	 * least _min_bzero_size bytes, let it zero the physical range.
 	 * A return value of 0 means there is nothing left to do.
 	 */
 	if (_arm_bzero && size >= _min_bzero_size &&
 	    _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0)
 		return;
 
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Zero through the address arm_ptovirt() returns for the page. */
 	dstpg = (char *)arm_ptovirt(phys);
 	if (off || size != PAGE_SIZE) {
 		bzero(dstpg + off, size);
 		cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size);
 	} else {
 		bzero_page((vm_offset_t)dstpg);
 		cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE);
 	}
 #else
 
 	/* cmtx is held while the cdstp/cdst_pte hook is in use. */
 	mtx_lock(&cmtx);
 	/*
 	 * Hook in the page, zero it, and purge the cache for that
 	 * zeroed page. Invalidate the TLB as needed.
 	 */
 	*cdst_pte = L2_S_PROTO | phys |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode;
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	/* Partial requests use bzero(); whole pages use bzero_page(). */
 	if (off || size != PAGE_SIZE) {
 		bzero((void *)(cdstp + off), size);
 		cpu_dcache_wbinv_range(cdstp + off, size);
 	} else {
 		bzero_page(cdstp);
 		cpu_dcache_wbinv_range(cdstp, PAGE_SIZE);
 	}
 	mtx_unlock(&cmtx);
 #endif
 }
 #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
 
 #if ARM_MMU_XSCALE == 1
 void
 pmap_zero_page_xscale(vm_paddr_t phys, int off, int size)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	char *dstpg;
 #endif
 
 	/*
 	 * Fast path: try the _arm_bzero hook on the physical range when it
 	 * is set and the request is at least _min_bzero_size bytes; a
 	 * return value of 0 means the work is done.
 	 */
 	if (_arm_bzero && size >= _min_bzero_size &&
 	    _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0)
 		return;
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Zero through the address arm_ptovirt() returns for the page. */
 	dstpg = (char *)arm_ptovirt(phys);
 	if (off || size != PAGE_SIZE) {
 		bzero(dstpg + off, size);
 		cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size);
 	} else {
 		bzero_page((vm_offset_t)dstpg);
 		cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE);
 	}
 #else
 	/* cmtx is held while the cdstp/cdst_pte hook is in use. */
 	mtx_lock(&cmtx);
 	/*
 	 * Hook in the page, zero it, and purge the cache for that
 	 * zeroed page. Invalidate the TLB as needed.
 	 */
 	*cdst_pte = L2_S_PROTO | phys |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) |
 	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
 	PTE_SYNC(cdst_pte);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	if (off || size != PAGE_SIZE)
 		bzero((void *)(cdstp + off), size);
 	else
 		bzero_page(cdstp);
 	mtx_unlock(&cmtx);
 	/* The mini-data cache is cleaned after cmtx has been released. */
 	xscale_cache_clean_minidata();
 #endif
 }
 
 /*
  * Change the PTEs for the specified kernel mappings such that they
  * will use the mini data cache instead of the main data cache.
  */
 void
 pmap_use_minicache(vm_offset_t va, vm_size_t size)
 {
 	struct l2_bucket *l2b;
 	pt_entry_t *ptep, *sptep, pte;
 	vm_offset_t next_bucket, eva;
 
 #if (ARM_NMMUS > 1) || defined(CPU_XSCALE_CORE3)
 	/* Nothing to do when use of the mini-data cache is turned off. */
 	if (xscale_use_minidata == 0)
 		return;
 #endif
 
 	eva = va + size;
 
 	while (va < eva) {
 		/* Handle at most one L2 bucket per outer iteration. */
 		next_bucket = L2_NEXT_BUCKET(va);
 		if (next_bucket > eva)
 			next_bucket = eva;
 
 		/*
 		 * The range is expected to be mapped already; catch a
 		 * missing bucket here instead of dereferencing NULL below.
 		 */
 		l2b = pmap_get_l2_bucket(pmap_kernel(), va);
 		KASSERT(l2b != NULL, ("No L2 Bucket"));
 
 		sptep = ptep = &l2b->l2b_kva[l2pte_index(va)];
 
 		while (va < next_bucket) {
 			pte = *ptep;
 			if (!l2pte_minidata(pte)) {
 				/*
 				 * Clean the page out of the cache and TLB
 				 * before rewriting its PTE with L2_B
 				 * cleared.
 				 */
 				cpu_dcache_wbinv_range(va, PAGE_SIZE);
 				cpu_tlb_flushD_SE(va);
 				*ptep = pte & ~L2_B;
 			}
 			ptep++;
 			va += PAGE_SIZE;
 		}
 		/* Sync all PTEs visited in this bucket in one go. */
 		PTE_SYNC_RANGE(sptep, (u_int)(ptep - sptep));
 	}
 	cpu_cpwait();
 }
 #endif /* ARM_MMU_XSCALE == 1 */
 
 /*
  *	pmap_zero_page zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	/* Zero the whole page: offset 0, PAGE_SIZE bytes. */
 	pmap_zero_page_func(VM_PAGE_TO_PHYS(m), 0, PAGE_SIZE);
 }
 
 
 /*
  *	pmap_zero_page_area zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 
 	/* Zero 'size' bytes starting 'off' bytes into the page. */
 	pmap_zero_page_func(VM_PAGE_TO_PHYS(m), off, size);
 }
 
 
 /*
  *	pmap_zero_page_idle zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.  This
  *	is intended to be called from the vm_pagezero process only and
  *	outside of Giant.
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 
 	/* No special idle-time path here: same as pmap_zero_page(). */
 	pmap_zero_page(m);
 }
 
 #if 0
 /*
  * pmap_clean_page()
  *
  * This is a local function used to work out the best strategy to clean
  * a single page referenced by its entry in the PV table. It's used by
  * pmap_copy_page, pmap_zero page and maybe some others later on.
  *
  * Its policy is effectively:
  *  o If there are no mappings, we don't bother doing anything with the cache.
  *  o If there is one mapping, we clean just that page.
  *  o If there are multiple mappings, we clean the entire cache.
  *
  * So that some functions can be further optimised, it returns 0 if it didn't
  * clean the entire cache, or 1 if it did.
  *
  * XXX One bug in this routine is that if the pv_entry has a single page
  * mapped at 0x00000000 a whole cache clean will be performed rather than
  * just the 1 page. Since this should not occur in everyday use and if it does
  * it will just result in not the most efficient clean for the page.
  */
 /*
  * pv:     first pv_entry of the page's mapping list (NULL if unmapped).
  * is_src: TRUE when the page is about to be read as a copy source; in
  *         that case read-only mappings are skipped.
  * Returns 1 if the whole cache was cleaned, 0 otherwise.
  */
 static int
 pmap_clean_page(struct pv_entry *pv, boolean_t is_src)
 {
 	pmap_t pm, pm_to_clean = NULL;
 	struct pv_entry *npv;
 	u_int cache_needs_cleaning = 0;
 	u_int flags = 0;
 	vm_offset_t page_to_clean = 0;
 
 	if (pv == NULL) {
 		/* nothing mapped in so nothing to flush */
 		return (0);
 	}
 
 	/*
 	 * Since we flush the cache each time we change to a different
 	 * user vmspace, we only need to flush the page if it is in the
 	 * current pmap.
 	 */
 	if (curthread)
 		pm = vmspace_pmap(curproc->p_vmspace);
 	else
 		pm = pmap_kernel();
 
 	/* Walk every mapping of the page, considering only kernel and current-pmap ones. */
 	for (npv = pv; npv; npv = TAILQ_NEXT(npv, pv_list)) {
 		if (npv->pv_pmap == pmap_kernel() || npv->pv_pmap == pm) {
 			/* Accumulate flags of all relevant mappings for the decisions below. */
 			flags |= npv->pv_flags;
 			/*
 			 * The page is mapped non-cacheable in 
 			 * this map.  No need to flush the cache.
 			 */
 			if (npv->pv_flags & PVF_NC) {
 #ifdef DIAGNOSTIC
 				if (cache_needs_cleaning)
 					panic("pmap_clean_page: "
 					    "cache inconsistency");
 #endif
 				break;
 			} else if (is_src && (npv->pv_flags & PVF_WRITE) == 0)
 				continue;
 			if (cache_needs_cleaning) {
 				/*
 				 * Second cacheable mapping found: give up on the
 				 * single-page clean (page_to_clean == 0 selects
 				 * the whole-cache path below).
 				 */
 				page_to_clean = 0;
 				break;
 			} else {
 				/* First cacheable mapping: remember it as the single-page candidate. */
 				page_to_clean = npv->pv_va;
 				pm_to_clean = npv->pv_pmap;
 			}
 			cache_needs_cleaning = 1;
 		}
 	}
 	if (page_to_clean) {
 		/* Exactly one mapping to clean: operate on that page only. */
 		if (PV_BEEN_EXECD(flags))
 			pmap_idcache_wbinv_range(pm_to_clean, page_to_clean,
 			    PAGE_SIZE);
 		else
 			pmap_dcache_wb_range(pm_to_clean, page_to_clean,
 			    PAGE_SIZE, !is_src, (flags & PVF_WRITE) == 0);
 	} else if (cache_needs_cleaning) {
 		/* Several mappings (or one at VA 0): clean the entire cache. */
 		if (PV_BEEN_EXECD(flags))
 			pmap_idcache_wbinv_all(pm);
 		else
 			pmap_dcache_wbinv_all(pm);
 		return (1);
 	}
 	return (0);
 }
 #endif
 
 /*
  *	pmap_copy_page copies the specified (machine independent)
  *	page by mapping the page into virtual memory and using
  *	bcopy to copy the page, one machine dependent page at a
  *	time.
  */
 
 /*
  * pmap_copy_page()
  *
  * Copy one physical page into another, by mapping the pages into
  * hook points. The same comment regarding cachability as in
  * pmap_zero_page also applies here.
  */
 #if  (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined (CPU_XSCALE_CORE3)
 /*
  * Copy the physical page src to the physical page dst.  csrc_pte/cdst_pte
  * and csrcp/cdstp are defined outside this view; from their use here they
  * appear to be the PTEs and virtual addresses of two reserved kernel copy
  * windows -- confirm against their definitions.
  */
 void
 pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst)
 {
 #if 0
 	struct vm_page *src_pg = PHYS_TO_VM_PAGE(src);
 #endif
 #ifdef DEBUG
 	struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst);
 
 	if (dst_pg->md.pvh_list != NULL)
 		panic("pmap_copy_page: dst page has mappings");
 #endif
 
 
 	/*
 	 * Clean the source page.  Hold the source page's lock for
 	 * the duration of the copy so that no other mappings can
 	 * be created while we have a potentially aliased mapping.
 	 */
 #if 0
 	/*
 	 * XXX: Not needed while we call cpu_dcache_wbinv_all() in
 	 * pmap_copy_page().
 	 */
 	(void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE);
 #endif
 	/*
 	 * Map the pages into the page hook points, copy them, and purge
 	 * the cache for the appropriate page. Invalidate the TLB
 	 * as required.
 	 */
 	/* cmtx is held while the two window PTEs are rewritten and used. */
 	mtx_lock(&cmtx);
 	/* Source window: kernel read-only, normal small-page cache mode. */
 	*csrc_pte = L2_S_PROTO | src |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode;
 	PTE_SYNC(csrc_pte);
 	/* Destination window: kernel writable, same cache mode. */
 	*cdst_pte = L2_S_PROTO | dst |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode;
 	PTE_SYNC(cdst_pte);
 	/* Drop any stale TLB entries for the two window addresses. */
 	cpu_tlb_flushD_SE(csrcp);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	bcopy_page(csrcp, cdstp);
 	mtx_unlock(&cmtx);
 	/*
 	 * NOTE(review): these cache operations on the window addresses run
 	 * after cmtx has been released; confirm that another thread
 	 * remapping the windows in between cannot be a problem.
 	 */
 	cpu_dcache_inv_range(csrcp, PAGE_SIZE);
 	cpu_dcache_wbinv_range(cdstp, PAGE_SIZE);
 }
 #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */
 
 #if ARM_MMU_XSCALE == 1
 /*
  * XScale variant of the page copy: same window-mapping sequence as the
  * generic version, but both windows are mapped with the L2_C plus
  * TEX_XSCALE_X attributes (labelled "mini-data" below) and the
  * mini-data cache is cleaned afterwards instead of per-range cache ops.
  */
 void
 pmap_copy_page_xscale(vm_paddr_t src, vm_paddr_t dst)
 {
 #if 0
 	/* XXX: Only needed for pmap_clean_page(), which is commented out. */
 	struct vm_page *src_pg = PHYS_TO_VM_PAGE(src);
 #endif
 #ifdef DEBUG
 	struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst);
 
 	if (dst_pg->md.pvh_list != NULL)
 		panic("pmap_copy_page: dst page has mappings");
 #endif
 
 
 	/*
 	 * Clean the source page.  Hold the source page's lock for
 	 * the duration of the copy so that no other mappings can
 	 * be created while we have a potentially aliased mapping.
 	 */
 #if 0
 	/*
 	 * XXX: Not needed while we call cpu_dcache_wbinv_all() in
 	 * pmap_copy_page().
 	 */
 	(void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE);
 #endif
 	/*
 	 * Map the pages into the page hook points, copy them, and purge
 	 * the cache for the appropriate page. Invalidate the TLB
 	 * as required.
 	 */
 	/* cmtx is held while the two window PTEs are rewritten and used. */
 	mtx_lock(&cmtx);
 	*csrc_pte = L2_S_PROTO | src |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_READ) |
 	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
 	PTE_SYNC(csrc_pte);
 	*cdst_pte = L2_S_PROTO | dst |
 	    L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) |
 	    L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X);	/* mini-data */
 	PTE_SYNC(cdst_pte);
 	/* Drop any stale TLB entries for the two window addresses. */
 	cpu_tlb_flushD_SE(csrcp);
 	cpu_tlb_flushD_SE(cdstp);
 	cpu_cpwait();
 	bcopy_page(csrcp, cdstp);
 	mtx_unlock(&cmtx);
 	/* Runs after cmtx is released, like the cache ops in the generic variant. */
 	xscale_cache_clean_minidata();
 }
 #endif /* ARM_MMU_XSCALE == 1 */
 
 /*
  * Copy the contents of page src into page dst.
  *
  * The data cache is written back and invalidated first.  If a platform
  * memcpy hook (_arm_memcpy) is installed and a page is large enough to
  * qualify, the hook is tried on the physical addresses; a return of 0
  * from the hook means the copy is done.  Otherwise the copy falls back
  * to either the direct-map path (ARM_USE_SMALL_ALLOC) or the
  * CPU-specific pmap_copy_page_func.
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 #ifdef ARM_USE_SMALL_ALLOC
 	vm_offset_t src_va, dst_va;
 #endif
 
 	cpu_dcache_wbinv_all();
 
 	/* Fast path: let the optional memcpy hook do the whole page. */
 	if (_arm_memcpy && PAGE_SIZE >= _min_memcpy_size) {
 		if (_arm_memcpy((void *)VM_PAGE_TO_PHYS(dst),
 		    (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE,
 		    IS_PHYSICAL) == 0)
 			return;
 	}
 
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Both pages are reachable through arm_ptovirt(); copy directly. */
 	src_va = arm_ptovirt(VM_PAGE_TO_PHYS(src));
 	dst_va = arm_ptovirt(VM_PAGE_TO_PHYS(dst));
 	bcopy_page(src_va, dst_va);
 	cpu_dcache_wbinv_range(dst_va, PAGE_SIZE);
 #else
 	pmap_copy_page_func(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
 #endif
 }
 
 
 
 
 /*
  * this routine returns true if a physical page resides
  * in the given pmap.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	int budget;
 
 	/* Fictitious pages are never reported as present. */
 	if (m->flags & PG_FICTITIOUS)
 		return (FALSE);
 
 	/*
 	 * "Quick" means bounded: only the first 16 entries of the page's
 	 * pv list are examined, so a mapping further down the list is
 	 * reported as absent.
 	 */
 	pv = TAILQ_FIRST(&m->md.pv_list);
 	for (budget = 16; pv && budget > 0; budget--) {
 		if (pv->pv_pmap == pmap)
 			return (TRUE);
 		pv = TAILQ_NEXT(pv, pv_list);
 	}
 	return (FALSE);
 }
 
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return the count of reference bits for a page, clearing all of them.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 
 	/*
 	 * Fictitious pages always report 0; for every other page the
 	 * result is whatever pmap_clearbit() returns for PVF_REF.
 	 */
 	return ((m->flags & PG_FICTITIOUS) ? 0 : pmap_clearbit(m, PVF_REF));
 }
 
 
 /*
  * Report whether the page's cached attribute word has PVF_MOD set.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	return ((m->md.pvh_attrs & PVF_MOD) ? TRUE : FALSE);
 }
 
 
 /*
  *	Clear the modify bits on the specified physical page.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 
 	/* Nothing to do unless the page is currently marked modified. */
 	if ((m->md.pvh_attrs & PVF_MOD) == 0)
 		return;
 	pmap_clearbit(m, PVF_MOD);
 }
 
 
 /*
  *	pmap_clear_reference:
  *
  *	Clear the reference bit on the specified physical page.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 
 	/* Nothing to do unless the page is currently marked referenced. */
 	if ((m->md.pvh_attrs & PVF_REF) == 0)
 		return;
 	pmap_clearbit(m, PVF_REF);
 }
 
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 
 	/* Pages not flagged PG_WRITEABLE are left untouched. */
 	if ((m->flags & PG_WRITEABLE) == 0)
 		return;
 	pmap_clearbit(m, PVF_WRITE);
 }
 
 
 /*
  * perform the pmap work for mincore
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	/*
 	 * Stub: neither argument is examined.  A message is printed
 	 * on every call and 0 is always returned.
 	 */
 	printf("pmap_mincore()\n");
 	
 	return (0);
 }
 
 
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {
 
 	/* No adjustment is made here: the proposed address is returned as is. */
 	return(addr);
 }
 
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  */
 /*
  * pa:   physical address of the device region; need not be page aligned.
  * size: length of the region in bytes.
  *
  * Returns the kernel virtual address that corresponds to pa (the
  * in-page offset of pa is preserved).  Panics if no kernel virtual
  * address space can be allocated.
  */
 void *
 pmap_mapdev(vm_offset_t pa, vm_size_t size)
 {
 	vm_offset_t va, tmpva, offset;
 	
 	/*
 	 * Work in whole pages.  The in-page offset of pa must be counted
 	 * when rounding the length, otherwise a region that straddles a
 	 * page boundary would be mapped one page short; pa itself is
 	 * truncated so that only page-aligned addresses are entered.
 	 */
 	offset = pa & PAGE_MASK;
 	size = roundup(offset + size, PAGE_SIZE);
 	pa -= offset;
 	
 	GIANT_REQUIRED;
 	
 	va = kmem_alloc_nofault(kernel_map, size);
 	if (!va)
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 	/* Enter one page at a time; size is a multiple of PAGE_SIZE here. */
 	for (tmpva = va; size > 0;) {
 		pmap_kenter_internal(tmpva, pa, 0);
 		size -= PAGE_SIZE;
 		tmpva += PAGE_SIZE;
 		pa += PAGE_SIZE;
 	}
 	
 	/* Hand back the address of the originally requested byte. */
 	return ((void *)(va + offset));
 }
 
 #define BOOTSTRAP_DEBUG
 
 /*
  * pmap_map_section:
  *
  *	Create a single section mapping.
  */
 /*
  * l1pt:  address of the L1 translation table to modify.
  * va/pa: virtual and physical addresses; both must be section aligned
  *        (asserted below).
  * prot:  protection passed to L1_S_PROT().
  * cache: PTE_CACHE, PTE_PAGETABLE, or anything else for uncached.
  */
 void
 pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
     int prot, int cache)
 {
 	pd_entry_t *entry = &((pd_entry_t *) l1pt)[va >> L1_S_SHIFT];
 	pd_entry_t cache_bits;
 
 	KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2"));
 
 	/* Translate the abstract cache selector into section cache bits. */
 	if (cache == PTE_CACHE)
 		cache_bits = pte_l1_s_cache_mode;
 	else if (cache == PTE_PAGETABLE)
 		cache_bits = pte_l1_s_cache_mode_pt;
 	else
 		cache_bits = 0;		/* PTE_NOCACHE and unknown values */
 
 	*entry = L1_S_PROTO | pa |
 	    L1_S_PROT(PTE_KERNEL, prot) | cache_bits |
 	    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 	PTE_SYNC(entry);
 }
 
 /*
  * pmap_link_l2pt:
  *
  *	Link the L2 page table specified by l2pv.pv_pa into the L1
  *	page table at the slot for "va".
  */
 /*
  * l1pt: address of the L1 translation table to modify.
  * va:   virtual address whose L1 slot receives the link.
  * l2pv: describes the L2 table; pv_pa goes into the L1 entry and the
  *       structure itself is pushed onto kernel_pt_list.
  */
 void
 pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv)
 {
 	pd_entry_t *l1 = (pd_entry_t *) l1pt;
 	pd_entry_t *entry = &l1[va >> L1_S_SHIFT];
 
 #ifdef VERBOSE_INIT_ARM     
 	printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va);
 #endif
 
 	/* Coarse-table descriptor in the kernel domain pointing at the L2 table. */
 	*entry = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO | l2pv->pv_pa;
 	PTE_SYNC(entry);
 
 	/* Record the table on kernel_pt_list. */
 	SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list);
 }
 
 /*
  * pmap_map_entry
  *
  * 	Create a single page mapping.
  */
 /*
  * l1pt:  address of the L1 translation table.
  * va/pa: virtual and physical addresses; both must be page aligned
  *        (asserted below).
  * prot:  protection passed to L2_S_PROT().
  * cache: PTE_CACHE, PTE_PAGETABLE, or anything else for uncached.
  *
  * Panics if the L1 slot for va does not hold a coarse table entry or
  * if kernel_pt_lookup() cannot find that table.
  */
 void
 pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot,
     int cache)
 {
 	pd_entry_t *pde = (pd_entry_t *) l1pt;
 	pt_entry_t *l2, *slot;
 	pt_entry_t cache_bits;
 
 	KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin"));
 
 	/* Translate the abstract cache selector into small-page cache bits. */
 	if (cache == PTE_CACHE)
 		cache_bits = pte_l2_s_cache_mode;
 	else if (cache == PTE_PAGETABLE)
 		cache_bits = pte_l2_s_cache_mode_pt;
 	else
 		cache_bits = 0;		/* PTE_NOCACHE and unknown values */
 
 	if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
 		panic("pmap_map_entry: no L2 table for VA 0x%08x", va);
 
 	l2 = (pt_entry_t *) kernel_pt_lookup(pde[L1_IDX(va)] & L1_C_ADDR_MASK);
 	if (l2 == NULL)
 		panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va);
 
 	slot = &l2[l2pte_index(va)];
 	*slot = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | cache_bits;
 	PTE_SYNC(slot);
 }
 
 /*
  * pmap_map_chunk:
  *
  *	Map a chunk of memory using the most efficient mappings
  *	possible (section. large page, small page) into the
  *	provided L1 and L2 tables at the specified virtual address.
  */
 /*
  * l1pt:  address of the L1 translation table; must be non-zero.
  * va/pa: starting virtual and physical addresses.
  * size:  length in bytes; rounded up to a whole number of pages.
  * prot:  protection passed to the L1_S/L2_L/L2_S PROT macros.
  * cache: PTE_CACHE, PTE_PAGETABLE, or anything else for uncached.
  *
  * Returns the rounded-up size that was mapped.  Panics if l1pt is 0 or
  * if a non-section piece needs an L2 table that is not linked into the
  * L1 slot or cannot be found by kernel_pt_lookup().
  */
 vm_size_t
 pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa,
     vm_size_t size, int prot, int cache)
 {
 	pd_entry_t *pde = (pd_entry_t *) l1pt;
 	pt_entry_t *pte, f1, f2s, f2l;
 	vm_size_t resid;  
 	int i;
 
 	/* Round the request up to whole pages. */
 	resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
 
 	if (l1pt == 0)
 		panic("pmap_map_chunk: no L1 table provided");
 
 #ifdef VERBOSE_INIT_ARM     
 	printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x "
 	    "prot=0x%x cache=%d\n", pa, va, size, resid, prot, cache);
 #endif
 
 	/* Pick the cache bits for each of the three mapping sizes. */
 	switch (cache) {
 	case PTE_NOCACHE:
 	default:
 		f1 = 0;
 		f2l = 0;
 		f2s = 0;
 		break;
 
 	case PTE_CACHE:
 		f1 = pte_l1_s_cache_mode;
 		f2l = pte_l2_l_cache_mode;
 		f2s = pte_l2_s_cache_mode;
 		break;
 
 	case PTE_PAGETABLE:
 		f1 = pte_l1_s_cache_mode_pt;
 		f2l = pte_l2_l_cache_mode_pt;
 		f2s = pte_l2_s_cache_mode_pt;
 		break;
 	}
 
 	/* Remember the rounded size; it is the return value. */
 	size = resid;
 
 	while (resid > 0) {
 		/* See if we can use a section mapping. */
 		if (L1_S_MAPPABLE_P(va, pa, resid)) {
 #ifdef VERBOSE_INIT_ARM
 			printf("S");
 #endif
 			pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa |
 			    L1_S_PROT(PTE_KERNEL, prot) | f1 |
 			    L1_S_DOM(PMAP_DOMAIN_KERNEL);
 			PTE_SYNC(&pde[va >> L1_S_SHIFT]);
 			va += L1_S_SIZE;
 			pa += L1_S_SIZE;
 			resid -= L1_S_SIZE;
 			continue;
 		}
 
 		/*
 		 * Ok, we're going to use an L2 table.  Make sure
 		 * one is actually in the corresponding L1 slot
 		 * for the current VA.
 		 */
 		if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C)
 			panic("pmap_map_chunk: no L2 table for VA 0x%08x", va);
 
 		pte = (pt_entry_t *) kernel_pt_lookup(
 		    pde[L1_IDX(va)] & L1_C_ADDR_MASK);
 		if (pte == NULL)
 			panic("pmap_map_chunk: can't find L2 table for VA "
 			    "0x%08x", va);
 		/* See if we can use a L2 large page mapping. */
 		if (L2_L_MAPPABLE_P(va, pa, resid)) {
 #ifdef VERBOSE_INIT_ARM
 			printf("L");
 #endif
 			/*
 			 * A large page occupies 16 consecutive L2 slots,
 			 * each holding the same descriptor.
 			 */
 			for (i = 0; i < 16; i++) {
 				pte[l2pte_index(va) + i] =
 				    L2_L_PROTO | pa |
 				    L2_L_PROT(PTE_KERNEL, prot) | f2l;
 				PTE_SYNC(&pte[l2pte_index(va) + i]);
 			}
 			va += L2_L_SIZE;
 			pa += L2_L_SIZE;
 			resid -= L2_L_SIZE;
 			continue;
 		}
 
 		/* Use a small page mapping. */
 #ifdef VERBOSE_INIT_ARM
 		printf("P");
 #endif
 		pte[l2pte_index(va)] =
 		    L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s;
 		PTE_SYNC(&pte[l2pte_index(va)]);
 		va += PAGE_SIZE;
 		pa += PAGE_SIZE;
 		resid -= PAGE_SIZE;
 	}
 #ifdef VERBOSE_INIT_ARM
 	printf("\n");
 #endif
 	return (size);
 
 }
 
 /********************** Static device map routines ***************************/
 
 static const struct pmap_devmap *pmap_devmap_table;
 
 /*
  * Register the devmap table.  This is provided in case early console
  * initialization needs to register mappings created by bootstrap code
  * before pmap_devmap_bootstrap() is called.
  */
 void
 pmap_devmap_register(const struct pmap_devmap *table)
 {
 
 	/*
 	 * Only the pointer is stored; nothing is mapped here and the
 	 * table is not copied, so it must stay valid for later lookups.
 	 */
 	pmap_devmap_table = table;
 }
 
 /*
  * Map all of the static regions in the devmap table, and remember
  * the devmap table so other parts of the kernel can look up entries
  * later.
  */
 /*
  * l1pt:  address of the L1 translation table handed to pmap_map_chunk().
  * table: array of device map entries, terminated by an entry whose
  *        pd_size is 0.
  */
 void
 pmap_devmap_bootstrap(vm_offset_t l1pt, const struct pmap_devmap *table)
 {
 	const struct pmap_devmap *pd;
 
 	/* Keep the table for pmap_devmap_find_pa()/pmap_devmap_find_va(). */
 	pmap_devmap_table = table;
 
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 #ifdef VERBOSE_INIT_ARM
 		printf("devmap: %08x -> %08x @ %08x\n",
 		    pd->pd_pa, pd->pd_pa + pd->pd_size - 1, pd->pd_va);
 #endif
 		pmap_map_chunk(l1pt, pd->pd_va, pd->pd_pa, pd->pd_size,
 		    pd->pd_prot, pd->pd_cache);
 	}
 }
 
 /*
  * Find the first device map entry whose physical range fully contains
  * [pa, pa + size).  Returns NULL if no table has been registered or no
  * entry covers the range.
  */
 const struct pmap_devmap *
 pmap_devmap_find_pa(vm_paddr_t pa, vm_size_t size)
 {
 	const struct pmap_devmap *pd;
 
 	if (pmap_devmap_table == NULL)
 		return (NULL);
 
 	/* The table ends at the first entry with a zero pd_size. */
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 		if (pa < pd->pd_pa)
 			continue;
 		if (pa + size <= pd->pd_pa + pd->pd_size)
 			return (pd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find the first device map entry whose virtual range fully contains
  * [va, va + size).  Returns NULL if no table has been registered or no
  * entry covers the range.
  */
 const struct pmap_devmap *
 pmap_devmap_find_va(vm_offset_t va, vm_size_t size)
 {
 	const struct pmap_devmap *pd;
 
 	if (pmap_devmap_table == NULL)
 		return (NULL);
 
 	/* The table ends at the first entry with a zero pd_size. */
 	for (pd = pmap_devmap_table; pd->pd_size != 0; pd++) {
 		if (va < pd->pd_va)
 			continue;
 		if (va + size <= pd->pd_va + pd->pd_size)
 			return (pd);
 	}
 
 	return (NULL);
 }
 
Index: head/sys/arm/at91/kb920x_machdep.c
===================================================================
--- head/sys/arm/at91/kb920x_machdep.c	(revision 173360)
+++ head/sys/arm/at91/kb920x_machdep.c	(revision 173361)
@@ -1,495 +1,495 @@
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * machdep.c
  *
  * Machine dependent functions for kernel setup
  *
  * This file needs a lot of work. 
  *
  * Created      : 17/09/94
  */
 
 #include "opt_msgbuf.h"
 #include "opt_ddb.h"
 #include "opt_at91.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/cons.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/kdb.h>
 #include <sys/msgbuf.h>
 #include <machine/reg.h>
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pcb.h>
 #include <machine/undefined.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <sys/reboot.h>
 
 #include <arm/at91/at91rm92reg.h>
 #include <arm/at91/at91_piovar.h>
 #include <arm/at91/at91_pio_rm9200.h>
 
 /* Indices into kernel_pt_table[] and the number of L2 tables per group. */
 #define KERNEL_PT_SYS		0	/* Page table for mapping proc0 zero page */
 #define KERNEL_PT_KERN		1	
 #define KERNEL_PT_KERN_NUM	22
 /*
  * L2 table for mapping after kernel.  Parenthesized so the sum stays
  * intact when the macro is used inside a larger expression.
  */
 #define KERNEL_PT_AFKERNEL	(KERNEL_PT_KERN + KERNEL_PT_KERN_NUM)
 #define	KERNEL_PT_AFKERNEL_NUM	5
 
 /* this should be evenly divisible by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */
 #define NUM_KERNEL_PTS		(KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM)
 
 /* Define various stack sizes in pages */
 #define IRQ_STACK_SIZE	1
 #define ABT_STACK_SIZE	1
 #define UND_STACK_SIZE	1
 
 /* Exception handler entry points; initarm() stores the handler addresses here. */
 extern u_int data_abort_handler_address;
 extern u_int prefetch_abort_handler_address;
 extern u_int undefined_handler_address;
 
 /* Addresses of the bootstrap L2 page tables, filled in by initarm(). */
 struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
 
 extern void *_end;
 
 /* initarm() uses &end as the end of the kernel image. */
 extern int *end;
 
 /* Per-CPU data for the single CPU and a pointer to it. */
 struct pcpu __pcpu;
 struct pcpu *pcpup = &__pcpu;
 
 /* Physical and virtual addresses for some global pages */
 
 /* Physical memory ranges as {start, end} pairs; initarm() ends each list with a 0, 0 pair. */
 vm_paddr_t phys_avail[10];
 vm_paddr_t dump_avail[4];
 vm_offset_t physical_pages;
 
 /* Pages carved out of early memory by initarm() via valloc_pages(). */
 struct pv_addr systempage;
 struct pv_addr msgbufpv;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 struct pv_addr kernelstack;
 
 /* Trap frame that initarm() installs as thread0.td_frame. */
 static struct trapframe proc0_tf;
 
 /* Static device mappings. */
 /*
  * Table passed to pmap_devmap_bootstrap() from initarm().  The
  * initializers are positional; presumably in the order pd_va, pd_pa,
  * pd_size, pd_prot, pd_cache -- confirm against struct pmap_devmap.
  * The all-zero entry terminates the table.
  */
 static const struct pmap_devmap kb920x_devmap[] = {
 	/* 
 	 * Map the on-board devices so that we can access them once the
 	 * MMU is on.
 	 * NOTE(review): this comment used to say "VA == PA", but the
 	 * first two initializers below differ (0xdff00000 vs 0xfff00000).
 	 */
 	{
 		/*
 		 * This at least maps the interrupt controller, the UART
 		 * and the timer. Other devices should use newbus to
 		 * map their memory anyway.
 		 */
 		0xdff00000,
 		0xfff00000,
 		0x100000,
 		VM_PROT_READ|VM_PROT_WRITE,                             
 		PTE_NOCACHE,
 	},
 	/*
 	 * We can't just map the OHCI registers VA == PA, because
 	 * AT91RM92_OHCI_BASE belongs to the userland address space.
 	 * We could just choose a different virtual address, but a better
 	 * solution would probably be to just use pmap_mapdev() to allocate
 	 * KVA, as we don't need the OHCI controller before the vm
 	 * initialization is done. However, the AT91 resource allocation
 	 * system doesn't know how to use pmap_mapdev() yet.
 	 */
 #if 1
 	{
 		/*
 		 * Add the ohci controller, and anything else that might be
 		 * on this chip select for a VA/PA mapping.
 		 */
 		AT91RM92_OHCI_BASE,
 		AT91RM92_OHCI_PA_BASE,
 		AT91RM92_OHCI_SIZE,
 		VM_PROT_READ|VM_PROT_WRITE,                             
 		PTE_NOCACHE,
 	},
 #endif
 	/* Terminator: a zero size ends the table walk. */
 	{
 		0,
 		0,
 		0,
 		0,
 		0,
 	}
 };
 
 #define SDRAM_START 0xa0000000
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /*
  * Work out the amount of SDRAM from the SDRAM controller's
  * configuration (CR) and mode (MR) registers.  Each field contributes
  * a power-of-two exponent; the result is 1 shifted left by their sum.
  */
 static long
 ramsize(void)
 {
 	uint32_t *regs = (uint32_t *)(AT91RM92_BASE + AT91RM92_SDRAMC_BASE);
 	uint32_t config, mode;
 	int shift;
 
 	/* Register offsets are in bytes; the array is indexed in words. */
 	config = regs[AT91RM92_SDRAMC_CR / 4];
 	mode = regs[AT91RM92_SDRAMC_MR / 4];
 
 	shift = (config & AT91RM92_SDRAMC_CR_NC_MASK) + 8;	/* columns */
 	shift += ((config & AT91RM92_SDRAMC_CR_NR_MASK) >> 2) + 11; /* rows */
 	shift += (config & AT91RM92_SDRAMC_CR_NB_4) ? 2 : 1;	/* banks */
 	shift += (mode & AT91RM92_SDRAMC_MR_DBW_16) ? 1 : 2;	/* bus width */
 
 	return (1 << shift);
 }
 
 /*
  * Board-specific pin setup: route the USART (and, with AT91_TSC, the
  * timer and GPIO) pins through the PIO controllers, then return the
  * detected RAM size in bytes from ramsize().
  */
 static long
 board_init(void)
 {
 	/*
 	 * Since the USART supports RS-485 multidrop mode, it allows the
 	 * TX pins to float.  However, for RS-232 operations, we don't want
 	 * these pins to float.  Instead, they should be pulled up to avoid
 	 * mismatches.  Linux does something similar when it configures the
 	 * TX lines.  This implies that we also allow the RX lines to float
 	 * rather than be in the state they are left in by the boot loader.
 	 * Since they are input pins, I think that this is the right thing
 	 * to do.
 	 */
 
 	/*
 	 * In the calls below the last argument is 0 for RX pins and 1 for
 	 * TX pins; given the comment above this is presumably the pull-up
 	 * enable -- confirm against at91_pio_use_periph_a/b.
 	 */
 	/* PIOA's A periph: Turn USART 0 and 2's TX/RX pins */
 	at91_pio_use_periph_a(AT91RM92_PIOA_BASE,
 	    AT91C_PA18_RXD0 | AT91C_PA22_RXD2, 0);
 	at91_pio_use_periph_a(AT91RM92_PIOA_BASE,
 	    AT91C_PA17_TXD0 | AT91C_PA23_TXD2, 1);
 	/* PIOA's B periph: Turn USART 3's TX/RX pins */
 	at91_pio_use_periph_b(AT91RM92_PIOA_BASE, AT91C_PA6_RXD3, 0);
 	at91_pio_use_periph_b(AT91RM92_PIOA_BASE, AT91C_PA5_TXD3, 1);
 #ifdef AT91_TSC
 	/* We're using TC0's A1 and A2 input */
 	at91_pio_use_periph_b(AT91RM92_PIOA_BASE,
 	    AT91C_PA19_TIOA1 | AT91C_PA21_TIOA2, 0);
 #endif
 	/* PIOB's A periph: Turn USART 1's TX/RX pins */
 	at91_pio_use_periph_a(AT91RM92_PIOB_BASE, AT91C_PB21_RXD1, 0);
 	at91_pio_use_periph_a(AT91RM92_PIOB_BASE, AT91C_PB20_TXD1, 1);
 
 	/* Pin assignment */
 #ifdef AT91_TSC
 	/* Assert PA24 low -- talk to rubidium */
 	at91_pio_use_gpio(AT91RM92_PIOA_BASE, AT91C_PIO_PA24);
 	at91_pio_gpio_output(AT91RM92_PIOA_BASE, AT91C_PIO_PA24, 0);
 	at91_pio_gpio_clear(AT91RM92_PIOA_BASE, AT91C_PIO_PA24);
 	at91_pio_use_gpio(AT91RM92_PIOB_BASE,
 	    AT91C_PIO_PB16 | AT91C_PIO_PB17 | AT91C_PIO_PB18 | AT91C_PIO_PB19);
 #endif
 
 	return (ramsize());
 }
 
 /*
  * Early machine-dependent bootstrap for this board.  In order, it:
  * builds a fake preload metadata block describing the kernel, carves
  * the L1/L2 page tables, stacks and message buffer out of the memory
  * following the kernel image, builds the kernel mappings, switches to
  * the new translation table, sets up the per-mode stacks and exception
  * handlers, links proc0/thread0, bootstraps the pmap and fills in
  * phys_avail.  Neither arg nor arg2 is used.  The return value is an
  * address near the top of thread0's kernel stack.
  */
 void *
 initarm(void *arg, void *arg2)
 {
 	struct pv_addr  kernel_l1pt;
 	int loop;
 	u_int l1pagetable;
 	vm_offset_t freemempos;
 	vm_offset_t afterkern;
 	int i;
 	uint32_t fake_preload[35];
 	uint32_t memsize;
 	vm_offset_t lastaddr;
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
 
 	i = 0;
 
 	set_cpufuncs();
 
 	/*
 	 * Build the fake preload metadata as a sequence of
 	 * (type, length, value) records.  "elf kernel" with its NUL is
 	 * 11 bytes, i.e. three 32-bit words, hence the i++ followed by
 	 * i += 2 after each strcpy().
 	 */
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = KERNVIRTADDR;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = (uint32_t)&end - KERNVIRTADDR;
 #ifdef DDB
 	/*
 	 * If the magic number is found at the start of the kernel, the two
 	 * words after it are used as the symbol table start and end
 	 * addresses, and the end address becomes the last used address.
 	 */
 	if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
 		lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
 		zend = lastaddr;
 		zstart = *(uint32_t *)(KERNVIRTADDR + 4);
 		ksym_start = zstart;
 		ksym_end = zend;
 	} else
 #endif
 		lastaddr = (vm_offset_t)&end;
 		
 	/* Two zero words terminate the metadata. */
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 
 #define KERNEL_TEXT_BASE (KERNBASE)
 	/* First free byte after the kernel (and symbols), rounded up to a page. */
 	freemempos = (lastaddr + PAGE_MASK) & ~PAGE_MASK;
 	/* Define a macro to simplify memory allocation */
 	/*
 	 * Both macros expand to several statements, so they are only safe
 	 * as a statement on their own or inside braces.
 	 */
 #define valloc_pages(var, np)                   \
 	alloc_pages((var).pv_va, (np));         \
 	(var).pv_pa = (var).pv_va + (KERNPHYSADDR - KERNVIRTADDR);
 
 #define alloc_pages(var, np)			\
 	(var) = freemempos;		\
 	freemempos += (np * PAGE_SIZE);		\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/* Advance until freemempos is aligned to L1_TABLE_SIZE for the L1 table. */
 	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos += PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 	/*
 	 * Several L2 tables share one page: a new page is allocated for
 	 * every (PAGE_SIZE / L2_TABLE_SIZE_REAL)-th table and the others
 	 * are placed at offsets computed back from freemempos.
 	 */
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[loop],
 			    L2_TABLE_SIZE / PAGE_SIZE);
 		} else {
 			kernel_pt_table[loop].pv_va = freemempos -
 			    (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) *
 			    L2_TABLE_SIZE_REAL;
 			kernel_pt_table[loop].pv_pa = 
 			    kernel_pt_table[loop].pv_va - KERNVIRTADDR +
 			    KERNPHYSADDR;
 		}
 		/*
 		 * NOTE(review): this increment appears to have no effect;
 		 * i is reset by the for loop below before it is read again.
 		 */
 		i++;
 	}
 	/*
 	 * Allocate a page for the system page mapped to V0x00000000
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	valloc_pages(systempage, 1);
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE);
 	valloc_pages(abtstack, ABT_STACK_SIZE);
 	valloc_pages(undstack, UND_STACK_SIZE);
 	valloc_pages(kernelstack, KSTACK_PAGES);
 	valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE);
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_va;
 
 	/* Map the L2 pages tables in the L1 page table */
 	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH,
 	    &kernel_pt_table[KERNEL_PT_SYS]);
 	for (i = 0; i < KERNEL_PT_KERN_NUM; i++)
 		pmap_link_l2pt(l1pagetable, KERNBASE + i * 0x100000,
 		    &kernel_pt_table[KERNEL_PT_KERN + i]);
 	/* Map the kernel image itself, from KERNBASE up to lastaddr. */
 	pmap_map_chunk(l1pagetable, KERNBASE, PHYSADDR,
 	   (((uint32_t)(lastaddr) - KERNBASE) + PAGE_SIZE) & ~(PAGE_SIZE - 1),
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	/* First section boundary above lastaddr; the "after kernel" tables start here. */
 	afterkern = round_page((lastaddr + L1_S_SIZE) & ~(L1_S_SIZE 
 	    - 1));
 	for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) {
 		pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000,
 		    &kernel_pt_table[KERNEL_PT_AFKERNEL + i]);
 	}
 
 	/* Map the vector page. */
 	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	/* Map the stack pages */
 	pmap_map_chunk(l1pagetable, irqstack.pv_va, irqstack.pv_pa,
 	    IRQ_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, abtstack.pv_va, abtstack.pv_pa,
 	    ABT_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, undstack.pv_va, undstack.pv_pa,
 	    UND_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, kernelstack.pv_va, kernelstack.pv_pa,
 	    KSTACK_PAGES * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 	/* The page tables themselves are mapped with the page-table cache mode. */
 	pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa,
 	    L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	pmap_map_chunk(l1pagetable, msgbufpv.pv_va, msgbufpv.pv_pa,
 	    MSGBUF_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		pmap_map_chunk(l1pagetable, kernel_pt_table[loop].pv_va,
 		    kernel_pt_table[loop].pv_pa, L2_TABLE_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	}
 
 	/* Add the static device mappings, then switch to the new L1 table. */
 	pmap_devmap_bootstrap(l1pagetable, kb920x_devmap);
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT);
 	setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2));
 	cninit();
 	/* board_init() returns the RAM size in bytes. */
 	memsize = board_init();
 	physmem = memsize / PAGE_SIZE;
 
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 
 	cpu_control(CPU_CONTROL_MMU_ENABLE, CPU_CONTROL_MMU_ENABLE);
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE);
 
 
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in setttb()
 	 * but since we are boot strapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross relocations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 
 	/* Set stack for exception handlers */
 	
 	data_abort_handler_address = (u_int)data_abort_handler;
 	prefetch_abort_handler_address = (u_int)prefetch_abort_handler;
 	undefined_handler_address = (u_int)undefinedinstruction_bounce;
 	undefined_init();
 				
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	/* thread0 runs on kernelstack; its pcb sits at the very top of that stack. */
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 	
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 
 	/*
 	 * NOTE(review): the tables linked at afterkern number
 	 * KERNEL_PT_AFKERNEL_NUM, yet this limit is computed from
 	 * KERNEL_PT_KERN_NUM -- confirm that this is intended.
 	 */
 	pmap_curmaxkvaddr = afterkern + 0x100000 * (KERNEL_PT_KERN_NUM - 1);
 	/*
 	 * ARM_USE_SMALL_ALLOC uses dump_avail, so it must be filled before
 	 * calling pmap_bootstrap.
 	 */
 	dump_avail[0] = PHYSADDR;
 	dump_avail[1] = PHYSADDR + memsize;
 	dump_avail[2] = 0;
 	dump_avail[3] = 0;
 					
 	pmap_bootstrap(freemempos,
 	    KERNVIRTADDR + 3 * memsize,
 	    &kernel_l1pt);
 	msgbufp = (void*)msgbufpv.pv_va;
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 	mutex_init();
 	
 	i = 0;
 	
 	/*
 	 * phys_avail is a list of {start, end} pairs ended by a 0, 0
 	 * pair: optionally the memory below the kernel, then everything
 	 * from virtual_avail (converted to physical) to the end of RAM.
 	 */
 #if PHYSADDR != KERNPHYSADDR
 	phys_avail[i++] = PHYSADDR;
 	phys_avail[i++] = KERNPHYSADDR;
 #endif
 	phys_avail[i++] = virtual_avail - KERNVIRTADDR + KERNPHYSADDR;
 	phys_avail[i++] = PHYSADDR + memsize;
 	phys_avail[i++] = 0;
 	phys_avail[i++] = 0;
 	/* Do basic tuning, hz etc */
 	init_param1();
 	init_param2(physmem);
 	kdb_init();
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
Index: head/sys/arm/sa11x0/assabet_machdep.c
===================================================================
--- head/sys/arm/sa11x0/assabet_machdep.c	(revision 173360)
+++ head/sys/arm/sa11x0/assabet_machdep.c	(revision 173361)
@@ -1,455 +1,455 @@
 /*	$NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $	*/
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * machdep.c
  *
  * Machine-dependent functions for kernel setup
  *
  * This file needs a lot of work. 
  *
  * Created      : 17/09/94
  */
 
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_md.h"
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/cons.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/kdb.h>
 #include <machine/reg.h>
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pcb.h>
 #include <machine/undefined.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <sys/reboot.h>
 
 #include <arm/sa11x0/sa11x0_reg.h>
 
 #define MDROOT_ADDR 0xd0400000
 
 /*
  * Indices into kernel_pt_table[] for the bootstrap L2 page tables that
  * initarm() links into the L1 table.
  *
  * NOTE(review): KERNEL_PT_VMEM and KERNEL_PT_SYS are both 0, i.e. they name
  * the same table; KERNEL_PT_VMEM is not referenced by the initarm() below.
  * NOTE(review): initarm() links the KERNEL_PT_VMDATA tables at 0x00100000
  * (1MB) strides, so 7 tables cover 7MB, not the 16MB the comment on
  * KERNEL_PT_VMDATA_NUM mentions -- confirm which is intended.
  */
 #define KERNEL_PT_VMEM		0	/* Page table for mapping video memory */
 #define KERNEL_PT_SYS		0	/* Page table for mapping proc0 zero page */
 #define KERNEL_PT_IO		3	/* Page table for mapping IO */
 #define KERNEL_PT_IRQ		2	/* Page table for mapping irq handler */
 #define KERNEL_PT_KERNEL	1	/* Page table for mapping kernel */
 #define KERNEL_PT_L1		4	/* Page table for mapping l1pt */
 #define	KERNEL_PT_VMDATA	5	/* Page tables for mapping kernel VM */
 #define	KERNEL_PT_VMDATA_NUM	7	/* start with 16MB of KVM */
 /* Total number of bootstrap L2 tables held in kernel_pt_table[]. */
 #define	NUM_KERNEL_PTS		(KERNEL_PT_VMDATA + KERNEL_PT_VMDATA_NUM)
 
 /* Define various stack sizes in pages */
 #define IRQ_STACK_SIZE	1
 #define ABT_STACK_SIZE	1
 #ifdef IPKDB
 #define UND_STACK_SIZE	2
 #else
 #define UND_STACK_SIZE	1
 #endif
 #define	KERNEL_VM_BASE		(KERNBASE + 0x00100000)
 #define	KERNEL_VM_SIZE		0x05000000
 
 extern u_int data_abort_handler_address;
 extern u_int prefetch_abort_handler_address;
 extern u_int undefined_handler_address;
 
 struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
 
 extern void *_end;
 
 extern vm_offset_t sa1110_uart_vaddr;
 
 extern vm_offset_t sa1_cache_clean_addr;
 
 extern int *end;
 
 struct pcpu __pcpu;
 struct pcpu *pcpup = &__pcpu;
 
 #ifndef MD_ROOT_SIZE
 #define MD_ROOT_SIZE 65535
 #endif
 /* Physical and virtual addresses for some global pages */
 
 vm_paddr_t phys_avail[10];
 vm_paddr_t dump_avail[4];
 vm_paddr_t physical_start;
 vm_paddr_t physical_end;
 vm_paddr_t physical_freestart;
 vm_offset_t physical_pages;
 
 struct pv_addr systempage;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 struct pv_addr kernelstack;
 static struct trapframe proc0_tf;
 
 /*
  * Static device mappings.
  *
  * This table is handed to pmap_devmap_bootstrap() from initarm().
  * The initializers are positional; judging by the values used, each entry
  * is { virtual base, physical base, size, protection, cache attribute } --
  * TODO confirm against the declaration of struct pmap_devmap.
  */
 static const struct pmap_devmap assabet_devmap[] = {
 	/*
 	 * Map the on-board devices VA == PA so that we can access them
 	 * with the MMU on or off.
 	 *
 	 * NOTE(review): the first entry uses distinct SACOM1_VBASE and
 	 * SACOM1_BASE names, so VA == PA may only hold for the second
 	 * entry -- confirm against sa11x0_reg.h.
 	 */
 	{
 		SACOM1_VBASE,
 		SACOM1_BASE,
 		SACOM1_SIZE,
 		VM_PROT_READ|VM_PROT_WRITE,
 		PTE_NOCACHE,
 	},
 	{
 		SAIPIC_BASE,
 		SAIPIC_BASE,
 		SAIPIC_SIZE,
 		VM_PROT_READ|VM_PROT_WRITE,
 		PTE_NOCACHE,
 	},
 	/* All-zero entry; presumably the end-of-table marker -- confirm. */
 	{
 		0,
 		0,
 		0,
 		0,
 		0,
 	}
 };
 
 /*
  * Return the table of DMA ranges for this platform.
  * No table is defined here, so the result is always NULL.
  */
 struct arm32_dma_range *
 bus_dma_get_range(void)
 {
 	struct arm32_dma_range *const no_ranges = NULL;
 
 	return (no_ranges);
 }
 
 /*
  * Return the number of entries in the DMA range table.
  * Always 0, matching the NULL table returned by bus_dma_get_range().
  */
 int
 bus_dma_get_range_nb(void)
 {
 	const int range_count = 0;
 
 	return (range_count);
 }
 
 /*
  * Reset the CPU.
  *
  * This implementation only calls cpu_halt() and then spins, so it never
  * returns to the caller.
  */
 void
 cpu_reset(void)
 {
 	cpu_halt();
 	/* Should cpu_halt() ever return, stay here. */
 	for (;;)
 		continue;
 }
 
 #define CPU_SA110_CACHE_CLEAN_SIZE (0x4000 * 2)
 
 void *
 initarm(void *arg, void *arg2)
 {
 	struct pcpu *pc;
 	struct pv_addr  kernel_l1pt;
 	struct pv_addr	md_addr;
 	struct pv_addr	md_bla;
 	int loop;
 	u_int kerneldatasize, symbolsize;
 	u_int l1pagetable;
 	vm_offset_t freemempos;
 	vm_offset_t lastalloced;
 	vm_size_t pt_size;
 	int i = 0;
 	uint32_t fake_preload[35];
 	uint32_t memsize = 32 * 1024 * 1024;
 	sa1110_uart_vaddr = SACOM1_VBASE;
 
 	boothowto = RB_VERBOSE | RB_SINGLE;
 	cninit();
 	set_cpufuncs();
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = KERNBASE;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = (uint32_t)&end - KERNBASE;
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("md root") + 1;
 	strcpy((char*)&fake_preload[i++], "md root");
 	i += 1;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("md_image") + 1;
 	strcpy((char*)&fake_preload[i++], "md_image");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = MDROOT_ADDR;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = MD_ROOT_SIZE * 1024;
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 	physmem = memsize / PAGE_SIZE;
 	pc = &__pcpu;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 
 	physical_start = (vm_offset_t) KERNBASE;
 	physical_end =  (vm_offset_t) &end;
 	physical_freestart = (((vm_offset_t)physical_end) + PAGE_MASK) & ~PAGE_MASK;
 	md_addr.pv_va = md_addr.pv_pa = MDROOT_ADDR;
 #define KERNEL_TEXT_BASE (KERNBASE + 0x00040000)
 	kerneldatasize = (u_int32_t)&end - (u_int32_t)KERNEL_TEXT_BASE;
 	symbolsize = 0;
 	freemempos = (vm_offset_t)round_page(physical_freestart);
 	memset((void *)freemempos, 0, 256*1024);
 		/* Define a macro to simplify memory allocation */
 #define	valloc_pages(var, np)			\
 	alloc_pages((var).pv_pa, (np));		\
 	(var).pv_va = (var).pv_pa;
 
 #define alloc_pages(var, np)			\
 	(var) = freemempos;		\
 	freemempos += ((np) * PAGE_SIZE);\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	while ((freemempos & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos += PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 	valloc_pages(md_bla, L2_TABLE_SIZE / PAGE_SIZE);
 	alloc_pages(sa1_cache_clean_addr, CPU_SA110_CACHE_CLEAN_SIZE / PAGE_SIZE);
 
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[loop],
 			    L2_TABLE_SIZE / PAGE_SIZE);
 		} else {
 			kernel_pt_table[loop].pv_pa = freemempos +
 			    (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) *
 			    L2_TABLE_SIZE_REAL;
 			kernel_pt_table[loop].pv_va = 
 			    kernel_pt_table[loop].pv_pa;
 		}
 	}
 
 	valloc_pages(systempage, 1);
 
 	/*
 	 * Allocate a page for the system page mapped to V0x00000000
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	pt_size = round_page(freemempos) - physical_freestart;
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE);
 	valloc_pages(abtstack, ABT_STACK_SIZE);
 	valloc_pages(undstack, UND_STACK_SIZE);
 	valloc_pages(kernelstack, KSTACK_PAGES);
 	lastalloced = kernelstack.pv_va;
 
 	/*
 	 * Allocate memory for the l1 and l2 page tables. The scheme to avoid
 	 * wasting memory by allocating the l1pt on the first 16k memory was
 	 * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for
 	 * this to work (which is supposed to be the case).
 	 */
 
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_pa;
 
 
 	/* Map the L2 pages tables in the L1 page table */
 	pmap_link_l2pt(l1pagetable, 0x00000000,
 	    &kernel_pt_table[KERNEL_PT_SYS]);
 	pmap_link_l2pt(l1pagetable, KERNBASE,
 	    &kernel_pt_table[KERNEL_PT_KERNEL]);
 	pmap_link_l2pt(l1pagetable, 0xd0000000,
 	    &kernel_pt_table[KERNEL_PT_IO]);
 	pmap_link_l2pt(l1pagetable, lastalloced & ~((L1_S_SIZE * 4) - 1),
 	    &kernel_pt_table[KERNEL_PT_L1]);
 	pmap_link_l2pt(l1pagetable, 0x90000000, &kernel_pt_table[KERNEL_PT_IRQ]);
 	pmap_link_l2pt(l1pagetable, MDROOT_ADDR,
 	    &md_bla);
 	for (loop = 0; loop < KERNEL_PT_VMDATA_NUM; ++loop)
 		pmap_link_l2pt(l1pagetable, KERNEL_VM_BASE + loop * 0x00100000,
 		    &kernel_pt_table[KERNEL_PT_VMDATA + loop]);
 	pmap_map_chunk(l1pagetable, KERNBASE, KERNBASE,
 	   ((uint32_t)&end - KERNBASE), VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	/* Map the stack pages */
 	pmap_map_chunk(l1pagetable, irqstack.pv_va, irqstack.pv_pa,
 	    IRQ_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, md_addr.pv_va, md_addr.pv_pa,
 	    MD_ROOT_SIZE * 1024, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, abtstack.pv_va, abtstack.pv_pa,
 	    ABT_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, undstack.pv_va, undstack.pv_pa,
 	    UND_STACK_SIZE * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, kernelstack.pv_va, kernelstack.pv_pa,
 	    KSTACK_PAGES * PAGE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 	pmap_map_chunk(l1pagetable, kernel_l1pt.pv_va, kernel_l1pt.pv_pa,
 	    L1_TABLE_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		pmap_map_chunk(l1pagetable, kernel_pt_table[loop].pv_va,
 		    kernel_pt_table[loop].pv_pa, L2_TABLE_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	}
 	pmap_map_chunk(l1pagetable, md_bla.pv_va, md_bla.pv_pa, L2_TABLE_SIZE,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	/* Map the vector page. */
 	pmap_map_entry(l1pagetable, vector_page, systempage.pv_pa,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	/* Map the statically mapped devices. */
 	pmap_devmap_bootstrap(l1pagetable, assabet_devmap);
 	pmap_map_chunk(l1pagetable, sa1_cache_clean_addr, 0xf0000000, 
 	    CPU_SA110_CACHE_CLEAN_SIZE, VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 	data_abort_handler_address = (u_int)data_abort_handler;
 	prefetch_abort_handler_address = (u_int)prefetch_abort_handler;
 	undefined_handler_address = (u_int)undefinedinstruction_bounce;
 	undefined_init();
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT);
 	setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2));
 
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE);
 
 
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in setttb()
 	 * but since we are boot strapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross reloations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 
 
 	bootverbose = 1;
 
 	/* Set stack for exception handlers */
 	
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 	
 	
 	/* Enable MMU, I-cache, D-cache, write buffer. */
 
 	cpufunc_control(0x337f, 0x107d);
 	arm_vector_init(ARM_VECTORS_LOW, ARM_VEC_ALL);
 
 	pmap_curmaxkvaddr = freemempos + KERNEL_PT_VMDATA_NUM * 0x400000;
 
 	dump_avail[0] = phys_avail[0] = round_page(virtual_avail);
 	dump_avail[1] = phys_avail[1] = 0xc0000000 + 0x02000000 - 1;
 	dump_avail[2] = phys_avail[2] = 0;
 	dump_avail[3] = phys_avail[3] = 0;
 					
 	mutex_init();
 	pmap_bootstrap(freemempos, 
 	    0xd0000000, &kernel_l1pt);
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 	init_param2(physmem);
 	kdb_init();
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
Index: head/sys/arm/xscale/i80321/ep80219_machdep.c
===================================================================
--- head/sys/arm/xscale/i80321/ep80219_machdep.c	(revision 173360)
+++ head/sys/arm/xscale/i80321/ep80219_machdep.c	(revision 173361)
@@ -1,517 +1,517 @@
 /*	$NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $	*/
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * machdep.c
  *
  * Machine-dependent functions for kernel setup
  *
  * This file needs a lot of work. 
  *
  * Created      : 17/09/94
  */
 
 #include "opt_msgbuf.h"
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/cons.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/kdb.h>
 #include <sys/msgbuf.h>
 #include <machine/reg.h>
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pcb.h>
 #include <machine/undefined.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <sys/reboot.h>
 
 #include <arm/xscale/i80321/i80321reg.h>
 #include <arm/xscale/i80321/i80321var.h>
 #include <arm/xscale/i80321/iq80321reg.h>
 #include <arm/xscale/i80321/obiovar.h>
 
 #define KERNEL_PT_SYS			0	/* Page table for mapping proc0 zero page */
 #define KERNEL_PT_IOPXS			1
 #define KERNEL_PT_BEFOREKERN	2
 #define KERNEL_PT_AFKERNEL		3	/* L2 table for mapping after kernel */
 #define KERNEL_PT_AFKERNEL_NUM	9
 
 /* this should be evenly divisible by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */
 #define NUM_KERNEL_PTS		(KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM)
 
 /* Define various stack sizes in pages */
 #define IRQ_STACK_SIZE	1
 #define ABT_STACK_SIZE	1
 #ifdef IPKDB
 #define UND_STACK_SIZE	2
 #else
 #define UND_STACK_SIZE	1
 #endif
 
 extern u_int data_abort_handler_address;
 extern u_int prefetch_abort_handler_address;
 extern u_int undefined_handler_address;
 
 struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
 
 extern void *_end;
 
 extern int *end;
 
 struct pcpu __pcpu;
 struct pcpu *pcpup = &__pcpu;
 
 /* Physical and virtual addresses for some global pages */
 
 vm_paddr_t phys_avail[10];
 vm_paddr_t dump_avail[4];
 vm_offset_t physical_pages;
 vm_offset_t clean_sva, clean_eva;
 
 struct pv_addr systempage;
 struct pv_addr msgbufpv;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 struct pv_addr kernelstack;
 struct pv_addr minidataclean;
 
 static struct trapframe proc0_tf;
 
 
 /* #define IQ80321_OBIO_BASE 0xfe800000UL */
 /* #define IQ80321_OBIO_SIZE 0x00100000UL */
 
 /*
  * Static device mappings.
  *
  * This table is handed to pmap_devmap_bootstrap() from initarm().
  * The initializers are positional; judging by the values used, each entry
  * is { virtual base, physical base, size, protection, cache attribute } --
  * TODO confirm against the declaration of struct pmap_devmap.
  */
 static const struct pmap_devmap ep80219_devmap[] = {
 	/* 
 	 * Map the on-board devices VA == PA so that we can access them
 	 * with the MMU on or off.
 	 *
 	 * NOTE(review): only this first entry uses the same constant for
 	 * both addresses; the two entries after it pair a *_VBASE name
 	 * with a different base name.
 	 */
 	{
 		IQ80321_OBIO_BASE,
 		IQ80321_OBIO_BASE,
 		IQ80321_OBIO_SIZE,
 		VM_PROT_READ|VM_PROT_WRITE,                             
 		PTE_NOCACHE,
 	},
 	{
 		IQ80321_IOW_VBASE,
 		VERDE_OUT_XLATE_IO_WIN0_BASE,
 		VERDE_OUT_XLATE_IO_WIN_SIZE,
 		VM_PROT_READ|VM_PROT_WRITE,
 		PTE_NOCACHE,
 	},	    
 	{
 		IQ80321_80321_VBASE,
 		VERDE_PMMR_BASE,
 		VERDE_PMMR_SIZE,
 		VM_PROT_READ|VM_PROT_WRITE,
 		PTE_NOCACHE,
 	},
 	/* All-zero entry; presumably the end-of-table marker -- confirm. */
 	{
 		0,
 		0,
 		0,
 		0,
 		0,
 	}
 };
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 extern vm_offset_t xscale_cache_clean_addr;
 
 void *
 initarm(void *arg, void *arg2)
 {
 	struct pv_addr  kernel_l1pt;
 	int loop;
 	u_int l1pagetable;
 	vm_offset_t freemempos;
 	vm_offset_t freemem_pt;
 	vm_offset_t afterkern;
 	vm_offset_t freemem_after;
 	vm_offset_t lastaddr;
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
 	int i = 0;
 	uint32_t fake_preload[35];
 	uint32_t memsize, memstart;
 
 	i = 0;
 
 	set_cpufuncs();
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = KERNBASE + 0x00200000;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000;
 #ifdef DDB
 	if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
 		lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
 		zend = lastaddr;
 		zstart = *(uint32_t *)(KERNVIRTADDR + 4);
 		ksym_start = zstart;
 		ksym_end = zend;
 	} else
 #endif
 		lastaddr = (vm_offset_t)&end;
 	
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 
 #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000)
 	freemempos = 0xa0200000;
 	/* Define a macro to simplify memory allocation */
 #define	valloc_pages(var, np)			\
 	alloc_pages((var).pv_pa, (np));				\
 	(var).pv_va = (var).pv_pa + 0x20000000;
 
 #define alloc_pages(var, np)			\
 	freemempos -= (np * PAGE_SIZE);		\
 	(var) = freemempos;		\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos -= PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[loop],
 						 L2_TABLE_SIZE / PAGE_SIZE);
 		} else {
 			kernel_pt_table[loop].pv_pa = freemempos +
 			    (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) *
 			    L2_TABLE_SIZE_REAL;
 			kernel_pt_table[loop].pv_va = 
 			    kernel_pt_table[loop].pv_pa + 0x20000000;
 		}
 		i++;
 	}
 	freemem_pt = freemempos;
 	freemempos = 0xa0100000;
 	/*
 	 * Allocate a page for the system page mapped to V0x00000000
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	valloc_pages(systempage, 1);
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE);
 	valloc_pages(abtstack, ABT_STACK_SIZE);
 	valloc_pages(undstack, UND_STACK_SIZE);
 	valloc_pages(kernelstack, KSTACK_PAGES);
 	alloc_pages(minidataclean.pv_pa, 1);
 	valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE);
 #ifdef ARM_USE_SMALL_ALLOC
 	freemempos -= PAGE_SIZE;
 	freemem_pt = trunc_page(freemem_pt);
 	freemem_after = freemempos - ((freemem_pt - 0xa0100000) /
 								  PAGE_SIZE) * sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000)
 							 , (void *)0xc0100000, freemem_pt - 0xa0100000, 1);
 	freemem_after -= ((freemem_after - 0xa0001000) / PAGE_SIZE) *
 	    sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000),
 							 (void *)0xc0001000,
 							 trunc_page(freemem_after) - 0xa0001000, 0);
 	
 	freemempos = trunc_page(freemem_after);
 	freemempos -= PAGE_SIZE;
 #endif
 	/*
 	 * Allocate memory for the l1 and l2 page tables. The scheme to avoid
 	 * wasting memory by allocating the l1pt on the first 16k memory was
 	 * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for
 	 * this to work (which is supposed to be the case).
 	 */
 
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_va;
 
 	/* Map the L2 pages tables in the L1 page table */
 	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1),
 				   &kernel_pt_table[KERNEL_PT_SYS]);
 	pmap_link_l2pt(l1pagetable, IQ80321_IOPXS_VBASE,
 				   &kernel_pt_table[KERNEL_PT_IOPXS]);
 	pmap_link_l2pt(l1pagetable, KERNBASE,
 				   &kernel_pt_table[KERNEL_PT_BEFOREKERN]);
 	pmap_map_chunk(l1pagetable, KERNBASE, IQ80321_SDRAM_START, 0x100000,
 				   VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, IQ80321_SDRAM_START + 0x100000,
 				   0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, IQ80321_SDRAM_START + 0x200000,
 				   (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1),
 				   VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1);
 	afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE 
 																   - 1));
 	for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) {
 		pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000,
 					   &kernel_pt_table[KERNEL_PT_AFKERNEL + i]);
 	}
 	pmap_map_entry(l1pagetable, afterkern, minidataclean.pv_pa, 
 				   VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	
 
 #ifdef ARM_USE_SMALL_ALLOC
 	if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) {
 		arm_add_smallalloc_pages((void *)(freemem_after),
 		    (void*)(freemem_after + PAGE_SIZE),
 		    afterkern - (freemem_after + PAGE_SIZE), 0);
 		    
 	}
 #endif
 
 	/* Map the Mini-Data cache clean area. */
 	xscale_setup_minidata(l1pagetable, afterkern,
 						  minidataclean.pv_pa);
 
 	/* Map the vector page. */
 	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
 				   VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_devmap_bootstrap(l1pagetable, ep80219_devmap);
 	/*
 	 * Give the XScale global cache clean code an appropriately
 	 * sized chunk of unmapped VA space starting at 0xff000000
 	 * (our device mappings end before this address).
 	 */
 	xscale_cache_clean_addr = 0xff000000U;
 
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT);
 	setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2));
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 
 				   
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE);
 
 
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in setttb()
 	 * but since we are boot strapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross reloations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 	/*
 	 * Fetch the SDRAM start/size from the i80321 SDRAM configration
 	 * registers.
 	 */
 	i80321_calibrate_delay();
 	i80321_sdram_bounds(&obio_bs_tag, IQ80321_80321_VBASE + VERDE_MCU_BASE,
 	    &memstart, &memsize);
 	physmem = memsize / PAGE_SIZE;
 	cninit();
 
 	/* Set stack for exception handlers */
 	
 	data_abort_handler_address = (u_int)data_abort_handler;
 	prefetch_abort_handler_address = (u_int)prefetch_abort_handler;
 	undefined_handler_address = (u_int)undefinedinstruction_bounce;
 	undefined_init();
 				
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 	
 	/* Enable MMU, I-cache, D-cache, write buffer. */
 
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 
 
 
 	pmap_curmaxkvaddr = afterkern + PAGE_SIZE;
 	dump_avail[0] = 0xa0000000;
 	dump_avail[1] = 0xa0000000 + memsize;
 	dump_avail[2] = 0;
 	dump_avail[3] = 0;
 	pmap_bootstrap(pmap_curmaxkvaddr, 
 	    0xd0000000, &kernel_l1pt);
 	msgbufp = (void*)msgbufpv.pv_va;
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 	mutex_init();
 	
 	i = 0;
 #ifdef ARM_USE_SMALL_ALLOC
 	phys_avail[i++] = 0xa0000000;
 	phys_avail[i++] = 0xa0001000; 	/*
 					 *XXX: Gross hack to get our
 					 * pages in the vm_page_array
 					 . */
 #endif
 	phys_avail[i++] = round_page(virtual_avail - KERNBASE + IQ80321_SDRAM_START);
 	phys_avail[i++] = trunc_page(0xa0000000 + memsize - 1);
 	phys_avail[i++] = 0;
 	phys_avail[i] = 0;
 	
 	/* Do basic tuning, hz etc */
 	init_param1();
 	init_param2(physmem);
 	kdb_init();
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
 
 /*
  * Route the interrupt pin of a PCI device to an interrupt number.
  *
  * pcib: the PCI bridge device; its softc supplies the bus-space handles
  *       used to read ATU_PCIXSR.
  * dev:  the device whose interrupt is being routed.
  * pin:  the interrupt pin; only used in the diagnostic message.
  *
  * Devices in slots 1-4 on the bus number reported by ATU_PCIXSR (0xff is
  * treated as bus 0) are routed to ICU_INT_XINT(0)..ICU_INT_XINT(3).
  * Anything else logs "No mapping" and returns 0.
  */
 int
 machdep_pci_route_interrupt(device_t pcib, device_t dev, int pin)
 {
 	int bus;
 	int device;
 	int func;
 	uint32_t busno;
 	struct i80321_pci_softc *sc = device_get_softc(pcib);
 
 	bus = pci_get_bus(dev);
 	device = pci_get_slot(dev);
 	func = pci_get_function(dev);
 	busno = bus_space_read_4(sc->sc_st, sc->sc_atu_sh, ATU_PCIXSR);
 	busno = PCIXSR_BUSNO(busno);
 	if (busno == 0xff)
 		busno = 0;
 	if (bus != busno)
 		goto no_mapping;
 	switch (device) {
 		/* EP80219 PCI */
 	case 1: /* Ethernet i82555 10/100 */
 		printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(0));
 		return (ICU_INT_XINT(0));
 	case 2: /* UART */
 		printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(1));
 		return (ICU_INT_XINT(1));
 	case 3:
 		/*
 		 * The S-ATA chips are behind the bridge, and all of
 		 * the S-ATA interrupts are wired together.
 		 */
 		printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(2));
 		return (ICU_INT_XINT(2));
 	case 4: /* MINI-PIC_INT */
 		printf("Device %d routed to irq %d\n", device, ICU_INT_XINT(3));
 		return (ICU_INT_XINT(3));
 	default:
 no_mapping:
 		/*
 		 * %c needs a printable character, not the raw pin number.
 		 * Assumes the PCI convention that pin is 1-based
 		 * (1 = INTA# ... 4 = INTD#); anything else prints '?'.
 		 */
 		printf("No mapping for %d/%d/%d/%c\n", bus, device, func,
 		    (pin >= 1 && pin <= 4) ? 'A' + pin - 1 : '?');
 		break;
 	}
 	return (0);
 
 }
Index: head/sys/arm/xscale/i80321/iq31244_machdep.c
===================================================================
--- head/sys/arm/xscale/i80321/iq31244_machdep.c	(revision 173360)
+++ head/sys/arm/xscale/i80321/iq31244_machdep.c	(revision 173361)
@@ -1,533 +1,533 @@
 /*	$NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $	*/
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * machdep.c
  *
  * Machine dependent functions for kernel setup
  *
  * This file needs a lot of work. 
  *
  * Created      : 17/09/94
  */
 
 #include "opt_msgbuf.h"
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/cons.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/kdb.h>
 #include <sys/msgbuf.h>
 #include <machine/reg.h>
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pcb.h>
 #include <machine/undefined.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <sys/reboot.h>
 
 #include <arm/xscale/i80321/i80321reg.h>
 #include <arm/xscale/i80321/i80321var.h>
 #include <arm/xscale/i80321/iq80321reg.h>
 #include <arm/xscale/i80321/obiovar.h>
 
 /*
  * Indices into kernel_pt_table[] for the bootstrap L2 page tables.
  * initarm() links KERNEL_PT_SYS at the 1MB section containing
  * ARM_VECTORS_HIGH, KERNEL_PT_IOPXS at IQ80321_IOPXS_VBASE,
  * KERNEL_PT_BEFOREKERN at KERNBASE, and the KERNEL_PT_AFKERNEL_NUM tables
  * starting at KERNEL_PT_AFKERNEL at successive 1MB steps from the first
  * section boundary after the kernel.
  */
 #define KERNEL_PT_SYS		0	/* Page table for mapping proc0 zero page */
 #define	KERNEL_PT_IOPXS		1
 #define KERNEL_PT_BEFOREKERN	2
 #define KERNEL_PT_AFKERNEL	3	/* L2 table for mapping after kernel */
 #define	KERNEL_PT_AFKERNEL_NUM	9
 
 /* This should be evenly divisible by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4). */
 #define NUM_KERNEL_PTS		(KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM)
 
 /* Define various stack sizes in pages */
 #define IRQ_STACK_SIZE	1
 #define ABT_STACK_SIZE	1
 #ifdef IPKDB
 #define UND_STACK_SIZE	2
 #else
 #define UND_STACK_SIZE	1
 #endif
 
 /* Handler entry points; initarm() stores the handler functions here. */
 extern u_int data_abort_handler_address;
 extern u_int prefetch_abort_handler_address;
 extern u_int undefined_handler_address;
 
 /* Physical/virtual address pairs of the bootstrap L2 page tables. */
 struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
 
 extern void *_end;
 
 /*
  * Only the address of this symbol (&end) is used: initarm() takes it as
  * the end of the kernel image when computing the kernel size and the
  * first address after the kernel.
  */
 extern int *end;
 
 /* The single per-CPU structure; initarm() initialises it via pcpu_init(). */
 struct pcpu __pcpu;
 struct pcpu *pcpup = &__pcpu;
 
 /* Physical and virtual addresses for some global pages */
 
 /*
  * initarm() fills both arrays with start/end address pairs and ends each
  * list with a pair of zeroes.
  */
 vm_paddr_t phys_avail[10];
 vm_paddr_t dump_avail[4];
 vm_offset_t physical_pages;
 vm_offset_t clean_sva, clean_eva;
 
 /* Pages carved out by initarm() with valloc_pages()/alloc_pages(). */
 struct pv_addr systempage;
 struct pv_addr msgbufpv;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 struct pv_addr kernelstack;
 struct pv_addr minidataclean;
 
 /* Trap frame that initarm() assigns to thread0.td_frame. */
 static struct trapframe proc0_tf;
 
 #define IQ80321_OBIO_BASE 0xfe800000UL
 #define IQ80321_OBIO_SIZE 0x00100000UL
 /*
  * Static device mappings, passed to pmap_devmap_bootstrap() in initarm().
  * Each entry lists five values; judging by the first entry and its
  * comment they are virtual address, physical address, size, protection
  * and cache mode -- confirm against the definition of struct pmap_devmap.
  */
 static const struct pmap_devmap iq80321_devmap[] = {
 	/*
 	 * Map the on-board devices VA == PA so that we can access them
 	 * with the MMU on or off.
 	 */
 	    {
 		    IQ80321_OBIO_BASE,
 		    IQ80321_OBIO_BASE,
 		    IQ80321_OBIO_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE,                             
 		    PTE_NOCACHE,
 	    },
 	    /* Outbound I/O translation window 0, mapped at IQ80321_IOW_VBASE. */
 	    {
 	    	    IQ80321_IOW_VBASE,
 		    VERDE_OUT_XLATE_IO_WIN0_BASE,
 		    VERDE_OUT_XLATE_IO_WIN_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE,
 		    PTE_NOCACHE,
 	    },
 	    
 	    /* The VERDE_PMMR register block, mapped at IQ80321_80321_VBASE. */
 	    {
 		    IQ80321_80321_VBASE,
 		    VERDE_PMMR_BASE,
 		    VERDE_PMMR_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE,
 		    PTE_NOCACHE,
 	    },
 	    /* All-zero entry; presumably the end-of-table marker -- confirm. */
 	    {
 		    0,
 		    0,
 		    0,
 		    0,
 		    0,
 	    }
 };
 
 /*
  * Physical base of SDRAM, used by initarm() when mapping the kernel and
  * when computing phys_avail[].
  */
 #define SDRAM_START 0xa0000000
 
 #ifdef DDB
 /* Set by initarm() from the words following MAGIC_TRAMP_NUMBER. */
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /* Set to 0xff000000 by initarm(). */
 extern vm_offset_t xscale_cache_clean_addr;
 
 /*
  * Early machine-dependent bootstrap for this board.
  *
  * In order, this routine: builds a fake preload-metadata list describing
  * the kernel; carves the L1/L2 page tables, the per-mode stacks and the
  * message buffer out of SDRAM below the kernel image; builds the kernel
  * L1 page table and switches to it with setttb(); sets the IRQ, ABT and
  * UND mode stack pointers; reads the SDRAM size from the i80321; calls
  * cninit(); installs the exception handler addresses; sets up proc0 and
  * thread0; calls pmap_bootstrap(); and fills dump_avail[] and
  * phys_avail[].
  *
  * Neither arg nor arg2 is referenced in the body.
  *
  * Returns kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb);
  * presumably the caller uses this as the initial SVC stack pointer --
  * confirm against the caller.
  */
 void *
 initarm(void *arg, void *arg2)
 {
 	struct pv_addr  kernel_l1pt;
 	int loop;
 	u_int l1pagetable;
 	vm_offset_t freemempos;
 	vm_offset_t freemem_pt;
 	vm_offset_t afterkern;
 	vm_offset_t freemem_after;
 	vm_offset_t lastaddr;
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
 	int i;
 	uint32_t fake_preload[35];
 	uint32_t memsize, memstart;
 
 	i = 0;
 
 	set_cpufuncs();
 	/*
 	 * Build the fake preload metadata.  Each record written here is a
 	 * type word, a length word and then the payload.  "elf kernel" plus
 	 * its NUL is 11 bytes, so each string payload takes three 32-bit
 	 * slots: the i++ in the strcpy() argument plus the following i += 2.
 	 */
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = KERNBASE + 0x00200000;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000;
 #ifdef DDB
 	/*
 	 * If the word at KERNVIRTADDR is MAGIC_TRAMP_NUMBER, the next two
 	 * words are taken as the symbol table start and end addresses, and
 	 * the end address becomes lastaddr instead of &end.
 	 */
 	if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
 		lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
 		zend = lastaddr;
 		zstart = *(uint32_t *)(KERNVIRTADDR + 4);
 		ksym_start = zstart;
 		ksym_end = zend;
 	} else
 #endif
 		lastaddr = (vm_offset_t)&end;
 
 	/* Two zero words end the list. */
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 
 #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000)
 	/* Allocations below are carved downward from physical 0xa0200000. */
 	freemempos = 0xa0200000;
 	/*
 	 * Define a macro to simplify memory allocation.
 	 *
 	 * alloc_pages() moves freemempos down by np pages, stores the new
 	 * (physical) position in var and zeroes the pages through that
 	 * physical address.  NOTE(review): this presumably relies on the
 	 * MMU being off or on an identity mapping at this point -- confirm.
 	 * valloc_pages() additionally records the virtual address as the
 	 * physical address plus 0x20000000 (0xa0000000 -> 0xc0000000).
 	 * Both macros expand to several statements, so they must only be
 	 * used where a plain statement list is valid.
 	 */
 #define	valloc_pages(var, np)			\
 	alloc_pages((var).pv_pa, (np));		\
 	(var).pv_va = (var).pv_pa + 0x20000000;
 
 #define alloc_pages(var, np)			\
 	freemempos -= (np * PAGE_SIZE);		\
 	(var) = freemempos;		\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/*
 	 * Step down until the L1 table about to be allocated starts on an
 	 * L1_TABLE_SIZE boundary.
 	 */
 	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos -= PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 	/*
 	 * One page holds PAGE_SIZE / L2_TABLE_SIZE_REAL L2 tables.  The first
 	 * table of each group allocates a fresh page; the others are placed
 	 * at successive L2_TABLE_SIZE_REAL offsets inside that page.
 	 */
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[loop],
 			    L2_TABLE_SIZE / PAGE_SIZE);
 		} else {
 			kernel_pt_table[loop].pv_pa = freemempos +
 			    (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) *
 			    L2_TABLE_SIZE_REAL;
 			kernel_pt_table[loop].pv_va = 
 			    kernel_pt_table[loop].pv_pa + 0x20000000;
 		}
 	}
 	/* Remember where the page tables end, then carve below 0xa0100000. */
 	freemem_pt = freemempos;
 	freemempos = 0xa0100000;
 	/*
 	 * Allocate a page for the system page.  It is mapped at
 	 * ARM_VECTORS_HIGH further down.
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	valloc_pages(systempage, 1);
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE);
 	valloc_pages(abtstack, ABT_STACK_SIZE);
 	valloc_pages(undstack, UND_STACK_SIZE);
 	valloc_pages(kernelstack, KSTACK_PAGES);
 	/* Only the physical address is recorded for the mini-data clean page. */
 	alloc_pages(minidataclean.pv_pa, 1);
 	valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE);
 #ifdef ARM_USE_SMALL_ALLOC
 	/*
 	 * Hand the unused memory between 0xa0100000 and the page tables, and
 	 * between 0xa0001000 and the allocations above, to the small
 	 * allocator.  In each call the first argument is an area reserved
 	 * here, sized as one struct arm_small_page per page handed over.
 	 */
 	freemempos -= PAGE_SIZE;
 	freemem_pt = trunc_page(freemem_pt);
 	freemem_after = freemempos - ((freemem_pt - 0xa0100000) /
 	    PAGE_SIZE) * sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000)
 	    , (void *)0xc0100000, freemem_pt - 0xa0100000, 1);
 	freemem_after -= ((freemem_after - 0xa0001000) / PAGE_SIZE) *
 	    sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0x20000000)
 	, (void *)0xc0001000, trunc_page(freemem_after) - 0xa0001000, 0);
 	freemempos = trunc_page(freemem_after);
 	freemempos -= PAGE_SIZE;
 #endif
 	/*
 	 * Allocate memory for the l1 and l2 page tables. The scheme to avoid
 	 * wasting memory by allocating the l1pt on the first 16k memory was
 	 * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for
 	 * this to work (which is supposed to be the case).
 	 *
 	 * NOTE(review): in this function the page tables are actually
 	 * allocated near the top, before this comment.
 	 */
 
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_va;
 
 	/* Map the L2 page tables in the L1 page table */
 	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1),
 	    &kernel_pt_table[KERNEL_PT_SYS]);
 	pmap_link_l2pt(l1pagetable, IQ80321_IOPXS_VBASE,
 	                &kernel_pt_table[KERNEL_PT_IOPXS]);
 	pmap_link_l2pt(l1pagetable, KERNBASE,
 	    &kernel_pt_table[KERNEL_PT_BEFOREKERN]);
 	/*
 	 * Map the first megabyte of SDRAM at KERNBASE, the second megabyte
 	 * (the page tables were allocated just below 0xa0200000) with
 	 * PTE_PAGETABLE, and the kernel image from +2MB up to lastaddr, with
 	 * the size rounded up to a multiple of L1_S_SIZE.
 	 */
 	pmap_map_chunk(l1pagetable, KERNBASE, SDRAM_START, 0x100000,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, SDRAM_START + 0x100000,
 	    0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, SDRAM_START + 0x200000,
 	   (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1),
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	/*
 	 * freemem_after: next page boundary after lastaddr.
 	 * afterkern: next L1_S_SIZE boundary after lastaddr.
 	 */
 	freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1);
 	afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE 
 	    - 1));
 	for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) {
 		pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000,
 		    &kernel_pt_table[KERNEL_PT_AFKERNEL + i]);
 	}
 	/* The first page at afterkern maps the mini-data clean page. */
 	pmap_map_entry(l1pagetable, afterkern, minidataclean.pv_pa, 
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Give the gap between the kernel end and afterkern to the small allocator. */
 	if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) {
 		arm_add_smallalloc_pages((void *)(freemem_after),
 		    (void*)(freemem_after + PAGE_SIZE),
 		    afterkern - (freemem_after + PAGE_SIZE), 0);
 		    
 	}
 #endif
 
 	/* Map the Mini-Data cache clean area. */
 	xscale_setup_minidata(l1pagetable, afterkern,
 	    minidataclean.pv_pa);
 
 	/* Map the vector page. */
 	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_devmap_bootstrap(l1pagetable, iq80321_devmap);
 	/*
 	 * Give the XScale global cache clean code an appropriately
 	 * sized chunk of unmapped VA space starting at 0xff000000
 	 * (our device mappings end before this address).
 	 */
 	xscale_cache_clean_addr = 0xff000000U;
 
 	/* Switch to the new L1 page table and flush the TLBs. */
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT);
 	setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2));
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 
 
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE);
 
 
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in setttb()
 	 * but since we are bootstrapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross relocations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 	/*
 	 * Fetch the SDRAM start/size from the i80321 SDRAM configuration
 	 * registers.
 	 *
 	 * NOTE(review): memstart is not used after this call; the SDRAM base
 	 * is hard-coded as 0xa0000000 / SDRAM_START below.
 	 */
 	i80321_calibrate_delay();
 	i80321_sdram_bounds(&obio_bs_tag, IQ80321_80321_VBASE + VERDE_MCU_BASE,
 	    &memstart, &memsize);
 	physmem = memsize / PAGE_SIZE;
 	cninit();
 
 	/* Install the abort and undefined-instruction handler entry points. */
 
 	data_abort_handler_address = (u_int)data_abort_handler;
 	prefetch_abort_handler_address = (u_int)prefetch_abort_handler;
 	undefined_handler_address = (u_int)undefinedinstruction_bounce;
 	undefined_init();
 
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	/* Place thread0's pcb at the very top of its kernel stack. */
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 
 	/*
 	 * Enable MMU, I-cache, D-cache, write buffer.
 	 *
 	 * NOTE(review): the call below is arm_vector_init(); whether it also
 	 * does what the sentence above says cannot be told from this file --
 	 * confirm.
 	 */
 
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 
 
 
 	/* The page at afterkern is already taken by the mini-data clean mapping. */
 	pmap_curmaxkvaddr = afterkern + PAGE_SIZE;
 	/*
 	 * ARM_USE_SMALL_ALLOC uses dump_avail, so it must be filled before
 	 * calling pmap_bootstrap.
 	 */
 	dump_avail[0] = 0xa0000000;
 	dump_avail[1] = 0xa0000000 + memsize;
 	dump_avail[2] = 0;
 	dump_avail[3] = 0;
 
 	pmap_bootstrap(pmap_curmaxkvaddr, 
 	    0xd0000000, &kernel_l1pt);
 	msgbufp = (void*)msgbufpv.pv_va;
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 	mutex_init();
 
 	/*
 	 * Fill phys_avail[] with start/end pairs, ended by a pair of zeroes.
 	 * The main range runs from the physical address corresponding to
 	 * virtual_avail up to the last page of SDRAM.
 	 */
 	i = 0;
 #ifdef ARM_USE_SMALL_ALLOC
 	phys_avail[i++] = 0xa0000000;
 	phys_avail[i++] = 0xa0001000; 	/*
 					 * XXX: Gross hack to get our
 					 * pages in the vm_page_array.
 					 */
 #endif
 	phys_avail[i++] = round_page(virtual_avail - KERNBASE + SDRAM_START);
 	phys_avail[i++] = trunc_page(0xa0000000 + memsize - 1);
 	phys_avail[i++] = 0;
 	phys_avail[i] = 0;
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 	init_param2(physmem);
 	kdb_init();
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
 
 
 /*
  * Route a PCI interrupt for a device behind the i80321 bridge, using the
  * IQ31244 / IQ80321 slot wiring listed in the switch below.
  *
  * pcib: the PCI bridge; its softc supplies the bus-space tag/handle used
  *       to read ATU_PCIXSR.
  * dev:  the PCI device whose interrupt is being routed.
  * pin:  PCI interrupt pin number (1 = INTA ... 4 = INTD); some slots are
  *       only wired for particular pins.
  *
  * Returns the ICU interrupt number for the slot/pin.  When the device is
  * not on the bus number read from ATU_PCIXSR, or the slot/pin
  * combination is not listed, a message is printed and 0 is returned.
  */
 extern int
 machdep_pci_route_interrupt(device_t pcib, device_t dev, int pin)
 {
 	int bus;
 	int device;
 	int func;
 	uint32_t busno;
 	struct i80321_pci_softc *sc = device_get_softc(pcib);
 
 	bus = pci_get_bus(dev);
 	device = pci_get_slot(dev);
 	func = pci_get_function(dev);
 
 	/* A bus number field of 0xff in ATU_PCIXSR is treated as bus 0. */
 	busno = bus_space_read_4(sc->sc_st, sc->sc_atu_sh, ATU_PCIXSR);
 	busno = PCIXSR_BUSNO(busno);
 	if (busno == 0xff)
 		busno = 0;
 	if (bus != busno)
 		goto no_mapping;
 
 	switch (device) {
 		/* IQ31244 PCI */
 	case 1: /* PCIX-PCIX bridge */
 		/*
 		 * The S-ATA chips are behind the bridge, and all of
 		 * the S-ATA interrupts are wired together.
 		 */
 		return (ICU_INT_XINT(2));
 	case 2: /* PCI slot */
 		/* All pins are wired together. */
 		return (ICU_INT_XINT(3));
 	case 3: /* i82546 dual Gig-E */
 		if (pin == 1 || pin == 2)
 			return (ICU_INT_XINT(0));
 		break;
 		/* IQ80321 PCI */
 	case 4: /* i82544 Gig-E */
 	case 8: /*
 		 * Apparently you can set the device for the ethernet adapter
 		 * to 8 with a jumper, so handle that as well
 		 */
 		if (pin == 1)
 			return (ICU_INT_XINT(0));
 		break;
 	case 6: /* S-PCI-X slot */
 		if (pin == 1)
 			return (ICU_INT_XINT(2));
 		if (pin == 2)
 			return (ICU_INT_XINT(3));
 		break;
 	default:
 		break;
 	}
 
 no_mapping:
 	/*
 	 * pin is numeric (1..4), so convert it to the conventional letter
 	 * (A..D) for %c instead of emitting a control character.
 	 */
 	printf("No mapping for %d/%d/%d/%c\n", bus, device, func, '@' + pin);
 	return (0);
 }
Index: head/sys/arm/xscale/i8134x/crb_machdep.c
===================================================================
--- head/sys/arm/xscale/i8134x/crb_machdep.c	(revision 173360)
+++ head/sys/arm/xscale/i8134x/crb_machdep.c	(revision 173361)
@@ -1,458 +1,458 @@
 /*	$NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $	*/
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * machdep.c
  *
  * Machine dependent functions for kernel setup
  *
  * This file needs a lot of work. 
  *
  * Created      : 17/09/94
  */
 
 #include "opt_msgbuf.h"
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/cons.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/kdb.h>
 #include <sys/msgbuf.h>
 #include <machine/reg.h>
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pcb.h>
 #include <machine/undefined.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <sys/reboot.h>
 
 
 #include <arm/xscale/i80321/i80321var.h> /* For i80321_calibrate_delay() */
 
 #include <arm/xscale/i8134x/i81342reg.h>
 #include <arm/xscale/i8134x/i81342var.h>
 #include <arm/xscale/i8134x/obiovar.h>
 
 
 /*
  * Indices into kernel_pt_table[] for the bootstrap L2 page tables.
  * In this file initarm() links KERNEL_PT_SYS at the 1MB section
  * containing ARM_VECTORS_HIGH and the KERNEL_PT_AFKERNEL_NUM tables
  * starting at KERNEL_PT_AFKERNEL at successive 1MB steps from the first
  * section boundary after the kernel.  KERNEL_PT_IOPXS and
  * KERNEL_PT_BEFOREKERN are not referenced elsewhere in this file, but
  * their tables are still allocated because they count towards
  * NUM_KERNEL_PTS.
  */
 #define KERNEL_PT_SYS		0	/* Page table for mapping proc0 zero page */
 #define	KERNEL_PT_IOPXS		1
 #define KERNEL_PT_BEFOREKERN	2
 #define KERNEL_PT_AFKERNEL	3	/* L2 table for mapping after kernel */
 #define	KERNEL_PT_AFKERNEL_NUM	9
 
 /* This should be evenly divisible by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4). */
 #define NUM_KERNEL_PTS		(KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM)
 
 /* Define various stack sizes in pages */
 #define IRQ_STACK_SIZE	1
 #define ABT_STACK_SIZE	1
 #ifdef IPKDB
 #define UND_STACK_SIZE	2
 #else
 #define UND_STACK_SIZE	1
 #endif
 
 /* Handler entry points; initarm() stores the handler functions here. */
 extern u_int data_abort_handler_address;
 extern u_int prefetch_abort_handler_address;
 extern u_int undefined_handler_address;
 
 /* Physical/virtual address pairs of the bootstrap L2 page tables. */
 struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
 
 extern void *_end;
 
 extern vm_offset_t sa1_cache_clean_addr;
 
 /*
  * Only the address of this symbol (&end) is used: initarm() takes it as
  * the end of the kernel image when computing the kernel size and the
  * first address after the kernel.
  */
 extern int *end;
 
 /* The single per-CPU structure; initarm() initialises it via pcpu_init(). */
 struct pcpu __pcpu;
 struct pcpu *pcpup = &__pcpu;
 
 /* Physical and virtual addresses for some global pages */
 
 /*
  * initarm() fills both arrays with start/end address pairs and ends each
  * list with a pair of zeroes.
  */
 vm_paddr_t phys_avail[10];
 vm_paddr_t dump_avail[4];
 vm_offset_t physical_pages;
 vm_offset_t clean_sva, clean_eva;
 
 /* Pages carved out by initarm() with valloc_pages(). */
 struct pv_addr systempage;
 struct pv_addr msgbufpv;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 struct pv_addr kernelstack;
 
 /* Trap frame that initarm() assigns to thread0.td_frame. */
 static struct trapframe proc0_tf;
 
 /*
  * Static device mappings, passed to pmap_devmap_bootstrap() in initarm().
  * Each entry lists five values; presumably virtual address, physical
  * address, size, protection and cache mode, in that order -- confirm
  * against the definition of struct pmap_devmap.
  */
 static const struct pmap_devmap iq81342_devmap[] = {
 	    /* The IOP34X region, mapped at IOP34X_VADDR. */
 	    {
 		    IOP34X_VADDR,
 		    IOP34X_HWADDR,
 		    IOP34X_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE,
 		    PTE_NOCACHE,
 	    },
 	    {
 		    /*
 		     * Cheat and map a whole 1MB section (both addresses are
 		     * rounded down to a 1MB boundary); this will bring in
 		     * both PCI-X and PCI-E outbound I/O.
 		     */
 		    IOP34X_PCIX_OIOBAR_VADDR &~ (0x100000 - 1),
 		    IOP34X_PCIX_OIOBAR &~ (0x100000 - 1),
 		    0x100000,
 		    VM_PROT_READ|VM_PROT_WRITE,
 		    PTE_NOCACHE,
 	    },
 	    /* The IOP34X_PCE1 region, mapped at IOP34X_PCE1_VADDR. */
 	    {
 		    IOP34X_PCE1_VADDR,
 		    IOP34X_PCE1,
 		    IOP34X_PCE1_SIZE,
 		    VM_PROT_READ|VM_PROT_WRITE,
 		    PTE_NOCACHE,
 	    },
 	    /* All-zero entry; presumably the end-of-table marker -- confirm. */
 	    {	    
 		    0,
 		    0,
 		    0,
 		    0,
 		    0,
 	    }
 };
 
 /*
  * Physical base of SDRAM, used by initarm() when mapping the kernel and
  * when computing phys_avail[].
  */
 #define SDRAM_START 0x00000000
 
 #ifdef DDB
 /* Set by initarm() from the words following MAGIC_TRAMP_NUMBER. */
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /* Set to 0xff000000 by initarm(). */
 extern vm_offset_t xscale_cache_clean_addr;
 
 /*
  * Early machine-dependent bootstrap for this board.
  *
  * In order, this routine: builds a fake preload-metadata list describing
  * the kernel; carves the L1/L2 page tables, the per-mode stacks and the
  * message buffer out of SDRAM below the kernel image; builds the kernel
  * L1 page table and switches to it with setttb(); sets the IRQ, ABT and
  * UND mode stack pointers; reads the SDRAM size with
  * i81342_sdram_bounds(); calls cninit(); installs the exception handler
  * addresses; sets up proc0 and thread0; calls pmap_bootstrap(); and
  * fills dump_avail[] and phys_avail[].
  *
  * Neither arg nor arg2 is referenced in the body.
  *
  * Returns kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb);
  * presumably the caller uses this as the initial SVC stack pointer --
  * confirm against the caller.
  */
 void *
 initarm(void *arg, void *arg2)
 {
 	struct pv_addr  kernel_l1pt;
 	int loop;
 	u_int l1pagetable;
 	vm_offset_t freemempos;
 	vm_offset_t freemem_pt;
 	vm_offset_t afterkern;
 	vm_offset_t freemem_after;
 	vm_offset_t lastaddr;
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
 	int i;
 	uint32_t fake_preload[35];
 	uint32_t memsize, memstart;
 
 	i = 0;
 
 	set_cpufuncs();
 	/*
 	 * Build the fake preload metadata.  Each record written here is a
 	 * type word, a length word and then the payload.  "elf kernel" plus
 	 * its NUL is 11 bytes, so each string payload takes three 32-bit
 	 * slots: the i++ in the strcpy() argument plus the following i += 2.
 	 */
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = KERNBASE + 0x00200000;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000;
 #ifdef DDB
 	/*
 	 * If the word at KERNVIRTADDR is MAGIC_TRAMP_NUMBER, the next two
 	 * words are taken as the symbol table start and end addresses, and
 	 * the end address becomes lastaddr instead of &end.
 	 */
 	if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
 		lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
 		zend = lastaddr;
 		zstart = *(uint32_t *)(KERNVIRTADDR + 4);
 		ksym_start = zstart;
 		ksym_end = zend;
 	} else
 #endif
 		lastaddr = (vm_offset_t)&end;
 
 	/* Two zero words end the list. */
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 
 #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000)
 	/* Allocations below are carved downward from physical 0x00200000. */
 	freemempos = 0x00200000;
 	/*
 	 * Define a macro to simplify memory allocation.
 	 *
 	 * alloc_pages() moves freemempos down by np pages, stores the new
 	 * (physical) position in var and zeroes the pages through that
 	 * physical address.  NOTE(review): this presumably relies on the
 	 * MMU being off or on an identity mapping at this point -- confirm.
 	 * valloc_pages() additionally records the virtual address as the
 	 * physical address plus 0xc0000000.
 	 * Both macros expand to several statements, so they must only be
 	 * used where a plain statement list is valid.
 	 */
 #define	valloc_pages(var, np)			\
 	alloc_pages((var).pv_pa, (np));		\
 	(var).pv_va = (var).pv_pa + 0xc0000000;
 
 #define alloc_pages(var, np)			\
 	freemempos -= (np * PAGE_SIZE);		\
 	(var) = freemempos;		\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/*
 	 * Step down until the L1 table about to be allocated starts on an
 	 * L1_TABLE_SIZE boundary.
 	 */
 	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos -= PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 	/*
 	 * One page holds PAGE_SIZE / L2_TABLE_SIZE_REAL L2 tables.  The first
 	 * table of each group allocates a fresh page; the others are placed
 	 * at successive L2_TABLE_SIZE_REAL offsets inside that page.
 	 */
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[loop],
 			    L2_TABLE_SIZE / PAGE_SIZE);
 		} else {
 			kernel_pt_table[loop].pv_pa = freemempos +
 			    (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) *
 			    L2_TABLE_SIZE_REAL;
 			kernel_pt_table[loop].pv_va = 
 			    kernel_pt_table[loop].pv_pa + 0xc0000000;
 		}
 	}
 	/* Remember where the page tables end, then carve below 0x00100000. */
 	freemem_pt = freemempos;
 	freemempos = 0x00100000;
 	/*
 	 * Allocate a page for the system page.  It is mapped at
 	 * ARM_VECTORS_HIGH further down.
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	valloc_pages(systempage, 1);
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE);
 	valloc_pages(abtstack, ABT_STACK_SIZE);
 	valloc_pages(undstack, UND_STACK_SIZE);
 	valloc_pages(kernelstack, KSTACK_PAGES);
 	valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE);
 #ifdef ARM_USE_SMALL_ALLOC
 	/*
 	 * Hand the unused memory between 0x00100000 and the page tables to
 	 * the small allocator.  The first argument is an area reserved here,
 	 * sized as one struct arm_small_page per page handed over.  The
 	 * second hand-over (from 0x00001000) is compiled out with #if 0,
 	 * although space for its descriptors is still reserved.
 	 */
 	freemempos -= PAGE_SIZE;
 	freemem_pt = trunc_page(freemem_pt);
 	freemem_after = freemempos - ((freemem_pt - 0x00100000) /
 	    PAGE_SIZE) * sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0xc0000000)
 	    , (void *)0xc0100000, freemem_pt - 0x00100000, 1);
 	freemem_after -= ((freemem_after - 0x00001000) / PAGE_SIZE) *
 	    sizeof(struct arm_small_page);
 #if 0
 	arm_add_smallalloc_pages((void *)(freemem_after + 0xc0000000)
 	, (void *)0xc0001000, trunc_page(freemem_after) - 0x00001000, 0);
 #endif
 	freemempos = trunc_page(freemem_after);
 	freemempos -= PAGE_SIZE;
 #endif
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_va;
 
 	/* Map the L2 page tables in the L1 page table */
 	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1),
 	    &kernel_pt_table[KERNEL_PT_SYS]);
 	/*
 	 * Map the first megabyte of SDRAM at KERNBASE, the second megabyte
 	 * (the page tables were allocated just below 0x00200000) with
 	 * PTE_PAGETABLE, and the kernel image from +2MB up to lastaddr, with
 	 * the size rounded up to a multiple of L1_S_SIZE.
 	 */
 	pmap_map_chunk(l1pagetable, KERNBASE, SDRAM_START, 0x100000,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, SDRAM_START + 0x100000,
 	    0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, SDRAM_START + 0x200000,
 	   (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1),
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	/*
 	 * freemem_after: next page boundary after lastaddr.
 	 * afterkern: next L1_S_SIZE boundary after lastaddr.
 	 */
 	freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1);
 	afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE 
 	    - 1));
 	for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) {
 		pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000,
 		    &kernel_pt_table[KERNEL_PT_AFKERNEL + i]);
 	}
 
 
 #ifdef ARM_USE_SMALL_ALLOC
 	/* Give the gap between the kernel end and afterkern to the small allocator. */
 	if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) {
 		arm_add_smallalloc_pages((void *)(freemem_after),
 		    (void*)(freemem_after + PAGE_SIZE),
 		    afterkern - (freemem_after + PAGE_SIZE), 0);
 		    
 	}
 #endif
 
 	/* Map the vector page. */
 	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_devmap_bootstrap(l1pagetable, iq81342_devmap);
 	/*
 	 * Give the XScale global cache clean code an appropriately
 	 * sized chunk of unmapped VA space starting at 0xff000000
 	 * (our device mappings end before this address).
 	 */
 	xscale_cache_clean_addr = 0xff000000U;
 
 	/* Switch to the new L1 page table and flush the TLBs. */
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT);
 	setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2));
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 
 
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE);
 
 
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in setttb()
 	 * but since we are bootstrapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross relocations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 	/*
 	 * Fetch the SDRAM start/size.
 	 * NOTE(review): memstart is not used after this call; the SDRAM base
 	 * is hard-coded as 0x00000000 / SDRAM_START below.
 	 */
 	i80321_calibrate_delay();
 	i81342_sdram_bounds(&obio_bs_tag, IOP34X_VADDR, &memstart, &memsize);
 	physmem = memsize / PAGE_SIZE;
 	cninit();
 	/* Install the abort and undefined-instruction handler entry points. */
 
 	data_abort_handler_address = (u_int)data_abort_handler;
 	prefetch_abort_handler_address = (u_int)prefetch_abort_handler;
 	undefined_handler_address = (u_int)undefinedinstruction_bounce;
 	undefined_init();
 
 #ifdef KSE
 	proc_linkup(&proc0, &ksegrp0, &thread0);
 #else
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 #endif
 	/* Place thread0's pcb at the very top of its kernel stack. */
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 
 	pmap_curmaxkvaddr = afterkern + PAGE_SIZE;
 	/*
 	 * ARM_USE_SMALL_ALLOC uses dump_avail, so it must be filled before
 	 * calling pmap_bootstrap.
 	 */
 	dump_avail[0] = 0x00000000;
 	dump_avail[1] = 0x00000000 + memsize;
 	dump_avail[2] = 0;
 	dump_avail[3] = 0;
 
 	pmap_bootstrap(pmap_curmaxkvaddr, 
 	    0xd0000000, &kernel_l1pt);
 	msgbufp = (void*)msgbufpv.pv_va;
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 	mutex_init();
 
 	/*
 	 * Fill phys_avail[] with start/end pairs, ended by a pair of zeroes.
 	 * The main range runs from the physical address corresponding to
 	 * virtual_avail up to the last page of SDRAM.
 	 */
 	i = 0;
 #ifdef ARM_USE_SMALL_ALLOC
 	phys_avail[i++] = 0x00000000;
 	phys_avail[i++] = 0x00001000; 	/*
 					 * XXX: Gross hack to get our
 					 * pages in the vm_page_array.
 					 */
 #endif
 	phys_avail[i++] = round_page(virtual_avail - KERNBASE + SDRAM_START);
 	phys_avail[i++] = trunc_page(0x00000000 + memsize - 1);
 	phys_avail[i++] = 0;
 	phys_avail[i] = 0;
 
 	/* Do basic tuning, hz etc */
 	init_param1();
 	init_param2(physmem);
 	kdb_init();
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
Index: head/sys/arm/xscale/ixp425/avila_machdep.c
===================================================================
--- head/sys/arm/xscale/ixp425/avila_machdep.c	(revision 173360)
+++ head/sys/arm/xscale/ixp425/avila_machdep.c	(revision 173361)
@@ -1,538 +1,538 @@
 /*	$NetBSD: hpc_machdep.c,v 1.70 2003/09/16 08:18:22 agc Exp $	*/
 
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * RiscBSD kernel project
  *
  * machdep.c
  *
  * Machine dependent functions for kernel setup
  *
  * This file needs a lot of work. 
  *
  * Created      : 17/09/94
  */
 
 #include "opt_msgbuf.h"
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/cons.h>
 #include <sys/bio.h>
 #include <sys/bus.h>
 #include <sys/buf.h>
 #include <sys/exec.h>
 #include <sys/kdb.h>
 #include <sys/msgbuf.h>
 #include <machine/reg.h>
 #include <machine/cpu.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <machine/pmap.h>
 #include <machine/vmparam.h>
 #include <machine/pcb.h>
 #include <machine/undefined.h>
 #include <machine/machdep.h>
 #include <machine/metadata.h>
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <sys/reboot.h>
 
 #include <arm/xscale/ixp425/ixp425reg.h>
 #include <arm/xscale/ixp425/ixp425var.h>
 
 /*
  * Indices into kernel_pt_table[], the L2 page tables set up by initarm().
  * Every computed value is parenthesized so the macros stay correct when
  * they are used inside a larger expression (e.g. KERNEL_PT_AFKERNEL * 2).
  */
 #define KERNEL_PT_SYS		0	/* Page table for mapping proc0 zero page */
 #define	KERNEL_PT_IO		1	/* First of the L2 tables used for device I/O */
 #define KERNEL_PT_IO_NUM	3	/* Number of device I/O L2 tables */
 #define KERNEL_PT_BEFOREKERN	(KERNEL_PT_IO + KERNEL_PT_IO_NUM)	/* L2 table linked at KERNBASE */
 #define KERNEL_PT_AFKERNEL	(KERNEL_PT_BEFOREKERN + 1)	/* L2 table for mapping after kernel */
 #define	KERNEL_PT_AFKERNEL_NUM	9	/* Number of L2 tables linked after the kernel */
 
 /* this should be evenly divisible by PAGE_SIZE / L2_TABLE_SIZE_REAL (or 4) */
 #define NUM_KERNEL_PTS		(KERNEL_PT_AFKERNEL + KERNEL_PT_AFKERNEL_NUM)
 
 /* Define various stack sizes in pages */
 #define IRQ_STACK_SIZE	1
 #define ABT_STACK_SIZE	1
 #ifdef IPKDB
 #define UND_STACK_SIZE	2
 #else
 #define UND_STACK_SIZE	1
 #endif
 
 /*
  * Handler entry points; initarm() stores the addresses of
  * data_abort_handler, prefetch_abort_handler and
  * undefinedinstruction_bounce into these.
  */
 extern u_int data_abort_handler_address;
 extern u_int prefetch_abort_handler_address;
 extern u_int undefined_handler_address;
 
 /* Physical/virtual addresses of the L2 page tables allocated by initarm(). */
 struct pv_addr kernel_pt_table[NUM_KERNEL_PTS];
 
 extern void *_end;
 
 /* initarm() takes &end as the end of the kernel image. */
 extern int *end;
 
 /* The single per-CPU structure and the pointer initarm() passes to pcpu_init(). */
 struct pcpu __pcpu;
 struct pcpu *pcpup = &__pcpu;
 
 /* Physical and virtual addresses for some global pages */
 
 /*
  * Memory range tables filled in by initarm() as start/end pairs followed
  * by a pair of zero entries.
  */
 vm_paddr_t phys_avail[10];
 vm_paddr_t dump_avail[4];
 vm_offset_t physical_pages;
 vm_offset_t clean_sva, clean_eva;
 
 /*
  * Pages carved out by initarm(): the vector page, the message buffer,
  * the IRQ/UND/ABT mode stacks, the thread0 kernel stack and the
  * mini-data cache clean area.
  */
 struct pv_addr systempage;
 struct pv_addr msgbufpv;
 struct pv_addr irqstack;
 struct pv_addr undstack;
 struct pv_addr abtstack;
 struct pv_addr kernelstack;
 struct pv_addr minidataclean;
 
 /* Trap frame that initarm() installs as thread0.td_frame. */
 static struct trapframe proc0_tf;
 
 /*
  * Static device mappings, handed to pmap_devmap_bootstrap() by initarm().
  * Each row lists a region's *_VBASE, *_HWBASE and *_SIZE constants followed
  * by its protection (always read/write here) and PTE_NOCACHE.  The final
  * all-zero row presumably terminates the table -- confirm against
  * pmap_devmap_bootstrap().
  */
 static const struct pmap_devmap ixp425_devmap[] = {
 	/* Physical/Virtual address for I/O space */
 	{ IXP425_IO_VBASE, IXP425_IO_HWBASE, IXP425_IO_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* Expansion Bus */
 	{ IXP425_EXP_VBASE, IXP425_EXP_HWBASE, IXP425_EXP_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* IXP425 PCI Configuration */
 	{ IXP425_PCI_VBASE, IXP425_PCI_HWBASE, IXP425_PCI_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* SDRAM Controller */
 	{ IXP425_MCU_VBASE, IXP425_MCU_HWBASE, IXP425_MCU_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* PCI Memory Space */
 	{ IXP425_PCI_MEM_VBASE, IXP425_PCI_MEM_HWBASE, IXP425_PCI_MEM_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* NPE-A Memory Space */
 	{ IXP425_NPE_A_VBASE, IXP425_NPE_A_HWBASE, IXP425_NPE_A_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* NPE-B Memory Space */
 	{ IXP425_NPE_B_VBASE, IXP425_NPE_B_HWBASE, IXP425_NPE_B_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* NPE-C Memory Space */
 	{ IXP425_NPE_C_VBASE, IXP425_NPE_C_HWBASE, IXP425_NPE_C_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* MAC-A Memory Space */
 	{ IXP425_MAC_A_VBASE, IXP425_MAC_A_HWBASE, IXP425_MAC_A_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* MAC-B Memory Space */
 	{ IXP425_MAC_B_VBASE, IXP425_MAC_B_HWBASE, IXP425_MAC_B_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* Q-Mgr Memory Space */
 	{ IXP425_QMGR_VBASE, IXP425_QMGR_HWBASE, IXP425_QMGR_SIZE,
 	  VM_PROT_READ|VM_PROT_WRITE, PTE_NOCACHE },
 
 	/* End-of-table marker */
 	{ 0, 0, 0, 0, 0 }
 };
 
 /*
  * Physical address that initarm() maps at KERNBASE and uses as the base
  * when converting virtual_avail into a phys_avail[] entry.
  */
 #define SDRAM_START 0x10000000
 
 #ifdef DDB
 /* Symbol table bounds; initarm() sets them from the words following the trampoline magic. */
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /* Set by initarm() to 0xff000000 for the XScale cache clean code. */
 extern vm_offset_t xscale_cache_clean_addr;
 
 /*
  * initarm: early machine-dependent bootstrap for this board.
  *
  * In order, it:
  *  - builds a fake preload-metadata block describing the "elf kernel"
  *    module and publishes it through preload_metadata;
  *  - carves the L1/L2 page tables, the vector page, the mode stacks, the
  *    thread0 kernel stack and the message buffer out of physical memory
  *    below 0x10200000;
  *  - constructs the kernel L1 page table, switches to it with setttb()
  *    and sets the IRQ/ABT/UND stack pointers;
  *  - initializes the console, sizes SDRAM, links up proc0/thread0;
  *  - calls pmap_bootstrap() and fills in dump_avail[] and phys_avail[].
  *
  * Neither `arg' nor `arg2' is referenced in the body.
  *
  * Returns kernelstack.pv_va + USPACE_SVC_STACK_TOP - sizeof(struct pcb);
  * presumably the caller installs this as the SVC-mode stack pointer --
  * TODO confirm against the assembly entry code.
  */
 void *
 initarm(void *arg, void *arg2)
 {
 	struct pv_addr  kernel_l1pt;
 	int loop;
 	u_int l1pagetable;
 	vm_offset_t freemempos;
 	vm_offset_t freemem_pt;
 	vm_offset_t afterkern;
 	vm_offset_t freemem_after;
 	vm_offset_t lastaddr;
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
 	int i;
 	uint32_t fake_preload[35];
 	uint32_t memsize;
 
 	i = 0;
 
 	set_cpufuncs();
 	/*
 	 * Hand-build the preload metadata: name, type, load address and
 	 * size of the kernel, plus (with DDB) the symbol table bounds.
 	 * Each "elf kernel" string is 11 bytes including the NUL, so it
 	 * takes three 32-bit slots: one consumed by i++ and two by i += 2.
 	 */
 	fake_preload[i++] = MODINFO_NAME;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_TYPE;
 	fake_preload[i++] = strlen("elf kernel") + 1;
 	strcpy((char*)&fake_preload[i++], "elf kernel");
 	i += 2;
 	fake_preload[i++] = MODINFO_ADDR;
 	fake_preload[i++] = sizeof(vm_offset_t);
 	fake_preload[i++] = KERNBASE + 0x00200000;
 	fake_preload[i++] = MODINFO_SIZE;
 	fake_preload[i++] = sizeof(uint32_t);
 	fake_preload[i++] = (uint32_t)&end - KERNBASE - 0x00200000;
 #ifdef DDB
 	/*
 	 * If the word at KERNVIRTADDR holds the trampoline magic, the two
 	 * words after it give the start and end of the symbol table, and
 	 * the end of the symbols is taken as the end of the kernel.
 	 */
 	if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
 		fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
 		fake_preload[i++] = sizeof(vm_offset_t);
 		fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
 		lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
 		zend = lastaddr;
 		zstart = *(uint32_t *)(KERNVIRTADDR + 4);
 		ksym_start = zstart;
 		ksym_end = zend;
 	} else
 #endif
 		lastaddr = (vm_offset_t)&end;
 
 	/* Two zero words terminate the metadata block. */
 	fake_preload[i++] = 0;
 	fake_preload[i] = 0;
 	preload_metadata = (void *)fake_preload;
 
 
 	pcpu_init(pcpup, 0, sizeof(struct pcpu));
 	PCPU_SET(curthread, &thread0);
 
 #define KERNEL_TEXT_BASE (KERNBASE + 0x00200000)
 	/* Allocations grow downwards from this physical address. */
 	freemempos = 0x10200000;
 	/* Define a macro to simplify memory allocation */
 	/*
 	 * NOTE: both macros expand to several statements, so they are only
 	 * safe as a full statement inside a braced block.  valloc_pages also
 	 * records the virtual address as physical + 0xb0000000.
 	 */
 #define	valloc_pages(var, np)			\
 	alloc_pages((var).pv_pa, (np));		\
 	(var).pv_va = (var).pv_pa + 0xb0000000;
 
 #define alloc_pages(var, np)			\
 	freemempos -= (np * PAGE_SIZE);		\
 	(var) = freemempos;		\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/*
 	 * Step down a page at a time until the L1 table allocated next
 	 * starts on an L1_TABLE_SIZE boundary.
 	 */
 	while (((freemempos - L1_TABLE_SIZE) & (L1_TABLE_SIZE - 1)) != 0)
 		freemempos -= PAGE_SIZE;
 	valloc_pages(kernel_l1pt, L1_TABLE_SIZE / PAGE_SIZE);
 	/*
 	 * A fresh allocation is made for every
 	 * (PAGE_SIZE / L2_TABLE_SIZE_REAL)-th L2 table; the tables in
 	 * between are placed at successive L2_TABLE_SIZE_REAL offsets
 	 * from it.
 	 */
 	for (loop = 0; loop < NUM_KERNEL_PTS; ++loop) {
 		if (!(loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL))) {
 			valloc_pages(kernel_pt_table[loop],
 			    L2_TABLE_SIZE / PAGE_SIZE);
 		} else {
 			kernel_pt_table[loop].pv_pa = freemempos +
 			    (loop % (PAGE_SIZE / L2_TABLE_SIZE_REAL)) *
 			    L2_TABLE_SIZE_REAL;
 			kernel_pt_table[loop].pv_va = 
 			    kernel_pt_table[loop].pv_pa + 0xb0000000;
 		}
 	}
 	/* Remember where the page tables end; the rest comes from below 0x10100000. */
 	freemem_pt = freemempos;
 	freemempos = 0x10100000;
 	/*
 	 * Allocate a page for the system page mapped to V0x00000000
 	 * This page will just contain the system vectors and can be
 	 * shared by all processes.
 	 */
 	valloc_pages(systempage, 1);
 
 	/* Allocate stacks for all modes */
 	valloc_pages(irqstack, IRQ_STACK_SIZE);
 	valloc_pages(abtstack, ABT_STACK_SIZE);
 	valloc_pages(undstack, UND_STACK_SIZE);
 	valloc_pages(kernelstack, KSTACK_PAGES);
 	alloc_pages(minidataclean.pv_pa, 1);
 	valloc_pages(msgbufpv, round_page(MSGBUF_SIZE) / PAGE_SIZE);
 #ifdef ARM_USE_SMALL_ALLOC
 	freemempos -= PAGE_SIZE;
 	freemem_pt = trunc_page(freemem_pt);
 	freemem_after = freemempos - ((freemem_pt - 0x10100000) /
 	    PAGE_SIZE) * sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0xb0000000)
 	    , (void *)0xc0100000, freemem_pt - 0x10100000, 1);
 	freemem_after -= ((freemem_after - 0x10001000) / PAGE_SIZE) *
 	    sizeof(struct arm_small_page);
 	arm_add_smallalloc_pages((void *)(freemem_after + 0xb0000000)
 	, (void *)0xc0001000, trunc_page(freemem_after) - 0x10001000, 0);
 	freemempos = trunc_page(freemem_after);
 	freemempos -= PAGE_SIZE;
 #endif
 	/*
 	 * Allocate memory for the l1 and l2 page tables. The scheme to avoid
 	 * wasting memory by allocating the l1pt on the first 16k memory was
 	 * taken from NetBSD rpc_machdep.c. NKPT should be greater than 12 for
 	 * this to work (which is supposed to be the case).
 	 */
 
 	/*
 	 * Now we start construction of the L1 page table
 	 * We start by mapping the L2 page tables into the L1.
 	 * This means that we can replace L1 mappings later on if necessary
 	 */
 	l1pagetable = kernel_l1pt.pv_va;
 
 	/* Map the L2 pages tables in the L1 page table */
 	pmap_link_l2pt(l1pagetable, ARM_VECTORS_HIGH & ~(0x00100000 - 1),
 	    &kernel_pt_table[KERNEL_PT_SYS]);
 	pmap_link_l2pt(l1pagetable, IXP425_IO_VBASE,
 	                &kernel_pt_table[KERNEL_PT_IO]);
 	pmap_link_l2pt(l1pagetable, IXP425_MCU_VBASE,
 	    		&kernel_pt_table[KERNEL_PT_IO + 1]);
 	pmap_link_l2pt(l1pagetable, IXP425_PCI_MEM_VBASE,
 	    		&kernel_pt_table[KERNEL_PT_IO + 2]);
 	pmap_link_l2pt(l1pagetable, KERNBASE,
 	    &kernel_pt_table[KERNEL_PT_BEFOREKERN]);
 	/*
 	 * Map SDRAM at KERNBASE in three pieces: the first megabyte
 	 * (PTE_CACHE), the second megabyte (PTE_PAGETABLE), and the kernel
 	 * itself from +0x200000 up to lastaddr, rounded up to an
 	 * L1_S_SIZE multiple.
 	 */
 	pmap_map_chunk(l1pagetable, KERNBASE, SDRAM_START, 0x100000,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x100000, SDRAM_START + 0x100000,
 	    0x100000, VM_PROT_READ|VM_PROT_WRITE, PTE_PAGETABLE);
 	pmap_map_chunk(l1pagetable, KERNBASE + 0x200000, SDRAM_START + 0x200000,
 	   (((uint32_t)(lastaddr) - KERNBASE - 0x200000) + L1_S_SIZE) & ~(L1_S_SIZE - 1),
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	freemem_after = ((int)lastaddr + PAGE_SIZE) & ~(PAGE_SIZE - 1);
 	/* First virtual address past the section-mapped kernel. */
 	afterkern = round_page(((vm_offset_t)lastaddr + L1_S_SIZE) & ~(L1_S_SIZE 
 	    - 1));
 	for (i = 0; i < KERNEL_PT_AFKERNEL_NUM; i++) {
 		pmap_link_l2pt(l1pagetable, afterkern + i * 0x00100000,
 		    &kernel_pt_table[KERNEL_PT_AFKERNEL + i]);
 	}
 	pmap_map_entry(l1pagetable, afterkern, minidataclean.pv_pa, 
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	
 
 #ifdef ARM_USE_SMALL_ALLOC
 	if ((freemem_after + 2 * PAGE_SIZE) <= afterkern) {
 		arm_add_smallalloc_pages((void *)(freemem_after),
 		    (void*)(freemem_after + PAGE_SIZE),
 		    afterkern - (freemem_after + PAGE_SIZE), 0);
 		    
 	}
 #endif
 
 	/* Map the Mini-Data cache clean area. */
 	xscale_setup_minidata(l1pagetable, afterkern,
 	    minidataclean.pv_pa);
 
 	/* Map the vector page. */
 	pmap_map_entry(l1pagetable, ARM_VECTORS_HIGH, systempage.pv_pa,
 	    VM_PROT_READ|VM_PROT_WRITE, PTE_CACHE);
 	pmap_devmap_bootstrap(l1pagetable, ixp425_devmap);
 	/*
 	 * Give the XScale global cache clean code an appropriately
 	 * sized chunk of unmapped VA space starting at 0xff000000
 	 * (our device mappings end before this address).
 	 */
 	xscale_cache_clean_addr = 0xff000000U;
 
 	/* Switch to the new L1 table and flush the TLB. */
 	cpu_domains((DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2)) | DOMAIN_CLIENT);
 	setttb(kernel_l1pt.pv_pa);
 	cpu_tlb_flushID();
 	cpu_domains(DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL*2));
 	/*
 	 * Pages were allocated during the secondary bootstrap for the
 	 * stacks for different CPU modes.
 	 * We must now set the r13 registers in the different CPU modes to
 	 * point to these stacks.
 	 * Since the ARM stacks use STMFD etc. we must set r13 to the top end
 	 * of the stack memory.
 	 */
 
 				   
 	set_stackptr(PSR_IRQ32_MODE,
 	    irqstack.pv_va + IRQ_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_ABT32_MODE,
 	    abtstack.pv_va + ABT_STACK_SIZE * PAGE_SIZE);
 	set_stackptr(PSR_UND32_MODE,
 	    undstack.pv_va + UND_STACK_SIZE * PAGE_SIZE);
 
 
 
 	/*
 	 * We must now clean the cache again....
 	 * Cleaning may be done by reading new data to displace any
 	 * dirty data in the cache. This will have happened in setttb()
 	 * but since we are boot strapping the addresses used for the read
 	 * may have just been remapped and thus the cache could be out
 	 * of sync. A re-clean after the switch will cure this.
 	 * After booting there are no gross relocations of the kernel thus
 	 * this problem will not occur after initarm().
 	 */
 	cpu_idcache_wbinv_all();
 	/*
 	 * Fetch the SDRAM start/size from the ixp425 SDRAM configuration
 	 * registers.
 	 */
 	cninit();
 	memsize = ixp425_sdram_size();
 	physmem = memsize / PAGE_SIZE;
 
 	/* Record the entry points of the abort and undefined-instruction handlers */
 	
 	data_abort_handler_address = (u_int)data_abort_handler;
 	prefetch_abort_handler_address = (u_int)prefetch_abort_handler;
 	undefined_handler_address = (u_int)undefinedinstruction_bounce;
 	undefined_init();
 				
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	/* The pcb of thread0 occupies the top of its kernel stack. */
 	thread0.td_kstack = kernelstack.pv_va;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_flags = 0;
 	thread0.td_frame = &proc0_tf;
 	pcpup->pc_curpcb = thread0.td_pcb;
 	
 	/* Enable MMU, I-cache, D-cache, write buffer. */
 
 	arm_vector_init(ARM_VECTORS_HIGH, ARM_VEC_ALL);
 
 
 
 	pmap_curmaxkvaddr = afterkern + PAGE_SIZE;
 	/* dump_avail[] covers all of SDRAM as one range, then a zero pair. */
 	dump_avail[0] = 0x10000000;
 	dump_avail[1] = 0x10000000 + memsize;
 	dump_avail[2] = 0;
 	dump_avail[3] = 0;
 					
 	pmap_bootstrap(pmap_curmaxkvaddr, 
 	    0xd0000000, &kernel_l1pt);
 	msgbufp = (void*)msgbufpv.pv_va;
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 	mutex_init();
 	
 	/*
 	 * phys_avail[] runs from the physical address corresponding to
 	 * virtual_avail up to the last page of SDRAM, then a zero pair.
 	 */
 	i = 0;
 #ifdef ARM_USE_SMALL_ALLOC
 	phys_avail[i++] = 0x10000000;
 	phys_avail[i++] = 0x10001000; 	/*
 					 *XXX: Gross hack to get our
 					 * pages in the vm_page_array
 					 . */
 #endif
 	phys_avail[i++] = round_page(virtual_avail - KERNBASE + SDRAM_START);
 	phys_avail[i++] = trunc_page(0x10000000 + memsize - 1);
 	phys_avail[i++] = 0;
 	phys_avail[i] = 0;
 	
 	/* Do basic tuning, hz etc */
 	init_param1();
 	init_param2(physmem);
 	kdb_init();
 	return ((void *)(kernelstack.pv_va + USPACE_SVC_STACK_TOP -
 	    sizeof(struct pcb)));
 }
Index: head/sys/compat/pecoff/imgact_pecoff.c
===================================================================
--- head/sys/compat/pecoff/imgact_pecoff.c	(revision 173360)
+++ head/sys/compat/pecoff/imgact_pecoff.c	(revision 173361)
@@ -1,602 +1,606 @@
 /*-
  * Copyright (c) 2000 Masaru OKI
  * Copyright (c) 1994, 1995, 1998 Scott Bartram
  * Copyright (c) 1994 Adam Glass
  * Copyright (c) 1993, 1994 Christopher G. Demetriou
  *
  * originally from NetBSD kern/exec_ecoff.c
  *
  * Copyright (c) 2000 Takanori Watanabe
  * Copyright (c) 2000 KUROSAWA Takahiro
  * Copyright (c) 1995-1996 Søren Schmidt
  * Copyright (c) 1996 Peter Wemm
  * All rights reserved.
  *
  * originally from FreeBSD kern/imgact_elf.c
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Masaru OKI.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/imgact.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <machine/reg.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <sys/exec.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <machine/cpu.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <machine/md_var.h>
 #include <machine/pecoff_machdep.h>
 #include <compat/pecoff/imgact_pecoff.h>
 
 #include "opt_pecoff.h"
 
 /* Magic bytes compared against the data read at the DOS header's d_peofs offset. */
 #define PECOFF_PE_SIGNATURE "PE\0\0"
 static int      pecoff_fixup(register_t **, struct image_params *);
 /*
  * Debug tracing.  Call as DPRINTF(("fmt", args)) -- the extra parentheses
  * carry the whole printf argument list; expands to nothing unless
  * PECOFF_DEBUG is defined.
  */
 #ifndef PECOFF_DEBUG
 #define DPRINTF(a)
 #else
 #define DPRINTF(a) printf a
 #endif
 /*
  * System entry vector for PE/COFF processes.  exec_pecoff_coff_prep_zmagic()
  * passes it to exec_new_vmspace() and installs it as the process's
  * p_sysent.  The initializer is positional, so entries must stay in the
  * order of the struct sysentvec members (declared elsewhere).
  */
 static struct sysentvec pecoff_sysvec = {
 	SYS_MAXSYSCALL,
 	sysent,
 	0,
 	0,
 	NULL,
 	0,
 	NULL,
 	NULL,
 	pecoff_fixup,
 	sendsig,
 	sigcode,
 	&szsigcode,
 	0,
 	"FreeBSD PECoff",
 	NULL,
 	NULL,
 	MINSIGSTKSZ,
 	PAGE_SIZE,
 	VM_MIN_ADDRESS,
 	VM_MAXUSER_ADDRESS,
 	USRSTACK,
 	PS_STRINGS,
 	VM_PROT_ALL,
 	exec_copyout_strings,
 	exec_setregs,
 	NULL
 	
 };
 
 /*
  * PE signature bytes.  Users take sizeof(signature) - 1 as its length so
  * that the terminating NUL added by the string literal is not counted.
  */
 static const char signature[] = PECOFF_PE_SIGNATURE;
 
 /* Per-magic image preparation routines dispatched by exec_pecoff_coff_makecmds(). */
 static int 
 exec_pecoff_coff_prep_omagic(struct image_params *,
 			     struct coff_filehdr *,
 			     struct coff_aouthdr *, int peoffs);
 static int 
 exec_pecoff_coff_prep_nmagic(struct image_params *,
 			     struct coff_filehdr *,
 			     struct coff_aouthdr *, int peoffs);
 static int 
 exec_pecoff_coff_prep_zmagic(struct image_params *,
 			     struct coff_filehdr *,
 			     struct coff_aouthdr *, int peoffs);
 
 static int 
 exec_pecoff_coff_makecmds(struct image_params *,
 			  struct coff_filehdr *, int);
 
 /* File access and section loading helpers. */
 static int      pecoff_signature(struct thread *, struct vnode *, const struct pecoff_dos_filehdr *);
 static int      pecoff_read_from(struct thread *, struct vnode *, int, caddr_t, int);
 static int 
 pecoff_load_section(struct thread * td,
 		    struct vmspace * vmspace, struct vnode * vp,
 	     vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
 		    vm_prot_t prot);
 
 /*
  * Stack fixup routine referenced from pecoff_sysvec.
  *
  * Copies the struct pecoff_args block held in imgp->auxargs out to the
  * user stack, just past the argument and environment pointer slots
  * (argc + envc + 2 entries above *stack_base), then pushes argc by
  * moving *stack_base down one slot.
  *
  * The auxargs buffer is released and imgp->auxargs cleared on every
  * path, including a failed copyout, so it cannot be leaked.
  *
  * Always returns 0; when the copyout fails the stack is left untouched
  * and argc is not pushed.
  */
 static int 
 pecoff_fixup(register_t ** stack_base, struct image_params * imgp)
 {
 	int             len = sizeof(struct pecoff_args);
 	struct pecoff_args *ap;
 	register_t     *pos;
 	int             failed;
 
 	pos = *stack_base + (imgp->args->argc + imgp->args->envc + 2);
 	ap = (struct pecoff_args *) imgp->auxargs;
 	failed = copyout(ap, pos, len);
 
 	/* The kernel copy is no longer needed whether or not copyout worked. */
 	free(ap, M_TEMP);
 	imgp->auxargs = NULL;
 	if (failed) {
 		return 0;
 	}
 	(*stack_base)--;
 	suword(*stack_base, (long) imgp->args->argc);
 	return 0;
 }
 
 /*
  * Load one section of the image backed by `vp' into `vmspace'.
  *
  *   offset  file offset of the section data
  *   vmaddr  user virtual address the section is loaded at
  *   memsz   size of the section in memory
  *   filsz   number of bytes supplied by the file
  *   prot    protection applied to the mapping once it is populated
  *
  * When `offset' is page aligned, the whole pages of file data are mapped
  * copy-on-write straight from the vnode's VM object; the trailing partial
  * page (and any memory beyond the file data) gets an anonymous mapping that
  * is filled through copyout().  When `offset' is not page aligned nothing
  * is mapped from the file: an anonymous mapping is created and all of the
  * file data is copied into it.
  *
  * `td' is not referenced in the body.
  *
  * Returns 0 on success, EINVAL if a mapping cannot be created, or the
  * error returned by copyout().
  */
 static int 
 pecoff_load_section(struct thread * td, struct vmspace * vmspace, struct vnode * vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
 {
 	size_t          map_len;
 	vm_offset_t     map_addr;
 	int             error, rv;
 	size_t          copy_len;
 	size_t          copy_map_len;
 	size_t          copy_start;
 	vm_object_t     object;
 	vm_offset_t     copy_map_offset;
 	vm_offset_t     file_addr;
 	vm_offset_t     data_buf = 0;
 
 	object = vp->v_object;
 	error = 0;
 
 	map_addr = trunc_page((vm_offset_t) vmaddr);
 	file_addr = trunc_page(offset);
 	DPRINTF(("SECARG:%x %p %x %x\n", offset, vmaddr, memsz, filsz));
 	if (file_addr != offset) {
 		/*
 		 * The section is not on page  boundary. We can't use
 		 * vm_map_insert(). Use copyin instead.
 		 */
 		/*
 		 * NOTE(review): map_len is measured from the truncated
 		 * map_addr and the copyout below targets map_addr, so this
 		 * path appears to assume vmaddr is page aligned -- confirm.
 		 */
 		map_len = round_page(memsz);
 		copy_len = filsz;
 		copy_map_offset = file_addr;
 		copy_map_len = round_page(offset + filsz) - file_addr;
 		copy_start = offset - file_addr;
 
 		DPRINTF(("offset=%x vmaddr=%lx filsz=%x memsz=%x\n",
 			 offset, (long)vmaddr, filsz, memsz));
 		DPRINTF(("map_len=%x copy_len=%x copy_map_offset=%x"
 			 " copy_map_len=%x copy_start=%x\n",
 			 map_len, copy_len, copy_map_offset,
 			 copy_map_len, copy_start));
 	} else {
 
 		/* Whole pages of file data can be mapped directly. */
 		map_len = trunc_page(filsz);
 
 		if (map_len != 0) {
 			/* The mapping takes its own reference on the object. */
 			vm_object_reference(object);
 			vm_map_lock(&vmspace->vm_map);
 			rv = vm_map_insert(&vmspace->vm_map,
 					   object,
 					   file_addr,	/* file offset */
 					   map_addr,	/* virtual start */
 					   map_addr + map_len,	/* virtual end */
 					   prot,
 					   VM_PROT_ALL,
 					   MAP_COPY_ON_WRITE | MAP_PREFAULT);
 
 			vm_map_unlock(&vmspace->vm_map);
 			if (rv != KERN_SUCCESS) {
 				vm_object_deallocate(object);
 				return EINVAL;
 			}
 			/* we can stop now if we've covered it all */
 			if (memsz == filsz)
 				return 0;
 
 		}
 		/*
 		 * What remains is the partial page at the end of the file
 		 * data plus the memory beyond it, up to vmaddr + memsz.
 		 */
 		copy_map_offset = trunc_page(offset + filsz);
 		copy_map_len = PAGE_SIZE;
 		copy_start = 0;
 		copy_len = (offset + filsz) - trunc_page(offset + filsz);
 		map_addr = trunc_page((vm_offset_t) vmaddr + filsz);
 		map_len = round_page((vm_offset_t) vmaddr + memsz) - map_addr;
 
 	}
 
 	/* Anonymous mapping; left fully accessible until the data is copied in. */
 	if (map_len != 0) {
 		vm_map_lock(&vmspace->vm_map);
 		rv = vm_map_insert(&vmspace->vm_map, NULL, 0,
 				   map_addr, map_addr + map_len,
 				   VM_PROT_ALL, VM_PROT_ALL, 0);
 		vm_map_unlock(&vmspace->vm_map);
 		DPRINTF(("EMP-rv:%d,%x %x\n", rv, map_addr, map_addr + map_len));
 		if (rv != KERN_SUCCESS) {
 			return EINVAL;
 		}
 	}
 	DPRINTF(("COPYARG %x %x\n", map_addr, copy_len));
 	if (copy_len != 0) {
 		/*
 		 * Map the file pages holding the data into exec_map
 		 * temporarily, copy the fragment out, then drop the mapping.
 		 */
 		vm_object_reference(object);
 		rv = vm_map_find(exec_map,
 				 object,
 				 copy_map_offset,
 				 &data_buf,
 				 copy_map_len,
 				 TRUE,
 				 VM_PROT_READ,
 				 VM_PROT_ALL,
 				 MAP_COPY_ON_WRITE | MAP_PREFAULT_PARTIAL);
 		if (rv != KERN_SUCCESS) {
 			vm_object_deallocate(object);
 			return EINVAL;
 		}
 		/* send the page fragment to user space */
 
 		error = copyout((caddr_t) data_buf + copy_start,
 				(caddr_t) map_addr, copy_len);
 		vm_map_remove(exec_map, data_buf, data_buf + copy_map_len);
 		DPRINTF(("%d\n", error));
 		if (error)
 			return (error);
 	}
 	/*
 	 * set it to the specified protection
 	 */
 	/* NOTE(review): the result of vm_map_protect() is not checked. */
 	vm_map_protect(&vmspace->vm_map, map_addr,
 		       map_addr + map_len, prot,
 		       FALSE);
 	return error;
 
 }
 /*
  * Load the PE/COFF image named by the kernel-space path `file' into the
  * address space of td's process.
  *
  * The file is looked up and permission-checked, its DOS header, PE
  * signature, COFF headers and section table are read, and every eligible
  * section is mapped with pecoff_load_section().
  *
  * On success:
  *   *addr     = image base (w_base)
  *   *entry    = w_base + a_entry
  *   *ldexport = w_base + w_imghdr[0].i_vaddr
  * and 0 is returned.  On failure the errno value of the step that failed
  * is returned and the output parameters are left untouched.
  */
 static int 
 pecoff_load_file(struct thread * td, const char *file, u_long * addr, u_long * entry, u_long * ldexport)
 {
 
 	struct nameidata nd;
 	struct pecoff_dos_filehdr dh;
 	struct coff_filehdr *fp = 0;
 	struct coff_aouthdr *ap;
 	struct pecoff_opthdr *wp;
 	struct coff_scnhdr *sh = 0;
 	struct vmspace *vmspace = td->td_proc->p_vmspace;
 	struct vattr    attr;
 	struct image_params image_params, *imgp;
 	int             peofs;
 	int             error, i, scnsiz;
 
 	imgp = &image_params;
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = td->td_proc;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->firstpage = NULL;
 
 	NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW, UIO_SYSSPACE, file, td);
 
 	if ((error = namei(&nd)) != 0) {
 		/* Nothing was looked up, so the cleanup must not vrele(). */
 		nd.ni_vp = NULL;
 		goto fail;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	imgp->vp = nd.ni_vp;
 
 	/*
 	 * Check permissions, modes, uid, etc on the file, and "open" it.
 	 */
 	error = exec_check_permissions(imgp);
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	if (error)
 		goto fail;
 	if ((error = pecoff_read_from(td, imgp->vp, 0, (caddr_t) & dh, sizeof(dh))) != 0)
 		goto fail;
 	/*
 	 * The assignment must be parenthesized on its own: otherwise
 	 * `error' receives the result of the comparison (0 or 1) instead
 	 * of the errno value returned by the callee.
 	 */
 	if ((error = pecoff_signature(td, imgp->vp, &dh)) != 0)
 		goto fail;
 	fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
 	/* The COFF header follows the PE signature (its NUL terminator excluded). */
 	peofs = dh.d_peofs + sizeof(signature) - 1;
 	if ((error = pecoff_read_from(td, imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE)) != 0)
 		goto fail;
 	if (COFF_BADMAG(fp)) {
 		error = ENOEXEC;
 		goto fail;
 	}
 	/* The a.out header follows the file header; the PE optional header follows that. */
 	ap = (void *) ((char *) fp + sizeof(struct coff_filehdr));
 	wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
 	/* read section header */
 	scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
 	sh = malloc(scnsiz, M_TEMP, M_WAITOK);
 	if ((error = pecoff_read_from(td, imgp->vp, peofs + PECOFF_HDR_SIZE,
 				      (caddr_t) sh, scnsiz)) != 0)
 		goto fail;
 
 	/*
 	 * Read Section information and map sections.
 	 */
 
 	for (i = 0; i < fp->f_nscns; i++) {
 		int             prot = 0;
 
 		/* Skip discardable sections. */
 		if (sh[i].s_flags & COFF_STYP_DISCARD)
 			continue;
 		/* XXX ? */
 		/* Skip text sections that are not marked executable. */
 		if ((sh[i].s_flags & COFF_STYP_TEXT) &&
 		    (sh[i].s_flags & COFF_STYP_EXEC) == 0)
 			continue;
 		/* Only text, data and bss sections are loaded. */
 		if ((sh[i].s_flags & (COFF_STYP_TEXT | COFF_STYP_DATA | COFF_STYP_BSS)) == 0)
 			continue;
 
 		prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
 
 		sh[i].s_vaddr += wp->w_base;	/* RVA --> VA */
 		if ((error = pecoff_load_section(td, vmspace, imgp->vp, sh[i].s_scnptr
 						 ,(caddr_t) sh[i].s_vaddr,
 						 sh[i].s_paddr, sh[i].s_size
 						 ,prot)) != 0)
 			goto fail;
 
 	}
 	*entry = wp->w_base + ap->a_entry;
 	*addr = wp->w_base;
 	*ldexport = wp->w_imghdr[0].i_vaddr + wp->w_base;
 	/* Success falls through: the cleanup below is shared by both paths. */
 fail:
 	if (fp)
 		free(fp, M_TEMP);
 	if (sh)
 		free(sh, M_TEMP);
 	if (nd.ni_vp)
 		vrele(nd.ni_vp);
 
 	return error;
 }
 /*
  * OMAGIC images are not supported: always rejects the image with ENOEXEC.
  * None of the arguments are examined.
  */
 static int
 exec_pecoff_coff_prep_omagic(struct image_params * imgp,
 			     struct coff_filehdr * fp,
 			     struct coff_aouthdr * ap, int peofs)
 {
 	const int       unsupported = ENOEXEC;
 
 	return (unsupported);
 }
 /*
  * NMAGIC images are not supported: always rejects the image with ENOEXEC.
  * None of the arguments are examined.
  */
 static int
 exec_pecoff_coff_prep_nmagic(struct image_params * imgp,
 			     struct coff_filehdr * fp,
 			     struct coff_aouthdr * ap, int peofs)
 {
 	const int       unsupported = ENOEXEC;
 
 	return (unsupported);
 }
 static int
 exec_pecoff_coff_prep_zmagic(struct image_params * imgp,
 			     struct coff_filehdr * fp,
 			     struct coff_aouthdr * ap, int peofs)
 {
 	int             scnsiz = sizeof(struct coff_scnhdr) * fp->f_nscns;
 	int             error = ENOEXEC, i;
 	int             prot;
 	u_long          text_size = 0, data_size = 0, dsize;
 	u_long          text_addr = 0, data_addr = VM_MAXUSER_ADDRESS;
 	u_long          ldexport = 0, ldbase = 0;
 	struct pecoff_opthdr *wp;
 	struct coff_scnhdr *sh;
 	struct vmspace *vmspace;
 	struct pecoff_args *argp = NULL;
 
 	sh = malloc(scnsiz, M_TEMP, M_WAITOK);
 
 	wp = (void *) ((char *) ap + sizeof(struct coff_aouthdr));
 	error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc), imgp->vp,
 	    peofs + PECOFF_HDR_SIZE, (caddr_t) sh, scnsiz);
-	exec_new_vmspace(imgp, &pecoff_sysvec);
+	if (error)
+		return (error);
+	error = exec_new_vmspace(imgp, &pecoff_sysvec);
+	if (error)
+		return (error);
 	vmspace = imgp->proc->p_vmspace;
 	for (i = 0; i < fp->f_nscns; i++) {
 		prot = VM_PROT_WRITE;	/* XXX for relocation? */
 		prot |= (sh[i].s_flags & COFF_STYP_READ) ? VM_PROT_READ : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_WRITE) ? VM_PROT_WRITE : 0;
 		prot |= (sh[i].s_flags & COFF_STYP_EXEC) ? VM_PROT_EXECUTE : 0;
 		sh[i].s_vaddr += wp->w_base;
 		if (sh[i].s_flags & COFF_STYP_DISCARD)
 			continue;
 		if ((sh[i].s_flags & COFF_STYP_TEXT) != 0) {
 
 			error = pecoff_load_section(
 			    FIRST_THREAD_IN_PROC(imgp->proc),
 			    vmspace, imgp->vp, sh[i].s_scnptr,
 			    (caddr_t) sh[i].s_vaddr, sh[i].s_paddr,
 			    sh[i].s_size ,prot);
 			DPRINTF(("ERROR%d\n", error));
 			if (error)
 				goto fail;
 			text_addr = trunc_page(sh[i].s_vaddr);
 			text_size = trunc_page(sh[i].s_size + sh[i].s_vaddr - text_addr);
 
 		}
 		if ((sh[i].s_flags & (COFF_STYP_DATA|COFF_STYP_BSS)) != 0) {
 			if (pecoff_load_section(
 			    FIRST_THREAD_IN_PROC(imgp->proc), vmspace,
 			    imgp->vp, sh[i].s_scnptr, (caddr_t) sh[i].s_vaddr,
 			    sh[i].s_paddr, sh[i].s_size, prot) != 0)
 				goto fail;
 			data_addr = min(trunc_page(sh[i].s_vaddr), data_addr);
 			dsize = round_page(sh[i].s_vaddr + sh[i].s_paddr)
 				- data_addr;
 			data_size = max(dsize, data_size);
 
 		}
 	}
 	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t) (uintptr_t) text_addr;
 	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
 	vmspace->vm_daddr = (caddr_t) (uintptr_t) data_addr;
 	argp = malloc(sizeof(struct pecoff_args), M_TEMP, M_WAITOK);
 	if (argp == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 	argp->a_base = wp->w_base;
 	argp->a_entry = wp->w_base + ap->a_entry;
 	argp->a_end = data_addr + data_size;
 	argp->a_subsystem = wp->w_subvers;
 	error = pecoff_load_file(FIRST_THREAD_IN_PROC(imgp->proc),
 	    "/usr/libexec/ld.so.dll", &ldbase, &imgp->entry_addr, &ldexport);
 	if (error)
 		goto fail;
 
 	argp->a_ldbase = ldbase;
 	argp->a_ldexport = ldexport;
 	memcpy(argp->a_imghdr, wp->w_imghdr, sizeof(struct pecoff_imghdr) * 16);
 	for (i = 0; i < 16; i++) {
 		argp->a_imghdr[i].i_vaddr += wp->w_base;
 	}
 	imgp->proc->p_sysent = &pecoff_sysvec;
 	imgp->auxargs = argp;
 	imgp->auxarg_size = sizeof(struct pecoff_args);
 	imgp->interpreted = 0;
 
 	if (sh != NULL)
 		free(sh, M_TEMP);
 	return 0;
 fail:
 	error = (error) ? error : ENOEXEC;
 	if (sh != NULL)
 		free(sh, M_TEMP);
 	if (argp != NULL)
 		free(argp, M_TEMP);
 
 	return error;
 }
 
 /*
  * Validate the COFF file header and hand the image to the preparation
  * routine matching the a.out magic number.
  *
  * Returns ENOEXEC for a bad COFF magic or an unrecognized a.out magic,
  * otherwise whatever the selected preparation routine returns.
  */
 int
 exec_pecoff_coff_makecmds(struct image_params * imgp,
 			  struct coff_filehdr * fp, int peofs)
 {
 	struct coff_aouthdr *aout;
 
 	if (COFF_BADMAG(fp))
 		return ENOEXEC;
 
 	/* The a.out header sits directly behind the COFF file header. */
 	aout = (void *) ((char *) fp + sizeof(struct coff_filehdr));
 
 	switch (aout->a_magic) {
 	case COFF_OMAGIC:
 		return exec_pecoff_coff_prep_omagic(imgp, fp, aout, peofs);
 	case COFF_NMAGIC:
 		return exec_pecoff_coff_prep_nmagic(imgp, fp, aout, peofs);
 	case COFF_ZMAGIC:
 		return exec_pecoff_coff_prep_zmagic(imgp, fp, aout, peofs);
 	default:
 		return ENOEXEC;
 	}
 }
 
 /*
  * Check that `vp' holds a PE image: the DOS header `dp' must have a valid
  * magic and the bytes at file offset dp->d_peofs must start with the PE
  * signature.
  *
  * Returns 0 on a match, ENOEXEC for a bad DOS magic, EFTYPE when the
  * signature differs, or the error from pecoff_read_from().
  */
 static int
 pecoff_signature(td, vp, dp)
 	struct thread  *td;
 	struct vnode   *vp;
 	const struct pecoff_dos_filehdr *dp;
 {
 	char            hdr[512];
 	int             rc;
 
 	if (DOS_BADMAG(dp))
 		return ENOEXEC;
 
 	rc = pecoff_read_from(td, vp, dp->d_peofs, hdr, sizeof(hdr));
 	if (rc != 0)
 		return rc;
 
 	/* Compare without the NUL that terminates the signature literal. */
 	if (memcmp(hdr, signature, sizeof(signature) - 1) != 0)
 		return EFTYPE;
 	return 0;
 }
 int
 pecoff_read_from(td, vp, pos, buf, siz)
 	struct thread  *td;
 	struct vnode   *vp;
 	int             pos;
 	caddr_t         buf;
 	int             siz;
 {
 	int             error;
 	size_t          resid;
 
 	error = vn_rdwr(UIO_READ, vp, buf, siz, pos,
 			UIO_SYSSPACE, IO_NODELOCKED, td->td_ucred, NOCRED,
 			&resid, td);
 	if (error)
 		return error;
 
 	if (resid != 0) {
 		return ENOEXEC;
 	}
 	return 0;
 }
 
 static int 
 imgact_pecoff(struct image_params * imgp)
 {
 	const struct pecoff_dos_filehdr *dp = (const struct pecoff_dos_filehdr *)
 	imgp->image_header;
 	struct coff_filehdr *fp;
 	int             error, peofs;
 	struct thread *td = curthread;
 
 	error = pecoff_signature(FIRST_THREAD_IN_PROC(imgp->proc),
 	    imgp->vp, dp);
 	if (error) {
 		return -1;
 	}
 	VOP_UNLOCK(imgp->vp, 0, td);
 
 	peofs = dp->d_peofs + sizeof(signature) - 1;
 	fp = malloc(PECOFF_HDR_SIZE, M_TEMP, M_WAITOK);
 	error = pecoff_read_from(FIRST_THREAD_IN_PROC(imgp->proc),
 	     imgp->vp, peofs, (caddr_t) fp, PECOFF_HDR_SIZE);
 	if (error)
 		goto fail;
 
 	error = exec_pecoff_coff_makecmds(imgp, fp, peofs);
 fail:   
 	free(fp, M_TEMP);
         vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 	return error;
 }
 
 /* Exec switch entry naming imgact_pecoff as the PE/COFF activator, registered via EXEC_SET. */
 static struct execsw pecoff_execsw = {imgact_pecoff, "FreeBSD PEcoff"};
 EXEC_SET(pecoff, pecoff_execsw);
Index: head/sys/compat/svr4/imgact_svr4.c
===================================================================
--- head/sys/compat/svr4/imgact_svr4.c	(revision 173360)
+++ head/sys/compat/svr4/imgact_svr4.c	(revision 173361)
@@ -1,241 +1,243 @@
 /*-
  * Copyright (c) 1998 Mark Newton
  * Copyright (c) 1994-1996 Søren Schmidt
  * All rights reserved.
  *
  * Based heavily on /sys/kern/imgact_aout.c which is:
  * Copyright (c) 1993, David Greenman
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer 
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <compat/svr4/svr4.h>
 
 static int	exec_svr4_imgact(struct image_params *iparams);
 
 /*
  * Image activator for SVR4 binaries described by an a.out-style
  * `struct exec' header found in imgp->image_header.
  *
  * Returns -1 when the header is not recognised or fails the basic
  * sanity checks, EFAULT when text + data exceed the file size, ENOMEM
  * when the text or data/bss limits are exceeded, and otherwise the
  * result of the VM mapping calls (0 on success).
  *
  * The vnode is unlocked before the address space is rebuilt and is
  * re-locked exclusively at `fail:', which is reached on both the error
  * and the success paths.
  */
 static int
 exec_svr4_imgact(imgp)
     struct image_params *imgp;
 {
     const struct exec *a_out = (const struct exec *) imgp->image_header;
     struct vmspace *vmspace;
     vm_offset_t vmaddr;
     unsigned long virtual_offset, file_offset;
     vm_offset_t buffer;
     unsigned long bss_size;
     int error;
     struct thread *td = curthread;
 
     /* Bits 16-23 of the magic word must be 0x64 for this activator. */
     if (((a_out->a_magic >> 16) & 0xff) != 0x64)
 	return -1;
 
     /*
      * Set file/virtual offset based on a.out variant.
      */
     switch ((int)(a_out->a_magic & 0xffff)) {
     case 0413:
 	virtual_offset = 0;
 	file_offset = 1024;
 	break;
     case 0314:
 	virtual_offset = 4096;
 	file_offset = 0;
 	break;
     default:
 	return (-1);
     }
     bss_size = round_page(a_out->a_bss);
 #ifdef DEBUG
     printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n", (u_long)a_out->a_text, (u_long)a_out->a_data, bss_size);
 #endif
 
     /*
      * Check various fields in header for validity/bounds.
      */
     if (a_out->a_entry < virtual_offset ||
 	a_out->a_entry >= virtual_offset + a_out->a_text ||
 	a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
 	return (-1);
 
     /* text + data can't exceed file size */
     if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
 	return (EFAULT);
     /*
      * text/data/bss must not exceed limits
      */
     PROC_LOCK(imgp->proc);
     if (a_out->a_text > maxtsiz ||
 	a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) {
     	PROC_UNLOCK(imgp->proc);
 	return (ENOMEM);
     }
     PROC_UNLOCK(imgp->proc);
 
     /* From here on every exit must go through `fail:' to re-lock the vnode. */
     VOP_UNLOCK(imgp->vp, 0, td);
 
     /*
      * Destroy old process VM and create a new one (with a new stack)
      */
-    exec_new_vmspace(imgp, &svr4_sysvec);
+    error = exec_new_vmspace(imgp, &svr4_sysvec);
+    if (error)
+	    goto fail;
     vmspace = imgp->proc->p_vmspace;
 
     /*
      * Check if file_offset is page aligned.
      * Currently we cannot handle misaligned file offsets,
      * and so we read in the entire image (what a waste).
      */
     if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 	printf("imgact: Non page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data+bss read/write/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
 		    	    a_out->a_text + a_out->a_data + bss_size, FALSE,
 			    VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error)
 	    goto fail;
 
 	/* Temporarily map the file into kernel_map so it can be copied out. */
 	error = vm_mmap(kernel_map, &buffer,
 			round_page(a_out->a_text + a_out->a_data + file_offset),
 			VM_PROT_READ, VM_PROT_READ, 0,
 			OBJT_VNODE, imgp->vp, trunc_page(file_offset));
 	if (error)
 	    goto fail;
 
 	error = copyout((caddr_t)(buffer + file_offset), (caddr_t)vmaddr, 
 			a_out->a_text + a_out->a_data);
 
 	/* Drop the kernel mapping before looking at the copyout result. */
 	vm_map_remove(kernel_map, buffer,
 		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
 
 	if (error)
 	    goto fail;
 
 	/*
 	 * remove write enable on the 'text' part
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr,
 		   	       vmaddr + a_out->a_text,
 		   	       VM_PROT_EXECUTE|VM_PROT_READ,
 		   	       TRUE);
 	if (error)
 	    goto fail;
     }
     else {
 #ifdef DEBUG
 	printf("imgact: Page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data read/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_mmap(&vmspace->vm_map, &vmaddr,
 			a_out->a_text + a_out->a_data,
 	    		VM_PROT_READ | VM_PROT_EXECUTE,
 	    		VM_PROT_ALL,
 	    		MAP_PRIVATE | MAP_FIXED,
 			OBJT_VNODE, imgp->vp, file_offset);
 	if (error)
 	    goto fail;
     
 #ifdef DEBUG
 	printf("imgact: startaddr=%08lx, length=%08lx\n", (u_long)vmaddr,
 	    (u_long)a_out->a_text + a_out->a_data);
 #endif
 	/*
 	 * allow read/write of data
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr + a_out->a_text,
 			       vmaddr + a_out->a_text + a_out->a_data,
 			       VM_PROT_ALL,
 			       FALSE);
 	if (error)
 	    goto fail;
     
 	/*
 	 * Allocate anon demand-zeroed area for uninitialized data
 	 */
 	if (bss_size != 0) {
 	    vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
 	    error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, 
 				bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 	    if (error)
 		goto fail;
 #ifdef DEBUG
 	    printf("imgact: bssaddr=%08lx, length=%08lx\n",
 	        (u_long)vmaddr, bss_size);
 #endif
 
 	}
     }
     /* Fill in process VM information */
     vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
     vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
     vmspace->vm_taddr = (caddr_t)virtual_offset;
     vmspace->vm_daddr = (caddr_t)virtual_offset + a_out->a_text;
 
     /* Fill in image_params */
     imgp->interpreted = 0;
     imgp->entry_addr = a_out->a_entry;
     
     imgp->proc->p_sysent = &svr4_sysvec;
     /* Shared exit: the success path falls through to here as well. */
 fail:
     vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
     return (error);
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  *
  * NOTE(review): the registered name says "ELF", but exec_svr4_imgact()
  * interprets the image header as a `struct exec' (a.out); confirm the
  * label is intentional.
  */
 struct execsw svr4_execsw = { exec_svr4_imgact, "svr4 ELF" };
 EXEC_SET(execsw_set, svr4_execsw);
 
Index: head/sys/i386/i386/machdep.c
===================================================================
--- head/sys/i386/i386/machdep.c	(revision 173360)
+++ head/sys/i386/i386/machdep.c	(revision 173361)
@@ -1,3094 +1,3094 @@
 /*-
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_apic.h"
 #include "opt_atalk.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ipx.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_msgbuf.h"
 #include "opt_npx.h"
 #include "opt_perfmon.h"
 #include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/clock.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <isa/rtc.h>
 
 #include <net/netisr.h>
 
 #include <machine/bootinfo.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/vm86.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #ifdef SMP
 #include <machine/privatespace.h>
 #include <machine/smp.h>
 #endif
 
 #ifdef DEV_ISA
 #include <i386/isa/icu.h>
 #endif
 
 #ifdef XBOX
 #include <machine/xbox.h>
 
 int arch_i386_is_xbox = 0;
 uint32_t arch_i386_xbox_memsize = 0;
 #endif
 
 /* Sanity check for __curthread() */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern void init386(int first);
 extern void dblfault_handler(void);
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
 extern void initializecpu(void);
 
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 static void cpu_startup(void *);
 static void fpstate_drop(struct thread *td);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
 #ifdef CPU_ENABLE_SSE
 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
 #endif /* CPU_ENABLE_SSE */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 int	_udatasel, _ucodesel;
 u_int	basemem;
 
 int cold = 1;
 
 #ifdef COMPAT_43
 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 #ifdef COMPAT_FREEBSD4
 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 #ifndef SMP
 static struct pcpu __pcpu;
 #endif
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	printf("real memory  = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem),
 	    ptoa((uintmax_t)Maxmem) / 1048576);
 	realmem = Maxmem;
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)cnt.v_free_count),
 	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 #ifdef COMPAT_43
 /*
  * Deliver signal `ksi' to the current thread using the old (4.3BSD
  * compatible) signal frame layout.
  *
  * catcher: user handler address stored in the frame.
  * ksi:     signal information (number, code, address).
  * mask:    signal mask saved in the frame for osigreturn().
  *
  * Must be called with the process lock and the sigacts mutex held (both
  * are asserted).  Both are dropped while the frame is built and copied
  * out, and are re-acquired before returning.  If the copyout to the user
  * stack fails, the process is terminated via sigexit(td, SIGILL).
  */
 static void
 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct osigframe sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/*
 	 * Start from an all-zero frame.  The whole structure is copied out
 	 * to user space below, but not every member is assigned on every
 	 * path (e.g. sf_addr is untouched for SA_SIGINFO handlers), so
 	 * without this, stale kernel stack bytes would be disclosed.
 	 */
 	bzero(&sf, sizeof(sf));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct osigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct osigframe));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		fp = (struct osigframe *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Locks are released for the rest of the frame set-up and copyout. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	sf.sf_siginfo.si_sc.sc_gs = rgs();
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
 	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
 	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
 	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
 		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
 		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
 		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_siginfo.si_sc.sc_ps =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/* See sendsig() for comments. */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the old-style signal trampoline. */
 	regs->tf_esp = (int)fp;
 	regs->tf_eip = PS_STRINGS - szosigcode;
 	regs->tf_eflags &= ~PSL_T;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	load_gs(_udatasel);
 	regs->tf_ss = _udatasel;
 	/* Re-take the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Deliver signal `ksi' to the current thread using the FreeBSD 4
  * `struct sigframe4' layout.
  *
  * catcher: user handler address stored in the frame.
  * ksi:     signal information (number, code, address).
  * mask:    signal mask saved in the frame's ucontext.
  *
  * Must be called with the process lock and the sigacts mutex held (both
  * are asserted).  Both are dropped before the frame is copied out and
  * are re-acquired before returning.  If the copyout fails, the process
  * is terminated via sigexit(td, SIGILL).
  */
 static void
 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe4 sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trap frame is copied as one piece, starting at mc_fs. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sfp = (struct sigframe4 *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_code = ksi->ksi_code;
 		sf.sf_si.si_addr = ksi->ksi_addr;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Locks are released for the vm86 fix-up and the copyout. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the FreeBSD 4 signal trampoline. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
 	regs->tf_eflags &= ~PSL_T;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Re-take the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Deliver signal `ksi' to the current thread.
  *
  * catcher: user handler address stored in the frame.
  * ksi:     signal information; its ksi_info is copied into the frame for
  *          SA_SIGINFO handlers.
  * mask:    signal mask saved in the frame's ucontext.
  *
  * Signals marked in ps_freebsd4 or ps_osigset are handed to
  * freebsd4_sendsig() or osendsig() respectively (when those are compiled
  * in); everything else gets a `struct sigframe' placed on the user stack
  * or the alternate signal stack, aligned down to 16 bytes.
  *
  * Must be called with the process lock and the sigacts mutex held (both
  * are asserted).  Both are dropped before the frame is copied out and
  * are re-acquired before returning.  If the copyout fails, the process
  * is terminated via sigexit(td, SIGILL).
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trap frame is copied as one piece, starting at mc_fs. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
 	fpstate_drop(td);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_esp - sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Locks are released for the vm86 fix-up and the copyout. */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Redirect the thread to the sigcode of the process's sysentvec. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
 	regs->tf_eflags &= ~PSL_T;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Re-take the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 #ifdef COMPAT_43
 /*
  * Old-style sigreturn: restore the register state and signal mask from
  * the `struct osigcontext' that uap->sigcntxp points to in user space.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if
  * the requested state is rejected (vm86 not set up, forbidden eflags
  * bits changed, or an insecure %cs), and EJUSTRETURN once the trap frame
  * has been rewritten.
  */
 int
 osigreturn(td, uap)
 	struct thread *td;
 	struct osigreturn_args /* {
 		struct osigcontext *sigcntxp;
 	} */ *uap;
 {
 	struct osigcontext sc;
 	struct trapframe *regs;
 	struct osigcontext *scp;
 	struct proc *p = td->td_proc;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	/* Work on a kernel copy so the checks below cannot be raced. */
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	eflags = scp->sc_ps;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/* Only the user-changeable eflags bits come from the context. */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		tf->tf_vm86_ds = scp->sc_ds;
 		tf->tf_vm86_es = scp->sc_es;
 		tf->tf_vm86_fs = scp->sc_fs;
 		tf->tf_vm86_gs = scp->sc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		/*
 		 * XXX do allow users to change the privileged flag PSL_RF.
 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 		 * should sometimes set it there too.  tf_eflags is kept in
 		 * the signal context during signal handling and there is no
 		 * other place to remember it, so the PSL_RF bit may be
 		 * corrupted by the signal handler without us knowing.
 		 * Corruption of the PSL_RF bit at worst causes one more or
 		 * one less debugger trap, so allowing it is fairly harmless.
 		 */
 		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		if (!CS_SECURE(scp->sc_cs)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 		regs->tf_ds = scp->sc_ds;
 		regs->tf_es = scp->sc_es;
 		regs->tf_fs = scp->sc_fs;
 	}
 
 	/* Restore remaining registers. */
 	regs->tf_eax = scp->sc_eax;
 	regs->tf_ebx = scp->sc_ebx;
 	regs->tf_ecx = scp->sc_ecx;
 	regs->tf_edx = scp->sc_edx;
 	regs->tf_esi = scp->sc_esi;
 	regs->tf_edi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_isp = scp->sc_isp;
 	regs->tf_ebp = scp->sc_fp;
 	regs->tf_esp = scp->sc_sp;
 	regs->tf_eip = scp->sc_pc;
 	regs->tf_eflags = eflags;
 
 	/* Signal-stack flag and signal mask are updated under the proc lock. */
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	SIGSETOLD(td->td_sigmask, scp->sc_mask);
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (EJUSTRETURN);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * freebsd4_sigreturn: return from a signal handler using the FreeBSD 4
  * context layout (struct ucontext4).
  *
  * The user-supplied context is copied in and loaded into the thread's
  * trapframe:
  *  - a context with PSL_VM set requires an initialized vm86 extension
  *    (else EINVAL) and only the user-changeable eflags bits are taken
  *    from it;
  *  - any other context is rejected with EINVAL if EFL_SECURE() fails on
  *    its eflags (PSL_RF excepted) or CS_SECURE() fails on its %cs.
  * Afterwards the on-stack flag (COMPAT_43 only) and the signal mask are
  * restored under the proc lock.
  *
  * Returns EJUSTRETURN on success, the copyin() error, or EINVAL.
  *
  * MPSAFE
  */
 int
 freebsd4_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd4_sigreturn_args /* {
 		const ucontext4 *sigcntxp;
 	} */ *uap;
 {
 	struct ucontext4 uc;
 	struct proc *p = td->td_proc;
 	struct trapframe *regs;
 	const struct ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so the checks below cannot be raced. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/*
 		 * Go back to user mode if both flags are set.
 		 * Note that SIGBUS is only posted here; the context is
 		 * still loaded below (there is no return).
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 		/*
 		 * Merge eflags: keep the current frame's bits except those
 		 * in the *_USERCHANGE mask, which come from the context.
 		 * PSL_VM is always forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load sizeof(struct trapframe) bytes of saved registers,
 		 * starting at mc_fs, then override eflags with the merged
 		 * value computed above.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		/*
 		 * The restored data selectors go into the vm86 slots of the
 		 * frame; the ordinary slots get the user data selector.
 		 */
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		/*
 		 * XXX do allow users to change the privileged flag PSL_RF.
 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 		 * should sometimes set it there too.  tf_eflags is kept in
 		 * the signal context during signal handling and there is no
 		 * other place to remember it, so the PSL_RF bit may be
 		 * corrupted by the signal handler without us knowing.
 		 * Corruption of the PSL_RF bit at worst causes one more or
 		 * one less debugger trap, so allowing it is fairly harmless.
 		 */
 		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 			printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			printf("freebsd4_sigreturn: cs = 0x%x\n", cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/* Context accepted: load the whole trapframe from mc_fs on. */
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	/* Bit 0 of mc_onstack selects the SS_ONSTACK state. */
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	/* Restore the signal mask, then strip what SIG_CANTMASK() removes. */
 	td->td_sigmask = ucp->uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * sigreturn: return from a signal handler using the current ucontext_t
  * layout.
  *
  * The user-supplied context is copied in and loaded into the thread's
  * trapframe:
  *  - a context with PSL_VM set requires an initialized vm86 extension
  *    (else EINVAL) and only the user-changeable eflags bits are taken
  *    from it;
  *  - any other context is rejected with EINVAL if EFL_SECURE() fails on
  *    its eflags (PSL_RF excepted) or CS_SECURE() fails on its %cs; the
  *    FP context is then restored with set_fpcontext() before the general
  *    registers are loaded.
  * Afterwards the on-stack flag (COMPAT_43 only) and the signal mask are
  * restored under the proc lock.
  *
  * Returns EJUSTRETURN on success, the copyin() or set_fpcontext() error,
  * or EINVAL.
  *
  * MPSAFE
  */
 int
 sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct proc *p = td->td_proc;
 	struct trapframe *regs;
 	const ucontext_t *ucp;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so the checks below cannot be raced. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/*
 		 * Go back to user mode if both flags are set.
 		 * Note that SIGBUS is only posted here; the context is
 		 * still loaded below (there is no return).
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/*
 		 * Merge eflags: keep the current frame's bits except those
 		 * in the *_USERCHANGE mask, which come from the context.
 		 * PSL_VM is always forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load sizeof(struct trapframe) bytes of saved registers,
 		 * starting at mc_fs, then override eflags with the merged
 		 * value computed above.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		/*
 		 * The restored data selectors go into the vm86 slots of the
 		 * frame; the ordinary slots get the user data selector.
 		 */
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		/*
 		 * XXX do allow users to change the privileged flag PSL_RF.
 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 		 * should sometimes set it there too.  tf_eflags is kept in
 		 * the signal context during signal handling and there is no
 		 * other place to remember it, so the PSL_RF bit may be
 		 * corrupted by the signal handler without us knowing.
 		 * Corruption of the PSL_RF bit at worst causes one more or
 		 * one less debugger trap, so allowing it is fairly harmless.
 		 */
 		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 			printf("sigreturn: eflags = 0x%x\n", eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			printf("sigreturn: cs = 0x%x\n", cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/*
 		 * Restore the FP context first; if that fails the trapframe
 		 * has not been touched yet.
 		 */
 		ret = set_fpcontext(td, &ucp->uc_mcontext);
 		if (ret != 0)
 			return (ret);
 		/* Context accepted: load the whole trapframe from mc_fs on. */
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	/* Bit 0 of mc_onstack selects the SS_ONSTACK state. */
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	/* Restore the signal mask, then strip what SIG_CANTMASK() removes. */
 	td->td_sigmask = ucp->uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (EJUSTRETURN);
 }
 
 /*
  * Machine dependent boot() routine.
  *
  * Currently a no-op: nothing machine dependent has been needed here yet,
  * and the howto argument is ignored.  Some work might be grafted back
  * here from boot() in the future.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Estimate the current clock frequency of the cpu with the given id.
  *
  * On success 0 is returned and the estimate is stored in *rate.
  * EINVAL is returned when pcpu_find() does not know cpu_id or rate is
  * NULL, EOPNOTSUPP when there is no TSC.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	register_t saved_intr;
 	uint64_t before, after, elapsed;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 	if (!tsc_present)
 		return (EOPNOTSUPP);
 
 	/* During boot the rate was calibrated moments ago; reuse it. */
 	if (cold) {
 		*rate = tsc_freq;
 		return (0);
 	}
 
 #ifdef SMP
 	/* Run the measurement on the cpu that was asked about. */
 	thread_lock(curthread);
 	sched_bind(curthread, cpu_id);
 	thread_unlock(curthread);
 #endif
 
 	/* Sample the TSC around a short delay with interrupts off. */
 	saved_intr = intr_disable();
 	before = rdtsc();
 	DELAY(1000);
 	after = rdtsc();
 	intr_restore(saved_intr);
 
 #ifdef SMP
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 #endif
 
 	/*
 	 * Scale the tick count of the delay by 1000 and take off 0.5% of
 	 * the total (i.e. 5 parts in 1000).  Empirical testing has shown
 	 * that overhead in DELAY() works out to approximately this value.
 	 * NOTE(review): assumes DELAY(1000) waits one millisecond, which
 	 * makes the result ticks per second -- confirm.
 	 */
 	elapsed = after - before;
 	*rate = elapsed * (1000 - 5);
 	return (0);
 }
 
 /*
  * Shut the CPU down as far as possible: execute "hlt" over and over.
  * This function never returns.
  */
 void
 cpu_halt(void)
 {
 	while (1) {
 		__asm__ ("hlt");
 	}
 }
 
 /*
  * Hook to idle the CPU when possible.  In the SMP case we default to
  * off because a halted cpu will not currently pick up a new thread in the
  * run queue until the next timer tick.  If turned on this will result in
  * approximately a 4.2% loss in real time performance in buildworld tests
  * (but improves user and sys times oddly enough), and saves approximately
  * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
  *
  * XXX we need to have a cpu mask of idle cpus and generate an IPI or
  * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
  * Then we can have our cake and eat it too.
  *
  * XXX I'm turning it on for SMP as well by default for now.  It seems to
  * help lock contention somewhat, and this is critical for HTT. -Peter
  *
  * Note that, despite the first paragraph, the initializer below enables
  * the knob (1) unconditionally.  It is exported both as the loader
  * tunable and as the read/write sysctl machdep.cpu_idle_hlt.
  */
 static int	cpu_idle_hlt = 1;
 TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
     &cpu_idle_hlt, 0, "Idle loop HLT enable");
 
 /*
  * Default idle action, installed as the initial cpu_idle_hook:
  * re-enable interrupts and halt until the next one arrives.
  */
 static void
 cpu_idle_default(void)
 {
 	/*
 	 * We must absolutely guarantee that hlt is the very next
 	 * instruction after sti, or we introduce a timing window.
 	 * Both are therefore emitted from a single asm statement.
 	 */
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Idle the current cpu.
  *
  * The check of sched_runnable() and the actual halt must not race: if an
  * interrupt made a thread runnable in between, the cpu would sit in hlt
  * until the next interrupt despite having work.  Interrupts are therefore
  * disabled before the check and only re-enabled either right here (work
  * is pending) or by the idle hook itself.
  */
 void
 cpu_idle(void)
 {
 
 #ifdef SMP
 	if (mp_grab_cpu_hlt())
 		return;
 #endif
 
 	/* Halting in the idle loop is switched off: nothing to do. */
 	if (!cpu_idle_hlt)
 		return;
 
 	disable_intr();
 	if (!sched_runnable()) {
 		/* Nothing to run; the hook is entered with interrupts off. */
 		cpu_idle_hook();
 		return;
 	}
 	enable_intr();
 }
 
 /*
  * Routine cpu_idle() calls (with interrupts disabled) when there is
  * nothing to run.  Starts out as cpu_idle_default; other subsystems
  * (e.g., ACPI) can hook this later.
  */
 void (*cpu_idle_hook)(void) = cpu_idle_default;
 
 /*
  * Reset the machine-dependent register state of a thread on exec.
  *
  *	td		thread being exec'd
  *	entry		user %eip at which the new image starts
  *	stack		initial user %esp
  *	ps_strings	value handed to the new image in %ebx
  *
  * The trapframe is cleared and reloaded with the entry point, the stack
  * pointer and the user code/data selectors.  In addition %gs is reset, a
  * per-process LDT (if any) is handed to user_ldt_free(), the debug
  * registers are reset if they were in use, and the FP state is dropped.
  *
  * The trace flag (PSL_T) of the old image is carried over into the new
  * eflags.  It must be sampled before the trapframe is zeroed: reading
  * tf_eflags after the bzero() always yields 0 and silently drops it.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 	register_t saved_eflags;
 
 	/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
 	pcb->pcb_gs = _udatasel;
 	load_gs(_udatasel);
 
 	/*
 	 * Release a private LDT, if there is one.  dt_lock is only dropped
 	 * here when there is none, so user_ldt_free() is presumably
 	 * responsible for unlocking it on the other path.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt)
 		user_ldt_free(td);
 	else
 		mtx_unlock_spin(&dt_lock);
 
 	/* Remember the trace flag; the bzero() below would wipe it. */
 	saved_eflags = regs->tf_eflags & PSL_T;
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_eip = entry;
 	regs->tf_esp = stack;
 	regs->tf_eflags = PSL_USER | saved_eflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_cs = _ucodesel;
 
 	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 	regs->tf_ebx = ps_strings;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == PCPU_GET(curpcb)) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		pcb->pcb_flags &= ~PCB_DBREGS;
 	}
 
 	/*
 	 * Initialize the math emulator (if any) for the current process.
 	 * Actually, just clear the bit that says that the emulator has
 	 * been initialized.  Initialization is delayed until the process
 	 * traps to the emulator (if it is done at all) mainly because
 	 * emulators don't provide an entry point for initialization.
 	 */
 	pcb->pcb_flags &= ~FP_SOFTFP;
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 
 	/*
 	 * XXX - Linux emulator
 	 * Make sure edx is 0x0 on entry. Linux binaries depend
 	 * on it.
 	 */
 	td->td_retval[1] = 0;
 }
 
 /*
  * Set up %cr0 and %gs for running the kernel on this cpu.
  *
  * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
  *
  * All ESC (i.e., NPX) instructions and all WAIT instructions are to
  * trap.  CR0_MP must be set and CR0_TS used to control the trap, since
  * CR0_EM alone does not make WAIT instructions trap.  Trapping WAIT
  * matters: otherwise the "wait" variants of no-wait control instructions
  * would degenerate to the "no-wait" variants after FP context switches
  * but work correctly otherwise -- and would always degenerate when there
  * is no NPX at all.
  *
  * CR0_NE is set to get correct error reporting on 486DX's; setting it
  * should fail or do nothing on lesser processors.
  *
  * CR0_WP and CR0_AM are turned on here as well.
  */
 void
 cpu_setregs(void)
 {
 	unsigned int cr0;
 
 	cr0 = rcr0() | CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	load_cr0(cr0);
 	load_gs(_udatasel);
 }
 
 /*
  * Best guess at the boot device, exported read-only as
  * machdep.guessed_bootdev.  The value uses its own encoding; it is not a
  * struct cdev pointer.
  */
 u_long bootdev;		/* not a struct cdev *- encoding is different */
 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 
 /*
  * Initialize 386 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 /*
  * Storage for the hardware descriptor tables and their bookkeeping.
  * The GDT has room for NGDT entries for each of MAXCPU cpus.
  */
 int _default_ldt;
 union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
 static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 union descriptor ldt[NLDT];		/* local descriptor table */
 struct region_descriptor r_gdt, r_idt;	/* table descriptors */
 struct mtx dt_lock;			/* lock for GDT and LDT */
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
 #endif
 
 /* TSS and one page of stack referenced by the GPANIC_SEL descriptor. */
 static struct i386tss dblfault_tss;
 static char dblfault_stack[PAGE_SIZE];
 
 extern  vm_offset_t	proc0kstack;
 
 
 /*
  * Software prototypes of the GDT entries -- in more palatable form.
  *
  * Each initializer lists, in order: base address, limit, segment type,
  * descriptor privilege level, present bit, two fields that are always 0
  * here, the default operand size (32 vs 16 bit) and the limit
  * granularity (byte vs page units).
  *
  * The position of an entry in this array is its GDT slot, so entries
  * must not be reordered.  In particular:
  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUFS_SEL	2 %fs Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUGS_SEL	3 %gs Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUCODE_SEL	6 Code Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUDATA_SEL	7 Data Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 {	0x400,			/* segment base address */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {
 	0x0,			/* segment base address */
 	sizeof(struct i386tss)-1,/* length - one struct i386tss */
 	SDT_SYS386TSS,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GLDT_SEL	10 LDT Descriptor */
 {	(int) ldt,		/* segment base address  */
 	sizeof(ldt)-1,		/* length - the whole ldt[] array */
 	SDT_SYSLDT,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GUSERLDT_SEL	11 User LDT Descriptor per process */
 {	(int) ldt,		/* segment base address  */
 	(512 * sizeof(union descriptor)-1),		/* length - 512 descriptors */
 	SDT_SYSLDT,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GPANIC_SEL	12 Panic Tss Descriptor */
 {	(int) &dblfault_tss,	/* segment base address  */
 	sizeof(struct i386tss)-1,/* length - one struct i386tss */
 	SDT_SYS386TSS,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 {	0,			/* segment base address (overwritten)  */
 	0xfffff,		/* length */
 	SDT_MEMERA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 {	0,			/* segment base address (overwritten)  */
 	0xfffff,		/* length */
 	SDT_MEMERA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 {	0,			/* segment base address (overwritten) */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 {	0,			/* segment base address (overwritten) */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 {	0,			/* segment base address (overwritten) */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GNDIS_SEL	18 NDIS Descriptor (not present in this prototype) */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 };
 
 /*
  * Software prototypes of the default LDT entries.  The field order is:
  * base address, limit, segment type, descriptor privilege level,
  * present bit, two fields that are always 0 here, default operand size
  * and limit granularity.  Slots 0, 1, 2 and 4 are null placeholders;
  * slots 3 and 5 are the user code and data segments.
  */
 static struct soft_segment_descriptor ldt_segs[] = {
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Code Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Data Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 };
 
 /*
  * Fill in slot idx of the interrupt descriptor table.
  *
  *	idx	index of the IDT entry to write
  *	func	handler entry point; its address is split into the low
  *		and high offset fields of the gate
  *	typ	gate type
  *	dpl	descriptor privilege level of the gate
  *	selec	code segment selector the gate refers to
  *
  * The entry is always marked present.
  */
 void
 setidt(int idx, inthand_t *func, int typ, int dpl, int selec)
 {
 	struct gate_descriptor *gate = &idt[idx];
 	int handler = (int)func;
 
 	gate->gd_looffset = handler;
 	gate->gd_selector = selec;
 	gate->gd_stkcpy = 0;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 	gate->gd_hioffset = handler >> 16;
 }
 
 /*
  * Interrupt/trap handler entry points, defined outside this file and
  * referenced here through IDTVEC().
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 
 #ifdef DDB
 /*
  * DDB "show idt": list every IDT slot whose handler is something other
  * than the default 'rsvd' entry point, as "<index> <symbol>".  Output
  * stops early when the pager asks to quit.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *gate;
 	uintptr_t handler;
 	int vec;
 
 	for (vec = 0, gate = idt; vec < NIDT && !db_pager_quit;
 	    vec++, gate++) {
 		/* Reassemble the handler address from its two halves. */
 		handler = (gate->gd_hioffset << 16 | gate->gd_looffset);
 		if (handler == (uintptr_t)&IDTVEC(rsvd))
 			continue;
 		db_printf("%3d\t", vec);
 		db_printsym(handler, DB_STGY_PROC);
 		db_printf("\n");
 	}
 }
 
 /* Show privileged registers. */
 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
 {
 	uint64_t idtr, gdtr;
 
 	idtr = ridt();
 	db_printf("idtr\t0x%08x/%04x\n",
 	    (u_int)(idtr >> 16), (u_int)idtr & 0xffff);
 	gdtr = rgdt();
 	db_printf("gdtr\t0x%08x/%04x\n",
 	    (u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
 	db_printf("ldtr\t0x%04x\n", rldt());
 	db_printf("tr\t0x%04x\n", rtr());
 	db_printf("cr0\t0x%08x\n", rcr0());
 	db_printf("cr2\t0x%08x\n", rcr2());
 	db_printf("cr3\t0x%08x\n", rcr3());
 	db_printf("cr4\t0x%08x\n", rcr4());
 }
 #endif
 
 /*
  * Convert a hardware segment descriptor (sd) into its software form
  * (ssd).  The split base and limit fields are joined back together; the
  * type, privilege level, present, default-size and granularity fields
  * are copied across unchanged.  Only those seven fields of *ssd are
  * written.
  */
 void
 sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
 {
 	/* Reassemble the fields the hardware format stores in two parts. */
 	ssd->ssd_base = sd->sd_lobase | (sd->sd_hibase << 24);
 	ssd->ssd_limit = sd->sd_lolimit | (sd->sd_hilimit << 16);
 
 	/* Everything else maps one to one. */
 	ssd->ssd_type = sd->sd_type;
 	ssd->ssd_dpl = sd->sd_dpl;
 	ssd->ssd_p = sd->sd_p;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran = sd->sd_gran;
 }
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  *
  * If we cannot accurately determine the physical memory map, then use
  * value from the 0xE801 call, and failing that, the RTC.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(int first)
 {
 	int i, off, physmap_idx, pa_indx, da_indx;
 	int hasbrokenint12, has_smap;
 	u_long physmem_tunable;
 	u_int extmem;
 	struct vm86frame vmf;
 	struct vm86context vmc;
 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 	pt_entry_t *pte;
 	struct bios_smap *smap;
 	quad_t dcons_addr, dcons_size;
 
 	has_smap = 0;
 #ifdef XBOX
 	if (arch_i386_is_xbox) {
 		/*
 		 * We queried the memory size before, so chop off 4MB for
 		 * the framebuffer and inform the OS of this.
 		 */
 		physmap[0] = 0;
 		physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
 		physmap_idx = 0;
 		goto physmap_done;
 	}
 #endif
 
 	hasbrokenint12 = 0;
 	TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
 	bzero(&vmf, sizeof(vmf));
 	bzero(physmap, sizeof(physmap));
 	basemem = 0;
 
 	/*
 	 * Some newer BIOSes has broken INT 12H implementation which cause
 	 * kernel panic immediately. In this case, we need to scan SMAP
 	 * with INT 15:E820 first, then determine base memory size.
 	 */
 	if (hasbrokenint12) {
 		goto int15e820;
 	}
 
 	/*
 	 * Perform "base memory" related probes & setup
 	 */
 	vm86_intcall(0x12, &vmf);
 	basemem = vmf.vmf_ax;
 	if (basemem > 640) {
 		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 			basemem);
 		basemem = 640;
 	}
 
 	/*
 	 * XXX if biosbasemem is now < 640, there is a `hole'
 	 * between the end of base memory and the start of
 	 * ISA memory.  The hole may be empty or it may
 	 * contain BIOS code or data.  Map it read/write so
 	 * that the BIOS can write to it.  (Memory from 0 to
 	 * the physical end of the kernel is mapped read-only
 	 * to begin with and then parts of it are remapped.
 	 * The parts that aren't remapped form holes that
 	 * remain read-only and are unused by the kernel.
 	 * The base memory area is below the physical end of
 	 * the kernel and right now forms a read-only hole.
 	 * The part of it from PAGE_SIZE to
 	 * (trunc_page(biosbasemem * 1024) - 1) will be
 	 * remapped and used by the kernel later.)
 	 *
 	 * This code is similar to the code used in
 	 * pmap_mapdev, but since no memory needs to be
 	 * allocated we simply change the mapping.
 	 */
 	for (pa = trunc_page(basemem * 1024);
 	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
 		pmap_kenter(KERNBASE + pa, pa);
 
 	/*
 	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
 	 * the vm86 page table so that vm86 can scribble on them using
 	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
 	 * page 0, at least as initialized here?
 	 */
 	pte = (pt_entry_t *)vm86paddr;
 	for (i = basemem / 4; i < 160; i++)
 		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 
 int15e820:
 	/*
 	 * map page 1 R/W into the kernel page table so we can use it
 	 * as a buffer.  The kernel will unmap this page later.
 	 */
 	pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
 
 	/*
 	 * get memory map with INT 15:E820
 	 */
 	vmc.npages = 0;
 	smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
 	vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
 
 	physmap_idx = 0;
 	vmf.vmf_ebx = 0;
 	do {
 		vmf.vmf_eax = 0xE820;
 		vmf.vmf_edx = SMAP_SIG;
 		vmf.vmf_ecx = sizeof(struct bios_smap);
 		i = vm86_datacall(0x15, &vmf, &vmc);
 		if (i || vmf.vmf_eax != SMAP_SIG)
 			break;
 		if (boothowto & RB_VERBOSE)
 			printf("SMAP type=%02x base=%016llx len=%016llx\n",
 			    smap->type, smap->base, smap->length);
 		has_smap = 1;
 
 		if (smap->type != SMAP_TYPE_MEMORY)
 			continue;
 
 		if (smap->length == 0)
 			continue;
 
 #ifndef PAE
 		if (smap->base >= 0xffffffff) {
 			printf("%uK of memory above 4GB ignored\n",
 			    (u_int)(smap->length / 1024));
 			continue;
 		}
 #endif
 
 		for (i = 0; i <= physmap_idx; i += 2) {
 			if (smap->base < physmap[i + 1]) {
 				if (boothowto & RB_VERBOSE)
 					printf(
 	"Overlapping or non-monotonic memory region, ignoring second region\n");
 				continue;
 			}
 		}
 
 		if (smap->base == physmap[physmap_idx + 1]) {
 			physmap[physmap_idx + 1] += smap->length;
 			continue;
 		}
 
 		physmap_idx += 2;
 		if (physmap_idx == PHYSMAP_SIZE) {
 			printf(
 		"Too many segments in the physical address map, giving up\n");
 			break;
 		}
 		physmap[physmap_idx] = smap->base;
 		physmap[physmap_idx + 1] = smap->base + smap->length;
 	} while (vmf.vmf_ebx != 0);
 
 	/*
 	 * Perform "base memory" related probes & setup based on SMAP
 	 */
 	if (basemem == 0) {
 		for (i = 0; i <= physmap_idx; i += 2) {
 			if (physmap[i] == 0x00000000) {
 				basemem = physmap[i + 1] / 1024;
 				break;
 			}
 		}
 
 		/*
 		 * XXX this function is horribly organized and has to the same
 		 * things that it does above here.
 		 */
 		if (basemem == 0)
 			basemem = 640;
 		if (basemem > 640) {
 			printf(
 		    "Preposterous BIOS basemem of %uK, truncating to 640K\n",
 			    basemem);
 			basemem = 640;
 		}
 
 		/*
 		 * Let vm86 scribble on pages between basemem and
 		 * ISA_HOLE_START, as above.
 		 */
 		for (pa = trunc_page(basemem * 1024);
 		     pa < ISA_HOLE_START; pa += PAGE_SIZE)
 			pmap_kenter(KERNBASE + pa, pa);
 		pte = (pt_entry_t *)vm86paddr;
 		for (i = basemem / 4; i < 160; i++)
 			pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 	}
 
 	if (physmap[1] != 0)
 		goto physmap_done;
 
 	/*
 	 * If we failed above, try memory map with INT 15:E801
 	 */
 	vmf.vmf_ax = 0xE801;
 	if (vm86_intcall(0x15, &vmf) == 0) {
 		extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
 	} else {
 #if 0
 		vmf.vmf_ah = 0x88;
 		vm86_intcall(0x15, &vmf);
 		extmem = vmf.vmf_ax;
 #else
 		/*
 		 * Prefer the RTC value for extended memory.
 		 */
 		extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
 #endif
 	}
 
 	/*
 	 * Special hack for chipsets that still remap the 384k hole when
 	 * there's 16MB of memory - this really confuses people that
 	 * are trying to use bus mastering ISA controllers with the
 	 * "16MB limit"; they only have 16MB, but the remapping puts
 	 * them beyond the limit.
 	 *
 	 * If extended memory is between 15-16MB (16-17MB phys address range),
 	 *	chop it to 15MB.
 	 */
 	if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
 		extmem = 15 * 1024;
 
 	physmap[0] = 0;
 	physmap[1] = basemem * 1024;
 	physmap_idx = 2;
 	physmap[physmap_idx] = 0x100000;
 	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 
 physmap_done:
 	/*
 	 * Now, physmap contains a map of physical memory.
 	 */
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1]);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this 
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	/*
 	 * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
 	 * the amount of memory in the system.
 	 */
 	if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
 		Maxmem = atop(physmap[physmap_idx + 1]);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * If Maxmem has been increased beyond what the system has detected,
 	 * extend the last memory segment to the new limit.
 	 */ 
 	if (atop(physmap[physmap_idx + 1]) < Maxmem)
 		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 */
 	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= KERNLOAD && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | PG_N;
 			invltlb();
 
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 			/*
 			 * Adjust array of valid/good pages.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa;	/* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			if (full)
 				break;
 		}
 	}
 	*pte = 0;
 	invltlb();
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
 
 	/* Map the message buffer. */
 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 		    off);
 }
 
 /*
  * Early machine-dependent initialization.
  *
  * In order, this routine: wires up thread0's kernel stack and PCB, links
  * proc0 and thread0, locates the loader metadata and kernel environment,
  * builds and loads the GDT, initializes the per-CPU data, sets up the LDT
  * and the IDT, starts the i8254 and the console, initializes the debugger
  * hooks, finishes CPU identification, loads the initial TSS, fills in the
  * double-fault TSS, sizes memory through getmemsize(), initializes the
  * message buffer, builds the lcall system-call gates in the LDT and
  * finally fills in thread0's PCB.
  *
  * "first" is handed unchanged to getmemsize().
  */
 void
 init386(first)
 	int first;
 {
 	struct gate_descriptor *gdp;
 	int gsel_tss, metadata_missing, x;
 	struct pcpu *pc;
 
 	/* The PCB occupies the last sizeof(struct pcb) bytes of the kstack. */
 	thread0.td_kstack = proc0kstack;
 	thread0.td_pcb = (struct pcb *)
 	   (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 
 	/*
 	 * Pick up the loader metadata if bootinfo supplies it; its absence
 	 * is only reported once the console is up (see below).
 	 */
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep) {
 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 		preload_bootstrap_relocate(KERNBASE);
 	} else {
 		metadata_missing = 1;
 	}
 	/* A static environment (envmode == 1) overrides the bootinfo one. */
 	if (envmode == 1)
 		kern_envp = static_env;
 	else if (bootinfo.bi_envp)
 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * Make gdt memory segments.  All segments cover the full 4GB
 	 * of address space and permissions are enforced at page level.
 	 */
 	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 
 	/* Select the per-CPU area; GPRIV_SEL and the TSS are based on it. */
 #ifdef SMP
 	pc = &SMP_prvspace[0].pcpu;
 #else
 	pc = &__pcpu;
 #endif
 	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 
 	/* Convert the software descriptors and load the resulting GDT. */
 	for (x = 0; x < NGDT; x++)
 		ssdtosd(&gdt_segs[x], &gdt[x].sd);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (int) gdt;
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 	lgdt(&r_gdt);
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 		ssdtosd(&ldt_segs[x], &ldt[x].sd);
 
 	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	/*
 	 * exceptions: point every vector at the "rsvd" handler first, then
 	 * override the vectors that have a dedicated handler.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 		    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
 	    , GSEL(GCODE_SEL, SEL_KPL));
 	/* The double fault vector is a task gate, not a handler address. */
 	setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (int) idt;
 	lidt(&r_idt);
 
 #ifdef XBOX
 	/*
 	 * The following code queries the PCI ID of 0:0:0. For the XBOX,
 	 * This should be 0x10de / 0x02a5.
 	 *
 	 * This is exactly what Linux does.
 	 */
 	outl(0xcf8, 0x80000000);
 	if (inl(0xcfc) == 0x02a510de) {
 		arch_i386_is_xbox = 1;
 		pic16l_setled(XBOX_LED_GREEN);
 
 		/*
 		 * We are an XBOX, but we may have either 64MB or 128MB of
 		 * memory. The PCI host bridge should be programmed for this,
 		 * so we just query it. 
 		 */
 		outl(0xcf8, 0x80000084);
 		arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
 	}
 #endif /* XBOX */
 
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 	/* Deferred until now because the console was not available earlier. */
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 #ifdef DEV_ISA
 	elcr_probe();
 	atpic_startup();
 #endif
 
 #ifdef DDB
 	ksym_start = bootinfo.bi_symtab;
 	ksym_end = bootinfo.bi_esymtab;
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	/*
 	 * NOTE(review): IDT_UD and IDT_GP are installed again with the same
 	 * handlers as above; presumably finishidentcpu() can replace them
 	 * temporarily while probing -- confirm against its implementation.
 	 */
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	initializecpu();	/* Initialize CPU registers */
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
 	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 	ltr(gsel_tss);
 
 	/* pointer to selector slot for %fs/%gs */
 	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 
 	/*
 	 * Fill in dblfault_tss: every stack pointer refers to the top of
 	 * dblfault_stack and execution starts at dblfault_handler on the
 	 * boot page directory.
 	 */
 	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 #ifdef PAE
 	dblfault_tss.tss_cr3 = (int)IdlePDPT;
 #else
 	dblfault_tss.tss_cr3 = (int)IdlePTD;
 #endif
 	dblfault_tss.tss_eip = (int)dblfault_handler;
 	dblfault_tss.tss_eflags = PSL_KERNEL;
 	dblfault_tss.tss_ds = dblfault_tss.tss_es =
 	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 
 	/* Size physical memory; physmem then feeds the second tunable pass. */
 	vm86_initialize();
 	getmemsize(first);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 	/* make a call gate to reenter kernel with */
 	gdp = &ldt[LSYS5CALLS_SEL].gd;
 
 	/* The handler address is split across the low and high offsets. */
 	x = (int) &IDTVEC(lcall_syscall);
 	gdp->gd_looffset = x;
 	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 	gdp->gd_stkcpy = 1;
 	gdp->gd_type = SDT_SYS386CGT;
 	gdp->gd_dpl = SEL_UPL;
 	gdp->gd_p = 1;
 	gdp->gd_hioffset = x >> 16;
 
 	/* XXX does this work? */
 	/* XXX yes! */
 	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0;
 #ifdef PAE
 	thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
 #else
 	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 #endif
 	thread0.td_pcb->pcb_ext = 0;
 	thread0.td_frame = &proc0_tf;
 }
 
 /*
  * Machine-dependent part of per-CPU data initialization: presets the
  * ACPI id field to 0xffffffff.  The cpuid and size arguments are not
  * used here.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	pcpu->pc_acpi_id = 0xffffffff;
 }
 
 /*
  * Enter a spinlock section on behalf of the current thread.  On the
  * outermost entry (nesting count of zero) interrupts are disabled and the
  * previous interrupt state is saved in the thread's MD area.  Every entry
  * increments the nesting count and enters a critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	/* Only the outermost entry saves the flags and disables interrupts. */
 	if (td->td_md.md_spinlock_count == 0)
 		td->td_md.md_saved_flags = intr_disable();
 	td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section: exit the critical section, decrement the
  * nesting count and, once the count returns to zero, restore the
  * interrupt state that spinlock_enter() saved.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	critical_exit();
 	td->td_md.md_spinlock_count--;
 	/* Interrupt state is restored only when the outermost level exits. */
 	if (td->td_md.md_spinlock_count == 0)
 		intr_restore(td->td_md.md_saved_flags);
 }
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 static void f00f_hack(void *unused);
 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL)
 
 static void
 f00f_hack(void *unused)
 {
 	struct gate_descriptor *new_idt;
 	vm_offset_t tmp;
 
 	if (!has_f00f_bug)
 		return;
 
 	GIANT_REQUIRED;
 
 	printf("Intel Pentium detected, installing workaround for F00F bug\n");
 
 	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
 	if (tmp == 0)
 		panic("kmem_alloc returned 0");
 
 	/* Put the problematic entry (#6) at the end of the lower page. */
 	new_idt = (struct gate_descriptor*)
 	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 	bcopy(idt, new_idt, sizeof(idt0));
 	r_idt.rd_base = (u_int)new_idt;
 	lidt(&r_idt);
 	idt = new_idt;
 	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
 			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
 		panic("vm_map_protect failed");
 }
 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() calls this so that a
  * backtrace can begin at the function that entered the debugger: the
  * context is available in the trapframe, but the trace is driven from a
  * PCB.  Only the fields needed for a backtrace are filled in.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	/* Program counter and the registers copied from the frame. */
 	pcb->pcb_eip = tf->tf_eip;
 	pcb->pcb_ebp = tf->tf_ebp;
 	pcb->pcb_ebx = tf->tf_ebx;
 	pcb->pcb_esi = tf->tf_esi;
 	pcb->pcb_edi = tf->tf_edi;
 
 	/*
 	 * The frame's saved %esp is used only when the privilege bits of
 	 * the saved %cs are non-zero; otherwise the stack pointer is
 	 * derived from the address just past the frame, less 8 bytes.
 	 */
 	if (ISPL(tf->tf_cs))
 		pcb->pcb_esp = tf->tf_esp;
 	else
 		pcb->pcb_esp = (int)(tf + 1) - 8;
 }
 
 /*
  * Set the saved instruction pointer of the thread's trapframe to addr.
  * Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	frame->tf_eip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Copy the thread's saved general-purpose register state into *regs.
  * Everything comes from the trapframe except %gs, which is read from the
  * PCB.  Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 
 	/* Segment registers. */
 	regs->r_cs = frame->tf_cs;
 	regs->r_ss = frame->tf_ss;
 	regs->r_ds = frame->tf_ds;
 	regs->r_es = frame->tf_es;
 	regs->r_fs = frame->tf_fs;
 	regs->r_gs = td->td_pcb->pcb_gs;
 
 	/* General registers. */
 	regs->r_eax = frame->tf_eax;
 	regs->r_ebx = frame->tf_ebx;
 	regs->r_ecx = frame->tf_ecx;
 	regs->r_edx = frame->tf_edx;
 	regs->r_esi = frame->tf_esi;
 	regs->r_edi = frame->tf_edi;
 	regs->r_ebp = frame->tf_ebp;
 	regs->r_esp = frame->tf_esp;
 
 	/* Program counter and flags. */
 	regs->r_eip = frame->tf_eip;
 	regs->r_eflags = frame->tf_eflags;
 	return (0);
 }
 
 /*
  * Install the register state in *regs as the thread's saved state.
  * The request is rejected with EINVAL, before anything is modified, when
  * the new eflags or %cs fail the EFL_SECURE/CS_SECURE checks.  %gs is
  * stored in the PCB; all other registers go into the trapframe.
  * Returns 0 on success.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 
 	if (!(EFL_SECURE(regs->r_eflags, frame->tf_eflags) &&
 	    CS_SECURE(regs->r_cs)))
 		return (EINVAL);
 
 	/* Segment registers. */
 	frame->tf_cs = regs->r_cs;
 	frame->tf_ss = regs->r_ss;
 	frame->tf_ds = regs->r_ds;
 	frame->tf_es = regs->r_es;
 	frame->tf_fs = regs->r_fs;
 	td->td_pcb->pcb_gs = regs->r_gs;
 
 	/* General registers. */
 	frame->tf_eax = regs->r_eax;
 	frame->tf_ebx = regs->r_ebx;
 	frame->tf_ecx = regs->r_ecx;
 	frame->tf_edx = regs->r_edx;
 	frame->tf_esi = regs->r_esi;
 	frame->tf_edi = regs->r_edi;
 	frame->tf_ebp = regs->r_ebp;
 	frame->tf_esp = regs->r_esp;
 
 	/* Program counter and flags. */
 	frame->tf_eip = regs->r_eip;
 	frame->tf_eflags = regs->r_eflags;
 	return (0);
 }
 
 #ifdef CPU_ENABLE_SSE
 /*
  * Convert an XMM-format save area into the 387 layout: sv_87 is zeroed,
  * then the control/status environment words and the eight FPU
  * accumulators are copied over from sv_xmm.
  */
 static void
 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
 {
 	struct envxmm *src;
 	struct env87 *dst;
 	int reg;
 
 	src = &sv_xmm->sv_env;
 	dst = &sv_87->sv_env;
 
 	bzero(sv_87, sizeof(*sv_87));
 
 	/* FPU control/status */
 	dst->en_cw = src->en_cw;
 	dst->en_sw = src->en_sw;
 	dst->en_tw = src->en_tw;
 	dst->en_fip = src->en_fip;
 	dst->en_fcs = src->en_fcs;
 	dst->en_opcode = src->en_opcode;
 	dst->en_foo = src->en_foo;
 	dst->en_fos = src->en_fos;
 
 	/* FPU registers */
 	for (reg = 0; reg < 8; reg++)
 		sv_87->sv_ac[reg] = sv_xmm->sv_fp[reg].fp_acc;
 }
 
 /*
  * Load 387-layout state into an XMM-format save area: the control/status
  * environment words and the eight FPU accumulators are copied from sv_87
  * into sv_xmm.  Other parts of sv_xmm are left untouched.
  */
 static void
 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
 {
 	struct env87 *src;
 	struct envxmm *dst;
 	int reg;
 
 	src = &sv_87->sv_env;
 	dst = &sv_xmm->sv_env;
 
 	/* FPU control/status */
 	dst->en_cw = src->en_cw;
 	dst->en_sw = src->en_sw;
 	dst->en_tw = src->en_tw;
 	dst->en_fip = src->en_fip;
 	dst->en_fcs = src->en_fcs;
 	dst->en_opcode = src->en_opcode;
 	dst->en_foo = src->en_foo;
 	dst->en_fos = src->en_fos;
 
 	/* FPU registers */
 	for (reg = 0; reg < 8; reg++)
 		sv_xmm->sv_fp[reg].fp_acc = sv_87->sv_ac[reg];
 }
 #endif /* CPU_ENABLE_SSE */
 
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr) {
 		fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm,
 						(struct save87 *)fpregs);
 		return (0);
 	}
 #endif /* CPU_ENABLE_SSE */
 	bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
 	return (0);
 }
 
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr) {
 		set_fpregs_xmm((struct save87 *)fpregs,
 					   &td->td_pcb->pcb_save.sv_xmm);
 		return (0);
 	}
 #endif /* CPU_ENABLE_SSE */
 	bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs);
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fills *mcp from the thread's trapframe, PCB (%gs) and FPU state.  With
  * GET_MC_CLEAR_RET in flags, %eax and %edx are reported as zero and the
  * carry flag is cleared in the reported eflags.  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_esp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	/* Segment registers; %gs is kept in the PCB. */
 	mcp->mc_gs = td->td_pcb->pcb_gs;
 	mcp->mc_fs = frame->tf_fs;
 	mcp->mc_es = frame->tf_es;
 	mcp->mc_ds = frame->tf_ds;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_ss = frame->tf_ss;
 
 	/* Registers that are always copied straight from the frame. */
 	mcp->mc_edi = frame->tf_edi;
 	mcp->mc_esi = frame->tf_esi;
 	mcp->mc_ebp = frame->tf_ebp;
 	mcp->mc_isp = frame->tf_isp;
 	mcp->mc_ebx = frame->tf_ebx;
 	mcp->mc_ecx = frame->tf_ecx;
 	mcp->mc_eip = frame->tf_eip;
 	mcp->mc_esp = frame->tf_esp;
 
 	/* Return-value registers and flags depend on GET_MC_CLEAR_RET. */
 	mcp->mc_eflags = frame->tf_eflags;
 	if ((flags & GET_MC_CLEAR_RET) != 0) {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	} else {
 		mcp->mc_eax = frame->tf_eax;
 		mcp->mc_edx = frame->tf_edx;
 	}
 
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp);
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *tp;
 	int eflags, ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 	    (tp->tf_eflags & ~PSL_USERCHANGE);
 	if ((ret = set_fpcontext(td, mcp)) == 0) {
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_edi = mcp->mc_edi;
 		tp->tf_esi = mcp->mc_esi;
 		tp->tf_ebp = mcp->mc_ebp;
 		tp->tf_ebx = mcp->mc_ebx;
 		tp->tf_edx = mcp->mc_edx;
 		tp->tf_ecx = mcp->mc_ecx;
 		tp->tf_eax = mcp->mc_eax;
 		tp->tf_eip = mcp->mc_eip;
 		tp->tf_eflags = eflags;
 		tp->tf_esp = mcp->mc_esp;
 		tp->tf_ss = mcp->mc_ss;
 		td->td_pcb->pcb_gs = mcp->mc_gs;
 		ret = 0;
 	}
 	return (ret);
 }
 
 /*
  * Record the thread's FPU state in *mcp.  Without DEV_NPX the context is
  * simply marked as having no FPU device and no owned state.  Otherwise
  * npxgetregs() saves the state (through a 16-byte aligned scratch address
  * inside *mcp when needed) and mc_ownedfp/mc_fpformat are set from
  * npxgetregs() and npxformat().
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 #ifndef DEV_NPX
 	mcp->mc_fpformat = _MC_FPFMT_NODEV;
 	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
 #else
 	union savefpu *addr;
 
 	/*
 	 * XXX mc_fpstate might be misaligned, since its declaration is not
 	 * unportabilized using __attribute__((aligned(16))) like the
 	 * declaration of struct savemm, and anyway, alignment doesn't work
 	 * for auto variables since we don't use gcc's pessimal stack
 	 * alignment.  Work around this by abusing the spare fields after
 	 * mcp->mc_fpstate.
 	 *
 	 * XXX unpessimize most cases by only aligning when fxsave might be
 	 * called, although this requires knowing too much about
 	 * npxgetregs()'s internals.
 	 */
 	addr = (union savefpu *)&mcp->mc_fpstate;
 	if (td == PCPU_GET(fpcurthread) &&
 #ifdef CPU_ENABLE_SSE
 	    cpu_fxsr &&
 #endif
 	    ((uintptr_t)(void *)addr & 0xF)) {
 		/* Advance in 4-byte steps to the next 16-byte boundary. */
 		do
 			addr = (void *)((char *)addr + 4);
 		while ((uintptr_t)(void *)addr & 0xF);
 	}
 	mcp->mc_ownedfp = npxgetregs(td, addr);
 	/* If the aligned scratch address was used, move the data back. */
 	if (addr != (union savefpu *)&mcp->mc_fpstate) {
 		bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
 		bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	}
 	mcp->mc_fpformat = npxformat();
 #endif
 }
 
 /*
  * Install the FPU state described by *mcp for the thread.
  *
  * Returns 0 without doing anything when the context reports no FPU
  * device; EINVAL when mc_fpformat or mc_ownedfp holds an unrecognized
  * value; otherwise 0 after either dropping the thread's FPU state
  * (_MC_FPOWNED_NONE) or loading it from mc_fpstate (_MC_FPOWNED_FPU or
  * _MC_FPOWNED_PCB; the load itself happens only with DEV_NPX).
  */
 static int
 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 {
 	union savefpu *addr;
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 	    mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 		/* XXX align as above. */
 		addr = (union savefpu *)&mcp->mc_fpstate;
 		if (td == PCPU_GET(fpcurthread) &&
 #ifdef CPU_ENABLE_SSE
 		    cpu_fxsr &&
 #endif
 		    ((uintptr_t)(void *)addr & 0xF)) {
 			/*
 			 * Step to the next 16-byte boundary and copy the
 			 * state there; this writes into *mcp despite its
 			 * const qualifier.
 			 */
 			do
 				addr = (void *)((char *)addr + 4);
 			while ((uintptr_t)(void *)addr & 0xF);
 			bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate));
 		}
 #ifdef DEV_NPX
 #ifdef CPU_ENABLE_SSE
 		/* Strip MXCSR bits that are outside cpu_mxcsr_mask. */
 		if (cpu_fxsr)
 			addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask;
 #endif
 		/*
 		 * XXX we violate the dubious requirement that npxsetregs()
 		 * be called with interrupts disabled.
 		 */
 		npxsetregs(td, addr);
 #endif
 		/*
 		 * Don't bother putting things back where they were in the
 		 * misaligned case, since we know that the caller won't use
 		 * them again.
 		 */
 	} else
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Discard the thread's FPU state.  With interrupts disabled, npxdrop()
  * is called if td currently owns the FPU (DEV_NPX kernels only), and
  * PCB_NPXINITDONE is then cleared.
  *
  * NOTE(review): the flag is cleared in curthread's PCB rather than in
  * td's; the two coincide only if callers always pass curthread --
  * confirm against the callers.
  */
 static void
 fpstate_drop(struct thread *td)
 {
 	register_t s;
 
 	s = intr_disable();
 #ifdef DEV_NPX
 	if (PCPU_GET(fpcurthread) == td)
 		npxdrop();
 #endif
 	/*
 	 * XXX force a full drop of the npx.  The above only drops it if we
 	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 	 *
 	 * XXX I don't much like npxgetregs()'s semantics of doing a full
 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
 	 * sendsig() is the only caller of npxgetregs()... perhaps we just
 	 * have too many layers.
 	 */
 	curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE;
 	intr_restore(s);
 }
 
 /*
  * Report debug register contents in *dbregs.  With a NULL thread the
  * hardware registers %dr0-%dr7 are read directly; otherwise the values
  * saved in the thread's PCB are returned, with dr4 and dr5 reported as
  * zero.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td == NULL) {
 		dbregs->dr[0] = rdr0();
 		dbregs->dr[1] = rdr1();
 		dbregs->dr[2] = rdr2();
 		dbregs->dr[3] = rdr3();
 		dbregs->dr[4] = rdr4();
 		dbregs->dr[5] = rdr5();
 		dbregs->dr[6] = rdr6();
 		dbregs->dr[7] = rdr7();
 		return (0);
 	}
 
 	pcb = td->td_pcb;
 	dbregs->dr[0] = pcb->pcb_dr0;
 	dbregs->dr[1] = pcb->pcb_dr1;
 	dbregs->dr[2] = pcb->pcb_dr2;
 	dbregs->dr[3] = pcb->pcb_dr3;
 	/* The PCB keeps no copies of dr4 and dr5. */
 	dbregs->dr[4] = 0;
 	dbregs->dr[5] = 0;
 	dbregs->dr[6] = pcb->pcb_dr6;
 	dbregs->dr[7] = pcb->pcb_dr7;
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr4(dbregs->dr[4]);
 		load_dr5(dbregs->dr[5]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.	Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP.
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 		}
 		
 		pcb = td->td_pcb;
 		
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		pcb->pcb_flags |= PCB_DBREGS;
 	}
 
 	return (0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifndef DEV_APIC
 #include <machine/apicvar.h>
 
 /*
  * Provide stub functions so that the MADT APIC enumerator in the acpi
  * kernel module will link against a kernel without 'device apic'.
  *
  * XXX - This is a gross hack.
  */
 
 /* Stub: the enumerator is ignored. */
 void
 apic_register_enumerator(struct apic_enumerator *enumerator)
 {
 }
 
 /* Stub: no I/O APIC is ever created; always returns NULL. */
 void *
 ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase)
 {
 	return (NULL);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 ioapic_disable_pin(void *cookie, u_int pin)
 {
 	return (ENXIO);
 }
 
 /* Stub: always returns -1. */
 int
 ioapic_get_vector(void *cookie, u_int pin)
 {
 	return (-1);
 }
 
 /* Stub: does nothing. */
 void
 ioapic_register(void *cookie)
 {
 }
 
 /* Stub: always fails with ENXIO. */
 int
 ioapic_remap_vector(void *cookie, u_int pin, int vector)
 {
 	return (ENXIO);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 ioapic_set_extint(void *cookie, u_int pin)
 {
 	return (ENXIO);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 ioapic_set_nmi(void *cookie, u_int pin)
 {
 	return (ENXIO);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol)
 {
 	return (ENXIO);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger)
 {
 	return (ENXIO);
 }
 
 /* Stub: does nothing. */
 void
 lapic_create(u_int apic_id, int boot_cpu)
 {
 }
 
 /* Stub: does nothing. */
 void
 lapic_init(vm_paddr_t addr)
 {
 }
 
 /* Stub: always fails with ENXIO. */
 int
 lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode)
 {
 	return (ENXIO);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol)
 {
 	return (ENXIO);
 }
 
 /* Stub: always fails with ENXIO. */
 int
 lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger)
 {
 	return (ENXIO);
 }
 #endif
 
 #ifdef KDB
 
 /*
  * Out-of-line inb() and outb().  The usual versions are macros that
  * call inlined functions and therefore cannot be invoked from the
  * debugger, so real functions are provided here.
  *
  * The code itself is lifted from <machine/cpufunc.h> and de-inlined.
  */
 
 #undef inb
 #undef outb
 
 /* silence compiler warnings */
 u_char inb(u_int);
 void outb(u_int, u_char);
 
 /* Read one byte from the given I/O port. */
 u_char
 inb(u_int port)
 {
 	u_char	value;
 
 	/*
 	 * %%dx is named explicitly instead of %1 because the i/o is done
 	 * at %dx and not at %edx, while gcc generates inferior code (movw
 	 * instead of movl) if we tell it to load (u_short) port.
 	 */
 	__asm __volatile("inb %%dx,%0" : "=a" (value) : "d" (port));
 	return (value);
 }
 
 /* Write one byte to the given I/O port. */
 void
 outb(u_int port, u_char data)
 {
 	/*
 	 * The copy is not needed for correctness; it helps gcc's register
 	 * allocator.  This make a large difference for gcc-1.40 and a
 	 * tiny difference for gcc-2.6.0.  For gcc-1.40, the copy had to be
 	 * ``asm("ax")'' for best results.  gcc-2.6.0 can't handle this.
 	 */
 	u_char	value = data;
 
 	__asm __volatile("outb %0,%%dx" : : "a" (value), "d" (port));
 }
 
 #endif /* KDB */
Index: head/sys/i386/i386/pmap.c
===================================================================
--- head/sys/i386/i386/pmap.c	(revision 173360)
+++ head/sys/i386/i386/pmap.c	(revision 173361)
@@ -1,3670 +1,3677 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	In addition to hardware address maps, this
  *	module is called upon to provide software-use-only
  *	maps which may or may not be stored in the same
  *	form as hardware maps.  These pseudo-maps are
  *	used to store intermediate results from copy
  *	operations to and from address spaces.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_cpu.h"
 #include "opt_pmap.h"
 #include "opt_msgbuf.h"
 #include "opt_smp.h"
 #include "opt_xbox.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #ifdef SMP
 #include <sys/smp.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 
 #ifdef XBOX
 #include <machine/xbox.h>
 #endif
 
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #if defined(DIAGNOSTIC)
 #define PMAP_DIAGNOSTIC
 #endif
 
 #if !defined(PMAP_DIAGNOSTIC)
 #define PMAP_INLINE __inline
 #else
 #define PMAP_INLINE
 #endif
 
 #define PV_STATS
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /*
  * Get PDEs and PTEs for user/kernel address space
  */
 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 
 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
 
 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
     atomic_clear_int((u_int *)(pte), PG_W))
 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
 
 struct pmap kernel_pmap_store;
 LIST_HEAD(pmaplist, pmap);
 static struct pmaplist allpmaps;
 static struct mtx allpmaps_lock;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 int pgeflag = 0;		/* PG_G or-in */
 int pseflag = 0;		/* PG_PS or-in */
 
 static int nkpt;
 vm_offset_t kernel_vm_end;
 extern u_int32_t KERNend;
 
 #ifdef PAE
 pt_entry_t pg_nx;
 static uma_zone_t pdptzone;
 #endif
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 static int shpgperproc = PMAP_SHPGPERPROC;
 
 struct pv_chunk *pv_chunkbase;		/* KVA block for pv_chunks */
 int pv_maxchunks;			/* How many chunks we have KVA for */
 vm_offset_t pv_vafree;			/* freelist stored in the PTE */
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 struct sysmaps {
 	struct	mtx lock;
 	pt_entry_t *CMAP1;
 	pt_entry_t *CMAP2;
 	caddr_t	CADDR1;
 	caddr_t	CADDR2;
 };
 static struct sysmaps sysmaps_pcpu[MAXCPU];
 pt_entry_t *CMAP1 = 0;
 static pt_entry_t *CMAP3;
 caddr_t CADDR1 = 0, ptvmmap = 0;
 static caddr_t CADDR3;
 struct msgbuf *msgbufp = 0;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 #ifdef SMP
 extern pt_entry_t *SMPpt;
 #endif
 static pt_entry_t *PMAP1 = 0, *PMAP2;
 static pt_entry_t *PADDR1 = 0, *PADDR2;
 #ifdef SMP
 static int PMAP1cpu;
 static int PMAP1changedcpu;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, 
 	   &PMAP1changedcpu, 0,
 	   "Number of times pmap_pte_quick changed CPU with same PMAP1");
 #endif
 static int PMAP1changed;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, 
 	   &PMAP1changed, 0,
 	   "Number of times pmap_pte_quick changed PMAP1");
 static int PMAP1unchanged;
 SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, 
 	   &PMAP1unchanged, 0,
 	   "Number of times pmap_pte_quick didn't change PMAP1");
 static struct mtx PMAP2mutex;
 
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try);
 
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     vm_page_t *free);
 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
     vm_page_t *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 					vm_offset_t va);
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m);
 
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags);
 static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free);
 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 #ifdef PAE
 static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait);
 #endif
 
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
 /*
  * Move the kernel virtual free pointer to the next
  * 4MB.  This is used to help improve performance
  * by using a large (4MB) page for much of the kernel
  * (.text, .data, .bss)
  */
 static vm_offset_t
 pmap_kmem_choose(vm_offset_t addr)
 {
 
 #ifndef DISABLE_PSE
 	/*
 	 * When the CPU advertises PSE, round the address up to the next
 	 * page-directory-sized (PDRMASK + 1) boundary.
 	 */
 	if ((cpu_feature & CPUID_PSE) != 0)
 		return ((addr + PDRMASK) & ~PDRMASK);
 #endif
 	/* Otherwise the address is used as given. */
 	return (addr);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the i386 this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  */
 void
 pmap_bootstrap(vm_paddr_t firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
 	struct sysmaps *sysmaps;
 	int i;
 
 	/*
 	 * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too
 	 * large. It should instead be correctly calculated in locore.s and
 	 * not based on 'first' (which is a physical address, not a virtual
 	 * address, for the start of unused physical memory). The kernel
 	 * page tables are NOT double mapped and thus should not be included
 	 * in this calculation.
 	 */
 	virtual_avail = (vm_offset_t) KERNBASE + firstaddr;
 	virtual_avail = pmap_kmem_choose(virtual_avail);
 
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 * Its page directory is IdlePTD, addressed at KERNBASE + IdlePTD,
 	 * and it is the first entry placed on the allpmaps list.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD);
 #ifdef PAE
 	kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT);
 #endif
 	kernel_pmap->pm_active = -1;	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	LIST_INIT(&allpmaps);
 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 	nkpt = NKPT;
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 *
 	 * SYSMAP(c, p, v, n) hands out the next 'n' pages of KVA: 'v' gets
 	 * the current va cast to type 'c', 'p' gets the pointer to the first
 	 * PTE, and both cursors (va, pte) advance by 'n' pages/entries.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 	/*
 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
 	 * CMAP3 is used for the idle process page zeroing.
 	 * Each of the MAXCPU sysmaps slots gets its own mutex and its own
 	 * CMAP1/CMAP2 pair.
 	 */
 	for (i = 0; i < MAXCPU; i++) {
 		sysmaps = &sysmaps_pcpu[i];
 		mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF);
 		SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1)
 		SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1)
 	}
 	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
 	SYSMAP(caddr_t, CMAP3, CADDR3, 1)
 	*CMAP3 = 0;
 
 	/*
 	 * Crashdump maps.
 	 */
 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 
 	/*
 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
 	 */
 	SYSMAP(caddr_t, unused, ptvmmap, 1)
 
 	/*
 	 * msgbufp is used to map the system message buffer.
 	 */
 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE)))
 
 	/*
 	 * ptemap is used for pmap_pte_quick
 	 */
 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1);
 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1);
 
 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
 
 	/* Everything handed out above is no longer available KVA. */
 	virtual_avail = va;
 
 	*CMAP1 = 0;
 
 	/*
 	 * Leave in place an identity mapping (virt == phys) for the low 1 MB
 	 * physical memory region that is used by the ACPI wakeup code.  This
 	 * mapping must not have PG_G set.
 	 * The loop below starts at index 1, so PTD[0] is the entry that is
 	 * kept while PTD[1] .. PTD[NKPT - 1] are cleared.
 	 */
 #ifdef XBOX
 	/* FIXME: This is gross, but needed for the XBOX. Since we are in such
 	 * an early stage, we cannot yet neatly map video memory ... :-(
 	 * Better fixes are very welcome! */
 	if (!arch_i386_is_xbox)
 #endif
 	for (i = 1; i < NKPT; i++)
 		PTD[i] = 0;
 
 	/* Initialize the PAT MSR if present. */
 	pmap_init_pat();
 
 	/* Turn on PG_G on kernel page(s) */
 	pmap_set_pg();
 }
 
 /*
  * Setup the PAT MSR.
  */
 void
 pmap_init_pat(void)
 {
 	uint64_t msr;
 
 	/* CPUs that do not implement PAT have nothing to program. */
 	if ((cpu_feature & CPUID_PAT) == 0)
 		return;
 
 	/* Either configuration is an edit of the current MSR contents. */
 	msr = rdmsr(MSR_PAT);
 #ifdef PAT_WORKS
 	/*
 	 * Indices 0-3 keep their defaults (WB, WT, UC-, UC) and so do
 	 * indices 6 and 7.  Only entries 4 and 5 are reprogrammed, as
 	 * write-protected and write-combining respectively.
 	 */
 	msr &= ~(PAT_MASK(4) | PAT_MASK(5));
 	msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) |
 	    PAT_VALUE(5, PAT_WRITE_COMBINING);
 #else
 	/*
 	 * Intel errata make the upper four PAT entries unsafe to use, so
 	 * stay within the lower four and turn index 2 (UC- by default)
 	 * into write-combining.
 	 *
 	 *   Intel Pentium III Processor Specification Update
 	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
 	 * or Mode C Paging)
 	 *
 	 *   Intel Pentium IV  Processor Specification Update
 	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
 	 */
 	msr &= ~PAT_MASK(2);
 	msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
 #endif
 	wrmsr(MSR_PAT, msr);
 }
 
 /*
  * Set PG_G on kernel pages.  Only the BSP calls this when SMP is turned on.
  */
 void
 pmap_set_pg(void)
 {
 	pd_entry_t pde;
 	pt_entry_t *ptep;
 	vm_offset_t endva, va;
 	int pdi;
 
 	/* pgeflag is zero: there is no bit to OR into the entries. */
 	if (pgeflag == 0)
 		return;
 
 	endva = KERNBASE + KERNend;
 
 	if (!pseflag) {
 		/*
 		 * pseflag is clear: walk the PTEs one page at a time from
 		 * btext and OR pgeflag into every non-zero entry.
 		 */
 		for (va = (vm_offset_t)btext; va < endva; va += PAGE_SIZE) {
 			ptep = vtopte(va);
 			if (*ptep)
 				*ptep |= pgeflag;
 			invltlb();	/* Play it safe, invltlb() every time */
 		}
 		return;
 	}
 
 	/*
 	 * pseflag is set: walk the page directory in NBPDR steps starting
 	 * at the kernel load address and OR pgeflag into each entry,
 	 * updating both the kernel pmap's directory and PTD.
 	 */
 	pdi = KPTDI + KERNLOAD/NBPDR;
 	for (va = KERNBASE + KERNLOAD; va < endva; va += NBPDR, pdi++) {
 		pde = kernel_pmap->pm_pdir[pdi];
 		pde |= pgeflag;
 		kernel_pmap->pm_pdir[pdi] = PTD[pdi] = pde;
 		invltlb();	/* Play it safe, invltlb() every time */
 	}
 }
 
 /*
  * Initialize a vm_page's machine-dependent fields.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	/* A fresh page has no pv entries: zero count, empty list. */
 	m->md.pv_list_count = 0;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 #ifdef PAE
 
 static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt");
 
 /*
  * Backing-page allocator installed on the PDPT zone.  Marks the slab as
  * UMA_SLAB_PRIV and returns one physically contiguous page taken from
  * below 4GB.  The 'zone', 'bytes' and 'wait' arguments are not consulted.
  */
 static void *
 pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
 {
 	void *page;
 
 	*flags = UMA_SLAB_PRIV;
 	page = contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL,
 	    1, 0);
 	return (page);
 }
 #endif
 
 /*
  * ABuse the pte nodes for unmapped kva to thread a kva freelist through.
  * Requirements:
  *  - Must deal with pages in order to ensure that none of the PG_* bits
  *    are ever set, PG_V in particular.
  *  - Assumes we can write to ptes without pte_store() atomic ops, even
  *    on PAE systems.  This should be ok.
  *  - Assumes nothing will ever test these addresses for 0 to indicate
  *    no mapping instead of correctly checking PG_V.
  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
  * Because PG_V is never set, there can be no mappings to invalidate.
  */
 static vm_offset_t
 pmap_ptelist_alloc(vm_offset_t *head)
 {
 	pt_entry_t *link;
 	vm_offset_t va;
 
 	/* Pop the first va off the list; 0 means the list is empty. */
 	va = *head;
 	if (va != 0) {
 		/* The pte slot of 'va' holds the next va on the freelist. */
 		link = vtopte(va);
 		*head = *link;
 		if ((*head & PG_V) != 0)
 			panic("pmap_ptelist_alloc: va with PG_V set!");
 		*link = 0;
 	}
 	return (va);
 }
 
 /*
  * Push 'va' onto the freelist rooted at '*head': the previous head is
  * stashed in the pte slot belonging to 'va', and 'va' becomes the head.
  */
 static void
 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
 {
 	pt_entry_t *link;
 
 	if ((va & PG_V) != 0)
 		panic("pmap_ptelist_free: freeing va with PG_V set!");
 	link = vtopte(va);
 	/* The stored value is a virtual address; its PG_V bit is clear. */
 	*link = *head;
 	*head = va;
 }
 
 /*
  * Build a freelist out of the 'npages' pages starting at 'base'.  Pages
  * are pushed from the highest address down, so the lowest page ends up
  * at the head of the list.
  */
 static void
 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
 {
 	int idx;
 
 	*head = 0;
 	idx = npages;
 	while (idx-- > 0)
 		pmap_ptelist_free(head, (vm_offset_t)base + idx * PAGE_SIZE);
 }
 
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  */
 void
 pmap_init(void)
 {
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 *
 	 * The default limit is derived from shpgperproc, maxproc and the
 	 * page count; the vm.pmap.pv_entries tunable, when set, overrides
 	 * it.  The limit is then rounded up to a multiple of _NPCPV and the
 	 * high water mark is placed at roughly 90% of it.
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	/*
 	 * Reserve one page of KVA per possible pv chunk (never fewer than
 	 * maxproc chunks) and thread all of those pages onto pv_vafree.
 	 */
 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
 	pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map,
 	    PAGE_SIZE * pv_maxchunks);
 	if (pv_chunkbase == NULL)
 		panic("pmap_init: not enough kvm for pv chunks");
 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 #ifdef PAE
 	/*
 	 * PDPTs come from their own zone, aligned to their own size, with
 	 * pmap_pdpt_allocf supplying the backing pages.
 	 */
 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 #endif
 }
 
 
 SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0,
 	"Max number of PV entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0,
 	"Page share factor per proc");
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 /*
  * Determine the appropriate bits to set in a PTE or PDE for a specified
  * caching mode.
  */
 static int
 pmap_cache_bits(int mode, boolean_t is_pde)
 {
 	int bits, index, pat_bit;
 
 	/*
 	 * Without PAT support the extended modes are not available;
 	 * treat each of them as plain uncacheable.
 	 */
 	if ((cpu_feature & CPUID_PAT) == 0 &&
 	    (mode == PAT_UNCACHED || mode == PAT_WRITE_COMBINING ||
 	    mode == PAT_WRITE_PROTECTED))
 		mode = PAT_UNCACHEABLE;
 
 	/* Pick the PAT slot that holds the requested caching mode. */
 	switch (mode) {
 #ifdef PAT_WORKS
 	case PAT_WRITE_BACK:
 		index = 0;
 		break;
 	case PAT_WRITE_THROUGH:
 		index = 1;
 		break;
 	case PAT_UNCACHED:
 		index = 2;
 		break;
 	case PAT_UNCACHEABLE:
 		index = 3;
 		break;
 	case PAT_WRITE_PROTECTED:
 		index = 4;
 		break;
 	case PAT_WRITE_COMBINING:
 		index = 5;
 		break;
 #else
 	case PAT_WRITE_BACK:
 		index = 0;
 		break;
 	case PAT_WRITE_THROUGH:
 		index = 1;
 		break;
 	case PAT_WRITE_COMBINING:
 		index = 2;
 		break;
 	case PAT_UNCACHED:
 	case PAT_UNCACHEABLE:
 	case PAT_WRITE_PROTECTED:
 		index = 3;
 		break;
 #endif
 	default:
 		panic("Unknown caching mode %d\n", mode);
 	}
 
 	/*
 	 * Spread the 3-bit slot number over the entry's PAT, PCD and PWT
 	 * bits.  The PAT bit lives in a different position in a PDE than
 	 * in a PTE.
 	 */
 	pat_bit = is_pde ? PG_PDE_PAT : PG_PTE_PAT;
 	bits = (index & 0x4) ? pat_bit : 0;
 	if (index & 0x2)
 		bits |= PG_NC_PCD;
 	if (index & 0x1)
 		bits |= PG_NC_PWT;
 	return (bits);
 }
 #ifdef SMP
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  */
 /*
  * Invalidate the TLB entry for 'va' on every CPU that may be using
  * 'pmap'.  The thread stays pinned across the local flush and the
  * shootdown request.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	u_int self, others;
 
 	sched_pin();
 	if (pmap != kernel_pmap && pmap->pm_active != all_cpus) {
 		/* Only touch the CPUs on which this pmap is active. */
 		self = PCPU_GET(cpumask);
 		others = PCPU_GET(other_cpus);
 		if ((pmap->pm_active & self) != 0)
 			invlpg(va);
 		if ((pmap->pm_active & others) != 0)
 			smp_masked_invlpg(pmap->pm_active & others, va);
 	} else {
 		/* Kernel pmap, or active everywhere: flush on all CPUs. */
 		invlpg(va);
 		smp_invlpg(va);
 	}
 	sched_unpin();
 }
 
 /*
  * Invalidate the TLB entries for [sva, eva), one page at a time locally,
  * on every CPU that may be using 'pmap'.  The thread stays pinned across
  * the local flush and the shootdown request.
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	u_int self, others;
 	vm_offset_t va;
 
 	sched_pin();
 	if (pmap != kernel_pmap && pmap->pm_active != all_cpus) {
 		/* Only touch the CPUs on which this pmap is active. */
 		self = PCPU_GET(cpumask);
 		others = PCPU_GET(other_cpus);
 		if ((pmap->pm_active & self) != 0) {
 			for (va = sva; va < eva; va += PAGE_SIZE)
 				invlpg(va);
 		}
 		if ((pmap->pm_active & others) != 0)
 			smp_masked_invlpg_range(pmap->pm_active & others,
 			    sva, eva);
 	} else {
 		/* Kernel pmap, or active everywhere: flush on all CPUs. */
 		for (va = sva; va < eva; va += PAGE_SIZE)
 			invlpg(va);
 		smp_invlpg_range(sva, eva);
 	}
 	sched_unpin();
 }
 
 /*
  * Flush the whole TLB on every CPU that may be using 'pmap'.  The thread
  * stays pinned across the local flush and the shootdown request.
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	u_int self, others;
 
 	sched_pin();
 	if (pmap != kernel_pmap && pmap->pm_active != all_cpus) {
 		/* Only touch the CPUs on which this pmap is active. */
 		self = PCPU_GET(cpumask);
 		others = PCPU_GET(other_cpus);
 		if ((pmap->pm_active & self) != 0)
 			invltlb();
 		if ((pmap->pm_active & others) != 0)
 			smp_masked_invltlb(pmap->pm_active & others);
 	} else {
 		/* Kernel pmap, or active everywhere: flush on all CPUs. */
 		invltlb();
 		smp_invltlb();
 	}
 	sched_unpin();
 }
 
 /*
  * Write back and invalidate the caches: wbinvd on this CPU, then
  * smp_cache_flush() for the rest, with the thread pinned in between.
  */
 void
 pmap_invalidate_cache(void)
 {
 
 	sched_pin();
 	wbinvd();
 	smp_cache_flush();
 	sched_unpin();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, 486+ invalidation functions.
  * We inline these within pmap.c for speed.
  */
 /*
  * Uniprocessor version: flush the TLB entry for 'va' unless 'pmap' is a
  * user pmap that is not active.
  */
 PMAP_INLINE void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	if (pmap != kernel_pmap && !pmap->pm_active)
 		return;
 	invlpg(va);
 }
 
 /*
  * Uniprocessor version: flush the TLB entries for [sva, eva) page by
  * page unless 'pmap' is a user pmap that is not active.
  */
 PMAP_INLINE void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va;
 
 	if (pmap != kernel_pmap && !pmap->pm_active)
 		return;
 	va = sva;
 	while (va < eva) {
 		invlpg(va);
 		va += PAGE_SIZE;
 	}
 }
 
 /*
  * Uniprocessor version: flush the whole TLB unless 'pmap' is a user pmap
  * that is not active.
  */
 PMAP_INLINE void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	if (pmap != kernel_pmap && !pmap->pm_active)
 		return;
 	invltlb();
 }
 
 /*
  * Uniprocessor version: write back and invalidate this CPU's caches.
  */
 PMAP_INLINE void
 pmap_invalidate_cache(void)
 {
 
 	wbinvd();
 }
 #endif /* !SMP */
 
 /*
  * Are we current address space or kernel?  N.B. We return FALSE when
  * a pmap's page table is in use because a kernel thread is borrowing
  * it.  The borrowed page table can change spontaneously, making any
  * dependence on its continued use subject to a race condition.
  */
 static __inline int
 pmap_is_current(pmap_t pmap)
 {
 
 	/* The kernel pmap always counts as current. */
 	if (pmap == kernel_pmap)
 		return (1);
 	/* It must be the pmap of the running thread's process ... */
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return (0);
 	/* ... and its page directory must be the one installed now. */
 	return ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
 	    (PTDpde[0] & PG_FRAME));
 }
 
 /*
  * If the given pmap is not the current or kernel pmap, the returned pte must
  * be released by passing it to pmap_pte_release().
  */
 pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t frame;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 
 	/* A PG_PS directory entry is returned in place of a pte. */
 	if (*pde & PG_PS)
 		return (pde);
 
 	/* An empty directory entry means there is no pte to return. */
 	if (*pde == 0)
 		return (0);
 
 	/* are we current address space or kernel? */
 	if (pmap_is_current(pmap))
 		return (vtopte(va));
 
 	/*
 	 * Foreign pmap: map its page table page at PADDR2 through PMAP2.
 	 * PMAP2mutex is taken here and stays held on return; it is dropped
 	 * by pmap_pte_release().
 	 */
 	mtx_lock(&PMAP2mutex);
 	frame = *pde & PG_FRAME;
 	if ((*PMAP2 & PG_FRAME) != frame) {
 		*PMAP2 = frame | PG_RW | PG_V | PG_A | PG_M;
 		pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2);
 	}
 	return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 }
 
 /*
  * Releases a pte that was obtained from pmap_pte().  Be prepared for the pte
  * being NULL.
  */
 static __inline void
 pmap_pte_release(pt_entry_t *pte)
 {
 	pt_entry_t *window;
 
 	/*
 	 * Only ptes that live in the PADDR2 page were handed out with
 	 * PMAP2mutex held; anything else (NULL included) needs no work.
 	 */
 	window = (pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK);
 	if (window == PADDR2)
 		mtx_unlock(&PMAP2mutex);
 }
 
 /*
  * Flush the local TLB entry for the virtual address 'caddr'.
  */
 static __inline void
 invlcaddr(void *caddr)
 {
 
 	invlpg((u_int)caddr);
 }
 
 /*
  * Super fast pmap_pte routine best used when scanning
  * the pv lists.  This eliminates many coarse-grained
  * invltlb calls.  Note that many of the pv list
  * scans are across different pmaps.  It is very wasteful
  * to do an entire invltlb for checking a single mapping.
  *
  * If the given pmap is not the current pmap, vm_page_queue_mtx
  * must be held and curthread pinned to a CPU.
  */
 static pt_entry_t *
 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	/* A PG_PS directory entry is returned in place of a pte. */
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		/*
 		 * Foreign pmap: view its page table page at PADDR1 through
 		 * PMAP1.  Only the local TLB entry is flushed, and only when
 		 * the window was retargeted or (SMP) was last set up on a
 		 * different CPU.  The three counters record which case hit.
 		 */
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP1 & PG_FRAME) != newpf) {
 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP1cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR1);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP1cpu != PCPU_GET(cpuid)) {
 			PMAP1cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR1);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	/* Empty directory entry: there is no pte. */
 	return (0);
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t pa;
 	pt_entry_t *ptep;
 	pd_entry_t pde;
 
 	/* 0 is returned when the directory entry for 'va' is empty. */
 	pa = 0;
 	PMAP_LOCK(pmap);
 	pde = pmap->pm_pdir[va >> PDRSHIFT];
 	if ((pde & PG_PS) != 0) {
 		/* PG_PS entry: the frame comes straight from the PDE. */
 		pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 	} else if (pde != 0) {
 		ptep = pmap_pte(pmap, va);
 		pa = (*ptep & PG_FRAME) | (va & PAGE_MASK);
 		pmap_pte_release(ptep);
 	}
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde;
 	pt_entry_t pte;
 	vm_page_t held;
 	int need_write;
 
 	/* NULL is returned when nothing suitable is mapped at 'va'. */
 	held = NULL;
 	need_write = (prot & VM_PROT_WRITE) != 0;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	pde = *pmap_pde(pmap, va);
 	if ((pde & PG_PS) != 0) {
 		/* PG_PS entry: permissions and frame come from the PDE. */
 		if ((pde & PG_RW) || !need_write) {
 			held = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) |
 			    (va & PDRMASK));
 			vm_page_hold(held);
 		}
 	} else if (pde != 0) {
 		/* pmap_pte_quick() wants the thread pinned. */
 		sched_pin();
 		pte = *pmap_pte_quick(pmap, va);
 		if (pte != 0 && ((pte & PG_RW) || !need_write)) {
 			held = PHYS_TO_VM_PAGE(pte & PG_FRAME);
 			vm_page_hold(held);
 		}
 		sched_unpin();
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 	return (held);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Add a wired page to the kva.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void 
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 
 	/* Writable, valid, plus pgeflag; no TLB invalidation is done here. */
 	pte_store(vtopte(va), pa | PG_RW | PG_V | pgeflag);
 }
 
 /*
  * Same as pmap_kenter(), with the cache-control bits for caching mode
  * 'mode' (computed for a PTE) added to the entry.
  */
 PMAP_INLINE void 
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 
 	pte_store(vtopte(va),
 	    pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0));
 }
 
 /*
  * Remove a page from the kernel pagetables.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 
 	/* Just clear the entry; no TLB invalidation is done here. */
 	pte_clear(vtopte(va));
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	vm_offset_t first, va;
 
 	/* 'prot' is not consulted: pmap_kenter() always maps read/write. */
 	first = *virt;
 	for (va = first; start < end; va += PAGE_SIZE, start += PAGE_SIZE)
 		pmap_kenter(va, start);
 	pmap_invalidate_range(kernel_pmap, first, va);
 
 	/* Report the first unused address back to the caller. */
 	*virt = va;
 	return (first);
 }
 
 
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *ptes, seen;
 	int i;
 
 	/*
 	 * 'seen' accumulates the bits of every entry that gets overwritten
 	 * so that the shootdown can be skipped when none of them was valid.
 	 */
 	seen = 0;
 	ptes = vtopte(sva);
 	for (i = 0; i < count; i++) {
 		seen |= ptes[i];
 		pte_store(&ptes[i], VM_PAGE_TO_PHYS(ma[i]) | pgeflag | PG_RW | PG_V);
 	}
 	if ((seen & PG_V) != 0)
 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
 
 /*
  * This routine tears out page mappings from the
  * kernel -- it is meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 
 	/* Clear 'count' consecutive entries, then shoot the range down. */
 	for (va = sva; count > 0; count--, va += PAGE_SIZE)
 		pmap_kremove(va);
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Release every page on the list 'free', which is linked through the
  * pages' 'right' field, via vm_page_free_zero().
  */
 static PMAP_INLINE void
 pmap_free_zero_pages(vm_page_t free)
 {
 	vm_page_t m, next;
 
 	for (m = free; m != NULL; m = next) {
 		/* Fetch the link before the page is given back. */
 		next = m->right;
 		vm_page_free_zero(m);
 	}
 }
 
 /*
  * This routine drops one wire reference on a page table page; when the
  * wire count reaches zero, the page is unmapped and queued for freeing.
  */
 static PMAP_INLINE int
 pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
 {
 
 	/* Still referenced after the decrement: nothing was freed. */
 	if (--m->wire_count != 0)
 		return (0);
 	return (_pmap_unwire_pte_hold(pmap, m, free));
 }
 
 /*
  * Unmap the page table page 'm' from 'pmap' and push it onto the '*free'
  * list.  Always returns 1.
  */
 static int 
 _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free)
 {
 	vm_offset_t pteva;
 
 	/*
 	 * unmap the page table page
 	 */
 	pmap->pm_pdir[m->pindex] = 0;
 	--pmap->pm_stats.resident_count;
 
 	atomic_subtract_int(&cnt.v_wire_count, 1);
 
 	/*
 	 * Invalidate the single address derived from the page's index so
 	 * that the removed mapping takes effect immediately.
 	 */
 	pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex);
 	pmap_invalidate_page(pmap, pteva);
 
 	/* 
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	m->right = *free;
 	*free = m;
 
 	return 1;
 }
 
 /*
  * Called once a page table entry for va has been removed: releases the
  * reference that entry held on its page table page, which may cause the
  * page table page to be queued on *free.  Addresses at or above
  * VM_MAXUSER_ADDRESS are left alone and yield 0.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free)
 {
 	vm_page_t ptpage;
 
 	if (va < VM_MAXUSER_ADDRESS) {
 		ptpage = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va) & PG_FRAME);
 		return (pmap_unwire_pte_hold(pmap, ptpage, free));
 	}
 	return (0);
 }
 
 /*
  * Initialize the given pmap so that it uses the IdlePTD page directory
  * (and IdlePDPT under PAE), record it as this CPU's curpmap, and link it
  * onto the global allpmaps list.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 	/* IdlePTD/IdlePDPT are offset by KERNBASE to form usable pointers. */
 	pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD);
 #ifdef PAE
 	pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT);
 #endif
 	pmap->pm_active = 0;
 	PCPU_SET(curpmap, pmap);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	/* The allpmaps list is protected by a spin mutex. */
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * With this change the routine returns 1 on success and 0 when kernel
  * virtual address space for the page directory cannot be obtained; in
  * the failure case the pmap lock is destroyed again before returning.
  */
-void
+int
 pmap_pinit(pmap_t pmap)
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	vm_paddr_t pa;
 	static int color;
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
 
 	/*
 	 * No need to allocate page table space yet but we do need a valid
 	 * page directory table.
 	 */
 	if (pmap->pm_pdir == NULL) {
 		pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map,
 		    NBPTD);
+
+		if (pmap->pm_pdir == NULL) {
+			PMAP_LOCK_DESTROY(pmap);
+			return (0);
+		}
 #ifdef PAE
 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
 		    ("pmap_pinit: pdpt misaligned"));
 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
 		    ("pmap_pinit: pdpt above 4g"));
 #endif
 	}
 
 	/*
 	 * allocate the page directory page(s)
 	 *
 	 * The index only advances on success; after a failed allocation
 	 * we wait (VM_WAIT) and try the same slot again.
 	 */
 	for (i = 0; i < NPGPTD;) {
 		m = vm_page_alloc(NULL, color++,
 		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (m == NULL)
 			VM_WAIT;
 		else {
 			ptdpg[i++] = m;
 		}
 	}
 
 	/* Map the new pages at the KVA reserved for the page directory. */
 	pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD);
 
 	/* Zero by hand any page that did not come back flagged PG_ZERO. */
 	for (i = 0; i < NPGPTD; i++) {
 		if ((ptdpg[i]->flags & PG_ZERO) == 0)
 			bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE);
 	}
 
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 	/* Wire in kernel global address entries. */
 	/* XXX copies current process, does not fill in MPPTDI */
 	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t));
 #ifdef SMP
 	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
 #endif
 
 	/* install self-referential address mapping entry(s) */
 	for (i = 0; i < NPGPTD; i++) {
 		pa = VM_PAGE_TO_PHYS(ptdpg[i]);
 		pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M;
 #ifdef PAE
 		pmap->pm_pdpt[i] = pa | PG_V;
 #endif
 	}
 
 	pmap->pm_active = 0;
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+
+	return (1);
 }
 
 /*
  * this routine is called if the page table page is not
  * mapped correctly.
  *
  * Allocates a zeroed, wired page and installs it in the page directory
  * at index ptepindex.  Returns the new page, or NULL when no page could
  * be allocated.  With M_WAITOK the pmap lock and the page queues lock
  * are released around VM_WAIT and re-taken before NULL is returned, so
  * the caller is expected to re-examine the page directory and retry.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags)
 {
 	vm_paddr_t ptepa;
 	vm_page_t m;
 
 	/* Exactly one of M_NOWAIT and M_WAITOK must be set. */
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (flags & M_WAITOK) {
 			/* Release both locks while waiting for free memory. */
 			PMAP_UNLOCK(pmap);
 			vm_page_unlock_queues();
 			VM_WAIT;
 			vm_page_lock_queues();
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	/* VM_ALLOC_ZERO is only a preference; zero the page if needed. */
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	pmap->pm_stats.resident_count++;
 
 	ptepa = VM_PAGE_TO_PHYS(m);
 	pmap->pm_pdir[ptepindex] =
 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
 
 	return m;
 }
 
 /*
  * Return the page table page that maps va in pmap, with an additional
  * wire-count reference taken on it.  If no page table page is present,
  * one is allocated through _pmap_allocpte().  With M_WAITOK the lookup
  * is retried until it succeeds; with M_NOWAIT NULL may be returned.
  */
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags)
 {
 	unsigned ptepindex;
 	pd_entry_t ptepa;
 	vm_page_t m;
 
 	KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT ||
 	    (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK,
 	    ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK"));
 
 	/*
 	 * Calculate pagetable page index
 	 */
 	ptepindex = va >> PDRSHIFT;
 retry:
 	/*
 	 * Get the page directory entry
 	 */
 	ptepa = pmap->pm_pdir[ptepindex];
 
 	/*
 	 * This supports switching from a 4MB page to a
 	 * normal 4K page.
 	 *
 	 * The large-page entry is simply discarded; the resident count
 	 * drops by the number of base pages it covered.
 	 */
 	if (ptepa & PG_PS) {
 		pmap->pm_pdir[ptepindex] = 0;
 		ptepa = 0;
 		pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 		pmap_invalidate_all(kernel_pmap);
 	}
 
 	/*
 	 * If the page table page is mapped, we just increment the
 	 * hold count, and activate it.
 	 */
 	if (ptepa) {
 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 		m->wire_count++;
 	} else {
 		/*
 		 * Here if the pte page isn't mapped, or if it has
 		 * been deallocated. 
 		 */
 		m = _pmap_allocpte(pmap, ptepindex, flags);
 		/* A NULL result under M_WAITOK means "look again". */
 		if (m == NULL && (flags & M_WAITOK))
 			goto retry;
 	}
 	return (m);
 }
 
 
 /***************************************************
 * Pmap allocation/deallocation routines.
  ***************************************************/
 
 #ifdef SMP
 /*
  * Deal with a SMP shootdown of other users of the pmap that we are
  * trying to dispose of.  This can be a bit hairy.
  */
 static u_int *lazymask;
 static u_int lazyptd;
 static volatile u_int lazywait;
 
 void pmap_lazyfix_action(void);
 
 /*
  * Per-CPU half of the lazy pmap fix-up; presumably run on the target CPU
  * in response to the IPI_LAZYPMAP sent by pmap_lazyfix().  If this CPU
  * still has the dying pmap's page directory loaded in %cr3, switch back
  * to the current pcb's page directory.  Then clear this CPU's bit in the
  * mask that lazymask points at and set lazywait so the waiting initiator
  * can proceed.
  */
 void
 pmap_lazyfix_action(void)
 {
 	u_int mymask = PCPU_GET(cpumask);
 
 #ifdef COUNT_IPIS
 	/*
 	 * The dereference must be parenthesized: postfix ++ binds tighter
 	 * than unary *, so the unparenthesized form would advance the
 	 * pointer stored in the array instead of bumping the counter it
 	 * points at.
 	 */
 	(*ipi_lazypmap_counts[PCPU_GET(cpuid)])++;
 #endif
 	if (rcr3() == lazyptd)
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 	atomic_clear_int(lazymask, mymask);
 	/* Release store: publish the updates above before signalling. */
 	atomic_store_rel_int(&lazywait, 1);
 }
 
 /*
  * Local-CPU variant of the lazy pmap fix-up: if %cr3 still refers to the
  * page directory recorded in lazyptd, reload it from the current pcb,
  * then clear mymask from the mask that lazymask points at.
  */
 static void
 pmap_lazyfix_self(u_int mymask)
 {
 
 	if (rcr3() == lazyptd)
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 	atomic_clear_int(lazymask, mymask);
 }
 
 
 /*
  * Loop until no CPU remains marked in pmap->pm_active.  Each pass picks
  * the least significant bit still set and gets it cleared: directly via
  * pmap_lazyfix_self() when it is this CPU's bit, otherwise by sending
  * IPI_LAZYPMAP to that CPU and spinning, for a bounded number of
  * iterations, until lazywait becomes nonzero.
  */
 static void
 pmap_lazyfix(pmap_t pmap)
 {
 	u_int mymask;
 	u_int mask;
 	u_int spins;
 
 	while ((mask = pmap->pm_active) != 0) {
 		spins = 50000000;
 		mask = mask & -mask;	/* Find least significant set bit */
 		/* lazyptd, lazymask and lazywait are set up under smp_ipi_mtx. */
 		mtx_lock_spin(&smp_ipi_mtx);
 #ifdef PAE
 		lazyptd = vtophys(pmap->pm_pdpt);
 #else
 		lazyptd = vtophys(pmap->pm_pdir);
 #endif
 		mymask = PCPU_GET(cpumask);
 		if (mask == mymask) {
 			lazymask = &pmap->pm_active;
 			pmap_lazyfix_self(mymask);
 		} else {
 			atomic_store_rel_int((u_int *)&lazymask,
 			    (u_int)&pmap->pm_active);
 			atomic_store_rel_int(&lazywait, 0);
 			ipi_selected(mask, IPI_LAZYPMAP);
 			/* Give up waiting once the spin budget is exhausted. */
 			while (lazywait == 0) {
 				ia32_pause();
 				if (--spins == 0)
 					break;
 			}
 		}
 		mtx_unlock_spin(&smp_ipi_mtx);
 		/* Report the timeout, then go around and try again. */
 		if (spins == 0)
 			printf("pmap_lazyfix: spun for 50000000\n");
 	}
 }
 
 #else	/* SMP */
 
 /*
  * Cleaning up on uniprocessor is easy.  For various reasons, we're
  * unlikely to have to even execute this code, including the fact
  * that the cleanup is deferred until the parent does a wait(2), which
  * means that another userland process has run.
  *
  * If %cr3 still points at this pmap's page directory, reload it from
  * the current pcb and clear this CPU's bit in pm_active.
  */
 static void
 pmap_lazyfix(pmap_t pmap)
 {
 	u_int cr3;
 
 	cr3 = vtophys(pmap->pm_pdir);
 	if (cr3 == rcr3()) {
 		load_cr3(PCPU_GET(curpcb)->pcb_cr3);
 		pmap->pm_active &= ~(PCPU_GET(cpumask));
 	}
 }
 #endif	/* SMP */
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * The page directory pages are unmapped and freed; the kernel virtual
  * address held in pm_pdir is not released here.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t m, ptdpg[NPGPTD];
 	int i;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 
 	/* Make sure no CPU is still running on this page directory. */
 	pmap_lazyfix(pmap);
 	mtx_lock_spin(&allpmaps_lock);
 	LIST_REMOVE(pmap, pm_list);
 	mtx_unlock_spin(&allpmaps_lock);
 
 	/*
 	 * Recover the page directory pages from the self-referential
 	 * entries before those entries are wiped below.
 	 */
 	for (i = 0; i < NPGPTD; i++)
 		ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] &
 		    PG_FRAME);
 
 	/* Clear the recursive and kernel entries that pmap_pinit copied in. */
 	bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) *
 	    sizeof(*pmap->pm_pdir));
 #ifdef SMP
 	pmap->pm_pdir[MPPTDI] = 0;
 #endif
 
 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
 
 	for (i = 0; i < NPGPTD; i++) {
 		m = ptdpg[i];
 #ifdef PAE
 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
 		    ("pmap_release: got wrong ptd page"));
 #endif
 		/* Undo the wiring done at allocation, then free the page. */
 		m->wire_count--;
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		vm_page_free_zero(m);
 	}
 	PMAP_LOCK_DESTROY(pmap);
 }
 
 /*
  * Sysctl handler for vm.kvm_size: reports the distance from KERNBASE to
  * VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long total;
 
 	total = VM_MAX_KERNEL_ADDRESS - KERNBASE;
 	return (sysctl_handle_long(oidp, &total, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_size, "IU", "Size of KVM");
 
 /*
  * Sysctl handler for vm.kvm_free: reports the distance from the current
  * kernel_vm_end to VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long remaining;
 
 	remaining = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &remaining, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_free, "IU", "Amount of KVM free");
 
 /*
  * grow the number of kernel page table entries, if needed
  *
  * Extends kernel_vm_end up to at least addr (rounded up to a multiple of
  * PAGE_SIZE * NPTEPG and clamped to kernel_map->max_offset), allocating
  * a new page table page for every page directory slot in that range
  * that is still empty.  Each new entry is written into PTD and into the
  * page directory of every pmap on the allpmaps list.  Panics if a page
  * cannot be allocated.  The kernel map's system mutex must be held.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	struct pmap *pmap;
 	vm_paddr_t ptppaddr;
 	vm_page_t nkpg;
 	pd_entry_t newpdir;
 	pt_entry_t *pde;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 	/*
 	 * First call: find kernel_vm_end and nkpt by walking the page
 	 * directory entries that are already populated above KERNBASE.
 	 */
 	if (kernel_vm_end == 0) {
 		kernel_vm_end = KERNBASE;
 		nkpt = 0;
 		while (pdir_pde(PTD, kernel_vm_end)) {
 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 			nkpt++;
 			/* "- 1" keeps the test correct if the address wrapped to 0. */
 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 				kernel_vm_end = kernel_map->max_offset;
 				break;
 			}
 		}
 	}
 	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
 	if (addr - 1 >= kernel_map->max_offset)
 		addr = kernel_map->max_offset;
 	while (kernel_vm_end < addr) {
 		/* Slot already populated: just step past it. */
 		if (pdir_pde(PTD, kernel_vm_end)) {
 			kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 			if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 				kernel_vm_end = kernel_map->max_offset;
 				break;
 			}
 			continue;
 		}
 
 		/*
 		 * This index is bogus, but out of the way
 		 */
 		nkpg = vm_page_alloc(NULL, nkpt,
 		    VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED);
 		if (!nkpg)
 			panic("pmap_growkernel: no memory to grow kernel");
 
 		nkpt++;
 
 		pmap_zero_page(nkpg);
 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
 		pdir_pde(PTD, kernel_vm_end) = newpdir;
 
 		/* Propagate the new entry to every pmap's page directory. */
 		mtx_lock_spin(&allpmaps_lock);
 		LIST_FOREACH(pmap, &allpmaps, pm_list) {
 			pde = pmap_pde(pmap, kernel_vm_end);
 			pde_store(pde, newpdir);
 		}
 		mtx_unlock_spin(&allpmaps_lock);
 		kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
 		if (kernel_vm_end - 1 >= kernel_map->max_offset) {
 			kernel_vm_end = kernel_map->max_offset;
 			break;
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
 
 /*
  * Map a pv entry back to the chunk that contains it by masking off the
  * page-offset bits of its address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t chunk_base;
 
 	chunk_base = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)chunk_base);
 }
 
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * pc_map contents of a completely free chunk.  A set bit marks a free
  * pv entry.  Words 0 through 9 have all 32 bits set; word 10 has only
  * its low 16 bits set.
  */
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
 /* Per-word "all free" masks, compared against pc_map to spot an empty chunk. */
 static uint32_t pc_freemask[11] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE10
 };
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 static long pv_entry_frees, pv_entry_allocs;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 
 static int pmap_collect_inactive, pmap_collect_active;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0,
 	"Current number times pmap_collect called on inactive queue");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0,
 	"Current number times pmap_collect called on active queue");
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.  This is normally called to
  * unmap inactive pages, and if necessary, active pages.
  *
  * Walks every page on the given queue and destroys each of its mappings
  * whose pmap can be locked safely.  locked_pmap is treated as already
  * locked: it is neither locked nor unlocked here.
  */
 static void
 pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq)
 {
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t next_pv, pv;
 	vm_offset_t va;
 	vm_page_t m, free;
 
 	sched_pin();
 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
 		/* Leave held and busy pages alone. */
 		if (m->hold_count || m->busy)
 			continue;
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 			va = pv->pv_va;
 			pmap = PV_PMAP(pv);
 			/*
 			 * Avoid deadlock and lock recursion.  A pmap at a
 			 * higher address than locked_pmap is locked
 			 * unconditionally; any other pmap is only
 			 * try-locked, and the mapping is skipped on failure.
 			 */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 				continue;
 			pmap->pm_stats.resident_count--;
 			pte = pmap_pte_quick(pmap, va);
 			tpte = pte_load_clear(pte);
 			KASSERT((tpte & PG_W) == 0,
 			    ("pmap_collect: wired pte %#jx", (uintmax_t)tpte));
 			/* Carry the referenced/modified bits over to the page. */
 			if (tpte & PG_A)
 				vm_page_flag_set(m, PG_REFERENCED);
 			if (tpte & PG_M) {
 				KASSERT((tpte & PG_RW),
 	("pmap_collect: modified page not writable: va: %#x, pte: %#jx",
 				    va, (uintmax_t)tpte));
 				vm_page_dirty(m);
 			}
 			/*
 			 * Drop the page table page reference; any page table
 			 * page released as a result is freed only after the
 			 * TLB invalidation.
 			 */
 			free = NULL;
 			pmap_unuse_pt(pmap, va, &free);
 			pmap_invalidate_page(pmap, va);
 			pmap_free_zero_pages(free);
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			if (TAILQ_EMPTY(&m->md.pv_list))
 				vm_page_flag_clear(m, PG_WRITEABLE);
 			m->md.pv_list_count--;
 			free_pv_entry(pmap, pv);
 			if (pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 		}
 	}
 	sched_unpin();
 }
 
 
 /*
  * free the pv_entry back to the free list
  *
  * Marks the entry free in its chunk's bitmap and moves the chunk to the
  * head of the pmap's chunk list.  If that leaves every entry in the
  * chunk free, the chunk's page is unmapped and freed and its virtual
  * address is returned to pv_vafree.  The page queues mutex and the pmap
  * lock must both be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	vm_page_t m;
 	struct pv_chunk *pc;
 	int idx, field, bit;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
 	pc = pv_to_chunk(pv);
 	/* Locate the entry's bit: 32 entries per bitmap word. */
 	idx = pv - &pc->pc_pventry[0];
 	field = idx / 32;
 	bit = idx % 32;
 	pc->pc_map[field] |= 1ul << bit;
 	/* move to head of list */
 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	/* Done unless every word now matches the all-free pattern. */
 	for (idx = 0; idx < _NPCM; idx++)
 		if (pc->pc_map[idx] != pc_freemask[idx])
 			return;
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
 	/* entire chunk is free, return it */
 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 	/* Look up the backing page before its mapping is torn down. */
 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 	pmap_qremove((vm_offset_t)pc, 1);
 	vm_page_unwire(m, 0);
 	vm_page_free(m);
 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 }
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  *
  * A free entry is taken from the first chunk on the pmap's chunk list
  * when it has one; otherwise a new chunk (one page plus a virtual
  * address from pv_vafree) is allocated.  If that allocation fails and
  * "try" is set, NULL is returned.  If "try" is clear, mappings of
  * inactive and then active pages are reclaimed with pmap_collect(), and
  * the routine panics if a chunk still cannot be obtained.  The pmap lock
  * and the page queues mutex must both be held.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, int try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	static vm_pindex_t colour;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
 	/* Warn, rate-limited by printinterval, above the high-water mark. */
 	if (pv_entry_count > pv_entry_high_water)
 		if (ratecheck(&lastprint, &printinterval))
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entry_max tunable.\n");
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = bsfl(pc->pc_map[field]);
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 32 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			for (field = 0; field < _NPCM; field++)
 				if (pc->pc_map[field] != 0) {
 					PV_STAT(pv_entry_spare--);
 					return (pv);	/* not full, return */
 				}
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 			PV_STAT(pv_entry_spare--);
 			return (pv);
 		}
 	}
 	/* A new chunk needs both a virtual address and a physical page. */
 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
 	m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 	if (m == NULL || pc == NULL) {
 		if (try) {
 			/* Undo the accounting and give back whichever half we got. */
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			if (m) {
 				/*
 				 * NOTE(review): the page queues mutex is
 				 * asserted held on entry, so this pair takes
 				 * it a second time; presumably the mutex is
 				 * recursive -- confirm.
 				 */
 				vm_page_lock_queues();
 				vm_page_unwire(m, 0);
 				vm_page_free(m);
 				vm_page_unlock_queues();
 			}
 			if (pc)
 				pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 			return (NULL);
 		}
 		/*
 		 * Reclaim pv entries: At first, destroy mappings to
 		 * inactive pages.  After that, if a pv chunk entry
 		 * is still needed, destroy mappings to active pages.
 		 */
 		PV_STAT(pmap_collect_inactive++);
 		pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]);
 		if (m == NULL)
 			m = vm_page_alloc(NULL, colour, VM_ALLOC_NORMAL |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 		if (pc == NULL)
 			pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
 		if (m == NULL || pc == NULL) {
 			PV_STAT(pmap_collect_active++);
 			pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]);
 			/* Last attempt uses VM_ALLOC_SYSTEM instead of NORMAL. */
 			if (m == NULL)
 				m = vm_page_alloc(NULL, colour,
 				    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
 				    VM_ALLOC_WIRED);
 			if (pc == NULL)
 				pc = (struct pv_chunk *)
 				    pmap_ptelist_alloc(&pv_vafree);
 			if (m == NULL || pc == NULL)
 				panic("get_pv_entry: increase vm.pmap.shpgperproc");
 		}
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
 	colour++;
 	/* Map the page at the chunk's address and set up its bitmap. */
 	pmap_qenter((vm_offset_t)pc, &m, 1);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
 	return (pv);
 }
 
 /*
  * Locate the pv entry that records page m as mapped at va in pmap,
  * unlink it from the page's pv list and release it.  The page loses
  * PG_WRITEABLE once its pv list becomes empty.  Both the pmap lock and
  * the page queues mutex must be held.
  */
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/* Linear search of the page's pv list for the (pmap, va) pair. */
 	TAILQ_FOREACH(entry, &m->md.pv_list, pv_list) {
 		if (entry->pv_va == va && PV_PMAP(entry) == pmap)
 			break;
 	}
 	KASSERT(entry != NULL, ("pmap_remove_entry: pv not found"));
 
 	TAILQ_REMOVE(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_EMPTY(&m->md.pv_list)) {
 		vm_page_flag_clear(m, PG_WRITEABLE);
 	}
 	free_pv_entry(pmap, entry);
 }
 
 /*
  * Record that page m is mapped at va in pmap by appending a freshly
  * obtained pv entry to the page's pv list.  The entry is requested in
  * non-"try" mode.  Both the pmap lock and the page queues mutex must
  * be held.
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	entry = get_pv_entry(pmap, FALSE);
 	entry->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count++;
 }
 
 /*
  * Best-effort variant of pmap_insert_entry(): a pv entry is created only
  * while the global count is below the high-water mark and a "try"
  * allocation succeeds.  Returns TRUE when the entry was added and FALSE
  * otherwise.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/* Refuse outright once the high-water mark has been reached. */
 	if (pv_entry_count >= pv_entry_high_water)
 		return (FALSE);
 
 	entry = get_pv_entry(pmap, TRUE);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count++;
 	return (TRUE);
 }
 
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
  *
  * Atomically clears the pte, updates the pmap's wired and resident
  * counts, transfers the modified/referenced bits to the vm_page for
  * managed mappings and drops their pv entry, and finally releases the
  * reference on the page table page.  Returns the result of
  * pmap_unuse_pt().  Only PG_G mappings are invalidated here.
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free)
 {
 	pt_entry_t oldpte;
 	vm_page_t m;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpte = pte_load_clear(ptq);
 	if (oldpte & PG_W)
 		pmap->pm_stats.wired_count -= 1;
 	/*
 	 * Machines that don't support invlpg, also don't support
 	 * PG_G.
 	 */
 	if (oldpte & PG_G)
 		pmap_invalidate_page(kernel_pmap, va);
 	pmap->pm_stats.resident_count -= 1;
 	if (oldpte & PG_MANAGED) {
 		m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME);
 		if (oldpte & PG_M) {
 			KASSERT((oldpte & PG_RW),
 	("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx",
 			    va, (uintmax_t)oldpte));
 			vm_page_dirty(m);
 		}
 		if (oldpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 		pmap_remove_entry(pmap, m, va);
 	}
 	return (pmap_unuse_pt(pmap, va, free));
 }
 
 /*
  * Unmap the single page at va in pmap, if one is mapped there, and
  * invalidate its TLB entry.  The caller must hold the page queues mutex
  * and the pmap lock, and the current thread must be pinned.
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, vm_page_t *free)
 {
 	pt_entry_t *pte;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	pte = pmap_pte_quick(pmap, va);
 	if (pte != NULL && *pte != 0) {
 		pmap_remove_pte(pmap, pte, va, free);
 		pmap_invalidate_page(pmap, va);
 	}
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	Page table pages released along the way are collected on a local
  *	list and freed only after the TLB has been invalidated and the
  *	locks have been dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	vm_page_t free = NULL;
 	int anyvalid;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	/* Set once a non-global mapping has been removed. */
 	anyvalid = 0;
 
 	vm_page_lock_queues();
 	sched_pin();
 	PMAP_LOCK(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if ((sva + PAGE_SIZE == eva) && 
 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
 		pmap_remove_page(pmap, sva, &free);
 		goto out;
 	}
 
 	for (; sva < eva; sva = pdnxt) {
 		unsigned pdirindex;
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		/* Nothing left to remove anywhere in this pmap. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			pmap->pm_pdir[pdirindex] = 0;
 			pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 			anyvalid = 1;
 			continue;
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if (*pte == 0)
 				continue;
 
 			/*
 			 * The TLB entry for a PG_G mapping is invalidated
 			 * by pmap_remove_pte().
 			 */
 			if ((*pte & PG_G) == 0)
 				anyvalid = 1;
 			/*
 			 * A nonzero return means the page table page itself
 			 * was released, so stop scanning it.
 			 */
 			if (pmap_remove_pte(pmap, pte, sva, &free))
 				break;
 		}
 	}
 out:
 	sched_unpin();
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 	pmap_free_zero_pages(free);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The caller must hold the page queues mutex.  Each
  *		owning pmap is locked in turn while its mapping of
  *		the page is destroyed.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	vm_page_t free;
 
 #if defined(PMAP_DIAGNOSTIC)
 	/*
 	 * XXX This makes pmap_remove_all() illegal for non-managed pages!
 	 */
 	if (m->flags & PG_FICTITIOUS) {
 		/*
 		 * The value reported is the page's physical address, whose
 		 * type may be wider than int, so print it through uintmax_t
 		 * like the other diagnostics in this file.
 		 */
 		panic("pmap_remove_all: illegal for unmanaged page, pa: %#jx",
 		    (uintmax_t)VM_PAGE_TO_PHYS(m));
 	}
 #endif
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	sched_pin();
 	/* Consume the pv list from the front until it is empty. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap->pm_stats.resident_count--;
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_flag_set(m, PG_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (tpte & PG_M) {
 			KASSERT((tpte & PG_RW),
 	("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx",
 			    pv->pv_va, (uintmax_t)tpte));
 			vm_page_dirty(m);
 		}
 		/*
 		 * Drop the page table page reference; a page table page
 		 * released as a result is freed only after the TLB entry
 		 * has been invalidated.
 		 */
 		free = NULL;
 		pmap_unuse_pt(pmap, pv->pv_va, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		pmap_free_zero_pages(free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		m->md.pv_list_count--;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mappings remain, so the page can no longer be written via one. */
 	vm_page_flag_clear(m, PG_WRITEABLE);
 	sched_unpin();
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	A request without read permission removes the range outright.  A
  *	request that would not take anything away (write permission, plus
  *	execute permission under PAE) returns without touching the map.
  *	Otherwise write permission and/or execute permission is revoked
  *	from every valid mapping in the range.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	int anychanged;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 #ifdef PAE
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 #else
 	if (prot & VM_PROT_WRITE)
 		return;
 #endif
 
 	/* Set once a non-global mapping has been modified. */
 	anychanged = 0;
 
 	vm_page_lock_queues();
 	sched_pin();
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pt_entry_t obits, pbits;
 		unsigned pdirindex;
 
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			if ((prot & VM_PROT_WRITE) == 0)
 				pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
 #ifdef PAE
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pmap->pm_pdir[pdirindex] |= pg_nx;
 #endif
 			anychanged = 1;
 			continue;
 		}
 
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			vm_page_t m;
 
 retry:
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits in
 			 * size, PG_RW, PG_A, and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			obits = pbits = *pte;
 			if ((pbits & PG_V) == 0)
 				continue;
 			/*
 			 * For managed pages, hand the referenced and
 			 * modified state over to the vm_page before the
 			 * bits are cleared from the pte.
 			 */
 			if (pbits & PG_MANAGED) {
 				m = NULL;
 				if (pbits & PG_A) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_flag_set(m, PG_REFERENCED);
 					pbits &= ~PG_A;
 				}
 				if ((pbits & PG_M) != 0) {
 					if (m == NULL)
 						m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 			}
 
 			if ((prot & VM_PROT_WRITE) == 0)
 				pbits &= ~(PG_RW | PG_M);
 #ifdef PAE
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
 #endif
 
 			/*
 			 * Install the new bits with compare-and-set; if the
 			 * pte changed since it was read, start over from the
 			 * fresh value.
 			 */
 			if (pbits != obits) {
 #ifdef PAE
 				if (!atomic_cmpset_64(pte, obits, pbits))
 					goto retry;
 #else
 				if (!atomic_cmpset_int((u_int *)pte, obits,
 				    pbits))
 					goto retry;
 #endif
 				/*
 				 * PG_G mappings are invalidated one page at
 				 * a time; the rest are covered by the
 				 * pmap_invalidate_all() below.
 				 */
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
 					anychanged = 1;
 			}
 		}
 	}
 	sched_unpin();
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	Three cases are distinguished by the existing pte: the same
  *	physical page is already mapped (only protection or wiring
  *	changes), a different page is mapped (its bookkeeping is undone
  *	first), or nothing is mapped (the resident count grows).
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	   boolean_t wired)
 {
 	vm_paddr_t pa;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_paddr_t opa;
 	pt_entry_t origpte, newpte;
 	vm_page_t mpte, om;
 	boolean_t invlva;
 
 	va = trunc_page(va);
 #ifdef PMAP_DIAGNOSTIC
 	if (va > VM_MAX_KERNEL_ADDRESS)
 		panic("pmap_enter: toobig");
 	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
 		panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
 #endif
 
 	mpte = NULL;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	sched_pin();
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		mpte = pmap_allocpte(pmap, va, M_WAITOK);
 	}
 #if 0 && defined(PMAP_DIAGNOSTIC)
 	else {
 		pd_entry_t *pdeaddr = pmap_pde(pmap, va);
 		origpte = *pdeaddr;
 		if ((origpte & PG_V) == 0) { 
 			panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n",
 				pmap->pm_pdir[PTDPTDI], origpte, va);
 		}
 	}
 #endif
 
 	pde = pmap_pde(pmap, va);
 	if ((*pde & PG_PS) != 0)
 		panic("pmap_enter: attempted pmap_enter on 4MB page");
 	pte = pmap_pte_quick(pmap, va);
 
 	/*
 	 * Page Directory table entry not valid, we need a new PT page
 	 */
 	if (pte == NULL) {
 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n",
 			(uintmax_t)pmap->pm_pdir[PTDPTDI], va);
 	}
 
 	pa = VM_PAGE_TO_PHYS(m);
 	om = NULL;
 	origpte = *pte;
 	opa = origpte & PG_FRAME;
 
 	/*
 	 * Mapping has not changed, must be protection or wiring change.
 	 */
 	if (origpte && (opa == pa)) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if (wired && ((origpte & PG_W) == 0))
 			pmap->pm_stats.wired_count++;
 		else if (!wired && (origpte & PG_W))
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove extra pte reference
 		 */
 		if (mpte)
 			mpte->wire_count--;
 
 		/*
 		 * We might be turning off write access to the page,
 		 * so we go ahead and sense modify status.
 		 */
 		if (origpte & PG_MANAGED) {
 			om = m;
 			pa |= PG_MANAGED;
 		}
 		goto validate;
 	} 
 	/*
 	 * Mapping has changed, invalidate old range and fall through to
 	 * handle validating new mapping.
 	 */
 	if (opa) {
 		if (origpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (origpte & PG_MANAGED) {
 			om = PHYS_TO_VM_PAGE(opa);
 			pmap_remove_entry(pmap, om, va);
 		}
 		/*
 		 * The old mapping already held a reference on the page
 		 * table page, so drop the one pmap_allocpte() just added.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%x", va));
 		}
 	} else
 		pmap->pm_stats.resident_count++;
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 		    ("pmap_enter: managed mapping within the clean submap"));
 		pmap_insert_entry(pmap, va, m);
 		pa |= PG_MANAGED;
 	}
 
 	/*
 	 * Increment counters
 	 */
 	if (wired)
 		pmap->pm_stats.wired_count++;
 
 validate:
 	/*
 	 * Now validate mapping with desired protection/wiring.
 	 */
 	newpte = (pt_entry_t)(pa | PG_V);
 	if ((prot & VM_PROT_WRITE) != 0) {
 		newpte |= PG_RW;
 		vm_page_flag_set(m, PG_WRITEABLE);
 	}
 #ifdef PAE
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 #endif
 	if (wired)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U;
 	if (pmap == kernel_pmap)
 		newpte |= pgeflag;
 
 	/*
 	 * if the mapping or permission bits are different, we need
 	 * to update the pte.
 	 */
 	if ((origpte & ~(PG_M|PG_A)) != newpte) {
 		if (origpte & PG_V) {
 			/*
 			 * Replace a valid pte: swap in the new value and
 			 * decide from the old accessed/modified bits
 			 * whether the TLB entry must be invalidated.
 			 */
 			invlva = FALSE;
 			origpte = pte_load_store(pte, newpte | PG_A);
 			if (origpte & PG_A) {
 				if (origpte & PG_MANAGED)
 					vm_page_flag_set(om, PG_REFERENCED);
 				if (opa != VM_PAGE_TO_PHYS(m))
 					invlva = TRUE;
 #ifdef PAE
 				if ((origpte & PG_NX) == 0 &&
 				    (newpte & PG_NX) != 0)
 					invlva = TRUE;
 #endif
 			}
 			if (origpte & PG_M) {
 				KASSERT((origpte & PG_RW),
 	("pmap_enter: modified page not writable: va: %#x, pte: %#jx",
 				    va, (uintmax_t)origpte));
 				if ((origpte & PG_MANAGED) != 0)
 					vm_page_dirty(om);
 				if ((prot & VM_PROT_WRITE) == 0)
 					invlva = TRUE;
 			}
 			if (invlva)
 				pmap_invalidate_page(pmap, va);
 		} else
 			pte_store(pte, newpte | PG_A);
 	}
 	sched_unpin();
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Maps a run of resident pages from a single object, beginning with
  * m_start at virtual address start.  Every later page on the object's
  * page list is placed at start plus the distance (in pages) between its
  * pindex and m_start's pindex.  The walk ends at the first page whose
  * offset would reach or pass end, or when the list is exhausted.
  * Virtual pages for which no resident page exists are simply left
  * unmapped.
  *
  * The object lock must be held by the caller (asserted); the pmap lock
  * is acquired and released here.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t page, ptpage;
 	vm_pindex_t npages, off;
 
 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 	npages = atop(end - start);
 	ptpage = NULL;
 	PMAP_LOCK(pmap);
 	for (page = m_start; page != NULL; page = TAILQ_NEXT(page, listq)) {
 		off = page->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		/*
 		 * Hand the previously returned page table page back in so
 		 * that consecutive pages sharing it can reuse it.
 		 */
 		ptpage = pmap_enter_quick_locked(pmap, start + ptoa(off),
 		    page, prot, ptpage);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Locking wrapper around pmap_enter_quick_locked() for a single page:
  * takes the pmap lock, attempts the mapping with no page table page
  * hint (NULL), discards the returned page table page, and unlocks.
  *
  * this code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	PMAP_LOCK(pmap);
 	(void) pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Worker for pmap_enter_quick() and pmap_enter_object().  Tries to
  * install a valid, user-accessible, non-writable (PG_RW is never set)
  * mapping of page m at va.  Both the pmap lock and vm_page_queue_mtx
  * must be held by the caller (asserted below).
  *
  * mpte is a hint: the page table page returned by a previous call.  If
  * it covers va it is reused, saving a page directory lookup.
  *
  * Returns the page table page that backs va when va is a user address
  * and the mapping was entered.  Returns NULL for kernel addresses, when
  * a page table page could not be allocated, when va is already mapped,
  * or when no pv entry could be obtained for a managed page; in the
  * latter cases any page table page reference taken here is dropped.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte)
 {
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 	vm_page_t free;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		unsigned ptepindex;
 		pd_entry_t ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = va >> PDRSHIFT;
 		/* Fast path: the caller's hint already covers this va. */
 		if (mpte && (mpte->pindex == ptepindex)) {
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap->pm_pdir[ptepindex];
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.
 			 */
 			if (ptepa) {
 				if (ptepa & PG_PS)
 					panic("pmap_enter_quick: unexpected mapping into 4MB page");
 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
 				/*
 				 * Allocation is attempted with M_NOWAIT; on
 				 * failure give up and return NULL.
 				 */
 				mpte = _pmap_allocpte(pmap, ptepindex,
 				    M_NOWAIT);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 	} else {
 		/* Kernel addresses carry no page table page reference. */
 		mpte = NULL;
 	}
 
 	/*
 	 * This call to vtopte makes the assumption that we are
 	 * entering the page into the current pmap.  In order to support
 	 * quick entry into any pmap, one would likely use pmap_pte_quick.
 	 * But that isn't as quick as vtopte.
 	 */
 	pte = vtopte(va);
 	/*
 	 * An existing entry is never replaced: undo the reference taken
 	 * above and report that nothing was mapped.
 	 */
 	if (*pte) {
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
 		/*
 		 * No pv entry was available.  Release the page table page
 		 * reference; if that released the page table page itself,
 		 * invalidate and free it.
 		 */
 		if (mpte != NULL) {
 			free = NULL;
 			if (pmap_unwire_pte_hold(pmap, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				pmap_free_zero_pages(free);
 			}
 			
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap->pm_stats.resident_count++;
 
 	pa = VM_PAGE_TO_PHYS(m);
 #ifdef PAE
 	/* Mark the mapping no-execute unless execute access was requested. */
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		pa |= pg_nx;
 #endif
 
 	/*
 	 * Now validate mapping with RO protection
 	 */
 	if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED))
 		pte_store(pte, pa | PG_V | PG_U);
 	else
 		pte_store(pte, pa | PG_V | PG_U | PG_MANAGED);
 	return mpte;
 }
 
 /*
  * Installs a temporary kernel mapping of physical address pa in the
  * i'th page-sized slot of the crashdumpmap window.  Only meant for use
  * while writing panic dumps.
  *
  * Note that the value returned is always the base of the window, not
  * the address of slot i.
  */
 void *
 pmap_kenter_temporary(vm_paddr_t pa, int i)
 {
 	vm_offset_t slot;
 
 	slot = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(slot, pa);
 	/* Flush any stale translation for the slot on this CPU. */
 	invlpg(slot);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  *
  * The object must be locked and of type OBJT_DEVICE (both asserted).
  * Work is only done when page size extensions are enabled (pseflag) and
  * both addr and size are multiples of NBPDR; otherwise the function
  * returns without touching the pmap.  When it proceeds, size >> PDRSHIFT
  * page directory entries starting at addr are filled with user,
  * read/write, PG_PS mappings of consecutive NBPDR-sized physical ranges
  * starting at the physical address of the object's page at pindex.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
 		    vm_object_t object, vm_pindex_t pindex,
 		    vm_size_t size)
 {
 	vm_page_t p;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 	if (pseflag && 
 	    ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) {
 		int i;
 		vm_page_t m[1];
 		unsigned int ptepindex;
 		int npdes;
 		pd_entry_t ptepa;
 
 		/*
 		 * Bail out if the first page directory entry is already in
 		 * use.  The pmap lock is dropped while the page is looked
 		 * up or paged in below.
 		 */
 		PMAP_LOCK(pmap);
 		if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)])
 			goto out;
 		PMAP_UNLOCK(pmap);
 retry:
 		p = vm_page_lookup(object, pindex);
 		if (p != NULL) {
 			/* Look the page up again if we had to wait for it. */
 			if (vm_page_sleep_if_busy(p, FALSE, "init4p"))
 				goto retry;
 		} else {
 			/* Not resident: allocate it and ask the pager to fill it. */
 			p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				return;
 			m[0] = p;
 
 			if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) {
 				vm_page_lock_queues();
 				vm_page_free(p);
 				vm_page_unlock_queues();
 				return;
 			}
 
 			p = vm_page_lookup(object, pindex);
 			vm_page_lock_queues();
 			vm_page_wakeup(p);
 			vm_page_unlock_queues();
 		}
 
 		/* A large page requires an NBPDR-aligned physical address. */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;
 
 		p->valid = VM_PAGE_BITS_ALL;
 
 		PMAP_LOCK(pmap);
 		/* Account for every 4KB page covered by the new mappings. */
 		pmap->pm_stats.resident_count += size >> PAGE_SHIFT;
 		npdes = size >> PDRSHIFT;
 		for(i = 0; i < npdes; i++) {
 			pde_store(&pmap->pm_pdir[ptepindex],
 			    ptepa | PG_U | PG_RW | PG_V | PG_PS);
 			ptepa += NBPDR;
 			ptepindex += 1;
 		}
 		pmap_invalidate_all(pmap);
 out:
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  *	pmap_change_wiring:
  *
  *	Set or clear the wired attribute of the mapping of va in pmap and
  *	keep the pmap's wired-page statistic in step.
  *
  *	The mapping must already exist in the pmap.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	pt_entry_t *ptep;
 
 	PMAP_LOCK(pmap);
 	ptep = pmap_pte(pmap, va);
 
 	/* Adjust the statistic only when the wired state actually flips. */
 	if (wired) {
 		if (!pmap_pte_w(ptep))
 			pmap->pm_stats.wired_count++;
 	} else {
 		if (pmap_pte_w(ptep))
 			pmap->pm_stats.wired_count--;
 	}
 
 	/*
 	 * The wired attribute is not a hardware characteristic, so the
 	 * TLB does not have to be invalidated after changing it.
 	 */
 	pmap_pte_set_w(ptep, wired);
 	pmap_pte_release(ptep);
 	PMAP_UNLOCK(pmap);
 }
 
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *
  *	Nothing is copied unless dst_addr equals src_addr and src_pmap is
  *	the current pmap.  Only managed 4KB mappings and 4MB (PG_PS)
  *	mappings are copied; existing destination entries are never
  *	overwritten, and the wired bit is cleared in every copy.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 	  vm_offset_t src_addr)
 {
 	vm_page_t   free;
 	vm_offset_t addr;
 	vm_offset_t end_addr = src_addr + len;
 	vm_offset_t pdnxt;
 
 	if (dst_addr != src_addr)
 		return;
 
 	if (!pmap_is_current(src_pmap))
 		return;
 
 	vm_page_lock_queues();
 	/* Take the two pmap locks in a fixed (address) order. */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	sched_pin();
 	/* Outer loop: one page directory entry per iteration. */
 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
 		pt_entry_t *src_pte, *dst_pte;
 		vm_page_t dstmpte, srcmpte;
 		pd_entry_t srcptepaddr;
 		unsigned ptepindex;
 
 		if (addr >= UPT_MIN_ADDRESS)
 			panic("pmap_copy: invalid to pmap_copy page tables");
 
 		/* Start of the next NBPDR-aligned region. */
 		pdnxt = (addr + NBPDR) & ~PDRMASK;
 		ptepindex = addr >> PDRSHIFT;
 
 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
 		if (srcptepaddr == 0)
 			continue;
 			
 		/*
 		 * 4MB mapping: duplicate the page directory entry itself
 		 * (minus the wired bit) if the destination slot is empty.
 		 */
 		if (srcptepaddr & PG_PS) {
 			if (dst_pmap->pm_pdir[ptepindex] == 0) {
 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
 				    ~PG_W;
 				dst_pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 			}
 			continue;
 		}
 
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
 		if (srcmpte->wire_count == 0)
 			panic("pmap_copy: source page table page is unused");
 
 		if (pdnxt > end_addr)
 			pdnxt = end_addr;
 
 		/* Inner loop: one page table entry per iteration. */
 		src_pte = vtopte(addr);
 		while (addr < pdnxt) {
 			pt_entry_t ptetemp;
 			ptetemp = *src_pte;
 			/*
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
 				/*
 				 * Get (and reference) the destination page
 				 * table page without sleeping; give up on
 				 * this region if that fails.
 				 */
 				dstmpte = pmap_allocpte(dst_pmap, addr,
 				    M_NOWAIT);
 				if (dstmpte == NULL)
 					break;
 				dst_pte = pmap_pte_quick(dst_pmap, addr);
 				if (*dst_pte == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
 					*dst_pte = ptetemp & ~(PG_W | PG_M |
 					    PG_A);
 					dst_pmap->pm_stats.resident_count++;
 	 			} else {
 					/*
 					 * Nothing was entered, so drop the
 					 * reference pmap_allocpte() took.
 					 */
 					free = NULL;
 					if (pmap_unwire_pte_hold( dst_pmap,
 					    dstmpte, &free)) {
 						pmap_invalidate_page(dst_pmap,
 						    addr);
 						pmap_free_zero_pages(free);
 					}
 				}
 				/*
 				 * Stop scanning this region once the
 				 * destination page table page holds at least
 				 * as many references as the source one.
 				 */
 				if (dstmpte->wire_count >= srcmpte->wire_count)
 					break;
 			}
 			addr += PAGE_SIZE;
 			src_pte++;
 		}
 	}
 	sched_unpin();
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }	
 
 /*
  * Zero one page (PAGE_SIZE bytes) at the given kernel virtual address.
  * When built with I686_CPU and running on a 686-class CPU, the
  * i686-specific routine is used, or the SSE2 one if CPU_ENABLE_SSE is
  * configured and the CPU reports SSE2.  In every other case plain
  * bzero() is used.
  */
 static __inline void
 pagezero(void *page)
 {
 #if defined(I686_CPU)
 	if (cpu_class == CPUCLASS_686) {
 #if defined(CPU_ENABLE_SSE)
 		if (cpu_feature & CPUID_SSE2)
 			sse2_pagezero(page);
 		else
 #endif
 			i686_pagezero(page);
 	} else
 #endif
 		bzero(page, PAGE_SIZE);
 }
 
 /*
  *	pmap_zero_page zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.
  *
  *	The page is mapped through the CMAP2/CADDR2 window of the sysmaps
  *	entry selected by the current CPU id, under that entry's lock.
  *	Panics if the window is already in use.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	struct sysmaps *sysmaps;
 
 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 	mtx_lock(&sysmaps->lock);
 	if (*sysmaps->CMAP2)
 		panic("pmap_zero_page: CMAP2 busy");
 	/* Stay on this CPU while the temporary mapping is live. */
 	sched_pin();
 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 	invlcaddr(sysmaps->CADDR2);
 	pagezero(sysmaps->CADDR2);
 	/* Tear the window down so the next user finds it free. */
 	*sysmaps->CMAP2 = 0;
 	sched_unpin();
 	mtx_unlock(&sysmaps->lock);
 }
 
 /*
  *	pmap_zero_page_area zeros size bytes, starting off bytes into the
  *	specified hardware page, by mapping the page into KVM and clearing
  *	the requested range.
  *
  *	off and size may not cover an area beyond a single hardware page.
  *
  *	The page is mapped through the CMAP2/CADDR2 window of the sysmaps
  *	entry selected by the current CPU id, under that entry's lock.
  *	Panics if the window is already in use.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	struct sysmaps *sysmaps;
 
 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 	mtx_lock(&sysmaps->lock);
 	if (*sysmaps->CMAP2)
 		panic("pmap_zero_page_area: CMAP2 busy");
 	/* Stay on this CPU while the temporary mapping is live. */
 	sched_pin();
 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 	invlcaddr(sysmaps->CADDR2);
 	/* A whole-page request can use the optimized page zeroing routine. */
 	if (off == 0 && size == PAGE_SIZE)
 		pagezero(sysmaps->CADDR2);
 	else
 		bzero((char *)sysmaps->CADDR2 + off, size);
 	/* Tear the window down so the next user finds it free. */
 	*sysmaps->CMAP2 = 0;
 	sched_unpin();
 	mtx_unlock(&sysmaps->lock);
 }
 
 /*
  *	pmap_zero_page_idle zeros the specified hardware page by mapping 
  *	the page into KVM and using bzero to clear its contents.  This
  *	is intended to be called from the vm_pagezero process only and
  *	outside of Giant.
  *
  *	Unlike pmap_zero_page(), this uses the single CMAP3/CADDR3 window
  *	and takes no lock.  Panics if the window is already in use.
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 
 	if (*CMAP3)
 		panic("pmap_zero_page_idle: CMAP3 busy");
 	/* Stay on this CPU while the temporary mapping is live. */
 	sched_pin();
 	*CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M;
 	invlcaddr(CADDR3);
 	pagezero(CADDR3);
 	/* Tear the window down so the next call finds it free. */
 	*CMAP3 = 0;
 	sched_unpin();
 }
 
 /*
  *	pmap_copy_page copies the specified (machine independent)
  *	page by mapping the page into virtual memory and using
  *	bcopy to copy the page, one machine dependent page at a
  *	time.
  *
  *	The source is mapped read-only through CMAP1/CADDR1 and the
  *	destination read/write through CMAP2/CADDR2 of the sysmaps entry
  *	selected by the current CPU id, under that entry's lock.  Panics
  *	if either window is already in use.
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 	struct sysmaps *sysmaps;
 
 	sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)];
 	mtx_lock(&sysmaps->lock);
 	if (*sysmaps->CMAP1)
 		panic("pmap_copy_page: CMAP1 busy");
 	if (*sysmaps->CMAP2)
 		panic("pmap_copy_page: CMAP2 busy");
 	/* Stay on this CPU while the temporary mappings are live. */
 	sched_pin();
 	/* Both windows are flushed here, before the new entries are written. */
 	invlpg((u_int)sysmaps->CADDR1);
 	invlpg((u_int)sysmaps->CADDR2);
 	*sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A;
 	*sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M;
 	bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE);
 	/* Tear both windows down so the next user finds them free. */
 	*sysmaps->CMAP1 = 0;
 	*sysmaps->CMAP2 = 0;
 	sched_unpin();
 	mtx_unlock(&sysmaps->lock);
 }
 
 /*
  * Reports whether the given pmap owns one of the first 16 pv entries
  * on the page's pv list.  The limit is arbitrary and may be raised or
  * lowered later; for proper page aging it only matters that TRUE is
  * returned for a small subset of pmaps.
  *
  * Fictitious pages always yield FALSE.  For any other page the caller
  * must hold vm_page_queue_mtx (asserted).
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	int examined;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return (FALSE);
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	examined = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (PV_PMAP(pv) == pmap)
 			return (TRUE);
 		/* Give up after a bounded number of entries. */
 		if (++examined >= 16)
 			break;
 	}
 	return (FALSE);
 }
 
 /*
  * Remove all pages from specified address space
  * this aids process exit speeds.  Also, this code
  * is special cased for current process only, but
  * can have the more generic (and slightly slower)
  * mode enabled.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  *
  * The mappings are found by walking the pmap's pv chunks rather than
  * its page tables.  Wired mappings are left in place, and a chunk that
  * still holds one is kept.  If pmap is not the current thread's pmap a
  * warning is printed and nothing is removed.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pt_entry_t *pte, tpte;
 	vm_page_t m, free = NULL;
 	pv_entry_t pv;
 	struct pv_chunk *pc, *npc;
 	int field, idx;
 	int32_t bit;
 	uint32_t inuse, bitmask;
 	int allfree;
 
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	sched_pin();
 	/* The _SAFE variant is needed because chunks are unlinked below. */
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
 			/* A clear bit in pc_map marks an allocated pv entry. */
 			inuse = (~(pc->pc_map[field])) & pc_freemask[field];
 			while (inuse != 0) {
 				/* Pick the lowest in-use entry of this word. */
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 32 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				pte = vtopte(pv->pv_va);
 				tpte = *pte;
 
 				if (tpte == 0) {
 					printf(
 					    "TPTE at %p  IS ZERO @ VA %08x\n",
 					    pte, pv->pv_va);
 					panic("bad pte");
 				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT(m < &vm_page_array[vm_page_array_size],
 					("pmap_remove_pages: bad tpte %#jx",
 					(uintmax_t)tpte));
 
 				pmap->pm_stats.resident_count--;
 
 				pte_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if (tpte & PG_M)
 					vm_page_dirty(m);
 
 				/* Mark free */
 				PV_STAT(pv_entry_frees++);
 				PV_STAT(pv_entry_spare++);
 				pv_entry_count--;
 				pc->pc_map[field] |= bitmask;
 				m->md.pv_list_count--;
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 				/* No mappings remain, so none can be writeable. */
 				if (TAILQ_EMPTY(&m->md.pv_list))
 					vm_page_flag_clear(m, PG_WRITEABLE);
 
 				/*
 				 * Drop the page table page reference held by
 				 * this mapping; pages released as a result
 				 * are collected on the free list.
 				 */
 				pmap_unuse_pt(pmap, pv->pv_va, &free);
 			}
 		}
 		/* Every entry in the chunk was released: free the chunk too. */
 		if (allfree) {
 			PV_STAT(pv_entry_spare -= _NPCPV);
 			PV_STAT(pc_chunk_count--);
 			PV_STAT(pc_chunk_frees++);
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 			pmap_qremove((vm_offset_t)pc, 1);
 			vm_page_unwire(m, 0);
 			vm_page_free(m);
 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 		}
 	}
 	sched_unpin();
 	pmap_invalidate_all(pmap);
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 	/* The collected page table pages are freed after the locks are dropped. */
 	pmap_free_zero_pages(free);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Report whether any mapping of the given physical page has its
  *	modified (PG_M) bit set.  Fictitious pages are never reported as
  *	modified.  For any other page the caller must hold
  *	vm_page_queue_mtx (asserted).
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep;
 	pmap_t owner;
 	boolean_t dirty;
 
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return (FALSE);
 
 	dirty = FALSE;
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		owner = PV_PMAP(pv);
 		PMAP_LOCK(owner);
 		ptep = pmap_pte_quick(owner, pv->pv_va);
 		dirty = (*ptep & PG_M) != 0;
 		PMAP_UNLOCK(owner);
 		/* One modified mapping is enough to answer the question. */
 		if (dirty)
 			break;
 	}
 	sched_unpin();
 	return (dirty);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Report whether the given virtual address is eligible for
  *	prefault: its page directory entry must be non-zero and its page
  *	table entry must still be zero.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	boolean_t eligible;
 
 	PMAP_LOCK(pmap);
 	/* The PTE is only examined when a page directory entry exists. */
 	if (*pmap_pde(pmap, addr) != 0)
 		eligible = (*vtopte(addr) == 0);
 	else
 		eligible = FALSE;
 	PMAP_UNLOCK(pmap);
 	return (eligible);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The caller must hold vm_page_queue_mtx (asserted).  Nothing is done for
  * fictitious pages or for pages without PG_WRITEABLE set.  If a mapping
  * had its modified bit set, the page is marked dirty before the bit is
  * lost.  PG_WRITEABLE is cleared on the page once every mapping has been
  * processed.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t pmap;
 	pt_entry_t oldpte, *pte;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & PG_FICTITIOUS) != 0 ||
 	    (m->flags & PG_WRITEABLE) == 0)
 		return;
 	sched_pin();
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 retry:
 		oldpte = *pte;
 		if ((oldpte & PG_RW) != 0) {
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits
 			 * in size, PG_RW and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			/*
 			 * Compare-and-set so that a concurrent change to the
 			 * entry is not lost; reread and retry if the entry
 			 * changed underneath us.
 			 */
 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
 			    oldpte & ~(PG_RW | PG_M)))
 				goto retry;
 			if ((oldpte & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 	sched_unpin();
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	XXX: The exact number of bits to check and clear is a matter that
  *	should be tested and standardized at some point in the future for
  *	optimal aging of shared pages.
  *
  *	Fictitious pages always return 0.  For any other page the caller
  *	must hold vm_page_queue_mtx (asserted).  The scan stops once more
  *	than four set reference bits have been counted.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	pv_entry_t pv, pvf, pvn;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	int rtval = 0;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (rtval);
 	sched_pin();
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		/* Remember the first entry so the walk ends after one lap. */
 		pvf = pv;
 		do {
 			pvn = TAILQ_NEXT(pv, pv_list);
 			/*
 			 * Rotate the entry to the tail so that a later call
 			 * starts with entries this call did not reach.
 			 */
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 			pmap = PV_PMAP(pv);
 			PMAP_LOCK(pmap);
 			pte = pmap_pte_quick(pmap, pv->pv_va);
 			if ((*pte & PG_A) != 0) {
 				atomic_clear_int((u_int *)pte, PG_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				rtval++;
 				/* Enough references found: end the walk. */
 				if (rtval > 4)
 					pvn = NULL;
 			}
 			PMAP_UNLOCK(pmap);
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 	sched_unpin();
 	return (rtval);
 }
 
 /*
  *	pmap_clear_modify:
  *
  *	Clear the modified (PG_M) bit in every mapping of the specified
  *	physical page, invalidating each mapping that changed.  The
  *	caller must hold vm_page_queue_mtx (asserted); fictitious pages
  *	are ignored.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t owner;
 	pt_entry_t *ptep;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (m->flags & PG_FICTITIOUS)
 		return;
 	sched_pin();
 	for (pv = TAILQ_FIRST(&m->md.pv_list); pv != NULL;
 	    pv = TAILQ_NEXT(pv, pv_list)) {
 		owner = PV_PMAP(pv);
 		PMAP_LOCK(owner);
 		ptep = pmap_pte_quick(owner, pv->pv_va);
 		if (*ptep & PG_M) {
 			/*
 			 * PG_M lives in the low 32 bits of a pte whether
 			 * the pte is 32 or 64 bits wide, so a 32-bit
 			 * atomic operation is sufficient.
 			 */
 			atomic_clear_int((u_int *)ptep, PG_M);
 			pmap_invalidate_page(owner, pv->pv_va);
 		}
 		PMAP_UNLOCK(owner);
 	}
 	sched_unpin();
 }
 
 /*
  *	pmap_clear_reference:
  *
  *	Clear the accessed (PG_A) bit in every mapping of the specified
  *	physical page, invalidating each mapping that changed.  The
  *	caller must hold vm_page_queue_mtx (asserted); fictitious pages
  *	are ignored.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	pv_entry_t pv;
 	pmap_t owner;
 	pt_entry_t *ptep;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (m->flags & PG_FICTITIOUS)
 		return;
 	sched_pin();
 	for (pv = TAILQ_FIRST(&m->md.pv_list); pv != NULL;
 	    pv = TAILQ_NEXT(pv, pv_list)) {
 		owner = PV_PMAP(pv);
 		PMAP_LOCK(owner);
 		ptep = pmap_pte_quick(owner, pv->pv_va);
 		if (*ptep & PG_A) {
 			/*
 			 * PG_A lives in the low 32 bits of a pte whether
 			 * the pte is 32 or 64 bits wide, so a 32-bit
 			 * atomic operation is sufficient.
 			 */
 			atomic_clear_int((u_int *)ptep, PG_A);
 			pmap_invalidate_page(owner, pv->pv_va);
 		}
 		PMAP_UNLOCK(owner);
 	}
 	sched_unpin();
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /*
  * Maps the physical range [pa, pa + size) into kernel virtual address
  * space using the given cache mode and returns the virtual address that
  * corresponds to pa (including its offset within the first page).
  * Intended for device memory, NOT real memory.
  *
  * A range lying entirely below KERNLOAD is mapped at KERNBASE + pa;
  * any other range gets freshly allocated kernel virtual address space.
  * Panics if no virtual address could be obtained.
  */
 void *
 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	vm_offset_t kva, cur, pgoff;
 
 	/* Widen the request to whole pages. */
 	pgoff = pa & PAGE_MASK;
 	size = roundup(pgoff + size, PAGE_SIZE);
 	pa &= PG_FRAME;
 
 	if (pa < KERNLOAD && pa + size <= KERNLOAD)
 		kva = KERNBASE + pa;
 	else
 		kva = kmem_alloc_nofault(kernel_map, size);
 	if (kva == 0)
 		panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
 
 	/* Enter one page at a time; size is a multiple of PAGE_SIZE here. */
 	cur = kva;
 	while (size > 0) {
 		pmap_kenter_attr(cur, pa, mode);
 		cur += PAGE_SIZE;
 		pa += PAGE_SIZE;
 		size -= PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, kva, cur);
 	pmap_invalidate_cache();
 	return ((void *)(kva + pgoff));
 }
 
 /*
  * Map a physical range into kernel virtual address space with the
  * PAT_UNCACHEABLE cache mode; see pmap_mapdev_attr() for details.
  */
 void *
 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 {
 
 	return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE));
 }
 
 /*
  * Map a physical range into kernel virtual address space with the
  * PAT_WRITE_BACK cache mode; see pmap_mapdev_attr() for details.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 
 	return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK));
 }
 
 /*
  * Undoes a pmap_mapdev_attr() style mapping of size bytes at va: removes
  * the kernel mappings page by page, invalidates the range and frees
  * the kernel virtual address space.  Ranges that lie entirely within
  * [KERNBASE, KERNBASE + KERNLOAD] are left untouched.
  */
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	vm_offset_t first, cur, pgoff;
 
 	if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD)
 		return;
 	/* Widen the request to whole pages. */
 	pgoff = va & PAGE_MASK;
 	first = trunc_page(va);
 	size = roundup(pgoff + size, PAGE_SIZE);
 	cur = first;
 	while (cur < (first + size)) {
 		pmap_kremove(cur);
 		cur += PAGE_SIZE;
 	}
 	pmap_invalidate_range(kernel_pmap, va, cur);
 	kmem_free(kernel_map, first, size);
 }
 
 /*
  * Change the cache mode of the 4KB kernel mappings that cover
  * [va, va + size) to the given mode.
  *
  * Returns 0 on success.  Returns EINVAL, without modifying anything,
  * if the range does not start above VM_MAXUSER_ADDRESS or if any page
  * in it is unmapped or mapped by a 4MB page.
  */
 int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	pt_entry_t *pte;
 	u_int opte, npte;
 	pd_entry_t *pde;
 
 	/* Widen the request to whole pages. */
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = roundup(offset + size, PAGE_SIZE);
 
 	/* Only supported on kernel virtual addresses. */
 	if (base <= VM_MAXUSER_ADDRESS)
 		return (EINVAL);
 
 	/*
 	 * 4MB pages and pages that aren't mapped aren't supported.
 	 * Every page of the range is checked before anything is changed,
 	 * so the PTE must be looked up for tmpva, not for the start of
 	 * the range.
 	 */
 	for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (*pde & PG_PS)
 			return (EINVAL);
 		if (*pde == 0)
 			return (EINVAL);
 		pte = vtopte(tmpva);
 		if (*pte == 0)
 			return (EINVAL);
 	}
 
 	/*
 	 * Ok, all the pages exist and are 4k, so run through them updating
 	 * their cache mode.
 	 */
 	for (tmpva = base; size > 0; ) {
 		pte = vtopte(tmpva);
 
 		/*
 		 * The cache mode bits are all in the low 32-bits of the
 		 * PTE, so we can just spin on updating the low 32-bits.
 		 */
 		do {
 			opte = *(u_int *)pte;
 			npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT);
 			npte |= pmap_cache_bits(mode, 0);
 		} while (npte != opte &&
 		    !atomic_cmpset_int((u_int *)pte, opte, npte));
 		tmpva += PAGE_SIZE;
 		size -= PAGE_SIZE;
 	}
 
 	/*
 	 * Flush CPU caches to make sure any data isn't cached that shouldn't
 	 * be, etc.
 	 */
 	pmap_invalidate_range(kernel_pmap, base, tmpva);
 	pmap_invalidate_cache();
 	return (0);
 }
 
 /*
  * perform the pmap work for mincore
  *
  * Returns a mask of MINCORE_* flags describing the page mapped at addr
  * in pmap, or 0 if there is no mapping.  For unmanaged mappings only
  * MINCORE_INCORE is reported.  The PTE is sampled under the pmap lock;
  * the page's state is examined afterwards under the page queues lock.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	pt_entry_t *ptep, pte;
 	vm_page_t m;
 	int val = 0;
 	
 	/* Snapshot the PTE; a missing page table counts as "no mapping". */
 	PMAP_LOCK(pmap);
 	ptep = pmap_pte(pmap, addr);
 	pte = (ptep != NULL) ? *ptep : 0;
 	pmap_pte_release(ptep);
 	PMAP_UNLOCK(pmap);
 
 	if (pte != 0) {
 		vm_paddr_t pa;
 
 		val = MINCORE_INCORE;
 		if ((pte & PG_MANAGED) == 0)
 			return val;
 
 		pa = pte & PG_FRAME;
 
 		m = PHYS_TO_VM_PAGE(pa);
 
 		/*
 		 * Modified by us
 		 */
 		if (pte & PG_M)
 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
 		else {
 			/*
 			 * Modified by someone else
 			 */
 			vm_page_lock_queues();
 			if (m->dirty || pmap_is_modified(m))
 				val |= MINCORE_MODIFIED_OTHER;
 			vm_page_unlock_queues();
 		}
 		/*
 		 * Referenced by us
 		 */
 		if (pte & PG_A)
 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 		else {
 			/*
 			 * Referenced by someone else
 			 */
 			vm_page_lock_queues();
 			/*
 			 * pmap_ts_referenced() clears the reference bits it
 			 * counts, so the result is recorded in the page's
 			 * PG_REFERENCED flag.
 			 */
 			if ((m->flags & PG_REFERENCED) ||
 			    pmap_ts_referenced(m)) {
 				val |= MINCORE_REFERENCED_OTHER;
 				vm_page_flag_set(m, PG_REFERENCED);
 			}
 			vm_page_unlock_queues();
 		}
 	} 
 	return val;
 }
 
 /*
  * Make the pmap of td's process the active address space on the current
  * CPU: move this CPU's bit from the old pmap's pm_active mask to the new
  * one, record the new page directory base in td's PCB, load it into
  * %cr3 and update the per-CPU curpmap pointer.  The whole sequence runs
  * inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t	pmap, oldpmap;
 	u_int32_t  cr3;
 
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
 #if defined(SMP)
 	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
 	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
 #else
 	oldpmap->pm_active &= ~1;
 	pmap->pm_active |= 1;
 #endif
 	/* With PAE, %cr3 holds the address of pm_pdpt rather than pm_pdir. */
 #ifdef PAE
 	cr3 = vtophys(pmap->pm_pdpt);
 #else
 	cr3 = vtophys(pmap->pm_pdir);
 #endif
 	/*
 	 * pmap_activate is for the current thread on the current cpu
 	 */
 	td->td_pcb->pcb_cr3 = cr3;
 	load_cr3(cr3);
 	PCPU_SET(curpmap, pmap);
 	critical_exit();
 }
 
 /*
  * Suggest a virtual address for mapping obj.  For device objects of at
  * least NBPDR bytes the address is rounded up to the next NBPDR
  * boundary; in every other case addr is returned unchanged.
  */
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {
 
 	if (obj != NULL && size >= NBPDR && obj->type == OBJT_DEVICE)
 		addr = (addr + PDRMASK) & ~PDRMASK;
 	return (addr);
 }
 
 
 #if defined(PMAP_DEBUG)
 /*
  * Debugging aid: print the valid page table entries of the process with
  * the given pid, two per output line, stopping at the first address at
  * or above VM_MIN_KERNEL_ADDRESS.  Returns the number of entries
  * printed (0 if no process matches).
  */
 int
 pmap_pid_dump(int pid)
 {
 	pmap_t pmap;
 	struct proc *p;
 	int npte = 0;
 	int index;
 
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_pid != pid)
 			continue;
 
 		if (p->p_vmspace) {
 			int i,j;
 			index = 0;
 			pmap = vmspace_pmap(p->p_vmspace);
 			for (i = 0; i < NPDEPTD; i++) {
 				pd_entry_t *pde;
 				pt_entry_t *pte;
 				/* Shift as unsigned to avoid overflowing an int. */
 				vm_offset_t base = (vm_offset_t)i << PDRSHIFT;
 				
 				pde = &pmap->pm_pdir[i];
 				if (pde && pmap_pde_v(pde)) {
 					for (j = 0; j < NPTEPG; j++) {
 						vm_offset_t va = base + (j << PAGE_SHIFT);
 						if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) {
 							/* Terminate a half-filled output line. */
 							if (index) {
 								index = 0;
 								printf("\n");
 							}
 							sx_sunlock(&allproc_lock);
 							return npte;
 						}
 						pte = pmap_pte(pmap, va);
 						if (pte && pmap_pte_v(pte)) {
 							pt_entry_t pa;
 							vm_page_t m;
 							pa = *pte;
 							m = PHYS_TO_VM_PAGE(pa & PG_FRAME);
 							/* A pte may be 64 bits wide, so print it via uintmax_t. */
 							printf("va: 0x%x, pt: 0x%jx, h: %d, w: %d, f: 0x%x",
 								va, (uintmax_t)pa, m->hold_count, m->wire_count, m->flags);
 							npte++;
 							index++;
 							if (index >= 2) {
 								index = 0;
 								printf("\n");
 							} else {
 								printf(" ");
 							}
 						}
 						/*
 						 * Pair every pmap_pte() with a release, as the
 						 * other callers in this file do.
 						 */
 						pmap_pte_release(pte);
 					}
 				}
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return npte;
 }
 #endif
 
 #if defined(DEBUG)
 
 static void	pads(pmap_t pm);
 void		pmap_pvdump(vm_offset_t pa);
 
 /*
  * Print the address space of a pmap: one "va:pte" pair (both in hex)
  * for every valid page table entry at or below UPT_MAX_ADDRESS.
  * Nothing is printed for the kernel pmap.
  */
 static void
 pads(pmap_t pm)
 {
 	int i, j;
 	vm_offset_t va;
 	pt_entry_t *ptep;
 
 	if (pm == kernel_pmap)
 		return;
 	for (i = 0; i < NPDEPTD; i++) {
 		if (pm->pm_pdir[i] == 0)
 			continue;
 		for (j = 0; j < NPTEPG; j++) {
 			/* Shift as unsigned to avoid overflowing an int. */
 			va = ((vm_offset_t)i << PDRSHIFT) +
 			    ((vm_offset_t)j << PAGE_SHIFT);
 			if (va > UPT_MAX_ADDRESS)
 				continue;
 			ptep = pmap_pte(pm, va);
 			/* A pte may be 64 bits wide, so print it via uintmax_t. */
 			if (ptep != NULL && pmap_pte_v(ptep))
 				printf("%jx:%jx ", (uintmax_t)va,
 				    (uintmax_t)*ptep);
 			/*
 			 * Pair every pmap_pte() with a release, as the other
 			 * callers in this file do.
 			 */
 			pmap_pte_release(ptep);
 		}
 	}
 }
 
 /*
  * Debugging aid: print the physical address pa followed by, for each pv
  * entry of the corresponding page, the owning pmap and virtual address
  * and a dump of that pmap's address space (see pads()).
  */
 void
 pmap_pvdump(vm_paddr_t pa)
 {
 	pv_entry_t pv;
 	pmap_t pmap;
 	vm_page_t m;
 
 	/* vm_paddr_t may be 64 bits wide, so print it via uintmax_t. */
 	printf("pa %jx", (uintmax_t)pa);
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pmap = PV_PMAP(pv);
 		printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va);
 		pads(pmap);
 	}
 	printf(" ");
 }
 #endif
Index: head/sys/i386/ibcs2/imgact_coff.c
===================================================================
--- head/sys/i386/ibcs2/imgact_coff.c	(revision 173360)
+++ head/sys/i386/ibcs2/imgact_coff.c	(revision 173361)
@@ -1,495 +1,497 @@
 /*-
  * Copyright (c) 1994 Sean Eric Fagan
  * Copyright (c) 1994 S�ren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 #include <i386/ibcs2/coff.h>
 #include <i386/ibcs2/ibcs2_util.h>
 
 MODULE_DEPEND(coff, ibcs2, 1, 1, 1);
 
 extern struct sysentvec ibcs2_svr3_sysvec;
 
 static int coff_load_file(struct thread *td, char *name);
 static int exec_coff_imgact(struct image_params *imgp);
 
 static int load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot);
 
 /*
  * Map one COFF section of vnode vp into the address space vmspace.
  *
  * offset is the section's position in the file, vmaddr its target
  * address, filsz the number of bytes present in the file and memsz the
  * size the section occupies in memory.  The file-backed part is mapped
  * with vm_mmap() using protection prot.  When memsz exceeds filsz, the
  * remainder is backed by anonymous memory from vm_map_find() and the
  * partial last file page is copied into it with copyout().
  *
  * Returns 0 on success, otherwise the value reported by the failing
  * vm_mmap(), vm_map_find() or copyout() call.
  */
 static int
 load_coff_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset,
 		  caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
 {
 	size_t map_len;
 	vm_offset_t map_offset;
 	vm_offset_t map_addr;
 	int error;
 	unsigned char *data_buf = NULL;
 	size_t copy_len;
 
 	map_offset = trunc_page(offset);
 	map_addr = trunc_page((vm_offset_t)vmaddr);
 
 	if (memsz > filsz) {
 		/*
 		 * The section is longer in memory than it is on file,
 		 * which means it has zero-filled areas.  Map only the
 		 * whole file pages here; the partial last page is copied
 		 * into anonymous memory further down.
 		 */
 		map_len = trunc_page(offset + filsz) - map_offset;
 	} else {
 		/*
 		 * The only stuff we care about is on disk, and we
 		 * don't care if we map in more than is really there.
 		 */
 		map_len = round_page(offset + filsz) - map_offset;
 	}
 
 	DPRINTF(("%s(%d):  vm_mmap(&vmspace->vm_map, &0x%08lx, 0x%x, 0x%x, "
 		"VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, 0x%x)\n",
 		__FILE__, __LINE__, map_addr, map_len, prot, map_offset));
 
 	if ((error = vm_mmap(&vmspace->vm_map,
 			     &map_addr,
 			     map_len,
 			     prot,
 			     VM_PROT_ALL,
 			     MAP_PRIVATE | MAP_FIXED,
 			     OBJT_VNODE,
 			     vp,
 			     map_offset)) != 0)
 		return error;
 
 	/*
 	 * Nothing is left to do unless the section needs zero-filled
 	 * memory.  The test mirrors the "memsz > filsz" branch above, so a
 	 * section with memsz < filsz does not reach the length arithmetic
 	 * below, which would wrap around for it.
 	 */
 	if (memsz <= filsz) {
 		/* We're done! */
 		return 0;
 	}
 
 	/*
 	 * Now we have screwball stuff, to accommodate COFF.
 	 * We have to map the remaining bit of the file into the kernel's
 	 * memory map, allocate some anonymous memory, copy that last
 	 * bit into it, and then we're done.
 	 * For clean-up reasons, we actually map in the file last.
 	 */
 
 	copy_len = (offset + filsz) - trunc_page(offset + filsz);
 	map_addr = trunc_page((vm_offset_t)vmaddr + filsz);
 	map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr;
 
 	DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx,0x%x, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n", __FILE__, __LINE__, map_addr, map_len));
 
 	if (map_len != 0) {
 		error = vm_map_find(&vmspace->vm_map, NULL, 0, &map_addr,
 				    map_len, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error)
 			return error;
 	}
 
 	/* Temporarily map the last file page into the kernel map. */
 	if ((error = vm_mmap(kernel_map,
 			    (vm_offset_t *) &data_buf,
 			    PAGE_SIZE,
 			    VM_PROT_READ,
 			    VM_PROT_READ,
 			    0,
 			    OBJT_VNODE,
 			    vp,
 			    trunc_page(offset + filsz))) != 0)
 		return error;
 
 	error = copyout(data_buf, (caddr_t) map_addr, copy_len);
 
 	/* The kernel mapping is dropped whether or not copyout() worked. */
 	if (vm_map_remove(kernel_map,
 			  (vm_offset_t) data_buf,
 			  (vm_offset_t) data_buf + PAGE_SIZE))
 		panic("load_coff_section vm_map_remove failed");
 
 	return error;
 }
 
 static int
 coff_load_file(struct thread *td, char *name)
 {
 	struct proc *p = td->td_proc;
   	struct vmspace *vmspace = p->p_vmspace;
   	int error;
   	struct nameidata nd;
   	struct vnode *vp;
   	struct vattr attr;
   	struct filehdr *fhdr;
   	struct aouthdr *ahdr;
   	struct scnhdr *scns;
   	char *ptr = 0;
   	int nscns;
   	unsigned long text_offset = 0, text_address = 0, text_size = 0;
   	unsigned long data_offset = 0, data_address = 0, data_size = 0;
   	unsigned long bss_size = 0;
   	int i;
 
 	NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME,
 	    UIO_SYSSPACE, name, td);
 
   	error = namei(&nd);
   	if (error)
     		return error;
 
   	vp = nd.ni_vp;
   	if (vp == NULL)
     		return ENOEXEC;
 
   	if (vp->v_writecount) {
     		error = ETXTBSY;
     		goto fail;
   	}
 
   	if ((error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) != 0)
     		goto fail;
 
   	if ((vp->v_mount->mnt_flag & MNT_NOEXEC)
 	    || ((attr.va_mode & 0111) == 0)
 	    || (attr.va_type != VREG))
     		goto fail;
 
   	if (attr.va_size == 0) {
     		error = ENOEXEC;
     		goto fail;
   	}
 
   	if ((error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td)) != 0)
     		goto fail;
 
   	if ((error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL)) != 0)
     		goto fail;
 
 	/*
 	 * Lose the lock on the vnode. It's no longer needed, and must not
 	 * exist for the pagefault paging to work below.
 	 */
 	VOP_UNLOCK(vp, 0, td);
 
   	if ((error = vm_mmap(kernel_map,
 			    (vm_offset_t *) &ptr,
 			    PAGE_SIZE,
 			    VM_PROT_READ,
 		       	    VM_PROT_READ,
 			    0,
 			    OBJT_VNODE,
 			    vp,
 			    0)) != 0)
 		goto unlocked_fail;
 
   	fhdr = (struct filehdr *)ptr;
 
   	if (fhdr->f_magic != I386_COFF) {
     		error = ENOEXEC;
     		goto dealloc_and_fail;
   	}
 
   	nscns = fhdr->f_nscns;
 
   	if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
     		/*
      		 * XXX -- just fail.  I'm so lazy.
      		 */
     		error = ENOEXEC;
     		goto dealloc_and_fail;
   	}
 
   	ahdr = (struct aouthdr*)(ptr + sizeof(struct filehdr));
 
   	scns = (struct scnhdr*)(ptr + sizeof(struct filehdr)
 			  + sizeof(struct aouthdr));
 
   	for (i = 0; i < nscns; i++) {
     		if (scns[i].s_flags & STYP_NOLOAD)
       			continue;
     		else if (scns[i].s_flags & STYP_TEXT) {
       			text_address = scns[i].s_vaddr;
       			text_size = scns[i].s_size;
       			text_offset = scns[i].s_scnptr;
     		}
 		else if (scns[i].s_flags & STYP_DATA) {
       			data_address = scns[i].s_vaddr;
       			data_size = scns[i].s_size;
       			data_offset = scns[i].s_scnptr;
     		} else if (scns[i].s_flags & STYP_BSS) {
       			bss_size = scns[i].s_size;
     		}
   	}
 
   	if ((error = load_coff_section(vmspace, vp, text_offset,
 				      (caddr_t)(void *)(uintptr_t)text_address,
 				      text_size, text_size,
 				      VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
     		goto dealloc_and_fail;
   	}
   	if ((error = load_coff_section(vmspace, vp, data_offset,
 				      (caddr_t)(void *)(uintptr_t)data_address,
 				      data_size + bss_size, data_size,
 				      VM_PROT_ALL)) != 0) {
     		goto dealloc_and_fail;
   	}
 
   	error = 0;
 
  dealloc_and_fail:
 	if (vm_map_remove(kernel_map,
 			  (vm_offset_t) ptr,
 			  (vm_offset_t) ptr + PAGE_SIZE))
     		panic("%s vm_map_remove failed", __func__);
 
  fail:
 	VOP_UNLOCK(vp, 0, td);
  unlocked_fail:
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vrele(nd.ni_vp);
   	return error;
 }
 
 /*
  * Image activator for iBCS2 COFF executables.
  *
  * Returns -1 unless the image header carries the I386_COFF magic and the
  * F_EXEC flag and passes the section-count check.  Otherwise a new
  * vmspace is created, the section headers are walked to record the text,
  * data and bss sections and to load every library named in a STYP_LIB
  * section through coff_load_file(), text and data/bss are mapped with
  * load_coff_section(), the vmspace size/address fields are filled in and
  * one extra page is mapped directly after the data area.
  *
  * The image vnode is unlocked before the mapping work and locked again at
  * the "fail" label, which is also reached on the normal exit path.
  */
 static int
 exec_coff_imgact(imgp)
 	struct image_params *imgp;
 {
 	const struct filehdr *fhdr = (const struct filehdr*)imgp->image_header;
 	const struct aouthdr *ahdr;
 	const struct scnhdr *scns;
 	int i;
 	struct vmspace *vmspace;
 	int nscns;
 	int error;
 	unsigned long text_offset = 0, text_address = 0, text_size = 0;
 	unsigned long data_offset = 0, data_address = 0, data_size = 0;
 	unsigned long bss_size = 0;
 	caddr_t hole;
 	struct thread *td = curthread;
 
 	if (fhdr->f_magic != I386_COFF ||
 	    !(fhdr->f_flags & F_EXEC)) {
 
 		 DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
 		 return -1;
 	}
 
 	/*
 	 * NOTE(review): this bound counts only the section headers, not the
 	 * filehdr/aouthdr bytes that precede them -- confirm that
 	 * image_header covers the whole section table.
 	 */
 	nscns = fhdr->f_nscns;
 	if ((nscns * sizeof(struct scnhdr)) > PAGE_SIZE) {
 	  	/*
 	   	 * For now, return an error -- need to be able to
 	   	 * read in all of the section structures.
 	   	 */
 
 		DPRINTF(("%s(%d): return -1\n", __FILE__, __LINE__));
 		return -1;
 	}
 
 	ahdr = (const struct aouthdr*)
 	       ((const char*)(imgp->image_header) + sizeof(struct filehdr));
 	imgp->entry_addr = ahdr->entry;
 
 	scns = (const struct scnhdr*)
 	       ((const char*)(imgp->image_header) + sizeof(struct filehdr) +
 		sizeof(struct aouthdr));
 
 	/* From here on every exit goes through "fail" to relock the vnode. */
 	VOP_UNLOCK(imgp->vp, 0, td);
 
-	exec_new_vmspace(imgp, &ibcs2_svr3_sysvec);
+	error = exec_new_vmspace(imgp, &ibcs2_svr3_sysvec);
+	if (error)
+		goto fail;
 	vmspace = imgp->proc->p_vmspace;
 
 	for (i = 0; i < nscns; i++) {
 
 	  DPRINTF(("i = %d, scns[i].s_name = %s, scns[i].s_vaddr = %08lx, "
 		   "scns[i].s_scnptr = %d\n", i, scns[i].s_name,
 		   scns[i].s_vaddr, scns[i].s_scnptr));
 	  if (scns[i].s_flags & STYP_NOLOAD) {
 	    	/*
 	     	 * A section that is not loaded, for whatever
 	     	 * reason.  It takes precedence over other flag
 	     	 * bits...
 	     	 */
 	    	continue;
 	  } else if (scns[i].s_flags & STYP_TEXT) {
 	    	text_address = scns[i].s_vaddr;
 	    	text_size = scns[i].s_size;
 	    	text_offset = scns[i].s_scnptr;
 	  } else if (scns[i].s_flags & STYP_DATA) {
 	    	/* .data section */
 	    	data_address = scns[i].s_vaddr;
 	    	data_size = scns[i].s_size;
 	    	data_offset = scns[i].s_scnptr;
 	  } else if (scns[i].s_flags & STYP_BSS) {
 	    	/* .bss section */
 	    	bss_size = scns[i].s_size;
 	  } else if (scns[i].s_flags & STYP_LIB) {
 	    	/*
 	    	 * Library section: map it into the kernel map and load
 	    	 * each library it names.
 	    	 */
 	    	char *buf = 0;
 	    	int foff = trunc_page(scns[i].s_scnptr);
 	    	int off = scns[i].s_scnptr - foff;
 	    	int len = round_page(scns[i].s_size + PAGE_SIZE);
 	    	int j;
 
 	    	if ((error = vm_mmap(kernel_map,
 				    (vm_offset_t *) &buf,
 				    len,
 				    VM_PROT_READ,
 				    VM_PROT_READ,
 				    0,
 				    OBJT_VNODE,
 				    imgp->vp,
 				    foff)) != 0) {
 	      		error = ENOEXEC;
 			goto fail;
 	    	}
 		if(scns[i].s_size) {
 			char *libbuf;
 			int emul_path_len = strlen(ibcs2_emul_path);
 
 			/* Room for the emulation prefix plus a full path. */
 			libbuf = malloc(MAXPATHLEN + emul_path_len,
 					M_TEMP, M_WAITOK);
 			strcpy(libbuf, ibcs2_emul_path);
 
 			/*
 			 * NOTE(review): nextoff and stroff are read from the
 			 * file; a nextoff of 0 would not advance j -- confirm
 			 * whether these values need validating.
 			 */
 		    	for (j = off; j < scns[i].s_size + off;) {
 				long stroff, nextoff;
 	      			char *libname;
 
 				nextoff = 4 * *(long *)(buf + j);
 				stroff = 4 * *(long *)(buf + j + sizeof(long));
 
 		      		libname = buf + j + stroff;
 		      		j += nextoff;
 
 				DPRINTF(("%s(%d):  shared library %s\n",
 					 __FILE__, __LINE__, libname));
 				/* Try the emulation-prefixed path first, then the plain name. */
 				strlcpy(&libbuf[emul_path_len], libname, MAXPATHLEN);
 /* XXXKSE only 1:1 in coff */  	error = coff_load_file(
 				    FIRST_THREAD_IN_PROC(imgp->proc), libbuf);
 		      		if (error)
 	      				error = coff_load_file(
 					    FIRST_THREAD_IN_PROC(imgp->proc),
 					    libname);
 		      		if (error)
 					break;
 		    	}
 			free(libbuf, M_TEMP);
 		}
 		if (vm_map_remove(kernel_map,
 				  (vm_offset_t) buf,
 				  (vm_offset_t) buf + len))
 	      		panic("exec_coff_imgact vm_map_remove failed");
 	    	if (error)
 	      		goto fail;
 	  	}
 	}
 	/*
 	 * Map in .text now
 	 */
 
 	DPRINTF(("%s(%d):  load_coff_section(vmspace, "
 		"imgp->vp, %08lx, %08lx, 0x%x, 0x%x, 0x%x)\n",
 		__FILE__, __LINE__, text_offset, text_address,
 		text_size, text_size, VM_PROT_READ | VM_PROT_EXECUTE));
 	if ((error = load_coff_section(vmspace, imgp->vp,
 				      text_offset,
 				      (caddr_t)(void *)(uintptr_t)text_address,
 				      text_size, text_size,
 				      VM_PROT_READ | VM_PROT_EXECUTE)) != 0) {
 		DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
 		goto fail;
        	}
 	/*
 	 * Map in .data and .bss now
 	 */
 
 
 	DPRINTF(("%s(%d): load_coff_section(vmspace, "
 		"imgp->vp, 0x%08lx, 0x%08lx, 0x%x, 0x%x, 0x%x)\n",
 		__FILE__, __LINE__, data_offset, data_address,
 		data_size + bss_size, data_size, VM_PROT_ALL));
 	if ((error = load_coff_section(vmspace, imgp->vp,
 				      data_offset,
 				      (caddr_t)(void *)(uintptr_t)data_address,
 				      data_size + bss_size, data_size,
 				      VM_PROT_ALL)) != 0) {
 
 		DPRINTF(("%s(%d): error = %d\n", __FILE__, __LINE__, error));
 		goto fail;
 	}
 
 	imgp->interpreted = 0;
 	imgp->proc->p_sysent = &ibcs2_svr3_sysvec;
 
 	/* Sizes are recorded in pages, addresses as given by the sections. */
 	vmspace->vm_tsize = round_page(text_size) >> PAGE_SHIFT;
 	vmspace->vm_dsize = round_page(data_size + bss_size) >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)text_address;
 	vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)data_address;
 
 	/* First page boundary after the data area. */
 	hole = (caddr_t)trunc_page((vm_offset_t)vmspace->vm_daddr) + ctob(vmspace->vm_dsize);
 
 
 	DPRINTF(("%s(%d): vm_map_find(&vmspace->vm_map, NULL, 0, &0x%08lx, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0)\n",
 		__FILE__, __LINE__, hole));
         DPRINTF(("imgact: error = %d\n", error));
 
 	/* The result of this call is what the function returns. */
 	error = vm_map_find(&vmspace->vm_map, NULL, 0,
 			    (vm_offset_t *) &hole, PAGE_SIZE, FALSE,
 				VM_PROT_ALL, VM_PROT_ALL, 0);
 
 	DPRINTF(("IBCS2: start vm_dsize = 0x%x, vm_daddr = 0x%x end = 0x%x\n",
 		ctob(vmspace->vm_dsize), vmspace->vm_daddr,
 		ctob(vmspace->vm_dsize) + vmspace->vm_daddr ));
 	DPRINTF(("%s(%d):  returning successfully!\n", __FILE__, __LINE__));
 
 fail:
 	/* Reacquire the vnode lock that was dropped before the mapping work. */
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	return error;
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw coff_execsw = { exec_coff_imgact, "coff" };
 EXEC_SET(coff, coff_execsw);
Index: head/sys/i386/linux/imgact_linux.c
===================================================================
--- head/sys/i386/linux/imgact_linux.c	(revision 173360)
+++ head/sys/i386/linux/imgact_linux.c	(revision 173361)
@@ -1,245 +1,247 @@
 /*-
  * Copyright (c) 1994-1996 S�ren Schmidt
  * All rights reserved.
  *
  * Based heavily on /sys/kern/imgact_aout.c which is:
  * Copyright (c) 1993, David Greenman
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 
 #include <i386/linux/linux.h>
 
 static int	exec_linux_imgact(struct image_params *iparams);
 
 /*
  * Image activator for Linux a.out executables.
  *
  * Returns -1 when the magic number or the header fields do not describe
  * an image handled here, EFAULT when text plus data exceed the file size
  * and ENOMEM when the text or data limits are exceeded.  Otherwise a new
  * vmspace is created and the image is either copied in (file offset not
  * page aligned) or mapped from the vnode (page aligned), after which the
  * vmspace and image_params fields are filled in.
  *
  * The image vnode is unlocked before the new vmspace is created and
  * locked again at the "fail" label, which is also the normal exit path.
  */
 static int
 exec_linux_imgact(struct image_params *imgp)
 {
     const struct exec *a_out = (const struct exec *) imgp->image_header;
     struct vmspace *vmspace;
     vm_offset_t vmaddr;
     unsigned long virtual_offset, file_offset;
     vm_offset_t buffer;
     unsigned long bss_size;
     struct thread *td = curthread;
     int error;
 
     /* Bits 16-23 of the magic must be 0x64 for this activator. */
     if (((a_out->a_magic >> 16) & 0xff) != 0x64)
 	return -1;
 
     /*
      * Set file/virtual offset based on a.out variant.
      */
     switch ((int)(a_out->a_magic & 0xffff)) {
     case 0413:
 	virtual_offset = 0;
 	file_offset = 1024;
 	break;
     case 0314:
 	virtual_offset = 4096;
 	file_offset = 0;
 	break;
     default:
 	return (-1);
     }
     bss_size = round_page(a_out->a_bss);
 #ifdef DEBUG
     printf("imgact: text: %08lx, data: %08lx, bss: %08lx\n",
 	(u_long)a_out->a_text, (u_long)a_out->a_data, bss_size);
 #endif
 
     /*
      * Check various fields in header for validity/bounds.
      */
     if (a_out->a_entry < virtual_offset ||
 	a_out->a_entry >= virtual_offset + a_out->a_text ||
 	a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
 	return (-1);
 
     /* text + data can't exceed file size */
     if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
 	return (EFAULT);
     /*
      * text/data/bss must not exceed limits
      */
     PROC_LOCK(imgp->proc);
     if (a_out->a_text > maxtsiz ||
 	a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) {
 	PROC_UNLOCK(imgp->proc);
 	return (ENOMEM);
     }
     PROC_UNLOCK(imgp->proc);
 
     /* From here on every exit goes through "fail" to relock the vnode. */
     VOP_UNLOCK(imgp->vp, 0, td);
 
     /*
      * Destroy old process VM and create a new one (with a new stack)
      */
-    exec_new_vmspace(imgp, &linux_sysvec);
+    error = exec_new_vmspace(imgp, &linux_sysvec);
+    if (error)
+	    goto fail;
     vmspace = imgp->proc->p_vmspace;
 
     /*
      * Check if file_offset page aligned,.
      * Currently we cannot handle misaligned file offsets,
      * and so we read in the entire image (what a waste).
      */
     if (file_offset & PAGE_MASK) {
 #ifdef DEBUG
 	printf("imgact: Non page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data+bss read/write/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
 			    a_out->a_text + a_out->a_data + bss_size, FALSE,
 			    VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error)
 	    goto fail;
 
 	/*
 	 * Map the file into the kernel map so it can be copied out.
 	 * NOTE(review): buffer has not been assigned before its address is
 	 * passed here -- confirm vm_mmap() does not read it as a hint.
 	 */
 	error = vm_mmap(kernel_map, &buffer,
 			round_page(a_out->a_text + a_out->a_data + file_offset),
 			VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE,
 			imgp->vp, trunc_page(file_offset));
 	if (error)
 	    goto fail;
 
 	error = copyout((void *)(uintptr_t)(buffer + file_offset),
 			(void *)vmaddr, a_out->a_text + a_out->a_data);
 
 	/* The kernel mapping is removed before the copyout result is checked. */
 	vm_map_remove(kernel_map, buffer,
 		      buffer + round_page(a_out->a_text + a_out->a_data + file_offset));
 
 	if (error)
 	    goto fail;
 
 	/*
 	 * remove write enable on the 'text' part
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr,
 			       vmaddr + a_out->a_text,
 			       VM_PROT_EXECUTE|VM_PROT_READ,
 			       TRUE);
 	if (error)
 	    goto fail;
     }
     else {
 #ifdef DEBUG
 	printf("imgact: Page aligned binary %lu\n", file_offset);
 #endif
 	/*
 	 * Map text+data read/execute
 	 */
 	vmaddr = virtual_offset;
 	error = vm_mmap(&vmspace->vm_map, &vmaddr,
 			a_out->a_text + a_out->a_data,
 			VM_PROT_READ | VM_PROT_EXECUTE,
 			VM_PROT_ALL,
 			MAP_PRIVATE | MAP_FIXED,
 			OBJT_VNODE,
 			imgp->vp, file_offset);
 	if (error)
 	    goto fail;
 
 #ifdef DEBUG
 	printf("imgact: startaddr=%08lx, length=%08lx\n",
 	    (u_long)vmaddr, (u_long)a_out->a_text + (u_long)a_out->a_data);
 #endif
 	/*
 	 * allow read/write of data
 	 */
 	error = vm_map_protect(&vmspace->vm_map,
 			       vmaddr + a_out->a_text,
 			       vmaddr + a_out->a_text + a_out->a_data,
 			       VM_PROT_ALL,
 			       FALSE);
 	if (error)
 	    goto fail;
 
 	/*
 	 * Allocate anon demand-zeroed area for uninitialized data
 	 */
 	if (bss_size != 0) {
 	    vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
 	    error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr,
 				bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 	    if (error)
 		goto fail;
 #ifdef DEBUG
 	    printf("imgact: bssaddr=%08lx, length=%08lx\n",
 		(u_long)vmaddr, bss_size);
 #endif
 
 	}
 	/* Indicate that this file should not be modified */
 	mp_fixme("Unlocked v_flag access");
 	imgp->vp->v_vflag |= VV_TEXT;
     }
     /* Fill in process VM information */
     vmspace->vm_tsize = round_page(a_out->a_text) >> PAGE_SHIFT;
     vmspace->vm_dsize = round_page(a_out->a_data + bss_size) >> PAGE_SHIFT;
     vmspace->vm_taddr = (caddr_t)(void *)(uintptr_t)virtual_offset;
     vmspace->vm_daddr = (caddr_t)(void *)(uintptr_t)
 	(virtual_offset + a_out->a_text);
 
     /* Fill in image_params */
     imgp->interpreted = 0;
     imgp->entry_addr = a_out->a_entry;
 
     imgp->proc->p_sysent = &linux_sysvec;
 
 fail:
     /* Reacquire the vnode lock dropped above; error is 0 on success. */
     vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
     return (error);
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw linux_execsw = { exec_linux_imgact, "linux a.out" };
 EXEC_SET(linuxaout, linux_execsw);
Index: head/sys/ia64/ia64/machdep.c
===================================================================
--- head/sys/ia64/ia64/machdep.c	(revision 173360)
+++ head/sys/ia64/ia64/machdep.c	(revision 173361)
@@ -1,1532 +1,1532 @@
 /*-
  * Copyright (c) 2003,2004 Marcel Moolenaar
  * Copyright (c) 2000,2001 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/msgbuf.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/random.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/uio.h>
 #include <sys/uuid.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <ddb/ddb.h>
 
 #include <net/netisr.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #include <machine/bootinfo.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/efi.h>
 #include <machine/elf.h>
 #include <machine/fpu.h>
 #include <machine/mca.h>
 #include <machine/md_var.h>
 #include <machine/mutex.h>
 #include <machine/pal.h>
 #include <machine/pcb.h>
 #include <machine/reg.h>
 #include <machine/sal.h>
 #include <machine/sigframe.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/unwind.h>
 #include <machine/vmparam.h>
 
 #include <i386/include/specialreg.h>
 
 u_int64_t processor_frequency;
 u_int64_t bus_frequency;
 u_int64_t itc_frequency;
 int cold = 1;
 
 u_int64_t pa_bootinfo;
 struct bootinfo bootinfo;
 
 struct pcpu pcpu0;
 extern char kstack[]; 
 vm_offset_t proc0kstack;
 
 extern u_int64_t kernel_text[], _end[];
 
 extern u_int64_t ia64_gateway_page[];
 extern u_int64_t break_sigtramp[];
 extern u_int64_t epc_sigtramp[];
 
 struct fpswa_iface *fpswa_iface;
 
 u_int64_t ia64_pal_base;
 u_int64_t ia64_port_base;
 
 char machine[] = MACHINE;
 SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");
 
 static char cpu_model[64];
 SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0,
     "The CPU model name");
 
 static char cpu_family[64];
 SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0,
     "The CPU family name");
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 static void cpu_startup(void *);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 struct msgbuf *msgbufp=0;
 
 long Maxmem = 0;
 long realmem = 0;
 
 #define	PHYSMAP_SIZE	(2 * VM_PHYSSEG_MAX)
 
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)
 
 void mi_startup(void);		/* XXX should be in a MI header */
 
 struct kva_md_info kmi;
 
 #define	Mhz	1000000L
 #define	Ghz	(1000L*Mhz)
 
 /* Prototype for the definition that follows. */
 void setPQL2(int *const size, int *const ways);
 
 /*
  * Empty function: returns immediately and never writes through size or
  * ways.
  */
 void
 setPQL2(int *const size, int *const ways)
 {
 	return;
 }
 
 /*
  * Identify the processor from its CPUID registers.
  *
  * The family and model names are stored in cpu_family and cpu_model,
  * and a summary with model, clock frequency (when known), family, vendor
  * string, revision and feature bits is printed.
  */
 static void
 identifycpu(void)
 {
 	char vendor[17];
 	char *family_name, *model_name;
 	u_int64_t features, tmp;
 	int revision, model, family;
 
 	/*
 	 * CPUID registers 0 and 1 hold the vendor string.  They are copied
 	 * with memcpy() instead of being stored through a u_int64_t
 	 * pointer, because a char array carries no 8-byte alignment
 	 * guarantee.
 	 *
 	 * Assumes little-endian.
 	 */
 	tmp = ia64_get_cpuid(0);
 	memcpy(&vendor[0], &tmp, sizeof(tmp));
 	tmp = ia64_get_cpuid(1);
 	memcpy(&vendor[8], &tmp, sizeof(tmp));
 	vendor[16] = '\0';
 
 	/* CPUID register 3 packs one 8-bit field per byte. */
 	tmp = ia64_get_cpuid(3);
 	revision = (tmp >> 8) & 0xff;
 	model = (tmp >> 16) & 0xff;
 	family = (tmp >> 24) & 0xff;
 
 	family_name = model_name = "unknown";
 	switch (family) {
 	case 0x07:
 		family_name = "Itanium";
 		model_name = "Merced";
 		break;
 	case 0x1f:
 		family_name = "Itanium 2";
 		switch (model) {
 		case 0x00:
 			model_name = "McKinley";
 			break;
 		case 0x01:
 			/*
 			 * Deerfield is a low-voltage variant based on the
 			 * Madison core. We need circumstantial evidence
 			 * (i.e. the clock frequency) to identify those.
 			 * Allow for roughly 1% error margin.
 			 */
 			tmp = processor_frequency >> 7;
 			if ((processor_frequency - tmp) < 1*Ghz &&
 			    (processor_frequency + tmp) >= 1*Ghz)
 				model_name = "Deerfield";
 			else
 				model_name = "Madison";
 			break;
 		case 0x02:
 			model_name = "Madison II";
 			break;
 		}
 		break;
 	case 0x20:
 		family_name = "Itanium 2";
 		switch (model) {
 		case 0x00:
 			model_name = "Montecito";
 			break;
 		}
 		break;
 	}
 	snprintf(cpu_family, sizeof(cpu_family), "%s", family_name);
 	snprintf(cpu_model, sizeof(cpu_model), "%s", model_name);
 
 	features = ia64_get_cpuid(4);
 
 	printf("CPU: %s (", model_name);
 	if (processor_frequency) {
 		/* Whole MHz, then two decimals, after adding 4999 Hz. */
 		printf("%ld.%02ld-Mhz ",
 		    (processor_frequency + 4999) / Mhz,
 		    ((processor_frequency + 4999) / (Mhz/100)) % 100);
 	}
 	printf("%s)\n", family_name);
 	printf("  Origin = \"%s\"  Revision = %d\n", vendor, revision);
 	printf("  Features = 0x%b\n", (u_int32_t) features,
 	    "\020"
 	    "\001LB"	/* long branch (brl) instruction. */
 	    "\002SD"	/* Spontaneous deferral. */
 	    "\003AO"	/* 16-byte atomic operations (ld, st, cmpxchg). */ );
 }
 
 /*
  * Startup hook registered through SYSINIT above: identifies the CPU,
  * reports the memory sizes, initialises the kernel submaps and buffers,
  * and probes the SAPICs and the MCA support.  The argument is not used.
  */
 static void
 cpu_startup(void *dummy)
 {
 	int chunk;
 	long bytes;
 
 	/* Say hello by announcing the processor. */
 	identifycpu();
 
 	/* startrtclock(); */
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	printf("real memory  = %ld (%ld MB)\n", ia64_ptob(Maxmem),
 	    ia64_ptob(Maxmem) / 1048576);
 	realmem = Maxmem;
 
 	/*
 	 * With a verbose boot, list each [start, end) pair of phys_avail
 	 * up to the terminating zero entry.
 	 */
 	if (bootverbose) {
 		printf("Physical memory chunk(s):\n");
 		chunk = 0;
 		while (phys_avail[chunk + 1] != 0) {
 			bytes = phys_avail[chunk + 1] - phys_avail[chunk];
 
 			printf("0x%08lx - 0x%08lx, %ld bytes (%ld pages)\n",
 			    phys_avail[chunk], phys_avail[chunk + 1] - 1,
 			    bytes, bytes >> PAGE_SHIFT);
 			chunk += 2;
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
 	    ptoa(cnt.v_free_count) / 1048576);
 
 	if (fpswa_iface != NULL)
 		printf("FPSWA Revision = 0x%lx, Entry = %p\n",
 		    (long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa);
 	else
 		printf("Warning: no FPSWA package supplied\n");
 
 	/*
 	 * Buffers come next so that disk labels can be read.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	/*
 	 * Walk the MADT for IOSAPIC and Local SAPIC information, then
 	 * bring up machine check handling.
 	 */
 	ia64_probe_sapics();
 	ia64_mca_init();
 }
 
 /*
  * Reset the system by calling efi_reset_system().  The howto argument is
  * not examined.
  */
 void
 cpu_boot(int howto)
 {
 
 	efi_reset_system();
 }
 
 /*
  * Report the clock frequency for the given cpu id.
  *
  * Fails with EINVAL when pcpu_find() knows no such CPU or when rate is
  * NULL; otherwise stores processor_frequency in *rate and returns 0.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 
 	if (pcpu_find(cpu_id) == NULL)
 		return (EINVAL);
 	if (rate == NULL)
 		return (EINVAL);
 
 	*rate = processor_frequency;
 	return (0);
 }
 
 /*
  * Halt request: implemented by calling efi_reset_system(), the same call
  * cpu_boot() makes.
  */
 void
 cpu_halt()
 {
 
 	efi_reset_system();
 }
 
 /*
  * Default idle routine: issues the PAL_HALT_LIGHT call through
  * ia64_call_pal_static().  The returned result is stored but not
  * examined.
  */
 static void
 cpu_idle_default(void)
 {
 	struct ia64_pal_result res;
 
 	res = ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
 }
 
 /*
  * Idle entry point: calls whatever routine cpu_idle_hook currently
  * points to.
  */
 void
 cpu_idle()
 {
 	(*cpu_idle_hook)();
 }
 
 /* Other subsystems (e.g., ACPI) can hook this later. */
 void (*cpu_idle_hook)(void) = cpu_idle_default;
 
 /*
  * Reset the machine by calling cpu_boot() with a howto of 0.
  */
 void
 cpu_reset()
 {
 
 	cpu_boot(0);
 }
 
 /*
  * Switch from thread old to thread new.
  *
  * The context of old is saved in its pcb with savectx(); when that call
  * returns zero the pmap, curthread and context of new are installed and
  * restorectx() is entered, which is not expected to return.  The mtx
  * argument is not used in this function.
  *
  * NOTE(review): presumably savectx() returns non-zero when the saved
  * context is resumed later, which is how this function returns to its
  * caller -- confirm against savectx().
  */
 void
 cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx)
 {
 	struct pcb *oldpcb, *newpcb;
 
 	oldpcb = old->td_pcb;
 #ifdef COMPAT_IA32
 	ia32_savectx(oldpcb);
 #endif
 	/* Set IA64_PSR_DFH in old's saved psr while it is the FP owner. */
 	if (PCPU_GET(fpcurthread) == old)
 		old->td_frame->tf_special.psr |= IA64_PSR_DFH;
 	if (!savectx(oldpcb)) {
 		newpcb = new->td_pcb;
 		/* Remember the pmap that was active in old's pcb. */
 		oldpcb->pcb_current_pmap =
 		    pmap_switch(newpcb->pcb_current_pmap);
 		PCPU_SET(curthread, new);
 #ifdef COMPAT_IA32
 		ia32_restorectx(newpcb);
 #endif
 		/* Clear IA64_PSR_DFH again if new is the FP owner. */
 		if (PCPU_GET(fpcurthread) == new)
 			new->td_frame->tf_special.psr &= ~IA64_PSR_DFH;
 		restorectx(newpcb);
 		/* We should not get here. */
 		panic("cpu_switch: restorectx() returned");
 		/* NOTREACHED */
 	}
 }
 
 /*
  * Switch to thread 'new' without saving any context for the outgoing
  * thread ('old' is unused).  Activates the new pmap, updates curthread and
  * calls restorectx(), which is not expected to return.
  */
 void
 cpu_throw(struct thread *old __unused, struct thread *new)
 {
 	struct pcb *newpcb;
 
 	newpcb = new->td_pcb;
 	/* The previously active pmap is of no interest here. */
 	(void)pmap_switch(newpcb->pcb_current_pmap);
 	PCPU_SET(curthread, new);
 #ifdef COMPAT_IA32
 	ia32_restorectx(newpcb);
 #endif
 	restorectx(newpcb);
 	/* We should not get here. */
 	panic("cpu_throw: restorectx() returned");
 	/* NOTREACHED */
 }
 
 /*
  * Machine-dependent part of per-CPU data initialisation: record 'cpuid'
  * as the CPU's pc_acpi_id.  The 'size' argument is not used.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 	(void)size;
 	pcpu->pc_acpi_id = cpuid;
 }
 
 /*
  * Enter a spinlock section for the current thread.
  *
  * On the outermost entry (nesting count is zero) interrupts are disabled
  * and the previous interrupt state is remembered in the thread's MD area.
  * The nesting count is then bumped and a critical section is entered.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	/* Only the outermost entry saves and disables interrupts. */
 	if (td->td_md.md_spinlock_count == 0)
 		td->td_md.md_saved_intr = intr_disable();
 	td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread; the mirror image of
  * spinlock_enter().  The critical section is exited first, the nesting
  * count is dropped, and when it reaches zero the interrupt state saved by
  * the outermost spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	critical_exit();
 	td->td_md.md_spinlock_count--;
 	/* Outermost exit: put back the saved interrupt state. */
 	if (td->td_md.md_spinlock_count == 0)
 		intr_restore(td->td_md.md_saved_intr);
 }
 
 /*
  * Insert a translation for the address 'vhpt' into data translation
  * register 2.
  *
  * The PTE is kernel read/write, write-back, with the physical page number
  * taken from 'vhpt' via PTE_PPN_MASK; the page size is IA64_ID_PAGE_SHIFT.
  * Any existing data translation for the address is purged first.
  */
 void
 map_vhpt(uintptr_t vhpt)
 {
 	pt_entry_t pte;
 	uint64_t psr;
 
 	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
 	    PTE_PL_KERN | PTE_AR_RW;
 	pte |= vhpt & PTE_PPN_MASK;
 
 	/* Purge any data translation register entry covering the range. */
 	__asm __volatile("ptr.d %0,%1" :: "r"(vhpt),
 	    "r"(IA64_ID_PAGE_SHIFT<<2));
 
 	/* Save PSR, then clear psr.ic and psr.i around the insertion. */
 	__asm __volatile("mov   %0=psr" : "=r"(psr));
 	__asm __volatile("rsm   psr.ic|psr.i");
 	ia64_srlz_i();
 	ia64_set_ifa(vhpt);
 	ia64_set_itir(IA64_ID_PAGE_SHIFT << 2);
 	ia64_srlz_d();
 	__asm __volatile("itr.d dtr[%0]=%1" :: "r"(2), "r"(pte));
 	/* Restore the saved PSR (lower half) and serialize. */
 	__asm __volatile("mov   psr.l=%0" :: "r" (psr));
 	ia64_srlz_i();
 }
 
 /*
  * Map the PAL code with translation register 1, for both data and
  * instruction accesses.
  *
  * Does nothing when ia64_pal_base is zero.  Otherwise the region-7
  * address of ia64_pal_base is mapped kernel read/write/execute,
  * write-back, with page size IA64_ID_PAGE_SHIFT, after purging any
  * existing data and instruction translations for that address.
  */
 void
 map_pal_code(void)
 {
 	pt_entry_t pte;
 	uint64_t psr;
 
 	if (ia64_pal_base == 0)
 		return;
 
 	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
 	    PTE_PL_KERN | PTE_AR_RWX;
 	pte |= ia64_pal_base & PTE_PPN_MASK;
 
 	/* Purge existing data and instruction translations. */
 	__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" ::
 	    "r"(IA64_PHYS_TO_RR7(ia64_pal_base)), "r"(IA64_ID_PAGE_SHIFT<<2));
 
 	/* Save PSR, then clear psr.ic and psr.i around the insertions. */
 	__asm __volatile("mov	%0=psr" : "=r"(psr));
 	__asm __volatile("rsm	psr.ic|psr.i");
 	ia64_srlz_i();
 	ia64_set_ifa(IA64_PHYS_TO_RR7(ia64_pal_base));
 	ia64_set_itir(IA64_ID_PAGE_SHIFT << 2);
 	ia64_srlz_d();
 	__asm __volatile("itr.d	dtr[%0]=%1" :: "r"(1), "r"(pte));
 	ia64_srlz_d();
 	__asm __volatile("itr.i	itr[%0]=%1" :: "r"(1), "r"(pte));
 	/* Restore the saved PSR (lower half) and serialize. */
 	__asm __volatile("mov	psr.l=%0" :: "r" (psr));
 	ia64_srlz_i();
 }
 
 /*
  * Map the gateway page at VM_MAX_ADDRESS using translation register 3,
  * for both data and instruction accesses, and publish that address in
  * ar.k5.
  *
  * The PTE uses the PTE_AR_X_RX access rights and a page size of
  * PAGE_SHIFT; the physical page number comes from ia64_gateway_page via
  * PTE_PPN_MASK.
  */
 void
 map_gateway_page(void)
 {
 	pt_entry_t pte;
 	uint64_t psr;
 
 	pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
 	    PTE_PL_KERN | PTE_AR_X_RX;
 	pte |= (uint64_t)ia64_gateway_page & PTE_PPN_MASK;
 
 	/* Purge existing data and instruction translations. */
 	__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" ::
 	    "r"(VM_MAX_ADDRESS), "r"(PAGE_SHIFT << 2));
 
 	/* Save PSR, then clear psr.ic and psr.i around the insertions. */
 	__asm __volatile("mov	%0=psr" : "=r"(psr));
 	__asm __volatile("rsm	psr.ic|psr.i");
 	ia64_srlz_i();
 	ia64_set_ifa(VM_MAX_ADDRESS);
 	ia64_set_itir(PAGE_SHIFT << 2);
 	ia64_srlz_d();
 	__asm __volatile("itr.d	dtr[%0]=%1" :: "r"(3), "r"(pte));
 	ia64_srlz_d();
 	__asm __volatile("itr.i	itr[%0]=%1" :: "r"(3), "r"(pte));
 	/* Restore the saved PSR (lower half) and serialize. */
 	__asm __volatile("mov	psr.l=%0" :: "r" (psr));
 	ia64_srlz_i();
 
 	/* Expose the mapping to userland in ar.k5 */
 	ia64_set_k5(VM_MAX_ADDRESS);
 }
 
 /*
  * Each PAL_FREQ_RATIOS result word is handled as a ratio: numerator in
  * the upper 32 bits, denominator in the lower 32 bits.
  */
 #define	FREQ_RATIO_NUM(r)	((r) >> 32)
 #define	FREQ_RATIO_DEN(r)	((r) & ((1L << 32) - 1))
 
 /*
  * Compute processor_frequency, bus_frequency and itc_frequency from the
  * platform base clock (SAL_FREQ_BASE) and the ratios reported by
  * PAL_FREQ_RATIOS.
  *
  * Nothing is changed when either firmware call reports a non-zero status.
  * A ratio whose denominator is zero is skipped, leaving the corresponding
  * frequency variable untouched, instead of dividing by zero.
  */
 static void
 calculate_frequencies(void)
 {
 	struct ia64_sal_result sal;
 	struct ia64_pal_result pal;
 
 	sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
 	pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0);
 
 	if (sal.sal_status != 0 || pal.pal_status != 0)
 		return;
 
 	if (bootverbose) {
 		printf("Platform clock frequency %ld Hz\n",
 		       sal.sal_result[0]);
 		printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, "
 		       "ITC ratio %ld/%ld\n",
 		       FREQ_RATIO_NUM(pal.pal_result[0]),
 		       FREQ_RATIO_DEN(pal.pal_result[0]),
 		       FREQ_RATIO_NUM(pal.pal_result[1]),
 		       FREQ_RATIO_DEN(pal.pal_result[1]),
 		       FREQ_RATIO_NUM(pal.pal_result[2]),
 		       FREQ_RATIO_DEN(pal.pal_result[2]));
 	}
 
 	/* Guard every division: firmware may hand back a zero denominator. */
 	if (FREQ_RATIO_DEN(pal.pal_result[0]) != 0)
 		processor_frequency =
 			sal.sal_result[0] * FREQ_RATIO_NUM(pal.pal_result[0])
 			/ FREQ_RATIO_DEN(pal.pal_result[0]);
 	if (FREQ_RATIO_DEN(pal.pal_result[1]) != 0)
 		bus_frequency =
 			sal.sal_result[0] * FREQ_RATIO_NUM(pal.pal_result[1])
 			/ FREQ_RATIO_DEN(pal.pal_result[1]);
 	if (FREQ_RATIO_DEN(pal.pal_result[2]) != 0)
 		itc_frequency =
 			sal.sal_result[0] * FREQ_RATIO_NUM(pal.pal_result[2])
 			/ FREQ_RATIO_DEN(pal.pal_result[2]);
 }
 
 /*
  * Early machine-dependent initialisation of the bootstrap processor.
  *
  * In order: copy the loader's bootinfo block, scan the EFI memory map for
  * the I/O port and PAL code descriptors, pick up the preload metadata,
  * kernel environment and boot flags, set up pcpu0 and the console, wire
  * up firmware access (PAL mapping, EFI, SAL), fill phys_avail[] from the
  * free EFI memory descriptors while carving out the kernel image, set up
  * proc0/thread0, bootstrap the pmap and finally call mi_startup().
  *
  * Not expected to return: if mi_startup() comes back, or savectx()
  * returns non-zero, the function panics.
  *
  * NOTE(review): the DEBUG_MD blocks reference 'mdcount', 'mp' and 'i',
  * none of which is declared in this function -- verify they exist at file
  * scope before building with DEBUG_MD.
  */
 void
 ia64_init(void)
 {
 	int phys_avail_cnt;
 	vm_offset_t kernstart, kernend;
 	vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1;
 	char *p;
 	struct efi_md *md;
 	int metadata_missing;
 
 	/* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */
 
 	/*
 	 * TODO: Disable interrupts, floating point etc.
 	 * Maybe flush cache and tlb
 	 */
 	ia64_set_fpsr(IA64_FPSR_DEFAULT);
 
 	/*
 	 * TODO: Get critical system information (if possible, from the
 	 * information provided by the boot program).
 	 */
 
 	/*
 	 * pa_bootinfo is the physical address of the bootinfo block as
 	 * passed to us by the loader and set in locore.s.
 	 */
 	bootinfo = *(struct bootinfo *)(IA64_PHYS_TO_RR7(pa_bootinfo));
 
 	/* Unrecognised bootinfo: discard it and fall back to _end. */
 	if (bootinfo.bi_magic != BOOTINFO_MAGIC || bootinfo.bi_version != 1) {
 		bzero(&bootinfo, sizeof(bootinfo));
 		bootinfo.bi_kernend = (vm_offset_t) round_page(_end);
 	}
 
 	/*
 	 * Look for the I/O ports first - we need them for console
 	 * probing.
 	 */
 	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
 		switch (md->md_type) {
 		case EFI_MD_TYPE_IOPORT:
 			ia64_port_base = IA64_PHYS_TO_RR6(md->md_phys);
 			break;
 		case EFI_MD_TYPE_PALCODE:
 			ia64_pal_base = md->md_phys;
 			break;
 		}
 	}
 
 	/* The warning is deferred until console output is allowed. */
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep)
 		preload_metadata = (caddr_t)bootinfo.bi_modulep;
 	else
 		metadata_missing = 1;
 
 	if (envmode == 0 && bootinfo.bi_envp)
 		kern_envp = (caddr_t)bootinfo.bi_envp;
 	else
 		kern_envp = static_env;
 
 	/*
 	 * Look at arguments passed to us and compute boothowto.
 	 */
 	boothowto = bootinfo.bi_boothowto;
 
 	/*
 	 * Catch case of boot_verbose set in environment.
 	 */
 	if ((p = getenv("boot_verbose")) != NULL) {
 		if (strcmp(p, "yes") == 0 || strcmp(p, "YES") == 0) {
 			boothowto |= RB_VERBOSE;
 		}
 		freeenv(p);
 	}
 
 	if (boothowto & RB_VERBOSE)
 		bootverbose = 1;
 
 	/*
 	 * Setup the PCPU data for the bootstrap processor. It is needed
 	 * by printf(). Also, since printf() has critical sections, we
 	 * need to initialize at least pc_curthread.
 	 */
 	pcpup = &pcpu0;
 	ia64_set_k4((u_int64_t)pcpup);
 	pcpu_init(pcpup, 0, sizeof(pcpu0));
 	PCPU_SET(curthread, &thread0);
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 	/* OUTPUT NOW ALLOWED */
 
 	if (ia64_pal_base != 0) {
 		ia64_pal_base &= ~IA64_ID_PAGE_MASK;
 		/*
 		 * We use a TR to map the first 256M of memory - this might
 		 * cover the palcode too.
 		 */
 		if (ia64_pal_base == 0)
 			printf("PAL code mapped by the kernel's TR\n");
 	} else
 		printf("PAL code not found\n");
 
 	/*
 	 * Wire things up so we can call the firmware.
 	 */
 	map_pal_code();
 	efi_boot_minimal(bootinfo.bi_systab);
 	ia64_sal_init();
 	calculate_frequencies();
 
 	/*
 	 * Find the beginning and end of the kernel.
 	 */
 	kernstart = trunc_page(kernel_text);
 #ifdef DDB
 	ksym_start = bootinfo.bi_symtab;
 	ksym_end = bootinfo.bi_esymtab;
 	kernend = (vm_offset_t)round_page(ksym_end);
 #else
 	kernend = (vm_offset_t)round_page(_end);
 #endif
 
 	/* But if the bootstrap tells us otherwise, believe it! */
 	if (bootinfo.bi_kernend)
 		kernend = round_page(bootinfo.bi_kernend);
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 	/* Get FPSWA interface */
 	fpswa_iface = (bootinfo.bi_fpswa == 0) ? NULL :
 	    (struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo.bi_fpswa);
 
 	/* Init basic tunables, including hz */
 	init_param1();
 
 	p = getenv("kernelname");
 	if (p) {
 		strncpy(kernelname, p, sizeof(kernelname) - 1);
 		freeenv(p);
 	}
 
 	kernstartpfn = atop(IA64_RR_MASK(kernstart));
 	kernendpfn = atop(IA64_RR_MASK(kernend));
 
 	/*
 	 * Size the memory regions and load phys_avail[] with the results.
 	 */
 
 	/*
 	 * Find out how much memory is available, by looking at
 	 * the memory descriptors.
 	 */
 
 #ifdef DEBUG_MD
 	printf("Memory descriptor count: %d\n", mdcount);
 #endif
 
 	phys_avail_cnt = 0;
 	for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
 #ifdef DEBUG_MD
 		printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md,
 		    md->md_type, md->md_phys, md->md_pages);
 #endif
 
 		/* Skip descriptors that do not cover a whole page. */
 		pfn0 = ia64_btop(round_page(md->md_phys));
 		pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096));
 		if (pfn1 <= pfn0)
 			continue;
 
 		if (md->md_type != EFI_MD_TYPE_FREE)
 			continue;
 
 		/*
 		 * We have a memory descriptor that describes conventional
 		 * memory that is for general use. We must determine if the
 		 * loader has put the kernel in this region.
 		 */
 		physmem += (pfn1 - pfn0);
 		if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) {
 			/*
 			 * Must compute the location of the kernel
 			 * within the segment.
 			 */
 #ifdef DEBUG_MD
 			printf("Descriptor %p contains kernel\n", mp);
 #endif
 			if (pfn0 < kernstartpfn) {
 				/*
 				 * There is a chunk before the kernel.
 				 */
 #ifdef DEBUG_MD
 				printf("Loading chunk before kernel: "
 				       "0x%lx / 0x%lx\n", pfn0, kernstartpfn);
 #endif
 				phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
 				phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn);
 				phys_avail_cnt += 2;
 			}
 			if (kernendpfn < pfn1) {
 				/*
 				 * There is a chunk after the kernel.
 				 */
 #ifdef DEBUG_MD
 				printf("Loading chunk after kernel: "
 				       "0x%lx / 0x%lx\n", kernendpfn, pfn1);
 #endif
 				phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn);
 				phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
 				phys_avail_cnt += 2;
 			}
 		} else {
 			/*
 			 * Just load this cluster as one chunk.
 			 */
 #ifdef DEBUG_MD
 			printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i,
 			       pfn0, pfn1);
 #endif
 			phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
 			phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
 			phys_avail_cnt += 2;
 			
 		}
 	}
 	/* Terminate the list of (start, end) pairs. */
 	phys_avail[phys_avail_cnt] = 0;
 
 	Maxmem = physmem;
 	init_param2(physmem);
 
 	/*
 	 * Initialize error message buffer (at end of core).
 	 */
 	msgbufp = (struct msgbuf *)pmap_steal_memory(MSGBUF_SIZE);
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	/*
 	 * Init mapping for kernel stack for proc 0
 	 */
 	proc0kstack = (vm_offset_t)kstack;
 	thread0.td_kstack = proc0kstack;
 	thread0.td_kstack_pages = KSTACK_PAGES;
 
 	mutex_init();
 
 	/*
 	 * Initialize the rest of proc 0's PCB.
 	 *
 	 * Set the kernel sp, reserving space for an (empty) trapframe,
 	 * and make proc0's trapframe pointer point to it for sanity.
 	 * Initialise proc0's backing store to start after u area.
 	 */
 	cpu_thread_setup(&thread0);
 	thread0.td_frame->tf_flags = FRAME_SYSCALL;
 	thread0.td_pcb->pcb_special.sp =
 	    (u_int64_t)thread0.td_frame - 16;
 	thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack;
 
 	/*
 	 * Initialize the virtual memory system.
 	 */
 	pmap_bootstrap();
 
 	/*
 	 * Initialize debuggers, and break into them if appropriate.
 	 */
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger\n");
 #endif
 
 	ia64_set_tpr(0);
 	ia64_srlz_d();
 
 	/*
 	 * Save our current context so that we have a known (maybe even
 	 * sane) context as the initial context for new threads that are
 	 * forked from us. If any of those threads (including thread0)
 	 * does something wrong, we may be lucky and return here where
 	 * we're ready for them with a nice panic.
 	 */
 	if (!savectx(thread0.td_pcb))
 		mi_startup();
 
 	/* We should not get here. */
 	panic("ia64_init: Whooaa there!");
 	/* NOTREACHED */
 }
 
 /*
  * Translate an I/O port number into the address used to access it.
  *
  * Port numbers above 0xffff are passed through IA64_PHYS_TO_RR6() as is.
  * Smaller port numbers are placed relative to ia64_port_base: bits 2..15
  * of the port are shifted left by 10 and the low 12 bits are kept in
  * place.
  */
 __volatile void *
 ia64_ioport_address(u_int port)
 {
 	uint64_t addr;
 
 	if (port > 0xffff)
 		addr = IA64_PHYS_TO_RR6((uint64_t)port);
 	else
 		addr = ia64_port_base | ((port & 0xfffc) << 10) |
 		    (port & 0xFFF);
 	return ((__volatile void *)addr);
 }
 
 /*
  * Accessor for the bi_hcdp field of the bootinfo block copied in at
  * boot.
  */
 uint64_t
 ia64_get_hcdp(void)
 {
 	uint64_t hcdp;
 
 	hcdp = bootinfo.bi_hcdp;
 	return (hcdp);
 }
 
 /*
  * Zero 'len' bytes starting at 'buf'.
  *
  * Works in four phases: single bytes until the pointer is aligned to the
  * size of an unsigned long, then blocks of eight words, then single
  * words, then any trailing bytes.
  */
 void
 bzero(void *buf, size_t len)
 {
 	const size_t wsz = sizeof(unsigned long);
 	char *cp = buf;
 	unsigned long *wp;
 
 	/* Byte stores up to the first word boundary (or until len is 0). */
 	for (; len != 0 && ((uintptr_t)cp & (wsz - 1)) != 0; len--)
 		*cp++ = 0;
 
 	/* Bulk of the buffer: eight word stores per iteration. */
 	wp = (unsigned long *)cp;
 	for (; len >= 8 * wsz; len -= 8 * wsz) {
 		wp[0] = 0;
 		wp[1] = 0;
 		wp[2] = 0;
 		wp[3] = 0;
 		wp[4] = 0;
 		wp[5] = 0;
 		wp[6] = 0;
 		wp[7] = 0;
 		wp += 8;
 	}
 
 	/* Remaining whole words. */
 	for (; len >= wsz; len -= wsz)
 		*wp++ = 0;
 
 	/* Trailing bytes. */
 	cp = (char *)wp;
 	for (; len != 0; len--)
 		*cp++ = 0;
 }
 
 /*
  * Busy-wait on the interval time counter.
  *
  * The target count is the current ITC value plus itc_frequency * n /
  * 1000000.  The loop keeps polling while the counter is below the target,
  * and also while the target has wrapped below the start value but the
  * counter has not yet wrapped.
  */
 void
 DELAY(int n)
 {
 	u_int64_t begin, deadline, cur;
 
 	begin = ia64_get_itc();
 	deadline = begin + (itc_frequency * n) / 1000000;
 	for (;;) {
 		cur = ia64_get_itc();
 		if (cur < deadline)
 			continue;
 		/* Target wrapped around, counter has not yet. */
 		if (cur > begin && deadline < begin)
 			continue;
 		break;
 	}
 }
 
 /*
  * Send an interrupt (signal) to a process.
  *
  * Builds a struct sigframe for the current thread, copies it out to the
  * user stack (or to the alternate signal stack when one is configured for
  * this signal and not already in use) and rewrites the trapframe so that
  * the thread resumes in the signal trampoline on the gateway page.
  *
  *	catcher	handler, passed to the trampoline in r10
  *	ksi	signal information (number, code, siginfo)
  *	mask	signal mask stored in the saved user context
  *
  * Called with the process lock and the sigacts mutex held (both are
  * asserted).  They are dropped around the copyout and re-acquired before
  * returning.  If the copyout fails, the thread is terminated with SIGILL.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct proc *p;
 	struct thread *td;
 	struct trapframe *tf;
 	struct sigacts *psp;
 	struct sigframe sf, *sfp;
 	u_int64_t sbs, sp;
 	int oonstack;
 	int sig;
 	u_long code;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	sp = tf->tf_special.sp;
 	oonstack = sigonstack(sp);
 	/* Stays zero unless we switch to the alternate signal stack. */
 	sbs = 0;
 
 	/* save user context */
 	bzero(&sf, sizeof(struct sigframe));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 	/*
 	 * Allocate and validate space for the signal handler
 	 * context. Note that if the stack is in P0 space, the
 	 * call to grow() is a nop, and the useracc() check
 	 * will fail if the process has not already allocated
 	 * the space with a `brk'.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/*
 		 * The 16-byte aligned base of the alternate stack becomes
 		 * the new backing store; the frame goes at the top.
 		 */
 		sbs = (u_int64_t)td->td_sigstk.ss_sp;
 		sbs = (sbs + 15) & ~15;
 		sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sfp = (struct sigframe *)sp;
 	/* Make room for one frame below and align it to 16 bytes. */
 	sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15);
 
 	/* Fill in the siginfo structure for POSIX handlers. */
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig;
 		/*
 		 * XXX this shouldn't be here after code in trap.c
 		 * is fixed
 		 */
 		sf.sf_si.si_addr = (void*)tf->tf_special.ifa;
 		/* The handler gets a pointer to the siginfo instead. */
 		code = (u_int64_t)&sfp->sf_si;
 	}
 
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 
 	/* Copy the frame out to userland. */
 	if (copyout(&sf, sfp, sizeof(sf)) != 0) {
 		/*
 		 * Process has trashed its stack; give it an illegal
 		 * instruction to halt it in its tracks.
 		 */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 		return;
 	}
 
 	/* Pick the trampoline matching how the kernel was entered. */
 	if ((tf->tf_flags & FRAME_SYSCALL) == 0) {
 		tf->tf_special.psr &= ~IA64_PSR_RI;
 		tf->tf_special.iip = ia64_get_k5() +
 		    ((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page);
 	} else
 		tf->tf_special.iip = ia64_get_k5() +
 		    ((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page);
 
 	/*
 	 * Setup the trapframe to return to the signal trampoline. We pass
 	 * information to the trampoline in the following registers:
 	 *
 	 *	gp	new backing store or NULL
 	 *	r8	signal number
 	 *	r9	signal code or siginfo pointer
 	 *	r10	signal handler (function descriptor)
 	 */
 	tf->tf_special.sp = (u_int64_t)sfp - 16;
 	tf->tf_special.gp = sbs;
 	tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore;
 	tf->tf_special.ndirty = 0;
 	tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat;
 	tf->tf_scratch.gr8 = sig;
 	tf->tf_scratch.gr9 = code;
 	tf->tf_scratch.gr10 = (u_int64_t)catcher;
 
 	/* Re-acquire the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * Returns EFAULT when the user context cannot be copied in, otherwise
  * EJUSTRETURN after the machine context and signal mask have been
  * installed.
  *
  * MPSAFE
  */
 int
 sigreturn(struct thread *td,
 	struct sigreturn_args /* {
 		ucontext_t *sigcntxp;
 	} */ *uap)
 {
 	ucontext_t uc;
 	struct trapframe *tf;
 	struct proc *p;
 
 	tf = td->td_frame;
 	p = td->td_proc;
 
 	/*
 	 * Fetch the entire context structure at once for speed.
 	 * We don't use a normal argument to simplify RSE handling.
 	 */
 	if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc)))
 		return (EFAULT);
 
 	set_mcontext(td, &uc.uc_mcontext);
 
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	/* Track whether the restored sp lies on the signal stack. */
 	if (sigonstack(tf->tf_special.sp))
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	/* Install the saved mask, minus the unmaskable signals. */
 	td->td_sigmask = uc.uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: forwards to sigreturn() with the
  * argument structure reinterpreted as struct sigreturn_args.
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args;
 
 	args = (struct sigreturn_args *)uap;
 	return (sigreturn(td, args));
 }
 #endif
 
 /*
  * Construct a PCB from a trapframe. This is called from kdb_trap() where
  * we want to start a backtrace from the function that caused us to enter
  * the debugger. We have the context in the trapframe, but base the trace
  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
  * enough for a backtrace.
  *
  * The special registers are copied from the trapframe; the preserved
  * integer and FP registers are filled in by save_callee_saved() and
  * save_callee_saved_fp().
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_special = tf->tf_special;
 	/* All-ones marker in the spare slot; its consumer is unwind.c. */
 	pcb->pcb_special.__spare = ~0UL;	/* XXX see unwind.c */
 	save_callee_saved(&pcb->pcb_preserved);
 	save_callee_saved_fp(&pcb->pcb_preserved_fp);
 }
 
 /*
  * Write the dirty stacked registers described by 'r' (r->ndirty bytes,
  * held on the kernel stack of 'td') out to the user backing store at
  * r->bspstore.
  *
  * For the current thread the data is copied with copyout() and r->rnat is
  * refreshed; for any other thread the write goes through proc_rwmem().
  *
  * Returns 0 when there is nothing to flush, otherwise the error from the
  * copy (ENOSPC when proc_rwmem() reports success but left data
  * unwritten).  Whenever a flush is attempted, r->bspstore is advanced by
  * r->ndirty and r->ndirty is cleared, even if the copy failed.
  */
 int
 ia64_flush_dirty(struct thread *td, struct _special *r)
 {
 	struct iovec iov;
 	struct uio uio;
 	uint64_t bspst, kstk, rnat;
 	int error;
 
 	if (r->ndirty == 0)
 		return (0);
 
 	/* Kernel stack address with the same low 9 bits as r->bspstore. */
 	kstk = td->td_kstack + (r->bspstore & 0x1ffUL);
 	if (td == curthread) {
 		__asm __volatile("mov	ar.rsc=0;;");
 		__asm __volatile("mov	%0=ar.bspstore" : "=r"(bspst));
 		/* Make sure we have all the user registers written out. */
 		if (bspst - kstk < r->ndirty) {
 			__asm __volatile("flushrs;;");
 			__asm __volatile("mov	%0=ar.bspstore" : "=r"(bspst));
 		}
 		__asm __volatile("mov	%0=ar.rnat;;" : "=r"(rnat));
 		__asm __volatile("mov	ar.rsc=3");
 		error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty);
 		kstk += r->ndirty;
 		/*
 		 * Use the RNaT word stored at offset 0x1f8 of the current
 		 * 512-byte block when the backing store pointer has moved
 		 * past it; otherwise use the ar.rnat value read above.
 		 */
 		r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL))
 		    ? *(uint64_t*)(kstk | 0x1f8L) : rnat;
 	} else {
 		/* Not the current thread: write into its address space. */
 		PHOLD(td->td_proc);
 		iov.iov_base = (void*)(uintptr_t)kstk;
 		iov.iov_len = r->ndirty;
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_offset = r->bspstore;
 		uio.uio_resid = r->ndirty;
 		uio.uio_segflg = UIO_SYSSPACE;
 		uio.uio_rw = UIO_WRITE;
 		uio.uio_td = td;
 		error = proc_rwmem(td->td_proc, &uio);
 		/*
 		 * XXX proc_rwmem() doesn't currently return ENOSPC,
 		 * so I think it can bogusly return 0. Neither do
 		 * we allow short writes.
 		 */
 		if (uio.uio_resid != 0 && error == 0)
 			error = ENOSPC;
 		PRELE(td->td_proc);
 	}
 
 	r->bspstore += r->ndirty;
 	r->ndirty = 0;
 	return (error);
 }
 
 /*
  * Capture the user context of 'td' into '*mc'.
  *
  * The special registers are copied from the trapframe and the dirty
  * stacked registers are flushed with ia64_flush_dirty(); its result is
  * the return value.  A syscall frame yields a syscall context (scratch
  * registers only, with r8-r11 zeroed when GET_MC_CLEAR_RET is set in
  * 'flags'); any other frame yields an async context that also carries the
  * scratch FP and high FP registers.  The preserved registers are saved in
  * both cases.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mc, int flags)
 {
 	struct trapframe *frame = td->td_frame;
 	int rv;
 
 	bzero(mc, sizeof(*mc));
 	mc->mc_special = frame->tf_special;
 	rv = ia64_flush_dirty(td, &mc->mc_special);
 
 	/* The scratch registers are recorded for both kinds of context. */
 	mc->mc_scratch = frame->tf_scratch;
 	if ((frame->tf_flags & FRAME_SYSCALL) == 0) {
 		mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT;
 		mc->mc_scratch_fp = frame->tf_scratch_fp;
 		/*
 		 * XXX If the thread never used the high FP registers, we
 		 * probably shouldn't waste time saving them.
 		 */
 		ia64_highfp_save(td);
 		mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID;
 		mc->mc_high_fp = td->td_pcb->pcb_high_fp;
 	} else {
 		mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT;
 		if (flags & GET_MC_CLEAR_RET) {
 			mc->mc_scratch.gr8 = 0;
 			mc->mc_scratch.gr9 = 0;
 			mc->mc_scratch.gr10 = 0;
 			mc->mc_scratch.gr11 = 0;
 		}
 	}
 	save_callee_saved(&mc->mc_preserved);
 	save_callee_saved_fp(&mc->mc_preserved_fp);
 	return (rv);
 }
 
 /*
  * Install the machine context '*mc' into the trapframe of 'td'.
  *
  * Only the PSR bits in 'psrmask' are taken from the new context; every
  * other PSR bit keeps its current trapframe value, and ndirty is forced
  * to zero.  An async context replaces the scratch and scratch FP
  * registers (plus the high FP registers when flagged valid) and turns a
  * syscall frame into a non-syscall frame.  A non-async context keeps the
  * trapframe's cfm and iip unless it is flagged as a syscall context, in
  * which case the scratch registers are taken from it.
  *
  * Always returns 0.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mc)
 {
 	struct _special s;
 	struct trapframe *tf;
 	uint64_t psrmask;
 
 	tf = td->td_frame;
 
 	KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
 	    ("Whoa there! We have more than 8KB of dirty registers!"));
 
 	s = mc->mc_special;
 	/*
 	 * Only copy the user mask and the restart instruction bit from
 	 * the new context.
 	 */
 	psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL |
 	    IA64_PSR_MFH | IA64_PSR_RI;
 	s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask);
 	/* We don't have any dirty registers of the new context. */
 	s.ndirty = 0;
 	if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) {
 		/*
 		 * We can get an async context passed to us while we
 		 * entered the kernel through a syscall: sigreturn(2)
 		 * and kse_switchin(2) both take contexts that could
 		 * previously be the result of a trap or interrupt.
 		 * Hence, we cannot assert that the trapframe is not
 		 * a syscall frame, but we can assert that it's at
 		 * least an expected syscall.
 		 */
 		if (tf->tf_flags & FRAME_SYSCALL) {
 			KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn ||
 			    tf->tf_scratch.gr15 == SYS_kse_switchin, ("foo"));
 			tf->tf_flags &= ~FRAME_SYSCALL;
 		}
 		tf->tf_scratch = mc->mc_scratch;
 		tf->tf_scratch_fp = mc->mc_scratch_fp;
 		if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID)
 			td->td_pcb->pcb_high_fp = mc->mc_high_fp;
 	} else {
 		KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo"));
 		if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) {
 			/* Keep the current frame marker and return address. */
 			s.cfm = tf->tf_special.cfm;
 			s.iip = tf->tf_special.iip;
 			tf->tf_scratch.gr15 = 0;	/* Clear syscall nr. */
 		} else
 			tf->tf_scratch = mc->mc_scratch;
 	}
 	tf->tf_special = s;
 	restore_callee_saved(&mc->mc_preserved);
 	restore_callee_saved_fp(&mc->mc_preserved_fp);
 
 	/* KSE mailbox request: store mc_special.isr at user address ifa. */
 	if (mc->mc_flags & _MC_FLAGS_KSE_SET_MBOX)
 		suword((caddr_t)mc->mc_special.ifa, mc->mc_special.isr);
 
 	return (0);
 }
 
 /*
  * Clear registers on exec.
  *
  * Resets the special registers in the trapframe of 'td' and prepares the
  * initial user register frame for the new image:
  *
  *	entry		initial instruction pointer
  *	stack		initial stack value; passed to the image as out0
  *			and used (aligned down, minus 16) as sp
  *	ps_strings	passed to the image as out1
  *
  * out2 is set to zero.  How the three values reach the image depends on
  * whether the exec was entered through a break or an epc syscall.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe *tf;
 	uint64_t *ksttop, *kst;
 
 	tf = td->td_frame;
 	/* Top of the dirty registers on the kernel stack, before reset. */
 	ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty +
 	    (tf->tf_special.bspstore & 0x1ffUL));
 
 	/*
 	 * We can ignore up to 8KB of dirty registers by masking off the
 	 * lower 13 bits in exception_restore() or epc_syscall(). This
 	 * should be enough for a couple of years, but if there are more
 	 * than 8KB of dirty registers, we lose track of the bottom of
 	 * the kernel stack. The solution is to copy the active part of
 	 * the kernel stack down 1 page (or 2, but not more than that)
 	 * so that we always have less than 8KB of dirty registers.
 	 */
 	KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
 	    ("Whoa there! We have more than 8KB of dirty registers!"));
 
 	bzero(&tf->tf_special, sizeof(tf->tf_special));
 	if ((tf->tf_flags & FRAME_SYSCALL) == 0) {	/* break syscalls. */
 		bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
 		bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
 		tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL;
 		tf->tf_special.bspstore = IA64_BACKINGSTORE;
 		/*
 		 * Copy the arguments onto the kernel register stack so that
 		 * they get loaded by the loadrs instruction. Skip over the
 		 * NaT collection points.
 		 */
 		kst = ksttop - 1;
 		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
 			*kst-- = 0;
 		*kst-- = 0;
 		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
 			*kst-- = 0;
 		*kst-- = ps_strings;
 		if (((uintptr_t)kst & 0x1ff) == 0x1f8)
 			*kst-- = 0;
 		*kst = stack;
 		/* Byte count of what was just written (8 bytes per slot). */
 		tf->tf_special.ndirty = (ksttop - kst) << 3;
 	} else {				/* epc syscalls (default). */
 		tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL;
 		tf->tf_special.bspstore = IA64_BACKINGSTORE + 24;
 		/*
 		 * Write values for out0, out1 and out2 to the user's backing
 		 * store and arrange for them to be restored into the user's
 		 * initial register frame.
 		 * Assumes that (bspstore & 0x1f8) < 0x1e0.
 		 */
 		suword((caddr_t)tf->tf_special.bspstore - 24, stack);
 		suword((caddr_t)tf->tf_special.bspstore - 16, ps_strings);
 		suword((caddr_t)tf->tf_special.bspstore -  8, 0);
 	}
 
 	tf->tf_special.iip = entry;
 	tf->tf_special.sp = (stack & ~15) - 16;
 	tf->tf_special.rsc = 0xf;
 	tf->tf_special.fpsr = IA64_FPSR_DEFAULT;
 	tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT |
 	    IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN |
 	    IA64_PSR_CPL_USER;
 }
 
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 	uint64_t slot;
 
 	switch (addr & 0xFUL) {
 	case 0:
 		slot = IA64_PSR_RI_0;
 		break;
 	case 1:
 		/* XXX we need to deal with MLX bundles here */
 		slot = IA64_PSR_RI_1;
 		break;
 	case 2:
 		slot = IA64_PSR_RI_2;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	td->td_frame->tf_special.iip = addr & ~0x0FULL;
 	td->td_frame->tf_special.psr =
 	    (td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot;
 	return (0);
 }
 
 /*
  * Arrange for 'td' to single step.  Always returns 0.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 	struct trapframe *frame = td->td_frame;
 
 	/*
 	 * There's no way to set single stepping when we're leaving the
 	 * kernel through the EPC syscall path. The way we solve this is
 	 * by enabling the lower-privilege trap so that we re-enter the
 	 * kernel as soon as the privilege level changes. See trap.c for
 	 * how we proceed from there.
 	 */
 	if ((frame->tf_flags & FRAME_SYSCALL) == 0)
 		frame->tf_special.psr |= IA64_PSR_SS;
 	else
 		frame->tf_special.psr |= IA64_PSR_LP;
 	return (0);
 }
 
 /*
  * Undo ptrace_single_step() for 'td'.  Always returns 0.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	/*
 	 * Clear any and all status bits we may use to implement single
 	 * stepping: taken-branch, lower-privilege and single-step.
 	 */
 	td->td_frame->tf_special.psr &= ~IA64_PSR_TB;
 	td->td_frame->tf_special.psr &= ~IA64_PSR_LP;
 	td->td_frame->tf_special.psr &= ~IA64_PSR_SS;
 	return (0);
 }
 
 /*
  * Fill '*regs' with the general register state of 'td': the special and
  * scratch registers come from the trapframe, the preserved registers
  * from save_callee_saved().  Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 
 	regs->r_special = frame->tf_special;
 	regs->r_scratch = frame->tf_scratch;
 	save_callee_saved(&regs->r_preserved);
 	return (0);
 }
 
 /*
  * Load the general register state of 'td' from '*regs'.
  *
  * The thread's dirty stacked registers are flushed first; if that fails,
  * the error is returned and nothing is changed.  Otherwise the special
  * and scratch registers are installed in the trapframe (with ndirty
  * folded into bspstore) and the preserved registers are restored.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	int rv;
 
 	rv = ia64_flush_dirty(td, &frame->tf_special);
 	if (rv)
 		return (rv);
 
 	frame->tf_special = regs->r_special;
 	frame->tf_special.bspstore += frame->tf_special.ndirty;
 	frame->tf_special.ndirty = 0;
 	frame->tf_scratch = regs->r_scratch;
 	restore_callee_saved(&regs->r_preserved);
 	return (0);
 }
 
 /*
  * Reading the debug registers is not implemented: always ENOSYS.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	(void)td;
 	(void)dbregs;
 	return (ENOSYS);
 }
 
 /*
  * Writing the debug registers is not implemented: always ENOSYS.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	(void)td;
 	(void)dbregs;
 	return (ENOSYS);
 }
 
 /*
  * Fill '*fpregs' with the floating-point state of 'td': scratch FP
  * registers from the trapframe, preserved FP registers from
  * save_callee_saved_fp(), and the high FP registers from the PCB after
  * ia64_highfp_save() has run.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 
 	/* Save the high FP registers. */
 	ia64_highfp_save(td);
 
 	fpregs->fpr_scratch = tf->tf_scratch_fp;
 	save_callee_saved_fp(&fpregs->fpr_preserved);
 	fpregs->fpr_high = pcb->pcb_high_fp;
 	return (0);
 }
 
 /*
  * Load the floating-point state of 'td' from '*fpregs': scratch FP
  * registers into the trapframe, preserved FP registers through
  * restore_callee_saved_fp(), and the high FP registers into the PCB.
  * Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 
 	/* Throw away the high FP registers (should be redundant). */
 	ia64_highfp_drop(td);
 
 	tf->tf_scratch_fp = fpregs->fpr_scratch;
 	restore_callee_saved_fp(&fpregs->fpr_preserved);
 	pcb->pcb_high_fp = fpregs->fpr_high;
 	return (0);
 }
 
 /*
  * High FP register functions.
  */
 
 /*
  * Disassociate the high FP registers from 'td' without saving them.
  *
  * Under the thread's md_highfp_mtx spin mutex, the PCB's pcb_fpcpu and
  * that CPU's pc_fpcurthread are both cleared.
  *
  * Returns 0 when the PCB had no pcb_fpcpu recorded (nothing to drop),
  * 1 otherwise.
  */
 int
 ia64_highfp_drop(struct thread *td)
 {
 	struct pcb *pcb;
 	struct pcpu *cpu;
 	struct thread *thr;
 
 	mtx_lock_spin(&td->td_md.md_highfp_mtx);
 	pcb = td->td_pcb;
 	cpu = pcb->pcb_fpcpu;
 	if (cpu == NULL) {
 		mtx_unlock_spin(&td->td_md.md_highfp_mtx);
 		return (0);
 	}
 	pcb->pcb_fpcpu = NULL;
 	/* Remember the previous owner for the consistency check below. */
 	thr = cpu->pc_fpcurthread;
 	cpu->pc_fpcurthread = NULL;
 	mtx_unlock_spin(&td->td_md.md_highfp_mtx);
 
 	/* Post-mortem sanity checking. */
 	KASSERT(thr == td, ("Inconsistent high FP state"));
 	return (1);
 }
 
 /*
  * Save the high FP registers of 'td' into its PCB and disassociate them
  * from the thread.
  *
  * When PSR.mfh is clear in the thread's trapframe, the registers are
  * simply dropped via ia64_highfp_drop().  On SMP kernels, when the
  * registers are recorded as belonging to another CPU, an IPI_HIGH_FP is
  * sent to that CPU and this function spins until pcb_fpcpu changes.
  *
  * Returns 0 when the PCB had no pcb_fpcpu recorded, 1 otherwise (or the
  * result of ia64_highfp_drop() on the unmodified path).
  */
 int
 ia64_highfp_save(struct thread *td)
 {
 	struct pcb *pcb;
 	struct pcpu *cpu;
 	struct thread *thr;
 
 	/* Don't save if the high FP registers weren't modified. */
 	if ((td->td_frame->tf_special.psr & IA64_PSR_MFH) == 0)
 		return (ia64_highfp_drop(td));
 
 	mtx_lock_spin(&td->td_md.md_highfp_mtx);
 	pcb = td->td_pcb;
 	cpu = pcb->pcb_fpcpu;
 	if (cpu == NULL) {
 		mtx_unlock_spin(&td->td_md.md_highfp_mtx);
 		return (0);
 	}
 #ifdef SMP
 	if (td == curthread)
 		sched_pin();
 	if (cpu != pcpup) {
 		/* Registers belong to another CPU: ask it via IPI, then wait. */
 		mtx_unlock_spin(&td->td_md.md_highfp_mtx);
 		ipi_send(cpu, IPI_HIGH_FP);
 		if (td == curthread)
 			sched_unpin();
 		while (pcb->pcb_fpcpu == cpu)
 			DELAY(100);
 		return (1);
 	} else {
 		save_high_fp(&pcb->pcb_high_fp);
 		if (td == curthread)
 			sched_unpin();
 	}
 #else
 	save_high_fp(&pcb->pcb_high_fp);
 #endif
 	pcb->pcb_fpcpu = NULL;
 	/* Remember the previous owner for the consistency check below. */
 	thr = cpu->pc_fpcurthread;
 	cpu->pc_fpcurthread = NULL;
 	mtx_unlock_spin(&td->td_md.md_highfp_mtx);
 
 	/* Post-mortem sanity checking. */
 	KASSERT(thr == td, ("Inconsistent high FP state"));
 	return (1);
 }
 
 /*
  * System beep hook.  Both arguments are ignored and ENODEV is always
  * returned.
  */
 int
 sysbeep(int pitch, int period)
 {
 
 	(void)pitch;
 	(void)period;
 	return (ENODEV);
 }
Index: head/sys/ia64/ia64/pmap.c
===================================================================
--- head/sys/ia64/ia64/pmap.c	(revision 173360)
+++ head/sys/ia64/ia64/pmap.c	(revision 173361)
@@ -1,2408 +1,2409 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 1998,2000 Doug Rabson
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  *	from:	i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp
  *		with some ideas from NetBSD's alpha pmap
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pageout.h>
 #include <vm/uma.h>
 
 #include <machine/md_var.h>
 #include <machine/pal.h>
 
 /*
  *	Manages physical address maps.
  *
  *	In addition to hardware address maps, this
  *	module is called upon to provide software-use-only
  *	maps which may or may not be stored in the same
  *	form as hardware maps.  These pseudo-maps are
  *	used to store intermediate results from copy
  *	operations to and from address spaces.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 /*
  * Following the Linux model, region IDs are allocated in groups of
  * eight so that a single region ID can be used for as many RRs as we
  * want by encoding the RR number into the low bits of the ID.
  *
  * We reserve region ID 0 for the kernel and allocate the remaining
  * IDs for user pmaps.
  *
  * Region 0..4
  *	User virtually mapped
  *
  * Region 5
  *	Kernel virtually mapped
  *
  * Region 6
  *	Kernel physically mapped uncacheable
  *
  * Region 7
  *	Kernel physically mapped cacheable
  */
 
 /* XXX move to a header. */
 extern uint64_t ia64_gateway_page[];
 
 MALLOC_DEFINE(M_PMAP, "PMAP", "PMAP Structures");
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 
 #if !defined(DIAGNOSTIC)
 #define PMAP_INLINE __inline
 #else
 #define PMAP_INLINE
 #endif
 
 #define	pmap_accessed(lpte)		((lpte)->pte & PTE_ACCESSED)
 #define	pmap_dirty(lpte)		((lpte)->pte & PTE_DIRTY)
 #define	pmap_managed(lpte)		((lpte)->pte & PTE_MANAGED)
 #define	pmap_ppn(lpte)			((lpte)->pte & PTE_PPN_MASK)
 #define	pmap_present(lpte)		((lpte)->pte & PTE_PRESENT)
 #define	pmap_prot(lpte)			(((lpte)->pte & PTE_PROT_MASK) >> 56)
 #define	pmap_wired(lpte)		((lpte)->pte & PTE_WIRED)
 
 #define	pmap_clear_accessed(lpte)	(lpte)->pte &= ~PTE_ACCESSED
 #define	pmap_clear_dirty(lpte)		(lpte)->pte &= ~PTE_DIRTY
 #define	pmap_clear_present(lpte)	(lpte)->pte &= ~PTE_PRESENT
 #define	pmap_clear_wired(lpte)		(lpte)->pte &= ~PTE_WIRED
 
 #define	pmap_set_wired(lpte)		(lpte)->pte |= PTE_WIRED
 
 /*
  * The VHPT bucket head structure.
  */
 struct ia64_bucket {
 	/* Physical address of the first lpte on the chain; 0 ends it. */
 	uint64_t	chain;
 	/* Spin mutex taken around chain walks and chain/length updates. */
 	struct mtx	mutex;
 	/* Count of lptes on the chain (bumped on insert, dropped on remove). */
 	u_int		length;
 };
 
 /*
  * Statically allocated kernel pmap
  */
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 
 /*
  * Kernel virtual memory management.
  */
 static int nkpt;
 struct ia64_lpte ***ia64_kptdir;
 #define KPTE_DIR0_INDEX(va) \
 	(((va) >> (3*PAGE_SHIFT-8)) & ((1<<(PAGE_SHIFT-3))-1))
 #define KPTE_DIR1_INDEX(va) \
 	(((va) >> (2*PAGE_SHIFT-5)) & ((1<<(PAGE_SHIFT-3))-1))
 #define KPTE_PTE_INDEX(va) \
 	(((va) >> PAGE_SHIFT) & ((1<<(PAGE_SHIFT-5))-1))
 #define NKPTEPG		(PAGE_SIZE / sizeof(struct ia64_lpte))
 
 vm_offset_t kernel_vm_end;
 
 /* Values for ptc.e. XXX values for SKI. */
 static uint64_t pmap_ptc_e_base = 0x100000000;
 static uint64_t pmap_ptc_e_count1 = 3;
 static uint64_t pmap_ptc_e_count2 = 2;
 static uint64_t pmap_ptc_e_stride1 = 0x2000;
 static uint64_t pmap_ptc_e_stride2 = 0x100000000;
 struct mtx pmap_ptcmutex;
 
 /*
  * Data for the RID allocator
  */
 static int pmap_ridcount;
 static int pmap_rididx;
 static int pmap_ridmapsz;
 static int pmap_ridmax;
 static uint64_t *pmap_ridmap;
 struct mtx pmap_ridmutex;
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static uma_zone_t pvzone;
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 
 /*
  * Data for allocating PTEs for user processes.
  */
 static uma_zone_t ptezone;
 
 /*
  * Virtual Hash Page Table (VHPT) data.
  */
 /* SYSCTL_DECL(_machdep); */
 SYSCTL_NODE(_machdep, OID_AUTO, vhpt, CTLFLAG_RD, 0, "");
 
 struct ia64_bucket *pmap_vhpt_bucket;
 
 int pmap_vhpt_nbuckets;
 SYSCTL_INT(_machdep_vhpt, OID_AUTO, nbuckets, CTLFLAG_RD,
     &pmap_vhpt_nbuckets, 0, "");
 
 uint64_t pmap_vhpt_base[MAXCPU];
 
 int pmap_vhpt_log2size = 0;
 TUNABLE_INT("machdep.vhpt.log2size", &pmap_vhpt_log2size);
 SYSCTL_INT(_machdep_vhpt, OID_AUTO, log2size, CTLFLAG_RD,
     &pmap_vhpt_log2size, 0, "");
 
 static int pmap_vhpt_inserts;
 SYSCTL_INT(_machdep_vhpt, OID_AUTO, inserts, CTLFLAG_RD,
     &pmap_vhpt_inserts, 0, "");
 
 static int pmap_vhpt_population(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_machdep_vhpt, OID_AUTO, population, CTLTYPE_INT | CTLFLAG_RD,
     NULL, 0, pmap_vhpt_population, "I", "");
 
 static struct ia64_lpte *pmap_find_vhpt(vm_offset_t va);
 
 static PMAP_INLINE void	free_pv_entry(pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
 
 static void	pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
 		    vm_page_t m, vm_prot_t prot);
 static void	pmap_invalidate_all(pmap_t pmap);
 static int	pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte,
 		    vm_offset_t va, pv_entry_t pv, int freepte);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
 		    vm_page_t m);
 
 vm_offset_t
 pmap_steal_memory(vm_size_t size)
 {
 	vm_size_t avail;
 	vm_offset_t phys, virt;
 	int slot;
 
 	size = round_page(size);
 
 	/*
 	 * Discard leading phys_avail[] banks until the first bank is
 	 * large enough to satisfy the (page-rounded) request.
 	 */
 	for (;;) {
 		avail = phys_avail[1] - phys_avail[0];
 		if (size <= avail)
 			break;
 
 		/* Shift every remaining bank down by one pair. */
 		for (slot = 0; phys_avail[slot + 2]; slot += 2) {
 			phys_avail[slot] = phys_avail[slot + 2];
 			phys_avail[slot + 1] = phys_avail[slot + 3];
 		}
 		phys_avail[slot] = 0;
 		phys_avail[slot + 1] = 0;
 
 		if (!phys_avail[0])
 			panic("pmap_steal_memory: out of memory");
 	}
 
 	/* Carve the allocation off the front of the first bank. */
 	phys = phys_avail[0];
 	phys_avail[0] += size;
 
 	/* Hand back a zeroed region 7 address for the stolen range. */
 	virt = IA64_PHYS_TO_RR7(phys);
 	bzero((caddr_t) virt, size);
 	return virt;
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  */
 void
 pmap_bootstrap()
 {
 	struct ia64_pal_result res;
 	struct ia64_lpte *pte;
 	vm_offset_t base, limit;
 	size_t size;
 	int i, j, count, ridbits;
 
 	/*
 	 * Query the PAL Code to find the loop parameters for the
 	 * ptc.e instruction.
 	 */
 	res = ia64_call_pal_static(PAL_PTCE_INFO, 0, 0, 0);
 	if (res.pal_status != 0)
 		panic("Can't configure ptc.e parameters");
 	pmap_ptc_e_base = res.pal_result[0];
 	pmap_ptc_e_count1 = res.pal_result[1] >> 32;
 	pmap_ptc_e_count2 = res.pal_result[1] & ((1L<<32) - 1);
 	pmap_ptc_e_stride1 = res.pal_result[2] >> 32;
 	pmap_ptc_e_stride2 = res.pal_result[2] & ((1L<<32) - 1);
 	if (bootverbose)
 		printf("ptc.e base=0x%lx, count1=%ld, count2=%ld, "
 		       "stride1=0x%lx, stride2=0x%lx\n",
 		       pmap_ptc_e_base,
 		       pmap_ptc_e_count1,
 		       pmap_ptc_e_count2,
 		       pmap_ptc_e_stride1,
 		       pmap_ptc_e_stride2);
 	mtx_init(&pmap_ptcmutex, "Global PTC lock", NULL, MTX_SPIN);
 
 	/*
 	 * Setup RIDs. RIDs 0..7 are reserved for the kernel.
 	 *
 	 * We currently need at least 19 bits in the RID because PID_MAX
 	 * can only be encoded in 17 bits and we need RIDs for 5 regions
 	 * per process. With PID_MAX equalling 99999 this means that we
 	 * need to be able to encode 499995 (=5*PID_MAX).
 	 * The Itanium processor only has 18 bits and the architected
 	 * minimum is exactly that. So, we cannot use a PID based scheme
 	 * in those cases. Enter pmap_ridmap...
 	 * We should avoid the map when running on a processor that has
 	 * implemented enough bits. This means that we should pass the
 	 * process/thread ID to pmap. This we currently don't do, so we
 	 * use the map anyway. However, we don't want to allocate a map
 	 * that is large enough to cover the range dictated by the number
 	 * of bits in the RID, because that may result in a RID map of
 	 * 2MB in size for a 24-bit RID. A 64KB map is enough.
 	 * The bottom line: we create a 32KB map when the processor only
 	 * implements 18 bits (or when we can't figure it out). Otherwise
 	 * we create a 64KB map.
 	 */
 	res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0);
 	if (res.pal_status != 0) {
 		if (bootverbose)
 			printf("Can't read VM Summary - assuming 18 Region ID bits\n");
 		ridbits = 18; /* guaranteed minimum */
 	} else {
 		ridbits = (res.pal_result[1] >> 8) & 0xff;
 		if (bootverbose)
 			printf("Processor supports %d Region ID bits\n",
 			    ridbits);
 	}
 	if (ridbits > 19)
 		ridbits = 19;
 
 	/* One bit per RID; the map is stolen (and therefore zeroed). */
 	pmap_ridmax = (1 << ridbits);
 	pmap_ridmapsz = pmap_ridmax / 64;
 	pmap_ridmap = (uint64_t *)pmap_steal_memory(pmap_ridmax / 8);
 	/* Mark RIDs 0..7 as allocated. */
 	pmap_ridmap[0] |= 0xff;
 	pmap_rididx = 0;
 	pmap_ridcount = 8;
 	mtx_init(&pmap_ridmutex, "RID allocator lock", NULL, MTX_DEF);
 
 	/*
 	 * Allocate some memory for initial kernel 'page tables'.
 	 */
 	ia64_kptdir = (void *)pmap_steal_memory(PAGE_SIZE);
 	nkpt = 0;
 	kernel_vm_end = VM_MIN_KERNEL_ADDRESS - VM_GATEWAY_SIZE;
 
 	/* Find the index of the zero terminator pair in phys_avail[]. */
 	for (i = 0; phys_avail[i+2]; i+= 2)
 		;
 	count = i+2;
 
 	/*
 	 * Figure out a useful size for the VHPT, based on the size of
 	 * physical memory and try to locate a region which is large
 	 * enough to contain the VHPT (which must be a power of two in
 	 * size and aligned to a natural boundary).
 	 * We silently bump up the VHPT size to the minimum size if the
 	 * user has set the tunable too small. Likewise, the VHPT size
 	 * is silently capped to the maximum allowed.
 	 */
 	TUNABLE_INT_FETCH("machdep.vhpt.log2size", &pmap_vhpt_log2size);
 	if (pmap_vhpt_log2size == 0) {
 		pmap_vhpt_log2size = 15;
 		size = 1UL << pmap_vhpt_log2size;
 		while (size < Maxmem * 32) {
 			pmap_vhpt_log2size++;
 			size <<= 1;
 		}
 	} else if (pmap_vhpt_log2size < 15)
 		pmap_vhpt_log2size = 15;
 	if (pmap_vhpt_log2size > 61)
 		pmap_vhpt_log2size = 61;
 
 	/*
 	 * Search the phys_avail[] banks for room for MAXCPU size-aligned
 	 * VHPTs laid out back to back; halve the size on each failure.
 	 */
 	pmap_vhpt_base[0] = 0;
 	base = limit = 0;
 	size = 1UL << pmap_vhpt_log2size;
 	while (pmap_vhpt_base[0] == 0) {
 		if (bootverbose)
 			printf("Trying VHPT size 0x%lx\n", size);
 		for (i = 0; i < count; i += 2) {
 			base = (phys_avail[i] + size - 1) & ~(size - 1);
 			limit = base + MAXCPU * size;
 			if (limit <= phys_avail[i+1])
 				/*
 				 * VHPT can fit in this region
 				 */
 				break;
 		}
 		if (!phys_avail[i]) {
 			/* Can't fit, try next smaller size. */
 			pmap_vhpt_log2size--;
 			size >>= 1;
 		} else
 			pmap_vhpt_base[0] = IA64_PHYS_TO_RR7(base);
 	}
 	if (pmap_vhpt_log2size < 15)
 		panic("Can't find space for VHPT");
 
 	if (bootverbose)
 		printf("Putting VHPT at 0x%lx\n", base);
 
 	/* Remove [base, limit) from the bank that was selected above. */
 	if (base != phys_avail[i]) {
 		/* Split this region. */
 		if (bootverbose)
 			printf("Splitting [%p-%p]\n", (void *)phys_avail[i],
 			    (void *)phys_avail[i+1]);
 		for (j = count; j > i; j -= 2) {
 			phys_avail[j] = phys_avail[j-2];
 			phys_avail[j+1] = phys_avail[j-2+1];
 		}
 		phys_avail[i+1] = base;
 		phys_avail[i+2] = limit;
 	} else
 		phys_avail[i] = limit;
 
 	pmap_vhpt_nbuckets = size / sizeof(struct ia64_lpte);
 
 	/*
 	 * Allocate one bucket head per VHPT slot, then mark every slot of
 	 * the first VHPT invalid and point its chain field at its bucket.
 	 */
 	pmap_vhpt_bucket = (void *)pmap_steal_memory(pmap_vhpt_nbuckets *
 	    sizeof(struct ia64_bucket));
 	pte = (struct ia64_lpte *)pmap_vhpt_base[0];
 	for (i = 0; i < pmap_vhpt_nbuckets; i++) {
 		pte[i].pte = 0;
 		pte[i].itir = 0;
 		pte[i].tag = 1UL << 63;	/* Invalid tag */
 		pte[i].chain = (uintptr_t)(pmap_vhpt_bucket + i);
 		/* Stolen memory is zeroed! */
 		mtx_init(&pmap_vhpt_bucket[i].mutex, "VHPT bucket lock", NULL,
 		    MTX_SPIN);
 	}
 
 	/* Replicate the initialized VHPT into each remaining MAXCPU slot. */
 	for (i = 1; i < MAXCPU; i++) {
 		pmap_vhpt_base[i] = pmap_vhpt_base[i - 1] + size;
 		bcopy((void *)pmap_vhpt_base[i - 1], (void *)pmap_vhpt_base[i],
 		    size);
 	}
 
 	/* Install the first VHPT on the running CPU. */
 	map_vhpt(pmap_vhpt_base[0]);
 	ia64_set_pta(pmap_vhpt_base[0] + (1 << 8) +
 	    (pmap_vhpt_log2size << 2) + 1);
 	ia64_srlz_i();
 
 	virtual_avail = VM_MIN_KERNEL_ADDRESS;
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	for (i = 0; i < 5; i++)
 		kernel_pmap->pm_rid[i] = 0;
 	kernel_pmap->pm_active = 1;
 	TAILQ_INIT(&kernel_pmap->pm_pvlist);
 	PCPU_SET(current_pmap, kernel_pmap);
 
 	/*
 	 * Region 5 is mapped via the vhpt.
 	 */
 	ia64_set_rr(IA64_RR_BASE(5),
 		    (5 << 8) | (PAGE_SHIFT << 2) | 1);
 
 	/*
 	 * Region 6 is direct mapped UC and region 7 is direct mapped
 	 * WC. The details of this is controlled by the Alt {I,D}TLB
 	 * handlers. Here we just make sure that they have the largest 
 	 * possible page size to minimise TLB usage.
 	 */
 	ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (IA64_ID_PAGE_SHIFT << 2));
 	ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (IA64_ID_PAGE_SHIFT << 2));
 	ia64_srlz_d();
 
 	/*
 	 * Clear out any random TLB entries left over from booting.
 	 */
 	pmap_invalidate_all(kernel_pmap);
 
 	map_gateway_page();
 }
 
 static int
 pmap_vhpt_population(SYSCTL_HANDLER_ARGS)
 {
 	int bucket, total;
 
 	/* Sum the chain lengths of all VHPT buckets. */
 	total = 0;
 	bucket = 0;
 	while (bucket < pmap_vhpt_nbuckets) {
 		total += pmap_vhpt_bucket[bucket].length;
 		bucket++;
 	}
 
 	/* Report the sum as a single int. */
 	return (SYSCTL_OUT(req, &total, sizeof(total)));
 }
 
 /*
  *	Initialize a vm_page's machine-dependent fields.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	/* A new page starts with no pv entries. */
 	m->md.pv_list_count = 0;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  */
 void
 pmap_init(void)
 {
 	int pages_per_proc;
 
 	/*
 	 * Create the zone that backs pv entries, then derive the pv entry
 	 * limit and its high water mark (nine tenths of the limit) so the
 	 * system can recover from excessive numbers of pv entries.  Both
 	 * the per-process factor and the limit itself are tunable.
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 
 	pages_per_proc = PMAP_SHPGPERPROC;
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &pages_per_proc);
 	pv_entry_max = pages_per_proc * maxproc + cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	pv_entry_high_water = (pv_entry_max / 10) * 9;
 
 	/* Zone from which pmap_find_pte() allocates user PTEs. */
 	ptezone = uma_zcreate("PT ENTRY", sizeof(struct ia64_lpte), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 }
 
 
 /***************************************************
  * Manipulate TLBs for a pmap
  ***************************************************/
 
 #if 0
 /*
  * Compiled out.  This is a rendezvous-based variant of page
  * invalidation; the pmap_invalidate_page() that is actually built
  * follows this block.
  */
 
 /*
  * Invalidate the VHPT slot for va (if its tag matches) and purge the
  * translation with ptc.l.  The address is passed as an opaque pointer.
  */
 static __inline void
 pmap_invalidate_page_locally(void *arg)
 {
 	vm_offset_t va = (uintptr_t)arg;
 	struct ia64_lpte *pte;
 
 	pte = (struct ia64_lpte *)ia64_thash(va);
 	if (pte->tag == ia64_ttag(va))
 		pte->tag = 1UL << 63;
 	ia64_ptc_l(va, PAGE_SHIFT << 2);
 }
 
 #ifdef SMP
 /*
  * Rendezvous action: args[0] is the pmap to switch to, args[1] the
  * address to invalidate.
  */
 static void
 pmap_invalidate_page_1(void *arg)
 {
 	void **args = arg;
 	pmap_t oldpmap;
 
 	critical_enter();
 	oldpmap = pmap_switch(args[0]);
 	pmap_invalidate_page_locally(args[1]);
 	pmap_switch(oldpmap);
 	critical_exit();
 }
 #endif
 
 static void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)),
 		("invalidating TLB for non-current pmap"));
 
 #ifdef SMP
 	if (mp_ncpus > 1) {
 		void *args[2];
 		args[0] = pmap;
 		args[1] = (void *)va;
 		smp_rendezvous(NULL, pmap_invalidate_page_1, NULL, args);
 	} else
 #endif
 	pmap_invalidate_page_locally((void *)va);
 }
 #endif /* 0 */
 
 /*
  * Invalidate the translation for va: first knock the matching entry out
  * of every CPU's VHPT copy, then purge the TLB entry with ptc.ga.
  *
  * pmap must be the kernel pmap or the pmap currently active on this CPU.
  */
 static void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	struct ia64_lpte *pte;
 	uint64_t tag, vhpt_ofs;
 	int i;
 
 	KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)),
 		("invalidating TLB for non-current pmap"));
 
 	/*
 	 * The slot offset is the difference between the hash address
 	 * returned by ia64_thash() and the VHPT base of the CPU named by
 	 * PCPU_GET(cpuid).  Both must be sampled on the same CPU, so enter
 	 * the critical section before computing it; otherwise a migration
 	 * between the two reads yields a bogus offset.
 	 */
 	critical_enter();
 	vhpt_ofs = ia64_thash(va) - pmap_vhpt_base[PCPU_GET(cpuid)];
 	/* The tag does not depend on the loop index; compute it once. */
 	tag = ia64_ttag(va);
 	for (i = 0; i < MAXCPU; i++) {
 		pte = (struct ia64_lpte *)(pmap_vhpt_base[i] + vhpt_ofs);
 		if (pte->tag == tag)
 			pte->tag = 1UL << 63;	/* Invalid tag */
 	}
 	critical_exit();
 
 	/* Purge the translation while holding the global PTC spin lock. */
 	mtx_lock_spin(&pmap_ptcmutex);
 	ia64_ptc_ga(va, PAGE_SHIFT << 2);
 	mtx_unlock_spin(&pmap_ptcmutex);
 }
 
 static void
 pmap_invalidate_all_1(void *arg)
 {
 	uint64_t va;
 	int inner, outer;
 
 	/*
 	 * Walk the two-level ptc.e loop described by the pmap_ptc_e_*
 	 * parameters, issuing one ptc.e per step.  arg is unused.
 	 */
 	critical_enter();
 	va = pmap_ptc_e_base;
 	outer = 0;
 	while (outer < pmap_ptc_e_count1) {
 		inner = 0;
 		while (inner < pmap_ptc_e_count2) {
 			ia64_ptc_e(va);
 			va += pmap_ptc_e_stride2;
 			inner++;
 		}
 		va += pmap_ptc_e_stride1;
 		outer++;
 	}
 	critical_exit();
 }
 
 static void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)),
 		("invalidating TLB for non-current pmap"));
 
 #ifdef SMP
 	/* With more than one CPU, run the flush on all of them. */
 	if (mp_ncpus > 1) {
 		smp_rendezvous(NULL, pmap_invalidate_all_1, NULL, NULL);
 		return;
 	}
 #endif
 	/* Otherwise flush locally. */
 	pmap_invalidate_all_1(NULL);
 }
 
 static uint32_t
 pmap_allocate_rid(void)
 {
 	uint64_t word;
 	int rid, shift;
 
 	mtx_lock(&pmap_ridmutex);
 	if (pmap_ridcount == pmap_ridmax)
 		panic("pmap_allocate_rid: All Region IDs used");
 
 	/*
 	 * Starting at the remembered index, advance (with wrap-around)
 	 * to the first 64-bit map word that still has a clear bit.
 	 */
 	for (;;) {
 		word = pmap_ridmap[pmap_rididx];
 		if (word != ~0UL)
 			break;
 		if (++pmap_rididx == pmap_ridmapsz)
 			pmap_rididx = 0;
 	}
 
 	/* Locate the lowest clear bit; one exists since word != ~0UL. */
 	for (shift = 0; (word & (1UL << shift)) != 0; shift++)
 		continue;
 	rid = pmap_rididx * 64 + shift;
 
 	/* Claim it. */
 	pmap_ridmap[pmap_rididx] |= 1UL << shift;
 	pmap_ridcount++;
 	mtx_unlock(&pmap_ridmutex);
 
 	return rid;
 }
 
 static void
 pmap_free_rid(uint32_t rid)
 {
 	uint64_t mask;
 	int word;
 
 	/* Split the RID into a map word index and a bit within that word. */
 	word = rid / 64;
 	mask = 1UL << (rid & 63);
 
 	/* Clear the bit and account for the released RID. */
 	mtx_lock(&pmap_ridmutex);
 	pmap_ridmap[word] &= ~mask;
 	pmap_ridcount--;
 	mtx_unlock(&pmap_ridmutex);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 
 void
 pmap_pinit0(struct pmap *pmap)
 {
 
 	/* No special casing here: defer entirely to pmap_pinit(). */
 	(void)pmap_pinit(pmap);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  */
-void
+int
 pmap_pinit(struct pmap *pmap)
 {
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
 	/* Allocate a separate region ID for each of the five pm_rid[] slots. */
 	for (i = 0; i < 5; i++)
 		pmap->pm_rid[i] = pmap_allocate_rid();
 	/* Reset pm_active, the pv list and the statistics. */
 	pmap->pm_active = 0;
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	/* Always 1 here; presumably callers treat nonzero as success - confirm. */
+	return (1);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	int region;
 
 	/* Give back every region ID this pmap holds; a zero slot is skipped. */
 	for (region = 0; region < 5; region++) {
 		if (pmap->pm_rid[region] == 0)
 			continue;
 		pmap_free_rid(pmap->pm_rid[region]);
 	}
 
 	PMAP_LOCK_DESTROY(pmap);
 }
 
 /*
  * grow the number of kernel page table entries, if needed
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	struct ia64_lpte **dirpage;
 	struct ia64_lpte *ptepage;
 	vm_page_t pg;
 
 	/*
 	 * Each pass adds one page of PTEs, which extends kernel_vm_end by
 	 * PAGE_SIZE * NKPTEPG bytes, until addr is covered.
 	 */
 	for (; kernel_vm_end <= addr; kernel_vm_end += PAGE_SIZE * NKPTEPG) {
 		if (nkpt == PAGE_SIZE/8 + PAGE_SIZE*PAGE_SIZE/64)
 			panic("%s: out of kernel address space", __func__);
 
 		/* Make sure the second-level directory page exists. */
 		dirpage = ia64_kptdir[KPTE_DIR0_INDEX(kernel_vm_end)];
 		if (dirpage == NULL) {
 			pg = vm_page_alloc(NULL, nkpt++,
 			    VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT |
 			    VM_ALLOC_WIRED);
 			if (pg == NULL)
 				panic("%s: cannot add dir. page", __func__);
 
 			dirpage = (struct ia64_lpte **)
 			    IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(pg));
 			bzero(dirpage, PAGE_SIZE);
 			ia64_kptdir[KPTE_DIR0_INDEX(kernel_vm_end)] = dirpage;
 		}
 
 		/* Allocate, zero and hook up the new page of PTEs. */
 		pg = vm_page_alloc(NULL, nkpt++,
 		    VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED);
 		if (pg == NULL)
 			panic("%s: cannot add PTE page", __func__);
 
 		ptepage = (struct ia64_lpte *)
 		    IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(pg));
 		bzero(ptepage, PAGE_SIZE);
 		dirpage[KPTE_DIR1_INDEX(kernel_vm_end)] = ptepage;
 	}
 }
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * free the pv_entry back to the free list
  */
 static PMAP_INLINE void
 free_pv_entry(pv_entry_t pv)
 {
 
 	/* Return the entry to its zone and drop the global entry count. */
 	uma_zfree(pvzone, pv);
 	pv_entry_count--;
 }
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  */
 /*
  * Returns a pv entry for the caller, which must hold both locked_pmap's
  * lock and the page queue mutex.  Never returns NULL: if no entry can
  * be allocated or reclaimed, it panics.
  */
 static pv_entry_t
 get_pv_entry(pmap_t locked_pmap)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	struct vpgqueues *vpq;
 	struct ia64_lpte *pte;
 	pmap_t oldpmap, pmap;
 	pv_entry_t allocated_pv, next_pv, pv;
 	vm_offset_t va;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/*
 	 * Fast path: a fresh entry from the zone while the count stays at
 	 * or below the high water mark.  Above the mark, wake the page
 	 * daemon and fall through to reclamation even though an entry was
 	 * obtained.
 	 */
 	allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
 	if (allocated_pv != NULL) {
 		pv_entry_count++;
 		if (pv_entry_count > pv_entry_high_water)
 			pagedaemon_wakeup();
 		else
 			return (allocated_pv);
 	}
 
 	/*
 	 * Reclaim pv entries: At first, destroy mappings to inactive
 	 * pages.  After that, if a pv entry is still needed, destroy
 	 * mappings to active pages.
 	 */
 	if (ratecheck(&lastprint, &printinterval))
 		printf("Approaching the limit on PV entries, "
 		    "increase the vm.pmap.shpgperproc tunable.\n");
 	vpq = &vm_page_queues[PQ_INACTIVE];
 retry:
 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
 		/* Leave held or busy pages alone. */
 		if (m->hold_count || m->busy)
 			continue;
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 			va = pv->pv_va;
 			pmap = pv->pv_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 				continue;
 			/* Tear down the mapping in the context of its pmap. */
 			oldpmap = pmap_switch(pmap);
 			pte = pmap_find_vhpt(va);
 			KASSERT(pte != NULL, ("pte"));
 			pmap_remove_pte(pmap, pte, va, pv, 1);
 			pmap_switch(oldpmap);
 			if (pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 			/*
 			 * Keep the first reclaimed entry for the caller if
 			 * the zone allocation failed; free the others.
 			 */
 			if (allocated_pv == NULL)
 				allocated_pv = pv;
 			else
 				free_pv_entry(pv);
 		}
 	}
 	if (allocated_pv == NULL) {
 		/* Nothing reclaimed from the inactive queue: try active. */
 		if (vpq == &vm_page_queues[PQ_INACTIVE]) {
 			vpq = &vm_page_queues[PQ_ACTIVE];
 			goto retry;
 		}
 		panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
 	}
 	return (allocated_pv);
 }
 
 /*
  * Conditionally create a pv entry.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/* Refuse once the high water mark has been reached. */
 	if (pv_entry_count >= pv_entry_high_water)
 		return (FALSE);
 
 	/* Never sleep and never reclaim: give up if the zone is empty. */
 	entry = uma_zalloc(pvzone, M_NOWAIT);
 	if (entry == NULL)
 		return (FALSE);
 
 	/* Fill in the entry and link it on the pmap and page lists. */
 	pv_entry_count++;
 	entry->pv_va = va;
 	entry->pv_pmap = pmap;
 	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, entry, pv_plist);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count++;
 	return (TRUE);
 }
 
 /*
  * Add an ia64_lpte to the VHPT.
  */
 static void
 pmap_enter_vhpt(struct ia64_lpte *pte, vm_offset_t va)
 {
 	struct ia64_bucket *bckt;
 	struct ia64_lpte *vhpte;
 	uint64_t pte_pa;
 
 	/* Can fault, so get it out of the way. */
 	pte_pa = ia64_tpa((vm_offset_t)pte);
 
 	/* The hashed VHPT slot's chain field points at its bucket head. */
 	vhpte = (struct ia64_lpte *)ia64_thash(va);
 	bckt = (struct ia64_bucket *)vhpte->chain;
 
 	/*
 	 * Push pte onto the front of the bucket's collision chain.  The
 	 * new entry's chain link is written, and ia64_mf() issued, before
 	 * the bucket head is redirected to it.
 	 * NOTE(review): presumably so a concurrent chain walker never
 	 * follows the head to an entry with a stale link - confirm.
 	 */
 	mtx_lock_spin(&bckt->mutex);
 	pte->chain = bckt->chain;
 	ia64_mf();
 	bckt->chain = pte_pa;
 
 	/* Statistics: global insert counter and per-bucket chain length. */
 	pmap_vhpt_inserts++;
 	bckt->length++;
 	mtx_unlock_spin(&bckt->mutex);
 }
 
 /*
  * Remove the ia64_lpte matching va from the VHPT. Return zero if it
  * worked or an appropriate error code otherwise.
  */
 static int
 pmap_remove_vhpt(vm_offset_t va)
 {
 	struct ia64_bucket *bckt;
 	struct ia64_lpte *pte;
 	struct ia64_lpte *lpte;
 	struct ia64_lpte *vhpte;
 	uint64_t chain, tag;
 
 	/* Locate the bucket for va through its hashed VHPT slot. */
 	tag = ia64_ttag(va);
 	vhpte = (struct ia64_lpte *)ia64_thash(va);
 	bckt = (struct ia64_bucket *)vhpte->chain;
 
 	/*
 	 * Walk the collision chain looking for the matching tag.  lpte
 	 * trails one entry behind pte and stays NULL while pte is the
 	 * first entry on the chain.
 	 */
 	lpte = NULL;
 	mtx_lock_spin(&bckt->mutex);
 	chain = bckt->chain;
 	pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain);
 	while (chain != 0 && pte->tag != tag) {
 		lpte = pte;
 		chain = pte->chain;
 		pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain);
 	}
 	if (chain == 0) {
 		/* Reached the end of the chain without a match. */
 		mtx_unlock_spin(&bckt->mutex);
 		return (ENOENT);
 	}
 
 	/* Snip this lpte out of the collision chain. */
 	if (lpte == NULL)
 		bckt->chain = pte->chain;
 	else
 		lpte->chain = pte->chain;
 	ia64_mf();
 
 	bckt->length--;
 	mtx_unlock_spin(&bckt->mutex);
 	return (0);
 }
 
 /*
  * Find the ia64_lpte for the given va, if any.
  */
 static struct ia64_lpte *
 pmap_find_vhpt(vm_offset_t va)
 {
 	struct ia64_bucket *bckt;
 	struct ia64_lpte *cur, *found, *slot;
 	uint64_t pa, tag;
 
 	/* The hashed VHPT slot's chain field leads to the bucket head. */
 	tag = ia64_ttag(va);
 	slot = (struct ia64_lpte *)ia64_thash(va);
 	bckt = (struct ia64_bucket *)slot->chain;
 
 	/*
 	 * Follow the collision chain, which links entries by physical
 	 * address and ends at 0, until an entry with a matching tag turns
 	 * up.
 	 */
 	found = NULL;
 	mtx_lock_spin(&bckt->mutex);
 	for (pa = bckt->chain; pa != 0; pa = cur->chain) {
 		cur = (struct ia64_lpte *)IA64_PHYS_TO_RR7(pa);
 		if (cur->tag == tag) {
 			found = cur;
 			break;
 		}
 	}
 	mtx_unlock_spin(&bckt->mutex);
 	return (found);
 }
 
 /*
  * Remove an entry from the list of managed mappings.
  */
 static int
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va, pv_entry_t pv)
 {
 
 	if (pv == NULL) {
 		/*
 		 * The caller did not supply the entry, so look it up on
 		 * whichever list the counters suggest is shorter: the
 		 * page's pv list or the pmap's.
 		 */
 		if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
 			TAILQ_FOREACH(pv, &m->md.pv_list, pv_list)
 				if (pv->pv_pmap == pmap && pv->pv_va == va)
 					break;
 		} else {
 			TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist)
 				if (pv->pv_va == va)
 					break;
 		}
 		if (pv == NULL)
 			return ENOENT;
 	}
 
 	/* Unhook from the page; the last mapping gone clears PG_WRITEABLE. */
 	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
 		vm_page_flag_clear(m, PG_WRITEABLE);
 
 	/* Unhook from the pmap and release the entry. */
 	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 	free_pv_entry(pv);
 	return 0;
 }
 
 /*
  * Create a pv entry for page at pa for
  * (pmap, va).
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t entry;
 
 	/* Obtain an entry and record which mapping it describes. */
 	entry = get_pv_entry(pmap);
 	entry->pv_va = va;
 	entry->pv_pmap = pmap;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/* Link it at the tail of both the pmap's and the page's lists. */
 	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, entry, pv_plist);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_list);
 	m->md.pv_list_count++;
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	struct ia64_lpte *lpte;
 	pmap_t saved;
 	vm_paddr_t phys;
 
 	/* Look the address up with the target pmap locked and switched in. */
 	PMAP_LOCK(pmap);
 	saved = pmap_switch(pmap);
 
 	lpte = pmap_find_vhpt(va);
 	if (lpte == NULL || !pmap_present(lpte))
 		phys = 0;		/* No present mapping for va. */
 	else
 		phys = pmap_ppn(lpte);
 
 	pmap_switch(saved);
 	PMAP_UNLOCK(pmap);
 	return (phys);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	struct ia64_lpte *lpte;
 	pmap_t saved;
 	vm_page_t held;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	saved = pmap_switch(pmap);
 
 	/*
 	 * Hold the page only when a present mapping exists whose
 	 * protection includes every bit requested in prot.
 	 */
 	lpte = pmap_find_vhpt(va);
 	if (lpte == NULL || !pmap_present(lpte) ||
 	    (pmap_prot(lpte) & prot) != prot) {
 		held = NULL;
 	} else {
 		held = PHYS_TO_VM_PAGE(pmap_ppn(lpte));
 		vm_page_hold(held);
 	}
 
 	vm_page_unlock_queues();
 	pmap_switch(saved);
 	PMAP_UNLOCK(pmap);
 	return (held);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Find the kernel lpte for mapping the given virtual address, which
  * must be in the part of region 5 which we can cover with our kernel
  * 'page tables'.
  */
 static struct ia64_lpte *
 pmap_find_kpte(vm_offset_t va)
 {
 	struct ia64_lpte *ptepage;
 
 	KASSERT((va >> 61) == 5,
 		("kernel mapping 0x%lx not in region 5", va));
 	KASSERT(va < kernel_vm_end,
 		("kernel mapping 0x%lx out of range", va));
 
 	/* Two directory levels lead to the page of PTEs covering va. */
 	ptepage = ia64_kptdir[KPTE_DIR0_INDEX(va)][KPTE_DIR1_INDEX(va)];
 	return (ptepage + KPTE_PTE_INDEX(va));
 }
 
 /*
  * Find a pte suitable for mapping a user-space address. If one exists 
  * in the VHPT, that one will be returned, otherwise a new pte is
  * allocated.
  */
 /*
  * Returns the pte to use for va.  Addresses at or above
  * VM_MAXUSER_ADDRESS are resolved through the kernel page tables.  For
  * user addresses the existing VHPT entry is returned if there is one;
  * otherwise a zeroed pte with an invalid tag is allocated from ptezone.
  *
  * The allocation uses M_NOWAIT and can fail, in which case NULL is
  * returned; callers must be prepared for that.
  */
 static struct ia64_lpte *
 pmap_find_pte(vm_offset_t va)
 {
 	struct ia64_lpte *pte;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return pmap_find_kpte(va);
 
 	pte = pmap_find_vhpt(va);
 	if (pte == NULL) {
 		pte = uma_zalloc(ptezone, M_NOWAIT | M_ZERO);
 		/* Do not touch the pte if the zone had nothing to give. */
 		if (pte != NULL)
 			pte->tag = 1UL << 63;	/* Invalid tag */
 	}
 	return (pte);
 }
 
 /*
  * Free a pte which is now unused. This simply returns it to the zone
  * allocator if it is a user mapping. For kernel mappings, clear the
  * valid bit to make it clear that the mapping is not currently used.
  */
 static void
 pmap_free_pte(struct ia64_lpte *pte, vm_offset_t va)
 {
 
 	/* Kernel ptes stay in place; only their present bit is cleared. */
 	if (va >= VM_MAXUSER_ADDRESS) {
 		pmap_clear_present(pte);
 		return;
 	}
 
 	/* User ptes go back to the zone they were allocated from. */
 	uma_zfree(ptezone, pte);
 }
 
 static PMAP_INLINE void
 pmap_pte_prot(pmap_t pm, struct ia64_lpte *pte, vm_prot_t prot)
 {
 	/* Access rights, indexed by the write/execute bits of prot. */
 	static long ar_by_prot[4] = {
 		PTE_AR_R,		/* VM_PROT_NONE */
 		PTE_AR_RW,		/* VM_PROT_WRITE */
 		PTE_AR_RX|PTE_ED,	/* VM_PROT_EXECUTE */
 		PTE_AR_RWX|PTE_ED	/* VM_PROT_WRITE|VM_PROT_EXECUTE */
 	};
 
 	/* Wipe all protection-related fields before rebuilding them. */
 	pte->pte &= ~(PTE_PROT_MASK | PTE_PL_MASK | PTE_AR_MASK | PTE_ED);
 
 	/* Stash the VM protection bits at bit 56 and up. */
 	pte->pte |= (uint64_t)(prot & VM_PROT_ALL) << 56;
 
 	/* Kernel privilege for the kernel pmap and for VM_PROT_NONE. */
 	if (prot == VM_PROT_NONE || pm == kernel_pmap)
 		pte->pte |= PTE_PL_KERN;
 	else
 		pte->pte |= PTE_PL_USER;
 
 	pte->pte |= ar_by_prot[(prot & VM_PROT_ALL) >> 1];
 }
 
 /*
  * Fill in a pte so that it describes a valid mapping of 'va' to 'pa'.
  * The protection bits (protection, privilege level, access rights, ED)
  * are preserved; this function does not set them and expects them to
  * have been set correctly before it is called.  Everything else in the
  * pte word is rebuilt: present and write-back memory attribute, either
  * the managed bit or pre-set dirty/accessed bits, the wired bit and the
  * physical page number.  The itir and tag fields are set as well.
  */
 static void
 pmap_set_pte(struct ia64_lpte *pte, vm_offset_t va, vm_offset_t pa,
     boolean_t wired, boolean_t managed)
 {
 
 	pte->pte &= PTE_PROT_MASK | PTE_PL_MASK | PTE_AR_MASK | PTE_ED;
 	pte->pte |= PTE_PRESENT | PTE_MA_WB;
 	if (managed)
 		pte->pte |= PTE_MANAGED;
 	else
 		pte->pte |= PTE_DIRTY | PTE_ACCESSED;
 	if (wired)
 		pte->pte |= PTE_WIRED;
 	pte->pte |= pa & PTE_PPN_MASK;
 
 	pte->itir = PAGE_SHIFT << 2;
 
 	pte->tag = ia64_ttag(va);
 }
 
 /*
  * Remove the (possibly managed) mapping represented by pte from the
  * given pmap.
  *
  * The pmap must be the kernel pmap or the pmap that is current on this
  * CPU (asserted).  'pv' is passed through to pmap_remove_entry() for
  * managed mappings; callers in this file pass 0 when they have no pv
  * entry at hand.  If 'freepte' is non-zero the pte is released with
  * pmap_free_pte() once the mapping is gone.
  *
  * Returns the error from pmap_remove_vhpt() if that fails (nothing else
  * is done in that case).  Otherwise returns the result of
  * pmap_remove_entry() for managed mappings and 0 for unmanaged ones.
  */
 static int
 pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte, vm_offset_t va,
 		pv_entry_t pv, int freepte)
 {
 	int error;
 	vm_page_t m;
 
 	KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)),
 		("removing pte for non-current pmap"));
 
 	/*
 	 * First remove from the VHPT.
 	 */
 	error = pmap_remove_vhpt(va);
 	if (error)
 		return (error);
 
 	pmap_invalidate_page(pmap, va);
 
 	/* Keep the pmap statistics in step with the removed mapping. */
 	if (pmap_wired(pte))
 		pmap->pm_stats.wired_count -= 1;
 
 	pmap->pm_stats.resident_count -= 1;
 	if (pmap_managed(pte)) {
 		/*
 		 * Transfer the dirty and accessed state recorded in the
 		 * pte to the vm_page before the pv entry is removed.
 		 */
 		m = PHYS_TO_VM_PAGE(pmap_ppn(pte));
 		if (pmap_dirty(pte))
 			vm_page_dirty(m);
 		if (pmap_accessed(pte))
 			vm_page_flag_set(m, PG_REFERENCED);
 
 		error = pmap_remove_entry(pmap, m, va, pv);
 	}
 	if (freepte)
 		pmap_free_pte(pte, va);
 
 	return (error);
 }
 
 /*
  * Translate a kernel virtual address into the physical address it maps.
  * Returns 0 when the address is at or beyond kernel_vm_end or when its
  * kernel pte is not present.  For an address inside the EPC gateway
  * window, the (masked) address of ia64_gateway_page is returned.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	struct ia64_lpte *kpte;
 	vm_offset_t gw_base;
 
 	KASSERT(va >= IA64_RR_BASE(5), ("Must be kernel VA"));
 
 	/* Regions 6 and 7 are direct mapped: just strip the region bits. */
 	if (va >= IA64_RR_BASE(6))
 		return (IA64_RR_MASK(va));
 
 	/* The EPC gateway page lives at the address kept in k5. */
 	gw_base = (vm_offset_t)ia64_get_k5();
 	if (va >= gw_base && va < gw_base + VM_GATEWAY_SIZE)
 		return (IA64_RR_MASK((vm_offset_t)ia64_gateway_page));
 
 	/* Nothing is mapped beyond the end of the kernel page tables. */
 	if (va >= kernel_vm_end)
 		return (0);
 
 	kpte = pmap_find_kpte(va);
 	return (pmap_present(kpte) ? (pmap_ppn(kpte) | (va & PAGE_MASK)) : 0);
 }
 
 /*
  * Enter 'count' pages from the array 'm' into consecutive kernel
  * virtual pages starting at 'va'.  This routine is only used for
  * temporary kernel mappings that do not need to have page modification
  * or references recorded.  Existing mappings are simply overwritten.
  * The pages are effectively wired, but it is customary not to have the
  * PTE reflect that, nor to update statistics.
  */
 void
 pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
 {
 	int idx;
 
 	/* Each page is entered exactly the way pmap_kenter() does it. */
 	for (idx = 0; idx < count; idx++, va += PAGE_SIZE)
 		pmap_kenter(va, VM_PAGE_TO_PHYS(m[idx]));
 }
 
 /*
  * Tear down 'count' consecutive kernel page mappings starting at 'va'.
  * This is meant only for temporary mappings; pages that are not
  * currently mapped are skipped.
  */
 void
 pmap_qremove(vm_offset_t va, int count)
 {
 	int idx;
 
 	/* Each page is removed exactly the way pmap_kremove() does it. */
 	for (idx = 0; idx < count; idx++, va += PAGE_SIZE)
 		pmap_kremove(va);
 }
 
 /*
  * Map physical address 'pa' at kernel virtual address 'va' with all
  * protections enabled.  An existing mapping is overwritten after its
  * translation has been invalidated; a pte that was not present is first
  * entered into the VHPT.  As for pmap_qenter(), the page is effectively
  * wired but the PTE does not reflect that and no statistics are updated.
  */
 void
 pmap_kenter(vm_offset_t va, vm_offset_t pa)
 {
 	struct ia64_lpte *kpte;
 
 	kpte = pmap_find_kpte(va);
 	if (!pmap_present(kpte))
 		pmap_enter_vhpt(kpte, va);
 	else
 		pmap_invalidate_page(kernel_pmap, va);
 	pmap_pte_prot(kernel_pmap, kpte, VM_PROT_ALL);
 	pmap_set_pte(kpte, va, pa, FALSE, FALSE);
 }
 
 /*
  * Remove the kernel mapping at 'va', if there is one: take it out of
  * the VHPT, invalidate the translation and clear the present bit.
  */
 void
 pmap_kremove(vm_offset_t va)
 {
 	struct ia64_lpte *kpte;
 
 	kpte = pmap_find_kpte(va);
 	if (!pmap_present(kpte))
 		return;
 
 	pmap_remove_vhpt(va);
 	pmap_invalidate_page(kernel_pmap, va);
 	pmap_clear_present(kpte);
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The machine-independent contract lets an architecture with a
  *	direct-mapped physical to virtual region return the address
  *	within that region and leave '*virt' unchanged.  That is what
  *	happens here: the region 7 address of 'start' is returned and
  *	'virt', 'end' and 'prot' are not used.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot)
 {
 	vm_offset_t kva;
 
 	kva = IA64_PHYS_TO_RR7(start);
 	return (kva);
 }
 
 /*
  * Remove the mapping of a single page, if any, from the given pmap.
  * The pmap must be the kernel pmap or the one current on this CPU.
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va)
 {
 	struct ia64_lpte *pte;
 
 	KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)),
 		("removing page for non-current pmap"));
 
 	pte = pmap_find_vhpt(va);
 	if (pte == NULL)
 		return;
 	pmap_remove_pte(pmap, pte, va, NULL, 1);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	Returns immediately, without taking any lock, when the pmap has
  *	no resident pages.  Otherwise the page queues and the pmap are
  *	locked and the pmap is made current for the duration of the
  *	removal.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	pmap_t oldpmap;
 	vm_offset_t va;
 	pv_entry_t npv, pv;
 	struct ia64_lpte *pte;
 
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if (sva + PAGE_SIZE == eva) {
 		pmap_remove_page(pmap, sva);
 		goto out;
 	}
 
 	/*
 	 * Pick the cheaper walk: if the pmap has fewer resident pages
 	 * than the range has pages, scan the pmap's pv list for entries
 	 * inside the range; otherwise probe every page of the range.
 	 * NOTE(review): the pv-list walk can only see mappings that have
 	 * a pv entry -- presumably managed mappings only; confirm.
 	 */
 	if (pmap->pm_stats.resident_count < ((eva - sva) >> PAGE_SHIFT)) {
 		/* _SAFE variant: pmap_remove_pte() is handed pv. */
 		TAILQ_FOREACH_SAFE(pv, &pmap->pm_pvlist, pv_plist, npv) {
 			va = pv->pv_va;
 			if (va >= sva && va < eva) {
 				pte = pmap_find_vhpt(va);
 				KASSERT(pte != NULL, ("pte"));
 				pmap_remove_pte(pmap, pte, va, pv, 1);
 			}
 		}
 	} else {
 		for (va = sva; va < eva; va += PAGE_SIZE) {
 			pte = pmap_find_vhpt(va);
 			if (pte != NULL)
 				pmap_remove_pte(pmap, pte, va, 0, 1);
 		}
 	}
 
 out:
 	vm_page_unlock_queues();
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page queues mutex must be held by the caller
  *		(asserted).  On return the page's PG_WRITEABLE flag
  *		has been cleared.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	pmap_t oldpmap;
 	pv_entry_t pv;
 
 #if defined(DIAGNOSTIC)
 	/*
 	 * XXX this makes pmap_page_protect(NONE) illegal for non-managed
 	 * pages!
 	 *
 	 * NOTE(review): the panic message below names pmap_page_protect
 	 * and labels the value "va", but it is raised from
 	 * pmap_remove_all and prints the page's physical address.
 	 */
 	if (m->flags & PG_FICTITIOUS) {
 		panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m));
 	}
 #endif
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/*
 	 * Always work on the first pv entry: pmap_remove_pte() is handed
 	 * pv for removal, so the list shrinks until it is empty.
 	 */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		struct ia64_lpte *pte;
 		pmap_t pmap = pv->pv_pmap;
 		vm_offset_t va = pv->pv_va;
 
 		/* The owning pmap must be current to look up its VHPT entry. */
 		PMAP_LOCK(pmap);
 		oldpmap = pmap_switch(pmap);
 		pte = pmap_find_vhpt(va);
 		KASSERT(pte != NULL, ("pte"));
 		/* The pte found through the pv entry must map this page. */
 		if (pmap_ppn(pte) != VM_PAGE_TO_PHYS(m))
 			panic("pmap_remove_all: pv_table for %lx is inconsistent", VM_PAGE_TO_PHYS(m));
 		pmap_remove_pte(pmap, pte, va, pv, 1);
 		pmap_switch(oldpmap);
 		PMAP_UNLOCK(pmap);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	A protection without read permission removes the range
  *	altogether.  A protection that includes both write and
  *	execute permission is treated as "nothing to restrict" and
  *	returns without touching any mapping.  Panics if sva or eva
  *	is not page aligned.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	pmap_t oldpmap;
 	struct ia64_lpte *pte;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 
 	if ((sva & PAGE_MASK) || (eva & PAGE_MASK))
 		panic("pmap_protect: unaligned addresses");
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 	while (sva < eva) {
 		/* 
 		 * If page is invalid, skip this page
 		 */
 		pte = pmap_find_vhpt(sva);
 		if (pte == NULL) {
 			sva += PAGE_SIZE;
 			continue;
 		}
 
 		if (pmap_prot(pte) != prot) {
 			/*
 			 * For managed pages, move the dirty and accessed
 			 * state from the pte to the vm_page and clear it
 			 * in the pte before the protection is changed.
 			 */
 			if (pmap_managed(pte)) {
 				vm_offset_t pa = pmap_ppn(pte);
 				vm_page_t m = PHYS_TO_VM_PAGE(pa);
 				if (pmap_dirty(pte)) {
 					vm_page_dirty(m);
 					pmap_clear_dirty(pte);
 				}
 				if (pmap_accessed(pte)) {
 					vm_page_flag_set(m, PG_REFERENCED);
 					pmap_clear_accessed(pte);
 				}
 			}
 			pmap_pte_prot(pmap, pte, prot);
 			pmap_invalidate_page(pmap, sva);
 		}
 
 		sva += PAGE_SIZE;
 	}
 	vm_page_unlock_queues();
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	The page queues and the pmap are locked, and the pmap is made
  *	current, for the duration of the call.  If no pte can be
  *	obtained, all of that is undone around a VM_WAIT and the
  *	lookup is retried.
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     boolean_t wired)
 {
 	pmap_t oldpmap;
 	vm_offset_t pa;
 	vm_offset_t opa;
 	struct ia64_lpte origpte;
 	struct ia64_lpte *pte;
 	boolean_t managed;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 
 	va &= ~PAGE_MASK;
 #ifdef DIAGNOSTIC
 	if (va > VM_MAX_KERNEL_ADDRESS)
 		panic("pmap_enter: toobig");
 #endif
 
 	/*
 	 * Find (or create) a pte for the given mapping.
 	 */
 	while ((pte = pmap_find_pte(va)) == NULL) {
 		/* Drop everything we hold before sleeping, then retry. */
 		pmap_switch(oldpmap);
 		PMAP_UNLOCK(pmap);
 		vm_page_unlock_queues();
 		VM_WAIT;
 		vm_page_lock_queues();
 		PMAP_LOCK(pmap);
 		oldpmap = pmap_switch(pmap);
 	}
 	/* Snapshot the old pte; the original is rewritten below. */
 	origpte = *pte;
 	if (!pmap_present(pte)) {
 		/* ~0UL marks "no previous mapping". */
 		opa = ~0UL;
 		pmap_enter_vhpt(pte, va);
 	} else
 		opa = pmap_ppn(pte);
 	managed = FALSE;
 	pa = VM_PAGE_TO_PHYS(m);
 
 	/*
 	 * Mapping has not changed, must be protection or wiring change.
 	 */
 	if (opa == pa) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if (wired && !pmap_wired(&origpte))
 			pmap->pm_stats.wired_count++;
 		else if (!wired && pmap_wired(&origpte))
 			pmap->pm_stats.wired_count--;
 
 		managed = (pmap_managed(&origpte)) ? TRUE : FALSE;
 
 		/*
 		 * We might be turning off write access to the page,
 		 * so we go ahead and sense modify status.
 		 */
 		if (managed && pmap_dirty(&origpte))
 			vm_page_dirty(m);
 
 		pmap_invalidate_page(pmap, va);
 		goto validate;
 	}
 
 	/*
 	 * Mapping has changed, invalidate old range and fall
 	 * through to handle validating new mapping.
 	 */
 	if (opa != ~0UL) {
 		/*
 		 * Remove the old mapping but keep the pte (freepte == 0),
 		 * then put the pte back into the VHPT for the new mapping.
 		 */
 		pmap_remove_pte(pmap, pte, va, 0, 0);
 		pmap_enter_vhpt(pte, va);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) {
 		KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva,
 		    ("pmap_enter: managed mapping within the clean submap"));
 		pmap_insert_entry(pmap, va, m);
 		managed = TRUE;
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap->pm_stats.resident_count++;
 	if (wired)
 		pmap->pm_stats.wired_count++;
 
 validate:
 
 	/*
 	 * Now validate mapping with desired protection/wiring. This
 	 * adds the pte to the VHPT if necessary.
 	 */
 	pmap_pte_prot(pmap, pte, prot);
 	pmap_set_pte(pte, va, pa, wired, managed);
 
 	if ((prot & VM_PROT_WRITE) != 0)
 		vm_page_flag_set(m, PG_WRITEABLE);
 	vm_page_unlock_queues();
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Map a run of resident pages that belong to one object.  The run
  * starts with m_start, which is mapped at virtual address 'start'.
  * Every following page on the object's list is mapped at 'start' plus
  * the distance (in pages) between its pindex and that of m_start.  The
  * walk stops at the end of the list or at the first page whose address
  * would be at or beyond 'end'.  Virtual pages in [start, end) for which
  * no resident page exists are left unmapped.
  *
  * The object must be locked by the caller (asserted).
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	pmap_t saved;
 	vm_page_t pg;
 	vm_pindex_t off, npages;
 
 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 	npages = atop(end - start);
 	PMAP_LOCK(pmap);
 	saved = pmap_switch(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		off = pg->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		pmap_enter_quick_locked(pmap, start + ptoa(off), pg, prot);
 	}
 	pmap_switch(saved);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Quick, best-effort variant of pmap_enter() for a single page: lock
  * the pmap, make it current, let pmap_enter_quick_locked() do the work
  * and restore the previous state.  The mapping is never wired and never
  * writable, and nothing is entered when that cannot be done without
  * sleeping; in exchange this is much faster than pmap_enter().
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	pmap_t saved;
 
 	PMAP_LOCK(pmap);
 	saved = pmap_switch(pmap);
 	pmap_enter_quick_locked(pmap, va, m, prot);
 	pmap_switch(saved);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Worker for pmap_enter_quick() and pmap_enter_object().  The caller
  * holds the page queues mutex and the pmap lock (both asserted).
  *
  * Nothing is done when no pte can be obtained, when the pte is already
  * present, or when a managed page cannot get a pv entry (the pte is
  * released again in that case).  Otherwise the page is entered unwired,
  * with at most read and execute permission.
  */
 static void
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot)
 {
 	struct ia64_lpte *pte;
 	boolean_t managed;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	pte = pmap_find_pte(va);
 	if (pte == NULL || pmap_present(pte))
 		return;
 
 	/* Managed pages need a pv entry; give up if none is available. */
 	managed = ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) ?
 	    TRUE : FALSE;
 	if (managed && !pmap_try_insert_pv_entry(pmap, va, m)) {
 		pmap_free_pte(pte, va);
 		return;
 	}
 
 	pmap->pm_stats.resident_count++;
 
 	/* Enter into the VHPT, then initialise without write access. */
 	pmap_enter_vhpt(pte, va);
 	pmap_pte_prot(pmap, pte, prot & (VM_PROT_READ | VM_PROT_EXECUTE));
 	pmap_set_pte(pte, va, VM_PAGE_TO_PHYS(m), FALSE, managed);
 }
 
 /*
  * pmap_object_init_pt is the hook for preloading the ptes of an object
  * into a pmap, which avoids a burst of soft faults on process startup
  * and right after an mmap.  This implementation preloads nothing: it
  * only asserts that the object is locked and is a device object.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Change the wiring attribute for a map/virtual-address
  *			pair, keeping the pmap's wired count in step.
  *	In/out conditions:
  *			The mapping must already exist in the pmap
  *			(asserted).
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	pmap_t saved;
 	struct ia64_lpte *pte;
 
 	PMAP_LOCK(pmap);
 	saved = pmap_switch(pmap);
 
 	pte = pmap_find_vhpt(va);
 	KASSERT(pte != NULL, ("pte"));
 	if (pmap_wired(pte)) {
 		/* Currently wired: only an unwire request changes anything. */
 		if (!wired) {
 			pmap->pm_stats.wired_count--;
 			pmap_clear_wired(pte);
 		}
 	} else if (wired) {
 		pmap->pm_stats.wired_count++;
 		pmap_set_wired(pte);
 	}
 
 	pmap_switch(saved);
 	PMAP_UNLOCK(pmap);
 }
 
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *	This implementation is an empty function: no mappings are
  *	copied and none of the arguments are used.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
 	  vm_offset_t src_addr)
 {
 }	
 
 
 /*
  *	pmap_zero_page zeros the specified hardware page.  The page is
  *	reached through its region 7 address, so no temporary mapping
  *	is set up; bzero clears PAGE_SIZE bytes there.
  */
 
 void
 pmap_zero_page(vm_page_t m)
 {
 	caddr_t page;
 
 	page = (caddr_t)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m));
 	bzero(page, PAGE_SIZE);
 }
 
 
 /*
  *	pmap_zero_page_area zeros 'size' bytes of the specified hardware
  *	page, starting 'off' bytes into it.  The page is reached through
  *	its region 7 address.
  *
  *	off and size must reside within a single page.
  */
 
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	char *page;
 
 	page = (char *)(caddr_t)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m));
 	bzero(page + off, size);
 }
 
 
 /*
  *	pmap_zero_page_idle zeros the specified hardware page through
  *	its region 7 address, exactly like pmap_zero_page.  This entry
  *	point is for the vm_idlezero process.
  */
 
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 	caddr_t page;
 
 	page = (caddr_t)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m));
 	bzero(page, PAGE_SIZE);
 }
 
 
 /*
  *	pmap_copy_page copies the contents of the page msrc to the
  *	page mdst.  Both pages are reached through their region 7
  *	addresses and PAGE_SIZE bytes are copied with bcopy.
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	caddr_t from, to;
 
 	from = (caddr_t)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(msrc));
 	to = (caddr_t)IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(mdst));
 	bcopy(from, to, PAGE_SIZE);
 }
 
 /*
  * Returns true if the pmap's pv is one of the first
  * 16 pvs linked to from this page.  This count may
  * be changed upwards or downwards in the future; it
  * is only necessary that true be returned for a small
  * subset of pmaps for proper page aging.
  *
  * Always false for fictitious pages.  The page queues mutex must be
  * held (asserted) before the pv list is walked.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	int examined;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (FALSE);
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	examined = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (pv->pv_pmap == pmap)
 			return (TRUE);
 		/* Give up after a bounded number of entries. */
 		if (++examined >= 16)
 			break;
 	}
 	return (FALSE);
 }
 
 /*
  * Remove all non-wired mappings that have a pv entry from the given
  * address space; this aids process exit speeds.  Only the pmap of the
  * current thread's process is accepted: for any other pmap a warning is
  * printed and nothing is removed.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pmap_t saved;
 	pv_entry_t pv, next;
 	struct ia64_lpte *pte;
 
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	saved = pmap_switch(pmap);
 
 	/* The successor is fetched before the body runs on each entry. */
 	TAILQ_FOREACH_SAFE(pv, &pmap->pm_pvlist, pv_plist, next) {
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		if (!pmap_wired(pte))
 			pmap_remove_pte(pmap, pte, pv->pv_va, pv, 1);
 	}
 
 	pmap_switch(saved);
 	PMAP_UNLOCK(pmap);
 	vm_page_unlock_queues();
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	This implementation visits every pv entry of the page and clears
  *	each accessed bit it finds.  Fictitious pages always yield 0.
  *
  *	XXX: The exact number of bits to check and clear is a matter that
  *	should be tested and standardized at some point in the future for
  *	optimal aging of shared pages.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct ia64_lpte *pte;
 	pmap_t pmap, saved;
 	pv_entry_t pv;
 	int nrefs;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (0);
 
 	nrefs = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pmap = pv->pv_pmap;
 		PMAP_LOCK(pmap);
 		saved = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		if (pmap_accessed(pte)) {
 			nrefs++;
 			pmap_clear_accessed(pte);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		pmap_switch(saved);
 		PMAP_UNLOCK(pmap);
 	}
 
 	return (nrefs);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Return whether or not the specified physical page was modified
  *	in any physical maps, i.e. whether the dirty bit is set in any
  *	of the ptes reachable through the page's pv list.  Fictitious
  *	pages are never reported as modified.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	struct ia64_lpte *pte;
 	pmap_t pmap, saved;
 	pv_entry_t pv;
 	boolean_t dirty;
 
 	if (m->flags & PG_FICTITIOUS)
 		return (FALSE);
 
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pmap = pv->pv_pmap;
 		PMAP_LOCK(pmap);
 		saved = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		pmap_switch(saved);
 		KASSERT(pte != NULL, ("pte"));
 		dirty = pmap_dirty(pte) ? TRUE : FALSE;
 		PMAP_UNLOCK(pmap);
 		/* One dirty mapping is enough; stop looking. */
 		if (dirty)
 			return (TRUE);
 	}
 
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.  It is eligible unless a pte for it is found in
  *	the VHPT and that pte is present.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	struct ia64_lpte *pte;
 
 	pte = pmap_find_vhpt(addr);
 	return ((pte == NULL || !pmap_present(pte)) ? TRUE : FALSE);
 }
 
 /*
  *	Clear the modify bits on the specified physical page: for every
  *	mapping on the page's pv list, clear the dirty bit in the pte
  *	if it is set and invalidate the translation.  Nothing is done
  *	for fictitious pages.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct ia64_lpte *pte;
 	pmap_t pmap, saved;
 	pv_entry_t pv;
 
 	if (m->flags & PG_FICTITIOUS)
 		return;
 
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pmap = pv->pv_pmap;
 		PMAP_LOCK(pmap);
 		saved = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		if (pmap_dirty(pte)) {
 			pmap_clear_dirty(pte);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		pmap_switch(saved);
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  *	pmap_clear_reference:
  *
  *	Clear the reference bit on the specified physical page: for
  *	every mapping on the page's pv list, clear the accessed bit in
  *	the pte if it is set and invalidate the translation.  Nothing
  *	is done for fictitious pages.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	struct ia64_lpte *pte;
 	pmap_t pmap, saved;
 	pv_entry_t pv;
 
 	if (m->flags & PG_FICTITIOUS)
 		return;
 
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		pmap = pv->pv_pmap;
 		PMAP_LOCK(pmap);
 		saved = pmap_switch(pmap);
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		if (pmap_accessed(pte)) {
 			pmap_clear_accessed(pte);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		pmap_switch(saved);
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  * Revoke write access from every mapping of the given page.  For each
  * writable mapping the dirty bit is first transferred to the vm_page
  * and cleared in the pte.  Afterwards PG_WRITEABLE is cleared on the
  * page.  Nothing is done for fictitious pages or pages that do not have
  * PG_WRITEABLE set.  The page queues mutex must be held (asserted).
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct ia64_lpte *pte;
 	pmap_t owner, saved;
 	pv_entry_t pv;
 	vm_prot_t cur;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 	if ((m->flags & PG_WRITEABLE) == 0)
 		return;
 
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		owner = pv->pv_pmap;
 		PMAP_LOCK(owner);
 		saved = pmap_switch(owner);
 		pte = pmap_find_vhpt(pv->pv_va);
 		KASSERT(pte != NULL, ("pte"));
 		cur = pmap_prot(pte);
 		if ((cur & VM_PROT_WRITE) != 0) {
 			if (pmap_dirty(pte)) {
 				vm_page_dirty(m);
 				pmap_clear_dirty(pte);
 			}
 			pmap_pte_prot(owner, pte, cur & ~VM_PROT_WRITE);
 			pmap_invalidate_page(owner, pv->pv_va);
 		}
 		pmap_switch(saved);
 		PMAP_UNLOCK(owner);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  *
  * No mapping is created here: the region 6 address of 'pa' is
  * returned and 'size' is not used.
  */
 void *
 pmap_mapdev(vm_offset_t pa, vm_size_t size)
 {
 	vm_offset_t kva;
 
 	kva = IA64_PHYS_TO_RR6(pa);
 	return ((void *)kva);
 }
 
 /*
  * 'Unmap' a range mapped by pmap_mapdev().  pmap_mapdev() creates no
  * mapping, so there is nothing to undo.
  */
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	/* Nothing to do. */
 }
 
 /*
  * perform the pmap work for mincore
  *
  * Returns 0 when no pte is found for addr or the pte is not present.
  * Otherwise returns MINCORE_INCORE, combined -- for managed pages only --
  * with the MINCORE_MODIFIED* and MINCORE_REFERENCED* flags derived from
  * this mapping and from the page's other mappings.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	pmap_t oldpmap;
 	struct ia64_lpte *pte, tpte;
 	int val = 0;
 	
 	/*
 	 * Take a private copy of the pte while the pmap is locked and
 	 * current; everything below examines the copy.
 	 */
 	PMAP_LOCK(pmap);
 	oldpmap = pmap_switch(pmap);
 	pte = pmap_find_vhpt(addr);
 	if (pte != NULL) {
 		tpte = *pte;
 		pte = &tpte;
 	}
 	pmap_switch(oldpmap);
 	PMAP_UNLOCK(pmap);
 
 	if (pte == NULL)
 		return 0;
 
 	if (pmap_present(pte)) {
 		vm_page_t m;
 		vm_offset_t pa;
 
 		val = MINCORE_INCORE;
 		/* Unmanaged pages carry no modified/referenced state. */
 		if (!pmap_managed(pte))
 			return val;
 
 		pa = pmap_ppn(pte);
 
 		m = PHYS_TO_VM_PAGE(pa);
 
 		/*
 		 * Modified by us
 		 */
 		if (pmap_dirty(pte))
 			val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
 		else {
 			/*
 			 * Modified by someone
 			 */
 			vm_page_lock_queues();
 			if (pmap_is_modified(m))
 				val |= MINCORE_MODIFIED_OTHER;
 			vm_page_unlock_queues();
 		}
 		/*
 		 * Referenced by us
 		 */
 		if (pmap_accessed(pte))
 			val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 		else {
 			/*
 			 * Referenced by someone
 			 *
 			 * pmap_ts_referenced() clears the accessed bits it
 			 * counts, so the reference is recorded on the page
 			 * with PG_REFERENCED instead.
 			 */
 			vm_page_lock_queues();
 			if (pmap_ts_referenced(m)) {
 				val |= MINCORE_REFERENCED_OTHER;
 				vm_page_flag_set(m, PG_REFERENCED);
 			}
 			vm_page_unlock_queues();
 		}
 	} 
 	return val;
 }
 
 void
 pmap_activate(struct thread *td)
 {
 	pmap_switch(vmspace_pmap(td->td_proc->p_vmspace));
 }
 
 /*
  * Make 'pm' the current pmap on this CPU and return the pmap that was
  * current before.  'pm' may be NULL.  If 'pm' is already current nothing
  * is changed.  Otherwise region registers 0 through 4 are reloaded, the
  * pm_active CPU masks of the old and new pmap are updated and the
  * per-CPU current_pmap is set.  The whole operation runs inside a
  * critical section.
  */
 pmap_t
 pmap_switch(pmap_t pm)
 {
 	pmap_t prevpm;
 	int i;
 
 	critical_enter();
 	prevpm = PCPU_GET(current_pmap);
 	if (prevpm == pm)
 		goto out;
 	/* This CPU no longer runs on the previous pmap. */
 	if (prevpm != NULL)
 		atomic_clear_32(&prevpm->pm_active, PCPU_GET(cpumask));
 	/*
 	 * Region register value: the id shifted left by 8, PAGE_SHIFT
 	 * shifted left by 2, and bit 0 set.
 	 * NOTE(review): presumably rid, preferred page size and the VHPT
 	 * walker enable bit -- confirm against the architecture manual.
 	 */
 	if (pm == NULL) {
 		/* No pmap: use the region number itself as the id. */
 		for (i = 0; i < 5; i++) {
 			ia64_set_rr(IA64_RR_BASE(i),
 			    (i << 8)|(PAGE_SHIFT << 2)|1);
 		}
 	} else {
 		for (i = 0; i < 5; i++) {
 			ia64_set_rr(IA64_RR_BASE(i),
 			    (pm->pm_rid[i] << 8)|(PAGE_SHIFT << 2)|1);
 		}
 		atomic_set_32(&pm->pm_active, PCPU_GET(cpumask));
 	}
 	PCPU_SET(current_pmap, pm);
 	ia64_srlz_d();
 
 out:
 	critical_exit();
 	return (prevpm);
 }
 
 /*
  * Return the preferred virtual address for mapping 'obj'.  No
  * adjustment is made here: 'addr' is returned unchanged and 'obj' and
  * 'size' are not used.
  */
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {
 
 	return (addr);
 }
 
 #include "opt_ddb.h"
 
 #ifdef DDB
 
 #include <ddb/ddb.h>
 
 /*
  * Printable names for power-of-two sizes: entry i names a size of 2^i
  * bytes, from "1B" at index 0 to "2G" at index 31.  The DDB commands
  * below index this table with the page-size fields they read from
  * translation and region registers.
  */
 static const char*	psnames[] = {
 	"1B",	"2B",	"4B",	"8B",
 	"16B",	"32B",	"64B",	"128B",
 	"256B",	"512B",	"1K",	"2K",
 	"4K",	"8K",	"16K",	"32K",
 	"64K",	"128K",	"256K",	"512K",
 	"1M",	"2M",	"4M",	"8M",
 	"16M",	"32M",	"64M",	"128M",
 	"256M",	"512M",	"1G",	"2G"
 };
 
 /*
  * Dump the translation registers of the given type as a table on the
  * debugger console.  The number of registers is taken from the
  * PAL_VM_SUMMARY result and each register is read with PAL_VM_TR_READ.
  * 'type' is passed through to PAL_VM_TR_READ; db_itr passes 0 and
  * db_dtr passes 1.
  */
 static void
 print_trs(int type)
 {
 	struct ia64_pal_result res;
 	int i, maxtr;
 	/* Buffer that PAL_VM_TR_READ fills in for one register. */
 	struct {
 		pt_entry_t	pte;
 		uint64_t	itir;
 		uint64_t	ifa;
 		struct ia64_rr	rr;
 	} buf;
 	/* Names for the memory attribute field, indexed by its value. */
 	static const char *manames[] = {
 		"WB",	"bad",	"bad",	"bad",
 		"UC",	"UCE",	"WC",	"NaT",
 	};
 
 	res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0);
 	if (res.pal_status != 0) {
 		db_printf("Can't get VM summary\n");
 		return;
 	}
 
 	/*
 	 * The highest register index is an 8-bit field of the first
 	 * result word: bits 40-47 for type 0, bits 32-39 otherwise.
 	 */
 	if (type == 0)
 		maxtr = (res.pal_result[0] >> 40) & 0xff;
 	else
 		maxtr = (res.pal_result[0] >> 32) & 0xff;
 
 	db_printf("V RID    Virtual Page  Physical Page PgSz ED AR PL D A MA  P KEY\n");
 	for (i = 0; i <= maxtr; i++) {
 		bzero(&buf, sizeof(buf));
 		res = ia64_call_pal_stacked_physical
 			(PAL_VM_TR_READ, i, type, ia64_tpa((uint64_t) &buf));
 		/*
 		 * The low four bits of the result flag which fields came
 		 * back valid (1: access rights, 2: privilege level,
 		 * 4: dirty, 8: memory attribute).  Fields that are not
 		 * valid are cleared so that they print as zero.
 		 *
 		 * NOTE(review): the pmap_*() accessors are handed a pointer
 		 * to the local buffer, not to a struct ia64_lpte; this
 		 * relies on them only touching the leading 'pte' word --
 		 * confirm against their definitions.
 		 */
 		if (!(res.pal_result[0] & 1))
 			buf.pte &= ~PTE_AR_MASK;
 		if (!(res.pal_result[0] & 2))
 			buf.pte &= ~PTE_PL_MASK;
 		if (!(res.pal_result[0] & 4))
 			pmap_clear_dirty(&buf);
 		if (!(res.pal_result[0] & 8))
 			buf.pte &= ~PTE_MA_MASK;
 		/*
 		 * NOTE(review): psnames has 32 entries; this assumes the
 		 * page-size field never exceeds 31 -- TODO confirm.
 		 */
 		db_printf("%d %06x %013lx %013lx %4s %d  %d  %d  %d %d %-3s "
 		    "%d %06x\n", (int)buf.ifa & 1, buf.rr.rr_rid,
 		    buf.ifa >> 12, (buf.pte & PTE_PPN_MASK) >> 12,
 		    psnames[(buf.itir & ITIR_PS_MASK) >> 2],
 		    (buf.pte & PTE_ED) ? 1 : 0,
 		    (int)(buf.pte & PTE_AR_MASK) >> 9,
 		    (int)(buf.pte & PTE_PL_MASK) >> 7,
 		    (pmap_dirty(&buf)) ? 1 : 0,
 		    (pmap_accessed(&buf)) ? 1 : 0,
 		    manames[(buf.pte & PTE_MA_MASK) >> 2],
 		    (pmap_present(&buf)) ? 1 : 0,
 		    (int)((buf.itir & ITIR_KEY_MASK) >> 8));
 	}
 }
 
 /*
  * DDB command "itr": dump the translation registers of type 0
  * (presumably the instruction translation registers -- confirm).
  */
 DB_COMMAND(itr, db_itr)
 {
 	print_trs(0);
 }
 
 /*
  * DDB command "dtr": dump the translation registers of type 1
  * (presumably the data translation registers -- confirm).
  */
 DB_COMMAND(dtr, db_dtr)
 {
 	print_trs(1);
 }
 
 /*
  * DDB command "rr": print the id, page size and VE field of each of the
  * eight region registers.  Output goes through db_printf(), like that of
  * every other DDB command in this file.
  */
 DB_COMMAND(rr, db_rr)
 {
 	int i;
 	uint64_t t;
 	struct ia64_rr rr;
 
 	db_printf("RR RID    PgSz VE\n");
 	for (i = 0; i < 8; i++) {
 		/* Read region register i into t. */
 		__asm __volatile ("mov %0=rr[%1]"
 				  : "=r"(t)
 				  : "r"(IA64_RR_BASE(i)));
 		/* Reinterpret the raw value as its bit-field layout. */
 		*(uint64_t *) &rr = t;
 		db_printf("%d  %06x %4s %d\n",
 		       i, rr.rr_rid, psnames[rr.rr_ps], rr.rr_ve);
 	}
 }
 
 /*
  * DDB command "thash": print the result of ia64_thash() for the given
  * address.  Prints nothing when no address was supplied.
  */
 DB_COMMAND(thash, db_thash)
 {
 	if (have_addr)
 		db_printf("%p\n", (void *) ia64_thash(addr));
 }
 
 /*
  * DDB command "ttag": print the result of ia64_ttag() for the given
  * address.  Prints nothing when no address was supplied.
  */
 DB_COMMAND(ttag, db_ttag)
 {
 	if (have_addr)
 		db_printf("0x%lx\n", ia64_ttag(addr));
 }
 
 DB_COMMAND(kpte, db_kpte)
 {
 	struct ia64_lpte *pte;
 
 	if (!have_addr) {
 		db_printf("usage: kpte <kva>\n");
 		return;
 	}
 	if (addr < VM_MIN_KERNEL_ADDRESS) {
 		db_printf("kpte: error: invalid <kva>\n");
 		return;
 	}
 	pte = pmap_find_kpte(addr);
 	db_printf("kpte at %p:\n", pte);
 	db_printf("  pte  =%016lx\n", pte->pte);
 	db_printf("  itir =%016lx\n", pte->itir);
 	db_printf("  tag  =%016lx\n", pte->tag);
 	db_printf("  chain=%016lx\n", pte->chain);
 }
 
 #endif
Index: head/sys/kern/imgact_aout.c
===================================================================
--- head/sys/kern/imgact_aout.c	(revision 173360)
+++ head/sys/kern/imgact_aout.c	(revision 173361)
@@ -1,272 +1,274 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <machine/frame.h>
 #include <machine/md_var.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_param.h>
 
 static int	exec_aout_imgact(struct image_params *imgp);
 static int	aout_fixup(register_t **stack_base, struct image_params *imgp);
 
 /*
  * System entry vector installed as p_sysent for a.out processes (see
  * exec_aout_imgact()).
  *
  * This is a positional initializer: the order of the values must match
  * the member order of struct sysentvec exactly.  Of the members, only
  * the fixup hook (aout_fixup) and sv_psstrings (PS_STRINGS) are
  * referenced by name in this file.
  */
 struct sysentvec aout_sysvec = {
 	SYS_MAXSYSCALL,
 	sysent,
 	0,
 	0,
 	NULL,
 	0,
 	NULL,
 	NULL,
 	aout_fixup,
 	sendsig,
 	sigcode,
 	&szsigcode,
 	NULL,
 	"FreeBSD a.out",
 	NULL,
 	NULL,
 	MINSIGSTKSZ,
 	PAGE_SIZE,
 	VM_MIN_ADDRESS,
 	VM_MAXUSER_ADDRESS,
 	USRSTACK,
 	PS_STRINGS,
 	VM_PROT_ALL,
 	exec_copyout_strings,
 	exec_setregs,
 	NULL
 };
 
 /*
  * Stack fixup hook for a.out images: move *stack_base down by one word
  * and store argc there.  Returns whatever suword() returns.
  */
 static int
 aout_fixup(register_t **stack_base, struct image_params *imgp)
 {
 	register_t *argc_slot;
 
 	argc_slot = *stack_base - 1;
 	*stack_base = argc_slot;
 	return (suword(argc_slot, imgp->args->argc));
 }
 
 /*
  * Image activator for a.out executables.
  *
  * Checks the a.out header found in imgp->image_header, replaces the
  * process address space through exec_new_vmspace(), maps the text, data
  * and bss regions into the new map and records the entry point and
  * sysentvec for the process.
  *
  * Returns -1 when the magic number or the basic header checks fail,
  * EFAULT when text + data exceed the file size, ENOMEM when the text
  * or data limits are exceeded, the status of exec_new_vmspace() or
  * vm_map_insert() when one of those fails, and 0 on success.
  */
 static int
 exec_aout_imgact(imgp)
 	struct image_params *imgp;
 {
 	const struct exec *a_out = (const struct exec *) imgp->image_header;
 	struct thread *td = curthread;
 	struct vmspace *vmspace;
 	vm_map_t map;
 	vm_object_t object;
 	vm_offset_t text_end, data_end;
 	unsigned long virtual_offset;
 	unsigned long file_offset;
 	unsigned long bss_size;
 	int error;
 
 	/*
 	 * Linux and *BSD binaries look very much alike,
 	 * only the machine id is different:
 	 * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
 	 * NetBSD is in network byte order.. ugh.
 	 */
 	if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
 	    ((a_out->a_magic >> 16) & 0xff) != 0 &&
 	    ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
                 return -1;
 
 	/*
 	 * Set file/virtual offset based on a.out variant.
 	 *	We do two cases: host byte order and network byte order
 	 *	(for NetBSD compatibility)
 	 */
 	switch ((int)(a_out->a_magic & 0xffff)) {
 	case ZMAGIC:
 		virtual_offset = 0;
 		if (a_out->a_text) {
 			file_offset = PAGE_SIZE;
 		} else {
 			/* Bill's "screwball mode" */
 			file_offset = 0;
 		}
 		break;
 	case QMAGIC:
 		virtual_offset = PAGE_SIZE;
 		file_offset = 0;
 		/* Pass PS_STRINGS for BSD/OS binaries only. */
 		if (N_GETMID(*a_out) == MID_ZERO)
 			imgp->ps_strings = aout_sysvec.sv_psstrings;
 		break;
 	default:
 		/* NetBSD compatibility */
 		switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			virtual_offset = PAGE_SIZE;
 			file_offset = 0;
 			break;
 		default:
 			return (-1);
 		}
 	}
 
 	bss_size = roundup(a_out->a_bss, PAGE_SIZE);
 
 	/*
 	 * Check various fields in header for validity/bounds.
 	 */
 	if (/* entry point must lay with text region */
 	    a_out->a_entry < virtual_offset ||
 	    a_out->a_entry >= virtual_offset + a_out->a_text ||
 
 	    /* text and data size must each be page rounded */
 	    a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
 		return (-1);
 
 	/* text + data can't exceed file size */
 	if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
 		return (EFAULT);
 
 	/*
 	 * text/data/bss must not exceed limits
 	 */
 	PROC_LOCK(imgp->proc);
 	if (/* text can't exceed maximum text size */
 	    a_out->a_text > maxtsiz ||
 
 	    /* data + bss can't exceed rlimit */
 	    a_out->a_data + bss_size > lim_cur(imgp->proc, RLIMIT_DATA)) {
 			PROC_UNLOCK(imgp->proc);
 			return (ENOMEM);
 	}
 	PROC_UNLOCK(imgp->proc);
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
 	VOP_UNLOCK(imgp->vp, 0, td);
 
 	/*
 	 * Destroy old process VM and create a new one (with a new stack)
 	 */
-	exec_new_vmspace(imgp, &aout_sysvec);
+	error = exec_new_vmspace(imgp, &aout_sysvec);
 
 	/*
 	 * The vnode is re-locked before the error is examined, so the
 	 * function returns with imgp->vp locked on the failure path too.
 	 */
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error)
+		return (error);
 
 	/*
 	 * The vm space can be changed by exec_new_vmspace
 	 */
 	vmspace = imgp->proc->p_vmspace;
 
 	object = imgp->object;
 	map = &vmspace->vm_map;
 	vm_map_lock(map);
 	/* Reference taken for the text mapping; dropped again on failure. */
 	vm_object_reference(object);
 
 	text_end = virtual_offset + a_out->a_text;
 	error = vm_map_insert(map, object,
 		file_offset,
 		virtual_offset, text_end,
 		VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
 		MAP_COPY_ON_WRITE | MAP_PREFAULT);
 	if (error) {
 		vm_map_unlock(map);
 		vm_object_deallocate(object);
 		return (error);
 	}
 	data_end = text_end + a_out->a_data;
 	if (a_out->a_data) {
 		/* Second reference, this one for the data mapping. */
 		vm_object_reference(object);
 		error = vm_map_insert(map, object,
 			file_offset + a_out->a_text,
 			text_end, data_end,
 			VM_PROT_ALL, VM_PROT_ALL,
 			MAP_COPY_ON_WRITE | MAP_PREFAULT);
 		if (error) {
 			vm_map_unlock(map);
 			vm_object_deallocate(object);
 			return (error);
 		}
 	}
 
 	if (bss_size) {
 		/* bss has no backing object: NULL object, offset 0. */
 		error = vm_map_insert(map, NULL, 0,
 			data_end, data_end + bss_size,
 			VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error) {
 			vm_map_unlock(map);
 			return (error);
 		}
 	}
 	vm_map_unlock(map);
 
 	/* Fill in process VM information */
 	vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
 	vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset;
 	vmspace->vm_daddr = (caddr_t) (uintptr_t)
 			    (virtual_offset + a_out->a_text);
 
 	/* Fill in image_params */
 	imgp->interpreted = 0;
 	imgp->entry_addr = a_out->a_entry;
 
 	imgp->proc->p_sysent = &aout_sysvec;
 
 	return (0);
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  */
 static struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
 EXEC_SET(aout, aout_execsw);
Index: head/sys/kern/imgact_elf.c
===================================================================
--- head/sys/kern/imgact_elf.c	(revision 173360)
+++ head/sys/kern/imgact_elf.c	(revision 173361)
@@ -1,1301 +1,1303 @@
 /*-
  * Copyright (c) 2000 David O'Brien
  * Copyright (c) 1995-1996 Søren Schmidt
  * Copyright (c) 1996 Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/elf.h>
 #include <machine/md_var.h>
 
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 #include <machine/fpu.h>
 #include <compat/ia32/ia32_reg.h>
 #endif
 
 #define OLD_EI_BRAND	8
 
 static int __elfN(check_header)(const Elf_Ehdr *hdr);
 static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr,
     const char *interp);
 static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
     u_long *entry, size_t pagesize);
 static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object,
     vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
     vm_prot_t prot, size_t pagesize);
 static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
 
 /* Parent sysctl node for the knobs of this ELF word size. */
 SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
     "");
 
 /*
  * Brand that get_brandinfo() falls back to when neither the ELF header
  * nor the interpreter path selects one.  Defaults to -1; settable both
  * as a sysctl and as a tunable.
  */
 int __elfN(fallback_brand) = -1;
 SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
     fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
 TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
     &__elfN(fallback_brand));
 
 /*
  * Copied into the "trace" member of the auxargs by the image activator;
  * when non-zero, freebsd_fixup() adds an AT_DEBUG entry.
  */
 static int elf_trace = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, "");
 
 /*
  * NOTE(review): not referenced in this part of the file; presumably
  * consumed by the core dump code further down -- confirm.
  */
 static int elf_legacy_coredump = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, 
     &elf_legacy_coredump, 0, "");
 
 /* Table of registered brands; a NULL slot is free. */
 static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
 
 /*
  * Register a brand by storing "entry" in the first free (NULL) slot of
  * elf_brand_list.  Returns 0 on success, -1 when all MAX_BRANDS slots
  * are occupied.
  */
 int
 __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
 {
 	int slot;
 
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		if (elf_brand_list[slot] != NULL)
 			continue;
 		elf_brand_list[slot] = entry;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Unregister a brand by clearing the first slot of elf_brand_list that
  * holds "entry".  Returns 0 when a slot was found, -1 otherwise.
  */
 int
 __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
 {
 	int slot;
 
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		if (elf_brand_list[slot] != entry)
 			continue;
 		elf_brand_list[slot] = NULL;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Report whether any process in the system currently has the brand's
  * sysentvec as its p_sysent.  The process list is walked with
  * allproc_lock held shared.  Returns TRUE or FALSE.
  */
 int
 __elfN(brand_inuse)(Elf_Brandinfo *entry)
 {
 	struct proc *p;
 	int inuse;
 
 	inuse = FALSE;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_sysent != entry->sysvec)
 			continue;
 		inuse = TRUE;
 		break;
 	}
 	sx_sunlock(&allproc_lock);
 
 	return (inuse);
 }
 
 /*
  * Select the brand for an ELF image.
  *
  * Three kinds of branding are recognized, tried in this order over the
  * registered brands whose machine type matches the header:
  *   1. the EI_OSABI byte of the header, or the old-style brand string
  *      stored in the header at OLD_EI_BRAND (FreeBSD 3.x);
  *   2. the interpreter path, when the image names one;
  *   3. the fallback brand.
  * A ".note.ABI-tag" section, as found in Linux, FreeBSD 4.1+ and some
  * NetBSD binaries, is not examined yet.
  *
  * Returns the matching brand, or NULL when none applies.
  */
 static Elf_Brandinfo *
 __elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp)
 {
 	Elf_Brandinfo *cand;
 	int slot;
 
 	/* Pass 1: a brand carried by the executable itself. */
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		cand = elf_brand_list[slot];
 		if (cand == NULL || hdr->e_machine != cand->machine)
 			continue;
 		if (hdr->e_ident[EI_OSABI] == cand->brand)
 			return (cand);
 		if (strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
 		    cand->compat_3_brand, strlen(cand->compat_3_brand)) == 0)
 			return (cand);
 	}
 
 	/* Pass 2: a brand that claims the requested interpreter. */
 	if (interp != NULL) {
 		for (slot = 0; slot < MAX_BRANDS; slot++) {
 			cand = elf_brand_list[slot];
 			if (cand == NULL || hdr->e_machine != cand->machine)
 				continue;
 			if (strcmp(interp, cand->interp_path) == 0)
 				return (cand);
 		}
 	}
 
 	/* Pass 3: the configured brand of last resort. */
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		cand = elf_brand_list[slot];
 		if (cand == NULL || hdr->e_machine != cand->machine)
 			continue;
 		if (__elfN(fallback_brand) == cand->brand)
 			return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * Validate an ELF header for this kernel: the magic, class, data
  * encoding, versions and program header entry size must match the
  * target, and at least one registered brand must exist for the
  * header's machine type.  Returns 0 if acceptable, ENOEXEC otherwise.
  */
 static int
 __elfN(check_header)(const Elf_Ehdr *hdr)
 {
 	Elf_Brandinfo *cand;
 	int slot;
 
 	if (!IS_ELF(*hdr))
 		return (ENOEXEC);
 	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
 	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
 	    hdr->e_ident[EI_VERSION] != EV_CURRENT)
 		return (ENOEXEC);
 	if (hdr->e_phentsize != sizeof(Elf_Phdr) ||
 	    hdr->e_version != ELF_TARG_VER)
 		return (ENOEXEC);
 
 	/* Accept the header as soon as one brand handles this machine. */
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		cand = elf_brand_list[slot];
 		if (cand != NULL && cand->machine == hdr->e_machine)
 			return (0);
 	}
 
 	return (ENOEXEC);
 }
 
 /*
  * Populate a sub-page range [start, end) of the user address space.
  *
  * Anonymous memory is first inserted for the page(s) covering the
  * range; a failure of that insert is deliberately ignored.  When an
  * object is supplied, end - start bytes are then copied out from the
  * object's page at "offset".  The "prot" argument is not used.
  *
  * Returns KERN_SUCCESS, or KERN_FAILURE if the page cannot be mapped
  * into the kernel or the copyout fails.
  */
 static int
 __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot)
 {
 	struct sf_buf *sf;
 	vm_offset_t pgoff;
 	int error;
 
 	/* Create the backing page if it is not there yet. */
 	vm_map_lock(map);
 	vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end),
 	    VM_PROT_ALL, VM_PROT_ALL, 0);
 	vm_map_unlock(map);
 
 	/* Nothing to copy without an underlying object. */
 	if (object == NULL)
 		return (KERN_SUCCESS);
 
 	sf = vm_imgact_map_page(object, offset);
 	if (sf == NULL)
 		return (KERN_FAILURE);
 	pgoff = offset - trunc_page(offset);
 	error = copyout((caddr_t)sf_buf_kva(sf) + pgoff, (caddr_t)start,
 	    end - start);
 	vm_imgact_unmap_page(sf);
 
 	return (error ? KERN_FAILURE : KERN_SUCCESS);
 }
 
 /*
  * Map [start, end) of the user address space onto "object" beginning at
  * "offset", coping with ranges that are not page aligned.
  *
  * A partial leading or trailing page is handed to map_partial(), which
  * copies the bytes instead of mapping them.  The remaining whole pages
  * are mapped directly with vm_map_insert() when the file offset is page
  * aligned; otherwise anonymous memory is allocated and the data is
  * copied in piece by piece.
  *
  * Returns a KERN_* status, KERN_SUCCESS on success.
  */
 static int
 __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
 {
 	struct sf_buf *sf;
 	vm_offset_t off;
 	vm_size_t sz;
 	int error, rv;
 
 	/* Unaligned head: copy it, then advance to the next page boundary. */
 	if (start != trunc_page(start)) {
 		rv = __elfN(map_partial)(map, object, offset, start,
 		    round_page(start), prot);
 		if (rv)
 			return (rv);
 		offset += round_page(start) - start;
 		start = round_page(start);
 	}
 	/* Unaligned tail: copy it, then pull "end" back to a page boundary. */
 	if (end != round_page(end)) {
 		rv = __elfN(map_partial)(map, object, offset +
 		    trunc_page(end) - start, trunc_page(end), end, prot);
 		if (rv)
 			return (rv);
 		end = trunc_page(end);
 	}
 	if (end > start) {
 		if (offset & PAGE_MASK) {
 			/*
 			 * The mapping is not page aligned. This means we have
 			 * to copy the data. Sigh.
 			 */
 			rv = vm_map_find(map, NULL, 0, &start, end - start,
 			    FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0);
 			if (rv)
 				return (rv);
 			if (object == NULL)
 				return (KERN_SUCCESS);
 			/* Each pass copies at most up to the end of one source page. */
 			for (; start < end; start += sz) {
 				sf = vm_imgact_map_page(object, offset);
 				if (sf == NULL)
 					return (KERN_FAILURE);
 				off = offset - trunc_page(offset);
 				sz = end - start;
 				if (sz > PAGE_SIZE - off)
 					sz = PAGE_SIZE - off;
 				error = copyout((caddr_t)sf_buf_kva(sf) + off,
 				    (caddr_t)start, sz);
 				vm_imgact_unmap_page(sf);
 				if (error) {
 					return (KERN_FAILURE);
 				}
 				offset += sz;
 			}
 			rv = KERN_SUCCESS;
 		} else {
 			/* Reference for the new mapping; dropped if the insert fails. */
 			vm_object_reference(object);
 			vm_map_lock(map);
 			rv = vm_map_insert(map, object, offset, start, end,
 			    prot, VM_PROT_ALL, cow);
 			vm_map_unlock(map);
 			if (rv != KERN_SUCCESS)
 				vm_object_deallocate(object);
 		}
 		return (rv);
 	} else {
 		return (KERN_SUCCESS);
 	}
 }
 
 /*
  * Load one segment of an ELF image into "vmspace".
  *
  * The file-backed part (filsz bytes at "offset" in "object") is mapped
  * at vmaddr through map_insert().  If memsz is larger than filsz, the
  * remainder is backed by anonymous memory and the trailing fragment of
  * file data is copied into its first page.  "pagesize" is the alignment
  * used for truncating and rounding addresses and offsets.
  *
  * Returns 0 on success, ENOEXEC for a truncated file or filsz > memsz,
  * EINVAL when a mapping cannot be established, EIO when the source page
  * cannot be mapped, or the error from copyout().
  */
 static int
 __elfN(load_section)(struct vmspace *vmspace,
 	vm_object_t object, vm_offset_t offset,
 	caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
 	size_t pagesize)
 {
 	struct sf_buf *sf;
 	size_t map_len;
 	vm_offset_t map_addr;
 	int error, rv, cow;
 	size_t copy_len;
 	vm_offset_t file_addr;
 
 	/*
 	 * It's necessary to fail if the filsz + offset taken from the
 	 * header is greater than the actual file pager object's size.
 	 * If we were to allow this, then the vm_map_find() below would
 	 * walk right off the end of the file object and into the ether.
 	 *
 	 * While I'm here, might as well check for something else that
 	 * is invalid: filsz cannot be greater than memsz.
 	 */
 	if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
 	    filsz > memsz) {
 		uprintf("elf_load_section: truncated ELF file\n");
 		return (ENOEXEC);
 	}
 
 /*
  * Page truncation/rounding for an arbitrary page size "ps".  Being
  * preprocessor macros, these stay defined for the rest of the file.
  */
 #define trunc_page_ps(va, ps)	((va) & ~(ps - 1))
 #define round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
 
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
 	file_addr = trunc_page_ps(offset, pagesize);
 
 	/*
 	 * We have two choices.  We can either clear the data in the last page
 	 * of an oversized mapping, or we can start the anon mapping a page
 	 * early and copy the initialized data into that first page.  We
 	 * choose the second..
 	 */
 	if (memsz > filsz)
 		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
 	else
 		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
 
 	if (map_len != 0) {
 		/* cow flags: don't dump readonly sections in core */
 		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
 		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
 
 		rv = __elfN(map_insert)(&vmspace->vm_map,
 				      object,
 				      file_addr,	/* file offset */
 				      map_addr,		/* virtual start */
 				      map_addr + map_len,/* virtual end */
 				      prot,
 				      cow);
 		if (rv != KERN_SUCCESS)
 			return (EINVAL);
 
 		/* we can stop now if we've covered it all */
 		if (memsz == filsz) {
 			return (0);
 		}
 	}
 
 
 	/*
 	 * We have to get the remaining bit of the file into the first part
 	 * of the oversized map segment.  This is normally because the .data
 	 * segment in the file is extended to provide bss.  It's a neat idea
 	 * to try and save a page, but it's a pain in the behind to implement.
 	 */
 	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
 	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
 	    map_addr;
 
 	/* This had damn well better be true! */
 	if (map_len != 0) {
 		rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
 		    map_addr + map_len, VM_PROT_ALL, 0);
 		if (rv != KERN_SUCCESS) {
 			return (EINVAL);
 		}
 	}
 
 	if (copy_len != 0) {
 		vm_offset_t off;
 
 		sf = vm_imgact_map_page(object, offset + filsz);
 		if (sf == NULL)
 			return (EIO);
 
 		/* send the page fragment to user space */
 		/*
 		 * "off" is the distance between the pagesize-aligned and the
 		 * PAGE_SIZE-aligned start of the fragment inside the mapped
 		 * page.
 		 */
 		off = trunc_page_ps(offset + filsz, pagesize) -
 		    trunc_page(offset + filsz);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off,
 		    (caddr_t)map_addr, copy_len);
 		vm_imgact_unmap_page(sf);
 		if (error) {
 			return (error);
 		}
 	}
 
 	/*
 	 * set it to the specified protection.
 	 * XXX had better undo the damage from pasting over the cracks here!
 	 */
 	/* NOTE(review): the result of vm_map_protect() is not checked. */
 	vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
 	    round_page(map_addr + map_len),  prot, FALSE);
 
 	return (0);
 }
 
 /*
  * Load the file "file" into memory.  It may be either a shared object
  * or an executable.
  *
  * The "addr" reference parameter is in/out.  On entry, it specifies
  * the address where a shared object should be loaded.  If the file is
  * an executable, this value is ignored.  On exit, "addr" specifies
  * where the file was actually loaded.
  *
  * The "entry" reference parameter is out only.  On exit, it specifies
  * the entry point for the loaded file.
  *
  * "p" must be the process of the calling thread (the function panics
  * otherwise), and "pagesize" is passed through to load_section().
  *
  * Returns 0 on success or an errno value.  Success and failure share
  * one exit path, which unmaps the first page, releases the vnode and
  * frees the temporary lookup state.
  */
 static int
 __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
 	u_long *entry, size_t pagesize)
 {
 	/* Bulky lookup state, allocated with malloc() below. */
 	struct {
 		struct nameidata nd;
 		struct vattr attr;
 		struct image_params image_params;
 	} *tempdata;
 	const Elf_Ehdr *hdr = NULL;
 	const Elf_Phdr *phdr = NULL;
 	struct nameidata *nd;
 	struct vmspace *vmspace = p->p_vmspace;
 	struct vattr *attr;
 	struct image_params *imgp;
 	vm_prot_t prot;
 	u_long rbase;
 	u_long base_addr = 0;
 	int vfslocked, error, i, numsegs;
 
 	if (curthread->td_proc != p)
 		panic("elf_load_file - thread");	/* XXXKSE DIAGNOSTIC */
 
 	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
 	nd = &tempdata->nd;
 	attr = &tempdata->attr;
 	imgp = &tempdata->image_params;
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->attr = attr;
 	imgp->firstpage = NULL;
 	imgp->image_header = NULL;
 	imgp->object = NULL;
 	imgp->execlabel = NULL;
 
 	/* XXXKSE */
 	NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file,
 	    curthread);
 	vfslocked = 0;
 	if ((error = namei(nd)) != 0) {
 		/* Make sure the cleanup code does not vput() a stale vnode. */
 		nd->ni_vp = NULL;
 		goto fail;
 	}
 	vfslocked = NDHASGIANT(nd);
 	NDFREE(nd, NDF_ONLY_PNBUF);
 	imgp->vp = nd->ni_vp;
 
 	/*
 	 * Check permissions, modes, uid, etc on the file, and "open" it.
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto fail;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto fail;
 
 	/*
 	 * Also make certain that the interpreter stays the same, so set
 	 * its VV_TEXT flag, too.
 	 */
 	nd->ni_vp->v_vflag |= VV_TEXT;
 
 	imgp->object = nd->ni_vp->v_object;
 
 	hdr = (const Elf_Ehdr *)imgp->image_header;
 	if ((error = __elfN(check_header)(hdr)) != 0)
 		goto fail;
 	/* Only ET_DYN objects are relocated by the caller-supplied base. */
 	if (hdr->e_type == ET_DYN)
 		rbase = *addr;
 	else if (hdr->e_type == ET_EXEC)
 		rbase = 0;
 	else {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	/* Only support headers that fit within first page for now      */
 	/*    (multiplication of two Elf_Half fields will not overflow) */
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 
 	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_LOAD) {	/* Loadable segment */
 			prot = 0;
 			if (phdr[i].p_flags & PF_X)
   				prot |= VM_PROT_EXECUTE;
 			if (phdr[i].p_flags & PF_W)
   				prot |= VM_PROT_WRITE;
 			if (phdr[i].p_flags & PF_R)
   				prot |= VM_PROT_READ;
 
 			if ((error = __elfN(load_section)(vmspace,
 			    imgp->object, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
 			    pagesize)) != 0)
 				goto fail;
 			/*
 			 * Establish the base address if this is the
 			 * first segment.
 			 */
 			if (numsegs == 0)
   				base_addr = trunc_page(phdr[i].p_vaddr +
 				    rbase);
 			numsegs++;
 		}
 	}
 	*addr = base_addr;
 	*entry = (unsigned long)hdr->e_entry + rbase;
 
 	/* The success path falls through; "error" is 0 at this point. */
 fail:
 	if (imgp->firstpage)
 		exec_unmap_first_page(imgp);
 
 	if (nd->ni_vp)
 		vput(nd->ni_vp);
 
 	VFS_UNLOCK_GIANT(vfslocked);
 	free(tempdata, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Image activator for ELF executables of this word size.
  *
  * Validates the ELF and program headers held in the first page of the
  * image, selects a brand, replaces the process address space, loads
  * every PT_LOAD segment, optionally loads the interpreter named by
  * PT_INTERP, and builds the auxargs consumed by the brand's fixup
  * routine.
  *
  * Returns -1 when the header is not an acceptable ELF header of type
  * ET_EXEC or ET_DYN; otherwise 0 on success or an errno value.
  */
 static int
 __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
 	const Elf_Phdr *phdr;
 	Elf_Auxargs *elf_auxargs;
 	struct vmspace *vmspace;
 	vm_prot_t prot;
 	u_long text_size = 0, data_size = 0, total_size = 0;
 	u_long text_addr = 0, data_addr = 0;
 	u_long seg_size, seg_addr;
 	u_long addr, entry = 0, proghdr = 0;
 	int error = 0, i;
 	const char *interp = NULL;
 	Elf_Brandinfo *brand_info;
 	char *path;
 	struct thread *td = curthread;
 	struct sysentvec *sv;
 
 	/*
 	 * Do we have a valid ELF header ?
 	 *
 	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
 	 * if particular brand doesn't support it.
 	 */
 	if (__elfN(check_header)(hdr) != 0 ||
 	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
 		return (-1);
 
 	/*
 	 * From here on down, we return an errno, not -1, as we've
 	 * detected an ELF file.
 	 */
 
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
 		/* Only support headers in first page for now */
 		return (ENOEXEC);
 	}
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 	/* Find the first PT_INTERP entry, if any. */
 	for (i = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_INTERP) {
 			/* Path to interpreter */
 			if (phdr[i].p_filesz > MAXPATHLEN ||
 			    phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE)
 				return (ENOEXEC);
 			/*
 			 * NOTE(review): nothing here verifies that the path
 			 * is NUL-terminated within p_filesz -- confirm.
 			 */
 			interp = imgp->image_header + phdr[i].p_offset;
 			break;
 		}
 	}
 
 	brand_info = __elfN(get_brandinfo)(hdr, interp);
 	if (brand_info == NULL) {
 		uprintf("ELF binary type \"%u\" not known.\n",
 		    hdr->e_ident[EI_OSABI]);
 		return (ENOEXEC);
 	}
 	if (hdr->e_type == ET_DYN &&
 	    (brand_info->flags & BI_CAN_EXEC_DYN) == 0)
 		return (ENOEXEC);
 	sv = brand_info->sysvec;
 	/* A brand may substitute its own interpreter path. */
 	if (interp != NULL && brand_info->interp_newpath != NULL)
 		interp = brand_info->interp_newpath;
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
 	VOP_UNLOCK(imgp->vp, 0, td);
 
-	exec_new_vmspace(imgp, sv);
+	error = exec_new_vmspace(imgp, sv);
 	/*
 	 * NOTE(review): p_sysent is switched even when exec_new_vmspace()
 	 * failed; the error is only acted upon after the vnode has been
 	 * re-locked, so the function returns with imgp->vp locked.
 	 */
 	imgp->proc->p_sysent = sv;
 
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error)
+		return (error);
 
 	vmspace = imgp->proc->p_vmspace;
 
 	for (i = 0; i < hdr->e_phnum; i++) {
 		switch (phdr[i].p_type) {
 		case PT_LOAD:	/* Loadable segment */
 			prot = 0;
 			if (phdr[i].p_flags & PF_X)
   				prot |= VM_PROT_EXECUTE;
 			if (phdr[i].p_flags & PF_W)
   				prot |= VM_PROT_WRITE;
 			if (phdr[i].p_flags & PF_R)
   				prot |= VM_PROT_READ;
 
 #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
 			/*
 			 * Some x86 binaries assume read == executable,
 			 * notably the M3 runtime and therefore cvsup
 			 */
 			if (prot & VM_PROT_READ)
 				prot |= VM_PROT_EXECUTE;
 #endif
 
 			if ((error = __elfN(load_section)(vmspace,
 			    imgp->object, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
 			    sv->sv_pagesize)) != 0)
 				return (error);
 
 			/*
 			 * If this segment contains the program headers,
 			 * remember their virtual address for the AT_PHDR
 			 * aux entry. Static binaries don't usually include
 			 * a PT_PHDR entry.
 			 */
 			if (phdr[i].p_offset == 0 &&
 			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
 				<= phdr[i].p_filesz)
 				proghdr = phdr[i].p_vaddr + hdr->e_phoff;
 
 			seg_addr = trunc_page(phdr[i].p_vaddr);
 			seg_size = round_page(phdr[i].p_memsz +
 			    phdr[i].p_vaddr - seg_addr);
 
 			/*
 			 * Is this .text or .data?  We can't use
 			 * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the
 			 * alpha terribly and possibly does other bad
 			 * things so we stick to the old way of figuring
 			 * it out:  If the segment contains the program
 			 * entry point, it's a text segment, otherwise it
 			 * is a data segment.
 			 *
 			 * Note that obreak() assumes that data_addr + 
 			 * data_size == end of data load area, and the ELF
 			 * file format expects segments to be sorted by
 			 * address.  If multiple data segments exist, the
 			 * last one will be used.
 			 */
 			if (hdr->e_entry >= phdr[i].p_vaddr &&
 			    hdr->e_entry < (phdr[i].p_vaddr +
 			    phdr[i].p_memsz)) {
 				text_size = seg_size;
 				text_addr = seg_addr;
 				entry = (u_long)hdr->e_entry;
 			} else {
 				data_size = seg_size;
 				data_addr = seg_addr;
 			}
 			total_size += seg_size;
 			break;
 		case PT_PHDR: 	/* Program header table info */
 			proghdr = phdr[i].p_vaddr;
 			break;
 		default:
 			break;
 		}
 	}
 	
 	/* No separate data segment seen: reuse the text segment's extent. */
 	if (data_addr == 0 && data_size == 0) {
 		data_addr = text_addr;
 		data_size = text_size;
 	}
 
 	/*
 	 * Check limits.  It should be safe to check the
 	 * limits after loading the segments since we do
 	 * not actually fault in all the segments pages.
 	 */
 	PROC_LOCK(imgp->proc);
 	if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
 	    text_size > maxtsiz ||
 	    total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) {
 		PROC_UNLOCK(imgp->proc);
 		return (ENOMEM);
 	}
 
 	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
 	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
 	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
 
 	/*
 	 * We load the dynamic linker where a userland call
 	 * to mmap(0, ...) would put it.  The rationale behind this
 	 * calculation is that it leaves room for the heap to grow to
 	 * its maximum allowed size.
 	 */
 	addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr +
 	    lim_max(imgp->proc, RLIMIT_DATA));
 	PROC_UNLOCK(imgp->proc);
 
 	imgp->entry_addr = entry;
 
 	if (interp != NULL) {
 		/* The image vnode is unlocked while the interpreter is loaded. */
 		VOP_UNLOCK(imgp->vp, 0, td);
 		/* Try the interpreter under the brand's emulation path first. */
 		if (brand_info->emul_path != NULL &&
 		    brand_info->emul_path[0] != '\0') {
 			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 			snprintf(path, MAXPATHLEN, "%s%s",
 			    brand_info->emul_path, interp);
 			error = __elfN(load_file)(imgp->proc, path, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 			free(path, M_TEMP);
 			if (error == 0)
 				interp = NULL;
 		}
 		/* Otherwise fall back to the interpreter path as given. */
 		if (interp != NULL) {
 			error = __elfN(load_file)(imgp->proc, interp, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 		}
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 		if (error != 0) {
 			uprintf("ELF interpreter %s not found\n", interp);
 			return (error);
 		}
 	}
 
 	/*
 	 * Construct auxargs table (used by the fixup routine)
 	 */
 	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
 	elf_auxargs->execfd = -1;
 	elf_auxargs->phdr = proghdr;
 	elf_auxargs->phent = hdr->e_phentsize;
 	elf_auxargs->phnum = hdr->e_phnum;
 	elf_auxargs->pagesz = PAGE_SIZE;
 	elf_auxargs->base = addr;
 	elf_auxargs->flags = 0;
 	elf_auxargs->entry = entry;
 	elf_auxargs->trace = elf_trace;
 
 	imgp->auxargs = elf_auxargs;
 	imgp->interpreted = 0;
 
 	return (error);
 }
 
 #define	suword __CONCAT(suword, __ELF_WORD_SIZE)
 
 /*
  * Stack fixup hook for FreeBSD ELF images.
  *
  * Emits the AT_* auxiliary entries through AUXARGS_ENTRY(), starting
  * past the argument and environment slots above *stack_base, frees the
  * auxargs built by the image activator, and stores argc in the word
  * just below the original *stack_base.
  *
  * *stack_base is always moved down by one word to point at the argc
  * slot.  Returns 0 on success, or EFAULT when argc could not be stored
  * to user space; the result of suword() used to be dropped silently.
  */
 int
 __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
 {
 	Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs;
 	Elf_Addr *base;
 	Elf_Addr *pos;
 
 	base = (Elf_Addr *)*stack_base;
 	/* +2: presumably the NULL terminators of argv[] and envp[]. */
 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
 
 	if (args->trace) {
 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
 	}
 	if (args->execfd != -1) {
 		AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
 	}
 	AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
 	AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
 	AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
 	AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
 	AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
 	AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
 	AUXARGS_ENTRY(pos, AT_BASE, args->base);
 	AUXARGS_ENTRY(pos, AT_NULL, 0);
 
 	/* The auxargs are single-use; release them once written out. */
 	free(imgp->auxargs, M_TEMP);
 	imgp->auxargs = NULL;
 
 	/*
 	 * Publish the new stack base before the store, so a caller that
 	 * ignores the return value sees the same state as before.
 	 */
 	base--;
 	*stack_base = (register_t *)base;
 	if (suword(base, (long)imgp->args->argc) != 0)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * Code for generating ELF core dumps.
  */
 
 /*
  * Callback invoked by each_writable_segment() for every map entry that
  * is selected for dumping; the second argument is caller-supplied state.
  */
 typedef void (*segment_callback)(vm_map_entry_t, void *);
 
 /* Closure for cb_put_phdr(). */
 struct phdr_closure {
 	Elf_Phdr *phdr;		/* Program header to fill in */
 	Elf_Off offset;		/* Offset of segment in core file */
 };
 
 /* Closure for cb_size_segment(). */
 struct sseg_closure {
 	int count;		/* Count of writable segments. */
 	size_t size;		/* Total size of all writable segments. */
 };
 
 static void cb_put_phdr(vm_map_entry_t, void *);
 static void cb_size_segment(vm_map_entry_t, void *);
 static void each_writable_segment(struct thread *, segment_callback, void *);
 static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
     int, void *, size_t);
 static void __elfN(puthdr)(struct thread *, void *, size_t *, int);
 static void __elfN(putnote)(void *, size_t *, const char *, int,
     const void *, size_t);
 
 extern int osreldate;
 
 /*
  * Write an ELF core file for the process owning thread td to vnode vp.
  *
  * The segments to dump are sized first, then the header size is obtained
  * from a dry run of __elfN(puthdr)().  If header plus segments would reach
  * or exceed limit, nothing is written and EFAULT is returned.  Otherwise
  * the header is built and written, followed by the contents of each
  * dumped segment in program-header order.  Returns 0 or the first error
  * reported by the write path.
  */
 int
 __elfN(coredump)(td, vp, limit)
 	struct thread *td;
 	struct vnode *vp;
 	off_t limit;
 {
 	struct ucred *cred = td->td_ucred;
 	int error = 0;
 	struct sseg_closure seginfo;
 	void *hdr;
 	size_t hdrsize;
 
 	/* Size the program segments. */
 	seginfo.count = 0;
 	seginfo.size = 0;
 	each_writable_segment(td, cb_size_segment, &seginfo);
 
 	/*
 	 * Calculate the size of the core file header area by making
 	 * a dry run of generating it.  Nothing is written, but the
 	 * size is calculated.
 	 */
 	hdrsize = 0;
 	__elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count);
 
 	if (hdrsize + seginfo.size >= limit)
 		return (EFAULT);
 
 	/*
 	 * Allocate memory for building the header, fill it up,
 	 * and write it out.  An M_WAITOK allocation sleeps until it
 	 * can be satisfied and never returns NULL, so no failure
 	 * check is needed here.
 	 */
 	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
 	error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);
 
 	/* Write the contents of all of the writable segments. */
 	if (error == 0) {
 		Elf_Phdr *php;
 		off_t offset;
 		int i;
 
 		/*
 		 * The first program header is the PT_NOTE entry; the
 		 * "+ 1" steps over it to the first segment header.
 		 */
 		php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
 		offset = hdrsize;
 		for (i = 0; i < seginfo.count; i++) {
 			error = vn_rdwr_inchunks(UIO_WRITE, vp,
 			    (caddr_t)(uintptr_t)php->p_vaddr,
 			    php->p_filesz, offset, UIO_USERSPACE,
 			    IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
 			    curthread); /* XXXKSE */
 			if (error != 0)
 				break;
 			offset += php->p_filesz;
 			php++;
 		}
 	}
 	free(hdr, M_TEMP);
 
 	return (error);
 }
 
 /*
  * each_writable_segment() callback that emits one PT_LOAD program
  * header for the given map entry.  The closure supplies the header slot
  * to fill and the running file offset; both are advanced on return so
  * the next call describes the next segment.
  */
 static void
 cb_put_phdr(entry, closure)
 	vm_map_entry_t entry;
 	void *closure;
 {
 	struct phdr_closure *state = closure;
 	Elf_Phdr *ph = state->phdr;
 
 	/* Segments start on a page boundary within the core file. */
 	state->offset = round_page(state->offset);
 
 	ph->p_type = PT_LOAD;
 	ph->p_align = PAGE_SIZE;
 	ph->p_paddr = 0;
 	ph->p_vaddr = entry->start;
 	ph->p_offset = state->offset;
 	ph->p_memsz = entry->end - entry->start;
 	ph->p_filesz = ph->p_memsz;
 
 	/* Translate the VM protection bits into ELF segment flags. */
 	ph->p_flags = ((entry->protection & VM_PROT_READ) ? PF_R : 0) |
 	    ((entry->protection & VM_PROT_WRITE) ? PF_W : 0) |
 	    ((entry->protection & VM_PROT_EXECUTE) ? PF_X : 0);
 
 	state->offset += ph->p_filesz;
 	state->phdr = ph + 1;
 }
 
 /*
  * each_writable_segment() callback that accumulates, in the supplied
  * sseg_closure, how many segments will be dumped and how many bytes
  * they span in total.
  */
 static void
 cb_size_segment(entry, closure)
 	vm_map_entry_t entry;
 	void *closure;
 {
 	struct sseg_closure *totals = closure;
 
 	totals->size += entry->end - entry->start;
 	totals->count += 1;
 }
 
 /*
  * For each writable segment in the process's memory map, call the given
  * function with a pointer to the map entry and some arbitrary
  * caller-supplied data.
  *
  * "Writable" only applies in legacy coredump mode; otherwise any entry
  * with at least one protection bit set is a candidate.  The map is held
  * read-locked for the whole walk, including across the callback.
  */
 static void
 each_writable_segment(td, func, closure)
 	struct thread *td;
 	segment_callback func;
 	void *closure;
 {
 	struct proc *p = td->td_proc;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry;
 	vm_object_t backing_object, object;
 	boolean_t ignore_entry;
 
 	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		/*
 		 * Don't dump inaccessible mappings, deal with legacy
 		 * coredump mode.
 		 *
 		 * Note that read-only segments related to the elf binary
 		 * are marked MAP_ENTRY_NOCOREDUMP now so we no longer
 		 * need to arbitrarily ignore such segments.
 		 */
 		if (elf_legacy_coredump) {
 			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
 				continue;
 		} else {
 			if ((entry->protection & VM_PROT_ALL) == 0)
 				continue;
 		}
 
 		/*
 		 * Don't include memory segment in the coredump if
 		 * MAP_NOCORE is set in mmap(2) or MADV_NOCORE in
 		 * madvise(2).  Do not dump submaps (i.e. parts of the
 		 * kernel map).
 		 */
 		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
 			continue;
 
 		/* Entries with no backing object are skipped as well. */
 		if ((object = entry->object.vm_object) == NULL)
 			continue;
 
 		/*
 		 * Ignore memory-mapped devices and such things.
 		 *
 		 * Walk to the bottom of the backing-object chain, taking
 		 * each next object's lock before dropping the current one,
 		 * and decide based on the type of the last object.
 		 */
 		VM_OBJECT_LOCK(object);
 		while ((backing_object = object->backing_object) != NULL) {
 			VM_OBJECT_LOCK(backing_object);
 			VM_OBJECT_UNLOCK(object);
 			object = backing_object;
 		}
 		ignore_entry = object->type != OBJT_DEFAULT &&
 		    object->type != OBJT_SWAP && object->type != OBJT_VNODE;
 		VM_OBJECT_UNLOCK(object);
 		if (ignore_entry)
 			continue;
 
 		(*func)(entry, closure);
 	}
 	vm_map_unlock_read(map);
 }
 
 /*
  * Write the core file header to the file, including padding up to
  * the page boundary.
  */
 static int
 __elfN(corehdr)(td, vp, cred, numsegs, hdr, hdrsize)
 	struct thread *td;
 	struct vnode *vp;
 	struct ucred *cred;
 	int numsegs;
 	size_t hdrsize;
 	void *hdr;
 {
 	size_t off;
 
 	/* Fill in the header. */
 	bzero(hdr, hdrsize);
 	off = 0;
 	__elfN(puthdr)(td, hdr, &off, numsegs);
 
 	/* Write it to the core file. */
 	return (vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
 	    UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
 	    td)); /* XXXKSE */
 }
 
 /*
  * Note payload types used by __elfN(puthdr)().  When this file is built
  * for 32-bit ELF with COMPAT_IA32, the 32-bit structure layouts are
  * used; otherwise the native definitions are.
  */
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 typedef struct prstatus32 elf_prstatus_t;
 typedef struct prpsinfo32 elf_prpsinfo_t;
 typedef struct fpreg32 elf_prfpregset_t;
 typedef struct fpreg32 elf_fpregset_t;
 typedef struct reg32 elf_gregset_t;
 #else
 typedef prstatus_t elf_prstatus_t;
 typedef prpsinfo_t elf_prpsinfo_t;
 typedef prfpregset_t elf_prfpregset_t;
 typedef prfpregset_t elf_fpregset_t;
 typedef gregset_t elf_gregset_t;
 #endif
 
 /*
  * Lay out the core file header starting at *off: the ELF header, the
  * program header table (one PT_NOTE entry plus numsegs segment entries),
  * and the notes (one NT_PRPSINFO, then NT_PRSTATUS/NT_FPREGSET and any
  * machine-dependent notes per thread).
  *
  * When dst is NULL this is a sizing pass: nothing is written or
  * collected and only *off is advanced.  When dst is non-NULL the header
  * is written into dst.  In both cases *off is rounded up to a page
  * boundary before the segment headers are generated, so on return it is
  * the total header size.
  */
 static void
 __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs)
 {
 	struct {
 		elf_prstatus_t status;
 		elf_prfpregset_t fpregset;
 		elf_prpsinfo_t psinfo;
 	} *tempdata;
 	elf_prstatus_t *status;
 	elf_prfpregset_t *fpregset;
 	elf_prpsinfo_t *psinfo;
 	struct proc *p;
 	struct thread *thr;
 	size_t ehoff, noteoff, notesz, phoff;
 
 	p = td->td_proc;
 
 	/* Reserve space for the ELF header. */
 	ehoff = *off;
 	*off += sizeof(Elf_Ehdr);
 
 	/* Reserve the program header table; the extra slot is PT_NOTE. */
 	phoff = *off;
 	*off += (numsegs + 1) * sizeof(Elf_Phdr);
 
 	noteoff = *off;
 	/*
 	 * Don't allocate space for the notes if we're just calculating
 	 * the size of the header. We also don't collect the data.
 	 */
 	if (dst != NULL) {
 		tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK);
 		status = &tempdata->status;
 		fpregset = &tempdata->fpregset;
 		psinfo = &tempdata->psinfo;
 	} else {
 		tempdata = NULL;
 		status = NULL;
 		fpregset = NULL;
 		psinfo = NULL;
 	}
 
 	if (dst != NULL) {
 		psinfo->pr_version = PRPSINFO_VERSION;
 		psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
 		strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
 		/*
 		 * XXX - We don't fill in the command line arguments properly
 		 * yet.
 		 */
 		strlcpy(psinfo->pr_psargs, p->p_comm,
 		    sizeof(psinfo->pr_psargs));
 	}
 	/* In the sizing pass putnote() only advances *off. */
 	__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
 	    sizeof *psinfo);
 
 	/*
 	 * To have the debugger select the right thread (LWP) as the initial
 	 * thread, we dump the state of the thread passed to us in td first.
 	 * This is the thread that causes the core dump and thus likely to
 	 * be the right thread one wants to have selected in the debugger.
 	 */
 	thr = td;
 	while (thr != NULL) {
 		if (dst != NULL) {
 			status->pr_version = PRSTATUS_VERSION;
 			status->pr_statussz = sizeof(elf_prstatus_t);
 			status->pr_gregsetsz = sizeof(elf_gregset_t);
 			status->pr_fpregsetsz = sizeof(elf_fpregset_t);
 			status->pr_osreldate = osreldate;
 			status->pr_cursig = p->p_sig;
 			status->pr_pid = thr->td_tid;
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 			fill_regs32(thr, &status->pr_reg);
 			fill_fpregs32(thr, fpregset);
 #else
 			fill_regs(thr, &status->pr_reg);
 			fill_fpregs(thr, fpregset);
 #endif
 		}
 		__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
 		    sizeof *status);
 		__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
 		    sizeof *fpregset);
 		/*
 		 * Allow for MD specific notes, as well as any MD
 		 * specific preparations for writing MI notes.
 		 */
 		__elfN(dump_thread)(thr, dst, off);
 
 		/*
 		 * After td itself, walk the process thread list from the
 		 * start, stepping over td when it is met again so that it
 		 * is not dumped twice.
 		 */
 		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
 		    TAILQ_NEXT(thr, td_plist);
 		if (thr == td)
 			thr = TAILQ_NEXT(thr, td_plist);
 	}
 
 	notesz = *off - noteoff;
 
 	if (dst != NULL)
 		free(tempdata, M_TEMP);
 
 	/* Align up to a page boundary for the program segments. */
 	*off = round_page(*off);
 
 	if (dst != NULL) {
 		Elf_Ehdr *ehdr;
 		Elf_Phdr *phdr;
 		struct phdr_closure phc;
 
 		/*
 		 * Fill in the ELF header.
 		 */
 		ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
 		ehdr->e_ident[EI_MAG0] = ELFMAG0;
 		ehdr->e_ident[EI_MAG1] = ELFMAG1;
 		ehdr->e_ident[EI_MAG2] = ELFMAG2;
 		ehdr->e_ident[EI_MAG3] = ELFMAG3;
 		ehdr->e_ident[EI_CLASS] = ELF_CLASS;
 		ehdr->e_ident[EI_DATA] = ELF_DATA;
 		ehdr->e_ident[EI_VERSION] = EV_CURRENT;
 		ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
 		ehdr->e_ident[EI_ABIVERSION] = 0;
 		ehdr->e_ident[EI_PAD] = 0;
 		ehdr->e_type = ET_CORE;
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 		ehdr->e_machine = EM_386;
 #else
 		ehdr->e_machine = ELF_ARCH;
 #endif
 		ehdr->e_version = EV_CURRENT;
 		ehdr->e_entry = 0;
 		ehdr->e_phoff = phoff;
 		ehdr->e_flags = 0;
 		ehdr->e_ehsize = sizeof(Elf_Ehdr);
 		ehdr->e_phentsize = sizeof(Elf_Phdr);
 		ehdr->e_phnum = numsegs + 1;
 		ehdr->e_shentsize = sizeof(Elf_Shdr);
 		ehdr->e_shnum = 0;
 		ehdr->e_shstrndx = SHN_UNDEF;
 
 		/*
 		 * Fill in the program header entries.
 		 */
 		phdr = (Elf_Phdr *)((char *)dst + phoff);
 
 		/* The note segment. */
 		phdr->p_type = PT_NOTE;
 		phdr->p_offset = noteoff;
 		phdr->p_vaddr = 0;
 		phdr->p_paddr = 0;
 		phdr->p_filesz = notesz;
 		phdr->p_memsz = 0;
 		phdr->p_flags = 0;
 		phdr->p_align = 0;
 		phdr++;
 
 		/* All the writable segments from the program. */
 		phc.phdr = phdr;
 		phc.offset = *off;
 		each_writable_segment(td, cb_put_phdr, &phc);
 	}
 }
 
 /*
  * Append one ELF note at offset *off: the note header, the
  * NUL-terminated name, then descsz bytes of descriptor.  The name and
  * descriptor areas are each padded out to a multiple of sizeof(Elf_Size).
  * With dst == NULL nothing is copied and only *off is advanced, which is
  * how the caller sizes the note area.
  */
 static void
 __elfN(putnote)(void *dst, size_t *off, const char *name, int type,
     const void *desc, size_t descsz)
 {
 	Elf_Note nhdr;
 	char *out;
 
 	nhdr.n_namesz = strlen(name) + 1;
 	nhdr.n_descsz = descsz;
 	nhdr.n_type = type;
 
 	if (dst != NULL) {
 		out = (char *)dst + *off;
 		bcopy(&nhdr, out, sizeof nhdr);
 		out += sizeof nhdr;
 		bcopy(name, out, nhdr.n_namesz);
 		out += roundup2(nhdr.n_namesz, sizeof(Elf_Size));
 		bcopy(desc, out, nhdr.n_descsz);
 	}
 
 	/* Account for header, padded name and padded descriptor. */
 	*off += sizeof nhdr;
 	*off += roundup2(nhdr.n_namesz, sizeof(Elf_Size));
 	*off += roundup2(nhdr.n_descsz, sizeof(Elf_Size));
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  *
  * The entry pairs this file's image activator function with a name
  * string built from "ELF" and the word size this file is compiled for.
  */
 static struct execsw __elfN(execsw) = {
 	__CONCAT(exec_, __elfN(imgact)),
 	__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
 };
 EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
Index: head/sys/kern/imgact_gzip.c
===================================================================
--- head/sys/kern/imgact_gzip.c	(revision 173360)
+++ head/sys/kern/imgact_gzip.c	(revision 173361)
@@ -1,399 +1,403 @@
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
  * <phk@FreeBSD.org> wrote this file.  As long as you retain this notice you
  * can do whatever you want with this stuff. If we meet some day, and you think
  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
  * ----------------------------------------------------------------------------
  */
 
 /*
  * This module handles execution of a.out files which have been run through
  * "gzip".  This saves diskspace, but wastes cpu-cycles and VM.
  *
  * TODO:
  *	text-segments should be made R/O after being filled
  *	is the vm-stuff safe ?
  * 	should handle the entire header of gzip'ed stuff.
  *	inflate isn't quite reentrant yet...
  *	error-handling is a mess...
  *	so is the rest...
  *	tidy up unnecessary includes
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_aout.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 #include <sys/inflate.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
 /*
  * State shared between exec_gzip_imgact() and the inflate callbacks
  * NextByte() (input) and Flush() (output).
  */
 struct imgact_gzip {
 	struct image_params *ip;	/* image being activated */
 	struct exec     a_out;		/* a.out header gathered from the output */
 	int             error;		/* error recorded by a callback, 0 if none */
 	int		gotheader;	/* set once a_out has been filled completely */
 	int             where;		/* __LINE__ of the failure site, for diagnostics */
 	u_char         *inbuf;		/* kernel mapping of one page of the input file */
 	u_long          offset;		/* file offset of the page mapped at inbuf */
 	u_long          output;		/* count of inflated bytes seen so far */
 	u_long          len;		/* size of the compressed file */
 	int             idx;		/* next input byte position in the file */
 	/* Layout derived from the a.out header by do_aout_hdr(). */
 	u_long          virtual_offset, file_offset, file_end, bss_size;
 };
 
 static int exec_gzip_imgact(struct image_params *imgp);
 static int NextByte(void *vp);
 static int do_aout_hdr(struct imgact_gzip *);
 static int Flush(void *vp, u_char *, u_long siz);
 
 /*
  * Image activator for gzip-compressed a.out executables.
  *
  * Returns -1 when the image does not look like a gzip file this
  * activator handles, 0 on success, and an error otherwise: ENOEXEC for
  * unsupported flags, an unterminated or out-of-page name/comment, a
  * missing a.out header or an inflate failure; otherwise the error a
  * callback recorded or the one from unmapping the input window.
  *
  * The gzip member header, including any file name and comment, must lie
  * entirely within the first page of the file (imgp->image_header).
  */
 static int
 exec_gzip_imgact(imgp)
 	struct image_params *imgp;
 {
 	int             error, error2 = 0;
 	const u_char   *p = (const u_char *) imgp->image_header;
 	struct imgact_gzip igz;
 	struct inflate  infl;
 	struct vmspace *vmspace;
 
 	/* If these four are not OK, it isn't a gzip file */
 	if (p[0] != 0x1f)
 		return -1;	/* 0    Simply magic	 */
 	if (p[1] != 0x8b)
 		return -1;	/* 1    Simply magic	 */
 	if (p[2] != 0x08)
 		return -1;	/* 2    Compression method	 */
 	if (p[9] != 0x03)
 		return -1;	/* 9    OS compressed on	 */
 
 	/*
 	 * If this one contains anything but a comment or a filename marker,
 	 * we don't want to chew on it
 	 */
 	if (p[3] & ~(0x18))
 		return ENOEXEC;	/* 3    Flags		 */
 
 	/* These are of no use to us */
 	/* 4-7  Timestamp		 */
 	/* 8    Extra flags		 */
 
 	bzero(&igz, sizeof igz);
 	bzero(&infl, sizeof infl);
 	infl.gz_private = (void *) &igz;
 	infl.gz_input = NextByte;
 	infl.gz_output = Flush;
 
 	igz.ip = imgp;
 	igz.idx = 10;
 
 	/*
 	 * Skip the optional NUL-terminated strings.  The bound is tested
 	 * before each byte is read so that the scan never looks beyond
 	 * the first page, even when the previous string ended on the
 	 * page's last byte.
 	 */
 	if (p[3] & 0x08) {	/* skip a filename */
 		while (igz.idx < PAGE_SIZE && p[igz.idx] != 0)
 			igz.idx++;
 		if (igz.idx >= PAGE_SIZE)
 			return ENOEXEC;
 		igz.idx++;	/* step over the terminating NUL */
 	}
 	if (p[3] & 0x10) {	/* skip a comment */
 		while (igz.idx < PAGE_SIZE && p[igz.idx] != 0)
 			igz.idx++;
 		if (igz.idx >= PAGE_SIZE)
 			return ENOEXEC;
 		igz.idx++;	/* step over the terminating NUL */
 	}
 	igz.len = imgp->attr->va_size;
 
 	error = inflate(&infl);
 
 	/*
 	 * Make the text read-only/executable, but only if a header was
 	 * seen (so the new address space exists) and inflation succeeded.
 	 */
 	if (igz.gotheader && !error) {
 		vmspace = imgp->proc->p_vmspace;
 		error = vm_map_protect(&vmspace->vm_map,
 			(vm_offset_t) vmspace->vm_taddr,
 			(vm_offset_t) (vmspace->vm_taddr +
 				      (vmspace->vm_tsize << PAGE_SHIFT)) ,
 			VM_PROT_READ|VM_PROT_EXECUTE,0);
 	}
 
 	/*
 	 * Release the input window mapped by NextByte().  This must happen
 	 * on every path past inflate(), including the "no header" case
 	 * below, or the kernel_map page is leaked.
 	 */
 	if (igz.inbuf) {
 		error2 =
 			vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
 			    (vm_offset_t) igz.inbuf + PAGE_SIZE);
 	}
 
 	/*
 	 * The unzipped file may not even have been long enough to contain
 	 * a header giving Flush() a chance to return error.  Check for this.
 	 */
 	if ( !igz.gotheader )
 		return ENOEXEC;
 
 	if (igz.error || error || error2) {
 		printf("Output=%lu ", igz.output);
 		printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
 		       error, igz.error, error2, igz.where);
 	}
 	if (igz.error)
 		return igz.error;
 	if (error)
 		return ENOEXEC;
 	if (error2)
 		return error2;
 	return 0;
 }
 
 /*
  * Act on the a.out header collected in gz->a_out: work out the file and
  * virtual layout, validate the header against bounds and resource
  * limits, replace the process address space, and map the text/data and
  * bss regions.
  *
  * Returns 0 on success, -1 if the header is not acceptable (unknown
  * magic or failed sanity checks), ENOMEM if a size limit is exceeded,
  * or the nonzero value returned by a failing VM call.  On failure
  * gz->where records the source line of the failing check.
  */
 static int
 do_aout_hdr(struct imgact_gzip * gz)
 {
 	int             error;
 	struct thread  *td = curthread;
 	struct vmspace *vmspace;
 	vm_offset_t     vmaddr;
 
 	/*
 	 * Set file/virtual offset based on a.out variant. We do two cases:
 	 * host byte order and network byte order (for NetBSD compatibility)
 	 */
 	switch ((int) (gz->a_out.a_magic & 0xffff)) {
 	case ZMAGIC:
 		gz->virtual_offset = 0;
 		if (gz->a_out.a_text) {
 			gz->file_offset = PAGE_SIZE;
 		} else {
 			/* Bill's "screwball mode" */
 			gz->file_offset = 0;
 		}
 		break;
 	case QMAGIC:
 		gz->virtual_offset = PAGE_SIZE;
 		gz->file_offset = 0;
 		break;
 	default:
 		/* NetBSD compatibility */
 		switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
 		case ZMAGIC:
 		case QMAGIC:
 			gz->virtual_offset = PAGE_SIZE;
 			gz->file_offset = 0;
 			break;
 		default:
 			gz->where = __LINE__;
 			return (-1);
 		}
 	}
 
 	gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
 
 	/*
 	 * Check various fields in header for validity/bounds.
 	 */
 	if (			/* entry point must lay with text region */
 	    gz->a_out.a_entry < gz->virtual_offset ||
 	    gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
 
 	/* text and data size must each be page rounded */
 	    gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
 		gz->where = __LINE__;
 		return (-1);
 	}
 	/*
 	 * text/data/bss must not exceed limits
 	 */
 	PROC_LOCK(gz->ip->proc);
 	if (			/* text can't exceed maximum text size */
 	    gz->a_out.a_text > maxtsiz ||
 
 	/* data + bss can't exceed rlimit */
 	    gz->a_out.a_data + gz->bss_size >
 	    lim_cur(gz->ip->proc, RLIMIT_DATA)) {
 		PROC_UNLOCK(gz->ip->proc);
 		gz->where = __LINE__;
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(gz->ip->proc);
 	/* Find out how far we should go */
 	gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
 	VOP_UNLOCK(gz->ip->vp, 0, td);
 
 	/*
 	 * Destroy old process VM and create a new one (with a new stack)
 	 */
-	exec_new_vmspace(gz->ip, &aout_sysvec);
+	error = exec_new_vmspace(gz->ip, &aout_sysvec);
 
 	/* Retake the vnode lock before acting on the result. */
 	vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error) {
+		gz->where = __LINE__;
+		return (error);
+	}
 
 	vmspace = gz->ip->proc->p_vmspace;
 
 	vmaddr = gz->virtual_offset;
 
 	/* One anonymous fixed mapping covers text and data together. */
 	error = vm_mmap(&vmspace->vm_map,
 			&vmaddr,
 			gz->a_out.a_text + gz->a_out.a_data,
 			VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
 			OBJT_DEFAULT,
 			NULL,
 			0);
 
 	if (error) {
 		gz->where = __LINE__;
 		return (error);
 	}
 
 	if (gz->bss_size != 0) {
 		/*
 		 * Allocate demand-zeroed area for uninitialized data.
 		 * "bss" = 'block started by symbol' - named after the 
 		 * IBM 7090 instruction of the same name.
 		 */
 		vmaddr = gz->virtual_offset + gz->a_out.a_text + 
 			gz->a_out.a_data;
 		error = vm_map_find(&vmspace->vm_map,
 				NULL,
 				0,
 				&vmaddr, 
 				gz->bss_size,
 				FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
 		if (error) {
 			gz->where = __LINE__;
 			return (error);
 		}
 	}
 	/* Fill in process VM information */
 	vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
 	vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset;
 	vmspace->vm_daddr = (caddr_t) (uintptr_t)
 			    (gz->virtual_offset + gz->a_out.a_text);
 
 	/* Fill in image_params */
 	gz->ip->interpreted = 0;
 	gz->ip->entry_addr = gz->a_out.a_entry;
 
 	gz->ip->proc->p_sysent = &aout_sysvec;
 
 	return 0;
 }
 
 static int
 NextByte(void *vp)
 {
 	int             error;
 	struct imgact_gzip *igz = (struct imgact_gzip *) vp;
 
 	if (igz->idx >= igz->len) {
 		igz->where = __LINE__;
 		return GZ_EOF;
 	}
 	if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
 		return igz->inbuf[(igz->idx++) - igz->offset];
 	}
 	if (igz->inbuf) {
 		error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
 			    (vm_offset_t) igz->inbuf + PAGE_SIZE);
 		if (error) {
 			igz->where = __LINE__;
 			igz->error = error;
 			return GZ_EOF;
 		}
 	}
 	igz->offset = igz->idx & ~PAGE_MASK;
 
 	error = vm_mmap(kernel_map,	/* map */
 			(vm_offset_t *) & igz->inbuf,	/* address */
 			PAGE_SIZE,	/* size */
 			VM_PROT_READ,	/* protection */
 			VM_PROT_READ,	/* max protection */
 			0,	/* flags */
 			OBJT_VNODE,	/* handle type */
 			igz->ip->vp,	/* vnode */
 			igz->offset);	/* offset */
 	if (error) {
 		igz->where = __LINE__;
 		igz->error = error;
 		return GZ_EOF;
 	}
 	return igz->inbuf[(igz->idx++) - igz->offset];
 }
 
 /*
  * Output callback for inflate(): consume siz bytes of inflated data
  * at ptr.
  *
  * The first sizeof(struct exec) bytes are gathered into gz->a_out; once
  * the header is complete do_aout_hdr() sets up the new address space.
  * Bytes that fall between file_offset and file_end are then copied out
  * to the image at the matching virtual address; all other bytes are
  * only counted in gz->output.
  *
  * Returns 0 to continue, or ENOEXEC to abort inflation after recording
  * the cause in gz->error and gz->where.  A failing copyout() is treated
  * as such an error rather than ignored, so a partially populated image
  * is never reported as a success.
  */
 static int
 Flush(void *vp, u_char * ptr, u_long siz)
 {
 	struct imgact_gzip *gz = (struct imgact_gzip *) vp;
 	u_char         *p = ptr, *q;
 	int             i, error;
 
 	/* First, find an a.out-header. */
 	if (gz->output < sizeof gz->a_out) {
 		q = (u_char *) & gz->a_out;
 		i = min(siz, sizeof gz->a_out - gz->output);
 		bcopy(p, q + gz->output, i);
 		gz->output += i;
 		p += i;
 		siz -= i;
 		if (gz->output == sizeof gz->a_out) {
 			gz->gotheader = 1;
 			i = do_aout_hdr(gz);
 			if (i == -1) {
 				if (!gz->where)
 					gz->where = __LINE__;
 				gz->error = ENOEXEC;
 				return ENOEXEC;
 			} else if (i) {
 				gz->where = __LINE__;
 				gz->error = i;
 				return ENOEXEC;
 			}
 			/*
 			 * When the file image starts at offset 0 the
 			 * header itself is part of the loaded image.
 			 */
 			if (gz->file_offset == 0) {
 				q = (u_char *) (uintptr_t) gz->virtual_offset;
 				error = copyout(&gz->a_out, q,
 				    sizeof gz->a_out);
 				if (error) {
 					gz->where = __LINE__;
 					gz->error = error;
 					return ENOEXEC;
 				}
 			}
 		}
 	}
 	/* Skip over zero-padded first PAGE if needed */
 	if (gz->output < gz->file_offset &&
 	    gz->output + siz > gz->file_offset) {
 		i = min(siz, gz->file_offset - gz->output);
 		gz->output += i;
 		p += i;
 		siz -= i;
 	}
 	if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
 		i = min(siz, gz->file_end - gz->output);
 		q = (u_char *) (uintptr_t)
 		    (gz->virtual_offset + gz->output - gz->file_offset);
 		error = copyout(p, q, i);
 		if (error) {
 			gz->where = __LINE__;
 			gz->error = error;
 			return ENOEXEC;
 		}
 		gz->output += i;
 		p += i;
 		siz -= i;
 	}
 	/* Whatever remains lies outside the loaded range; just count it. */
 	gz->output += siz;
 	return 0;
 }
 
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  *
  * The entry pairs exec_gzip_imgact() with the activator name "gzip".
  */
 static struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
 EXEC_SET(execgzip, gzip_execsw);
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c	(revision 173360)
+++ head/sys/kern/kern_exec.c	(revision 173361)
@@ -1,1299 +1,1301 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/acct.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/wait.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/namei.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 static void exec_free_args(struct image_args *);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 /*
  * Sysctl handler reporting sv_psstrings from the current process's
  * sysentvec.  Requests flagged SCTL_MASK32 receive the value narrowed
  * to an unsigned int.
  */
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p = curproc;
 
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val32;
 
 		val32 = (unsigned int)p->p_sysent->sv_psstrings;
 		return (SYSCTL_OUT(req, &val32, sizeof(val32)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 	    sizeof(p->p_sysent->sv_psstrings)));
 }
 
 /*
  * Sysctl handler reporting sv_usrstack from the current process's
  * sysentvec.  Requests flagged SCTL_MASK32 receive the value narrowed
  * to an unsigned int.
  */
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p = curproc;
 
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val32;
 
 		val32 = (unsigned int)p->p_sysent->sv_usrstack;
 		return (SYSCTL_OUT(req, &val32, sizeof(val32)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 	    sizeof(p->p_sysent->sv_usrstack)));
 }
 
 /*
  * Sysctl handler reporting sv_stackprot from the current process's
  * sysentvec.
  */
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p = curproc;
 	int rv;
 
 	rv = SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot));
 	return (rv);
 }
 
 /*
  * Each of the items is a pointer to a `const struct execsw', hence the
  * double pointer here.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char    *fname; 
 	char    **argv;
 	char    **envv; 
 };
 #endif
 
 /*
  * execve system call: copy the path, argument and environment vectors
  * in from user space, then hand them to kern_execve() with no MAC label.
  * A copy-in failure is returned without calling kern_execve().
  */
 int
 execve(td, uap)
 	struct thread *td;
 	struct execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 	} */ *uap;
 {
 	struct image_args args;
 	int error;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &args, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 /*
  * __mac_execve system call: like execve(), but the caller-supplied MAC
  * label pointer is passed through to kern_execve().  Kernels built
  * without MAC return ENOSYS.
  */
 int
 __mac_execve(td, uap)
 	struct thread *td;
 	struct __mac_execve_args /* {
 		char *fname;
 		char **argv;
 		char **envv;
 		struct mac *mac_p;
 	} */ *uap;
 {
 #ifdef MAC
 	struct image_args args;
 	int error;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		return (error);
 	return (kern_execve(td, &args, uap->mac_p));
 #else
 	return (ENOSYS);
 #endif
 }
 
 /*
  * XXX: kern_execve has the astonishing property of not always returning to
  * the caller.  If sufficiently bad things happen during the call to
  * do_execve(), it can end up calling exit1(); as a result, callers must
  * avoid doing anything which they might need to undo (e.g., allocating
  * memory).
  */
 /*
  * Common entry point for the exec system calls.  For a process flagged
  * P_HADTHREADS, single-threading at SINGLE_BOUNDARY is requested first;
  * if that request fails the arguments are freed and ERESTART is
  * returned.  After do_execve() such a process is either moved to
  * SINGLE_EXIT (success) or released from single-threading (failure).
  */
 int
 kern_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	int error, busy;
 
 	AUDIT_ARG(argv, args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG(envv, args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		busy = thread_single(SINGLE_BOUNDARY);
 		PROC_UNLOCK(p);
 		if (busy) {
 			exec_free_args(args);
 			return (ERESTART);	/* Try again later. */
 		}
 	}
 
 	error = do_execve(td, args, mac_p);
 
 	if ((p->p_flag & P_HADTHREADS) == 0)
 		return (error);
 
 	PROC_LOCK(p);
 	if (error != 0) {
 		thread_single_end();
 	} else {
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		thread_single(SINGLE_EXIT);
 	}
 	PROC_UNLOCK(p);
 
 	return (error);
 }
 
 /*
  * In-kernel implementation of execve().
  *
  * td    - the calling thread; its process is the one being replaced.
  * args  - image arguments that have already been gathered into kernel
  *         memory (the path in args->fname is looked up with
  *         UIO_SYSSPACE).  The buffer is released with exec_free_args()
  *         on every path out of this function.
  * mac_p - MAC label request; it is only handed to mac_execve_enter(),
  *         and only when the kernel is built with MAC.
  *
  * P_INEXEC is set on the process on entry and cleared again on both the
  * success and the failure path.
  *
  * Returns 0 on success or an error number.  If the failure is detected
  * after the old address space has been destroyed
  * (imgp->vmspace_destroyed is set) there is nothing left to return to
  * and the process exits with SIGABRT instead.
  */
 static int
 do_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd, *ndp;
 	struct ucred *newcred = NULL, *oldcred;
 	struct uidinfo *euip;
 	register_t *stack_base;
 	int error, len, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts, *newsigacts;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *textvp = NULL;
 	int credential_changing;
 	int vfslocked;
 	int textset;
 #ifdef MAC
 	struct label *interplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 
 	vfslocked = 0;
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->entry_addr = 0;
 	imgp->vmspace_destroyed = 0;
 	imgp->interpreted = 0;
 	imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
 	imgp->auxargs = NULL;
 	imgp->vp = NULL;
 	imgp->object = NULL;
 	imgp->firstpage = NULL;
 	imgp->ps_strings = 0;
 	imgp->auxarg_size = 0;
 	imgp->args = args;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	imgp->image_header = NULL;
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	ndp = &nd;
 	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
 	    AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 
 	/*
 	 * Re-entered from the "interpreted" case below with ndp
 	 * re-initialized to look up the interpreter instead.
 	 */
 interpret:
 	error = namei(ndp);
 	if (error)
 		goto exec_fail;
 
 	vfslocked = NDHASGIANT(ndp);
 	imgp->vp = ndp->ni_vp;
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = imgp->vp->v_vflag & VV_TEXT;
 	imgp->vp->v_vflag |= VV_TEXT;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		if (error == -1) {
 			if (textset == 0)
 				imgp->vp->v_vflag &= ~VV_TEXT;
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		imgp->vp->v_vflag &= ~VV_TEXT;
 		/* free name buffer and old vnode */
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 #ifdef MAC
 		interplabel = mac_vnode_label_alloc();
 		mac_vnode_copy_label(ndp->ni_vp->v_label, interplabel);
 #endif
 		vput(ndp->ni_vp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 		/* set new name to that of the interpreter */
 		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		goto interpret;
 	}
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	/*
 	 * For security and other reasons, the file descriptor table cannot
 	 * be shared after an exec.
 	 */
 	fdunshare(p, td);
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	newcred = crget();
 	euip = uifind(attr.va_uid);
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/* close files on exec */
 	VOP_UNLOCK(imgp->vp, 0, td);
 	fdcloseexec(td);
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	/* Get a reference to the vnode prior to locking the proc */
 	VREF(ndp->ni_vp);
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	PROC_LOCK(p);
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		PROC_UNLOCK(p);
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 		PROC_LOCK(p);
 		p->p_sigacts = newsigacts;
 	} else
 		oldsigacts = NULL;
 
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
 	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
 	p->p_comm[len] = 0;
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
 		p->p_flag &= ~P_PPWAIT;
 		wakeup(p->p_pptr);
 	}
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	oldcred = p->p_ucred;
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
 	    attr.va_uid;
 	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
 	    attr.va_gid;
 #ifdef MAC
 	will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
 	    interplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 
 #ifdef KTRACE
 		if (p->p_tracevp != NULL &&
 		    priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) {
 			mtx_lock(&ktrace_mtx);
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
 			p->p_tracevp = NULL;
 			tracecred = p->p_tracecred;
 			p->p_tracecred = NULL;
 			mtx_unlock(&ktrace_mtx);
 		}
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * setugidsafety() may call closef() and then pfind()
 		 * which may grab the process lock.
 		 * fdcheckstd() may call falloc() which may block to
 		 * allocate memory, so temporarily drop the process lock.
 		 */
 		PROC_UNLOCK(p);
 		setugidsafety(td);
 		VOP_UNLOCK(imgp->vp, 0, td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 		/*
 		 * The process lock is still dropped at this point, which
 		 * matches the state in which the success path reaches done1.
 		 */
 		if (error != 0)
 			goto done1;
 		PROC_LOCK(p);
 		/*
 		 * Set the new credentials.
 		 */
 		crcopy(newcred, oldcred);
 		if (attr.va_mode & VSUID)
 			change_euid(newcred, euip);
 		if (attr.va_mode & VSGID)
 			change_egid(newcred, attr.va_gid);
 #ifdef MAC
 		if (will_transition) {
 			mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
 			    interplabel, imgp);
 		}
 #endif
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(newcred, newcred->cr_uid);
 		change_svgid(newcred, newcred->cr_gid);
 		p->p_ucred = newcred;
 		newcred = NULL;
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			crcopy(newcred, oldcred);
 			change_svuid(newcred, newcred->cr_uid);
 			change_svgid(newcred, newcred->cr_gid);
 			p->p_ucred = newcred;
 			newcred = NULL;
 		}
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced prior
 	 * to locking the proc lock.
 	 */
 	textvp = p->p_textvp;
 	p->p_textvp = ndp->ni_vp;
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/*
 	 * If tracing the process, trap to debugger so breakpoints
 	 * can be set before the program executes.
 	 * Use tdsignal to deliver signal to current thread, use
 	 * psignal may cause the signal to be delivered to wrong thread
 	 * because that thread will exit, remember we are going to enter
 	 * single thread mode.
 	 */
 	if (p->p_flag & P_TRACED)
 		tdsignal(p, td, SIGTRAP, NULL);
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 *
 	 * The proc lock needs to be released before taking the PMC
 	 * SX.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		PROC_UNLOCK(p);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 	} else
 		PROC_UNLOCK(p);
 #else  /* !HWPMC_HOOKS */
 	PROC_UNLOCK(p);
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 	else
 		exec_setregs(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 
 	vfs_mark_atime(imgp->vp, td);
 
 done1:
 	/*
 	 * Free any resources malloc'd earlier that we didn't use.
 	 */
 	uifree(euip);
 	if (newcred == NULL)
 		crfree(oldcred);
 	else
 		crfree(newcred);
 	VOP_UNLOCK(imgp->vp, 0, td);
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (textvp != NULL) {
 		int tvfslocked;
 
 		tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
 		vrele(textvp);
 		VFS_UNLOCK_GIANT(tvfslocked);
 	}
 	/*
 	 * On failure, give back the reference taken with VREF() above;
 	 * on success that reference now lives in p->p_textvp.
 	 */
 	if (ndp->ni_vp && error != 0)
 		vrele(ndp->ni_vp);
 #ifdef KTRACE
 	if (tracevp != NULL) {
 		int tvfslocked;
 
 		tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
 		vrele(tracevp);
 		VFS_UNLOCK_GIANT(tvfslocked);
 	}
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	/*
 	 * Re-lock the image vnode before falling through to
 	 * exec_fail_dealloc, which vput()s it.
 	 */
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (oldargs != NULL)
 		pargs_drop(oldargs);
 	if (newargs != NULL)
 		pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 
 exec_fail_dealloc:
 
 	/*
 	 * free various allocated resources
 	 */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 		vput(imgp->vp);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	if (error == 0) {
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 		goto done2;
 	}
 
 exec_fail:
 	/* we're done here, clear P_INEXEC */
 	PROC_LOCK(p);
 	p->p_flag &= ~P_INEXEC;
 	PROC_UNLOCK(p);
 
 done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
 	if (interplabel != NULL)
 		mac_vnode_label_free(interplabel);
 #endif
 	VFS_UNLOCK_GIANT(vfslocked);
 	exec_free_args(args);
 
 	if (error && imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 		exit1(td, W_EXITCODE(0, SIGABRT));
 		/* NOT REACHED */
 	}
 	return (error);
 }
 
 /*
  * Map the first page of the image vnode (imgp->vp) into kernel address
  * space and point imgp->image_header at it.
  *
  * Any first page that is still mapped from an earlier call is released
  * first.  On success the page is held and imgp->firstpage refers to the
  * sf_buf that maps it; exec_unmap_first_page() undoes both.
  *
  * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if
  * page 0 could not be read in.
  */
 int
 exec_map_first_page(imgp)
 	struct image_params *imgp;
 {
 	int rv, i;
 	int initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_LOCK(object);
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
 		/*
 		 * Page 0 is not fully valid and has to be read.  Build a
 		 * run of following pages to read along with it, at most
 		 * VM_INITIAL_PAGEIN pages and never past the object size.
 		 */
 		initial_pagein = VM_INITIAL_PAGEIN;
 		if (initial_pagein > object->size)
 			initial_pagein = object->size;
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
 				/*
 				 * A resident page ends the run if it already
 				 * has valid contents or is busy; otherwise
 				 * it is busied and added to the run.
 				 */
 				if (ma[i]->valid)
 					break;
 				if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
 					break;
 				vm_page_busy(ma[i]);
 			} else {
 				/* Not resident: allocate it, or end the run. */
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		/* i is now the number of pages collected in ma[]. */
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
 		/* Look page 0 up again rather than trusting the old pointer. */
 		ma[0] = vm_page_lookup(object, 0);
 		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
 		    (ma[0]->valid == 0)) {
 			if (ma[0]) {
 				vm_page_lock_queues();
 				vm_page_free(ma[0]);
 				vm_page_unlock_queues();
 			}
 			VM_OBJECT_UNLOCK(object);
 			return (EIO);
 		}
 	}
 	/* Hold the page so it stays around while it is mapped. */
 	vm_page_lock_queues();
 	vm_page_hold(ma[0]);
 	vm_page_unlock_queues();
 	vm_page_wakeup(ma[0]);
 	VM_OBJECT_UNLOCK(object);
 
 	/*
 	 * NOTE(review): the sf_buf_alloc() result is used without a NULL
 	 * check; presumably it cannot fail when called with flags 0 -
 	 * confirm.
 	 */
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 /*
  * Release the kernel mapping of the image's first page, if there is one,
  * and drop the hold that was placed on the page when it was mapped.
  * imgp->firstpage is NULL afterwards.
  */
 void
 exec_unmap_first_page(imgp)
 	struct image_params *imgp;
 {
 	vm_page_t held;
 
 	/* Nothing mapped, nothing to undo. */
 	if (imgp->firstpage == NULL)
 		return;
 
 	/* Fetch the page before the sf_buf that names it goes away. */
 	held = sf_buf_page(imgp->firstpage);
 	sf_buf_free(imgp->firstpage);
 	imgp->firstpage = NULL;
 
 	vm_page_lock_queues();
 	vm_page_unhold(held);
 	vm_page_unlock_queues();
 }
 
 /*
  * Destroy old address space, and allocate a new stack
  *	The new stack is only SGROWSIZ large because it is grown
  *	automatically in trap.c.
  *
  * imgp - image being activated; imgp->proc is the process whose address
  *        space is replaced.  imgp->vmspace_destroyed is set to 1 and
  *        imgp->sysent to sv before anything else is done, so both are
  *        already in place on every error return.
  * sv   - system entry vector of the new image; supplies the user address
  *        range, stack top, stack protection and optional stack size limit.
  *
  * Returns 0 on success, otherwise the non-zero value returned by
  * vmspace_exec() or vm_map_stack().
  */
 int
 exec_new_vmspace(imgp, sv)
 	struct image_params *imgp;
 	struct sysentvec *sv;
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_offset_t stack_addr;
 	vm_map_t map;
 	u_long ssiz;
 
 	imgp->vmspace_destroyed = 1;
 	imgp->sysent = sv;
 
 	/* May be called with Giant held */
 	EVENTHANDLER_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		/* Sole user and same address range: empty the map in place. */
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 	} else {
 		/*
 		 * Switch the process to a new vmspace; a failure to do so
 		 * is passed back to the caller.
 		 */
-		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
+		error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
+		if (error)
+			return (error);
 		/* The process now has a different vmspace; reload both. */
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Allocate a new stack */
 	/* Prefer the ABI's own stack size limit over the global one. */
 	if (sv->sv_maxssiz != NULL)
 		ssiz = *sv->sv_maxssiz;
 	else
 		ssiz = maxssiz;
 	stack_addr = sv->sv_usrstack - ssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error)
 		return (error);
 
 #ifdef __ia64__
 	/* Allocate a new register stack */
 	stack_addr = IA64_BACKINGSTORE;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
 #endif
 
 	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
 	 * VM_STACK case, but they are still used to monitor the size of the
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process address
  * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	char *argp, *envp;
 	int error;
 	size_t length;
 
 	error = 0;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 	/*
 	 * Allocate temporary demand zeroed space for argument and
 	 *	environment strings:
 	 *
 	 * o ARG_MAX for argument and environment;
 	 * o MAXSHELLCMDLEN for the name of interpreters.
 	 */
 	args->buf = (char *) kmem_alloc_wait(exec_map,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	if (args->buf == NULL)
 		return (ENOMEM);
 	args->begin_argv = args->buf;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	args->fname = args->buf + ARG_MAX;
 
 	/*
 	 * Copy the file name.
 	 */
 	error = (segflg == UIO_SYSSPACE) ?
 	    copystr(fname, args->fname, PATH_MAX, &length) :
 	    copyinstr(fname, args->fname, PATH_MAX, &length);
 	if (error != 0)
 		goto err_exit;
 
 	/*
 	 * extract arguments first
 	 */
 	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
 		if (argp == (caddr_t) -1) {
 			error = EFAULT;
 			goto err_exit;
 		}
 		if ((error = copyinstr(argp, args->endp,
 		    args->stringspace, &length))) {
 			if (error == ENAMETOOLONG) 
 				error = E2BIG;
 			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
 			if (envp == (caddr_t)-1) {
 				error = EFAULT;
 				goto err_exit;
 			}
 			if ((error = copyinstr(envp, args->endp,
 			    args->stringspace, &length))) {
 				if (error == ENAMETOOLONG)
 					error = E2BIG;
 				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 
 err_exit:
 	exec_free_args(args);
 	return (error);
 }
 
 /*
  * Give the exec string buffer back to exec_map and clear args->buf so
  * that a second call is a no-op.
  */
 static void
 exec_free_args(struct image_args *args)
 {
 
 	if (args->buf == NULL)
 		return;
 
 	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	args->buf = NULL;
 }
 
 /*
  * Lay out the argument and environment strings in the new process
  * address space, below the ps_strings structure and the signal
  * trampoline, and build the argv/envp pointer tables underneath them.
  * The ps_strings structure is filled in along the way.
  *
  * Returns the address of the start of the pointer tables, which the
  * caller uses as the initial stack pointer.  The results of the
  * individual copyout()/suword() calls are not checked.
  */
 register_t *
 exec_copyout_strings(imgp)
 	struct image_params *imgp;
 {
 	struct proc *p;
 	struct ps_strings *arginfo;
 	register_t *stack_base;
 	char **vectp;
 	char *stringp, *destp;
 	int argc, envc;
 	int szsigcode;
 
 	p = imgp->proc;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 
 	/*
 	 * Size of the signal trampoline for this exec type, if it has one.
 	 */
 	if (p->p_sysent->sv_szsigcode != NULL)
 		szsigcode = *(p->p_sysent->sv_szsigcode);
 	else
 		szsigcode = 0;
 
 	/*
 	 * The strings go below ps_strings, the trampoline and some spare
 	 * room, with their total size rounded up to pointer size.
 	 */
 	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode != 0) {
 		copyout(p->p_sysent->sv_sigcode,
 		    ((caddr_t)arginfo - szsigcode), szsigcode);
 	}
 
 	/*
 	 * Place the pointer tables below the strings.  The '+ 2' is for
 	 * the null pointers at the end of each of the arg and env vector
 	 * sets.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * Leave room for the ELF auxargs data as well; when the
 		 * activator did not say how much, fall back to
 		 * 'AT_COUNT*2' for compatibility.
 		 */
 		if (imgp->auxarg_size == 0)
 			imgp->auxarg_size = AT_COUNT * 2;
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size) *
 		    sizeof(char *));
 	} else {
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2) * sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment - in one go.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.  After each pointer
 	 * is stored, both cursors step over the string and its NUL.
 	 */
 	while (argc-- > 0) {
 		suword(vectp, (long)(intptr_t)destp);
 		vectp++;
 		do {
 			destp++;
 		} while (*stringp++ != 0);
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp, 0);
 	vectp++;
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	while (envc-- > 0) {
 		suword(vectp, (long)(intptr_t)destp);
 		vectp++;
 		do {
 			destp++;
 		} while (*stringp++ != 0);
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Decide whether the current thread may execute the file imgp->vp, and
  * "open" it if so.
  *	Called with imgp->vp locked.
  *	Fills in *imgp->attr with the file's attributes.
  *	Return 0 for success or error code on failure.
  */
 int
 exec_check_permissions(imgp)
 	struct image_params *imgp;
 {
 	struct thread *td = curthread;		/* XXXKSE */
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	int error;
 
 	/* Get file attributes */
 	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 
 #ifdef MAC
 	error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 	if (error != 0)
 		return (error);
 #endif
 
 	/* File execution is disabled for the filesystem the file is on. */
 	if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 		return (EACCES);
 
 	/*
 	 * Ensure that at least one execute bit is on - otherwise root
 	 * would always succeed, and we don't want that to happen unless
 	 * the file really is executable.
 	 */
 	if ((attr->va_mode & 0111) == 0)
 		return (EACCES);
 
 	/* Only regular files can be executed. */
 	if (attr->va_type != VREG)
 		return (EACCES);
 
 	/*
 	 * Zero length files can't be exec'd
 	 */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/*
 	 *  Check for execute permission to file based on current credentials.
 	 */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Deny execution while anyone has the file open for writing.
 	 */
 	if (vp->v_writecount != 0)
 		return (ETXTBSY);
 
 	/*
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case) and hand its verdict straight back.
 	 */
 	return (VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL));
 }
 
 /*
  * Exec handler registration
  *
  * Append execsw_arg to the NULL-terminated execsw table.  A new table
  * is allocated, the old entries are copied over and the old table is
  * freed.  Returns 0, or ENOMEM if the allocation yields NULL.
  */
 int
 exec_register(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **newexecsw;
 	int i, nold;
 
 	/* Count the handlers that are currently installed. */
 	nold = 0;
 	if (execsw != NULL) {
 		while (execsw[nold] != NULL)
 			nold++;
 	}
 
 	/* Old entries plus the new slot and the trailing NULL. */
 	newexecsw = malloc((nold + 2) * sizeof(*newexecsw), M_TEMP,
 	    M_WAITOK);
 	if (newexecsw == NULL)
 		return (ENOMEM);
 
 	for (i = 0; i < nold; i++)
 		newexecsw[i] = execsw[i];
 	newexecsw[nold] = execsw_arg;
 	newexecsw[nold + 1] = NULL;
 
 	if (execsw != NULL)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
 
 /*
  * Remove execsw_arg from the NULL-terminated execsw table by building a
  * new table without it and freeing the old one.
  *
  * Panics if there is no table at all.  Returns 0 on success, ENOENT if
  * execsw_arg is not in the table, or ENOMEM if the allocation yields
  * NULL.
  */
 int
 exec_unregister(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **newexecsw;
 	int i, nkeep, present;
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	/* One pass: is the handler there, and how many others remain? */
 	nkeep = 0;
 	present = 0;
 	for (i = 0; execsw[i] != NULL; i++) {
 		if (execsw[i] == execsw_arg)
 			present = 1;
 		else
 			nkeep++;
 	}
 	if (!present)
 		return (ENOENT);
 
 	/* Surviving entries plus the trailing NULL. */
 	newexecsw = malloc((nkeep + 1) * sizeof(*newexecsw), M_TEMP,
 	    M_WAITOK);
 	if (newexecsw == NULL)
 		return (ENOMEM);
 
 	nkeep = 0;
 	for (i = 0; execsw[i] != NULL; i++) {
 		if (execsw[i] != execsw_arg)
 			newexecsw[nkeep++] = execsw[i];
 	}
 	newexecsw[nkeep] = NULL;
 
 	free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c	(revision 173360)
+++ head/sys/kern/kern_fork.c	(revision 173361)
@@ -1,794 +1,821 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/syscall.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/unistd.h>	
 #include <sys/sx.h>
 #include <sys/signalvar.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 
 /*
  * Fallback definition of the fork() argument structure, compiled only
  * when _SYS_SYSPROTO_H_ is not defined.  fork() below never reads its
  * uap argument, so the structure carries a single placeholder member.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fork_args {
 	int     dummy;
 };
 #endif
 
 /* ARGSUSED */
 int
 fork(td, uap)
 	struct thread *td;
 	struct fork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 int
 vfork(td, uap)
 	struct thread *td;
 	struct vfork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 int
 rfork(td, uap)
 	struct thread *td;
 	struct rfork_args *uap;
 {
 	struct proc *p2;
 	int error;
 
 	/* Don't allow kernel-only flags. */
 	if ((uap->flags & RFKERNELONLY) != 0)
 		return (EINVAL);
 
 	AUDIT_ARG(fflags, uap->flags);
 	error = fork1(td, uap->flags, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2 ? p2->p_pid : 0;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /*
  * nprocs is the current number of processes; it starts at 1 to account
  * for process 0.  lastpid is the last used PID and is exported
  * read-only through the sysctl declared below.
  */
 int	nprocs = 1;		/* process 0 */
 int	lastpid = 0;
 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, 
     "Last used PID");
 
 /*
  * Random component to lastpid generation.  We mix in a random factor to make
  * it a little harder to predict.  We sanity check the modulus value to avoid
  * doing it in critical paths.  Don't let it be too small or we pointlessly
  * waste randomness entropy, and don't let it be impossibly large.  Using a
  * modulus that is too big causes a LOT more process table scans and slows
  * down fork processing as the pidchecked caching is defeated.
  */
 static int randompid = 0;
 
 static int
 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
 {
 	int error, pid;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error != 0)
 		return(error);
 	sx_xlock(&allproc_lock);
 	pid = randompid;
 	error = sysctl_handle_int(oidp, &pid, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
 			pid = PID_MAX - 100;
 		else if (pid < 2)			/* NOP */
 			pid = 0;
 		else if (pid < 100)			/* Make it reasonable */
 			pid = 100;
 		randompid = pid;
 	}
 	sx_xunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
 
 /*
  * fork1: common back end used to create a new process, or (when RFPROC
  * is not set in flags) to divorce parts of the calling process from
  * itself.
  *
  * td    - calling thread; td->td_proc is the parent (p1).
  * flags - RF* flags selecting what is shared, copied or cleared.
  * pages - if non-zero, handed to vm_thread_new_altkstack() for the
  *         child's first thread.
  * procp - on success *procp is set to the new process, or to NULL when
  *         RFPROC was not requested.
  *
  * Returns 0 on success, otherwise an errno value: EINVAL for RFFDG
  * combined with RFCFDG, ERESTART when thread_single() fails, the error
  * from vm_forkproc() in the !RFPROC case, ENOMEM when a thread,
  * alternate kstack or vmspace cannot be allocated, and EAGAIN when a
  * process limit is hit.
  *
  * NOTE(review): on the fail/fail1 error paths *procp is not written.
  */
 int
 fork1(td, flags, pages, procp)
 	struct thread *td;
 	int flags;
 	int pages;
 	struct proc **procp;
 {
 	struct proc *p1, *p2, *pptr;
 	struct proc *newproc;
 	int ok, trypid;
 	static int curfail, pidchecked = 0;
 	static struct timeval lastfail;
 	struct filedesc *fd;
 	struct filedesc_to_leader *fdtol;
 	struct thread *td2;
 	struct sigacts *newsigacts;
+	struct vmspace *vm2;
 	int error;
 
 	/* Can't copy and clear. */
 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
 		return (EINVAL);
 
 	p1 = td->td_proc;
 
 	/*
 	 * Here we don't create a new process, but we divorce
 	 * certain parts of a process from itself.
 	 */
 	if ((flags & RFPROC) == 0) {
 		if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
 		    (flags & (RFCFDG | RFFDG))) {
 			PROC_LOCK(p1);
 			if (thread_single(SINGLE_BOUNDARY)) {
 				PROC_UNLOCK(p1);
 				return (ERESTART);
 			}
 			PROC_UNLOCK(p1);
 		}
 
 		/*
 		 * vm_forkproc() can fail here.  On error the descriptor
 		 * work below is skipped, but control still reaches
 		 * norfproc_fail so that thread_single_end() is run.
 		 */
-		vm_forkproc(td, NULL, NULL, flags);
+		error = vm_forkproc(td, NULL, NULL, NULL, flags);
+		if (error)
+			goto norfproc_fail;
 
 		/*
 		 * Close all file descriptors.
 		 */
 		if (flags & RFCFDG) {
 			struct filedesc *fdtmp;
 			fdtmp = fdinit(td->td_proc->p_fd);
 			fdfree(td);
 			p1->p_fd = fdtmp;
 		}
 
 		/*
 		 * Unshare file descriptors (from parent).
 		 */
 		if (flags & RFFDG) 
 			fdunshare(p1, td);
 
+norfproc_fail:
 		/*
 		 * Reached on success as well as on failure: end the
 		 * single-threading entered above, under the same condition.
 		 */
 		if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
 		    (flags & (RFCFDG | RFFDG))) {
 			PROC_LOCK(p1);
 			thread_single_end();
 			PROC_UNLOCK(p1);
 		}
 		*procp = NULL;
-		return (0);
+		return (error);
 	}
 
 	/*
 	 * XXX
 	 * We did have single-threading code here
 	 * however it proved un-needed and caused problems
 	 */
 
 	/* Allocate new proc. */
 	newproc = uma_zalloc(proc_zone, M_WAITOK);
 	/*
 	 * Use the thread already linked to newproc if there is one;
 	 * otherwise allocate and link a fresh thread.  Everything that can
 	 * fail with ENOMEM (thread, alternate kstack, vmspace) is done here,
 	 * before mac_proc_init() and before any lock is taken, so that
 	 * fail1 only has to release newproc.
 	 */
+	if (TAILQ_EMPTY(&newproc->p_threads)) {
+		td2 = thread_alloc();
+		if (td2 == NULL) {
+			error = ENOMEM;
+			goto fail1;
+		}
+		proc_linkup(newproc, td2);
+		sched_newproc(newproc, td2);
+	} else
+		td2 = FIRST_THREAD_IN_PROC(newproc);
+
+	/* Allocate and switch to an alternate kstack if specified. */
+	if (pages != 0) {
+		if (!vm_thread_new_altkstack(td2, pages)) {
+			error = ENOMEM;
+			goto fail1;
+		}
+	}
 	/*
 	 * Without RFMEM the child's vmspace is created now by
 	 * vmspace_fork(); with RFMEM vm2 stays NULL.  vm2 is handed to
 	 * vm_forkproc() near the end of this function.
 	 */
+	if ((flags & RFMEM) == 0) {
+		vm2 = vmspace_fork(p1->p_vmspace);
+		if (vm2 == NULL) {
+			error = ENOMEM;
+			goto fail1;
+		}
+	} else
+		vm2 = NULL;
 #ifdef MAC
 	mac_proc_init(newproc);
 #endif
 	knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL);
 	STAILQ_INIT(&newproc->p_ktr);
 
 	/* We have to lock the process tree while we look for a pid. */
 	sx_slock(&proctree_lock);
 
 	/*
 	 * Although process entries are dynamically created, we still keep
 	 * a global limit on the maximum number we will create.  Don't allow
 	 * a nonprivileged user to use the last ten processes; don't let root
 	 * exceed the limit. The variable nprocs is the current number of
 	 * processes, maxproc is the limit.
 	 */
 	sx_xlock(&allproc_lock);
 	if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
 	    PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the count of procs running with this uid. Don't allow
 	 * a nonprivileged user to exceed their current limit.
 	 *
 	 * XXXRW: Can we avoid privilege here if it's not needed?
 	 */
 	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
 	if (error == 0)
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
 	else {
 		PROC_LOCK(p1);
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
 		    lim_cur(p1, RLIMIT_NPROC));
 		PROC_UNLOCK(p1);
 	}
 	if (!ok) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the nprocs resource before blocking can occur.  There
 	 * are hard-limits as to the number of processes that can run.
 	 */
 	nprocs++;
 
 	/*
 	 * Find an unused process ID.  We remember a range of unused IDs
 	 * ready to use (from lastpid+1 through pidchecked-1).
 	 *
 	 * If RFHIGHPID is set (used during system boot), do not allocate
 	 * low-numbered pids.
 	 */
 	trypid = lastpid + 1;
 	if (flags & RFHIGHPID) {
 		if (trypid < 10)
 			trypid = 10;
 	} else {
 		if (randompid)
 			trypid += arc4random() % randompid;
 	}
 retry:
 	/*
 	 * If the process ID prototype has wrapped around,
 	 * restart somewhat above 0, as the low-numbered procs
 	 * tend to include daemons that don't exit.
 	 */
 	if (trypid >= PID_MAX) {
 		trypid = trypid % PID_MAX;
 		if (trypid < 100)
 			trypid += 100;
 		pidchecked = 0;
 	}
 	if (trypid >= pidchecked) {
 		int doingzomb = 0;
 
 		pidchecked = PID_MAX;
 		/*
 		 * Scan the active and zombie procs to check whether this pid
 		 * is in use.  Remember the lowest pid that's greater
 		 * than trypid, so we can avoid checking for a while.
 		 */
 		p2 = LIST_FIRST(&allproc);
 again:
 		for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
 			while (p2->p_pid == trypid ||
 			    (p2->p_pgrp != NULL &&
 			    (p2->p_pgrp->pg_id == trypid ||
 			    (p2->p_session != NULL &&
 			    p2->p_session->s_sid == trypid)))) {
 				trypid++;
 				if (trypid >= pidchecked)
 					goto retry;
 			}
 			if (p2->p_pid > trypid && pidchecked > p2->p_pid)
 				pidchecked = p2->p_pid;
 			if (p2->p_pgrp != NULL) {
 				if (p2->p_pgrp->pg_id > trypid &&
 				    pidchecked > p2->p_pgrp->pg_id)
 					pidchecked = p2->p_pgrp->pg_id;
 				if (p2->p_session != NULL &&
 				    p2->p_session->s_sid > trypid &&
 				    pidchecked > p2->p_session->s_sid)
 					pidchecked = p2->p_session->s_sid;
 			}
 		}
 		if (!doingzomb) {
 			doingzomb = 1;
 			p2 = LIST_FIRST(&zombproc);
 			goto again;
 		}
 	}
 	sx_sunlock(&proctree_lock);
 
 	/*
 	 * RFHIGHPID does not mess with the lastpid counter during boot.
 	 */
 	if (flags & RFHIGHPID)
 		pidchecked = 0;
 	else
 		lastpid = trypid;
 
 	p2 = newproc;
-	td2 = FIRST_THREAD_IN_PROC(newproc);
 	p2->p_state = PRS_NEW;		/* protect against others */
 	p2->p_pid = trypid;
 	/*
 	 * Allow the scheduler to initialize the child.
 	 */
 	thread_lock(td);
 	sched_fork(td, td2);
 	thread_unlock(td);
 	AUDIT_ARG(pid, p2->p_pid);
 	LIST_INSERT_HEAD(&allproc, p2, p_list);
 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	sx_xunlock(&allproc_lock);
 
 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
 	    __rangeof(struct proc, p_startcopy, p_endcopy));
 	PROC_UNLOCK(p1);
 
 	bzero(&p2->p_startzero,
 	    __rangeof(struct proc, p_startzero, p_endzero));
 
 	p2->p_ucred = crhold(td->td_ucred);
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Malloc things while we don't hold any locks.
 	 */
 	if (flags & RFSIGSHARE)
 		newsigacts = NULL;
 	else
 		newsigacts = sigacts_alloc();
 
 	/*
 	 * Copy filedesc.
 	 */
 	if (flags & RFCFDG) {
 		fd = fdinit(p1->p_fd);
 		fdtol = NULL;
 	} else if (flags & RFFDG) {
 		fd = fdcopy(p1->p_fd);
 		fdtol = NULL;
 	} else {
 		fd = fdshare(p1->p_fd);
 		if (p1->p_fdtol == NULL)
 			p1->p_fdtol =
 				filedesc_to_leader_alloc(NULL,
 							 NULL,
 							 p1->p_leader);
 		if ((flags & RFTHREAD) != 0) {
 			/*
 			 * Shared file descriptor table and
 			 * shared process leaders.
 			 */
 			fdtol = p1->p_fdtol;
 			FILEDESC_XLOCK(p1->p_fd);
 			fdtol->fdl_refcount++;
 			FILEDESC_XUNLOCK(p1->p_fd);
 		} else {
 			/* 
 			 * Shared file descriptor table, and
 			 * different process leaders 
 			 */
 			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
 							 p1->p_fd,
 							 p2);
 		}
 	}
 	/*
 	 * Make a proc table entry for the new process.
 	 * Start by zeroing the section of proc that is zero-initialized,
 	 * then copy the section that is copied directly from the parent.
 	 */
-	/* Allocate and switch to an alternate kstack if specified. */
-	if (pages != 0)
-		vm_thread_new_altkstack(td2, pages);
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	bzero(&td2->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
 	td2->td_sigstk = td->td_sigstk;
 	td2->td_sigmask = td->td_sigmask;
 	td2->td_flags = TDF_INMEM;
 
 	/*
 	 * Duplicate sub-structures as needed.
 	 * Increase reference counts on shared objects.
 	 */
 	p2->p_flag = P_INMEM;
 	p2->p_swtick = ticks;
 	if (p1->p_flag & P_PROFIL)
 		startprofclock(p2);
 	td2->td_ucred = crhold(p2->p_ucred);
 	pargs_hold(p2->p_args);
 
 	if (flags & RFSIGSHARE) {
 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
 	} else {
 		sigacts_copy(newsigacts, p1->p_sigacts);
 		p2->p_sigacts = newsigacts;
 	}
 	if (flags & RFLINUXTHPN) 
 	        p2->p_sigparent = SIGUSR1;
 	else
 	        p2->p_sigparent = SIGCHLD;
 
 	p2->p_textvp = p1->p_textvp;
 	p2->p_fd = fd;
 	p2->p_fdtol = fdtol;
 
 	/*
 	 * p_limit is copy-on-write.  Bump its refcount.
 	 */
 	lim_fork(p1, p2);
 
 	pstats_fork(p1->p_stats, p2->p_stats);
 
 	PROC_UNLOCK(p1);
 	PROC_UNLOCK(p2);
 
 	/* Bump references to the text vnode (for procfs) */
 	if (p2->p_textvp)
 		vref(p2->p_textvp);
 
 	/*
 	 * Set up linkage for kernel based threading.
 	 */
 	if ((flags & RFTHREAD) != 0) {
 		mtx_lock(&ppeers_lock);
 		p2->p_peers = p1->p_peers;
 		p1->p_peers = p2;
 		p2->p_leader = p1->p_leader;
 		mtx_unlock(&ppeers_lock);
 		PROC_LOCK(p1->p_leader);
 		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(p1->p_leader);
 			/*
 			 * The task leader is exiting, so process p1 is
 			 * going to be killed shortly.  Since p1 obviously
 			 * isn't dead yet, we know that the leader is either
 			 * sending SIGKILL's to all the processes in this
 			 * task or is sleeping waiting for all the peers to
 			 * exit.  We let p1 complete the fork, but we need
 			 * to go ahead and kill the new process p2 since
 			 * the task leader may not get a chance to send
 			 * SIGKILL to it.  We leave it on the list so that
 			 * the task leader will wait for this new process
 			 * to commit suicide.
 			 */
 			PROC_LOCK(p2);
 			psignal(p2, SIGKILL);
 			PROC_UNLOCK(p2);
 		} else
 			PROC_UNLOCK(p1->p_leader);
 	} else {
 		p2->p_peers = NULL;
 		p2->p_leader = p2;
 	}
 
 	sx_xlock(&proctree_lock);
 	PGRP_LOCK(p1->p_pgrp);
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	/*
 	 * Preserve some more flags in subprocess.  P_PROFIL has already
 	 * been preserved.
 	 */
 	p2->p_flag |= p1->p_flag & P_SUGID;
 	td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
 	SESS_LOCK(p1->p_session);
 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
 		p2->p_flag |= P_CONTROLT;
 	SESS_UNLOCK(p1->p_session);
 	if (flags & RFPPWAIT)
 		p2->p_flag |= P_PPWAIT;
 
 	p2->p_pgrp = p1->p_pgrp;
 	LIST_INSERT_AFTER(p1, p2, p_pglist);
 	PGRP_UNLOCK(p1->p_pgrp);
 	LIST_INIT(&p2->p_children);
 
 	callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
 
 #ifdef KTRACE
 	/*
 	 * Copy traceflag and tracefile if enabled.
 	 */
 	mtx_lock(&ktrace_mtx);
 	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
 	if (p1->p_traceflag & KTRFAC_INHERIT) {
 		p2->p_traceflag = p1->p_traceflag;
 		if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
 			VREF(p2->p_tracevp);
 			KASSERT(p1->p_tracecred != NULL,
 			    ("ktrace vnode with no cred"));
 			p2->p_tracecred = crhold(p1->p_tracecred);
 		}
 	}
 	mtx_unlock(&ktrace_mtx);
 #endif
 
 	/*
 	 * If PF_FORK is set, the child process inherits the
 	 * procfs ioctl flags from its parent.
 	 */
 	if (p1->p_pfsflags & PF_FORK) {
 		p2->p_stops = p1->p_stops;
 		p2->p_pfsflags = p1->p_pfsflags;
 	}
 
 	/*
 	 * This begins the section where we must prevent the parent
 	 * from being swapped.
 	 */
 	_PHOLD(p1);
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Attach the new process to its parent.
 	 *
 	 * If RFNOWAIT is set, the newly created process becomes a child
 	 * of init.  This effectively disassociates the child from the
 	 * parent.
 	 */
 	if (flags & RFNOWAIT)
 		pptr = initproc;
 	else
 		pptr = p1;
 	p2->p_pptr = pptr;
 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
 	sx_xunlock(&proctree_lock);
 
 	/* Inform accounting that we have forked. */
 	p2->p_acflag = AFORK;
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Finish creating the child process.  It will return via a different
 	 * execution path later.  (ie: directly into user mode)
 	 */
-	vm_forkproc(td, p2, td2, flags);
+	vm_forkproc(td, p2, td2, vm2, flags);
 
 	if (flags == (RFFDG | RFPROC)) {
 		PCPU_INC(cnt.v_forks);
 		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
 		PCPU_INC(cnt.v_vforks);
 		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (p1 == &proc0) {
 		PCPU_INC(cnt.v_kthreads);
 		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else {
 		PCPU_INC(cnt.v_rforks);
 		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	}
 
 	/*
 	 * Both processes are set up, now check if any loadable modules want
 	 * to adjust anything.
 	 *   What if they have an error? XXX
 	 */
 	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
 
 	/*
 	 * Set the child start time and mark the process as being complete.
 	 */
 	microuptime(&p2->p_stats->p_start);
 	PROC_SLOCK(p2);
 	p2->p_state = PRS_NORMAL;
 	PROC_SUNLOCK(p2);
 
 	/*
 	 * If RFSTOPPED not requested, make child runnable and add to
 	 * run queue.
 	 */
 	if ((flags & RFSTOPPED) == 0) {
 		thread_lock(td2);
 		TD_SET_CAN_RUN(td2);
 		sched_add(td2, SRQ_BORING);
 		thread_unlock(td2);
 	}
 
 	/*
 	 * Now can be swapped.
 	 */
 	PROC_LOCK(p1);
 	_PRELE(p1);
 
 	/*
 	 * Tell any interested parties about the new process.
 	 */
 	KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid);
 
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Preserve synchronization semantics of vfork.  If waiting for
 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
 	 * proc (in case of exit).
 	 */
 	PROC_LOCK(p2);
 	while (p2->p_flag & P_PPWAIT)
 		msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0);
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Return child proc pointer to parent.
 	 */
 	*procp = p2;
 	return (0);
 	/*
 	 * fail: a process limit was hit while proctree_lock (shared) and
 	 * allproc_lock (exclusive) are held and after mac_proc_init().
 	 *
 	 * NOTE(review): when RFMEM is clear, vm2 was obtained from
 	 * vmspace_fork() above and nothing on this path releases it --
 	 * confirm whether a vmspace_free(vm2) is needed here.
 	 */
 fail:
 	sx_sunlock(&proctree_lock);
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
 		    td->td_ucred->cr_ruid);
 	sx_xunlock(&allproc_lock);
 #ifdef MAC
 	mac_proc_destroy(newproc);
 #endif
 	/*
 	 * fail1: entered directly by the ENOMEM paths, which run before
 	 * mac_proc_init() and before any lock is taken; only newproc is
 	 * returned to its zone before sleeping briefly and reporting error.
 	 */
+fail1:
 	uma_zfree(proc_zone, newproc);
 	pause("fork", hz / 2);
 	return (error);
 }
 
 /*
  * Handle the return of a child process from fork1().  This function
  * is called from the MD fork_trampoline() entry point.
  *
  * callout - function invoked as callout(arg, frame) once the scheduler
  *           hand-off below is complete; asserted to be non-NULL.
  * arg     - first argument passed through to callout.
  * frame   - trap frame passed through to callout.
  *
  * If callout returns and the process has P_KTHREAD set, a diagnostic is
  * printed and kproc_exit(0) is called.  Otherwise the schedtail event
  * handlers are invoked before returning.
  */
 void
 fork_exit(callout, arg, frame)
 	void (*callout)(void *, struct trapframe *);
 	void *arg;
 	struct trapframe *frame;
 {
 	struct proc *p;
 	struct thread *td;
 	struct thread *dtd;
 
 	td = curthread;
 	p = td->td_proc;
 	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
 
 	CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)",
 		td, td->td_sched, p->p_pid, p->p_comm);
 
 	sched_fork_exit(td);
 	/*
 	* Processes normally resume in mi_switch() after being
 	* cpu_switch()'ed to, but when children start up they arrive here
 	* instead, so we must do much the same things as mi_switch() would.
 	*/
 	/* Hand any per-CPU dead thread to thread_stash() and clear the slot. */
 	if ((dtd = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
 		thread_stash(dtd);
 	}
 	thread_unlock(td);
 
 	/*
 	 * cpu_set_fork_handler intercepts this function call to
 	 * have this call a non-return function to stay in kernel mode.
 	 * initproc has its own fork handler, but it does return.
 	 */
 	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
 	callout(arg, frame);
 
 	/*
 	 * Check if a kernel thread misbehaved and returned from its main
 	 * function.
 	 */
 	if (p->p_flag & P_KTHREAD) {
 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
 		    p->p_comm, p->p_pid);
 		kproc_exit(0);
 	}
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	EVENTHANDLER_INVOKE(schedtail, p);
 }
 
 /*
  * Simplified back end of syscall(), used when returning from fork()
  * directly into user mode.  Giant is not held on entry, and must not
  * be held on return.  This function is passed in to fork_exit() as the
  * first parameter and is called when returning to a new userland process.
  *
  * td    - the new thread; handed to userret().
  * frame - its trap frame; handed to userret().
  *
  * With KTRACE, a SYS_fork return record (error 0, value 0) is emitted
  * when KTR_SYSRET tracing is enabled for td.
  */
 void
 fork_return(td, frame)
 	struct thread *td;
 	struct trapframe *frame;
 {
 
 	userret(td, frame);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(SYS_fork, 0, 0);
 #endif
 	mtx_assert(&Giant, MA_NOTOWNED);
 }
Index: head/sys/kern/kern_kse.c
===================================================================
--- head/sys/kern/kern_kse.c	(revision 173360)
+++ head/sys/kern/kern_kse.c	(revision 173361)
@@ -1,1427 +1,1444 @@
 /*-
  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
  *  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/imgact.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/ptrace.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
+#include <sys/syslog.h>
 #include <sys/kse.h>
 #include <sys/ktr.h>
 #include <vm/uma.h>
 
 #ifdef KSE
 static uma_zone_t upcall_zone;
 
 /* DEBUG ONLY */
 extern int virtual_cpu;
 extern int thread_debug;
 
 extern int max_threads_per_proc;
 extern int max_groups_per_proc;
 extern int max_threads_hits;
 extern struct mtx kse_lock;
 
 
 TAILQ_HEAD(, kse_upcall) zombie_upcalls =
 	TAILQ_HEAD_INITIALIZER(zombie_upcalls);
 
 static int thread_update_usr_ticks(struct thread *td);
-static void thread_alloc_spare(struct thread *td);
+static int thread_alloc_spare(struct thread *td);
 static struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku);
 static struct kse_upcall *upcall_alloc(void);
 
 
 struct mtx kse_lock;
 MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN);
 
 /*
  * Allocate a kse_upcall from upcall_zone.  The allocation is made with
  * M_WAITOK | M_ZERO, so the returned structure is zero-filled.
  */
 struct kse_upcall *
 upcall_alloc(void)
 {
 	struct kse_upcall *ku;
 
 	ku = uma_zalloc(upcall_zone, M_WAITOK | M_ZERO);
 	return (ku);
 }
 
 /*
  * Free every upcall currently queued on zombie_upcalls.  The entries are
  * moved onto a local list while kse_lock is held and are returned to
  * upcall_zone only after the spin lock has been dropped.
  */
 void
 upcall_reap(void)
 {
 	TAILQ_HEAD(, kse_upcall) zupcalls;
 	struct kse_upcall *ku_item, *ku_tmp;
 
 	TAILQ_INIT(&zupcalls);
 	mtx_lock_spin(&kse_lock);
 	if (!TAILQ_EMPTY(&zombie_upcalls)) {
 		/* Steal the whole zombie list and leave it empty. */
 		TAILQ_CONCAT(&zupcalls, &zombie_upcalls, ku_link);
 		TAILQ_INIT(&zombie_upcalls);
 	}
 	mtx_unlock_spin(&kse_lock);
 	TAILQ_FOREACH_SAFE(ku_item, &zupcalls, ku_link, ku_tmp)
 		uma_zfree(upcall_zone, ku_item);
 }
 
 /*
  * Detach td's upcall, if it has one: clear the upcall's owner, move it
  * from its process's p_upcalls list to zombie_upcalls (which
  * upcall_reap() later frees), and clear td->td_upcall.
  *
  * The caller must hold the process spin lock and td's thread lock; both
  * are asserted.
  */
 void
 upcall_remove(struct thread *td)
 {
 
 	PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_upcall != NULL) {
 		/*
 	 	* If we are not a bound thread then decrement the count of
 	 	* possible upcall sources
 	 	*/
 		if (td->td_pflags & TDP_SA) 
 			td->td_proc->p_numupcalls--;
 		mtx_lock_spin(&kse_lock);
 		td->td_upcall->ku_owner = NULL;
 		TAILQ_REMOVE(&td->td_upcall->ku_proc->p_upcalls, td->td_upcall,
 		    ku_link);
 		TAILQ_INSERT_HEAD(&zombie_upcalls, td->td_upcall, ku_link);
 		mtx_unlock_spin(&kse_lock);
 		td->td_upcall = NULL;
 	}
 }
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct kse_switchin_args {
 	struct kse_thr_mailbox *tmbx;
 	int flags;
 };
 #endif
 
 #ifdef KSE
 /*
  * Unlink td with thread_unlink() while holding kse_lock, then release
  * its upcall, if any, through upcall_remove().
  */
 void
 kse_unlink(struct thread *td)
 {
 	mtx_lock_spin(&kse_lock);
 	thread_unlink(td);
 	mtx_unlock_spin(&kse_lock);
 	upcall_remove(td);
 }
 #endif
 
 /*
  * kse_switchin system call: install the machine context stored in the
  * user thread mailbox uap->tmbx into the calling thread.
  *
  * uap->tmbx  - user address of the thread mailbox to switch to.
  * uap->flags - with KSE_SWITCHIN_SETTMBX the mailbox is also recorded as
  *              the upcall's current thread and as td->td_mailbox.
  *
  * Returns EJUSTRETURN on success.  Returns EINVAL when td has no upcall,
  * when TD_CAN_UNBIND(td) is true, when uap->tmbx is NULL, or when the
  * suword() store fails; otherwise the error from copyin() or
  * set_mcontext().  Without KSE compiled in, returns EOPNOTSUPP.
  */
 int
 kse_switchin(struct thread *td, struct kse_switchin_args *uap)
 {
 #ifdef KSE
 	struct kse_thr_mailbox tmbx;
 	struct kse_upcall *ku;
 	int error;
 
 	thread_lock(td);
 	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
 		thread_unlock(td);
 		return (EINVAL);
 	}
 	thread_unlock(td);
 	/* Each step below runs only if every earlier step succeeded. */
 	error = (uap->tmbx == NULL) ? EINVAL : 0;
 	if (!error)
 		error = copyin(uap->tmbx, &tmbx, sizeof(tmbx));
 	if (!error && (uap->flags & KSE_SWITCHIN_SETTMBX))
 		error = (suword(&ku->ku_mailbox->km_curthread,
 			 (long)uap->tmbx) != 0 ? EINVAL : 0);
 	if (!error)
 		error = set_mcontext(td, &tmbx.tm_context.uc_mcontext);
 	if (!error) {
 		/* Publish this thread's id in the mailbox's tm_lwp field. */
 		suword32(&uap->tmbx->tm_lwp, td->td_tid);
 		if (uap->flags & KSE_SWITCHIN_SETTMBX) {
 			td->td_mailbox = uap->tmbx;
 			td->td_pflags |= TDP_CAN_UNBIND;
 		}
 		PROC_LOCK(td->td_proc);
 		if (td->td_proc->p_flag & P_TRACED) {
 			/*
 			 * Traced process: apply the single-step and suspend
 			 * requests found in the mailbox's tm_dflags.
 			 */
 			_PHOLD(td->td_proc);
 			if (tmbx.tm_dflags & TMDF_SSTEP)
 				ptrace_single_step(td);
 			else
 				ptrace_clear_single_step(td);
 			if (tmbx.tm_dflags & TMDF_SUSPEND) {
 				thread_lock(td);
 				/* fuword can block, check again */
 				if (td->td_upcall)
 					ku->ku_flags |= KUF_DOUPCALL;
 				thread_unlock(td);
 			}
 			_PRELE(td->td_proc);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 	return ((error == 0) ? EJUSTRETURN : error);
 #else /* !KSE */
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
 struct kse_thr_interrupt_args {
 	struct kse_thr_mailbox * tmbx;
 	int cmd;
 	long data;
 };
 */
 /*
  * kse_thr_interrupt system call: perform the operation selected by
  * uap->cmd.  Only valid for processes with P_SA set (EINVAL otherwise).
  *
  * KSE_INTR_SENDSIG   - uap->data must be in [0, _SIG_MAXSIG]; find the
  *                      thread whose td_mailbox equals uap->tmbx and, if
  *                      data > 0, clear TDF_INTERRUPT and deliver the
  *                      signal with tdsignal().  ESRCH if no thread
  *                      matches.
  * KSE_INTR_INTERRUPT,
  * KSE_INTR_RESTART   - find the thread as above, mark it interrupted
  *                      with td_intrval EINTR or ERESTART respectively,
  *                      and abort its sleep if it is on a sleep queue
  *                      with TDF_SINTR set.
  * KSE_INTR_SIGEXIT   - uap->data must be in [1, _SIG_MAXSIG]; call
  *                      sigexit() with that signal.
  * KSE_INTR_DBSUSPEND - calling thread must not have TDP_SA; while the
  *                      process is traced and the current thread
  *                      mailbox has TMDF_SUSPEND set, suspend the
  *                      calling thread.
  * KSE_INTR_EXECVE    - copy in a kse_execve_args from uap->data and
  *                      exec; on success merge args.sigpend into the
  *                      thread's pending set and install args.sigmask.
  *
  * Any other command returns EINVAL.  Without KSE compiled in, returns
  * EOPNOTSUPP.
  */
 int
 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
 {
 #ifdef KSE
 	struct kse_execve_args args;
 	struct image_args iargs;
 	struct proc *p;
 	struct thread *td2;
 	struct kse_upcall *ku;
 	struct kse_thr_mailbox *tmbx;
 	uint32_t flags;
 	int error;
 
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	if (!(p->p_flag & P_SA)) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	PROC_UNLOCK(p);
 
 	switch (uap->cmd) {
 	case KSE_INTR_SENDSIG:
 		if (uap->data < 0 || uap->data > _SIG_MAXSIG)
 			return (EINVAL);
 		/* FALLTHROUGH: shares the thread lookup below. */
 	case KSE_INTR_INTERRUPT:
 	case KSE_INTR_RESTART:
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		/* Locate the thread that owns the given thread mailbox. */
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2->td_mailbox == uap->tmbx)
 				break;
 		}
 		if (td2 == NULL) {
 			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 			return (ESRCH);
 		}
 		thread_lock(td2);
 		PROC_SUNLOCK(p);
 		if (uap->cmd == KSE_INTR_SENDSIG) {
 			/* data == 0 only checks that the thread exists. */
 			if (uap->data > 0) {
 				td2->td_flags &= ~TDF_INTERRUPT;
 				thread_unlock(td2);
 				tdsignal(p, td2, (int)uap->data, NULL);
 			} else {
 				thread_unlock(td2);
 			}
 		} else {
 			td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING;
 			if (TD_CAN_UNBIND(td2))
 				td2->td_upcall->ku_flags |= KUF_DOUPCALL;
 			if (uap->cmd == KSE_INTR_INTERRUPT)
 				td2->td_intrval = EINTR;
 			else
 				td2->td_intrval = ERESTART;
 			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR))
 				sleepq_abort(td2, td2->td_intrval);
 			thread_unlock(td2);
 		}
 		PROC_UNLOCK(p);
 		break;
 	case KSE_INTR_SIGEXIT:
 		if (uap->data < 1 || uap->data > _SIG_MAXSIG)
 			return (EINVAL);
 		PROC_LOCK(p);
 		sigexit(td, (int)uap->data);
 		break;
 
 	case KSE_INTR_DBSUSPEND:
 		/* this sub-function is only for bound thread */
 		if (td->td_pflags & TDP_SA)
 			return (EINVAL);
 		thread_lock(td);
 		ku = td->td_upcall;
 		thread_unlock(td);
 		/* Fetch the current thread mailbox pointer from user space. */
 		tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
 		if (tmbx == NULL || tmbx == (void *)-1)
 			return (EINVAL);
 		flags = 0;
 		PROC_LOCK(p);
 		/*
 		 * Re-read tm_dflags on every pass and keep suspending this
 		 * thread for as long as TMDF_SUSPEND stays set, the process
 		 * is traced, and P_SINGLE_EXIT is not set.
 		 */
 		while ((p->p_flag & P_TRACED) && !(p->p_flag & P_SINGLE_EXIT)) {
 			flags = fuword32(&tmbx->tm_dflags);
 			if (!(flags & TMDF_SUSPEND))
 				break;
 			PROC_SLOCK(p);
 			thread_stopped(p);
 			PROC_UNLOCK(p);
 			thread_lock(td);
 			thread_suspend_one(td);
 			PROC_SUNLOCK(p);
 			mi_switch(SW_VOL, NULL);
 			thread_unlock(td);
 			PROC_LOCK(p);
 		}
 		PROC_UNLOCK(p);
 		return (0);
 
 	case KSE_INTR_EXECVE:
 		error = copyin((void *)uap->data, &args, sizeof(args));
 		if (error)
 			return (error);
 		error = exec_copyin_args(&iargs, args.path, UIO_USERSPACE,
 		    args.argv, args.envp);
 		if (error == 0)
 			error = kern_execve(td, &iargs, NULL);
 		if (error == 0) {
 			PROC_LOCK(p);
 			SIGSETOR(td->td_siglist, args.sigpend);
 			PROC_UNLOCK(p);
 			kern_sigprocmask(td, SIG_SETMASK, &args.sigmask, NULL,
 			    0);
 		}
 		return (error);
 
 	default:
 		return (EINVAL);
 	}
 	return (0);
 #else /* !KSE */
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
 struct kse_exit_args {
 	register_t dummy;
 };
 */
 /*
  * kse_exit system call: retire the calling thread's upcall and exit the
  * thread.
  *
  * Returns EINVAL when td has no upcall or TD_CAN_UNBIND(td) is true, and
  * EDEADLK when this is the only non-exiting upcall while other threads
  * still exist.  Otherwise the upcall is removed and the thread calls
  * thread_exit(), or exit1() when it is the last thread in the process;
  * presumably neither of those returns here (there is no return
  * statement after them) -- TODO confirm.  uap is not used.
  * Without KSE compiled in, returns EOPNOTSUPP.
  */
 int
 kse_exit(struct thread *td, struct kse_exit_args *uap)
 {
 #ifdef KSE
 	struct proc *p;
 	struct kse_upcall *ku, *ku2;
 	int    error, count;
 
 	p = td->td_proc;
 	/* 
 	 * Ensure that this is only called from the UTS
 	 */
 	thread_lock(td);
 	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
 		thread_unlock(td);
 		return (EINVAL);
 	}
 	thread_unlock(td);
 
 	/*
 	 * Calculate the existing non-exiting upcalls in this process.
 	 * If we are the last upcall but there are still other threads,
 	 * then do not exit. We need the other threads to be able to 
 	 * complete whatever they are doing.
 	 * XXX This relies on the userland knowing what to do if we return.
 	 * It may be a better choice to convert ourselves into a kse_release
 	 * ( or similar) and wait in the kernel to be needed.
 	 * XXX Where are those other threads? I suppose they are waiting in
 	 * the kernel. We should wait for them all at the user boundary after
 	 * turning into an exit.
 	 */
 	count = 0;
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 	FOREACH_UPCALL_IN_PROC(p, ku2) {
 		if ((ku2->ku_flags & KUF_EXITING) == 0)
 			count++;
 	}
 	if (count == 1 && (p->p_numthreads > 1)) {
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		return (EDEADLK);
 	}
 	ku->ku_flags |= KUF_EXITING;
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 
 	/* 
 	 * Mark the UTS mailbox as having been finished with.
 	 * If that fails then just go for a segfault.
 	 * XXX need to check if that can be delivered without a mailbox.
 	 */
 	error = suword32(&ku->ku_mailbox->km_flags, ku->ku_mflags|KMF_DONE);
 	/* For a non-SA (bound) thread also clear tm_lwp in its mailbox. */
 	if (!(td->td_pflags & TDP_SA))
 		if (suword32(&td->td_mailbox->tm_lwp, 0))
 			error = EFAULT;
 	PROC_LOCK(p);
 	if (error)
 		psignal(p, SIGSEGV);
 	sigqueue_flush(&td->td_sigqueue);
 	PROC_SLOCK(p);
 	thread_lock(td);
 	upcall_remove(td);
 	thread_unlock(td);
 	if (p->p_numthreads != 1) {
 		thread_stopped(p);
 		thread_exit();
 		/* NOTREACHED */
 	}
 	/*
 	 * This is the last thread. Just return to the user.
 	 * Effectively we have left threading mode..
 	 * The only real thing left to do is ensure that the
 	 * scheduler sets out concurrency back to 1 as that may be a
 	 * resource leak otherwise.
 	 * This is an A[PB]I issue.. what SHOULD we do?
 	 * One possibility is to return to the user. It may not cope well.
 	 * The other possibility would be to let the process exit.
 	 */
 	thread_unthread(td);
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 #if 0
 	return (0);
 #else
 	printf("kse_exit: called on last thread. Calling exit1()");
 	exit1(td, 0);
 #endif
 #else /* !KSE */
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Either becomes an upcall or waits for an awakening event and
  * then becomes an upcall. Only error cases return.
  *
  * uap->timeout - optional user pointer to a timespec bounding the sleep;
  *                NULL means no timeout (0 is passed to msleep()).
  *
  * If the thread has no upcall or TD_CAN_UNBIND(td) is true, a message is
  * printed and exit1() is called.  A copyin() failure on the timeout is
  * returned to the caller.  For a non-SA thread the mailbox flags are
  * re-read from user space; a fetch result of -1 leads to
  * sigexit(SIGSEGV).
  *
  * With KMF_WAITSIGEVENT set in the mailbox flags the thread sleeps on
  * &p->p_siglist until a signal event and then copies the pending signal
  * set out to km_sigscaught; otherwise it sleeps on &p->p_completed.
  *
  * NOTE(review): the values stored in 'error' by msleep() and copyout()
  * are not returned; the function ends with return (0).
  * Without KSE compiled in, returns EOPNOTSUPP.
  */
 /*
 struct kse_release_args {
 	struct timespec *timeout;
 };
 */
 int
 kse_release(struct thread *td, struct kse_release_args *uap)
 {
 #ifdef KSE
 	struct proc *p;
 	struct kse_upcall *ku;
 	struct timespec timeout;
 	struct timeval tv;
 	sigset_t sigset;
 	int error;
 
 	p = td->td_proc;
 	thread_lock(td);
 	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
 		thread_unlock(td);
 		printf("kse_release: called outside of threading. exiting");
 		exit1(td, 0);
 	}
 	thread_unlock(td);
 	/* tv is only initialised, and only read, when a timeout was given. */
 	if (uap->timeout != NULL) {
 		if ((error = copyin(uap->timeout, &timeout, sizeof(timeout))))
 			return (error);
 		TIMESPEC_TO_TIMEVAL(&tv, &timeout);
 	}
 	if (td->td_pflags & TDP_SA)
 		td->td_pflags |= TDP_UPCALLING;
 	else {
 		ku->ku_mflags = fuword32(&ku->ku_mailbox->km_flags);
 		if (ku->ku_mflags == -1) {
 			PROC_LOCK(p);
 			sigexit(td, SIGSEGV);
 		}
 	}
 	PROC_LOCK(p);
 	if (ku->ku_mflags & KMF_WAITSIGEVENT) {
 		/* UTS wants to wait for signal event */
 		if (!(p->p_flag & P_SIGEVENT) &&
 		    !(ku->ku_flags & KUF_DOUPCALL)) {
 			td->td_kflags |= TDK_KSERELSIG;
 			error = msleep(&p->p_siglist, &p->p_mtx, PPAUSE|PCATCH,
 			    "ksesigwait", (uap->timeout ? tvtohz(&tv) : 0));
 			td->td_kflags &= ~(TDK_KSERELSIG | TDK_WAKEUP);
 		}
 		p->p_flag &= ~P_SIGEVENT;
 		/* Snapshot under the proc lock; copy out after dropping it. */
 		sigset = p->p_siglist;
 		PROC_UNLOCK(p);
 		error = copyout(&sigset, &ku->ku_mailbox->km_sigscaught,
 		    sizeof(sigset));
 	} else {
 		if ((ku->ku_flags & KUF_DOUPCALL) == 0 &&
 		    ((ku->ku_mflags & KMF_NOCOMPLETED) ||
 		     (p->p_completed == NULL))) {
 			/* p_upsleeps counts threads sleeping here. */
 			p->p_upsleeps++;
 			td->td_kflags |= TDK_KSEREL;
 			error = msleep(&p->p_completed, &p->p_mtx,
 				PPAUSE|PCATCH, "kserel",
 				(uap->timeout ? tvtohz(&tv) : 0));
 			td->td_kflags &= ~(TDK_KSEREL | TDK_WAKEUP);
 			p->p_upsleeps--;
 		}
 		PROC_UNLOCK(p);
 	}
 	/* Consume a pending upcall request, if any. */
 	if (ku->ku_flags & KUF_DOUPCALL) {
 		PROC_SLOCK(p);
 		ku->ku_flags &= ~KUF_DOUPCALL;
 		PROC_SUNLOCK(p);
 	}
 	return (0);
 #else /* !KSE */
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * kse_wakeup system call: wake the thread that owns an upcall, or flag
  * the upcall so that an upcall is made.
  *
  * uap->mbx - if non-NULL, selects the upcall whose ku_mailbox matches.
  *            If NULL: when p_upsleeps is non-zero, wakeup() is issued on
  *            &p->p_completed and 0 is returned; otherwise the first
  *            upcall on p_upcalls is used.
  *
  * If the owner is sleeping in kse_release (TDK_KSEREL or TDK_KSERELSIG
  * set) and has not already been woken, it is marked TDK_WAKEUP and
  * removed from the matching sleep queue; otherwise KUF_DOUPCALL is set
  * on the upcall.  An upcall without an owner is a panic.
  *
  * Returns 0 on success, EINVAL if the process does not have P_SA set,
  * and ESRCH if no upcall was found.  Without KSE compiled in, returns
  * EOPNOTSUPP.
  */
 /* struct kse_wakeup_args {
 	struct kse_mailbox *mbx;
 }; */
 int
 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
 {
 #ifdef KSE
 	struct proc *p;
 	struct kse_upcall *ku;
 	struct thread *td2;
 
 	p = td->td_proc;
 	td2 = NULL;
 	ku = NULL;
 	/* KSE-enabled processes only, please. */
 	PROC_LOCK(p);
 	if (!(p->p_flag & P_SA)) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	PROC_SLOCK(p);
 	if (uap->mbx) {
 		/* ku is NULL after the loop when no mailbox matched. */
 		FOREACH_UPCALL_IN_PROC(p, ku) {
 			if (ku->ku_mailbox == uap->mbx)
 				break;
 		}
 	} else {
 		if (p->p_upsleeps) {
 			PROC_SUNLOCK(p);
 			wakeup(&p->p_completed);
 			PROC_UNLOCK(p);
 			return (0);
 		}
 		ku = TAILQ_FIRST(&p->p_upcalls);
 	}
 	if (ku == NULL) {
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		return (ESRCH);
 	}
 	mtx_lock_spin(&kse_lock);
 	if ((td2 = ku->ku_owner) == NULL) {
 		mtx_unlock_spin(&kse_lock);
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		panic("%s: no owner", __func__);
 	} else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) {
 		mtx_unlock_spin(&kse_lock);
 		if (!(td2->td_kflags & TDK_WAKEUP)) {
 			td2->td_kflags |= TDK_WAKEUP;
 			/* Pick the wait channel the owner went to sleep on. */
 			if (td2->td_kflags & TDK_KSEREL)
 				sleepq_remove(td2, &p->p_completed);
 			else
 				sleepq_remove(td2, &p->p_siglist);
 		}
 	} else {
 		ku->ku_flags |= KUF_DOUPCALL;
 		mtx_unlock_spin(&kse_lock);
 	}
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	return (0);
 #else /* !KSE */
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * newgroup == 0: first call: use current KSE, don't schedule an upcall
  * All other situations, do allocate max new KSEs and schedule an upcall.
  *
  * XXX should be changed so that 'first' behaviour lasts for as long
  * as you have not made a thread in this proc. i.e. as long as we do not have
  * a mailbox..
  */
 /* struct kse_create_args {
 	struct kse_mailbox *mbx;
 	int newgroup;
 }; */
 int
 kse_create(struct thread *td, struct kse_create_args *uap)
 {
 #ifdef KSE
 	struct proc *p;
 	struct kse_mailbox mbx;
 	struct kse_upcall *newku;
 	int err, ncpus, sa = 0, first = 0;
 	struct thread *newtd;
 
 	p = td->td_proc;
 
 	/*
 	 * Processes using the other threading model can't
 	 * suddenly start calling this one
 	 * XXX  maybe...
 	 */
 	PROC_LOCK(p);
 	if ((p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	if (!(p->p_flag & P_SA)) {
 		first = 1;
 		p->p_flag |= P_SA|P_HADTHREADS;
 	}
 	PROC_UNLOCK(p);
 
 	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
 		return (err);
 
 	ncpus = mp_ncpus;
 	if (virtual_cpu != 0)
 		ncpus = virtual_cpu;
 	/*
 	 * If the new UTS mailbox says that this
 	 * will be a BOUND lwp, then it had better
 	 * have its thread mailbox already there.
 	 */
 	if ((mbx.km_flags & KMF_BOUND) || uap->newgroup) {
 		/* It's a bound thread (1:1) */
 		if (mbx.km_curthread == NULL) 
 			return (EINVAL);
 		ncpus = 1;
 		if (!(uap->newgroup || first))
 			return (EINVAL);
 	} else {
 		/* It's an upcall capable thread */
 		sa = TDP_SA;
 		PROC_LOCK(p);
 		/*
 		 * Limit it to NCPU upcall contexts per proc in any case.
 		 * numupcalls will soon be numkse or something
 		 * as it will represent the number of 
 		 * non-bound upcalls available.  (i.e. ones that can 
 		 * actually call up).
 		 */
 		if (p->p_numupcalls >= ncpus) {
 			PROC_UNLOCK(p);
 			return (EPROCLIM);
 		}
 		p->p_numupcalls++;
 		PROC_UNLOCK(p);
 	}
 
+	/*
+	 * For the first call this may not have been set.
+	 * Of course nor may it actually be needed.
+	 * thread_schedule_upcall() will look for it.
+	 */
+	if (td->td_standin == NULL) {
+		if (!thread_alloc_spare(td))
+			return (ENOMEM);
+	}
+
 	/* 
 	 * Even bound LWPs get a mailbox and an upcall to hold it.
 	 * XXX This should change.
 	 */
 	newku = upcall_alloc();
 	newku->ku_mailbox = uap->mbx;
 	newku->ku_func = mbx.km_func;
 	bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t));
 
-	/*
-	 * For the first call this may not have been set.
-	 * Of course nor may it actually be needed.
-	 * thread_schedule_upcall() will look for it.
-	 */
-	if (td->td_standin == NULL)
-		thread_alloc_spare(td);
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 	/*
 	 * If we are the first time, and a normal thread,
 	 * then transfer all the signals back to the 'process'.
 	 * SA threading will make a special thread to handle them.
 	 */
 	if (first) {
 		sigqueue_move_set(&td->td_sigqueue, &p->p_sigqueue, 
 			&td->td_sigqueue.sq_signals);
 		SIGFILLSET(td->td_sigmask);
 		SIG_CANTMASK(td->td_sigmask);
 	}
 
 	/*
 	 * Make the new upcall available to the process.
 	 * It may or may not use it, but it's available.
 	 */
 	TAILQ_INSERT_TAIL(&p->p_upcalls, newku, ku_link);
 	newku->ku_proc = p;
 	PROC_UNLOCK(p);
 	if (mbx.km_quantum)
 /* XXX should this be in the thread? */
 		p->p_upquantum = max(1, mbx.km_quantum / tick);
 
 	/*
 	 * Each upcall structure has an owner thread, find which
 	 * one owns it.
 	 */
 	thread_lock(td);
 	mtx_lock_spin(&kse_lock);
 	if (uap->newgroup) {
 		/*
 		 * The newgroup parameter now means
 		 * "bound, non SA, system scope"
 		 * It is only used for the interrupt thread at the
 		 * moment I think.. (or system scope threads dopey).
 		 * We'll rename it later.
 		 */
 		newtd = thread_schedule_upcall(td, newku);
 	} else {
 		/*
 		 * If the current thread hasn't an upcall structure,
 		 * just assign the upcall to it.
 		 * It'll just return.
 		 */
 		if (td->td_upcall == NULL) {
 			newku->ku_owner = td;
 			td->td_upcall = newku;
 			newtd = td;
 		} else {
 			/*
 			 * Create a new upcall thread to own it.
 			 */
 			newtd = thread_schedule_upcall(td, newku);
 		}
 	}
 	mtx_unlock_spin(&kse_lock);
 	thread_unlock(td);
 	PROC_SUNLOCK(p);
 
 	/*
 	 * Let the UTS instance know its LWPID.
 	 * It doesn't really care. But the debugger will.
 	 * XXX warning.. remember that this moves.
 	 */
 	suword32(&newku->ku_mailbox->km_lwp, newtd->td_tid);
 
 	/*
 	 * In the same manner, if the UTS has a current user thread, 
 	 * then it is also running on this LWP so set it as well.
 	 * The library could do that of course.. but why not..
 	 * XXX I'm not sure this can ever happen but ...
 	 * XXX does the UTS ever set this in the mailbox before calling this?
 	 */
 	if (mbx.km_curthread)
 		suword32(&mbx.km_curthread->tm_lwp, newtd->td_tid);
 	
 	if (sa) {
 		newtd->td_pflags |= TDP_SA;
 		/* 
 		 * If we are starting a new thread, kick it off.
 		 */
 		if (newtd != td) {
 			thread_lock(newtd);
 			sched_add(newtd, SRQ_BORING);
 			thread_unlock(newtd);
 		}
 	} else {
 		newtd->td_pflags &= ~TDP_SA;
 
 		/*
 		 * Since a library will use the mailbox pointer to 
 		 * identify even a bound thread, and the mailbox pointer
 		 * will never be allowed to change after this syscall
 		 * for a bound thread, set it here so the library can
 		 * find the thread after the syscall returns.
 		 */
 		newtd->td_mailbox = mbx.km_curthread;
 
 		if (newtd != td) {
 			/*
 			 * If we did create a new thread then
 			 * make sure it goes to the right place
 			 * when it starts up, and make sure that it runs 
 			 * at full speed when it gets there. 
 			 * thread_schedule_upcall() copies all cpu state
 			 * to the new thread, so we should clear single step
 			 * flag here.
 			 */
 			cpu_set_upcall_kse(newtd, newku->ku_func,
 				newku->ku_mailbox, &newku->ku_stack);
 			PROC_LOCK(p);
 			if (p->p_flag & P_TRACED) {
 				_PHOLD(p);
 				ptrace_clear_single_step(newtd);
 				_PRELE(p);
 			}
 			PROC_UNLOCK(p);
 			thread_lock(newtd);
 			sched_add(newtd, SRQ_BORING);
 			thread_unlock(newtd);
 		}
 	}
 	return (0);
 #else /* !KSE */
 	return (EOPNOTSUPP);
 #endif
 }
 
 #ifdef KSE
 /*
  * Initialize global thread allocation resources.
  */
 void
 kseinit(void)
 {
 
 	upcall_zone = uma_zcreate("UPCALL", sizeof(struct kse_upcall),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 }
 
 /*
  * Store the thread context in the UTS's mailbox.
  * then add the mailbox at the head of a list we are building in user space.
  * The list is anchored in the proc structure.
  */
 int
 thread_export_context(struct thread *td, int willexit)
 {
 	struct proc *p;
 	uintptr_t mbx;
 	void *addr;
 	int error = 0, sig;
 	mcontext_t mc;
 
 	p = td->td_proc;
 
 	/*
 	 * Post sync signal, or process SIGKILL and SIGSTOP.
 	 * For sync signal, it is only possible when the signal is not
 	 * caught by userland or process is being debugged.
 	 */
 	PROC_LOCK(p);
 	if (td->td_flags & TDF_NEEDSIGCHK) {
 		thread_lock(td);
 		td->td_flags &= ~TDF_NEEDSIGCHK;
 		thread_unlock(td);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		while ((sig = cursig(td)) != 0)
 			postsig(sig);
 		mtx_unlock(&p->p_sigacts->ps_mtx);
 	}
 	if (willexit)
 		SIGFILLSET(td->td_sigmask);
 	PROC_UNLOCK(p);
 
 	/* Export the user/machine context. */
 	get_mcontext(td, &mc, 0);
 	addr = (void *)(&td->td_mailbox->tm_context.uc_mcontext);
 	error = copyout(&mc, addr, sizeof(mcontext_t));
 	if (error)
 		goto bad;
 
 	addr = (caddr_t)(&td->td_mailbox->tm_lwp);
 	if (suword32(addr, 0)) {
 		error = EFAULT;
 		goto bad;
 	}
 
 	/* Get address in latest mbox of list pointer */
 	addr = (void *)(&td->td_mailbox->tm_next);
 	/*
 	 * Put the saved address of the previous first
 	 * entry into this one
 	 */
 	for (;;) {
 		mbx = (uintptr_t)p->p_completed;
 		if (suword(addr, mbx)) {
 			error = EFAULT;
 			goto bad;
 		}
 		PROC_LOCK(p);
 		if (mbx == (uintptr_t)p->p_completed) {
 			thread_lock(td);
 			p->p_completed = td->td_mailbox;
 			/*
 			 * The thread context may be taken away by
 			 * other upcall threads when we unlock
 			 * process lock. it's no longer valid to
 			 * use it again in any other places.
 			 */
 			td->td_mailbox = NULL;
 			thread_unlock(td);
 			PROC_UNLOCK(p);
 			break;
 		}
 		PROC_UNLOCK(p);
 	}
 	td->td_usticks = 0;
 	return (0);
 
 bad:
 	PROC_LOCK(p);
 	sigexit(td, SIGILL);
 	return (error);
 }
 
 /*
  * Take the list of completed mailboxes for this Process and put them on this
  * upcall's mailbox as it's the next one going up.
  */
 static int
 thread_link_mboxes(struct proc *p, struct kse_upcall *ku)
 {
 	void *addr;
 	uintptr_t mbx;
 
 	addr = (void *)(&ku->ku_mailbox->km_completed);
 	for (;;) {
 		mbx = (uintptr_t)p->p_completed;
 		if (suword(addr, mbx)) {
 			PROC_LOCK(p);
 			psignal(p, SIGSEGV);
 			PROC_UNLOCK(p);
 			return (EFAULT);
 		}
 		PROC_LOCK(p);
 		if (mbx == (uintptr_t)p->p_completed) {
 			p->p_completed = NULL;
 			PROC_UNLOCK(p);
 			break;
 		}
 		PROC_UNLOCK(p);
 	}
 	return (0);
 }
 
 /*
  * This function should be called at statclock interrupt time
  */
 int
 thread_statclock(int user)
 {
 	struct thread *td = curthread;
 
 	if (!(td->td_pflags & TDP_SA))
 		return (0);
 	if (user) {
 		/* Current always do via ast() */
 		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
 		thread_unlock(td);
 		td->td_uuticks++;
 	} else if (td->td_mailbox != NULL)
 		td->td_usticks++;
 	return (0);
 }
 
 /*
  * Export state clock ticks for userland
  */
 static int
 thread_update_usr_ticks(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	caddr_t addr;
 	u_int uticks;
 
 	thread_lock(td);
 	if (td->td_mailbox == NULL) {
 		thread_unlock(td);
 		return (-1);
 	}
 	thread_unlock(td);
 
 	if ((uticks = td->td_uuticks) != 0) {
 		td->td_uuticks = 0;
 		addr = (caddr_t)&td->td_mailbox->tm_uticks;
 		if (suword32(addr, uticks+fuword32(addr)))
 			goto error;
 	}
 	if ((uticks = td->td_usticks) != 0) {
 		td->td_usticks = 0;
 		addr = (caddr_t)&td->td_mailbox->tm_sticks;
 		if (suword32(addr, uticks+fuword32(addr)))
 			goto error;
 	}
 	return (0);
 
 error:
 	PROC_LOCK(p);
 	psignal(p, SIGSEGV);
 	PROC_UNLOCK(p);
 	return (-2);
 }
 
 /*
  * This function is intended to be used to initialize a spare thread
  * for upcall. Initialize thread's large data area outside the thread lock
  * for thread_schedule_upcall(). The crhold is also here to get it out
  * from the schedlock as it has a mutex op itself.
  * XXX BUG.. we need to get the cr ref after the thread has 
  * checked and chenged its own, not 6 months before...  
  */
-void
+int
 thread_alloc_spare(struct thread *td)
 {
 	struct thread *spare;
 
 	if (td->td_standin)
-		return;
+		return (1);
 	spare = thread_alloc();
+	if (spare == NULL)
+		return (0);
 	td->td_standin = spare;
 	bzero(&spare->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	spare->td_proc = td->td_proc;
 	spare->td_ucred = crhold(td->td_ucred);
 	spare->td_flags = TDF_INMEM;
+	return (1);
 }
 
 /*
  * Create a thread and schedule it for upcall on the KSE given.
  * Use our thread's standin so that we don't have to allocate one.
  */
 struct thread *
 thread_schedule_upcall(struct thread *td, struct kse_upcall *ku)
 {
 	struct thread *td2;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	mtx_assert(&kse_lock, MA_OWNED);
 	/*
 	 * Schedule an upcall thread on specified kse_upcall,
 	 * the kse_upcall must be free.
 	 * td must have a spare thread.
 	 */
 	KASSERT(ku->ku_owner == NULL, ("%s: upcall has owner", __func__));
 	if ((td2 = td->td_standin) != NULL) {
 		td->td_standin = NULL;
 	} else {
 		panic("no reserve thread when scheduling an upcall");
 		return (NULL);
 	}
 	CTR3(KTR_PROC, "thread_schedule_upcall: thread %p (pid %d, %s)",
 	     td2, td->td_proc->p_pid, td->td_proc->p_comm);
 	/*
 	 * Bzero already done in thread_alloc_spare() because we can't
 	 * do the crhold here because we are in schedlock already.
 	 */
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 	sched_fork_thread(td, td2);
 	thread_link(td2, ku->ku_proc);
 	/* inherit parts of blocked thread's context as a good template */
 	cpu_set_upcall(td2, td);
 	/* Let the new thread become owner of the upcall */
 	ku->ku_owner   = td2;
 	td2->td_upcall = ku;
 	td2->td_pflags = TDP_SA|TDP_UPCALLING;
 	td2->td_state  = TDS_CAN_RUN;
 	td2->td_inhibitors = 0;
 	SIGFILLSET(td2->td_sigmask);
 	SIG_CANTMASK(td2->td_sigmask);
 	return (td2);	/* bogus.. should be a void function */
 }
 
 /*
  * It is only used when thread generated a trap and process is being
  * debugged.
  */
 void
 thread_signal_add(struct thread *td, ksiginfo_t *ksi)
 {
 	struct proc *p;
 	struct sigacts *ps;
 	int error;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 
 	mtx_unlock(&ps->ps_mtx);
 	SIGADDSET(td->td_sigmask, ksi->ksi_signo);
 	PROC_UNLOCK(p);
 	error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
 			sizeof(siginfo_t));
 	if (error) {
 		PROC_LOCK(p);
 		sigexit(td, SIGSEGV);
 	}
 	PROC_LOCK(p);
 	mtx_lock(&ps->ps_mtx);
 }
 #include "opt_sched.h"
 struct thread *
 thread_switchout(struct thread *td, int flags, struct thread *nextthread)
 {
 	struct kse_upcall *ku;
 	struct thread *td2;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If the outgoing thread is in threaded group and has never
 	 * scheduled an upcall, decide whether this is a short
 	 * or long term event and thus whether or not to schedule
 	 * an upcall.
 	 * If it is a short term event, just suspend it in
 	 * a way that takes its KSE with it.
 	 * Select the events for which we want to schedule upcalls.
 	 * For now it's just sleep or if thread is suspended but
 	 * process wide suspending flag is not set (debugger
 	 * suspends thread).
 	 * XXXKSE eventually almost any inhibition could do.
 	 */
 	if (TD_CAN_UNBIND(td) && (td->td_standin) &&
 	    (TD_ON_SLEEPQ(td) || (TD_IS_SUSPENDED(td) &&
 	     !P_SHOULDSTOP(td->td_proc)))) {
 		/*
 		 * Release ownership of upcall, and schedule an upcall
 		 * thread, this new upcall thread becomes the owner of
 		 * the upcall structure. It will be ahead of us in the
 		 * run queue, so as we are stopping, it should either
 		 * start up immediatly, or at least before us if
 		 * we release our slot.
 		 */
 		mtx_lock_spin(&kse_lock);
 		ku = td->td_upcall;
 		ku->ku_owner = NULL;
 		td->td_upcall = NULL;
 		td->td_pflags &= ~TDP_CAN_UNBIND;
 		td2 = thread_schedule_upcall(td, ku);
 		mtx_unlock_spin(&kse_lock);
 		if (flags & SW_INVOL || nextthread) {
 			thread_lock(td2);
 			sched_add(td2, SRQ_YIELDING);
 			thread_unlock(td2);
 		} else {
 			/* Keep up with reality.. we have one extra thread 
 			 * in the picture.. and it's 'running'.
 			 */
 			return td2;
 		}
 	}
 	return (nextthread);
 }
 
 /*
  * Setup done on the thread when it enters the kernel.
  */
 void
 thread_user_enter(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	struct kse_upcall *ku;
 	struct kse_thr_mailbox *tmbx;
 	uint32_t flags;
 
 	/*
 	 * First check that we shouldn't just abort. we
 	 * can suspend it here or just exit.
 	 */
 	if (__predict_false(P_SHOULDSTOP(p))) {
 		PROC_LOCK(p);
 		thread_suspend_check(0);
 		PROC_UNLOCK(p);
 	}
 
 	if (!(td->td_pflags & TDP_SA))
 		return;
 
 	/*
 	 * If we are doing a syscall in a KSE environment,
 	 * note where our mailbox is.
 	 */
 
 	thread_lock(td);
 	ku = td->td_upcall;
 	thread_unlock(td);
 
 	KASSERT(ku != NULL, ("no upcall owned"));
 	KASSERT(ku->ku_owner == td, ("wrong owner"));
 	KASSERT(!TD_CAN_UNBIND(td), ("can unbind"));
 
-	if (td->td_standin == NULL)
-		thread_alloc_spare(td);
+	if (td->td_standin == NULL) {
+		if (!thread_alloc_spare(td)) {
+			PROC_LOCK(p);
+			if (kern_logsigexit)
+				log(LOG_INFO,
+				    "pid %d (%s), uid %d: thread_alloc_spare failed\n",
+				    p->p_pid, p->p_comm,
+				    td->td_ucred ? td->td_ucred->cr_uid : -1);
+			sigexit(td, SIGSEGV);	/* XXX ? */
+			/* panic("thread_user_enter: thread_alloc_spare failed"); */
+		}
+	}
 	ku->ku_mflags = fuword32((void *)&ku->ku_mailbox->km_flags);
 	tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
 	if ((tmbx == NULL) || (tmbx == (void *)-1L) ||
 	    (ku->ku_mflags & KMF_NOUPCALL)) {
 		td->td_mailbox = NULL;
 	} else {
 		flags = fuword32(&tmbx->tm_flags);
 		/*
 		 * On some architectures, TP register points to thread
 		 * mailbox but not points to kse mailbox, and userland
 		 * can not atomically clear km_curthread, but can
 		 * use TP register, and set TMF_NOUPCALL in thread
 		 * flag	to indicate a critical region.
 		 */
 		if (flags & TMF_NOUPCALL) {
 			td->td_mailbox = NULL;
 		} else {
 			td->td_mailbox = tmbx;
 			td->td_pflags |= TDP_CAN_UNBIND;
 			PROC_LOCK(p);
 			if (__predict_false(p->p_flag & P_TRACED)) {
 				flags = fuword32(&tmbx->tm_dflags);
 				if (flags & TMDF_SUSPEND) {
 					thread_lock(td);
 					/* fuword can block, check again */
 					if (td->td_upcall)
 						ku->ku_flags |= KUF_DOUPCALL;
 					thread_unlock(td);
 				}
 			}
 			PROC_UNLOCK(p);
 		}
 	}
 }
 
 /*
  * The extra work we go through if we are a threaded process when we
  * return to userland.
  *
  * If we are a KSE process and returning to user mode, check for
  * extra work to do before we return (e.g. for more syscalls
  * to complete first).  If we were in a critical section, we should
  * just return to let it finish. Same if we were in the UTS (in
  * which case the mailbox's context's busy indicator will be set).
  * The only traps we suport will have set the mailbox.
  * We will clear it here.
  */
 int
 thread_userret(struct thread *td, struct trapframe *frame)
 {
 	struct kse_upcall *ku;
 	struct proc *p;
 	struct timespec ts;
 	int error = 0, uts_crit;
 
 	/* Nothing to do with bound thread */
 	if (!(td->td_pflags & TDP_SA))
 		return (0);
 
 	/*
 	 * Update stat clock count for userland
 	 */
 	if (td->td_mailbox != NULL) {
 		thread_update_usr_ticks(td);
 		uts_crit = 0;
 	} else {
 		uts_crit = 1;
 	}
 
 	p = td->td_proc;
 	thread_lock(td);
 	ku = td->td_upcall;
 
 	/*
 	 * Optimisation:
 	 * This thread has not started any upcall.
 	 * If there is no work to report other than ourself,
 	 * then it can return direct to userland.
 	 */
 	if (TD_CAN_UNBIND(td)) {
 		thread_unlock(td);
 		td->td_pflags &= ~TDP_CAN_UNBIND;
 		if ((td->td_flags & TDF_NEEDSIGCHK) == 0 &&
 		    (p->p_completed == NULL) &&
 		    (ku->ku_flags & KUF_DOUPCALL) == 0 &&
 		    (p->p_upquantum && ticks < p->p_nextupcall)) {
 			nanotime(&ts);
 			error = copyout(&ts,
 				(caddr_t)&ku->ku_mailbox->km_timeofday,
 				sizeof(ts));
 			td->td_mailbox = 0;
 			ku->ku_mflags = 0;
 			if (error)
 				goto out;
 			return (0);
 		}
 		thread_export_context(td, 0);
 		/*
 		 * There is something to report, and we own an upcall
 		 * structure, we can go to userland.
 		 * Turn ourself into an upcall thread.
 		 */
 		td->td_pflags |= TDP_UPCALLING;
 	} else if (td->td_mailbox && (ku == NULL)) {
 		thread_unlock(td);
 		thread_export_context(td, 1);
 		PROC_LOCK(p);
 		if (p->p_upsleeps)
 			wakeup(&p->p_completed);
 		WITNESS_WARN(WARN_PANIC, &p->p_mtx.lock_object,
 		    "thread exiting in userret");
 		sigqueue_flush(&td->td_sigqueue);
 		PROC_SLOCK(p);
 		thread_stopped(p);
 		thread_exit();
 		/* NOTREACHED */
 	} else
 		thread_unlock(td);
 
 	KASSERT(ku != NULL, ("upcall is NULL"));
 	KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind"));
 
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 	if (p->p_numthreads > max_threads_per_proc) {
 		max_threads_hits++;
 		while (p->p_numthreads > max_threads_per_proc) {
 			if (p->p_numupcalls >= max_threads_per_proc)
 				break;
 			PROC_SUNLOCK(p);
 			if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH,
 			    "maxthreads", hz/10) != EWOULDBLOCK) {
 				PROC_SLOCK(p);
 				break;
 			} else
 				PROC_SLOCK(p);
 		}
 	}
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 
 	if (td->td_pflags & TDP_UPCALLING) {
 		uts_crit = 0;
 		p->p_nextupcall = ticks + p->p_upquantum;
 		/*
 		 * There is no more work to do and we are going to ride
 		 * this thread up to userland as an upcall.
 		 * Do the last parts of the setup needed for the upcall.
 		 */
 		CTR3(KTR_PROC, "userret: upcall thread %p (pid %d, %s)",
 		    td, td->td_proc->p_pid, td->td_proc->p_comm);
 
 		td->td_pflags &= ~TDP_UPCALLING;
 		if (ku->ku_flags & KUF_DOUPCALL) {
 			PROC_SLOCK(p);
 			ku->ku_flags &= ~KUF_DOUPCALL;
 			PROC_SUNLOCK(p);
 		}
 		/*
 		 * Set user context to the UTS
 		 */
 		if (!(ku->ku_mflags & KMF_NOUPCALL)) {
 			cpu_set_upcall_kse(td, ku->ku_func, ku->ku_mailbox,
 				&ku->ku_stack);
 			PROC_LOCK(p);
 			if (p->p_flag & P_TRACED) {
 				_PHOLD(p);
 				ptrace_clear_single_step(td);
 				_PRELE(p);
 			}
 			PROC_UNLOCK(p);
 			error = suword32(&ku->ku_mailbox->km_lwp,
 					td->td_tid);
 			if (error)
 				goto out;
 			error = suword(&ku->ku_mailbox->km_curthread, 0);
 			if (error)
 				goto out;
 		}
 
 		/*
 		 * Unhook the list of completed threads.
 		 * anything that completes after this gets to
 		 * come in next time.
 		 * Put the list of completed thread mailboxes on
 		 * this KSE's mailbox.
 		 */
 		if (!(ku->ku_mflags & KMF_NOCOMPLETED) &&
 		    (error = thread_link_mboxes(p, ku)) != 0)
 			goto out;
 	}
 	if (!uts_crit) {
 		nanotime(&ts);
 		error = copyout(&ts, &ku->ku_mailbox->km_timeofday, sizeof(ts));
 	}
 
 out:
 	if (error) {
 		/*
 		 * Things are going to be so screwed we should just kill
 		 * the process.
 		 * how do we do that?
 		 */
 		PROC_LOCK(p);
 		psignal(p, SIGSEGV);
 		PROC_UNLOCK(p);
 	} else {
 		/*
 		 * Optimisation:
 		 * Ensure that we have a spare thread available,
 		 * for when we re-enter the kernel.
 		 */
 		if (td->td_standin == NULL)
-			thread_alloc_spare(td);
+			thread_alloc_spare(td); /* XXX failure is ignored here */
 	}
 
 	ku->ku_mflags = 0;
 	td->td_mailbox = NULL;
 	td->td_usticks = 0;
 	return (error);	/* go sync */
 }
 
 /*
  * called after ptrace resumed a process, force all
  * virtual CPUs to schedule upcall for SA process,
  * because debugger may have changed something in userland,
  * we should notice UTS as soon as possible.
  */
 void
 thread_continued(struct proc *p)
 {
 	struct kse_upcall *ku;
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(P_SHOULDSTOP(p), ("process not stopped"));
 
 	if (!(p->p_flag & P_SA))
 		return;
 
 	if (p->p_flag & P_TRACED) {
 		td = TAILQ_FIRST(&p->p_threads);
 		if (td && (td->td_pflags & TDP_SA)) {
 			FOREACH_UPCALL_IN_PROC(p, ku) {
 				PROC_SLOCK(p);
 				ku->ku_flags |= KUF_DOUPCALL;
 				PROC_SUNLOCK(p);
 				wakeup(&p->p_completed);
 			}
 		}
 	}
 }
 #endif
Index: head/sys/kern/kern_proc.c
===================================================================
--- head/sys/kern/kern_proc.c	(revision 173360)
+++ head/sys/kern/kern_proc.c	(revision 173361)
@@ -1,1348 +1,1346 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/sysent.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/filedesc.h>
 #include <sys/tty.h>
 #include <sys/signalvar.h>
 #include <sys/sx.h>
 #include <sys/user.h>
 #include <sys/jail.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/uma.h>
 
 MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
 MALLOC_DEFINE(M_SESSION, "session", "session header");
 static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
 MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
 
 static void doenterpgrp(struct proc *, struct pgrp *);
 static void orphanpg(struct pgrp *pg);
 static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
 static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp);
 static void pgadjustjobc(struct pgrp *pgrp, int entering);
 static void pgdelete(struct pgrp *);
 static int proc_ctor(void *mem, int size, void *arg, int flags);
 static void proc_dtor(void *mem, int size, void *arg);
 static int proc_init(void *mem, int size, int flags);
 static void proc_fini(void *mem, int size);
 
 /*
  * Other process lists
  */
 struct pidhashhead *pidhashtbl;
 u_long pidhash;
 struct pgrphashhead *pgrphashtbl;
 u_long pgrphash;
 struct proclist allproc;
 struct proclist zombproc;
 struct sx allproc_lock;
 struct sx proctree_lock;
 struct mtx ppeers_lock;
 uma_zone_t proc_zone;
 uma_zone_t ithread_zone;
 
 int kstack_pages = KSTACK_PAGES;
 SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0, "");
 
 CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
 
 /*
  * Initialize global process hashing structures.
  */
 void
 procinit()
 {
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
 	pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
 	pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
 	proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
 	    proc_ctor, proc_dtor, proc_init, proc_fini,
 	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uihashinit();
 }
 
 /*
  * Prepare a proc for use.
  */
 static int
 proc_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	return (0);
 }
 
 /*
  * Reclaim a proc after use.
  */
 static void
 proc_dtor(void *mem, int size, void *arg)
 {
 	struct proc *p;
 	struct thread *td;
 
 	/* INVARIANTS checks go here */
 	p = (struct proc *)mem;
         td = FIRST_THREAD_IN_PROC(p);
+	if (td != NULL) {
 #ifdef INVARIANTS
-	KASSERT((p->p_numthreads == 1),
-	    ("bad number of threads in exiting process"));
-	KASSERT((td != NULL), ("proc_dtor: bad thread pointer"));
-	KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
+		KASSERT((p->p_numthreads == 1),
+		    ("bad number of threads in exiting process"));
+		KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
 #endif
 
-	/* Dispose of an alternate kstack, if it exists.
-	 * XXX What if there are more than one thread in the proc?
-	 *     The first thread in the proc is special and not
-	 *     freed, so you gotta do this here.
-	 */
-	if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
-		vm_thread_dispose_altkstack(td);
+		/* Dispose of an alternate kstack, if it exists.
+		 * XXX What if there are more than one thread in the proc?
+		 *     The first thread in the proc is special and not
+		 *     freed, so you gotta do this here.
+		 */
+		if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
+			vm_thread_dispose_altkstack(td);
+	}
 	if (p->p_ksi != NULL)
 		KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
 }
 
 /*
  * Initialize type-stable parts of a proc (when newly created).
  */
 static int
 proc_init(void *mem, int size, int flags)
 {
 	struct proc *p;
-	struct thread *td;
 
 	p = (struct proc *)mem;
 	p->p_sched = (struct p_sched *)&p[1];
-	td = thread_alloc();
 	bzero(&p->p_mtx, sizeof(struct mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
 	p->p_stats = pstats_alloc();
-	proc_linkup(p, td);
-	sched_newproc(p, td);
 	return (0);
 }
 
 /*
  * UMA should ensure that this function is never called.
  * Freeing a proc structure would violate type stability.
  */
 static void
 proc_fini(void *mem, int size)
 {
 #ifdef notnow
 	struct proc *p;
 
 	p = (struct proc *)mem;
 	pstats_free(p->p_stats);
 	thread_free(FIRST_THREAD_IN_PROC(p));
 	mtx_destroy(&p->p_mtx);
 	if (p->p_ksi != NULL)
 		ksiginfo_free(p->p_ksi);
 #else
 	panic("proc reclaimed");
 #endif
 }
 
 /*
  * Is p an inferior of the current process?
  */
 int
 inferior(p)
 	register struct proc *p;
 {
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	for (; p != curproc; p = p->p_pptr)
 		if (p->p_pid == 0)
 			return (0);
 	return (1);
 }
 
 /*
  * Locate a process by number; return only "live" processes -- i.e., neither
  * zombies nor newly born but incompletely initialized processes.  By not
  * returning processes in the PRS_NEW state, we allow callers to avoid
  * testing for that condition to avoid dereferencing p_ucred, et al.
  */
 struct proc *
 pfind(pid)
 	register pid_t pid;
 {
 	register struct proc *p;
 
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, PIDHASH(pid), p_hash)
 		if (p->p_pid == pid) {
 			if (p->p_state == PRS_NEW) {
 				p = NULL;
 				break;
 			}
 			PROC_LOCK(p);
 			break;
 		}
 	sx_sunlock(&allproc_lock);
 	return (p);
 }
 
 /*
  * Locate a process group by number.
  * The caller must hold proctree_lock.
  */
 struct pgrp *
 pgfind(pgid)
 	register pid_t pgid;
 {
 	register struct pgrp *pgrp;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 
 	LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
 		if (pgrp->pg_id == pgid) {
 			PGRP_LOCK(pgrp);
 			return (pgrp);
 		}
 	}
 	return (NULL);
 }
 
 /*
  * Create a new process group.
  * pgid must be equal to the pid of p.
  * Begin a new session if required.
  */
 int
 enterpgrp(p, pgid, pgrp, sess)
 	register struct proc *p;
 	pid_t pgid;
 	struct pgrp *pgrp;
 	struct session *sess;
 {
 	struct pgrp *pgrp2;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 
 	KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
 	KASSERT(p->p_pid == pgid,
 	    ("enterpgrp: new pgrp and pid != pgid"));
 
 	pgrp2 = pgfind(pgid);
 
 	KASSERT(pgrp2 == NULL,
 	    ("enterpgrp: pgrp with pgid exists"));
 	KASSERT(!SESS_LEADER(p),
 	    ("enterpgrp: session leader attempted setpgrp"));
 
 	mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
 
 	if (sess != NULL) {
 		/*
 		 * new session
 		 */
 		mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
 		mtx_lock(&Giant);       /* XXX TTY */
 		PROC_LOCK(p);
 		p->p_flag &= ~P_CONTROLT;
 		PROC_UNLOCK(p);
 		PGRP_LOCK(pgrp);
 		sess->s_leader = p;
 		sess->s_sid = p->p_pid;
 		sess->s_count = 1;
 		sess->s_ttyvp = NULL;
 		sess->s_ttyp = NULL;
 		bcopy(p->p_session->s_login, sess->s_login,
 			    sizeof(sess->s_login));
 		pgrp->pg_session = sess;
 		KASSERT(p == curproc,
 		    ("enterpgrp: mksession and p != curproc"));
 	} else {
 		mtx_lock(&Giant);       /* XXX TTY */
 		pgrp->pg_session = p->p_session;
 		SESS_LOCK(pgrp->pg_session);
 		pgrp->pg_session->s_count++;
 		SESS_UNLOCK(pgrp->pg_session);
 		PGRP_LOCK(pgrp);
 	}
 	pgrp->pg_id = pgid;
 	LIST_INIT(&pgrp->pg_members);
 
 	/*
 	 * As we have an exclusive lock of proctree_lock,
 	 * this should not deadlock.
 	 */
 	LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
 	pgrp->pg_jobc = 0;
 	SLIST_INIT(&pgrp->pg_sigiolst);
 	PGRP_UNLOCK(pgrp);
 	mtx_unlock(&Giant);       /* XXX TTY */
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to an existing process group
  */
 int
 enterthispgrp(p, pgrp)
 	register struct proc *p;
 	struct pgrp *pgrp;
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 	KASSERT(pgrp->pg_session == p->p_session,
 		("%s: pgrp's session %p, p->p_session %p.\n",
 		__func__,
 		pgrp->pg_session,
 		p->p_session));
 	KASSERT(pgrp != p->p_pgrp,
 		("%s: p belongs to pgrp.", __func__));
 
 	doenterpgrp(p, pgrp);
 
 	return (0);
 }
 
 /*
  * Move p to a process group
  */
 static void
 doenterpgrp(p, pgrp)
 	struct proc *p;
 	struct pgrp *pgrp;
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
 
 	savepgrp = p->p_pgrp;
 
 	/*
 	 * Adjust eligibility of affected pgrps to participate in job control.
 	 * Increment eligibility counts before decrementing, otherwise we
 	 * could reach 0 spuriously during the first call.
 	 */
 	fixjobc(p, pgrp, 1);
 	fixjobc(p, p->p_pgrp, 0);
 
 	mtx_lock(&Giant);       /* XXX TTY */
 	PGRP_LOCK(pgrp);
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = pgrp;
 	PROC_UNLOCK(p);
 	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
 	PGRP_UNLOCK(savepgrp);
 	PGRP_UNLOCK(pgrp);
 	mtx_unlock(&Giant);     /* XXX TTY */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 }
 
 /*
  * Remove a process from its process group.
  *
  * The caller must hold proctree_lock exclusively.  p is unlinked from the
  * group's member list and p->p_pgrp is set to NULL; the group is deleted
  * if that leaves it without members.
  *
  * Always returns 0.
  */
 int
 leavepgrp(struct proc *p)
 {
 	struct pgrp *savepgrp;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	/* Keep a handle on the group; p->p_pgrp is cleared below. */
 	savepgrp = p->p_pgrp;
 	mtx_lock(&Giant);	/* XXX TTY */
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
 	LIST_REMOVE(p, p_pglist);
 	p->p_pgrp = NULL;
 	PROC_UNLOCK(p);
 	PGRP_UNLOCK(savepgrp);
 	mtx_unlock(&Giant);	/* XXX TTY */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 	return (0);
 }
 
 /*
  * Delete a process group.
  *
  * The caller must hold proctree_lock exclusively and must not hold the
  * group's lock or its session's lock.  The group is unhooked from sigio
  * owners, from its session's tty (if it is that tty's t_pgrp) and from the
  * pgrp hash; its session reference is released and the group is freed.
  */
 static void
 pgdelete(struct pgrp *pgrp)
 {
 	struct session *savesess;
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pgid.
 	 */
 	funsetownlst(&pgrp->pg_sigiolst);
 
 	mtx_lock(&Giant);       /* XXX TTY */
 	PGRP_LOCK(pgrp);
 	/* Do not leave the controlling tty pointing at a freed group. */
 	if (pgrp->pg_session->s_ttyp != NULL &&
 	    pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
 		pgrp->pg_session->s_ttyp->t_pgrp = NULL;
 	LIST_REMOVE(pgrp, pg_hash);
 	savesess = pgrp->pg_session;
 	SESSRELE(savesess);
 	PGRP_UNLOCK(pgrp);
 	/* The mutex must be unlocked before it is destroyed and freed. */
 	mtx_destroy(&pgrp->pg_mtx);
 	FREE(pgrp, M_PGRP);
 	mtx_unlock(&Giant);     /* XXX TTY */
 }
 
 /*
  * Adjust the job-control eligibility count of pgrp by one.
  *
  * A non-zero 'entering' increments pg_jobc; zero decrements it and, when
  * the count reaches zero, calls orphanpg() on the group.  The group lock
  * is taken and released here and is held across the orphanpg() call.
  */
 static void
 pgadjustjobc(struct pgrp *pgrp, int entering)
 {
 
 	PGRP_LOCK(pgrp);
 	if (entering)
 		pgrp->pg_jobc++;
 	else {
 		--pgrp->pg_jobc;
 		if (pgrp->pg_jobc == 0)
 			orphanpg(pgrp);
 	}
 	PGRP_UNLOCK(pgrp);
 }
 
 /*
  * Adjust pgrp jobc counters when specified process changes process group.
  * We count the number of processes in each process group that "qualify"
  * the group for terminal job control (those with a parent in a different
  * process group of the same session).  If that count reaches zero, the
  * process group becomes orphaned.  Check both the specified process'
  * process group and that of its children.
  * entering == 0 => p is leaving specified group.
  * entering == 1 => p is entering specified group.
  *
  * The caller must hold proctree_lock (shared or exclusive) and must not
  * hold p's proc lock, pgrp's lock or the lock of pgrp's session.
  */
 void
 fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
 {
 	struct pgrp *hispgrp;
 	struct session *mysession;
 	struct proc *child;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
 	SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
 
 	/*
 	 * Check p's parent to see whether p qualifies its own process
 	 * group; if so, adjust count for p's process group.
 	 */
 	mysession = pgrp->pg_session;
 	if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
 	    hispgrp->pg_session == mysession)
 		pgadjustjobc(pgrp, entering);
 
 	/*
 	 * Check this process' children to see whether they qualify
 	 * their process groups; if so, adjust counts for children's
 	 * process groups.
 	 */
 	LIST_FOREACH(child, &p->p_children, p_sibling) {
 		hispgrp = child->p_pgrp;
 		if (hispgrp == pgrp ||
 		    hispgrp->pg_session != mysession)
 			continue;
 		/* Zombie children do not count towards eligibility. */
 		PROC_LOCK(child);
 		if (child->p_state == PRS_ZOMBIE) {
 			PROC_UNLOCK(child);
 			continue;
 		}
 		PROC_UNLOCK(child);
 		pgadjustjobc(hispgrp, entering);
 	}
 }
 
 /*
  * A process group has become orphaned;
  * if there are any stopped processes in the group,
  * hang up all processes in that group.
  *
  * Must be called with the group's lock held.  Each member's proc lock is
  * taken individually while it is examined or signalled.
  */
 static void
 orphanpg(struct pgrp *pg)
 {
 	struct proc *p;
 	int stopped;
 
 	PGRP_LOCK_ASSERT(pg, MA_OWNED);
 
 	/* Pass 1: is any member of the group stopped? */
 	stopped = 0;
 	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 		PROC_LOCK(p);
 		stopped = (P_SHOULDSTOP(p) != 0);
 		PROC_UNLOCK(p);
 		if (stopped)
 			break;
 	}
 	if (!stopped)
 		return;
 
 	/* Pass 2: send SIGHUP followed by SIGCONT to every member. */
 	LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 		PROC_LOCK(p);
 		psignal(p, SIGHUP);
 		psignal(p, SIGCONT);
 		PROC_UNLOCK(p);
 	}
 }
 
 /*
  * Drop one reference on session s.  When the count reaches zero the
  * session's tty reference (if any) is released and the session is freed.
  */
 void
 sessrele(struct session *s)
 {
 	int remaining;
 
 	/* The count is only modified under the session lock. */
 	SESS_LOCK(s);
 	remaining = --s->s_count;
 	SESS_UNLOCK(s);
 	if (remaining != 0)
 		return;
 
 	/* Last reference: tear the session down. */
 	if (s->s_ttyp != NULL)
 		ttyrel(s->s_ttyp);
 	mtx_destroy(&s->s_mtx);
 	FREE(s, M_SESSION);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * DDB "show pgrpdump": walk every bucket of the process group hash table
  * and print each group together with its member processes.
  */
 DB_SHOW_COMMAND(pgrpdump, pgrpdump)
 {
 	struct pgrp *pg;
 	struct proc *member;
 	int bucket;
 
 	/* pgrphash is treated as the highest valid bucket index. */
 	for (bucket = 0; bucket <= pgrphash; bucket++) {
 		if (LIST_EMPTY(&pgrphashtbl[bucket]))
 			continue;
 		printf("\tindx %d\n", bucket);
 		LIST_FOREACH(pg, &pgrphashtbl[bucket], pg_hash) {
 			printf(
 			    "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
 			    (void *)pg, (long)pg->pg_id,
 			    (void *)pg->pg_session,
 			    pg->pg_session->s_count,
 			    (void *)LIST_FIRST(&pg->pg_members));
 			LIST_FOREACH(member, &pg->pg_members, p_pglist) {
 				printf("\t\tpid %ld addr %p pgrp %p\n",
 				    (long)member->p_pid, (void *)member,
 				    (void *)member->p_pgrp);
 			}
 		}
 	}
 }
 #endif /* DDB */
 
 /*
  * Clear kinfo_proc and fill in any information that is common
  * to all threads in the process.
  * Must be called with the target process locked.
  *
  * The process spin lock (PROC_SLOCK) is taken and dropped internally
  * around the VM and resource-usage sections; ktrace_mtx, the sigacts
  * mutex and the session lock are likewise held only while the
  * corresponding fields are copied.
  */
 static void
 fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *td0;
 	struct tty *tp;
 	struct session *sp;
 	struct ucred *cred;
 	struct sigacts *ps;
 
 	/* Start from an all-zero record; fields not set below stay 0. */
 	bzero(kp, sizeof(*kp));
 
 	kp->ki_structsize = sizeof(*kp);
 	kp->ki_paddr = p;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	kp->ki_addr =/* p->p_addr; */0; /* XXXKSE */
 	kp->ki_args = p->p_args;
 	kp->ki_textvp = p->p_textvp;
 #ifdef KTRACE
 	kp->ki_tracep = p->p_tracevp;
 	mtx_lock(&ktrace_mtx);
 	kp->ki_traceflag = p->p_traceflag;
 	mtx_unlock(&ktrace_mtx);
 #endif
 	kp->ki_fd = p->p_fd;
 	kp->ki_vmspace = p->p_vmspace;
 	kp->ki_flag = p->p_flag;
 	/* Credentials: ids, group list (capped at KI_NGROUPS) and jail. */
 	cred = p->p_ucred;
 	if (cred) {
 		kp->ki_uid = cred->cr_uid;
 		kp->ki_ruid = cred->cr_ruid;
 		kp->ki_svuid = cred->cr_svuid;
 		/* XXX bde doesn't like KI_NGROUPS */
 		kp->ki_ngroups = min(cred->cr_ngroups, KI_NGROUPS);
 		bcopy(cred->cr_groups, kp->ki_groups,
 		    kp->ki_ngroups * sizeof(gid_t));
 		kp->ki_rgid = cred->cr_rgid;
 		kp->ki_svgid = cred->cr_svgid;
 		/* If jailed(cred), emulate the old P_JAILED flag. */
 		if (jailed(cred)) {
 			kp->ki_flag |= P_JAILED;
 			/* If inside a jail, use 0 as a jail ID. */
 			if (!jailed(curthread->td_ucred))
 				kp->ki_jid = cred->cr_prison->pr_id;
 		}
 	}
 	/* Signal dispositions are copied under the sigacts mutex. */
 	ps = p->p_sigacts;
 	if (ps) {
 		mtx_lock(&ps->ps_mtx);
 		kp->ki_sigignore = ps->ps_sigignore;
 		kp->ki_sigcatch = ps->ps_sigcatch;
 		mtx_unlock(&ps->ps_mtx);
 	}
 	PROC_SLOCK(p);
 	/* VM sizes are only reported for processes past PRS_NEW, not zombies. */
 	if (p->p_state != PRS_NEW &&
 	    p->p_state != PRS_ZOMBIE &&
 	    p->p_vmspace != NULL) {
 		struct vmspace *vm = p->p_vmspace;
 
 		kp->ki_size = vm->vm_map.size;
 		kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
 		/* Add each thread's kernel stack pages to the resident size. */
 		FOREACH_THREAD_IN_PROC(p, td0) {
 			if (!TD_IS_SWAPPED(td0))
 				kp->ki_rssize += td0->td_kstack_pages;
 			if (td0->td_altkstack_obj != NULL)
 				kp->ki_rssize += td0->td_altkstack_pages;
 		}
 		kp->ki_swrss = vm->vm_swrss;
 		kp->ki_tsize = vm->vm_tsize;
 		kp->ki_dsize = vm->vm_dsize;
 		kp->ki_ssize = vm->vm_ssize;
 	} else if (p->p_state == PRS_ZOMBIE)
 		kp->ki_stat = SZOMB;
 	if (kp->ki_flag & P_INMEM)
 		kp->ki_sflag = PS_INMEM;
 	else
 		kp->ki_sflag = 0;
 	/* Calculate legacy swtime as seconds since 'swtick'. */
 	kp->ki_swtime = (ticks - p->p_swtick) / hz;
 	kp->ki_pid = p->p_pid;
 	kp->ki_nice = p->p_nice;
 	rufetch(p, &kp->ki_rusage);
 	kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
 	PROC_SUNLOCK(p);
 	/* Start time and CPU times need p_stats, read only when P_INMEM. */
 	if ((p->p_flag & P_INMEM) && p->p_stats != NULL) {
 		kp->ki_start = p->p_stats->p_start;
 		/* p_start is adjusted by boottime before being reported. */
 		timevaladd(&kp->ki_start, &boottime);
 		PROC_SLOCK(p);
 		calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
 		PROC_SUNLOCK(p);
 		calccru(p, &kp->ki_childutime, &kp->ki_childstime);
 
 		/* Some callers want child-times in a single value */
 		kp->ki_childtime = kp->ki_childstime;
 		timevaladd(&kp->ki_childtime, &kp->ki_childutime);
 	}
 	/* Process group, session and (below) controlling terminal. */
 	tp = NULL;
 	if (p->p_pgrp) {
 		kp->ki_pgid = p->p_pgrp->pg_id;
 		kp->ki_jobc = p->p_pgrp->pg_jobc;
 		sp = p->p_pgrp->pg_session;
 
 		if (sp != NULL) {
 			kp->ki_sid = sp->s_sid;
 			SESS_LOCK(sp);
 			strlcpy(kp->ki_login, sp->s_login,
 			    sizeof(kp->ki_login));
 			if (sp->s_ttyvp)
 				kp->ki_kiflag |= KI_CTTY;
 			if (SESS_LEADER(p))
 				kp->ki_kiflag |= KI_SLEADER;
 			tp = sp->s_ttyp;
 			SESS_UNLOCK(sp);
 		}
 	}
 	/* tty details are only reported when P_CONTROLT is set on p. */
 	if ((p->p_flag & P_CONTROLT) && tp != NULL) {
 		kp->ki_tdev = dev2udev(tp->t_dev);
 		kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
 		if (tp->t_session)
 			kp->ki_tsid = tp->t_session->s_sid;
 	} else
 		kp->ki_tdev = NODEV;
 	if (p->p_comm[0] != '\0')
 		strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
 	if (p->p_sysent && p->p_sysent->sv_name != NULL &&
 	    p->p_sysent->sv_name[0] != '\0')
 		strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
 	kp->ki_siglist = p->p_siglist;
 	kp->ki_xstat = p->p_xstat;
 	kp->ki_acflag = p->p_acflag;
 	kp->ki_lock = p->p_lock;
 	if (p->p_pptr)
 		kp->ki_ppid = p->p_pptr->p_pid;
 }
 
 /*
  * Fill in information that is thread specific.
  * Must be called with p_slock locked.
  *
  * The thread lock of td is held for the whole copy.  kp is expected to
  * have been prepared already: ki_siglist is OR-ed into rather than
  * overwritten, and ki_kiflag has only KI_LOCKBLOCK updated.
  */
 static void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	thread_lock(td);
 	/* Wait message, or an empty one when the thread has none. */
 	if (td->td_wmesg != NULL)
 		strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
 	else
 		bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
 	if (td->td_name[0] != '\0')
 		strlcpy(kp->ki_ocomm, td->td_name, sizeof(kp->ki_ocomm));
 	/* Report the lock name only while the thread is blocked on a lock. */
 	if (TD_ON_LOCK(td)) {
 		kp->ki_kiflag |= KI_LOCKBLOCK;
 		strlcpy(kp->ki_lockname, td->td_lockname,
 		    sizeof(kp->ki_lockname));
 	} else {
 		kp->ki_kiflag &= ~KI_LOCKBLOCK;
 		bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
 	}
 
 	/* Derive the legacy process state from process and thread state. */
 	if (p->p_state == PRS_NORMAL) { /*  XXXKSE very approximate */
 		if (TD_ON_RUNQ(td) ||
 		    TD_CAN_RUN(td) ||
 		    TD_IS_RUNNING(td)) {
 			kp->ki_stat = SRUN;
 		} else if (P_SHOULDSTOP(p)) {
 			kp->ki_stat = SSTOP;
 		} else if (TD_IS_SLEEPING(td)) {
 			kp->ki_stat = SSLEEP;
 		} else if (TD_ON_LOCK(td)) {
 			kp->ki_stat = SLOCK;
 		} else {
 			kp->ki_stat = SWAIT;
 		}
 	} else if (p->p_state == PRS_ZOMBIE) {
 		kp->ki_stat = SZOMB;
 	} else {
 		kp->ki_stat = SIDL;
 	}
 
 	/* Things in the thread */
 	kp->ki_wchan = td->td_wchan;
 	kp->ki_pri.pri_level = td->td_priority;
 	kp->ki_pri.pri_native = td->td_base_pri;
 	kp->ki_lastcpu = td->td_lastcpu;
 	kp->ki_oncpu = td->td_oncpu;
 	kp->ki_tdflags = td->td_flags;
 	kp->ki_tid = td->td_tid;
 	kp->ki_numthreads = p->p_numthreads;
 	kp->ki_pcb = td->td_pcb;
 	kp->ki_kstack = (void *)td->td_kstack;
 	kp->ki_pctcpu = sched_pctcpu(td);
 	kp->ki_estcpu = td->td_estcpu;
 	/* Sleep time in seconds, derived from the tick counter. */
 	kp->ki_slptime = (ticks - td->td_slptick) / hz;
 	kp->ki_pri.pri_class = td->td_pri_class;
 	kp->ki_pri.pri_user = td->td_user_pri;
 
 	/* We can't get this anymore but ps etc never used it anyway. */
 	kp->ki_rqindex = 0;
 
 	/* Merge the thread's pending signals into what kp already holds. */
 	SIGSETOR(kp->ki_siglist, td->td_siglist);
 	kp->ki_sigmask = td->td_sigmask;
 	thread_unlock(td);
 }
 
 /*
  * Build a complete kinfo_proc record for process p: the process-wide
  * part first, then the thread-specific part taken from the first thread
  * of the process, if it has one.
  * The target process must be locked by the caller.
  */
 void
 fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
 {
 	struct thread *first;
 
 	fill_kinfo_proc_only(p, kp);
 
 	/* Thread data is read under the process spin lock. */
 	PROC_SLOCK(p);
 	first = FIRST_THREAD_IN_PROC(p);
 	if (first != NULL)
 		fill_kinfo_thread(first, kp);
 	PROC_SUNLOCK(p);
 }
 
 /*
  * Allocate a zero-filled struct pstats from M_SUBPROC.  The allocation
  * is made with M_WAITOK.
  */
 struct pstats *
 pstats_alloc(void)
 {
 	struct pstats *ps;
 
 	ps = malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO | M_WAITOK);
 	return (ps);
 }
 
 /*
  * Copy parts of p_stats; zero the rest of p_stats (statistics).
  */
 void
 pstats_fork(struct pstats *src, struct pstats *dst)
 {
 
 	bzero(&dst->pstat_startzero,
 	    __rangeof(struct pstats, pstat_startzero, pstat_endzero));
 	bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
 	    __rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
 }
 
 /*
  * Release a struct pstats back to the M_SUBPROC malloc type.
  */
 void
 pstats_free(struct pstats *ps)
 {
 
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Look up a zombie process by pid.
  *
  * The zombie list is scanned under a shared allproc_lock.  On a match the
  * process is returned with its proc lock held; NULL is returned when no
  * zombie has that pid.
  */
 struct proc *
 zpfind(pid_t pid)
 {
 	struct proc *zp;
 
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(zp, &zombproc, p_list) {
 		if (zp->p_pid != pid)
 			continue;
 		PROC_LOCK(zp);
 		break;
 	}
 	sx_sunlock(&allproc_lock);
 	/* zp is NULL here when the list was exhausted without a match. */
 	return (zp);
 }
 
 /*
  * Internal flag bits passed to sysctl_out_proc().  Any bit inside
  * KERN_PROC_ZOMBMASK makes the post-copy re-lookup use zpfind() instead of
  * pfind(); KERN_PROC_NOTHREADS selects one record per process instead of
  * one record per thread.
  */
 #define KERN_PROC_ZOMBMASK	0x3
 #define KERN_PROC_NOTHREADS	0x4
 
 /*
  * Must be called with the process locked and will return with it unlocked.
  *
  * Copies out one kinfo_proc record for p (KERN_PROC_NOTHREADS) or one per
  * thread; a process without threads still yields a single record.  After
  * the process lock is dropped the pid is looked up again, and EAGAIN is
  * returned if it no longer resolves to the same struct proc.  Pid 0 is not
  * re-checked on the non-zombie path.
  */
 static int
 sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags)
 {
 	struct thread *td;
 	struct kinfo_proc kinfo_proc;
 	int error = 0;
 	struct proc *np;
 	/* Saved up front: p may not be dereferenced once it is unlocked. */
 	pid_t pid = p->p_pid;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	fill_kinfo_proc_only(p, &kinfo_proc);
 	if (flags & KERN_PROC_NOTHREADS) {
 		/* Single record, thread fields taken from the first thread. */
 		PROC_SLOCK(p);
 		if (FIRST_THREAD_IN_PROC(p) != NULL)
 			fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc);
 		PROC_SUNLOCK(p);
 		error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 				   sizeof(kinfo_proc));
 	} else {
 		/* One record per thread, copied out with the spin lock held. */
 		PROC_SLOCK(p);
 		if (FIRST_THREAD_IN_PROC(p) != NULL)
 			FOREACH_THREAD_IN_PROC(p, td) {
 				fill_kinfo_thread(td, &kinfo_proc);
 				error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 						   sizeof(kinfo_proc));
 				if (error)
 					break;
 			}
 		else
 			error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 					   sizeof(kinfo_proc));
 		PROC_SUNLOCK(p);
 	}
 	PROC_UNLOCK(p);
 	if (error)
 		return (error);
 	/* Re-validate: the pid must still map to the very same process. */
 	if (flags & KERN_PROC_ZOMBMASK)
 		np = zpfind(pid);
 	else {
 		if (pid == 0)
 			return (0);
 		np = pfind(pid);
 	}
 	if (np == NULL)
 		return EAGAIN;
 	if (np != p) {
 		PROC_UNLOCK(np);
 		return EAGAIN;
 	}
 	PROC_UNLOCK(np);
 	return (0);
 }
 
 /*
  * Handler shared by the kern.proc.* process-table sysctls.
  *
  * The selector is oidp->oid_number: KERN_PROC_PID looks up a single
  * process, everything else walks allproc and then zombproc and filters
  * on name[0] (gid, pgrp, rgid, session, tty, uid or ruid).  Per-thread
  * records are produced for KERN_PROC_ALL and for any selector carrying
  * KERN_PROC_INC_THREAD; otherwise one record per process is emitted.
  * Processes the caller may not see (p_cansee()) are skipped, or cause an
  * error return in the single-pid case.
  */
 static int
 sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int*) arg1;
 	u_int namelen = arg2;
 	struct proc *p;
 	int flags, doingzomb, oid_number;
 	int error = 0;
 
 	/* Strip KERN_PROC_INC_THREAD so the switches see the base selector. */
 	oid_number = oidp->oid_number;
 	if (oid_number != KERN_PROC_ALL &&
 	    (oid_number & KERN_PROC_INC_THREAD) == 0)
 		flags = KERN_PROC_NOTHREADS;
 	else {
 		flags = 0;
 		oid_number &= ~KERN_PROC_INC_THREAD;
 	}
 	/* Fast path: a single pid needs no walk of the process lists. */
 	if (oid_number == KERN_PROC_PID) {
 		if (namelen != 1) 
 			return (EINVAL);
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);		
 		p = pfind((pid_t)name[0]);
 		if (!p)
 			return (ESRCH);
 		if ((error = p_cansee(curthread, p))) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 		/* sysctl_out_proc() drops the process lock for us. */
 		error = sysctl_out_proc(p, req, flags);
 		return (error);
 	}
 
 	/* Validate the number of trailing name components per selector. */
 	switch (oid_number) {
 	case KERN_PROC_ALL:
 		if (namelen != 0)
 			return (EINVAL);
 		break;
 	case KERN_PROC_PROC:
 		if (namelen != 0 && namelen != 1)
 			return (EINVAL);
 		break;
 	default:
 		if (namelen != 1)
 			return (EINVAL);
 		break;
 	}
 	
 	if (!req->oldptr) {
 		/* overestimate by 5 procs */
 		error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
 		if (error)
 			return (error);
 	}
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sx_slock(&allproc_lock);
 	/* Pass 0 walks allproc, pass 1 walks zombproc. */
 	for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
 		if (!doingzomb)
 			p = LIST_FIRST(&allproc);
 		else
 			p = LIST_FIRST(&zombproc);
 		for (; p != 0; p = LIST_NEXT(p, p_list)) {
 			/*
 			 * Skip embryonic processes.
 			 */
 			PROC_SLOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_SUNLOCK(p);
 				continue;
 			}
 			PROC_SUNLOCK(p);
 			PROC_LOCK(p);
 			KASSERT(p->p_ucred != NULL,
 			    ("process credential is NULL for non-NEW proc"));
 			/*
 			 * Show a user only appropriate processes.
 			 */
 			if (p_cansee(curthread, p)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * TODO - make more efficient (see notes below).
 			 * do by session.
 			 */
 			/* Every rejecting branch must drop the proc lock. */
 			switch (oid_number) {
 
 			case KERN_PROC_GID:
 				if (p->p_ucred->cr_gid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PGRP:
 				/* could do this by traversing pgrp */
 				if (p->p_pgrp == NULL ||
 				    p->p_pgrp->pg_id != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RGID:
 				if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_SESSION:
 				if (p->p_session == NULL ||
 				    p->p_session->s_sid != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_TTY:
 				if ((p->p_flag & P_CONTROLT) == 0 ||
 				    p->p_session == NULL) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				/* s_ttyp is read under the session lock. */
 				SESS_LOCK(p->p_session);
 				if (p->p_session->s_ttyp == NULL ||
 				    dev2udev(p->p_session->s_ttyp->t_dev) != 
 				    (dev_t)name[0]) {
 					SESS_UNLOCK(p->p_session);
 					PROC_UNLOCK(p);
 					continue;
 				}
 				SESS_UNLOCK(p->p_session);
 				break;
 
 			case KERN_PROC_UID:
 				if (p->p_ucred->cr_uid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RUID:
 				if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_PROC:
 				break;
 
 			default:
 				break;
 
 			}
 
 			/*
 			 * doingzomb (0 or 1) lands inside KERN_PROC_ZOMBMASK
 			 * and tells sysctl_out_proc() which list to re-check.
 			 * The call returns with p unlocked.
 			 */
 			error = sysctl_out_proc(p, req, flags | doingzomb);
 			if (error) {
 				sx_sunlock(&allproc_lock);
 				return (error);
 			}
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	return (0);
 }
 
 /*
  * Allocate a pargs structure with room for len bytes of argument data.
  * The result carries one reference and has ar_length set to len; the
  * argument bytes themselves are left for the caller to fill in.
  */
 struct pargs *
 pargs_alloc(int len)
 {
 	struct pargs *args;
 
 	MALLOC(args, struct pargs *, sizeof(struct pargs) + len, M_PARGS,
 	    M_WAITOK);
 	args->ar_length = len;
 	refcount_init(&args->ar_ref, 1);
 	return (args);
 }
 
 /*
  * Free a pargs structure unconditionally; the reference count is not
  * consulted here.
  */
 void
 pargs_free(struct pargs *pa)
 {
 
 	FREE(pa, M_PARGS);
 }
 
 /*
  * Take an additional reference on pa.  A NULL pa is accepted and ignored.
  */
 void
 pargs_hold(struct pargs *pa)
 {
 
 	if (pa != NULL)
 		refcount_acquire(&pa->ar_ref);
 }
 
 /*
  * Drop one reference on pa and free it when refcount_release() reports
  * that this was the last one.  A NULL pa is accepted and ignored.
  */
 void
 pargs_drop(struct pargs *pa)
 {
 
 	if (pa != NULL && refcount_release(&pa->ar_ref))
 		pargs_free(pa);
 }
 
 /*
  * This sysctl allows a process to retrieve the argument list or process
  * title for another process without groping around in the address space
  * of the other process.  It also allows a process to set its own "process
  * title" to a string of its own choice.
  *
  * Reading requires p_cansee() to succeed on the target; writing is only
  * permitted when the target is the calling process (EPERM otherwise).
  */
 static int
 sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
 {
 	int *name = (int*) arg1;
 	u_int namelen = arg2;
 	struct pargs *newpa, *pa;
 	struct proc *p;
 	int error = 0;
 
 	if (namelen != 1) 
 		return (EINVAL);
 
 	/* pfind() returns the process locked. */
 	p = pfind((pid_t)name[0]);
 	if (!p)
 		return (ESRCH);
 
 	if ((error = p_cansee(curthread, p)) != 0) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	if (req->newptr && curproc != p) {
 		PROC_UNLOCK(p);
 		return (EPERM);
 	}
 
 	/*
 	 * Hold a reference on the current argument block so it can be
 	 * copied out after the process lock has been dropped.
 	 */
 	pa = p->p_args;
 	pargs_hold(pa);
 	PROC_UNLOCK(p);
 	if (req->oldptr != NULL && pa != NULL)
 		error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
 	pargs_drop(pa);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* Write path: refuse strings that exceed the argument cache limit. */
 	if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
 		return (ENOMEM);
 	newpa = pargs_alloc(req->newlen);
 	error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
 	if (error != 0) {
 		pargs_free(newpa);
 		return (error);
 	}
 	/* Swap in the new block, then drop the reference on the old one. */
 	PROC_LOCK(p);
 	pa = p->p_args;
 	p->p_args = newpa;
 	PROC_UNLOCK(p);
 	pargs_drop(pa);
 	return (0);
 }
 
 /*
  * This sysctl allows a process to retrieve the path of the executable for
  * itself or another process.
  *
  * The single name component is a pid; -1 selects the calling process.
  * For any other pid the target must be visible to the caller
  * (p_cansee()).  A process without a text vnode yields success with no
  * data copied out.
  */
 static int
 sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
 {
 	pid_t *pidp = (pid_t *)arg1;
 	unsigned int arglen = arg2;
 	struct proc *p;
 	struct vnode *vp;
 	char *retbuf, *freebuf;
 	int error;
 
 	if (arglen != 1)
 		return (EINVAL);
 	if (*pidp == -1) {	/* -1 means this process */
 		p = req->td->td_proc;
 	} else {
 		/* pfind() returns the process locked. */
 		p = pfind(*pidp);
 		if (p == NULL)
 			return (ESRCH);
 		if ((error = p_cansee(curthread, p)) != 0) {
 			PROC_UNLOCK(p);
 			return (error);
 		}
 	}
 
 	/*
 	 * Not every process has a text vnode; do not hand a NULL pointer
 	 * to vref().  The proc lock is only held on the pfind() path.
 	 */
 	vp = p->p_textvp;
 	if (vp == NULL) {
 		if (*pidp != -1)
 			PROC_UNLOCK(p);
 		return (0);
 	}
 	/* Reference the vnode so it outlives the proc lock. */
 	vref(vp);
 	if (*pidp != -1)
 		PROC_UNLOCK(p);
 	error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
 	vrele(vp);
 	if (error)
 		return (error);
 	/* Copy out the path including its terminating NUL. */
 	error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
 	free(freebuf, M_TEMP);
 	return (error);
 }
 
 static int
 sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	char *sv_name;
 	int *name;
 	int namelen;
 	int error;
 
 	namelen = arg2;
 	if (namelen != 1) 
 		return (EINVAL);
 
 	name = (int *)arg1;
 	if ((p = pfind((pid_t)name[0])) == NULL)
 		return (ESRCH);
 	if ((error = p_cansee(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	sv_name = p->p_sysent->sv_name;
 	PROC_UNLOCK(p);
 	return (sysctl_handle_string(oidp, sv_name, 0, req));
 }
 
 
 /*
  * The kern.proc sysctl tree.
  *
  * Every selector served by sysctl_kern_proc is registered twice: once
  * under its plain number and once OR-ed with KERN_PROC_INC_THREAD (the
  * "_td" nodes further down).  The handler emits per-thread records for
  * the latter and for KERN_PROC_ALL, per-process records otherwise.
  */
 static SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD,  0, "Process table");
 
 SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT,
 	0, 0, sysctl_kern_proc, "S,proc", "Return entire process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD,
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, 
 	sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD,
 	sysctl_kern_proc, "Return process table, no threads");
 
 /* The only writable node here, and it is flagged CTLFLAG_ANYBODY. */
 static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
 	CTLFLAG_RW | CTLFLAG_ANYBODY,
 	sysctl_kern_proc_args, "Process argument list");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD,
 	sysctl_kern_proc_pathname, "Process executable path");
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD,
 	sysctl_kern_proc_sv_name, "Process syscall vector name (ABI type)");
 
 /* Per-thread variants of the selectors above. */
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
 	sid_td, CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Process table");
 
 /*
  * NOTE(review): this node has KERN_PROC_INC_THREAD set, yet its
  * description still reads "no threads" -- confirm the intended wording.
  */
 static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
 	CTLFLAG_RD, sysctl_kern_proc, "Return process table, no threads");
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c	(revision 173360)
+++ head/sys/kern/kern_sig.c	(revision 173361)
@@ -1,3324 +1,3324 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sig.c	8.7 (Berkeley) 4/18/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/condvar.h>
 #include <sys/event.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kse.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
 /* Forward declarations of the file-local helpers. */
 static int	coredump(struct thread *);
 static char	*expand_name(const char *, uid_t, pid_t);
 static int	killpg1(struct thread *td, int sig, int pgid, int all);
 static int	issignal(struct thread *p);
 static int	sigprop(int sig);
 static void	tdsigwakeup(struct thread *, int, sig_t, int);
 static void	sig_suspend_threads(struct thread *, struct proc *, int);
 static int	filt_sigattach(struct knote *kn);
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
 #ifdef KSE
 static int	do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *);
 #endif
 static void	sigqueue_start(void);
 
 /* UMA zone for ksiginfo_t; NULL until sigqueue_start() has created it. */
 static uma_zone_t	ksiginfo_zone = NULL;
 /* Filter operations built from the filt_sig* handlers declared above. */
 struct filterops sig_filtops =
 	{ 0, filt_sigattach, filt_sigdetach, filt_signal };
 
-static int	kern_logsigexit = 1;
+int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, 
     &kern_logsigexit, 0, 
     "Log processes quitting on abnormal signals to syslog(3)");
 
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
 
 /* kern.sigqueue.*: limits and counters for queued signals. */
 SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
 
 static int	max_pending_per_proc = 128;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
     &max_pending_per_proc, 0, "Max pending signals per proc");
 
 /* Also settable as a loader tunable; read-only through sysctl. */
 static int	preallocate_siginfo = 1024;
 TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
     &preallocate_siginfo, 0, "Preallocated signal memory size");
 
 static int	signal_overflow = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
     &signal_overflow, 0, "Number of signals overflew");
 
 static int	signal_alloc_fail = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
     &signal_alloc_fail, 0, "signals failed to be allocated");
 
 /* Registers sigqueue_start() to run at the SI_SUB_P1003_1B boot stage. */
 SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
  * in the right situations.
  */
 #define CANSIGIO(cr1, cr2) \
 	((cr1)->cr_uid == 0 || \
 	    (cr1)->cr_ruid == (cr2)->cr_ruid || \
 	    (cr1)->cr_uid == (cr2)->cr_ruid || \
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
 /* Core dump policy knobs, all exported under kern.*. */
 int sugid_coredump;
 SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, 
     &sugid_coredump, 0, "Enable coredumping set user/group ID processes");
 
 static int	do_coredump = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
 	&do_coredump, 0, "Enable/Disable coredumps");
 
 static int	set_core_nodump_flag = 0;
 SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
 	0, "Enable setting the NODUMP flag on coredump files");
 
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
  * according to the following properties:
  */
 #define	SA_KILL		0x01		/* terminates process by default */
 #define	SA_CORE		0x02		/* ditto and coredumps */
 #define	SA_STOP		0x04		/* suspend process */
 #define	SA_TTYSTOP	0x08		/* ditto, from tty */
 #define	SA_IGNORE	0x10		/* ignore by default */
 #define	SA_CONT		0x20		/* continue if suspended */
 #define	SA_CANTMASK	0x40		/* non-maskable, catchable */
 #define	SA_PROC		0x80		/* deliverable to any thread */
 
 /*
  * One entry per signal, in the order given by the trailing comments.
  * Only 31 of the NSIG slots are initialized explicitly; the rest are
  * implicitly zero.
  * NOTE(review): the first entry is SIGHUP, so the table is presumably
  * indexed by signal number minus one -- confirm against sigprop().
  */
 static int sigproptbl[NSIG] = {
         SA_KILL|SA_PROC,		/* SIGHUP */
         SA_KILL|SA_PROC,		/* SIGINT */
         SA_KILL|SA_CORE|SA_PROC,	/* SIGQUIT */
         SA_KILL|SA_CORE,		/* SIGILL */
         SA_KILL|SA_CORE,		/* SIGTRAP */
         SA_KILL|SA_CORE,		/* SIGABRT */
         SA_KILL|SA_CORE|SA_PROC,	/* SIGEMT */
         SA_KILL|SA_CORE,		/* SIGFPE */
         SA_KILL|SA_PROC,		/* SIGKILL */
         SA_KILL|SA_CORE,		/* SIGBUS */
         SA_KILL|SA_CORE,		/* SIGSEGV */
         SA_KILL|SA_CORE,		/* SIGSYS */
         SA_KILL|SA_PROC,		/* SIGPIPE */
         SA_KILL|SA_PROC,		/* SIGALRM */
         SA_KILL|SA_PROC,		/* SIGTERM */
         SA_IGNORE|SA_PROC,		/* SIGURG */
         SA_STOP|SA_PROC,		/* SIGSTOP */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTSTP */
         SA_IGNORE|SA_CONT|SA_PROC,	/* SIGCONT */
         SA_IGNORE|SA_PROC,		/* SIGCHLD */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTTIN */
         SA_STOP|SA_TTYSTOP|SA_PROC,	/* SIGTTOU */
         SA_IGNORE|SA_PROC,		/* SIGIO */
         SA_KILL,			/* SIGXCPU */
         SA_KILL,			/* SIGXFSZ */
         SA_KILL|SA_PROC,		/* SIGVTALRM */
         SA_KILL|SA_PROC,		/* SIGPROF */
         SA_IGNORE|SA_PROC,		/* SIGWINCH  */
         SA_IGNORE|SA_PROC,		/* SIGINFO */
         SA_KILL|SA_PROC,		/* SIGUSR1 */
         SA_KILL|SA_PROC,		/* SIGUSR2 */
 };
 
 /*
  * Boot-time initialisation for queued signals: create and pre-populate
  * the ksiginfo UMA zone, then publish the POSIX.1b realtime-signal
  * configuration values.
  */
 static void
 sigqueue_start(void)
 {
 	int rtsig_count;
 
 	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
 
 	rtsig_count = SIGRTMAX - SIGRTMIN + 1;
 	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
 	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, rtsig_count);
 	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
 }
 
 /*
  * Allocate a zeroed ksiginfo_t from the ksiginfo zone.  When 'wait' is
  * zero the allocation is made with M_NOWAIT.  Returns NULL if the zone
  * has not been created yet.
  *
  * NOTE(review): when 'wait' is non-zero no M_WAITOK is passed, only
  * M_ZERO -- confirm this is what uma_zalloc() expects.
  */
 ksiginfo_t *
 ksiginfo_alloc(int wait)
 {
 	int zflags;
 
 	if (ksiginfo_zone == NULL)
 		return (NULL);
 	zflags = wait ? M_ZERO : (M_ZERO | M_NOWAIT);
 	return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, zflags));
 }
 
 /*
  * Return a ksiginfo_t to the ksiginfo zone unconditionally.
  */
 void
 ksiginfo_free(ksiginfo_t *ksi)
 {
 
 	uma_zfree(ksiginfo_zone, ksi);
 }
 
 /*
  * Free ksi unless it is flagged KSI_EXT.
  * Returns 1 when the structure was freed and 0 when it was left alone.
  */
 static __inline int
 ksiginfo_tryfree(ksiginfo_t *ksi)
 {
 
 	if (ksi->ksi_flags & KSI_EXT)
 		return (0);
 	uma_zfree(ksiginfo_zone, ksi);
 	return (1);
 }
 
 /*
  * Prepare an empty signal queue owned by process p: both signal sets
  * are cleared, the ksiginfo list is empty and SQ_INIT is set.
  */
 void
 sigqueue_init(sigqueue_t *list, struct proc *p)
 {
 
 	list->sq_proc = p;
 	list->sq_flags = SQ_INIT;
 	TAILQ_INIT(&list->sq_list);
 	SIGEMPTYSET(list->sq_signals);
 	SIGEMPTYSET(list->sq_kill);
 }
 
 /*
  * Take one pending instance of signal signo off the queue and describe
  * it in *si.
  *
  * If the signal's bit is set in sq_kill, that instance is consumed and
  * no queued ksiginfo is dequeued; otherwise the first queued ksiginfo
  * for signo is unlinked and copied into *si.  The bit in sq_signals is
  * cleared only when no further instance of signo remains.
  *
  * Return:
  *	0	-	signal not found
  *	others	-	signal number (si->ksi_signo is set to it as well)
  */
 int
 sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi, *next;
 	/* Number of instances of signo seen so far (capped at 2). */
 	int count = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (!SIGISMEMBER(sq->sq_signals, signo))
 		return (0);
 
 	/* An instance recorded only as a bit in sq_kill counts as one. */
 	if (SIGISMEMBER(sq->sq_kill, signo)) {
 		count++;
 		SIGDELSET(sq->sq_kill, signo);
 	}
 
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (ksi->ksi_signo == signo) {
 			/*
 			 * Only dequeue when nothing has been consumed yet;
 			 * otherwise this entry just proves that another
 			 * instance is still pending.
 			 */
 			if (count == 0) {
 				TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 				ksi->ksi_sigq = NULL;
 				ksiginfo_copy(ksi, si);
 				/* Adjust accounting only if it was freed. */
 				if (ksiginfo_tryfree(ksi) && p != NULL)
 					p->p_pendingcnt--;
 			}
 			/* Two instances are enough to know the bit stays. */
 			if (++count > 1)
 				break;
 		}
 	}
 
 	/* No second instance was found: the signal is no longer pending. */
 	if (count <= 1)
 		SIGDELSET(sq->sq_signals, signo);
 	si->ksi_signo = signo;
 	return (signo);
 }
 
 /*
  * Unlink a specific ksiginfo from whatever queue it is on.  The ksiginfo
  * itself is not freed.  If no other instance of the same signal remains
  * (neither queued nor recorded in sq_kill), the signal's pending bit is
  * cleared.  A NULL ksi, or one that is not on a queue, is ignored.
  */
 void
 sigqueue_take(ksiginfo_t *ksi)
 {
 	struct ksiginfo *other;
 	struct proc *owner;
 	sigqueue_t *queue;
 
 	if (ksi == NULL)
 		return;
 	queue = ksi->ksi_sigq;
 	if (queue == NULL)
 		return;
 
 	owner = queue->sq_proc;
 	TAILQ_REMOVE(&queue->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
 	/* KSI_EXT entries are not counted against the process. */
 	if ((ksi->ksi_flags & KSI_EXT) == 0 && owner != NULL)
 		owner->p_pendingcnt--;
 
 	/* Look for another queued instance of the same signal. */
 	TAILQ_FOREACH(other, &queue->sq_list, ksi_link) {
 		if (other->ksi_signo == ksi->ksi_signo)
 			break;
 	}
 	if (other == NULL && !SIGISMEMBER(queue->sq_kill, ksi->ksi_signo))
 		SIGDELSET(queue->sq_signals, ksi->ksi_signo);
 }
 
 /*
  * Record signal signo as pending on queue sq.
  *
  * si:	optional signal information.  When NULL, or for SIGKILL/SIGSTOP,
  *	only a bit in sq_kill is recorded and nothing is queued.
  *
  * Returns 0 on success.  Returns EAGAIN, without marking the signal
  * pending, when the per-process limit is reached or allocation fails,
  * unless si carries KSI_TRAP, in which case the signal is still
  * recorded via sq_kill and 0 is returned.
  */
 int
 sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi;
 	int ret = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 	
 	/* These cases carry no siginfo: a pending bit is sufficient. */
 	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/* directly insert the ksi, don't copy it */
 	if (si->ksi_flags & KSI_INS) {
 		TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
 		si->ksi_sigq = sq;
 		goto out_set_bit;
 	}
 
 	/* No allocator zone yet: fall back to a bare pending bit. */
 	if (__predict_false(ksiginfo_zone == NULL)) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 	
 	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
 		signal_overflow++;
 		ret = EAGAIN;
 	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
 		signal_alloc_fail++;
 		ret = EAGAIN;
 	} else {
 		/* Queue a private copy of the caller's siginfo. */
 		if (p != NULL)
 			p->p_pendingcnt++;
 		ksiginfo_copy(si, ksi);
 		ksi->ksi_signo = signo;
 		TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = sq;
 	}
 
 	/*
 	 * KSI_TRAP signals are never refused: if the copy could not be
 	 * queued, record the signal in sq_kill and report success.
 	 */
 	if ((si->ksi_flags & KSI_TRAP) != 0) {
 		if (ret != 0)
 			SIGADDSET(sq->sq_kill, signo);
 		ret = 0;
 		goto out_set_bit;
 	}
 
 	if (ret != 0)
 		return (ret);
 	
 out_set_bit:
 	SIGADDSET(sq->sq_signals, signo);
 	return (ret);
 }
 
 /*
  * Discard everything pending on a signal queue: unlink every queued
  * ksiginfo (freeing those that are not KSI_EXT) and clear both pending
  * sets.  If the queue belongs to a process, its lock must be held.
  */
 void
 sigqueue_flush(sigqueue_t *sq)
 {
 	struct proc *owner = sq->sq_proc;
 	ksiginfo_t *entry, *following;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (owner != NULL)
 		PROC_LOCK_ASSERT(owner, MA_OWNED);
 
 	TAILQ_FOREACH_SAFE(entry, &sq->sq_list, ksi_link, following) {
 		TAILQ_REMOVE(&sq->sq_list, entry, ksi_link);
 		entry->ksi_sigq = NULL;
 		/* The pending count only drops for entries actually freed. */
 		if (ksiginfo_tryfree(entry) && owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	SIGEMPTYSET(sq->sq_kill);
 	SIGEMPTYSET(sq->sq_signals);
 }
 
 /*
  * Add to *set every signal that is pending on sq, i.e. every signal
  * with a queued ksiginfo plus every signal recorded in sq_kill.  Bits
  * already present in *set are left untouched.
  */
 void
 sigqueue_collect_set(sigqueue_t *sq, sigset_t *set)
 {
 	ksiginfo_t *entry;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	for (entry = TAILQ_FIRST(&sq->sq_list); entry != NULL;
 	    entry = TAILQ_NEXT(entry, ksi_link)) {
 		SIGADDSET(*set, entry->ksi_signo);
 	}
 	SIGSETOR(*set, sq->sq_kill);
 }
 
 /*
  * Transfer every pending signal that is a member of *setp from queue
  * src to queue dst: queued ksiginfos are relinked, and the matching
  * bits of sq_kill and sq_signals are moved.  Per-process pending counts
  * are adjusted for both owners when present.
  */
 void
 sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp)
 {
 	sigset_t moving, wanted;
 	struct proc *from, *to;
 	ksiginfo_t *entry, *following;
 
 	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
 
 	/*
 	 * Work on a private copy so that setp may alias the sq_signals
 	 * member of either queue.
 	 */
 	wanted = *setp;
 	from = src->sq_proc;
 	to = dst->sq_proc;
 
 	/* Relink the matching ksiginfo entries onto the target list. */
 	TAILQ_FOREACH_SAFE(entry, &src->sq_list, ksi_link, following) {
 		if (!SIGISMEMBER(wanted, entry->ksi_signo))
 			continue;
 		TAILQ_REMOVE(&src->sq_list, entry, ksi_link);
 		if (from != NULL)
 			from->p_pendingcnt--;
 		TAILQ_INSERT_TAIL(&dst->sq_list, entry, ksi_link);
 		entry->ksi_sigq = dst;
 		if (to != NULL)
 			to->p_pendingcnt++;
 	}
 
 	/* Move the info-less pending bits. */
 	moving = src->sq_kill;
 	SIGSETAND(moving, wanted);
 	SIGSETOR(dst->sq_kill, moving);
 	SIGSETNAND(src->sq_kill, moving);
 
 	/* Move the summary pending bits. */
 	moving = src->sq_signals;
 	SIGSETAND(moving, wanted);
 	SIGSETOR(dst->sq_signals, moving);
 	SIGSETNAND(src->sq_signals, moving);
 
 	/* Recompute src's pending bits from what is still queued on it. */
 	sigqueue_collect_set(src, &src->sq_signals);
 }
 
 /*
  * Move all pending instances of a single signal from src to dst.
  * Convenience wrapper around sigqueue_move_set() with a one-member set.
  */
 void
 sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_move_set(src, dst, &set);
 }
 
 /*
  * Drop every pending signal in *set from queue sq.  Matching queued
  * ksiginfos are unlinked (and freed unless KSI_EXT), and the matching
  * bits are cleared from sq_kill and sq_signals.
  */
 void
 sigqueue_delete_set(sigqueue_t *sq, sigset_t *set)
 {
 	struct proc *owner = sq->sq_proc;
 	ksiginfo_t *entry, *following;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 
 	/* Purge matching entries from the siginfo list. */
 	TAILQ_FOREACH_SAFE(entry, &sq->sq_list, ksi_link, following) {
 		if (!SIGISMEMBER(*set, entry->ksi_signo))
 			continue;
 		TAILQ_REMOVE(&sq->sq_list, entry, ksi_link);
 		entry->ksi_sigq = NULL;
 		if (ksiginfo_tryfree(entry) && owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	SIGSETNAND(sq->sq_kill, *set);
 	SIGSETNAND(sq->sq_signals, *set);
 
 	/* Recompute the pending bits from what is still queued. */
 	sigqueue_collect_set(sq, &sq->sq_signals);
 }
 
 /*
  * Drop all pending instances of a single signal from queue sq.
  * Convenience wrapper around sigqueue_delete_set() with a one-member set.
  */
 void
 sigqueue_delete(sigqueue_t *sq, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_delete_set(sq, &set);
 }
 
 /*
  * Remove a set of signals for a process.
  *
  * The signals in *set are removed from the process-wide queue and from
  * the queue of every thread in the process.  Matching entries are first
  * gathered on a private, unowned work list and then flushed in one go.
  * The caller must hold the process lock.
  */
 void
 sigqueue_delete_set_proc(struct proc *p, sigset_t *set)
 {
 	sigqueue_t worklist;
 	struct thread *td0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* A NULL owner means the work list has no pending count to adjust. */
 	sigqueue_init(&worklist, NULL);
 	sigqueue_move_set(&p->p_sigqueue, &worklist, set);
 
 	/* The thread list is walked under the process spin lock. */
 	PROC_SLOCK(p);
 	FOREACH_THREAD_IN_PROC(p, td0)
 		sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
 	PROC_SUNLOCK(p);
 
 	sigqueue_flush(&worklist);
 }
 
 /*
  * Remove a single signal from a process and all of its threads.
  * Convenience wrapper around sigqueue_delete_set_proc().
  */
 void
 sigqueue_delete_proc(struct proc *p, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_delete_set_proc(p, &set);
 }
 
 void
 sigqueue_delete_stopmask_proc(struct proc *p)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, SIGSTOP);
 	SIGADDSET(set, SIGTSTP);
 	SIGADDSET(set, SIGTTIN);
 	SIGADDSET(set, SIGTTOU);
 	sigqueue_delete_set_proc(p, &set);
 }
 
 /*
  * Return the signal that should be delivered to the current thread's
  * process, or 0 if there is none.  A pending stop signal with default
  * action makes the process stop inside issignal().
  *
  * The caller must hold the process lock and the sigacts mutex, and must
  * not hold the thread lock.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 
 	/* Skip the full scan when nothing is pending. */
 	if (!SIGPENDING(td))
 		return (0);
 	return (issignal(td));
 }
 
 /*
  * Arrange for ast() to handle unmasked pending signals on return to user
  * mode.  This must be called whenever a signal is added to td_sigqueue or
  * unmasked in td_sigmask.
  *
  * Process-wide pending signals that this thread does not mask are first
  * moved onto the thread's own queue.  The caller must hold the process
  * lock.
  */
 void
 signotify(struct thread *td)
 {
 	struct proc *p;
 #ifdef KSE
 	sigset_t set, saved;
 #else
 	sigset_t set;
 #endif
 
 	p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * If our mask changed we may have to move signal that were
 	 * previously masked by all threads to our sigqueue.
 	 */
 	set = p->p_sigqueue.sq_signals;
 #ifdef KSE
 	/* Remember the process-wide pending set to detect a change below. */
 	if (p->p_flag & P_SA)
 		saved = p->p_sigqueue.sq_signals;
 #endif
 	/* set := process-pending signals not blocked by this thread. */
 	SIGSETNAND(set, td->td_sigmask);
 	if (! SIGISEMPTY(set))
 		sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set);
 	if (SIGPENDING(td)) {
 		/* Thread flags are modified under the thread lock. */
 		thread_lock(td);
 		td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
 		thread_unlock(td);
 	}
 #ifdef KSE
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
 		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
 #endif
 }
 
 /*
  * Report whether stack pointer sp lies on the current thread's
  * alternate signal stack.
  *
  * Returns 0 when the thread has no alternate stack (TDP_ALTSTACK
  * clear), otherwise non-zero iff sp falls within [ss_sp, ss_sp +
  * ss_size).  Under COMPAT_43 a zero ss_size instead yields the stored
  * SS_ONSTACK flag.
  */
 int
 sigonstack(size_t sp)
 {
 	struct thread *td = curthread;
 
 	if ((td->td_pflags & TDP_ALTSTACK) == 0)
 		return (0);
 #if defined(COMPAT_43)
 	if (td->td_sigstk.ss_size == 0)
 		return (td->td_sigstk.ss_flags & SS_ONSTACK);
 #endif
 	/* Unsigned subtraction also rejects sp below the stack base. */
 	return ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size);
 }
 
 /*
  * Look up the property flags (SA_*) of a signal in sigproptbl.
  * Signal numbers outside 1 .. NSIG-1 have no properties (0).
  */
 static __inline int
 sigprop(int sig)
 {
 
 	if (sig <= 0 || sig >= NSIG)
 		return (0);
 	return (sigproptbl[_SIG_IDX(sig)]);
 }
 
 /*
  * Find the lowest-numbered signal present in *set.
  *
  * Returns the signal number (1-based, as produced by ffs() plus 32 per
  * preceding word), or 0 if the set is empty.
  */
 int
 sig_ffs(sigset_t *set)
 {
 	int word;
 
 	for (word = 0; word < _SIG_WORDS; word++) {
 		if (set->__bits[word] == 0)
 			continue;
 		return (ffs(set->__bits[word]) + (word * 32));
 	}
 	return (0);
 }
 
 /*
  * kern_sigaction
  * sigaction
  * freebsd4_sigaction
  * osigaction
  *
  * Common back end for the sigaction family of system calls.
  *
  * td:	calling thread; the action is applied to td's process.
  * sig:	signal number; must satisfy _SIG_VALID().
  * act:	new action to install, or NULL to leave it unchanged.
  * oact:	if non-NULL, receives the action in effect before the call.
  * flags:	KSA_FREEBSD4 / KSA_OSIGSET select the compat delivery ABI
  *		recorded for a caught signal (used only when the matching
  *		COMPAT_* option is compiled in).
  *
  * Returns 0 on success, or EINVAL for an invalid signal number or an
  * attempt to give SIGKILL/SIGSTOP anything but the default action.
  */
 int
 kern_sigaction(struct thread *td, int sig, struct sigaction *act,
     struct sigaction *oact, int flags)
 {
 	struct sigacts *ps;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if (oact) {
 		/* Reconstruct the sigaction view from the per-signal sets. */
 		oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
 		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		oact->sa_flags = 0;
 		if (SIGISMEMBER(ps->ps_sigonstack, sig))
 			oact->sa_flags |= SA_ONSTACK;
 		/* ps_sigintr stores the inverse of SA_RESTART. */
 		if (!SIGISMEMBER(ps->ps_sigintr, sig))
 			oact->sa_flags |= SA_RESTART;
 		if (SIGISMEMBER(ps->ps_sigreset, sig))
 			oact->sa_flags |= SA_RESETHAND;
 		if (SIGISMEMBER(ps->ps_signodefer, sig))
 			oact->sa_flags |= SA_NODEFER;
 		if (SIGISMEMBER(ps->ps_siginfo, sig))
 			oact->sa_flags |= SA_SIGINFO;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
 			oact->sa_flags |= SA_NOCLDSTOP;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
 			oact->sa_flags |= SA_NOCLDWAIT;
 	}
 	if (act) {
 		/* SIGKILL and SIGSTOP may only keep the default action. */
 		if ((sig == SIGKILL || sig == SIGSTOP) &&
 		    act->sa_handler != SIG_DFL) {
 			mtx_unlock(&ps->ps_mtx);
 			PROC_UNLOCK(p);
 			return (EINVAL);
 		}
 
 		/*
 		 * Change setting atomically.
 		 */
 
 		ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
 		SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (act->sa_flags & SA_SIGINFO) {
 			ps->ps_sigact[_SIG_IDX(sig)] =
 			    (__sighandler_t *)act->sa_sigaction;
 			SIGADDSET(ps->ps_siginfo, sig);
 		} else {
 			ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
 			SIGDELSET(ps->ps_siginfo, sig);
 		}
 		if (!(act->sa_flags & SA_RESTART))
 			SIGADDSET(ps->ps_sigintr, sig);
 		else
 			SIGDELSET(ps->ps_sigintr, sig);
 		if (act->sa_flags & SA_ONSTACK)
 			SIGADDSET(ps->ps_sigonstack, sig);
 		else
 			SIGDELSET(ps->ps_sigonstack, sig);
 		if (act->sa_flags & SA_RESETHAND)
 			SIGADDSET(ps->ps_sigreset, sig);
 		else
 			SIGDELSET(ps->ps_sigreset, sig);
 		if (act->sa_flags & SA_NODEFER)
 			SIGADDSET(ps->ps_signodefer, sig);
 		else
 			SIGDELSET(ps->ps_signodefer, sig);
 		if (sig == SIGCHLD) {
 			if (act->sa_flags & SA_NOCLDSTOP)
 				ps->ps_flag |= PS_NOCLDSTOP;
 			else
 				ps->ps_flag &= ~PS_NOCLDSTOP;
 			if (act->sa_flags & SA_NOCLDWAIT) {
 				/*
 				 * Paranoia: since SA_NOCLDWAIT is implemented
 				 * by reparenting the dying child to PID 1 (and
 				 * trust it to reap the zombie), PID 1 itself
 				 * is forbidden to set SA_NOCLDWAIT.
 				 */
 				if (p->p_pid == 1)
 					ps->ps_flag &= ~PS_NOCLDWAIT;
 				else
 					ps->ps_flag |= PS_NOCLDWAIT;
 			} else
 				ps->ps_flag &= ~PS_NOCLDWAIT;
 			if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 				ps->ps_flag |= PS_CLDSIGIGN;
 			else
 				ps->ps_flag &= ~PS_CLDSIGIGN;
 		}
 		/*
 		 * Set bit in ps_sigignore for signals that are set to SIG_IGN,
 		 * and for signals set to SIG_DFL where the default is to
 		 * ignore. However, don't put SIGCONT in ps_sigignore, as we
 		 * have to restart the process.
 		 */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SA_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
 #ifdef KSE
 			if ((p->p_flag & P_SA) &&
 			     SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) {
 				p->p_flag |= P_SIGEVENT;
 				wakeup(&p->p_siglist);
 			}
 #endif
 			/* never to be seen again */
 			PROC_SLOCK(p);
 			sigqueue_delete_proc(p, sig);
 			PROC_SUNLOCK(p);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 		} else {
 			SIGDELSET(ps->ps_sigignore, sig);
 			if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
 				SIGDELSET(ps->ps_sigcatch, sig);
 			else
 				SIGADDSET(ps->ps_sigcatch, sig);
 		}
 #ifdef COMPAT_FREEBSD4
 		/* Only a real handler can be tagged with the FreeBSD 4 ABI. */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_FREEBSD4) == 0)
 			SIGDELSET(ps->ps_freebsd4, sig);
 		else
 			SIGADDSET(ps->ps_freebsd4, sig);
 #endif
 #ifdef COMPAT_43
 		/* Likewise for the old (osigset) ABI. */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_OSIGSET) == 0)
 			SIGDELSET(ps->ps_osigset, sig);
 		else
 			SIGADDSET(ps->ps_osigset, sig);
 #endif
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 /*
  * sigaction system call: copy the new action in from user space (if
  * supplied), apply it through kern_sigaction(), and copy the previous
  * action back out (if requested and the call succeeded).
  *
  * Returns 0 or an errno value from copyin(), kern_sigaction() or
  * copyout().
  */
 int
 sigaction(struct thread *td, struct sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 	/* NULL user pointers are passed through as NULL kernel pointers. */
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, 0);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 /*
  * FreeBSD 4 compatible sigaction system call.  Identical to sigaction()
  * except that KSA_FREEBSD4 is passed to kern_sigaction().
  *
  * Returns 0 or an errno value from copyin(), kern_sigaction() or
  * copyout().
  */
 int
 freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 	/* NULL user pointers are passed through as NULL kernel pointers. */
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigaction_args {
 	int	signum;
 	struct	osigaction *nsa;
 	struct	osigaction *osa;
 };
 #endif
 /*
  * Old-ABI sigaction system call.  Converts between struct osigaction
  * (with the old-format signal mask) and struct sigaction around a call
  * to kern_sigaction() with KSA_OSIGSET.
  *
  * Returns EINVAL if signum is outside 1 .. ONSIG-1, otherwise 0 or an
  * errno value from copyin(), kern_sigaction() or copyout().
  */
 int
 osigaction(struct thread *td, struct osigaction_args *uap)
 {
 	struct osigaction sa;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	nsap = (uap->nsa != NULL) ? &nsa : NULL;
 	osap = (uap->osa != NULL) ? &osa : NULL;
 
 	if (nsap) {
 		error = copyin(uap->nsa, &sa, sizeof(sa));
 		if (error)
 			return (error);
 		/* Widen the old-format action into a struct sigaction. */
 		nsap->sa_handler = sa.sa_handler;
 		nsap->sa_flags = sa.sa_flags;
 		OSIG2SIG(sa.sa_mask, nsap->sa_mask);
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		/* Narrow the previous action back to the old format. */
 		sa.sa_handler = osap->sa_handler;
 		sa.sa_flags = osap->sa_flags;
 		SIG2OSIG(osap->sa_mask, sa.sa_mask);
 		error = copyout(&sa, uap->osa, sizeof(sa));
 	}
 	return (error);
 }
 
 #if !defined(__i386__)
 /*
  * Avoid replicating the same stub everywhere: on the architectures this
  * is compiled for (everything but i386), osigreturn simply forwards to
  * nosys() and returns its result.
  */
 int
 osigreturn(struct thread *td, struct osigreturn_args *uap)
 {
 
 	return (nosys(td, (struct nosys_args *)uap));
 }
 #endif
 #endif /* COMPAT_43 */
 
 /*
  * Initialize signal state for process 0;
  * set to ignore signals that are ignored by default.
  *
  * SIGCONT is deliberately left out of ps_sigignore even though its
  * default action is to ignore.
  */
 void
 siginit(struct proc *p)
 {
 	struct sigacts *ps;
 	int i;
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* sigprop() only knows signals 1 .. NSIG-1, so stop there. */
 	for (i = 1; i < NSIG; i++) {
 		if ((sigprop(i) & SA_IGNORE) != 0 && i != SIGCONT)
 			SIGADDSET(ps->ps_sigignore, i);
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Reset signals for an exec of the specified process.
  *
  * Every caught signal reverts to SIG_DFL; those whose default is to be
  * ignored are also discarded from the pending queues.  The first
  * thread's alternate signal stack is disabled, and the SIGCHLD
  * no-zombie state is cleared.  The caller must hold the process lock.
  */
 void
 execsigs(struct proc *p)
 {
 	struct sigacts *ps;
 	int sig;
 	struct thread *td;
 
 	/*
 	 * Reset caught signals.  Held signals remain held
 	 * through td_sigmask (unless they were caught,
 	 * and are now ignored by default).
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	td = FIRST_THREAD_IN_PROC(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* Consume ps_sigcatch one signal at a time, lowest first. */
 	while (SIGNOTEMPTY(ps->ps_sigcatch)) {
 		sig = sig_ffs(&ps->ps_sigcatch);
 		SIGDELSET(ps->ps_sigcatch, sig);
 		if (sigprop(sig) & SA_IGNORE) {
 			/* SIGCONT is never placed in ps_sigignore. */
 			if (sig != SIGCONT)
 				SIGADDSET(ps->ps_sigignore, sig);
 			PROC_SLOCK(p);
 			sigqueue_delete_proc(p, sig);
 			PROC_SUNLOCK(p);
 		}
 		ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	}
 	/*
 	 * Reset stack state to the user stack.
 	 * Clear set of signals caught on the signal stack.
 	 */
 	td->td_sigstk.ss_flags = SS_DISABLE;
 	td->td_sigstk.ss_size = 0;
 	td->td_sigstk.ss_sp = 0;
 	td->td_pflags &= ~TDP_ALTSTACK;
 	/*
 	 * Reset no zombies if child dies flag as Solaris does.
 	 */
 	ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
 	if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 		ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
 	mtx_unlock(&ps->ps_mtx);
 }
 
 /*
  * kern_sigprocmask()
  *
  *	Manipulate signal mask.
  *
  * td:	thread whose mask is examined and/or changed.
  * how:	SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK; only checked when set is
  *	non-NULL.
  * set:	signals to apply, or NULL to leave the mask unchanged.  For
  *	SIG_BLOCK and SIG_SETMASK, *set is modified by SIG_CANTMASK().
  * oset:	if non-NULL, receives the mask in effect before the call.
  * old:	non-zero for the old-ABI callers: SIG_SETMASK then replaces
  *	only the part of the mask covered by SIGSETLO().
  *
  * Returns 0, or EINVAL for an unknown "how".
  */
 int
 kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
     int old)
 {
 	int error;
 
 	PROC_LOCK(td->td_proc);
 	if (oset != NULL)
 		*oset = td->td_sigmask;
 
 	error = 0;
 	if (set != NULL) {
 		switch (how) {
 		case SIG_BLOCK:
 			SIG_CANTMASK(*set);
 			SIGSETOR(td->td_sigmask, *set);
 			break;
 		case SIG_UNBLOCK:
 			SIGSETNAND(td->td_sigmask, *set);
 			/* Signals may have become deliverable. */
 			signotify(td);
 			break;
 		case SIG_SETMASK:
 			SIG_CANTMASK(*set);
 			if (old)
 				SIGSETLO(td->td_sigmask, *set);
 			else
 				td->td_sigmask = *set;
 			/* Signals may have become deliverable. */
 			signotify(td);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 	}
 	PROC_UNLOCK(td->td_proc);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
 	const sigset_t *set;
 	sigset_t *oset;
 };
 #endif
 /*
  * sigprocmask system call: copy the new set in (if supplied), apply it
  * through kern_sigprocmask(), and copy the previous mask out (if
  * requested and the call succeeded).
  *
  * Returns 0 or an errno value from copyin(), kern_sigprocmask() or
  * copyout().
  */
 int
 sigprocmask(struct thread *td, struct sigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	sigset_t *setp, *osetp;
 	int error;
 
 	/* NULL user pointers are passed through as NULL kernel pointers. */
 	setp = (uap->set != NULL) ? &set : NULL;
 	osetp = (uap->oset != NULL) ? &oset : NULL;
 	if (setp) {
 		error = copyin(uap->set, setp, sizeof(set));
 		if (error)
 			return (error);
 	}
 	error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
 	if (osetp && !error) {
 		error = copyout(osetp, uap->oset, sizeof(oset));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
 	osigset_t mask;
 };
 #endif
 /*
  * Old-ABI sigprocmask system call.  The mask is passed by value in the
  * old format, and the previous mask is returned (also in the old
  * format) through td_retval[0] rather than through a user pointer.
  *
  * Returns the result of kern_sigprocmask().
  */
 int
 osigprocmask(struct thread *td, struct osigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	/* old == 1: SIG_SETMASK replaces only the SIGSETLO() part. */
 	error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 int
 sigwait(struct thread *td, struct sigwait_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error) {
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		if (error == ERESTART)
 			return (error);
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
 
 int
 sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 int
 sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 	
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * Wait for one of the signals in waitset to become pending for thread
  * td and dequeue it into *ksi.
  *
  * td:		waiting thread.
  * waitset:	signals to wait for (passed by value; unmaskable signals
  *		are stripped with SIG_CANTMASK()).
  * ksi:		filled in with the accepted signal's information.
  * timeout:	relative time limit, or NULL to wait indefinitely.
  *
  * Returns 0 when a signal was accepted; EINVAL for a timeout whose
  * tv_nsec is out of range (only once no signal is found pending);
  * EAGAIN when the timeout expires; otherwise the error from msleep()
  * (with ERESTART mapped to EINTR when a timeout was given).
  */
 int
 kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t savedmask;
 	struct proc *p;
 	/* NOTE(review): the local "hz" holds a tick count for msleep(). */
 	int error, sig, hz, i, timevalid = 0;
 	struct timespec rts, ets, ts;
 	struct timeval tv;
 
 	p = td->td_proc;
 	error = 0;
 	sig = 0;
 	ets.tv_sec = 0;
 	ets.tv_nsec = 0;
 	SIG_CANTMASK(waitset);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	savedmask = td->td_sigmask;
 	if (timeout) {
 		/* Compute the absolute deadline (ets) once, up front. */
 		if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
 			timevalid = 1;
 			getnanouptime(&rts);
 		 	ets = rts;
 			timespecadd(&ets, timeout);
 		}
 	}
 
 restart:
 	for (i = 1; i <= _SIG_MAXSIG; ++i) {
 		if (!SIGISMEMBER(waitset, i))
 			continue;
 		if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) {
 			/* Pull a process-wide pending signal to this thread. */
 			if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) {
 #ifdef KSE
 				if (p->p_flag & P_SA) {
 					p->p_flag |= P_SIGEVENT;
 					wakeup(&p->p_siglist);
 				}
 #endif
 				sigqueue_move(&p->p_sigqueue,
 					&td->td_sigqueue, i);
 			} else
 				continue;
 		}
 
 		/* Temporarily block everything maskable except signal i. */
 		SIGFILLSET(td->td_sigmask);
 		SIG_CANTMASK(td->td_sigmask);
 		SIGDELSET(td->td_sigmask, i);
 		mtx_lock(&ps->ps_mtx);
 		sig = cursig(td);
 		mtx_unlock(&ps->ps_mtx);
 		if (sig)
 			goto out;
 		else {
 			/*
 			 * Because cursig() may have stopped current thread,
 			 * after it is resumed, things may have already been 
 			 * changed, it should rescan any pending signals.
 			 */
 			goto restart;
 		}
 	}
 
 	/* A previous msleep() failed and no signal turned up on rescan. */
 	if (error)
 		goto out;
 
 	/*
 	 * POSIX says this must be checked after looking for pending
 	 * signals.
 	 */
 	if (timeout) {
 		if (!timevalid) {
 			error = EINVAL;
 			goto out;
 		}
 		getnanouptime(&rts);
 		if (timespeccmp(&rts, &ets, >=)) {
 			error = EAGAIN;
 			goto out;
 		}
 		/* Convert the time remaining until the deadline to ticks. */
 		ts = ets;
 		timespecsub(&ts, &rts);
 		TIMESPEC_TO_TIMEVAL(&tv, &ts);
 		hz = tvtohz(&tv);
 	} else
 		hz = 0;
 
 	/* Sleep with the waited-for signals unblocked. */
 	td->td_sigmask = savedmask;
 	SIGSETNAND(td->td_sigmask, waitset);
 	signotify(td);
 	/*
 	 * NOTE(review): the wait channel is the address of the local
 	 * variable "ps", not the sigacts structure it points to.
 	 */
 	error = msleep(&ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", hz);
 	if (timeout) {
 		if (error == ERESTART) {
 			/* timeout can not be restarted. */
 			error = EINTR;
 		} else if (error == EAGAIN) {
 			/* will calculate timeout by ourself. */
 			error = 0;
 		}
 	}
 	goto restart;
 
 out:
 	td->td_sigmask = savedmask;
 	signotify(td);
 	if (sig) {
 		ksiginfo_init(ksi);
 		sigqueue_get(&td->td_sigqueue, sig, ksi);
 		ksi->ksi_signo = sig;
 		if (ksi->ksi_code == SI_TIMER)
 			itimer_accept(p, ksi->ksi_timerid, ksi);
 		/* A signal was accepted: any earlier sleep error is moot. */
 		error = 0;
 
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_PSIG)) {
 			sig_t action;
 
 			mtx_lock(&ps->ps_mtx);
 			action = ps->ps_sigact[_SIG_IDX(sig)];
 			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, 0);
 		}
 #endif
 		if (sig == SIGKILL)
 			sigexit(td, sig);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigpending_args {
 	sigset_t	*set;
 };
 #endif
 /*
  * sigpending system call: copy out the union of the signals pending on
  * the process-wide queue and on the calling thread's queue.
  *
  * Returns the result of copyout().
  */
 int
 sigpending(struct thread *td, struct sigpending_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	/* Snapshot both queues under the process lock. */
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	return (copyout(&pending, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigpending_args {
 	int	dummy;
 };
 #endif
 /*
  * Old-ABI sigpending system call: the union of the process-wide and
  * per-thread pending sets is returned, in the old mask format, through
  * td_retval[0].  Always returns 0.
  */
 int
 osigpending(struct thread *td, struct osigpending_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	/* Snapshot both queues under the process lock. */
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	SIG2OSIG(pending, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 /*
  * Generalized interface signal handler, 4.3-compatible.
  *
  * Converts between struct sigvec and struct sigaction around a call to
  * kern_sigaction() with KSA_OSIGSET.  The SA_RESTART bit is inverted in
  * both directions because sigvec expresses the opposite sense
  * (SV_INTERRUPT).
  *
  * Returns EINVAL if signum is outside 1 .. ONSIG-1, otherwise 0 or an
  * errno value from copyin(), kern_sigaction() or copyout().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigvec_args {
 	int	signum;
 	struct	sigvec *nsv;
 	struct	sigvec *osv;
 };
 #endif
 /* ARGSUSED */
 int
 osigvec(struct thread *td, struct osigvec_args *uap)
 {
 	struct sigvec vec;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 	nsap = (uap->nsv != NULL) ? &nsa : NULL;
 	osap = (uap->osv != NULL) ? &osa : NULL;
 	if (nsap) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		nsap->sa_handler = vec.sv_handler;
 		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
 		nsap->sa_flags = vec.sv_flags;
 		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		vec.sv_handler = osap->sa_handler;
 		SIG2OSIG(osap->sa_mask, vec.sv_mask);
 		vec.sv_flags = osap->sa_flags;
 		/* SA_NOCLDWAIT is not reported through the sigvec ABI. */
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigblock_args {
 	int	mask;
 };
 #endif
 /*
  * Old-ABI sigblock system call: add the signals in uap->mask (minus the
  * unmaskable ones) to the calling thread's mask.  The previous mask is
  * returned in the old format through td_retval[0].  Always returns 0.
  */
 int
 osigblock(struct thread *td, struct osigblock_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t set;
 
 	OSIG2SIG(uap->mask, set);
 	SIG_CANTMASK(set);
 	PROC_LOCK(p);
 	/* Report the old mask before modifying it. */
 	SIG2OSIG(td->td_sigmask, td->td_retval[0]);
 	SIGSETOR(td->td_sigmask, set);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigsetmask_args {
 	int	mask;
 };
 #endif
 /*
  * Old-ABI sigsetmask system call: replace the part of the calling
  * thread's mask covered by SIGSETLO() with uap->mask (minus the
  * unmaskable signals).  The previous mask is returned in the old format
  * through td_retval[0].  Always returns 0.
  */
 int
 osigsetmask(struct thread *td, struct osigsetmask_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t set;
 
 	OSIG2SIG(uap->mask, set);
 	SIG_CANTMASK(set);
 	PROC_LOCK(p);
 	/* Report the old mask before modifying it. */
 	SIG2OSIG(td->td_sigmask, td->td_retval[0]);
 	SIGSETLO(td->td_sigmask, set);
 	/* Signals may have been unblocked. */
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Suspend calling thread until signal, providing mask to be set in the
  * meantime.
  *
  * Returns the copyin() error if the mask cannot be fetched, otherwise
  * the result of kern_sigsuspend().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
 /* ARGSUSED */
 int
 sigsuspend(struct thread *td, struct sigsuspend_args *uap)
 {
 	sigset_t mask;
 	int error;
 
 	error = copyin(uap->sigmask, &mask, sizeof(mask));
 	if (error)
 		return (error);
 	return (kern_sigsuspend(td, mask));
 }
 
 /*
  * Install mask as the thread's signal mask and sleep until msleep()
  * reports something other than a normal wakeup.
  *
  * The previous mask is stashed in td_oldsigmask and TDP_OLDMASK is set
  * so that it can be restored after the signal handler has run.
  *
  * Always returns EINTR (never ERESTART).
  */
 int
 kern_sigsuspend(struct thread *td, sigset_t mask)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK(p);
 
 	/* Save the current mask for restoration after the handler. */
 	td->td_oldsigmask = td->td_sigmask;
 	td->td_pflags |= TDP_OLDMASK;
 
 	/* Install the temporary mask, minus the unmaskable signals. */
 	SIG_CANTMASK(mask);
 	td->td_sigmask = mask;
 	signotify(td);
 
 	/* Plain wakeups (return value 0) just put us back to sleep. */
 	for (;;) {
 		if (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
 		    0) != 0)
 			break;
 	}
 	PROC_UNLOCK(p);
 
 	return (EINTR);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 /*
  * Compatibility sigsuspend call for old binaries.  Note nonstandard calling
  * convention: libc stub passes mask, not pointer, to save a copyin.
  *
  * Only the part of the thread's mask covered by SIGSETLO() is replaced.
  * The previous mask is stashed in td_oldsigmask with TDP_OLDMASK set.
  * Always returns EINTR.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigsuspend_args {
 	osigset_t mask;
 };
 #endif
 /* ARGSUSED */
 int
 osigsuspend(struct thread *td, struct osigsuspend_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t mask;
 
 	PROC_LOCK(p);
 	td->td_oldsigmask = td->td_sigmask;
 	td->td_pflags |= TDP_OLDMASK;
 	OSIG2SIG(uap->mask, mask);
 	SIG_CANTMASK(mask);
 	SIGSETLO(td->td_sigmask, mask);
 	signotify(td);
 	/* Plain wakeups (return value 0) just put us back to sleep. */
 	while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "opause", 0) == 0)
 		/* void */;
 	PROC_UNLOCK(p);
 	/* always return EINTR rather than ERESTART... */
 	return (EINTR);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osigstack_args {
 	struct	sigstack *nss;
 	struct	sigstack *oss;
 };
 #endif
 /*
  * Old-ABI sigstack system call: optionally install a new signal stack
  * pointer and optionally report the previous one.
  *
  * A stack installed this way has ss_size 0, since struct sigstack
  * carries no size.  Returns 0 or an errno value from copyin() or
  * copyout().
  */
 /* ARGSUSED */
 int
 osigstack(struct thread *td, struct osigstack_args *uap)
 {
 	struct sigstack nss, oss;
 	int error = 0;
 
 	if (uap->nss != NULL) {
 		error = copyin(uap->nss, &nss, sizeof(nss));
 		if (error)
 			return (error);
 	}
 	/* Capture the old state before it is overwritten below. */
 	oss.ss_sp = td->td_sigstk.ss_sp;
 	oss.ss_onstack = sigonstack(cpu_getstack(td));
 	if (uap->nss != NULL) {
 		td->td_sigstk.ss_sp = nss.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(oss));
 
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaltstack_args {
 	stack_t	*ss;
 	stack_t	*oss;
 };
 #endif
 /*
  * sigaltstack system call: copy the new stack description in (if
  * supplied), apply it through kern_sigaltstack(), and copy the previous
  * description out (if requested and the call succeeded).
  *
  * Returns 0 or an errno value from copyin(), kern_sigaltstack() or
  * copyout().
  */
 /* ARGSUSED */
 int
 sigaltstack(struct thread *td, struct sigaltstack_args *uap)
 {
 	stack_t ss, oss;
 	int error;
 
 	if (uap->ss != NULL) {
 		error = copyin(uap->ss, &ss, sizeof(ss));
 		if (error)
 			return (error);
 	}
 	/* NULL user pointers are passed through as NULL kernel pointers. */
 	error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
 	    (uap->oss != NULL) ? &oss : NULL);
 	if (error)
 		return (error);
 	if (uap->oss != NULL)
 		error = copyout(&oss, uap->oss, sizeof(stack_t));
 	return (error);
 }
 
 /*
  * Query and/or change the alternate signal stack of thread td.
  *
  * ss:	new stack description, or NULL to leave it unchanged.
  * oss:	if non-NULL, receives the previous description; its ss_flags
  *	is SS_DISABLE when no alternate stack is enabled, SS_ONSTACK
  *	when the thread is currently running on it, and 0 otherwise.
  *
  * Returns 0 on success, EPERM if the thread is currently on the
  * alternate stack, EINVAL for flags other than SS_DISABLE, or ENOMEM if
  * the new stack is smaller than the ABI's minimum (sv_minsigstksz).
  */
 int
 kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
 {
 	struct proc *p = td->td_proc;
 	int on_altstack;
 
 	on_altstack = sigonstack(cpu_getstack(td));
 
 	if (oss != NULL) {
 		*oss = td->td_sigstk;
 		if ((td->td_pflags & TDP_ALTSTACK) == 0)
 			oss->ss_flags = SS_DISABLE;
 		else if (on_altstack)
 			oss->ss_flags = SS_ONSTACK;
 		else
 			oss->ss_flags = 0;
 	}
 
 	if (ss == NULL)
 		return (0);
 
 	/* The stack cannot be changed while it is in use. */
 	if (on_altstack)
 		return (EPERM);
 	if ((ss->ss_flags & ~SS_DISABLE) != 0)
 		return (EINVAL);
 
 	if ((ss->ss_flags & SS_DISABLE) != 0) {
 		td->td_pflags &= ~TDP_ALTSTACK;
 		return (0);
 	}
 
 	if (ss->ss_size < p->p_sysent->sv_minsigstksz)
 		return (ENOMEM);
 	td->td_sigstk = *ss;
 	td->td_pflags |= TDP_ALTSTACK;
 	return (0);
 }
 
 /*
  * Common code for kill process group/broadcast kill.
  *
  * td:	calling thread (used for the permission check, and its process
  *	is skipped in broadcast mode).
  * sig:	signal to send; 0 performs the permission checks only.
  * pgid:	target process group, or 0 for the caller's own group.
  *	Ignored when "all" is set.
  * all:	non-zero to broadcast to every eligible process in the system.
  *
  * Processes with pid <= 1, system processes, and processes still in
  * state PRS_NEW are never signalled.
  *
  * Returns 0 if at least one process passed p_cansignal(), otherwise
  * ESRCH (also when the process group does not exist).
  */
 static int
 killpg1(struct thread *td, int sig, int pgid, int all)
 {
 	struct proc *p;
 	struct pgrp *pgrp;
 	int nfound = 0;
 
 	if (all) {
 		/*
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p == td->td_proc || p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			if (p_cansignal(td, p, sig) == 0) {
 				nfound++;
 				if (sig)
 					psignal(p, sig);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 	} else {
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
 			 * zero pgid means send to my process group.
 			 */
 			pgrp = td->td_proc->p_pgrp;
 			PGRP_LOCK(pgrp);
 		} else {
 			/*
 			 * NOTE(review): pgfind() presumably returns the
 			 * group locked, matching the PGRP_UNLOCK() below.
 			 */
 			pgrp = pgfind(pgid);
 			if (pgrp == NULL) {
 				sx_sunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			if (p_cansignal(td, p, sig) == 0) {
 				nfound++;
 				if (sig)
 					psignal(p, sig);
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pgrp);
 	}
 	return (nfound ? 0 : ESRCH);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct kill_args {
 	int	pid;
 	int	signum;
 };
 #endif
 /*
  * kill system call.
  *
  * pid > 0 targets a single process (zombies included); pid == -1
  * broadcasts; pid == 0 targets the caller's process group; any other
  * negative pid targets process group -pid.  A signum of 0 performs the
  * permission check without sending anything.
  *
  * Returns EINVAL for a signal number above _SIG_MAXSIG, ESRCH if no
  * target is found, the p_cansignal() result for a single process, or
  * the killpg1() result otherwise.
  */
 /* ARGSUSED */
 int
 kill(struct thread *td, struct kill_args *uap)
 {
 	struct proc *p;
 	int error;
 
 	AUDIT_ARG(signum, uap->signum);
 	AUDIT_ARG(pid, uap->pid);
 	/* The unsigned compare also rejects negative signal numbers. */
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	if (uap->pid > 0) {
 		/* kill single process */
 		if ((p = pfind(uap->pid)) == NULL) {
 			if ((p = zpfind(uap->pid)) == NULL)
 				return (ESRCH);
 		}
 		AUDIT_ARG(process, p);
 		error = p_cansignal(td, p, uap->signum);
 		if (error == 0 && uap->signum)
 			psignal(p, uap->signum);
 		/*
 		 * NOTE(review): pfind()/zpfind() presumably return the
 		 * process locked, matching this unlock.
 		 */
 		PROC_UNLOCK(p);
 		return (error);
 	}
 	switch (uap->pid) {
 	case -1:		/* broadcast signal */
 		return (killpg1(td, uap->signum, 0, 1));
 	case 0:			/* signal own process group */
 		return (killpg1(td, uap->signum, 0, 0));
 	default:		/* negative explicit process group */
 		return (killpg1(td, uap->signum, -uap->pid, 0));
 	}
 	/* NOTREACHED */
 }
 
 #ifdef COMPAT_43
 #ifndef _SYS_SYSPROTO_H_
 struct okillpg_args {
 	int	pgid;
 	int	signum;
 };
 #endif
 /*
  * Compatibility killpg entry point (built only with COMPAT_43): post
  * uap->signum to process group uap->pgid through killpg1().
  * Returns EINVAL for an out-of-range signal number.
  */
 /* ARGSUSED */
 int
 okillpg(struct thread *td, struct okillpg_args *uap)
 {
 
 	AUDIT_ARG(signum, uap->signum);
 	AUDIT_ARG(pid, uap->pgid);
 	if ((u_int)uap->signum <= _SIG_MAXSIG)
 		return (killpg1(td, uap->signum, uap->pgid, 0));
 	return (EINVAL);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigqueue_args {
 	pid_t pid;
 	int signum;
 	/* union sigval */ void *value;
 };
 #endif
 int
 sigqueue(struct thread *td, struct sigqueue_args *uap)
 {
 	ksiginfo_t ksi;
 	struct proc *p;
 	int error;
 
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	/*
 	 * Specification says sigqueue can only send signal to
 	 * single process.
 	 */
 	if (uap->pid <= 0)
 		return (EINVAL);
 
 	if ((p = pfind(uap->pid)) == NULL) {
 		if ((p = zpfind(uap->pid)) == NULL)
 			return (ESRCH);
 	}
 	error = p_cansignal(td, p, uap->signum);
 	if (error == 0 && uap->signum != 0) {
 		ksiginfo_init(&ksi);
 		ksi.ksi_signo = uap->signum;
 		ksi.ksi_code = SI_QUEUE;
 		ksi.ksi_pid = td->td_proc->p_pid;
 		ksi.ksi_uid = td->td_ucred->cr_ruid;
 		ksi.ksi_value.sival_ptr = uap->value;
 		error = tdsignal(p, NULL, ksi.ksi_signo, &ksi);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Post sig to every member of the process group identified by pgid.
  * A pgid of 0, or one that matches no group, is silently ignored.
  */
 void
 gsignal(int pgid, int sig)
 {
 	struct pgrp *grp;
 
 	if (pgid == 0)
 		return;
 
 	sx_slock(&proctree_lock);
 	grp = pgfind(pgid);
 	sx_sunlock(&proctree_lock);
 	if (grp == NULL)
 		return;
 
 	pgsignal(grp, sig, 0);
 	PGRP_UNLOCK(grp);
 }
 
 /*
  * Send a signal to a process group.  If checktty is 1,
  * limit to members which have a controlling terminal.
  */
 void
 pgsignal(pgrp, sig, checkctty)
 	struct pgrp *pgrp;
 	int sig, checkctty;
 {
 	register struct proc *p;
 
 	if (pgrp) {
 		PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (checkctty == 0 || p->p_flag & P_CONTROLT)
 				psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 	}
 }
 
 /*
  * Send a signal caused by a trap to the current thread.  If it will be
  * caught immediately, deliver it with correct code.  Otherwise, post it
  * normally.
  *
  * The signal number and code are read from ksi->ksi_signo and
  * ksi->ksi_code.  The process lock is taken here and dropped again on
  * every path that returns.
  */
 void
 trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	struct proc *p;
 #ifdef KSE
 	int error;
 #endif
 	int sig;
 	int code;
 
 	p = td->td_proc;
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 #ifdef KSE
 	if (td->td_pflags & TDP_SA) {
 		if (td->td_mailbox == NULL)
 			thread_user_enter(td);
 		PROC_LOCK(p);
 		/* Unmask the signal for TDP_SA threads before testing it below. */
 		SIGDELSET(td->td_sigmask, sig);
 		thread_lock(td);
 		/*
 		 * Force scheduling an upcall, so UTS has chance to
 		 * process the signal before thread runs again in
 		 * userland.
 		 */
 		if (td->td_upcall)
 			td->td_upcall->ku_flags |= KUF_DOUPCALL;
 		thread_unlock(td);
 	} else {
 		PROC_LOCK(p);
 	}
 #else
 	PROC_LOCK(p);
 #endif
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/*
 	 * Direct delivery is only possible when the process is not traced,
 	 * the signal is caught, and this thread does not have it masked.
 	 */
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
 		td->td_ru.ru_nsignals++;
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
 		/*
 		 * Hand the signal to the ABI's sendsig routine.  With KSE this
 		 * is the first arm of an if/else-if/else chain whose remaining
 		 * arms are in the next #ifdef KSE section.
 		 */
 #ifdef KSE
 		if (!(td->td_pflags & TDP_SA))
 			(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
 				ksi, &td->td_sigmask);
 #else
 		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
 				ksi, &td->td_sigmask);
 #endif
 #ifdef KSE
 		else if (td->td_mailbox == NULL) {
 			mtx_unlock(&ps->ps_mtx);
 			/* UTS caused a sync signal */
 			p->p_code = code;	/* XXX for core dump/debugger */
 			p->p_sig = sig;		/* XXX to verify code */
 			sigexit(td, sig);
 		} else {
 			mtx_unlock(&ps->ps_mtx);
 			SIGADDSET(td->td_sigmask, sig);
 			/* Drop the process lock around the copyout(). */
 			PROC_UNLOCK(p);
 			error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
 			    sizeof(siginfo_t));
 			PROC_LOCK(p);
 			/* UTS memory corrupted */
 			if (error)
 				sigexit(td, SIGSEGV);
 			mtx_lock(&ps->ps_mtx);
 		}
 #endif
 		/*
 		 * Block the handler's catch mask, plus the signal itself
 		 * unless it is marked in ps_signodefer.
 		 */
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 * The disposition is reset to SIG_DFL after this
 			 * delivery.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 	} else {
 		/*
 		 * Avoid a possible infinite loop if the thread
 		 * masking the signal or process is ignoring the
 		 * signal.  When kern_forcesigexit is set, the signal is
 		 * unmasked and its disposition forced back to SIG_DFL.
 		 */
 		if (kern_forcesigexit &&
 		    (SIGISMEMBER(td->td_sigmask, sig) ||
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
 			SIGDELSET(td->td_sigmask, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 			SIGDELSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_code = code;	/* XXX for core dump/debugger */
 		p->p_sig = sig;		/* XXX to verify code */
 		/* Post it through the normal path, directed at this thread. */
 		tdsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * Choose the thread in process p that should receive signal sig.
  *
  * Preference order: the current thread if it belongs to p and has sig
  * unmasked, then the first thread in p with sig unmasked, and finally
  * the first thread of p when every thread masks the signal.  The prop
  * argument is not used here.  The process lock must be held.
  */
 static struct thread *
 sigtd(struct proc *p, int sig, int prop)
 {
 	struct thread *cand, *chosen;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * If the current thread can take the signal there is no need to
 	 * switch context to another thread.
 	 */
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 
 	PROC_SLOCK(p);
 	chosen = NULL;
 	FOREACH_THREAD_IN_PROC(p, cand) {
 		if (SIGISMEMBER(cand->td_sigmask, sig))
 			continue;
 		chosen = cand;
 		break;
 	}
 	/* Everybody masks it: fall back to the first thread. */
 	if (chosen == NULL)
 		chosen = FIRST_THREAD_IN_PROC(p);
 	PROC_SUNLOCK(p);
 	return (chosen);
 }
 
 /*
  * Send the signal to the process.  If the signal has an action, the action
  * is usually performed by the target process rather than the caller; we add
  * the signal to the set of pending signals for the process.
  *
  * Exceptions:
  *   o When a stop signal is sent to a sleeping process that takes the
  *     default action, the process is stopped without awakening it.
  *   o SIGCONT restarts stopped processes (or puts them back to sleep)
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
  *
  * NB: This function may be entered from the debugger via the "kill" DDB
  * command.  There is little that can be done to mitigate the possibly messy
  * side effects of this unwise possibility.
  *
  * This is a thin wrapper: it calls tdsignal() with no target thread and
  * no siginfo, and discards the return value.
  */
 void
 psignal(struct proc *p, int sig)
 {
 	(void) tdsignal(p, NULL, sig, NULL);
 }
 
 /*
  * Post the signal described by a sigevent to process p, using the
  * caller-supplied ksi, which must not already be on a queue.
  *
  * The signal number and value are copied from sigev into ksi; ksi_code
  * and the other fields must be filled in by the caller beforehand.
  * For SIGEV_THREAD_ID the signal is directed at the named thread and
  * ESRCH is returned if thread_find() cannot locate it.  Otherwise the
  * result of tdsignal() is returned.  The process lock must be held.
  */
 int
 psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
 {
 	struct thread *target;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue"));
 
 	ksi->ksi_signo = sigev->sigev_signo;
 	ksi->ksi_value = sigev->sigev_value;
 
 	if (sigev->sigev_notify != SIGEV_THREAD_ID) {
 		/* Let tdsignal() pick the receiving thread. */
 		target = NULL;
 	} else {
 		target = thread_find(p, sigev->sigev_notify_thread_id);
 		if (target == NULL)
 			return (ESRCH);
 	}
 	return (tdsignal(p, target, ksi->ksi_signo, ksi));
 }
 
 /*
  * Post signal sig to process p, optionally directed at thread td.
  *
  * If td is NULL a receiving thread is chosen with sigtd().  ksi, when
  * non-NULL, carries the siginfo to queue with the signal and must not
  * already be on a queue.  The process lock must be held.
  *
  * Returns 0, or the non-zero result of sigqueue_add() if queueing fails.
  *
  * With KSE, tdsignal() is a wrapper around do_tdsignal(): for P_SA
  * processes it compares the process pending set before and after the
  * call and, if it changed and P_SIGEVENT is not yet set, sets
  * P_SIGEVENT and wakes up sleepers on &p->p_siglist.  Without KSE the
  * body below is tdsignal() itself.
  */
 int
 tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 #ifdef KSE
 	sigset_t saved;
 	int ret;
 
 	if (p->p_flag & P_SA)
 		saved = p->p_sigqueue.sq_signals;
 	ret = do_tdsignal(p, td, sig, ksi);
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
 		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
 	return (ret);
 }
 
 static int
 do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 #endif
 	sig_t action;
 	sigqueue_t *sigqueue;
 	int prop;
 	struct sigacts *ps;
 	int intrval;
 	int ret = 0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
 #ifdef KSE
 		panic("do_tdsignal(): invalid signal %d", sig);
 #else
 		panic("tdsignal(): invalid signal %d", sig);
 #endif
 
 #ifdef KSE
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue"));
 #else
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue"));
 #endif
 
 	/*
 	 * IEEE Std 1003.1-2001: return success when killing a zombie.
 	 * A ksi flagged KSI_INS is released with ksiginfo_tryfree() on
 	 * this and the other discard paths below.
 	 */
 	if (p->p_state == PRS_ZOMBIE) {
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 
 	ps = p->p_sigacts;
 	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
 	 * If the signal is blocked and not destined for this thread, then
 	 * assign it to the process so that we can find it later in the first
 	 * thread that unblocks it.  Otherwise, assign it to this thread now.
 	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
 		if (SIGISMEMBER(td->td_sigmask, sig))
 			sigqueue = &p->p_sigqueue;
 		else
 			sigqueue = &td->td_sigqueue;
 	} else {
 		KASSERT(td->td_proc == p, ("invalid thread"));
 		sigqueue = &td->td_sigqueue;
 	}
 
 	/*
 	 * If the signal is being ignored,
 	 * then we forget about it immediately.
 	 * (Note: we don't set SIGCONT in ps_sigignore,
 	 * and if it is set to SIG_IGN,
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
 	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		mtx_unlock(&ps->ps_mtx);
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 	/* Classify the disposition as held (masked), caught or default. */
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
 	else if (SIGISMEMBER(ps->ps_sigcatch, sig))
 		action = SIG_CATCH;
 	else
 		action = SIG_DFL;
 	/* Value used when an interruptible sleep is aborted for this signal. */
 	if (SIGISMEMBER(ps->ps_sigintr, sig))
 		intrval = EINTR;
 	else
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
 	/*
 	 * A continue signal cancels pending stop signals; a stop signal
 	 * cancels a pending SIGCONT.
 	 */
 	if (prop & SA_CONT)
 		sigqueue_delete_stopmask_proc(p);
 	else if (prop & SA_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
 		 * is default; don't stop the process below if sleeping,
 		 * and don't clear any pending SIGCONT.
 		 */
 		if ((prop & SA_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
 		    (action == SIG_DFL)) {
 			if (ksi && (ksi->ksi_flags & KSI_INS))
 				ksiginfo_tryfree(ksi);
 			return (ret);
 		}
 		PROC_SLOCK(p);
 		sigqueue_delete_proc(p, SIGCONT);
 		PROC_SUNLOCK(p);
 		if (p->p_flag & P_CONTINUED) {
 			p->p_flag &= ~P_CONTINUED;
 			PROC_LOCK(p->p_pptr);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(p->p_pptr);
 		}
 	}
 
 	/* Queue the signal; a queueing failure is returned to the caller. */
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
 		return (ret);
 	/*
 	 * SIGKILL: Remove procfs STOPEVENTs.
 	 */
 	if (sig == SIGKILL) {
 		/* from procfs_ioctl.c: PIOCBIC */
 		p->p_stops = 0;
 		/* from procfs_ioctl.c: PIOCCONT */
 		p->p_step = 0;
 		wakeup(&p->p_step);
 	}
 	/*
 	 * Some signals have a process-wide effect and a per-thread
 	 * component.  Most processing occurs when the process next
 	 * tries to cross the user boundary, however there are some
 	 * times when processing needs to be done immediately, such as
 	 * waking up threads so that they can cross the user boundary.
 	 * We try do the per-process part here.
 	 *
 	 * From here on the process spin lock is held; every path to
 	 * "out" must have released it.
 	 */
 	PROC_SLOCK(p);
 	if (P_SHOULDSTOP(p)) {
 		/*
 		 * The process is in stopped mode. All the threads should be
 		 * either winding down or already on the suspended queue.
 		 */
 		if (p->p_flag & P_TRACED) {
 			/*
 			 * The traced process is already stopped,
 			 * so no further action is necessary.
 			 * No signal can restart us.
 			 */
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (sig == SIGKILL) {
 			/*
 			 * SIGKILL sets process running.
 			 * It will die elsewhere.
 			 * All threads must be restarted.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			goto runfast;
 		}
 
 		if (prop & SA_CONT) {
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
 			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
 			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * All threads are suspended: report the
 				 * continue to the parent.  The spin lock is
 				 * dropped while the parent is locked.
 				 */
 				PROC_SUNLOCK(p);
 				p->p_flag |= P_CONTINUED;
 				p->p_xstat = SIGCONT;
 				PROC_LOCK(p->p_pptr);
 				childproc_continued(p);
 				PROC_UNLOCK(p->p_pptr);
 				PROC_SLOCK(p);
 			}
 			if (action == SIG_DFL) {
 				thread_unsuspend(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete(sigqueue, sig);
 				goto out;
 			}
 			if (action == SIG_CATCH) {
 #ifdef KSE
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 * It would seem that the answer would be to
 				 * run an upcall in the next KSE to run, and
 				 * deliver the signal that way. In a NON KSE
 				 * process, we need to make sure that the
 				 * single thread is runnable asap.
 				 * XXXKSE for now however, make them all run.
 				 */
 #endif
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 */
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (prop & SA_STOP) {
 			/*
 			 * Already stopped, don't need to stop again
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
 			PROC_SUNLOCK(p);
 			p->p_flag |= P_STOPPED_SIG;
 			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
 		/*
 		 * All other kinds of signals:
 		 * If a thread is sleeping interruptibly, simulate a
 		 * wakeup so that when it is continued it will be made
 		 * runnable and can look at the signal.  However, don't make
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
 		thread_lock(td);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			sleepq_abort(td, intrval);
 		thread_unlock(td);
 		PROC_SUNLOCK(p);
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
 		 * hit thread_suspend_check() soon.
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		/* Traced or caught: just wake the chosen thread. */
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
 			thread_lock(td);
 			tdsigwakeup(td, sig, action, intrval);
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		MPASS(action == SIG_DFL);
 
 		if (prop & SA_STOP) {
 			/* No stop is performed while P_PPWAIT is set. */
 			if (p->p_flag & P_PPWAIT) {
 				PROC_SUNLOCK(p);
 				goto out;
 			}
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xstat = sig;
 			sig_suspend_threads(td, p, 1);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * only thread sending signal to another
 				 * process can reach here, if thread is sending
 				 * signal to its process, because thread does
 				 * not suspend itself here, p_numthreads
 				 * should never be equal to p_suspcount.
 				 */
 				thread_stopped(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete_proc(p, p->p_xstat);
 			} else
 				PROC_SUNLOCK(p);
 			goto out;
 		} 
 		else
 			goto runfast;
 		/* NOTREACHED */
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
 		PROC_SUNLOCK(p);
 		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
 	/*
 	 * The process is not stopped so we need to apply the signal to all the
 	 * running threads.
 	 */
 
 runfast:
 	/* Entered with the process spin lock held; it is released here. */
 	thread_lock(td);
 	tdsigwakeup(td, sig, action, intrval);
 	thread_unlock(td);
 	thread_unsuspend(p);
 	PROC_SUNLOCK(p);
 out:
 	/* If we jump here, proc slock should not be owned. */
 	PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
 	return (ret);
 }
 
 /*
  * The force of a signal has been directed against a single
  * thread.  We need to see what we can do about knocking it
  * out of any sleep it may be in etc.
  *
  * action is the disposition computed by the caller and intrval is the
  * value passed to sleepq_abort() when an interruptible sleep is broken.
  * Called with the process lock, the process spin lock and the thread
  * lock all held (asserted below); they are held again on return, though
  * the spin lock and thread lock are dropped briefly on one path.
  */
 static void
 tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
 {
 	struct proc *p = td->td_proc;
 	register int prop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	prop = sigprop(sig);
 
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.
 	 */
 	if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
 		sched_prio(td, PUSER);
 
 	if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If thread is sleeping uninterruptibly
 		 * we can't interrupt the sleep... the signal will
 		 * be noticed when the process returns through
 		 * trap() or syscall().
 		 */
 		if ((td->td_flags & TDF_SINTR) == 0)
 			return;
 		/*
 		 * If SIGCONT is default (or ignored) and process is
 		 * asleep, we are finished; the process should not
 		 * be awakened.
 		 */
 		if ((prop & SA_CONT) && action == SIG_DFL) {
 			/*
 			 * The thread lock and spin lock are released around
 			 * the queue deletions and retaken before returning.
 			 */
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
 			sigqueue_delete(&td->td_sigqueue, sig);
 			PROC_SLOCK(p);
 			thread_lock(td);
 			return;
 		}
 
 		/*
 		 * Give low priority threads a better chance to run.
 		 */
 		if (td->td_priority > PUSER)
 			sched_prio(td, PUSER);
 
 		/* Break the interruptible sleep with the caller's value. */
 		sleepq_abort(td, intrval);
 	} else {
 		/*
 		 * Other states do nothing with the signal immediately,
 		 * other than kicking ourselves if we are running.
 		 * It will either never be noticed, or noticed very soon.
 		 */
 #ifdef SMP
 		if (TD_IS_RUNNING(td) && td != curthread)
 			forward_signal(td);
 #endif
 	}
 }
 
 /*
  * Push every thread of process p towards suspension.
  *
  * Threads that are sleeping or swapped out in an interruptible sleep
  * and are not yet suspended are suspended on the spot.  All others get
  * TDF_ASTPENDING set (td itself only when "sending" is non-zero) and,
  * on SMP, a forward_signal() if they are running and are not td.
  * Both the process lock and the process spin lock must be held.
  */
 static void
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *other;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, other) {
 		thread_lock(other);
 		if ((TD_IS_SLEEPING(other) || TD_IS_SWAPPED(other)) &&
 		    (other->td_flags & TDF_SINTR) &&
 		    !TD_IS_SUSPENDED(other)) {
 			/* Interruptibly asleep: suspend it right away. */
 			thread_suspend_one(other);
 			thread_unlock(other);
 			continue;
 		}
 		/* Otherwise flag an AST for it. */
 		if (sending || td != other)
 			other->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
 		if (TD_IS_RUNNING(other) && other != td)
 			forward_signal(other);
 #endif
 		thread_unlock(other);
 	}
 }
 
 /*
  * Stop thread td because signal sig arrived while its process is traced.
  *
  * TDF_XSIG is set on the thread and sig is recorded in td->td_xsig; the
  * thread then suspends itself for as long as the process is traced and
  * TDF_XSIG remains set.  The process lock must be held on entry.
  *
  * Returns sig unchanged if P_SINGLE_EXIT is seen before suspending,
  * otherwise whatever td->td_xsig holds once the loop ends (presumably
  * the debugger may have replaced it while the thread was stopped --
  * confirm against the ptrace code).
  */
 int
 ptracestop(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 	    &p->p_mtx.lock_object, "Stopping for traced signal");
 
 	thread_lock(td);
 	td->td_flags |= TDF_XSIG;
 	thread_unlock(td);
 	td->td_xsig = sig;
 	PROC_SLOCK(p);
 	while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) {
 		if (p->p_flag & P_SINGLE_EXIT) {
 			/* The process is exiting: do not stop, hand sig back. */
 			thread_lock(td);
 			td->td_flags &= ~TDF_XSIG;
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			return (sig);
 		}
 		/*
 		 * Just make wait() to work, the last stopped thread
 		 * will win.
 		 */
 		p->p_xstat = sig;
 		p->p_xthread = td;
 		p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
 		sig_suspend_threads(td, p, 0);
 stopme:
 		thread_suspend_switch(td);
 		if (!(p->p_flag & P_TRACED)) {
 			break;
 		}
 		/*
 		 * While TDF_DBSUSPEND is set, suspend again without redoing
 		 * the bookkeeping above, unless the process is exiting.
 		 */
 		if (td->td_flags & TDF_DBSUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
 			goto stopme;
 		}
 	}
 	PROC_SUNLOCK(p);
 	return (td->td_xsig);
 }
 
 /*
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
  * a syscall or trap (though this can usually be done without calling issignal
  * by checking the pending signal masks in cursig.) The normal call
  * sequence is
  *
  *	while (sig = cursig(curthread))
  *		postsig(sig);
  *
  * Returns 0 when no deliverable signal is pending for td.  Called with
  * the process lock and ps_mtx held (asserted below); ps_mtx is dropped
  * and retaken around stopevent(), ptracestop() and the default stop
  * action.
  */
 static int
 issignal(td)
 	struct thread *td;
 {
 	struct proc *p;
 	struct sigacts *ps;
 	sigset_t sigpending;
 	int sig, prop, newsig;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	for (;;) {
 		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		/* Pending signals of this thread that it does not mask. */
 		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		/* Stop signals are not considered while P_PPWAIT is set. */
 		if (p->p_flag & P_PPWAIT)
 			SIG_STOPSIGMASK(sigpending);
 		if (SIGISEMPTY(sigpending))	/* no signal to send */
 			return (0);
 		sig = sig_ffs(&sigpending);
 
 		if (p->p_stops & S_SIG) {
 			mtx_unlock(&ps->ps_mtx);
 			stopevent(p, S_SIG, sig);
 			mtx_lock(&ps->ps_mtx);
 		}
 
 		/*
 		 * We should see pending but ignored signals
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
 #endif
 			continue;
 		}
 		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
 			/*
 			 * If traced, always stop.
 			 */
 			mtx_unlock(&ps->ps_mtx);
 			newsig = ptracestop(td, sig);
 			mtx_lock(&ps->ps_mtx);
 
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
 
 #endif
 			if (sig != newsig) {
 				ksiginfo_t ksi;
 				/*
 				 * clear old signal.
 				 * XXX shrug off debugger, it causes siginfo to
 				 * be thrown away.
 				 */
 				sigqueue_get(&td->td_sigqueue, sig, &ksi);
 
 				/*
 				 * If parent wants us to take the signal,
 				 * then it will leave it in p->p_xstat;
 				 * otherwise we just look for signals again.
 				 */
 				if (newsig == 0)
 					continue;
 				sig = newsig;
 
 				/*
 				 * Put the new signal into td_sigqueue. If the
 				 * signal is being masked, look for other signals.
 				 */
 				SIGADDSET(td->td_sigqueue.sq_signals, sig);
 #ifdef KSE
 				if (td->td_pflags & TDP_SA)
 					SIGDELSET(td->td_sigmask, sig);
 #endif
 				if (SIGISMEMBER(td->td_sigmask, sig))
 					continue;
 				signotify(td);
 			}
 
 			/*
 			 * If the traced bit got turned off, go back up
 			 * to the top to rescan signals.  This ensures
 			 * that p_sig* and p_sigact are consistent.
 			 */
 			if ((p->p_flag & P_TRACED) == 0)
 				continue;
 		}
 
 		prop = sigprop(sig);
 
 		/*
 		 * Decide whether the signal should be returned.
 		 * Return the signal's number, or fall through
 		 * to clear it from the pending mask.
 		 */
 		switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
 
 		case (intptr_t)SIG_DFL:
 			/*
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
 #ifdef DIAGNOSTIC
 				/*
 				 * Are you sure you want to ignore SIGSEGV
 				 * in init? XXX
 				 */
 				printf("Process (pid %lu) got signal %d\n",
 					(u_long)p->p_pid, sig);
 #endif
 				break;		/* == ignore */
 			}
 			/*
 			 * If there is a pending stop signal to process
 			 * with default action, stop here,
 			 * then clear the signal.  However,
 			 * if process is member of an orphaned
 			 * process group, ignore tty stop signals.
 			 */
 			if (prop & SA_STOP) {
 				if (p->p_flag & P_TRACED ||
 		    		    (p->p_pgrp->pg_jobc == 0 &&
 				     prop & SA_TTYSTOP))
 					break;	/* == ignore */
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 				    &p->p_mtx.lock_object, "Catching SIGSTOP");
 				/* Mark the process stopped and suspend here. */
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xstat = sig;
 				PROC_SLOCK(p);
 				sig_suspend_threads(td, p, 0);
 				thread_suspend_switch(td);
 				PROC_SUNLOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				break;
 			} else if (prop & SA_IGNORE) {
 				/*
 				 * Except for SIGCONT, shouldn't get here.
 				 * Default action is to ignore; drop it.
 				 */
 				break;		/* == ignore */
 			} else
 				return (sig);
 			/*NOTREACHED*/
 
 		case (intptr_t)SIG_IGN:
 			/*
 			 * Masking above should prevent us ever trying
 			 * to take action on an ignored signal other
 			 * than SIGCONT, unless process is traced.
 			 */
 			if ((prop & SA_CONT) == 0 &&
 			    (p->p_flag & P_TRACED) == 0)
 				printf("issignal\n");
 			break;		/* == ignore */
 
 		default:
 			/*
 			 * This signal has an action, let
 			 * postsig() process it.
 			 */
 			return (sig);
 		}
 		/* Reached by the "ignore" breaks above: drop it and rescan. */
 		sigqueue_delete(&td->td_sigqueue, sig);		/* take the signal! */
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Tell the parent that process p has stopped, but only once
  * P_STOPPED_SIG is set and every thread is accounted for as suspended.
  * When p is the current process the calling thread is counted too,
  * even though it is not in p_suspcount.
  *
  * Called with the process lock and process spin lock held.  The spin
  * lock is dropped while the parent is notified and retaken before
  * returning.
  */
 void
 thread_stopped(struct proc *p)
 {
 	int stopped;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	stopped = p->p_suspcount;
 	if (p == curproc)
 		stopped++;
 	if ((p->p_flag & P_STOPPED_SIG) == 0 || stopped != p->p_numthreads)
 		return;
 
 	PROC_SUNLOCK(p);
 	p->p_flag &= ~P_WAITED;
 	PROC_LOCK(p->p_pptr);
 	childproc_stopped(p,
 	    (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED);
 	PROC_UNLOCK(p->p_pptr);
 	PROC_SLOCK(p);
 }
  
 /*
  * Take the action for the specified signal
  * from the current set of pending signals.
  *
  * Operates on curthread.  The signal's queued siginfo is pulled from the
  * thread's queue; a SIG_DFL disposition terminates the process through
  * sigexit(), any other disposition is delivered through the ABI's
  * sv_sendsig routine (or thread_signal_add() for TDP_SA threads under
  * KSE).  The process lock and ps_mtx must be held (asserted below).
  */
 void
 postsig(sig)
 	register int sig;
 {
 	struct thread *td = curthread;
 	register struct proc *p = td->td_proc;
 	struct sigacts *ps;
 	sig_t action;
 	ksiginfo_t ksi;
 	sigset_t returnmask;
 	int code;
 
 	KASSERT(sig != 0, ("postsig"));
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	/* Dequeue the siginfo for this signal from the thread's queue. */
 	ksiginfo_init(&ksi);
 	sigqueue_get(&td->td_sigqueue, sig, &ksi);
 	ksi.ksi_signo = sig;
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
 		ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
 		    &td->td_oldsigmask : &td->td_sigmask, 0);
 #endif
 	if (p->p_stops & S_SIG) {
 		mtx_unlock(&ps->ps_mtx);
 		stopevent(p, S_SIG, sig);
 		mtx_lock(&ps->ps_mtx);
 	}
 
 	/* Both variants below open the same if block; only the test differs. */
 #ifdef KSE
 	if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) {
 #else
 	if (action == SIG_DFL) {
 #endif
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
 #ifdef KSE
 		if (td->td_pflags & TDP_SA) {
 			if (sig == SIGKILL) {
 				mtx_unlock(&ps->ps_mtx);
 				sigexit(td, sig);
 			}
 		}
 
 #endif
 		/*
 		 * If we get here, the signal must be caught.
 		 */
 		KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
 		    ("postsig action"));
 		/*
 		 * Set the new mask value and also defer further
 		 * occurrences of this signal.
 		 *
 		 * Special case: user has done a sigsuspend.  Here the
 		 * current mask is not of interest, but rather the
 		 * mask from before the sigsuspend is what we want
 		 * restored after the signal processing is completed.
 		 */
 		if (td->td_pflags & TDP_OLDMASK) {
 			returnmask = td->td_oldsigmask;
 			td->td_pflags &= ~TDP_OLDMASK;
 		} else
 			returnmask = td->td_sigmask;
 
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
 
 		if (SIGISMEMBER(ps->ps_sigreset, sig)) {
 			/*
 			 * See kern_sigaction() for origin of this code.
 			 */
 			SIGDELSET(ps->ps_sigcatch, sig);
 			if (sig != SIGCONT &&
 			    sigprop(sig) & SA_IGNORE)
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		td->td_ru.ru_nsignals++;
 		/*
 		 * Consume the saved p_code/p_sig pair if it belongs to this
 		 * signal.  Note that "code" is computed here but not passed
 		 * on below; the handler receives ksi instead.
 		 */
 		if (p->p_sig != sig) {
 			code = 0;
 		} else {
 			code = p->p_code;
 			p->p_code = 0;
 			p->p_sig = 0;
 		}
 #ifdef KSE
 		if (td->td_pflags & TDP_SA)
 			thread_signal_add(curthread, &ksi);
 		else
 			(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 #else
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 #endif
 	}
 }
 
 /*
  * Kill process p for the stated reason: trace the event, log pid,
  * command name, uid and the "why" text at LOG_ERR, then post SIGKILL.
  * The process lock must be held.
  */
 void
 killproc(struct proc *p, char *why)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
 	    p, p->p_pid, p->p_comm);
 
 	/* A uid of -1 is logged when the process has no credentials. */
 	log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n",
 	    p->p_pid, p->p_comm,
 	    p->p_ucred ? p->p_ucred->cr_uid : -1,
 	    why);
 
 	psignal(p, SIGKILL);
 }
 
 /*
  * Terminate the current process because of signal sig, writing a core
  * file first when the signal calls for one.  The usual masked/caught
  * tests are bypassed so that unrecoverable failures can end the process
  * without touching its signal state.  The accounting record is flagged
  * with AXSIG and, when dumping, the signal number is saved in p_sig for
  * the debugger.  Ends in exit1() and does not return.
  *
  * The process lock must be held on entry.
  */
 void
 sigexit(struct thread *td, int sig)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_acflag |= AXSIG;
 
 	/*
 	 * A core dump requires single-threading: that keeps the registers
 	 * in the core file current, and the ELF dump handler relies on the
 	 * thread list not changing underneath it.
 	 *
 	 * XXX If another thread attempts to single-thread before us
 	 *     (e.g. via fork()), we won't get a dump at all.
 	 */
 	if ((sigprop(sig) & SA_CORE) == 0 ||
 	    thread_single(SINGLE_NO_EXIT) != 0) {
 		/* No dump: just release the process lock before exiting. */
 		PROC_UNLOCK(p);
 	} else {
 		p->p_sig = sig;
 		/*
 		 * Note that coredump() drops proc lock.  A successful dump
 		 * is recorded by setting WCOREFLAG in the exit status.
 		 */
 		if (coredump(td) == 0)
 			sig |= WCOREFLAG;
 		/*
 		 * Log signals which would cause core dumps (at LOG_INFO,
 		 * to appease those who don't want these messages).
 		 * XXX : Todo, as well as euid, write out ruid too
 		 */
 		if (kern_logsigexit)
 			log(LOG_INFO,
 			    "pid %d (%s), uid %d: exited on signal %d%s\n",
 			    p->p_pid, p->p_comm,
 			    td->td_ucred ? td->td_ucred->cr_uid : -1,
 			    sig &~ WCOREFLAG,
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	}
 	exit1(td, W_EXITCODE(0, sig));
 	/* NOTREACHED */
 }
 
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
 static void
 sigparent(struct proc *p, int reason, int status)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	if (p->p_ksi != NULL) {
 		p->p_ksi->ksi_signo  = SIGCHLD;
 		p->p_ksi->ksi_code   = reason;
 		p->p_ksi->ksi_status = status;
 		p->p_ksi->ksi_pid    = p->p_pid;
 		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
 		if (KSI_ONQ(p->p_ksi))
 			return;
 	}
 	tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi);
 }
 
 /*
  * Tell the parent of p about a job-control state change in the child.
  *
  * The parent is flagged P_STATCHILD and woken, and, unless it has
  * PS_NOCLDSTOP set, it is also sent SIGCHLD with the given reason and
  * status.  Both the child's and the parent's process locks must be
  * held.
  */
 static void
 childproc_jobstate(struct proc *p, int reason, int status)
 {
 	struct proc *parent;
 	struct sigacts *pacts;
 	int notify;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	/*
 	 * Wake up a parent sleeping in kern_wait().  SIGCHLD is sent as
 	 * well, but on its own it does not guarantee that the parent will
 	 * wake, because the parent may have masked the signal.
 	 */
 	parent = p->p_pptr;
 	parent->p_flag |= P_STATCHILD;
 	wakeup(parent);
 
 	/* Sample the parent's flag under ps_mtx; signal after dropping it. */
 	pacts = parent->p_sigacts;
 	mtx_lock(&pacts->ps_mtx);
 	notify = (pacts->ps_flag & PS_NOCLDSTOP) == 0;
 	mtx_unlock(&pacts->ps_mtx);
 	if (notify)
 		sigparent(p, reason, status);
 }
 
 /*
  * Report to the parent that child p has stopped.  reason is passed
  * through to childproc_jobstate() (thread_stopped() supplies CLD_TRAPPED
  * or CLD_STOPPED); the status reported is p->p_xstat.
  */
 void
 childproc_stopped(struct proc *p, int reason)
 {
 	childproc_jobstate(p, reason, p->p_xstat);
 }
 
 /*
  * Report to the parent that child p has been continued: forwards to
  * childproc_jobstate() with reason CLD_CONTINUED and status SIGCONT.
  */
 void
 childproc_continued(struct proc *p)
 {
 	childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
 }
 
 /*
  * Send SIGCHLD to the parent of p for the child's exit.  The reason is
  * derived from the exit status in p->p_xstat: CLD_DUMPED if a core was
  * written, CLD_KILLED if it died from a signal, CLD_EXITED otherwise.
  */
 void
 childproc_exited(struct proc *p)
 {
 	int reason, status;
 
 	status = p->p_xstat;	/* convert to int */
 	if (WCOREDUMP(status))
 		reason = CLD_DUMPED;
 	else if (WIFSIGNALED(status))
 		reason = CLD_KILLED;
 	else
 		reason = CLD_EXITED;
 
 	/*
 	 * XXX avoid calling wakeup(p->p_pptr), the work is
 	 * done in exit1().
 	 */
 	sigparent(p, reason, status);
 }
 
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
 	      sizeof(corefilename), "process corefile name format string");
 
 /*
  * expand_name(name, uid, pid)
  * Expand the name described in corefilename, using name, uid, and pid.
  * corefilename is a printf-like string, with three format specifiers:
  *	%N	name of process ("name")
  *	%P	process id (pid)
  *	%U	user id (uid)
  * For example, "%N.core" is the default; they can be disabled completely
  * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
  * This is controlled by the sysctl variable kern.corefile (see above).
  */
 
 /*
  * Build the core file name from the kern.corefile format string.
  *
  * Returns a NUL-terminated string of at most MAXPATHLEN - 1 characters,
  * allocated from M_TEMP (the caller frees it), or NULL if the allocation
  * fails or the expanded name does not fit.  Unknown format characters
  * and a lone '%' at the end of the format are logged and expand to
  * nothing.
  */
 static char *
 expand_name(const char *name, uid_t uid, pid_t pid)
 {
 	const char *format, *appendstr;
 	char *temp;
 	char buf[11];		/* "%u" of a 32-bit id: 10 digits + NUL */
 	size_t i, l, n;
 
 	format = corefilename;
 	/* M_ZERO: the result is NUL-terminated as long as n < MAXPATHLEN. */
 	temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
 	if (temp == NULL)
 		return (NULL);
 	for (i = 0, n = 0; format[i] != '\0'; i++) {
 		if (format[i] != '%') {
 			/* Literal character; always keep room for the NUL. */
 			if (n + 1 >= MAXPATHLEN)
 				goto toolong;
 			temp[n++] = format[i];
 			continue;
 		}
 
 		/* Format character */
 		i++;
 		if (format[i] == '\0') {
 			/*
 			 * Lone '%' at the end of the format.  Stop here so
 			 * that the loop increment does not step past the
 			 * terminating NUL of the format string.
 			 */
 			log(LOG_ERR,
 			    "Incomplete format specifier at end of `%s'\n",
 			    format);
 			break;
 		}
 		switch (format[i]) {
 		case '%':
 			appendstr = "%";
 			break;
 		case 'N':	/* process name */
 			appendstr = name;
 			break;
 		case 'P':	/* process id */
 			snprintf(buf, sizeof(buf), "%u", pid);
 			appendstr = buf;
 			break;
 		case 'U':	/* user id */
 			snprintf(buf, sizeof(buf), "%u", uid);
 			appendstr = buf;
 			break;
 		default:
 			appendstr = "";
 			log(LOG_ERR,
 			    "Unknown format character %c in `%s'\n",
 			    format[i], format);
 			break;
 		}
 		l = strlen(appendstr);
 		if ((n + l) >= MAXPATHLEN)
 			goto toolong;
 		memcpy(temp + n, appendstr, l);
 		n += l;
 	}
 	return (temp);
 toolong:
 	log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too long\n",
 	    (long)pid, name, (u_long)uid);
 	free(temp, M_TEMP);
 	return (NULL);
 }
 
 /*
  * Dump a process' core.  The main routine does some
  * policy checking, and creates the name of the coredump;
  * then it passes on a vnode and a size limit to the process-specific
  * coredump routine if there is one; if there _is not_ one, it returns
  * ENOSYS; otherwise it returns the error from the process-specific routine.
  */
 
 /*
  * Called with the proc lock held; the lock is dropped on every return
  * path, including early failures.  Returns 0 on success or an errno.
  */
 static int
 coredump(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	register struct vnode *vp;
 	register struct ucred *cred = td->td_ucred;
 	struct flock lf;
 	struct nameidata nd;
 	struct vattr vattr;
 	int error, error1, flags, locked;
 	struct mount *mp;
 	char *name;			/* name of corefile */
 	off_t limit;
 	int vfslocked;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
 	_STOPEVENT(p, S_CORE, 0);
 
 	name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
 	if (name == NULL) {
 		/*
 		 * Drop the proc lock here as on every other path; the
 		 * caller relies on coredump() returning with it released.
 		 */
 		PROC_UNLOCK(p);
 #ifdef AUDIT
 		audit_proc_coredump(td, NULL, EINVAL);
 #endif
 		return (EINVAL);
 	}
 	if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) {
 		PROC_UNLOCK(p);
 #ifdef AUDIT
 		audit_proc_coredump(td, name, EFAULT);
 #endif
 		free(name, M_TEMP);
 		return (EFAULT);
 	}
 	
 	/*
 	 * Note that the bulk of limit checking is done after
 	 * the corefile is created.  The exception is if the limit
 	 * for corefiles is 0, in which case we don't bother
 	 * creating the corefile at all.  This layout means that
 	 * a corefile is truncated instead of not being created,
 	 * if it is larger than the limit.
 	 */
 	limit = (off_t)lim_cur(p, RLIMIT_CORE);
 	PROC_UNLOCK(p);
 	if (limit == 0) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, EFBIG);
 #endif
 		free(name, M_TEMP);
 		return (EFBIG);
 	}
 
 restart:
 	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
 	flags = O_CREAT | FWRITE | O_NOFOLLOW;
 	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, NULL);
 	if (error) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, error);
 #endif
 		free(name, M_TEMP);
 		return (error);
 	}
 	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/* Don't dump to non-regular files or files with links. */
 	if (vp->v_type != VREG ||
 	    VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) {
 		VOP_UNLOCK(vp, 0, td);
 		error = EFAULT;
 		goto close;
 	}
 
 	VOP_UNLOCK(vp, 0, td);
 	/* Best-effort advisory write lock over the whole file. */
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = F_WRLCK;
 	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
 
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		/*
 		 * Writes cannot start right now: undo the lock and the
 		 * open, wait until writing is allowed, then start over.
 		 */
 		lf.l_type = F_UNLCK;
 		if (locked)
 			VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 		if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
 			goto out;
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
 			goto out;
 		VFS_UNLOCK_GIANT(vfslocked);
 		goto restart;
 	}
 
 	/* Truncate the file and optionally mark it UF_NODUMP. */
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
 	if (set_core_nodump_flag)
 		vattr.va_flags = UF_NODUMP;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	VOP_LEASE(vp, td, cred, LEASE_WRITE);
 	VOP_SETATTR(vp, &vattr, cred, td);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
 
 	/* Hand off to the ABI-specific dump routine, if there is one. */
 	error = p->p_sysent->sv_coredump ?
 	  p->p_sysent->sv_coredump(td, vp, limit) :
 	  ENOSYS;
 
 	if (locked) {
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
 close:
 	/* Report the first error seen; a close failure counts too. */
 	error1 = vn_close(vp, FWRITE, cred, td);
 	if (error == 0)
 		error = error1;
 out:
 #ifdef AUDIT
 	audit_proc_coredump(td, name, error);
 #endif
 	free(name, M_TEMP);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 nosys(struct thread *td, struct nosys_args *args)
 {
 
 	/* Post SIGSYS to the calling process under its proc lock. */
 	PROC_LOCK(td->td_proc);
 	psignal(td->td_proc, SIGSYS);
 	PROC_UNLOCK(td->td_proc);
 
 	/* Also flag the error in case the signal is blocked or ignored. */
 	return (ENOSYS);
 }
 
 /*
  * Send a SIGIO or SIGURG signal to a process or process group using stored
  * credentials rather than those of the current process.
  */
 void
 pgsigio(sigiop, sig, checkctty)
 	struct sigio **sigiop;
 	int sig, checkctty;
 {
 	struct sigio *sigio;
 
 	/* *sigiop is read and used entirely under the SIGIO lock. */
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	if (sigio->sio_pgid > 0) {
 		/* Positive id: the target is the single process sio_proc. */
 		PROC_LOCK(sigio->sio_proc);
 		if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
 			psignal(sigio->sio_proc, sig);
 		PROC_UNLOCK(sigio->sio_proc);
 	} else if (sigio->sio_pgid < 0) {
 		struct proc *p;
 
 		/*
 		 * Negative id: the target is every member of sio_pgrp that
 		 * passes the credential check and, when checkctty is
 		 * non-zero, has P_CONTROLT set.
 		 */
 		PGRP_LOCK(sigio->sio_pgrp);
 		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
 			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
 				psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(sigio->sio_pgrp);
 	}
 	/* sio_pgid == 0: nothing to signal. */
 	SIGIO_UNLOCK();
 }
 
 /*
  * Attach a signal knote to the current process.  Always returns 0.
  */
 static int
 filt_sigattach(struct knote *kn)
 {
 	struct proc *self;
 
 	self = curproc;
 
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 	kn->kn_ptr.p_proc = self;
 	knlist_add(&self->p_klist, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach a signal knote from the process recorded in it at attach time.
  */
 static void
 filt_sigdetach(struct knote *kn)
 {
 
 	knlist_remove(&kn->kn_ptr.p_proc->p_klist, kn, 0);
 }
 
 /*
  * signal knotes are shared with proc knotes, so we apply a mask to 
  * the hint in order to differentiate them from process hints.  This
  * could be avoided by using a signal-specific knote list, but probably
  * isn't worth the trouble.
  */
 static int
 filt_signal(struct knote *kn, long hint)
 {
 
 	/*
 	 * Only hints tagged with NOTE_SIGNAL are signal deliveries; with
 	 * the tag stripped, the hint is compared against the knote's id.
 	 */
 	if ((hint & NOTE_SIGNAL) != 0 &&
 	    kn->kn_id == (hint & ~NOTE_SIGNAL))
 		kn->kn_data++;
 
 	/* Active whenever at least one matching delivery was counted. */
 	return (kn->kn_data != 0);
 }
 
 /*
  * Allocate a zeroed sigacts with one reference and an initialized mutex.
  * The allocation uses M_WAITOK.
  */
 struct sigacts *
 sigacts_alloc(void)
 {
 	struct sigacts *new_ps;
 
 	new_ps = malloc(sizeof(*new_ps), M_SUBPROC, M_WAITOK | M_ZERO);
 	mtx_init(&new_ps->ps_mtx, "sigacts", NULL, MTX_DEF);
 	new_ps->ps_refcnt = 1;
 	return (new_ps);
 }
 
 /*
  * Drop one reference on ps; the last reference destroys the mutex
  * and frees the structure.
  */
 void
 sigacts_free(struct sigacts *ps)
 {
 
 	mtx_lock(&ps->ps_mtx);
 	if (--ps->ps_refcnt != 0) {
 		/* Still referenced elsewhere. */
 		mtx_unlock(&ps->ps_mtx);
 		return;
 	}
 	mtx_destroy(&ps->ps_mtx);
 	free(ps, M_SUBPROC);
 }
 
 /*
  * Take an additional reference on ps under its mutex and return ps,
  * so the call can be used directly in an assignment.
  */
 struct sigacts *
 sigacts_hold(struct sigacts *ps)
 {
 	mtx_lock(&ps->ps_mtx);
 	ps->ps_refcnt++;
 	mtx_unlock(&ps->ps_mtx);
 	return (ps);
 }
 
 /*
  * Copy the signal state of src into dest.  Only the fields laid out
  * before ps_refcnt are copied, so dest keeps its own reference count
  * (and anything declared after it).  dest must not be shared.
  */
 void
 sigacts_copy(struct sigacts *dest, struct sigacts *src)
 {
 
 	KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
 	/* Hold src's mutex so the copy is a consistent snapshot. */
 	mtx_lock(&src->ps_mtx);
 	bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
 	mtx_unlock(&src->ps_mtx);
 }
 
 /*
  * Return non-zero if ps currently has more than one reference.
  */
 int
 sigacts_shared(struct sigacts *ps)
 {
 	int refs;
 
 	mtx_lock(&ps->ps_mtx);
 	refs = ps->ps_refcnt;
 	mtx_unlock(&ps->ps_mtx);
 
 	return (refs > 1);
 }
Index: head/sys/kern/kern_thr.c
===================================================================
--- head/sys/kern/kern_thr.c	(revision 173360)
+++ head/sys/kern/kern_thr.c	(revision 173361)
@@ -1,509 +1,511 @@
 /*-
  * Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_posix.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/posix4.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/ucontext.h>
 #include <sys/thr.h>
 #include <sys/rtprio.h>
 #include <sys/umtx.h>
 #include <sys/limits.h>
 
 #include <machine/frame.h>
 
 #include <security/audit/audit.h>
 
 #ifdef COMPAT_IA32
 
 extern struct sysentvec ia32_freebsd_sysvec;
 
 /*
  * Store a thread id to user space, using a 32-bit store when the
  * current process runs under the ia32 FreeBSD sysentvec and a native
  * word store otherwise.  Returns the result of the store routine.
  */
 static inline int
 suword_lwpid(void *addr, lwpid_t lwpid)
 {
 
 	if (curproc->p_sysent == &ia32_freebsd_sysvec)
 		return (suword32(addr, lwpid));
 	return (suword(addr, lwpid));
 }
 
 #else
 #define suword_lwpid	suword
 #endif
 
 extern int max_threads_per_proc;
 
 static int create_thread(struct thread *td, mcontext_t *ctx,
 			 void (*start_func)(void *), void *arg,
 			 char *stack_base, size_t stack_size,
 			 char *tls_base,
 			 long *child_tid, long *parent_tid,
 			 int flags, struct rtprio *rtp);
 
 /*
  * System call interface.
  */
 int
 thr_create(struct thread *td, struct thr_create_args *uap)
     /* ucontext_t *ctx, long *id, int flags */
 {
 	ucontext_t ctx;
 	int error;
 
 	if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
 		return (error);
 
 	error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
 		NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
 	return (error);
 }
 
 int
 thr_new(struct thread *td, struct thr_new_args *uap)
     /* struct thr_param * */
 {
 	struct thr_param param;
 	int error;
 
 	if (uap->param_size < 0 || uap->param_size > sizeof(param))
 		return (EINVAL);
 	bzero(&param, sizeof(param));
 	if ((error = copyin(uap->param, &param, uap->param_size)))
 		return (error);
 	return (kern_thr_new(td, &param));
 }
 
 /*
  * Create a new thread described by an in-kernel thr_param.
  *
  * param->rtp, when non-NULL, is a user-space pointer to a struct rtprio
  * that is copied in here.  Returns 0 on success, the copyin() error if
  * that copy fails, or the error from create_thread().
  */
 int
 kern_thr_new(struct thread *td, struct thr_param *param)
 {
 	struct rtprio rtp, *rtpp;
 	int error;
 
 	rtpp = NULL;
 	if (param->rtp != 0) {
 		error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
 		/*
 		 * Do not go on with an uninitialized rtprio if the user
 		 * pointer was bad.
 		 */
 		if (error != 0)
 			return (error);
 		rtpp = &rtp;
 	}
 	error = create_thread(td, NULL, param->start_func, param->arg,
 		param->stack_base, param->stack_size, param->tls_base,
 		param->child_tid, param->parent_tid, param->flags,
 		rtpp);
 	return (error);
 }
 
 /*
  * Common back end for thr_create() and thr_new().
  *
  * Exactly one of two set-up styles is used: if ctx is non-NULL the new
  * thread's user context comes from it; otherwise start_func, arg, the
  * stack and tls_base describe the entry point.  The new thread id is
  * stored to child_tid and parent_tid when they are non-NULL.  rtp, when
  * non-NULL, is validated and applied as the new thread's priority.
  *
  * Returns 0 on success or an errno; on failure no thread is created.
  */
 static int
 create_thread(struct thread *td, mcontext_t *ctx,
 	    void (*start_func)(void *), void *arg,
 	    char *stack_base, size_t stack_size,
 	    char *tls_base,
 	    long *child_tid, long *parent_tid,
 	    int flags, struct rtprio *rtp)
 {
 	stack_t stack;
 	struct thread *newtd;
 	struct proc *p;
 	int error;
 
 	error = 0;
 	p = td->td_proc;
 
 	/* Have race condition but it is cheap. */
 	if (p->p_numthreads >= max_threads_per_proc)
 		return (EPROCLIM);
 
 	/* Validate the requested scheduling parameters up front. */
 	if (rtp != NULL) {
 		switch(rtp->type) {
 		case RTP_PRIO_REALTIME:
 		case RTP_PRIO_FIFO:
 			/* Only root can set scheduler policy */
 			if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
 				return (EPERM);
 			if (rtp->prio > RTP_PRIO_MAX)
 				return (EINVAL);
 			break;
 		case RTP_PRIO_NORMAL:
 			rtp->prio = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 	}
 
 	/* Initialize our td */
 	newtd = thread_alloc();
+	if (newtd == NULL)
+		return (ENOMEM);
 
 	/*
 	 * Try the copyout as soon as we allocate the td so we don't
 	 * have to tear things down in a failure case below.
 	 * Here we copy out tid to two places, one for child and one
 	 * for parent, because pthread can create a detached thread,
 	 * if parent wants to safely access child tid, it has to provide
 	 * its storage, because child thread may exit quickly and
 	 * memory is freed before parent thread can access it.
 	 */
 	if ((child_tid != NULL &&
 	    suword_lwpid(child_tid, newtd->td_tid)) ||
 	    (parent_tid != NULL &&
 	    suword_lwpid(parent_tid, newtd->td_tid))) {
 		thread_free(newtd);
 		return (EFAULT);
 	}
 
 	/* Zero one section of the new thread, inherit another from td. */
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	bcopy(&td->td_startcopy, &newtd->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 	newtd->td_proc = td->td_proc;
 	/* The new thread holds its own reference on the creator's cred. */
 	newtd->td_ucred = crhold(td->td_ucred);
 
 	cpu_set_upcall(newtd, td);
 
 	if (ctx != NULL) { /* old way to set user context */
 		error = set_mcontext(newtd, ctx);
 		if (error != 0) {
 			thread_free(newtd);
 			/* Drop the reference taken by crhold() above. */
 			crfree(td->td_ucred);
 			return (error);
 		}
 	} else {
 		/* Set up our machine context. */
 		stack.ss_sp = stack_base;
 		stack.ss_size = stack_size;
 		/* Set upcall address to user thread entry function. */
 		cpu_set_upcall_kse(newtd, start_func, arg, &stack);
 		/* Setup user TLS address and TLS pointer register. */
 		error = cpu_set_user_tls(newtd, tls_base);
 		if (error != 0) {
 			thread_free(newtd);
 			/* Drop the reference taken by crhold() above. */
 			crfree(td->td_ucred);
 			return (error);
 		}
 	}
 
 	/* Link the new thread into the process; p is td->td_proc. */
 	PROC_LOCK(td->td_proc);
 	td->td_proc->p_flag |= P_HADTHREADS;
 	newtd->td_sigmask = td->td_sigmask;
 	PROC_SLOCK(p);
 	thread_link(newtd, p); 
 	thread_lock(td);
 	/* let the scheduler know about these things. */
 	sched_fork_thread(td, newtd);
 	thread_unlock(td);
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	thread_lock(newtd);
 	if (rtp != NULL) {
 		if (!(td->td_pri_class == PRI_TIMESHARE &&
 		      rtp->type == RTP_PRIO_NORMAL)) {
 			rtp_to_pri(rtp, newtd);
 			sched_prio(newtd, newtd->td_user_pri);
 		} /* ignore timesharing class */
 	}
 	TD_SET_CAN_RUN(newtd);
 	/*
 	 * The THR_SUSPENDED check is commented out, so the new thread is
 	 * always made runnable and the flags argument is unused here.
 	 */
 	/* if ((flags & THR_SUSPENDED) == 0) */
 		sched_add(newtd, SRQ_BORING);
 	thread_unlock(newtd);
 
 	return (error);
 }
 
 int
 thr_self(struct thread *td, struct thr_self_args *uap)
     /* long *id */
 {
 
 	/* Store the caller's thread id at uap->id; -1 means a bad address. */
 	if (suword_lwpid(uap->id, (unsigned)td->td_tid) == -1)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * Terminate the calling thread.  If uap->state is non-NULL, 1 is stored
  * there and any umtx waiters on that address are woken first.  Returns
  * (with 0) only when the caller is the last thread in the process.
  */
 int
 thr_exit(struct thread *td, struct thr_exit_args *uap)
     /* long *state */
 {
 	struct proc *p;
 
 	p = td->td_proc;
 
 	/* Signal userland that it can free the stack. */
 	if ((void *)uap->state != NULL) {
 		/* The result of the store is deliberately not checked. */
 		suword_lwpid(uap->state, 1);
 		kern_umtx_wake(td, uap->state, INT_MAX);
 	}
 
 	PROC_LOCK(p);
 	sigqueue_flush(&td->td_sigqueue);
 	PROC_SLOCK(p);
 
 	/*
 	 * Shutting down last thread in the proc.  This will actually
 	 * call exit() in the trampoline when it returns.
 	 */
 	if (p->p_numthreads != 1) {
 		/* Not the last thread: exit with both locks still held. */
 		thread_stopped(p);
 		thread_exit();
 		/* NOTREACHED */
 	}
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Send a signal to a thread of the calling process.
  *
  * id == -1 targets every thread except the caller; any other id names
  * one thread.  sig == 0 sends nothing and only checks that a target
  * exists.  Returns 0, ESRCH (no target) or EINVAL (bad signal number).
  */
 int
 thr_kill(struct thread *td, struct thr_kill_args *uap)
     /* long id, int sig */
 {
 	struct thread *ttd;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	error = 0;
 	PROC_LOCK(p);
 	if (uap->id == -1) {
 		if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
 			error = EINVAL;
 		} else {
 			/* Stays ESRCH unless some other thread is found. */
 			error = ESRCH;
 			FOREACH_THREAD_IN_PROC(p, ttd) {
 				if (ttd != td) {
 					error = 0;
 					/* Existence check: one hit suffices. */
 					if (uap->sig == 0)
 						break;
 					tdsignal(p, ttd, uap->sig, NULL);
 				}
 			}
 		}
 	} else {
 		/* Avoid the lookup when the target is the caller itself. */
 		if (uap->id != td->td_tid)
 			ttd = thread_find(p, uap->id);
 		else
 			ttd = td;
 		if (ttd == NULL)
 			error = ESRCH;
 		else if (uap->sig == 0)
 			;	/* existence check only */
 		else if (!_SIG_VALID(uap->sig))
 			error = EINVAL;
 		else
 			tdsignal(p, ttd, uap->sig, NULL);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Like thr_kill(), but the target thread may belong to another process
  * named by uap->pid, subject to p_cansignal().  Returns 0, ESRCH,
  * EINVAL, or the error from p_cansignal().
  */
 int
 thr_kill2(struct thread *td, struct thr_kill2_args *uap)
     /* pid_t pid, long id, int sig */
 {
 	struct thread *ttd;
 	struct proc *p;
 	int error;
 
 	AUDIT_ARG(signum, uap->sig);
 
 	/*
 	 * NOTE(review): pfind() presumably returns the process locked,
 	 * since both branches fall through to the PROC_UNLOCK() below
 	 * -- confirm.
 	 */
 	if (uap->pid == td->td_proc->p_pid) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 	} else if ((p = pfind(uap->pid)) == NULL) {
 		return (ESRCH);
 	}
 	AUDIT_ARG(process, p);
 
 	error = p_cansignal(td, p, uap->sig);
 	if (error == 0) {
 		if (uap->id == -1) {
 			/* All threads of p except the caller. */
 			if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
 				error = EINVAL;
 			} else {
 				error = ESRCH;
 				FOREACH_THREAD_IN_PROC(p, ttd) {
 					if (ttd != td) {
 						error = 0;
 						if (uap->sig == 0)
 							break;
 						tdsignal(p, ttd, uap->sig, NULL);
 					}
 				}
 			}
 		} else {
 			if (uap->id != td->td_tid)
 				ttd = thread_find(p, uap->id);
 			else
 				ttd = td;
 			if (ttd == NULL)
 				error = ESRCH;
 			else if (uap->sig == 0)
 				;	/* existence check only */
 			else if (!_SIG_VALID(uap->sig))
 				error = EINVAL;
 			else
 				tdsignal(p, ttd, uap->sig, NULL);
 		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 int
 thr_suspend(struct thread *td, struct thr_suspend_args *uap)
 	/* const struct timespec *timeout */
 {
 	struct timespec ts;
 	int error;
 
 	/* No timeout supplied: sleep until woken. */
 	if (uap->timeout == NULL)
 		return (kern_thr_suspend(td, NULL));
 
 	error = copyin((const void *)uap->timeout, (void *)&ts,
 	    sizeof(struct timespec));
 	if (error != 0)
 		return (error);
 
 	return (kern_thr_suspend(td, &ts));
 }
 
 /*
  * Put the calling thread to sleep until thr_wake() targets it, a signal
  * arrives, or the optional timeout tsp expires.
  *
  * Returns 0 when woken (or when a wakeup was already pending), EINVAL
  * for a tv_nsec outside [0, 999999999], ETIMEDOUT on timeout or for an
  * all-zero timeout, or the error from msleep(); ERESTART is turned
  * into EINTR when a timeout was given.
  */
 int
 kern_thr_suspend(struct thread *td, struct timespec *tsp)
 {
 	struct timeval tv;
 	/* "timo" rather than "hz" so the kernel global hz is not shadowed. */
 	int error = 0, timo = 0;
 
 	if (tsp != NULL) {
 		/* A valid tv_nsec is strictly less than one second. */
 		if (tsp->tv_nsec < 0 || tsp->tv_nsec >= 1000000000)
 			return (EINVAL);
 		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			return (ETIMEDOUT);
 		TIMESPEC_TO_TIMEVAL(&tv, tsp);
 		timo = tvtohz(&tv);
 	}
 
 	/* thr_wake() on ourselves left a pending wakeup: consume it. */
 	if (td->td_pflags & TDP_WAKEUP) {
 		td->td_pflags &= ~TDP_WAKEUP;
 		return (0);
 	}
 
 	PROC_LOCK(td->td_proc);
 	if ((td->td_flags & TDF_THRWAKEUP) == 0)
 		error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr",
 		    timo);
 	/* A wakeup from another thread wins over any sleep error. */
 	if (td->td_flags & TDF_THRWAKEUP) {
 		thread_lock(td);
 		td->td_flags &= ~TDF_THRWAKEUP;
 		thread_unlock(td);
 		PROC_UNLOCK(td->td_proc);
 		return (0);
 	}
 	PROC_UNLOCK(td->td_proc);
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 	else if (error == ERESTART) {
 		/* With a timeout, report EINTR instead of ERESTART. */
 		if (timo != 0)
 			error = EINTR;
 	}
 	return (error);
 }
 
 /*
  * Wake a thread of the calling process that is (or is about to be)
  * sleeping in kern_thr_suspend().  Returns 0, or ESRCH if no thread
  * with the given id exists in the process.
  */
 int
 thr_wake(struct thread *td, struct thr_wake_args *uap)
 	/* long id */
 {
 	struct proc *p;
 	struct thread *ttd;
 
 	/* Waking ourselves: just leave a flag for our next suspend. */
 	if (uap->id == td->td_tid) {
 		td->td_pflags |= TDP_WAKEUP;
 		return (0);
 	} 
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	ttd = thread_find(p, uap->id);
 	if (ttd == NULL) {
 		PROC_UNLOCK(p);
 		return (ESRCH);
 	}
 	/* Set the flag first so the sleeper sees it when it wakes up. */
 	thread_lock(ttd);
 	ttd->td_flags |= TDF_THRWAKEUP;
 	thread_unlock(ttd);
 	wakeup((void *)ttd);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 int
 thr_set_name(struct thread *td, struct thr_set_name_args *uap)
 {
 	struct proc *p = td->td_proc;
 	char name[MAXCOMLEN + 1];
 	struct thread *ttd;
 	int error;
 
 	error = 0;
 	name[0] = '\0';
 	if (uap->name != NULL) {
 		error = copyinstr(uap->name, name, sizeof(name),
 			NULL);
 		if (error)
 			return (error);
 	}
 	PROC_LOCK(p);
 	if (uap->id == td->td_tid)
 		ttd = td;
 	else
 		ttd = thread_find(p, uap->id);
 	if (ttd != NULL)
 		strcpy(ttd->td_name, name);
 	else 
 		error = ESRCH;
 	PROC_UNLOCK(p);
 	return (error);
 }
Index: head/sys/kern/kern_thread.c
===================================================================
--- head/sys/kern/kern_thread.c	(revision 173360)
+++ head/sys/kern/kern_thread.c	(revision 173361)
@@ -1,963 +1,980 @@
 /*-
  * Copyright (C) 2001 Julian Elischer <julian@freebsd.org>.
  *  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
  *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  * DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/turnstile.h>
 #include <sys/ktr.h>
 #include <sys/umtx.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 /*
  * thread related storage.
  */
 static uma_zone_t thread_zone;
 
 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
 
 int max_threads_per_proc = 1500;
 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
 	&max_threads_per_proc, 0, "Limit on threads per proc");
 
 int max_threads_hits;
 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
 	&max_threads_hits, 0, "");
 
 #ifdef KSE
 int virtual_cpu;
 
 #endif
 TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
 static struct mtx zombie_lock;
 MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
 
 static void thread_zombie(struct thread *);
 
 #ifdef KSE
 /*
  * Sysctl handler for the virtual_cpu knob.  Reads report mp_ncpus while
  * virtual_cpu is still 0; writes reject negative values with EINVAL.
  */
 static int
 sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
 {
 	int error, new_val;
 
 	new_val = (virtual_cpu == 0) ? mp_ncpus : virtual_cpu;
 
 	error = sysctl_handle_int(oidp, &new_val, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);		/* failure, or a pure read */
 
 	if (new_val < 0)
 		return (EINVAL);
 	virtual_cpu = new_val;
 	return (0);
 }
 
 /* DEBUG ONLY */
 SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
 	0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
 	"debug virtual cpus");
 #endif
 
 struct mtx tid_lock;
 static struct unrhdr *tid_unrhdr;
 
 /*
  * Prepare a thread for use.
  */
 static int
 thread_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct thread *td = mem;
 
 	/* A fresh thread is inactive, on no CPU, with no syscalls counted. */
 	td->td_state = TDS_INACTIVE;
 	td->td_oncpu = NOCPU;
 	td->td_syscalls = 0;
 
 	/*
 	 * td_critnest starts at 1: the thread is not running, so it is
 	 * implicitly waiting to be on the receiving end of a context
 	 * switch.
 	 */
 	td->td_critnest = 1;
 
 	/* Every constructed thread gets a fresh thread id. */
 	td->td_tid = alloc_unr(tid_unrhdr);
 
 #ifdef AUDIT
 	audit_thread_alloc(td);
 #endif
 	umtx_thread_alloc(td);
 	return (0);
 }
 
 /*
  * Reclaim a thread after use.
  */
 static void
 thread_dtor(void *mem, int size, void *arg)
 {
 	struct thread *td;
 
 	td = (struct thread *)mem;
 
 #ifdef INVARIANTS
 	/* Verify that this thread is in a safe state to free. */
 	switch (td->td_state) {
 	case TDS_INHIBITED:
 	case TDS_RUNNING:
 	case TDS_CAN_RUN:
 	case TDS_RUNQ:
 		/*
 		 * We must never unlink a thread that is in one of
 		 * these states, because it is currently active.
 		 */
 		panic("bad state for thread unlinking");
 		/* NOTREACHED */
 	case TDS_INACTIVE:
 		break;
 	default:
 		panic("bad thread state");
 		/* NOTREACHED */
 	}
 #endif
 #ifdef AUDIT
 	audit_thread_free(td);
 #endif
 	/* Give back the thread id taken in thread_ctor(). */
 	free_unr(tid_unrhdr, td->td_tid);
 	/* Reset the scheduler's per-thread state for the next user. */
 	sched_newthread(td);
 }
 
 /*
  * Initialize type-stable parts of a thread (when newly created).
  */
 static int
 thread_init(void *mem, int size, int flags)
 {
 	struct thread *td;
 
 	td = (struct thread *)mem;
 
-	vm_thread_new(td, 0);
-	cpu_thread_setup(td);
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
 	/* The scheduler's per-thread data sits right after struct thread. */
 	td->td_sched = (struct td_sched *)&td[1];
 	sched_newthread(td);
 	umtx_thread_init(td);
 	/* No kernel stack yet; thread_alloc() attaches one per allocation. */
+	td->td_kstack = 0;
 	return (0);
 }
 
 /*
  * Tear down type-stable parts of a thread (just before being discarded).
  */
 static void
 thread_fini(void *mem, int size)
 {
 	struct thread *td;
 
 	td = (struct thread *)mem;
 	/* Release what thread_init() allocated. */
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
 	umtx_thread_fini(td);
-	vm_thread_dispose(td);
 }
 
 /*
  * For a newly created process,
  * link up all the structures and its initial threads etc.
  * called from:
  * {arch}/{arch}/machdep.c   ia64_init(), init386() etc.
  * proc_dtor() (should go away)
  * proc_init()
  */
 void
+proc_linkup0(struct proc *p, struct thread *td)
+{
+	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
+	proc_linkup(p, td);
+}
+
+void
 proc_linkup(struct proc *p, struct thread *td)
 {
 
 	/*
 	 * p->p_threads is not initialized here; proc_linkup0() does
 	 * that before calling this function.
 	 */
-	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
 #ifdef KSE
 	TAILQ_INIT(&p->p_upcalls);	     /* upcall list */
 #endif
 	sigqueue_init(&p->p_sigqueue, p);
 	p->p_ksi = ksiginfo_alloc(1);
 	if (p->p_ksi != NULL) {
 		/* XXX p_ksi may be null if ksiginfo zone is not ready */
 		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
 	}
 	LIST_INIT(&p->p_mqnotifier);
 	/* thread_link() below presumably counts td in; start from zero. */
 	p->p_numthreads = 0;
 	thread_link(td, p);
 }
 
 /*
  * Initialize global thread allocation resources.
  */
 void
 threadinit(void)
 {
 
 	/* Thread ids are handed out from the range PID_MAX + 1 .. INT_MAX. */
 	mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF);
 	tid_unrhdr = new_unrhdr(PID_MAX + 1, INT_MAX, &tid_lock);
 
 	/*
 	 * Zone for struct thread plus scheduler data, wired to the
 	 * ctor/dtor/init/fini hooks of this file.
 	 * NOTE(review): "16 - 1" looks like a 16-byte alignment mask
 	 * -- confirm against uma_zcreate().
 	 */
 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 	    thread_ctor, thread_dtor, thread_init, thread_fini,
 	    16 - 1, 0);
 #ifdef KSE
 	kseinit();	/* set up kse specific stuff  e.g. upcall zone*/
 #endif
 }
 
 /*
  * Place an unused thread on the zombie list.
  * Use the slpq as that must be unused by now.
  */
 void
 thread_zombie(struct thread *td)
 {
 	/* zombie_lock is a spin mutex guarding the zombie_threads list. */
 	mtx_lock_spin(&zombie_lock);
 	TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
 	mtx_unlock_spin(&zombie_lock);
 }
 
 /*
  * Release a thread that has exited after cpu_throw().
  */
 void
 thread_stash(struct thread *td)
 {
 	/* Undo the p_exitthreads increment made in thread_exit(). */
 	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
 	/* Queue the thread; thread_reap() frees it later. */
 	thread_zombie(td);
 }
 
 /*
  * Reap zombie kse resource.
  */
 void
 thread_reap(void)
 {
 	struct thread *td_first, *td_next;
 
 	/*
 	 * Don't even bother to lock if none at this instant,
 	 * we really don't care about the next instant..
 	 */
 	if (!TAILQ_EMPTY(&zombie_threads)) {
 		/*
 		 * Detach the whole list under the spin lock, then free
 		 * the threads one by one without holding it.
 		 */
 		mtx_lock_spin(&zombie_lock);
 		td_first = TAILQ_FIRST(&zombie_threads);
 		if (td_first)
 			TAILQ_INIT(&zombie_threads);
 		mtx_unlock_spin(&zombie_lock);
 		while (td_first) {
 			/* Fetch the successor before td_first is freed. */
 			td_next = TAILQ_NEXT(td_first, td_slpq);
 			if (td_first->td_ucred)
 				crfree(td_first->td_ucred);
 			thread_free(td_first);
 			td_first = td_next;
 		}
 	}
 #ifdef KSE
 	upcall_reap();
 #endif
 }
 
 /*
  * Allocate a thread.
  */
 struct thread *
 thread_alloc(void)
 {
+	struct thread *td;
 
 	thread_reap(); /* check if any zombies to get */
-	return (uma_zalloc(thread_zone, M_WAITOK));
+
+	td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
+	KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
 	/*
 	 * Attach a kernel stack; if that fails, hand the thread back to
 	 * the zone and return NULL, which callers must check for.
 	 */
+	if (!vm_thread_new(td, 0)) {
+		uma_zfree(thread_zone, td);
+		return (NULL);
+	}
+	cpu_thread_setup(td);
+	return (td);
 }
 
 
 /*
  * Deallocate a thread.
  */
 void
 thread_free(struct thread *td)
 {
 
 	cpu_thread_clean(td);
 	/* Release whichever kernel stacks are attached before the zfree. */
+	if (td->td_altkstack != 0)
+		vm_thread_dispose_altkstack(td);
+	if (td->td_kstack != 0)
+		vm_thread_dispose(td);
 	uma_zfree(thread_zone, td);
 }
 
 /*
  * Discard the current thread and exit from its context.
  * Always called with scheduler locked.
  *
  * Because we can't free a thread while we're operating under its context,
  * push the current thread into our CPU's deadthread holder. This means
  * we needn't worry about someone else grabbing our context before we
  * do a cpu_throw().  This may not be needed now as we are under schedlock.
  * Maybe we can just do a thread_stash() as thr_exit1 does.
  */
 /*  XXX
  * libthr expects its thread exit to return for the last
  * thread, meaning that the program is back to non-threaded
  * mode I guess. Because we do this (cpu_throw) unconditionally
  * here, they have their own version of it. (thr_exit1()) 
  * that doesn't do it all if this was the last thread.
  * It is also called from thread_suspend_check().
  * Of course in the end, they end up coming here through exit1
  * anyhow..  After fixing 'thr' to play by the rules we should be able 
  * to merge these two functions together.
  *
  * called from:
  * exit1()
  * kse_exit()
  * thr_exit()
  * ifdef KSE
  * thread_user_enter()
  * thread_userret()
  * endif
  * thread_suspend_check()
  */
 void
 thread_exit(void)
 {
 	uint64_t new_switchtime;
 	struct thread *td;
 	struct thread *td2;
 	struct proc *p;
 
 	td = curthread;
 	p = td->td_proc;
 
 	/*
 	 * Entry contract, as asserted below: the proc spinlock and the proc
 	 * mutex are both held, and Giant is not.
 	 */
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * NOTE(review): p has already been handed to the lock assertions
 	 * above by the time this NULL check runs.
 	 */
 	KASSERT(p != NULL, ("thread exiting without a process"));
 	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
 	    (long)p->p_pid, p->p_comm);
 	/* The exiting thread must have no queued signals left. */
 	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
 
 #ifdef AUDIT
 	AUDIT_SYSCALL_EXIT(0, td);
 #endif
 
 #ifdef KSE
 	if (td->td_standin != NULL) {
 		/*
 		 * Note that we don't need to free the cred here as it
 		 * is done in thread_reap().
 		 */
 		thread_zombie(td->td_standin);
 		td->td_standin = NULL;
 	}
 #endif
 
 	umtx_thread_exit(td);
 
 	/*
 	 * drop FPU & debug register state storage, or any other
 	 * architecture specific resources that
 	 * would not be on a new untouched process.
 	 */
 	cpu_thread_exit(td);	/* XXXSMP */
 
 	/* Do the same timestamp bookkeeping that mi_switch() would do. */
 	new_switchtime = cpu_ticks();
 	/* Charge the ticks since this CPU's last recorded switch to p. */
 	p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime));
 	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);
 	PCPU_INC(cnt.v_swtch);
 	/* Save our resource usage in our process. */
 	td->td_ru.ru_nvcsw++;
 	rucollect(&p->p_ru, &td->td_ru);
 	/*
 	 * The last thread is left attached to the process
 	 * So that the whole bundle gets recycled. Skip
 	 * all this stuff if we never had threads.
 	 * EXIT clears all sign of other threads when
 	 * it goes to single threading, so the last thread always
 	 * takes the short path.
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		if (p->p_numthreads > 1) {
 			/* Detach from the process under our own thread lock. */
 			thread_lock(td);
 #ifdef KSE
 			kse_unlink(td);
 #else
 			thread_unlink(td);
 #endif
 			thread_unlock(td);
 			/*
 			 * td has just been unlinked, so td2 is whichever
 			 * thread now heads the process's list.
 			 */
 			td2 = FIRST_THREAD_IN_PROC(p);
 			sched_exit_thread(td2, td);
 
 			/*
 			 * The test below is NOT true if we are the
 			 * sole exiting thread. P_STOPPED_SNGL is unset
 			 * in exit1() after it is the only survivor.
 			 */
 			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 				/*
 				 * Every remaining thread is now counted as
 				 * suspended: resume the single-threading
 				 * thread.
 				 */
 				if (p->p_numthreads == p->p_suspcount) {
 					thread_lock(p->p_singlethread);
 					thread_unsuspend_one(p->p_singlethread);
 					thread_unlock(p->p_singlethread);
 				}
 			}
 
 			/*
 			 * Count ourselves as a thread that is still on its
 			 * way out; thread_wait() yields while p_exitthreads
 			 * is non-zero.  Then park in this CPU's deadthread
 			 * slot, since we cannot be freed while running.
 			 */
 			atomic_add_int(&td->td_proc->p_exitthreads, 1);
 			PCPU_SET(deadthread, td);
 		} else {
 			/*
 			 * The last thread is exiting.. but not through exit()
 			 * what should we do?
 			 * Theoretically this can't happen
  			 * exit1() - clears threading flags before coming here
  			 * kse_exit() - treats last thread specially
  			 * thr_exit() - treats last thread specially
 			 * ifdef KSE
  			 * thread_user_enter() - only if more exist
  			 * thread_userret() - only if more exist
 			 * endif
  			 * thread_suspend_check() - only if more exist
 			 */
 			panic ("thread_exit: Last thread exiting on its own");
 		}
 	} 
 	/*
 	 * Lock hand-off: release the proc mutex, take our thread lock, and
 	 * only then release the proc spinlock.
 	 */
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	/* Save our tick information with both the thread and proc locked */
 	ruxagg(&p->p_rux, td);
 	PROC_SUNLOCK(p);
 	td->td_state = TDS_INACTIVE;
 	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
 	sched_throw(td);
 	/* sched_throw() is not expected to come back; treat a return as fatal. */
 	panic("I'm a teapot!");
 	/* NOTREACHED */
 }
 
 /*
  * Do any thread specific cleanups that may be needed in wait()
  * called with Giant, proc and schedlock not held.
  */
 void
 thread_wait(struct proc *p)
 {
 	struct thread *td;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	/* Only a single remaining thread is expected at this point. */
 	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
 	td = FIRST_THREAD_IN_PROC(p);
 #ifdef KSE
 	/*
 	 * Dispose of the stand-in thread, if any: drop its credential
 	 * reference first, then free the thread itself.
 	 */
 	if (td->td_standin != NULL) {
 		if (td->td_standin->td_ucred != NULL) {
 			crfree(td->td_standin->td_ucred);
 			td->td_standin->td_ucred = NULL;
 		}
 		thread_free(td->td_standin);
 		td->td_standin = NULL;
 	}
 #endif
 	/* Lock the last thread so we spin until it exits cpu_throw(). */
 	thread_lock(td);
 	thread_unlock(td);
 	/* Wait for any remaining threads to exit cpu_throw(). */
 	/* thread_exit() increments p_exitthreads; yield until it is zero. */
 	while (p->p_exitthreads)
 		sched_relinquish(curthread);
 	cpu_thread_clean(td);
 	/* Drop the last thread's credential reference. */
 	crfree(td->td_ucred);
 	thread_reap();	/* check for zombie threads etc. */
 }
 
 /*
  * Link a thread to a process.
  * set up anything that needs to be initialized for it to
  * be used by the process.
  *
  * Note that we do not link to the proc's ucred here.
  * The thread is linked as if running but no KSE assigned.
  * Called from:
  *  proc_linkup()
  *  thread_schedule_upcall()
  *  thr_create()
  */
 void
 thread_link(struct thread *td, struct proc *p)
 {
 
 	/*
 	 * XXX No lock assertion is possible here: this routine also runs
 	 * for proc0 before its spinlock has been created.
 	 * PROC_SLOCK_ASSERT(p, MA_OWNED);
 	 */
 
 	/* Bind the thread to its process and reset its basic state. */
 	td->td_proc = p;
 	td->td_flags = TDF_INMEM;
 	td->td_state = TDS_INACTIVE;
 
 	/* Initialize the per-thread lists, signal queue and sleep callout. */
 	LIST_INIT(&td->td_contested);
 	sigqueue_init(&td->td_sigqueue, p);
 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
 
 	/* Put the thread at the head of p's thread list and count it. */
 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
 	p->p_numthreads += 1;
 }
 
 /*
  * Convert a process with one thread to an unthreaded process.
  * Called from:
  *  thread_single(exit)  (called from execve and exit)
  *  kse_exit()		XXX may need cleaning up wrt KSE stuff
  */
 void
 thread_unthread(struct thread *td)
 {
 	struct proc *owner;
 
 	owner = td->td_proc;
 	KASSERT((owner->p_numthreads == 1), ("Unthreading with >1 threads"));
 #ifndef KSE
 	/* Without KSE only the "had threads" marker needs clearing. */
 	owner->p_flag &= ~P_HADTHREADS;
 #else
 	/* Remove the upcall while holding the thread lock. */
 	thread_lock(td);
 	upcall_remove(td);
 	thread_unlock(td);
 
 	/* Clear the process- and thread-level threading flags. */
 	owner->p_flag &= ~(P_SA|P_HADTHREADS);
 	td->td_mailbox = NULL;
 	td->td_pflags &= ~(TDP_SA | TDP_CAN_UNBIND);
 
 	/* Hand any stand-in thread to thread_zombie(). */
 	if (td->td_standin != NULL) {
 		thread_zombie(td->td_standin);
 		td->td_standin = NULL;
 	}
 #endif
 }
 
 /*
  * Called from:
  *  thread_exit()
  */
 void
 thread_unlink(struct thread *td)
 {
 	struct proc *owner;
 
 	owner = td->td_proc;
 	PROC_SLOCK_ASSERT(owner, MA_OWNED);
 
 	/*
 	 * Take td off its process's thread list and adjust the count.
 	 * Other fields could be cleared here too, but the links back to
 	 * the proc must NOT be cleared.
 	 */
 	TAILQ_REMOVE(&owner->p_threads, td, td_plist);
 	owner->p_numthreads -= 1;
 }
 
 /*
  * Enforce single-threading.
  *
  * Returns 1 if the caller must abort (another thread is waiting to
  * exit the process or similar). Process is locked!
  * Returns 0 when you are successfully the only thread running.
  * A process has successfully single threaded in the suspend mode when
  * There are no threads in user mode. Threads in the kernel must be
  * allowed to continue until they get to the user boundary. They may even
  * copy out their return values and data before suspending. They may however be
  * accelerated in reaching the user boundary as we will wake up
  * any sleeping threads that are interruptible (PCATCH).
  */
 int
 thread_single(int mode)
 {
 	struct thread *td;
 	struct thread *td2;
 	struct proc *p;
 	int remaining;
 
 	td = curthread;
 	p = td->td_proc;
 	/* Entry contract: proc mutex held, Giant not held. */
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* NOTE(review): td was already dereferenced above (td->td_proc). */
 	KASSERT((td != NULL), ("curthread is NULL"));
 
 	/* A process that never had threads is trivially single-threaded. */
 	if ((p->p_flag & P_HADTHREADS) == 0)
 		return (0);
 
 	/* Is someone already single threading? */
 	if (p->p_singlethread != NULL && p->p_singlethread != td)
 		return (1);
 
 	/*
 	 * Record the requested mode in p_flag: P_SINGLE_EXIT and
 	 * P_SINGLE_BOUNDARY are mutually exclusive, and any other mode
 	 * clears both.
 	 */
 	if (mode == SINGLE_EXIT) {
 		p->p_flag |= P_SINGLE_EXIT;
 		p->p_flag &= ~P_SINGLE_BOUNDARY;
 	} else {
 		p->p_flag &= ~P_SINGLE_EXIT;
 		if (mode == SINGLE_BOUNDARY)
 			p->p_flag |= P_SINGLE_BOUNDARY;
 		else
 			p->p_flag &= ~P_SINGLE_BOUNDARY;
 	}
 	p->p_flag |= P_STOPPED_SINGLE;
 	PROC_SLOCK(p);
 	p->p_singlethread = td;
 	/*
 	 * "remaining" is the number of threads not yet dealt with for this
 	 * mode, ourselves included: all threads for SINGLE_EXIT, threads not
 	 * counted in p_boundary_count for SINGLE_BOUNDARY, otherwise threads
 	 * not counted in p_suspcount.  The loop ends once it drops to 1.
 	 */
 	if (mode == SINGLE_EXIT)
 		remaining = p->p_numthreads;
 	else if (mode == SINGLE_BOUNDARY)
 		remaining = p->p_numthreads - p->p_boundary_count;
 	else
 		remaining = p->p_numthreads - p->p_suspcount;
 	while (remaining != 1) {
 		/*
 		 * If something other than a pure single-threading stop is
 		 * in effect, skip the sweep and just suspend ourselves.
 		 */
 		if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
 			goto stopme;
 		/* Nudge every other thread, each under its own thread lock. */
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
 			thread_lock(td2);
 			td2->td_flags |= TDF_ASTPENDING;
 			if (TD_IS_INHIBITED(td2)) {
 				switch (mode) {
 				case SINGLE_EXIT:
 					/*
 					 * NOTE(review): this tests and clears
 					 * TDF_DBSUSPEND on td (the caller)
 					 * while td2's lock is held; confirm
 					 * whether td2 was intended.
 					 */
 					if (td->td_flags & TDF_DBSUSPEND)
 						td->td_flags &= ~TDF_DBSUSPEND;
 					if (TD_IS_SUSPENDED(td2))
 						thread_unsuspend_one(td2);
 					/* Abort interruptible sleeps with EINTR. */
 					if (TD_ON_SLEEPQ(td2) &&
 					    (td2->td_flags & TDF_SINTR))
 						sleepq_abort(td2, EINTR);
 					break;
 				case SINGLE_BOUNDARY:
 					break;
 				default:	
 					/* Already suspended: nothing to do. */
 					if (TD_IS_SUSPENDED(td2)) {
 						thread_unlock(td2);
 						continue;
 					}
 					/*
 					 * maybe other inhibited states too?
 					 */
 					if ((td2->td_flags & TDF_SINTR) &&
 					    (td2->td_inhibitors &
 					    (TDI_SLEEPING | TDI_SWAPPED)))
 						thread_suspend_one(td2);
 					break;
 				}
 			}
 #ifdef SMP
 			else if (TD_IS_RUNNING(td2) && td != td2) {
 				forward_signal(td2);
 			}
 #endif
 			thread_unlock(td2);
 		}
 		/* Recompute after the sweep using the same per-mode rule. */
 		if (mode == SINGLE_EXIT)
 			remaining = p->p_numthreads;
 		else if (mode == SINGLE_BOUNDARY)
 			remaining = p->p_numthreads - p->p_boundary_count;
 		else
 			remaining = p->p_numthreads - p->p_suspcount;
 
 		/*
 		 * Maybe we suspended some threads.. was it enough?
 		 */
 		if (remaining == 1)
 			break;
 
 stopme:
 		/*
 		 * Wake us up when everyone else has suspended.
 		 * In the mean time we suspend as well.
 		 */
 		thread_suspend_switch(td);
 		/* Back with both proc locks held; re-evaluate the count. */
 		if (mode == SINGLE_EXIT)
 			remaining = p->p_numthreads;
 		else if (mode == SINGLE_BOUNDARY)
 			remaining = p->p_numthreads - p->p_boundary_count;
 		else
 			remaining = p->p_numthreads - p->p_suspcount;
 	}
 	if (mode == SINGLE_EXIT) {
 		/*
 		 * We have gotten rid of all the other threads and we
 		 * are about to either exit or exec. In either case,
 		 * we try our utmost  to revert to being a non-threaded
 		 * process.
 		 */
 		p->p_singlethread = NULL;
 		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
 		thread_unthread(td);
 	}
 	PROC_SUNLOCK(p);
 	return (0);
 }
 
 /*
  * Called in from locations that can safely check to see
  * whether we have to suspend or at least throttle for a
  * single-thread event (e.g. fork).
  *
  * Such locations include userret().
  * If the "return_instead" argument is non zero, the thread must be able to
  * accept 0 (caller may continue), or 1 (caller must abort) as a result.
  *
  * The 'return_instead' argument tells the function if it may do a
  * thread_exit() or suspend, or whether the caller must abort and back
  * out instead.
  *
  * If the thread that set the single_threading request has set the
  * P_SINGLE_EXIT bit in the process flags then this call will never return
  * if 'return_instead' is false, but will exit.
  *
  * P_SINGLE_EXIT | return_instead == 0| return_instead != 0
  *---------------+--------------------+---------------------
  *       0       | returns 0          |   returns 0 or 1
  *               | when ST ends       |   immediately
  *---------------+--------------------+---------------------
  *       1       | thread exits       |   returns 1
  *               |                    |  immediately
  * 0 = thread_exit() or suspension ok,
  * other = return error instead of stopping the thread.
  *
  * While a full suspension is under effect, even a single threading
  * thread would be suspended if it made this call (but it shouldn't).
  * This call should only be made from places where
  * thread_exit() would be safe as that may be the outcome unless
  * return_instead is set.
  */
 int
 thread_suspend_check(int return_instead)
 {
 	struct thread *td;
 	struct proc *p;
 
 	td = curthread;
 	p = td->td_proc;
 	/* Entry contract: proc mutex held, Giant not held. */
 	mtx_assert(&Giant, MA_NOTOWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * Keep going round while a stop is requested for the process, or
 	 * while a traced process has asked this thread to suspend for the
 	 * debugger (TDF_DBSUSPEND).
 	 */
 	while (P_SHOULDSTOP(p) ||
 	      ((p->p_flag & P_TRACED) && (td->td_flags & TDF_DBSUSPEND))) {
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			KASSERT(p->p_singlethread != NULL,
 			    ("singlethread not set"));
 			/*
 			 * The only suspension in action is a
 			 * single-threading. Single threader need not stop.
 			 * XXX Should be safe to access unlocked
 			 * as it can only be set to be true by us.
 			 */
 			if (p->p_singlethread == td)
 				return (0);	/* Exempt from stopping. */
 		}
 		/*
 		 * The "caller must abort" results are returned as errno
 		 * values (EINTR here, ERESTART below), i.e. non-zero.
 		 */
 		if ((p->p_flag & P_SINGLE_EXIT) && return_instead)
 			return (EINTR);
 
 		/* Should we goto user boundary if we didn't come from there? */
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE &&
 		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
 			return (ERESTART);
 
 		/* If thread will exit, flush its pending signals */
 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td))
 			sigqueue_flush(&td->td_sigqueue);
 
 		PROC_SLOCK(p);
 		thread_stopped(p);
 		/*
 		 * If the process is waiting for us to exit,
 		 * this thread should just suicide.
 		 * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE.
 		 */
 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td))
 			thread_exit();
 		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 			/*
 			 * We are about to become the last thread to suspend
 			 * (the "+ 1" is us): resume the single-threading
 			 * thread.
 			 */
 			if (p->p_numthreads == p->p_suspcount + 1) {
 				thread_lock(p->p_singlethread);
 				thread_unsuspend_one(p->p_singlethread);
 				thread_unlock(p->p_singlethread);
 			}
 		}
 		/*
 		 * Lock hand-off: release the proc mutex, take our thread
 		 * lock, and release the proc spinlock only after the
 		 * suspension has been recorded.
 		 */
 		PROC_UNLOCK(p);
 		thread_lock(td);
 		/*
 		 * When a thread suspends, it just
 		 * gets taken off all queues.
 		 */
 		thread_suspend_one(td);
 		/* Suspensions with return_instead == 0 count as boundary ones. */
 		if (return_instead == 0) {
 			p->p_boundary_count++;
 			td->td_flags |= TDF_BOUNDARY;
 		}
 		PROC_SUNLOCK(p);
 		mi_switch(SW_INVOL, NULL);
 		/* Running again: undo the boundary marking and relock. */
 		if (return_instead == 0)
 			td->td_flags &= ~TDF_BOUNDARY;
 		thread_unlock(td);
 		PROC_LOCK(p);
 		if (return_instead == 0)
 			p->p_boundary_count--;
 	}
 	return (0);
 }
 
 /*
  * Mark td suspended and switch away from it.  Entered with the proc mutex
  * and proc spinlock held (asserted); both are held again on return.
  * NOTE(review): td is presumably curthread, as in the thread_single()
  * caller -- confirm for any other callers.
  */
 void
 thread_suspend_switch(struct thread *td)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * We implement thread_suspend_one in stages here to avoid
 	 * dropping the proc lock while the thread lock is owned.
 	 */
 	thread_stopped(p);
 	p->p_suspcount++;
 	/* Proc mutex goes first, before the thread lock is taken. */
 	PROC_UNLOCK(p);
 	thread_lock(td);
 	sched_sleep(td);
 	TD_SET_SUSPENDED(td);
 	PROC_SUNLOCK(p);
 	DROP_GIANT();
 	mi_switch(SW_VOL, NULL);
 	thread_unlock(td);
 	PICKUP_GIANT();
 	/* Reacquire both proc locks for the caller. */
 	PROC_LOCK(p);
 	PROC_SLOCK(p);
 }
 
 /*
  * Record td as suspended.  The proc spinlock and td's thread lock must
  * both be held, and td must not already be suspended.
  */
 void
 thread_suspend_one(struct thread *td)
 {
 	struct proc *owner;
 
 	owner = td->td_proc;
 	PROC_SLOCK_ASSERT(owner, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 
 	/* Count the suspension, call sched_sleep(), then flag the thread. */
 	owner->p_suspcount += 1;
 	sched_sleep(td);
 	TD_SET_SUSPENDED(td);
 }
 
 /*
  * Undo thread_suspend_one() for td.  The proc spinlock and td's thread
  * lock must both be held, and td must currently be suspended.
  */
 void
 thread_unsuspend_one(struct thread *td)
 {
 	struct proc *owner;
 
 	owner = td->td_proc;
 	PROC_SLOCK_ASSERT(owner, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
 
 	/* Clear the flag, drop the count, and hand td to setrunnable(). */
 	TD_CLR_SUSPENDED(td);
 	owner->p_suspcount -= 1;
 	setrunnable(td);
 }
 
 /*
  * Allow all threads blocked by single threading to continue running.
  */
 void
 thread_unsuspend(struct proc *p)
 {
 	struct thread *cur;
 	struct thread *single;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	if (!P_SHOULDSTOP(p)) {
 		/* No stop request is pending: release every suspended thread. */
 		FOREACH_THREAD_IN_PROC(p, cur) {
 			thread_lock(cur);
 			if (TD_IS_SUSPENDED(cur))
 				thread_unsuspend_one(cur);
 			thread_unlock(cur);
 		}
 		return;
 	}
 
 	/* Some stop is pending; only the single-threading case matters. */
 	if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE)
 		return;
 	if (p->p_numthreads != p->p_suspcount)
 		return;
 
 	/*
 	 * Stopping everything also satisfied the single-threading request.
 	 * Now that the process is effectively single-threaded, let the
 	 * single-threading thread continue.
 	 */
 	single = p->p_singlethread;
 	thread_lock(single);
 	thread_unsuspend_one(single);
 	thread_unlock(single);
 }
 
 /*
  * End the single threading mode..
  */
 void
 thread_single_end(void)
 {
 	struct thread *iter;
 	struct proc *p;
 
 	p = curthread->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* Drop every single-threading flag and forget the single threader. */
 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY);
 	PROC_SLOCK(p);
 	p->p_singlethread = NULL;
 
 	/*
 	 * Any other threads may now run, unless a blanket 'stop order' is
 	 * still in force on the process.  The single threader itself must
 	 * be allowed to continue regardless, as this is a bad place to
 	 * stop.
 	 */
 	if (p->p_numthreads == 1 || P_SHOULDSTOP(p)) {
 		PROC_SUNLOCK(p);
 		return;
 	}
 
 	FOREACH_THREAD_IN_PROC(p, iter) {
 		thread_lock(iter);
 		if (TD_IS_SUSPENDED(iter))
 			thread_unsuspend_one(iter);
 		thread_unlock(iter);
 	}
 	PROC_SUNLOCK(p);
 }
 
 /*
  * Look up the thread of process p whose td_tid equals tid.
  * The proc mutex must be held; the proc spinlock is taken for the walk.
  * Returns the matching thread, or NULL if none matches.
  */
 struct thread *
 thread_find(struct proc *p, lwpid_t tid)
 {
 	struct thread *cur;
 	struct thread *found;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	found = NULL;
 	PROC_SLOCK(p);
 	FOREACH_THREAD_IN_PROC(p, cur) {
 		if (cur->td_tid != tid)
 			continue;
 		found = cur;
 		break;
 	}
 	PROC_SUNLOCK(p);
 
 	return (found);
 }
Index: head/sys/pc98/pc98/machdep.c
===================================================================
--- head/sys/pc98/pc98/machdep.c	(revision 173360)
+++ head/sys/pc98/pc98/machdep.c	(revision 173361)
@@ -1,2800 +1,2800 @@
 /*-
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_atalk.h"
 #include "opt_compat.h"
 #include "opt_cpu.h"
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_ipx.h"
 #include "opt_isa.h"
 #include "opt_kstack_pages.h"
 #include "opt_maxmem.h"
 #include "opt_msgbuf.h"
 #include "opt_npx.h"
 #include "opt_perfmon.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
 #include <sys/clock.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/memrange.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #ifdef DDB
 #ifndef KDB
 #error KDB must be enabled in order for DDB to work!
 #endif
 #include <ddb/ddb.h>
 #include <ddb/db_sym.h>
 #endif
 
 #include <pc98/pc98/pc98_machdep.h>
 
 #include <net/netisr.h>
 
 #include <machine/bootinfo.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/pc/bios.h>
 #include <machine/pcb.h>
 #include <machine/pcb_ext.h>
 #include <machine/proc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/specialreg.h>
 #include <machine/vm86.h>
 #ifdef PERFMON
 #include <machine/perfmon.h>
 #endif
 #ifdef SMP
 #include <machine/privatespace.h>
 #include <machine/smp.h>
 #endif
 
 #ifdef DEV_ISA
 #include <i386/isa/icu.h>
 #endif
 
 /* Sanity check for __curthread() */
 /* pc_curthread must be the first member of struct pcpu. */
 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
 
 extern void init386(int first);
 extern void dblfault_handler(void);
 
 extern void printcpuinfo(void);	/* XXX header file */
 extern void finishidentcpu(void);
 extern void panicifcpuunsupported(void);
 extern void initializecpu(void);
 
 /* True when the selector's privilege level (ISPL) equals SEL_UPL. */
 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
 /* True when ef and oef differ only in bits covered by PSL_USERCHANGE. */
 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 
 /* SSE support is compiled in for I686_CPU unless CPU_DISABLE_SSE is set. */
 #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
 #define CPU_ENABLE_SSE
 #endif
 
 static void cpu_startup(void *);
 static void fpstate_drop(struct thread *td);
 static void get_fpcontext(struct thread *td, mcontext_t *mcp);
 static int  set_fpcontext(struct thread *td, const mcontext_t *mcp);
 #ifdef CPU_ENABLE_SSE
 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
 #endif /* CPU_ENABLE_SSE */
 /* Register cpu_startup() to run at SI_SUB_CPU, SI_ORDER_FIRST. */
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 int	need_pre_dma_flush;	/* If 1, use wbinvd before DMA transfer. */
 int	need_post_dma_flush;	/* If 1, use invd after DMA transfer. */
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 int	_udatasel, _ucodesel;
 u_int	basemem;
 
 /* Exported read-only as the machdep.ispc98 sysctl. */
 static int	ispc98 = 1;
 SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, "");
 
 int cold = 1;
 
 #ifdef COMPAT_43
 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 #ifdef COMPAT_FREEBSD4
 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
 #endif
 
 /* Maxmem is printed via ptoa() and copied into realmem by cpu_startup(). */
 long Maxmem = 0;
 long realmem = 0;
 
 /*
  * The number of PHYSMAP entries must be one less than the number of
  * PHYSSEG entries because the PHYSMAP entry that spans the largest
  * physical address that is accessible by ISA DMA is split into two
  * PHYSSEG entries.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 
 /* Two extra slots leave room for the terminating 0, 0 pair. */
 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
 
 /* must be 2 less so 0 0 can signal end of chunks */
 #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
 #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
 
 struct kva_md_info kmi;
 
 static struct trapframe proc0_tf;
 #ifndef SMP
 static struct pcpu __pcpu;
 #endif
 
 struct mtx icu_lock;
 
 struct mem_range_softc mem_range_softc;
 
 /*
  * Startup hook registered through SYSINIT (SI_SUB_CPU, SI_ORDER_FIRST).
  * Starts the clock, reports CPU and memory information on the console,
  * then initializes the kernel submaps and buffer subsystem before calling
  * cpu_setregs().  The "dummy" argument is unused.
  */
 static void
 cpu_startup(dummy)
 	void *dummy;
 {
 	/*
 	 * Good {morning,afternoon,evening,night}.
 	 */
 	startrtclock();
 	printcpuinfo();
 	panicifcpuunsupported();
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	/* Maxmem is converted with ptoa() for the byte and MB figures. */
 	printf("real memory  = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem),
 	    ptoa((uintmax_t)Maxmem) / 1048576);
 	realmem = Maxmem;
 	/*
 	 * Display any holes after the first chunk of extended memory.
 	 */
 	if (bootverbose) {
 		int indx;
 
 		printf("Physical memory chunk(s):\n");
 		/*
 		 * phys_avail[] holds (start, end) pairs; stop at the first
 		 * pair whose end address is 0.
 		 */
 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
 			vm_paddr_t size;
 
 			size = phys_avail[indx + 1] - phys_avail[indx];
 			/* The end address is printed inclusively (end - 1). */
 			printf(
 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
 			    (uintmax_t)phys_avail[indx],
 			    (uintmax_t)phys_avail[indx + 1] - 1,
 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	/* Free page count, again converted with ptoa(). */
 	printf("avail memory = %ju (%ju MB)\n",
 	    ptoa((uintmax_t)cnt.v_free_count),
 	    ptoa((uintmax_t)cnt.v_free_count) / 1048576);
 
 	/*
 	 * Set up buffers, so they can be used to read disk labels.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	cpu_setregs();
 }
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * at top to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 #ifdef COMPAT_43
 /*
  * Deliver a signal using the old (osigframe) frame layout.
  * Entered with the proc lock and the sigacts mutex held (asserted); both
  * are dropped while the frame is built and copied out, and both are held
  * again on return.
  */
 static void
 osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct osigframe sf, *fp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	/* Remember whether we were already on the signal stack. */
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Place the frame at the very top of the alternate stack. */
 		fp = (struct osigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct osigframe));
 		/*
 		 * NOTE(review): this inner test is always true here, since
 		 * the whole function is already inside #ifdef COMPAT_43.
 		 */
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* Otherwise one osigframe below the current user %esp. */
 		fp = (struct osigframe *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* Pointers stored in sf refer to the user-space copy at fp. */
 	sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_arg2 = (register_t)&fp->sf_siginfo;
 		sf.sf_siginfo.si_signo = sig;
 		sf.sf_siginfo.si_code = ksi->ksi_code;
 		sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_arg2 = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Done with psp and p state: drop both locks before copyout(). */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Save most if not all of trap frame. */
 	sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
 	sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
 	sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
 	sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
 	sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
 	sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
 	sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
 	sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
 	sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
 	sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
 	sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
 	/* gs comes from rgs() rather than from a trapframe field. */
 	sf.sf_siginfo.si_sc.sc_gs = rgs();
 	sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
 
 	/* Build the signal context to be used by osigreturn(). */
 	sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
 	SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
 	sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
 	sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
 	sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
 	sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
 	sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
 	sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
 		sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
 		sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
 		sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
 
 		/*
 		 * Without VME, take PSL_VIF/PSL_VIP from the emulated
 		 * eflags kept in the vm86 state instead of the trapframe.
 		 */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_siginfo.si_sc.sc_ps =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/* See sendsig() for comments. */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, fp, sizeof(*fp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		/* sigexit() is called with the proc lock held. */
 		/* NOTE(review): presumably sigexit() does not return -- confirm. */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Redirect the thread: stack at the new frame, pc at the old
 	 * signal trampoline (PS_STRINGS - szosigcode), trace flag cleared,
 	 * and the user code/data selectors loaded.
 	 */
 	regs->tf_esp = (int)fp;
 	regs->tf_eip = PS_STRINGS - szosigcode;
 	regs->tf_eflags &= ~PSL_T;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	load_gs(_udatasel);
 	regs->tf_ss = _udatasel;
 	/* Restore the lock state the caller expects. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * Deliver a signal using the FreeBSD 4 (sigframe4) frame layout.
  * Entered with the proc lock and the sigacts mutex held (asserted); both
  * are dropped while the frame is copied out, and both are held again on
  * return.
  */
 static void
 freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe4 sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	/* Remember whether we were already on the signal stack. */
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	/* SS_DISABLE without an alternate stack, else SS_ONSTACK if on it. */
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/*
 	 * Copy the whole trapframe into the mcontext starting at mc_fs.
 	 * NOTE(review): presumably relies on the mcontext fields from mc_fs
 	 * onward mirroring struct trapframe -- confirm.
 	 */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Place the frame at the very top of the alternate stack. */
 		sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe4));
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		/* Otherwise one sigframe4 below the current user %esp. */
 		sfp = (struct sigframe4 *)regs->tf_esp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	/* Pointers stored in sf refer to the user-space copy at sfp. */
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_code = ksi->ksi_code;
 		sf.sf_si.si_addr = ksi->ksi_addr;
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Done with psp and p state: drop both locks before copyout(). */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		/*
 		 * Without VME, take PSL_VIF/PSL_VIP from the emulated
 		 * eflags kept in the vm86 state instead of the trapframe.
 		 */
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		/* sigexit() is called with the proc lock held. */
 		/* NOTE(review): presumably sigexit() does not return -- confirm. */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Redirect the thread: stack at the new frame, pc at the FreeBSD 4
 	 * signal trampoline (PS_STRINGS - szfreebsd4_sigcode), trace flag
 	 * cleared, and the user code/data selectors loaded.
 	 * NOTE(review): unlike osendsig(), there is no load_gs(_udatasel)
 	 * call here -- confirm whether that is intentional.
 	 */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
 	regs->tf_eflags &= ~PSL_T;
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Restore the lock state the caller expects. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Deliver signal ksi->ksi_signo to the current thread by building a
  * struct sigframe on the user stack and redirecting the thread's
  * trapframe to the signal trampoline.
  *
  * catcher is the user handler, ksi carries the signal information and
  * mask is the signal mask to be restored when the handler returns.
  *
  * Must be called with the process lock and the sigacts mutex held (both
  * are asserted).  Both are dropped while the frame is copied out and are
  * re-acquired before returning.  Signals registered through the FreeBSD 4
  * or 4.3BSD compatibility interfaces are handed to freebsd4_sendsig() or
  * osendsig() instead.  If the frame cannot be copied out, sigexit() is
  * called with SIGILL.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct sigframe sf, *sfp;
 	struct proc *p;
 	struct thread *td;
 	struct sigacts *psp;
 	char *sp;
 	struct trapframe *regs;
 	int sig;
 	int oonstack;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 #ifdef COMPAT_FREEBSD4
 	if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
 		freebsd4_sendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 #ifdef COMPAT_43
 	if (SIGISMEMBER(psp->ps_osigset, sig)) {
 		osendsig(catcher, ksi, mask);
 		return;
 	}
 #endif
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_esp);
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 	sf.sf_uc.uc_mcontext.mc_gs = rgs();
 	/* The trapframe is copied wholesale into the mcontext from mc_fs on. */
 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
 	get_fpcontext(td, &sf.sf_uc.uc_mcontext);
 	fpstate_drop(td);
 
 	/* Allocate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sp = td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe);
 #if defined(COMPAT_43)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 #endif
 	} else
 		sp = (char *)regs->tf_esp - sizeof(struct sigframe);
 	/* Align to 16 bytes. */
 	sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	sf.sf_signum = sig;
 	sf.sf_ucontext = (register_t)&sfp->sf_uc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		sf.sf_siginfo = (register_t)&sfp->sf_si;
 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
 
 		/* Fill in POSIX parts */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		sf.sf_siginfo = ksi->ksi_code;
 		sf.sf_addr = (register_t)ksi->ksi_addr;
 		sf.sf_ahu.sf_handler = catcher;
 	}
 	/* Drop both locks before touching user memory with copyout(). */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * If we're a vm86 process, we want to save the segment registers.
 	 * We also change eflags to be our emulated eflags, not the actual
 	 * eflags.
 	 */
 	if (regs->tf_eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 
 		sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
 		sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
 		sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
 		sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
 
 		if (vm86->vm86_has_vme == 0)
 			sf.sf_uc.uc_mcontext.mc_eflags =
 			    (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
 			    (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
 
 		/*
 		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
 		 * syscalls made by the signal handler.  This just avoids
 		 * wasting time for our lazy fixup of such faults.  PSL_NT
 		 * does nothing in vm86 mode, but vm86 programs can set it
 		 * almost legitimately in probes for old cpu types.
 		 */
 		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
 	}
 
 	/*
 	 * Copy the sigframe out to the user's stack.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 #ifdef DEBUG
 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/* Point the thread at the signal trampoline below PS_STRINGS. */
 	regs->tf_esp = (int)sfp;
 	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
 	/*
 	 * Do not single-step into the handler, and clear the direction
 	 * flag: the i386 ABI requires DF to be clear on function entry and
 	 * compilers rely on that, so the handler must not inherit a set DF
 	 * from the interrupted code.  The interrupted eflags were already
 	 * saved in the sigframe above and are restored by sigreturn().
 	 */
 	regs->tf_eflags &= ~(PSL_T | PSL_D);
 	regs->tf_cs = _ucodesel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_ss = _udatasel;
 	/* Re-acquire the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * state to gain improper privileges.
  *
  * MPSAFE
  */
 #ifdef COMPAT_43
 /*
  * 4.3BSD-compatible signal return.  Copies in the struct osigcontext
  * pointed to by uap->sigcntxp, validates the requested eflags and %cs,
  * and loads the saved registers, signal-stack state and (old-format)
  * signal mask back into the thread.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if the
  * context is rejected, and EJUSTRETURN on success.
  */
 int
 osigreturn(td, uap)
 	struct thread *td;
 	struct osigreturn_args /* {
 		struct osigcontext *sigcntxp;
 	} */ *uap;
 {
 	struct osigcontext sc;
 	struct trapframe *regs;
 	struct osigcontext *scp;
 	struct proc *p = td->td_proc;
 	int eflags, error;
 	ksiginfo_t ksi;
 
 	regs = td->td_frame;
 	/* Work on a kernel copy so the checks below cannot be raced. */
 	error = copyin(uap->sigcntxp, &sc, sizeof(sc));
 	if (error != 0)
 		return (error);
 	scp = &sc;
 	eflags = scp->sc_ps;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		/*
 		 * NOTE(review): SIGBUS is posted here but the function does
 		 * not return; the context restore below still proceeds.
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/*
 		 * Only the bits in VME_USERCHANGE / VM_USERCHANGE are taken
 		 * from the user-supplied eflags; everything else comes from
 		 * the current trapframe, and PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		tf->tf_vm86_ds = scp->sc_ds;
 		tf->tf_vm86_es = scp->sc_es;
 		tf->tf_vm86_fs = scp->sc_fs;
 		tf->tf_vm86_gs = scp->sc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		/*
 		 * XXX do allow users to change the privileged flag PSL_RF.
 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 		 * should sometimes set it there too.  tf_eflags is kept in
 		 * the signal context during signal handling and there is no
 		 * other place to remember it, so the PSL_RF bit may be
 		 * corrupted by the signal handler without us knowing.
 		 * Corruption of the PSL_RF bit at worst causes one more or
 		 * one less debugger trap, so allowing it is fairly harmless.
 		 */
 		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		if (!CS_SECURE(scp->sc_cs)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 		regs->tf_ds = scp->sc_ds;
 		regs->tf_es = scp->sc_es;
 		regs->tf_fs = scp->sc_fs;
 	}
 
 	/* Restore remaining registers. */
 	regs->tf_eax = scp->sc_eax;
 	regs->tf_ebx = scp->sc_ebx;
 	regs->tf_ecx = scp->sc_ecx;
 	regs->tf_edx = scp->sc_edx;
 	regs->tf_esi = scp->sc_esi;
 	regs->tf_edi = scp->sc_edi;
 	regs->tf_cs = scp->sc_cs;
 	regs->tf_ss = scp->sc_ss;
 	regs->tf_isp = scp->sc_isp;
 	regs->tf_ebp = scp->sc_fp;
 	regs->tf_esp = scp->sc_sp;
 	regs->tf_eip = scp->sc_pc;
 	regs->tf_eflags = eflags;
 
 	/* Signal-stack state and mask are updated under the process lock. */
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	if (scp->sc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 	SIGSETOLD(td->td_sigmask, scp->sc_mask);
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (EJUSTRETURN);
 }
 #endif /* COMPAT_43 */
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatible signal return.  Copies in the struct ucontext4
  * pointed to by uap->sigcntxp, validates the requested eflags and %cs,
  * and restores the trapframe, signal-stack state and signal mask from it.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if the
  * context is rejected, and EJUSTRETURN on success.
  *
  * MPSAFE
  */
 int
 freebsd4_sigreturn(td, uap)
 	struct thread *td;
 	struct freebsd4_sigreturn_args /* {
 		const ucontext4 *sigcntxp;
 	} */ *uap;
 {
 	struct ucontext4 uc;
 	struct proc *p = td->td_proc;
 	struct trapframe *regs;
 	const struct ucontext4 *ucp;
 	int cs, eflags, error;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so the checks below cannot be raced. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		/*
 		 * NOTE(review): SIGBUS is posted here but the function does
 		 * not return; the context restore below still proceeds.
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 		/*
 		 * Only the bits in VME_USERCHANGE / VM_USERCHANGE are taken
 		 * from the user-supplied eflags; PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load the whole saved trapframe, then replace eflags with
 		 * the sanitized value and move the saved data segment
 		 * values into the vm86 slots, leaving the protected-mode
 		 * slots holding the user data selector.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		/*
 		 * XXX do allow users to change the privileged flag PSL_RF.
 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 		 * should sometimes set it there too.  tf_eflags is kept in
 		 * the signal context during signal handling and there is no
 		 * other place to remember it, so the PSL_RF bit may be
 		 * corrupted by the signal handler without us knowing.
 		 * Corruption of the PSL_RF bit at worst causes one more or
 		 * one less debugger trap, so allowing it is fairly harmless.
 		 */
 		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 			printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			printf("freebsd4_sigreturn: cs = 0x%x\n", cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/* Checks passed: load the saved trapframe wholesale. */
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 	/* Signal-stack state and mask are updated under the process lock. */
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	td->td_sigmask = ucp->uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (EJUSTRETURN);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 /*
  * Native signal return.  Copies in the ucontext_t pointed to by
  * uap->sigcntxp, validates the requested eflags and %cs, and restores the
  * FPU context (non-vm86 case only), trapframe, signal-stack state and
  * signal mask from it.
  *
  * Returns the copyin() error if the context cannot be read, EINVAL if the
  * context is rejected, the set_fpcontext() error if the FPU state is
  * refused, and EJUSTRETURN on success.
  *
  * MPSAFE
  */
 int
 sigreturn(td, uap)
 	struct thread *td;
 	struct sigreturn_args /* {
 		const struct __ucontext *sigcntxp;
 	} */ *uap;
 {
 	ucontext_t uc;
 	struct proc *p = td->td_proc;
 	struct trapframe *regs;
 	const ucontext_t *ucp;
 	int cs, eflags, error, ret;
 	ksiginfo_t ksi;
 
 	/* Work on a kernel copy so the checks below cannot be raced. */
 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
 	if (error != 0)
 		return (error);
 	ucp = &uc;
 	regs = td->td_frame;
 	eflags = ucp->uc_mcontext.mc_eflags;
 	if (eflags & PSL_VM) {
 		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
 		struct vm86_kernel *vm86;
 
 		/*
 		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
 		 * set up the vm86 area, and we can't enter vm86 mode.
 		 */
 		if (td->td_pcb->pcb_ext == 0)
 			return (EINVAL);
 		vm86 = &td->td_pcb->pcb_ext->ext_vm86;
 		if (vm86->vm86_inited == 0)
 			return (EINVAL);
 
 		/* Go back to user mode if both flags are set. */
 		/*
 		 * NOTE(review): SIGBUS is posted here but the function does
 		 * not return; the context restore below still proceeds.
 		 */
 		if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 		}
 
 		/*
 		 * Only the bits in VME_USERCHANGE / VM_USERCHANGE are taken
 		 * from the user-supplied eflags; PSL_VM is forced on.
 		 */
 		if (vm86->vm86_has_vme) {
 			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
 			    (eflags & VME_USERCHANGE) | PSL_VM;
 		} else {
 			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
 			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
 			    (eflags & VM_USERCHANGE) | PSL_VM;
 		}
 		/*
 		 * Load the whole saved trapframe, then replace eflags with
 		 * the sanitized value and move the saved data segment
 		 * values into the vm86 slots, leaving the protected-mode
 		 * slots holding the user data selector.
 		 */
 		bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
 		tf->tf_eflags = eflags;
 		tf->tf_vm86_ds = tf->tf_ds;
 		tf->tf_vm86_es = tf->tf_es;
 		tf->tf_vm86_fs = tf->tf_fs;
 		tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
 		tf->tf_ds = _udatasel;
 		tf->tf_es = _udatasel;
 		tf->tf_fs = _udatasel;
 	} else {
 		/*
 		 * Don't allow users to change privileged or reserved flags.
 		 */
 		/*
 		 * XXX do allow users to change the privileged flag PSL_RF.
 		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
 		 * should sometimes set it there too.  tf_eflags is kept in
 		 * the signal context during signal handling and there is no
 		 * other place to remember it, so the PSL_RF bit may be
 		 * corrupted by the signal handler without us knowing.
 		 * Corruption of the PSL_RF bit at worst causes one more or
 		 * one less debugger trap, so allowing it is fairly harmless.
 		 */
 		if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
 			printf("sigreturn: eflags = 0x%x\n", eflags);
 	    		return (EINVAL);
 		}
 
 		/*
 		 * Don't allow users to load a valid privileged %cs.  Let the
 		 * hardware check for invalid selectors, excess privilege in
 		 * other selectors, invalid %eip's and invalid %esp's.
 		 */
 		cs = ucp->uc_mcontext.mc_cs;
 		if (!CS_SECURE(cs)) {
 			printf("sigreturn: cs = 0x%x\n", cs);
 			ksiginfo_init_trap(&ksi);
 			ksi.ksi_signo = SIGBUS;
 			ksi.ksi_code = BUS_OBJERR;
 			ksi.ksi_trapno = T_PROTFLT;
 			ksi.ksi_addr = (void *)regs->tf_eip;
 			trapsignal(td, &ksi);
 			return (EINVAL);
 		}
 
 		/*
 		 * Restore the FPU state first so that a rejected FPU
 		 * context leaves the trapframe untouched.
 		 */
 		ret = set_fpcontext(td, &ucp->uc_mcontext);
 		if (ret != 0)
 			return (ret);
 		bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
 	}
 
 	/* Signal-stack state and mask are updated under the process lock. */
 	PROC_LOCK(p);
 #if defined(COMPAT_43)
 	if (ucp->uc_mcontext.mc_onstack & 1)
 		td->td_sigstk.ss_flags |= SS_ONSTACK;
 	else
 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
 #endif
 
 	td->td_sigmask = ucp->uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 	return (EJUSTRETURN);
 }
 
 /*
  * Machine-dependent boot() hook.
  *
  * Intentionally empty: nothing machine-specific has needed to go here
  * yet.  Work could be moved back here from boot() if that changes.
  * The howto argument is accepted but not used.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Estimate the current clock frequency of the CPU identified by cpu_id
  * and store it, in Hz, in *rate.
  *
  * Returns EINVAL if cpu_id does not name a CPU or rate is NULL,
  * EOPNOTSUPP if no TSC is present, and 0 on success.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	uint64_t tsc_start, tsc_ticks;
 	register_t saved_intr;
 
 	if (pcpu_find(cpu_id) == NULL || rate == NULL)
 		return (EINVAL);
 	if (!tsc_present)
 		return (EOPNOTSUPP);
 
 	/* While still booting, reuse the freshly calibrated value. */
 	if (cold) {
 		*rate = tsc_freq;
 		return (0);
 	}
 
 #ifdef SMP
 	/* Move to the requested cpu so that its TSC is the one sampled. */
 	thread_lock(curthread);
 	sched_bind(curthread, cpu_id);
 	thread_unlock(curthread);
 #endif
 
 	/* Count TSC ticks across a 1000us delay with interrupts off. */
 	saved_intr = intr_disable();
 	tsc_start = rdtsc();
 	DELAY(1000);
 	tsc_ticks = rdtsc() - tsc_start;
 	intr_restore(saved_intr);
 
 #ifdef SMP
 	thread_lock(curthread);
 	sched_unbind(curthread);
 	thread_unlock(curthread);
 #endif
 
 	/*
 	 * Scale the tick count for the 1ms window up to ticks per second
 	 * (Hz), then take off 0.5% of the total.  Empirical testing has
 	 * shown that overhead in DELAY() works out to approximately this
 	 * value.
 	 */
 	*rate = tsc_ticks * 1000 - tsc_ticks * 5;
 	return (0);
 }
 
 /*
  * Park the CPU: execute HLT in an endless loop.  Never returns; each time
  * the processor resumes after HLT it is halted again.
  */
 void
 cpu_halt(void)
 {
 	while (1) {
 		__asm__ ("hlt");
 	}
 }
 
 /*
  * Hook to idle the CPU when possible.  In the SMP case we default to
  * off because a halted cpu will not currently pick up a new thread in the
  * run queue until the next timer tick.  If turned on this will result in
  * approximately a 4.2% loss in real time performance in buildworld tests
  * (but improves user and sys times oddly enough), and saves approximately
  * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3).
  *
  * XXX we need to have a cpu mask of idle cpus and generate an IPI or
  * otherwise generate some sort of interrupt to wake up cpus sitting in HLT.
  * Then we can have our cake and eat it too.
  *
  * XXX I'm turning it on for SMP as well by default for now.  It seems to
  * help lock contention somewhat, and this is critical for HTT. -Peter
  */
 /* Non-zero (the default) lets cpu_idle() halt the CPU when idle. */
 static int	cpu_idle_hlt = 1;
 /* Exposed as the machdep.cpu_idle_hlt tunable and read/write sysctl. */
 TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt);
 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
     &cpu_idle_hlt, 0, "Idle loop HLT enable");
 
 /*
  * Default idle hook: enable interrupts and halt until the next one.
  * This is the initial value of cpu_idle_hook.
  */
 static void
 cpu_idle_default(void)
 {
 	/*
 	 * We must absolutely guarantee that hlt is the very next
 	 * instruction after sti, or we introduce a timing window.
 	 * Both are therefore emitted from a single asm statement.
 	 */
 	__asm __volatile("sti; hlt");
 }
 
 /*
  * Idle the current CPU, if halting is enabled via cpu_idle_hlt.
  *
  * The runnable check is made with interrupts disabled to avoid a race
  * between checking sched_runnable() and actually halting.  Without this
  * we could waste the time between hlt and the next interrupt even though
  * there is a runnable process.  If work is pending, interrupts are simply
  * re-enabled; otherwise the idle hook is invoked with interrupts still
  * disabled.
  */
 void
 cpu_idle(void)
 {
 
 #ifdef SMP
 	if (mp_grab_cpu_hlt())
 		return;
 #endif
 
 	/* Halting disabled: return without idling. */
 	if (!cpu_idle_hlt)
 		return;
 
 	disable_intr();
 	if (!sched_runnable()) {
 		(*cpu_idle_hook)();
 		return;
 	}
 	enable_intr();
 }
 
 /*
  * Idle routine called by cpu_idle() when nothing is runnable.  It is
  * entered with interrupts disabled.  Defaults to cpu_idle_default();
  * other subsystems (e.g., ACPI) can hook this later.
  */
 void (*cpu_idle_hook)(void) = cpu_idle_default;
 
 /*
  * Reset the machine state of thread td for a freshly exec'd image.
  *
  * entry is the user-mode entry point, stack the initial user stack
  * pointer, and ps_strings the value handed to the new image in %ebx.
  * The trapframe is wiped and rebuilt with user selectors.  Any per-process
  * LDT, hardware debug register state, math-emulator state and FPU state
  * left over from the old image is discarded.
  *
  * The trace flag (PSL_T) of the old image is carried over to the new one.
  * It must be sampled before the trapframe is zeroed: reading tf_eflags
  * after the bzero() always yields 0 and silently drops the flag.
  */
 void
 exec_setregs(td, entry, stack, ps_strings)
 	struct thread *td;
 	u_long entry;
 	u_long stack;
 	u_long ps_strings;
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 	int saved_eflags;
 
 	/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
 	pcb->pcb_gs = _udatasel;
 	load_gs(_udatasel);
 
 	/*
 	 * Free a per-process LDT if there is one.  Only the else branch
 	 * unlocks dt_lock here, so user_ldt_free() is expected to drop it.
 	 */
 	mtx_lock_spin(&dt_lock);
 	if (td->td_proc->p_md.md_ldt)
 		user_ldt_free(td);
 	else
 		mtx_unlock_spin(&dt_lock);
 
 	/* Remember the trace flag before the trapframe is wiped. */
 	saved_eflags = regs->tf_eflags & PSL_T;
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_eip = entry;
 	regs->tf_esp = stack;
 	regs->tf_eflags = PSL_USER | saved_eflags;
 	regs->tf_ss = _udatasel;
 	regs->tf_ds = _udatasel;
 	regs->tf_es = _udatasel;
 	regs->tf_fs = _udatasel;
 	regs->tf_cs = _ucodesel;
 
 	/* PS_STRINGS value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
 	regs->tf_ebx = ps_strings;
 
 	/*
 	 * Reset the hardware debug registers if they were in use.
 	 * They won't have any meaning for the newly exec'd process.
 	 */
 	if (pcb->pcb_flags & PCB_DBREGS) {
 		pcb->pcb_dr0 = 0;
 		pcb->pcb_dr1 = 0;
 		pcb->pcb_dr2 = 0;
 		pcb->pcb_dr3 = 0;
 		pcb->pcb_dr6 = 0;
 		pcb->pcb_dr7 = 0;
 		if (pcb == PCPU_GET(curpcb)) {
 			/*
 			 * Clear the debug registers on the running
 			 * CPU, otherwise they will end up affecting
 			 * the next process we switch to.
 			 */
 			reset_dbregs();
 		}
 		pcb->pcb_flags &= ~PCB_DBREGS;
 	}
 
 	/*
 	 * Initialize the math emulator (if any) for the current process.
 	 * Actually, just clear the bit that says that the emulator has
 	 * been initialized.  Initialization is delayed until the process
 	 * traps to the emulator (if it is done at all) mainly because
 	 * emulators don't provide an entry point for initialization.
 	 */
 	pcb->pcb_flags &= ~FP_SOFTFP;
 
 	/*
 	 * Drop the FP state if we hold it, so that the process gets a
 	 * clean FP state if it uses the FPU again.
 	 */
 	fpstate_drop(td);
 
 	/*
 	 * XXX - Linux emulator
 	 * Make sure edx is 0x0 on entry. Linux binaries depend
 	 * on it.
 	 */
 	td->td_retval[1] = 0;
 }
 
 /*
  * Program the control-register bits the kernel relies on and load the
  * user data selector into %gs.
  */
 void
 cpu_setregs(void)
 {
 	unsigned int wanted;
 
 	/*
 	 * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
 	 *
 	 * All ESC (i.e., NPX) and WAIT instructions are made to trap.
 	 * CR0_MP must be set and CR0_TS used to control the trap, because
 	 * CR0_EM alone does not make WAIT instructions trap.  Trapping
 	 * WAIT matters: otherwise the "wait" variants of no-wait control
 	 * instructions would degenerate to the "no-wait" variants after
 	 * FP context switches but work correctly otherwise.  It matters
 	 * most when there is no NPX at all, since the "wait" variants
 	 * would then always degenerate.
 	 *
 	 * CR0_NE is tried in order to get correct error reporting on
 	 * 486DX's; setting it should fail or do nothing on lesser
 	 * processors.
 	 *
 	 * CR0_WP and CR0_AM are turned on in the same write.
 	 */
 	wanted = CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
 	load_cr0(rcr0() | wanted);
 	load_gs(_udatasel);
 }
 
 /*
  * Best guess at the boot device, exported read-only as
  * machdep.guessed_bootdev.  This is not a struct cdev *; the encoding
  * is different.
  */
 u_long bootdev;		/* not a struct cdev *- encoding is different */
 SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
 	CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
 
 /*
  * Initialize 386 and configure to run kernel
  */
 
 /*
  * Initialize segments & interrupt table
  */
 
 int _default_ldt;			/* NOTE(review): set outside this chunk */
 /* Sized NGDT * MAXCPU; presumably one set of NGDT entries per CPU. */
 union descriptor gdt[NGDT * MAXCPU];	/* global descriptor table */
 static struct gate_descriptor idt0[NIDT];	/* initial storage for the IDT */
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 union descriptor ldt[NLDT];		/* local descriptor table */
 struct region_descriptor r_gdt, r_idt;	/* table descriptors */
 struct mtx dt_lock;			/* lock for GDT and LDT */
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
 #endif
 
 /* TSS used as the base of the GPANIC_SEL ("Panic Tss") GDT entry. */
 static struct i386tss dblfault_tss;
 /* One page; presumably the stack for dblfault_tss (set up elsewhere). */
 static char dblfault_stack[PAGE_SIZE];
 
 extern  vm_offset_t	proc0kstack;
 
 
 /*
  * software prototypes -- in more palatable form.
  *
  * Template for the global descriptor table, one entry per selector, in
  * selector-index order.  Each initializer lists, in order: base address,
  * limit, segment type, descriptor privilege level, present bit, two
  * fields that are zero in every entry (presumably reserved; their names
  * are not visible here), default operand size and limit granularity.
  *
  * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
  * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
  */
 struct soft_segment_descriptor gdt_segs[] = {
 /* GNULL_SEL	0 Null Descriptor */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GPRIV_SEL	1 SMP Per-Processor Private Data Descriptor */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUFS_SEL	2 %fs Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUGS_SEL	3 %gs Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GCODE_SEL	4 Code Descriptor for kernel */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GDATA_SEL	5 Data Descriptor for kernel */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUCODE_SEL	6 Code Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GUDATA_SEL	7 Data Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
 {	0x400,			/* segment base address */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
 {
 	0x0,			/* segment base address */
 	sizeof(struct i386tss)-1,/* length - size of one TSS */
 	SDT_SYS386TSS,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GLDT_SEL	10 LDT Descriptor */
 {	(int) ldt,		/* segment base address  */
 	sizeof(ldt)-1,		/* length - size of the ldt[] array */
 	SDT_SYSLDT,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GUSERLDT_SEL	11 User LDT Descriptor per process */
 {	(int) ldt,		/* segment base address  */
 	(512 * sizeof(union descriptor)-1),		/* length */
 	SDT_SYSLDT,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GPANIC_SEL	12 Panic Tss Descriptor */
 {	(int) &dblfault_tss,	/* segment base address  */
 	sizeof(struct i386tss)-1,/* length - size of one TSS */
 	SDT_SYS386TSS,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* unused - default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
 {	0,			/* segment base address (overwritten)  */
 	0xfffff,		/* length */
 	SDT_MEMERA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
 {	0,			/* segment base address (overwritten)  */
 	0xfffff,		/* length */
 	SDT_MEMERA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
 {	0,			/* segment base address (overwritten) */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
 {	0,			/* segment base address (overwritten) */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
 {	0,			/* segment base address (overwritten) */
 	0xfffff,		/* length */
 	SDT_MEMRWA,		/* segment type */
 	0,			/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 /* GNDIS_SEL	18 NDIS Descriptor */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 };
 
 /*
  * Template for the default local descriptor table.  Field order within
  * each initializer is the same as in gdt_segs: base, limit, type,
  * privilege level, present bit, two always-zero fields, default operand
  * size and limit granularity.  Only the user code and user data entries
  * are populated; the rest are null placeholders.
  */
 static struct soft_segment_descriptor ldt_segs[] = {
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Code Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMERA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 	/* Null Descriptor - overwritten by call gate */
 {	0x0,			/* segment base address  */
 	0x0,			/* length */
 	0,			/* segment type */
 	0,			/* segment descriptor privilege level */
 	0,			/* segment descriptor present */
 	0, 0,
 	0,			/* default 32 vs 16 bit size */
 	0  			/* limit granularity (byte/page units)*/ },
 	/* Data Descriptor for user */
 {	0x0,			/* segment base address  */
 	0xfffff,		/* length - all address space */
 	SDT_MEMRWA,		/* segment type */
 	SEL_UPL,		/* segment descriptor privilege level */
 	1,			/* segment descriptor present */
 	0, 0,
 	1,			/* default 32 vs 16 bit size */
 	1  			/* limit granularity (byte/page units)*/ },
 };
 
 /*
  * Fill in interrupt descriptor table slot idx as a present gate that
  * vectors to func through code selector selec, with gate type typ and
  * descriptor privilege level dpl.
  */
 void
 setidt(idx, func, typ, dpl, selec)
 	int idx;
 	inthand_t *func;
 	int typ;
 	int dpl;
 	int selec;
 {
 	struct gate_descriptor *gate = &idt[idx];
 	int handler = (int)func;
 
 	/* The handler address is split across the low and high offsets. */
 	gate->gd_looffset = handler;
 	gate->gd_selector = selec;
 	gate->gd_stkcpy = 0;
 	gate->gd_xx = 0;
 	gate->gd_type = typ;
 	gate->gd_dpl = dpl;
 	gate->gd_p = 1;
 	gate->gd_hioffset = handler >> 16;
 }
 
 /*
  * Low-level trap and system call entry points, named via IDTVEC().
  * They are defined outside this file (presumably in assembly); only
  * their addresses are used here.
  */
 extern inthand_t
 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
 	IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
 
 #ifdef DDB
 /*
  * Debugger command: display the index and function name of any IDT
  * entries that don't use the default 'rsvd' entry point.  Output stops
  * early if the debugger pager has been told to quit.
  */
 DB_SHOW_COMMAND(idt, db_show_idt)
 {
 	struct gate_descriptor *ip;
 	int idx;
 	uintptr_t func;
 
 	ip = idt;
 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
 		/*
 		 * Reassemble the handler address from its two halves.
 		 * The high half is widened before shifting so the shift
 		 * is done in an unsigned type; kernel addresses have the
 		 * top bit set and must not be shifted into the sign bit
 		 * of a promoted int.
 		 */
 		func = ((uintptr_t)ip->gd_hioffset << 16) | ip->gd_looffset;
 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
 			db_printf("%3d\t", idx);
 			db_printsym(func, DB_STGY_PROC);
 			db_printf("\n");
 		}
 		ip++;
 	}
 }
 #endif
 
 /*
  * Unpack the hardware segment descriptor `sd' into the software
  * representation `ssd'.
  */
 void
 sdtossd(struct segment_descriptor *sd, struct soft_segment_descriptor *ssd)
 {
 
 	/* Attribute fields map one to one. */
 	ssd->ssd_type = sd->sd_type;
 	ssd->ssd_dpl = sd->sd_dpl;
 	ssd->ssd_p = sd->sd_p;
 	ssd->ssd_def32 = sd->sd_def32;
 	ssd->ssd_gran = sd->sd_gran;
 
 	/* Base and limit are each stored as two pieces in the hardware form. */
 	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
 }
 
 /*
  * Populate the (physmap) array with base/bound pairs describing the
  * available physical memory in the system, then test this memory and
  * build the phys_avail array describing the actually-available memory.
  * The dump_avail array is built in the same pass, and the message buffer
  * is carved out of (and mapped from) the end of the last usable chunk.
  *
  * If we cannot accurately determine the physical memory map, then use
  * value from the 0xE801 call, and failing that, the RTC.
  *
  * NOTE(review): no 0xE801 or RTC fallback is visible in this function; the
  * sizes used here come from pc98_getmemsize().  Confirm whether the
  * sentence above still applies.
  *
  * Total memory size may be set by the kernel environment variable
  * hw.physmem or the compile-time define MAXMEM.
  *
  * The argument `first' is handed to pmap_bootstrap() and is also used as
  * the upper bound of the kernel image when excluding pages from phys_avail.
  *
  * XXX first should be vm_paddr_t.
  */
 static void
 getmemsize(int first)
 {
 	int i, off, physmap_idx, pa_indx, da_indx;
 	int pg_n;
 	u_long physmem_tunable;
 	u_int extmem, under16;
 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
 	pt_entry_t *pte;
 	quad_t dcons_addr, dcons_size;
 
 	bzero(physmap, sizeof(physmap));
 
 	/*
 	 * XXX - some EPSON machines can't use PG_N, so the memory test
 	 * below maps pages without that bit on those models.
 	 */
 	pg_n = PG_N;
 	if (pc98_machine_type & M_EPSON_PC98) {
 		switch (epson_machine_id) {
 #ifdef WB_CACHE
 		default:
 #endif
 		case EPSON_PC486_HX:
 		case EPSON_PC486_HG:
 		case EPSON_PC486_HA:
 			pg_n = 0;
 			break;
 		}
 	}
 
 	/*
 	 * Perform "base memory" related probes & setup
 	 */
         under16 = pc98_getmemsize(&basemem, &extmem);
 	if (basemem > 640) {
 		printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
 			basemem);
 		basemem = 640;
 	}
 
 	/*
 	 * XXX if biosbasemem is now < 640, there is a `hole'
 	 * between the end of base memory and the start of
 	 * ISA memory.  The hole may be empty or it may
 	 * contain BIOS code or data.  Map it read/write so
 	 * that the BIOS can write to it.  (Memory from 0 to
 	 * the physical end of the kernel is mapped read-only
 	 * to begin with and then parts of it are remapped.
 	 * The parts that aren't remapped form holes that
 	 * remain read-only and are unused by the kernel.
 	 * The base memory area is below the physical end of
 	 * the kernel and right now forms a read-only hole.
 	 * The part of it from PAGE_SIZE to
 	 * (trunc_page(biosbasemem * 1024) - 1) will be
 	 * remapped and used by the kernel later.)
 	 *
 	 * This code is similar to the code used in
 	 * pmap_mapdev, but since no memory needs to be
 	 * allocated we simply change the mapping.
 	 */
 	for (pa = trunc_page(basemem * 1024);
 	     pa < ISA_HOLE_START; pa += PAGE_SIZE)
 		pmap_kenter(KERNBASE + pa, pa);
 
 	/*
 	 * if basemem != 640, map pages r/w into vm86 page table so 
 	 * that the bios can scribble on it.
 	 *
 	 * basemem is in kilobytes, so basemem / 4 is the index of the first
 	 * page above base memory and 160 is the page index of 640K
 	 * (assuming 4K pages -- TODO confirm against PAGE_SIZE).
 	 */
 	pte = (pt_entry_t *)vm86paddr;
 	for (i = basemem / 4; i < 160; i++)
 		pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 
 	/*
 	 * Start with two chunks: base memory from 0, and extended memory
 	 * starting at 1MB.  physmap_idx is the index of the last chunk's
 	 * base entry; its bound is at physmap_idx + 1.
 	 */
 	physmap[0] = 0;
 	physmap[1] = basemem * 1024;
 	physmap_idx = 2;
 	physmap[physmap_idx] = 0x100000;
 	physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
 
 	/*
 	 * Now, physmap contains a map of physical memory.
 	 */
 
 #ifdef SMP
 	/* make hole for AP bootstrap code */
 	physmap[1] = mp_bootaddress(physmap[1]);
 #endif
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".  We may adjust this 
 	 * based on ``hw.physmem'' and the results of the memory test.
 	 */
 	Maxmem = atop(physmap[physmap_idx + 1]);
 
 	/* The compile-time MAXMEM value is divided by 4 to yield pages. */
 #ifdef MAXMEM
 	Maxmem = MAXMEM / 4;
 #endif
 
 	/* The hw.physmem tunable, if set, overrides both of the above. */
 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
 		Maxmem = atop(physmem_tunable);
 
 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
 	    (boothowto & RB_VERBOSE))
 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
 
 	/*
 	 * If Maxmem has been increased beyond what the system has detected,
 	 * extend the last memory segment to the new limit.
 	 */ 
 	if (atop(physmap[physmap_idx + 1]) < Maxmem)
 		physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
 
 	/*
 	 * We need to divide chunk if Maxmem is larger than 16MB and
 	 * under 16MB area is not full of memory.
 	 * (1) system area (15-16MB region) is cut off
 	 * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY")
 	 */
 	if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
 		/* 15M - 16M region is cut off, so need to divide chunk */
 		physmap[physmap_idx + 1] = under16 * 1024;
 		physmap_idx += 2;
 		physmap[physmap_idx] = 0x1000000;
 		physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
 	}
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(first);
 
 	/*
 	 * Size up each available chunk of physical memory.
 	 *
 	 * phys_avail and dump_avail are filled as start/end pairs; both
 	 * begin as an empty range at PAGE_SIZE that the loop below extends.
 	 */
 	physmap[0] = PAGE_SIZE;		/* mask off page 0 */
 	pa_indx = 0;
 	da_indx = 1;
 	phys_avail[pa_indx++] = physmap[0];
 	phys_avail[pa_indx] = physmap[0];
 	dump_avail[da_indx] = physmap[0];
 	pte = CMAP1;
 
 	/*
 	 * Get dcons buffer address
 	 *
 	 * If either variable is missing, dcons_addr is zeroed, which
 	 * disables the dcons exclusion test in the loop below.
 	 */
 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
 	    getenv_quad("dcons.size", &dcons_size) == 0)
 		dcons_addr = 0;
 
 	/*
 	 * physmap is in bytes, so when converting to page boundaries,
 	 * round up the start address and round down the end address.
 	 */
 	for (i = 0; i <= physmap_idx; i += 2) {
 		vm_paddr_t end;
 
 		/* Never look beyond Maxmem, whatever the chunk claims. */
 		end = ptoa((vm_paddr_t)Maxmem);
 		if (physmap[i + 1] < end)
 			end = trunc_page(physmap[i + 1]);
 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
 			int tmp, page_bad, full;
 			int *ptr = (int *)CADDR1;
 
 			/* Set once phys_avail has no room for another chunk. */
 			full = FALSE;
 			/*
 			 * block out kernel memory as not available.
 			 */
 			if (pa >= KERNLOAD && pa < first)
 				goto do_dump_avail;
 
 			/*
 			 * block out dcons buffer
 			 */
 			if (dcons_addr > 0
 			    && pa >= trunc_page(dcons_addr)
 			    && pa < dcons_addr + dcons_size)
 				goto do_dump_avail;
 
 			page_bad = FALSE;
 
 			/*
 			 * map page into kernel: valid, read/write,non-cacheable
 			 */
 			*pte = pa | PG_V | PG_RW | pg_n;
 			invltlb();
 
 			/* Save the first word; only that word is tested. */
 			tmp = *(int *)ptr;
 			/*
 			 * Test for alternating 1's and 0's
 			 */
 			*(volatile int *)ptr = 0xaaaaaaaa;
 			if (*(volatile int *)ptr != 0xaaaaaaaa)
 				page_bad = TRUE;
 			/*
 			 * Test for alternating 0's and 1's
 			 */
 			*(volatile int *)ptr = 0x55555555;
 			if (*(volatile int *)ptr != 0x55555555)
 				page_bad = TRUE;
 			/*
 			 * Test for all 1's
 			 */
 			*(volatile int *)ptr = 0xffffffff;
 			if (*(volatile int *)ptr != 0xffffffff)
 				page_bad = TRUE;
 			/*
 			 * Test for all 0's
 			 */
 			*(volatile int *)ptr = 0x0;
 			if (*(volatile int *)ptr != 0x0)
 				page_bad = TRUE;
 			/*
 			 * Restore original value.
 			 */
 			*(int *)ptr = tmp;
 
 			/*
 			 * Adjust array of valid/good pages.
 			 * A bad page is left out of both phys_avail and
 			 * dump_avail.
 			 */
 			if (page_bad == TRUE)
 				continue;
 			/*
 			 * If this good page is a continuation of the
 			 * previous set of good pages, then just increase
 			 * the end pointer. Otherwise start a new chunk.
 			 * Note that "end" points one higher than end,
 			 * making the range >= start and < end.
 			 * If we're also doing a speculative memory
 			 * test and we at or past the end, bump up Maxmem
 			 * so that we keep going. The first bad page
 			 * will terminate the loop.
 			 */
 			if (phys_avail[pa_indx] == pa) {
 				phys_avail[pa_indx] += PAGE_SIZE;
 			} else {
 				pa_indx++;
 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
 					printf(
 		"Too many holes in the physical address space, giving up\n");
 					pa_indx--;
 					full = TRUE;
 					goto do_dump_avail;
 				}
 				phys_avail[pa_indx++] = pa;	/* start */
 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
 			}
 			physmem++;
 do_dump_avail:
 			/*
 			 * Record the page in dump_avail as well, coalescing
 			 * with the previous range when contiguous.  Kernel
 			 * and dcons pages skipped above also arrive here.
 			 */
 			if (dump_avail[da_indx] == pa) {
 				dump_avail[da_indx] += PAGE_SIZE;
 			} else {
 				da_indx++;
 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
 					da_indx--;
 					goto do_next;
 				}
 				dump_avail[da_indx++] = pa;	/* start */
 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
 			}
 do_next:
 			/* This leaves only the inner (per-page) loop. */
 			if (full)
 				break;
 		}
 	}
 	/* Tear down the temporary test mapping. */
 	*pte = 0;
 	invltlb();
 
 	/*
 	 * XXX
 	 * The last chunk must contain at least one page plus the message
 	 * buffer to avoid complicating other code (message buffer address
 	 * calculation, etc.).
 	 *
 	 * Chunks that are too small are dropped from the end of phys_avail
 	 * and their pages are subtracted from physmem.
 	 */
 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
 	    round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
 		phys_avail[pa_indx--] = 0;
 		phys_avail[pa_indx--] = 0;
 	}
 
 	Maxmem = atop(phys_avail[pa_indx]);
 
 	/* Trim off space for the message buffer. */
 	phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
 
 	/* Map the message buffer. */
 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
 		    off);
 }
 
 /*
  * Early machine-dependent initialization.
  *
  * In order, this sets up thread0's kernel stack and pcb, links proc0 and
  * thread0, locates the loader metadata and kernel environment, builds and
  * loads the GDT, initializes per-CPU data and the early mutexes, builds the
  * LDT and IDT, brings up the timer, console, interrupt controller and
  * debugger, installs the initial and double-fault TSS, sizes memory, and
  * finally prepares the system-call call gates, user selectors and proc0's
  * pcb.
  *
  * `first' is passed through unchanged to getmemsize().
  */
 void
 init386(first)
 	int first;
 {
 	struct gate_descriptor *gdp;
 	int gsel_tss, metadata_missing, x;
 	struct pcpu *pc;
 
 	/* thread0's pcb occupies the very top of its kernel stack. */
 	thread0.td_kstack = proc0kstack;
 	thread0.td_pcb = (struct pcb *)
 	   (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 
 	/*
  	 * This may be done better later if it gets more high level
  	 * components in it. If so just link td->td_proc here.
 	 */
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 
 	/*
 	 * Initialize DMAC
 	 */
 	pc98_init_dmac();
 
 	/*
 	 * Pick up loader metadata if present.  Its absence is only
 	 * reported once the console has been initialized below.
 	 */
 	metadata_missing = 0;
 	if (bootinfo.bi_modulep) {
 		preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
 		preload_bootstrap_relocate(KERNBASE);
 	} else {
 		metadata_missing = 1;
 	}
 	/* A static environment (envmode == 1) wins over the loader's. */
 	if (envmode == 1)
 		kern_envp = static_env;
 	else if (bootinfo.bi_envp)
 		kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
 
 	/* Init basic tunables, hz etc */
 	init_param1();
 
 	/*
 	 * Make gdt memory segments.  All segments cover the full 4GB
 	 * of address space and permissions are enforced at page level.
 	 */
 	gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
 
 #ifdef SMP
 	pc = &SMP_prvspace[0].pcpu;
 #else
 	pc = &__pcpu;
 #endif
 	/*
 	 * The private segment is based at this CPU's pcpu structure and
 	 * the proc0 TSS segment at the TSS embedded in it.
 	 */
 	gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
 	gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
 	gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
 
 	/* Convert the soft descriptors to hardware form and load the GDT. */
 	for (x = 0; x < NGDT; x++)
 		ssdtosd(&gdt_segs[x], &gdt[x].sd);
 
 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
 	r_gdt.rd_base =  (int) gdt;
 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
 	lgdt(&r_gdt);
 
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	PCPU_SET(prvspace, pc);
 	PCPU_SET(curthread, &thread0);
 	PCPU_SET(curpcb, thread0.td_pcb);
 
 	/*
 	 * Initialize mutexes.
 	 *
 	 * icu_lock: in order to allow an interrupt to occur in a critical
 	 * 	     section, to set pcpu->ipending (etc...) properly, we
 	 *	     must be able to get the icu lock, so it can't be
 	 *	     under witness.
 	 */
 	mutex_init();
 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
 
 	/* make ldt memory segments */
 	ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
 	ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
 	for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
 		ssdtosd(&ldt_segs[x], &ldt[x].sd);
 
 	_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
 	lldt(_default_ldt);
 	PCPU_SET(currentldt, _default_ldt);
 
 	/*
 	 * exceptions
 	 *
 	 * Every vector is first pointed at the 'rsvd' handler; the vectors
 	 * that have a dedicated handler are then overridden one by one.
 	 */
 	for (x = 0; x < NIDT; x++)
 		setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
 		    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DE, &IDTVEC(div),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_DB, &IDTVEC(dbg),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_BP, &IDTVEC(bpt),  SDT_SYS386IGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_OF, &IDTVEC(ofl),  SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_BR, &IDTVEC(bnd),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NM, &IDTVEC(dna),  SDT_SYS386TGT, SEL_KPL
 	    , GSEL(GCODE_SEL, SEL_KPL));
 	/* The double fault vector is a task gate through GPANIC_SEL. */
 	setidt(IDT_DF, 0,  SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
 	setidt(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_TS, &IDTVEC(tss),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_NP, &IDTVEC(missing),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_SS, &IDTVEC(stk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_PF, &IDTVEC(page),  SDT_SYS386IGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MF, &IDTVEC(fpu),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_MC, &IDTVEC(mchk),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
  	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 
 	r_idt.rd_limit = sizeof(idt0) - 1;
 	r_idt.rd_base = (int) idt;
 	lidt(&r_idt);
 
 	/*
 	 * Initialize the i8254 before the console so that console
 	 * initialization can use DELAY().
 	 */
 	i8254_init();
 
 	/*
 	 * Initialize the console before we print anything out.
 	 */
 	cninit();
 
 	if (metadata_missing)
 		printf("WARNING: loader(8) metadata is missing!\n");
 
 #ifdef DEV_ISA
 	atpic_startup();
 #endif
 
 #ifdef DDB
 	ksym_start = bootinfo.bi_symtab;
 	ksym_end = bootinfo.bi_esymtab;
 #endif
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 
 	finishidentcpu();	/* Final stage of CPU initialization */
 	/*
 	 * NOTE(review): IDT_UD and IDT_GP are installed a second time here,
 	 * presumably because finishidentcpu() may replace them while
 	 * probing the CPU -- confirm against finishidentcpu().
 	 */
 	setidt(IDT_UD, &IDTVEC(ill),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	setidt(IDT_GP, &IDTVEC(prot),  SDT_SYS386TGT, SEL_KPL,
 	    GSEL(GCODE_SEL, SEL_KPL));
 	initializecpu();	/* Initialize CPU registers */
 
 	/* make an initial tss so cpu can get interrupt stack on syscall! */
 	/* Note: -16 is so we can grow the trapframe if we came from vm86 */
 	PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
 	    KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16);
 	PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
 	PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
 	PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
 	PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
 	ltr(gsel_tss);
 
 	/* pointer to selector slot for %fs/%gs */
 	PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
 
 	/*
 	 * Set up the double-fault TSS: it runs dblfault_handler on the
 	 * dedicated dblfault_stack with kernel selectors and IdlePTD.
 	 */
 	dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
 	    dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
 	dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
 	    dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_cr3 = (int)IdlePTD;
 	dblfault_tss.tss_eip = (int)dblfault_handler;
 	dblfault_tss.tss_eflags = PSL_KERNEL;
 	dblfault_tss.tss_ds = dblfault_tss.tss_es =
 	    dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
 	dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
 	dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
 	dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 
 	vm86_initialize();
 	getmemsize(first);
 	init_param2(physmem);
 
 	/* now running on new page tables, configured,and u/iom is accessible */
 
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 	/* make a call gate to reenter kernel with */
 	gdp = &ldt[LSYS5CALLS_SEL].gd;
 
 	x = (int) &IDTVEC(lcall_syscall);
 	gdp->gd_looffset = x;
 	gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
 	gdp->gd_stkcpy = 1;
 	gdp->gd_type = SDT_SYS386CGT;
 	gdp->gd_dpl = SEL_UPL;
 	gdp->gd_p = 1;
 	gdp->gd_hioffset = x >> 16;
 
 	/* XXX does this work? */
 	/* XXX yes! */
 	ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
 	ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
 
 	/* transfer to user mode */
 
 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
 
 	/* setup proc 0's pcb */
 	thread0.td_pcb->pcb_flags = 0; /* XXXKSE */
 	thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
 	thread0.td_pcb->pcb_ext = 0;
 	thread0.td_frame = &proc0_tf;
 }
 
 /*
  * Machine-dependent per-CPU data initialization hook.  The body is empty:
  * this file performs no additional setup here.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 
 }
 
 /*
  * Enter a spin-lock section.  The outermost entry disables interrupts and
  * saves the previous interrupt state in the thread; every entry bumps the
  * nesting count and enters a critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self = curthread;
 
 	if (self->td_md.md_spinlock_count == 0)
 		self->td_md.md_saved_flags = intr_disable();
 	self->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spin-lock section.  The critical section is exited first; when
  * the nesting count drops back to zero the interrupt state saved by the
  * outermost spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self = curthread;
 
 	critical_exit();
 	if (--self->td_md.md_spinlock_count == 0)
 		intr_restore(self->td_md.md_saved_flags);
 }
 
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 static void f00f_hack(void *unused);
 SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL)
 
 /*
  * Install the workaround for the Pentium F00F bug, if has_f00f_bug is set.
  *
  * The IDT is copied into a freshly allocated two-page region so that
  * entries 0 through 6 end exactly at the first page boundary, the CPU is
  * pointed at the copy, and the first page is then made read-only.
  * Panics if the allocation or the protection change fails.
  */
 static void
 f00f_hack(void *unused)
 {
 	struct gate_descriptor *new_idt;
 	vm_offset_t tmp;
 
 	if (!has_f00f_bug)
 		return;
 
 	GIANT_REQUIRED;
 
 	printf("Intel Pentium detected, installing workaround for F00F bug\n");
 
 	tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
 	if (tmp == 0)
 		panic("kmem_alloc returned 0");
 
 	/* Put the problematic entry (#6) at the end of the lower page. */
 	new_idt = (struct gate_descriptor*)
 	    (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
 	bcopy(idt, new_idt, sizeof(idt0));
 	/* Switch the CPU to the copy before updating the global pointer. */
 	r_idt.rd_base = (u_int)new_idt;
 	lidt(&r_idt);
 	idt = new_idt;
 	/* Only the lower page (holding entries 0-6) becomes read-only. */
 	if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
 			   VM_PROT_READ, FALSE) != KERN_SUCCESS)
 		panic("vm_map_protect failed");
 }
 #endif /* defined(I586_CPU) && !NO_F00F_HACK */
 
 /*
  * Build a PCB out of a trapframe.  kdb_trap() uses this so that a backtrace
  * can begin at the function that entered the debugger: the context lives in
  * the trapframe, while the trace is driven from the PCB.  Only the fields
  * needed for a backtrace are filled in; the PCB need not be complete.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_eip = tf->tf_eip;
 	pcb->pcb_ebp = tf->tf_ebp;
 	pcb->pcb_ebx = tf->tf_ebx;
 	pcb->pcb_esi = tf->tf_esi;
 	pcb->pcb_edi = tf->tf_edi;
 
 	/*
 	 * With a non-zero privilege level in tf_cs the saved tf_esp is used.
 	 * Otherwise the stack pointer is taken as the address just past the
 	 * trapframe less 8 bytes (presumably because tf_esp and tf_ss are
 	 * not pushed for same-privilege traps -- confirm).
 	 */
 	if (ISPL(tf->tf_cs))
 		pcb->pcb_esp = tf->tf_esp;
 	else
 		pcb->pcb_esp = (int)(tf + 1) - 8;
 }
 
 /*
  * Set the saved instruction pointer in the thread's trapframe to `addr'.
  * Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame = td->td_frame;
 
 	frame->tf_eip = addr;
 	return (0);
 }
 
 int
 ptrace_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags |= PSL_T;
 	return (0);
 }
 
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	td->td_frame->tf_eflags &= ~PSL_T;
 	return (0);
 }
 
 /*
  * Copy the thread's saved user registers into `regs'.  Everything comes
  * from the trapframe except %gs, which is kept in the pcb.
  * Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	/* Segment registers. */
 	regs->r_cs = frame->tf_cs;
 	regs->r_ds = frame->tf_ds;
 	regs->r_es = frame->tf_es;
 	regs->r_fs = frame->tf_fs;
 	regs->r_gs = pcb->pcb_gs;
 	regs->r_ss = frame->tf_ss;
 
 	/* General-purpose registers. */
 	regs->r_eax = frame->tf_eax;
 	regs->r_ebx = frame->tf_ebx;
 	regs->r_ecx = frame->tf_ecx;
 	regs->r_edx = frame->tf_edx;
 	regs->r_esi = frame->tf_esi;
 	regs->r_edi = frame->tf_edi;
 	regs->r_ebp = frame->tf_ebp;
 	regs->r_esp = frame->tf_esp;
 
 	/* Instruction pointer and flags. */
 	regs->r_eip = frame->tf_eip;
 	regs->r_eflags = frame->tf_eflags;
 	return (0);
 }
 
 /*
  * Load the thread's saved user registers from `regs'.  The request is
  * rejected with EINVAL, leaving the thread untouched, unless the new eflags
  * pass EFL_SECURE against the current ones and the new %cs passes
  * CS_SECURE.  %gs is stored in the pcb, everything else in the trapframe.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame = td->td_frame;
 	struct pcb *pcb;
 
 	if (!EFL_SECURE(regs->r_eflags, frame->tf_eflags) ||
 	    !CS_SECURE(regs->r_cs))
 		return (EINVAL);
 	pcb = td->td_pcb;
 
 	/* Segment registers. */
 	frame->tf_cs = regs->r_cs;
 	frame->tf_ds = regs->r_ds;
 	frame->tf_es = regs->r_es;
 	frame->tf_fs = regs->r_fs;
 	pcb->pcb_gs = regs->r_gs;
 	frame->tf_ss = regs->r_ss;
 
 	/* General-purpose registers. */
 	frame->tf_eax = regs->r_eax;
 	frame->tf_ebx = regs->r_ebx;
 	frame->tf_ecx = regs->r_ecx;
 	frame->tf_edx = regs->r_edx;
 	frame->tf_esi = regs->r_esi;
 	frame->tf_edi = regs->r_edi;
 	frame->tf_ebp = regs->r_ebp;
 	frame->tf_esp = regs->r_esp;
 
 	/* Instruction pointer and flags. */
 	frame->tf_eip = regs->r_eip;
 	frame->tf_eflags = regs->r_eflags;
 	return (0);
 }
 
 #ifdef CPU_ENABLE_SSE
 /*
  * Convert a struct savexmm into a struct save87.  The destination is
  * zeroed first, so anything not copied below reads as zero.
  */
 static void
 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
 {
 	struct envxmm *src_env = &sv_xmm->sv_env;
 	struct env87 *dst_env = &sv_87->sv_env;
 	int reg;
 
 	bzero(sv_87, sizeof(*sv_87));
 
 	/* FPU control/status */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_fip = src_env->en_fip;
 	dst_env->en_fcs = src_env->en_fcs;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_foo = src_env->en_foo;
 	dst_env->en_fos = src_env->en_fos;
 
 	/* FPU registers */
 	for (reg = 0; reg < 8; reg++)
 		sv_87->sv_ac[reg] = sv_xmm->sv_fp[reg].fp_acc;
 }
 
 /*
  * Copy the environment and the eight FPU registers from a struct save87
  * into a struct savexmm.  Other parts of the destination are left alone.
  */
 static void
 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
 {
 	struct env87 *src_env = &sv_87->sv_env;
 	struct envxmm *dst_env = &sv_xmm->sv_env;
 	int reg;
 
 	/* FPU control/status */
 	dst_env->en_cw = src_env->en_cw;
 	dst_env->en_sw = src_env->en_sw;
 	dst_env->en_tw = src_env->en_tw;
 	dst_env->en_fip = src_env->en_fip;
 	dst_env->en_fcs = src_env->en_fcs;
 	dst_env->en_opcode = src_env->en_opcode;
 	dst_env->en_foo = src_env->en_foo;
 	dst_env->en_fos = src_env->en_fos;
 
 	/* FPU registers */
 	for (reg = 0; reg < 8; reg++)
 		sv_xmm->sv_fp[reg].fp_acc = sv_87->sv_ac[reg];
 }
 #endif /* CPU_ENABLE_SSE */
 
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr) {
 		fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm,
 						(struct save87 *)fpregs);
 		return (0);
 	}
 #endif /* CPU_ENABLE_SSE */
 	bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
 	return (0);
 }
 
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 #ifdef CPU_ENABLE_SSE
 	if (cpu_fxsr) {
 		set_fpregs_xmm((struct save87 *)fpregs,
 					   &td->td_pcb->pcb_save.sv_xmm);
 		return (0);
 	}
 #endif /* CPU_ENABLE_SSE */
 	bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs);
 	return (0);
 }
 
 /*
  * Get machine context.
  *
  * Fills `mcp' from the thread's trapframe and pcb, then appends the FPU
  * state via get_fpcontext().  With GET_MC_CLEAR_RET in `flags', eax and edx
  * are reported as zero and PSL_C is cleared in the reported eflags.
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct trapframe *frame = td->td_frame;
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(frame->tf_esp);
 	PROC_UNLOCK(curthread->td_proc);
 
 	/* Segment registers; %gs is kept in the pcb. */
 	mcp->mc_gs = td->td_pcb->pcb_gs;
 	mcp->mc_fs = frame->tf_fs;
 	mcp->mc_es = frame->tf_es;
 	mcp->mc_ds = frame->tf_ds;
 	mcp->mc_cs = frame->tf_cs;
 	mcp->mc_ss = frame->tf_ss;
 
 	/* General-purpose registers other than the return-value pair. */
 	mcp->mc_edi = frame->tf_edi;
 	mcp->mc_esi = frame->tf_esi;
 	mcp->mc_ebp = frame->tf_ebp;
 	mcp->mc_isp = frame->tf_isp;
 	mcp->mc_ebx = frame->tf_ebx;
 	mcp->mc_ecx = frame->tf_ecx;
 	mcp->mc_eip = frame->tf_eip;
 	mcp->mc_esp = frame->tf_esp;
 
 	/* Return-value registers and carry flag, optionally cleared. */
 	mcp->mc_eflags = frame->tf_eflags;
 	if ((flags & GET_MC_CLEAR_RET) == 0) {
 		mcp->mc_eax = frame->tf_eax;
 		mcp->mc_edx = frame->tf_edx;
 	} else {
 		mcp->mc_eax = 0;
 		mcp->mc_edx = 0;
 		mcp->mc_eflags &= ~PSL_C;
 	}
 
 	mcp->mc_len = sizeof(*mcp);
 	get_fpcontext(td, mcp);
 	return (0);
 }
 
 /*
  * Set machine context.
  *
  * However, we don't set any but the user modifiable flags, and we won't
  * touch the cs selector.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *tp;
 	int eflags, ret;
 
 	tp = td->td_frame;
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 	eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
 	    (tp->tf_eflags & ~PSL_USERCHANGE);
 	if ((ret = set_fpcontext(td, mcp)) == 0) {
 		tp->tf_fs = mcp->mc_fs;
 		tp->tf_es = mcp->mc_es;
 		tp->tf_ds = mcp->mc_ds;
 		tp->tf_edi = mcp->mc_edi;
 		tp->tf_esi = mcp->mc_esi;
 		tp->tf_ebp = mcp->mc_ebp;
 		tp->tf_ebx = mcp->mc_ebx;
 		tp->tf_edx = mcp->mc_edx;
 		tp->tf_ecx = mcp->mc_ecx;
 		tp->tf_eax = mcp->mc_eax;
 		tp->tf_eip = mcp->mc_eip;
 		tp->tf_eflags = eflags;
 		tp->tf_esp = mcp->mc_esp;
 		tp->tf_ss = mcp->mc_ss;
 		td->td_pcb->pcb_gs = mcp->mc_gs;
 		ret = 0;
 	}
 	return (ret);
 }
 
 /*
  * Store the thread's FPU state, its ownership and its format into `mcp'.
  *
  * Without DEV_NPX the context is simply marked as having no FPU device and
  * no owned state.  Otherwise npxgetregs() supplies the state and the
  * ownership value, and npxformat() supplies the format.
  */
 static void
 get_fpcontext(struct thread *td, mcontext_t *mcp)
 {
 #ifndef DEV_NPX
 	mcp->mc_fpformat = _MC_FPFMT_NODEV;
 	mcp->mc_ownedfp = _MC_FPOWNED_NONE;
 #else
 	union savefpu *addr;
 
 	/*
 	 * XXX mc_fpstate might be misaligned, since its declaration is not
 	 * unportabilized using __attribute__((aligned(16))) like the
 	 * declaration of struct savexmm, and anyway, alignment doesn't work
 	 * for auto variables since we don't use gcc's pessimal stack
 	 * alignment.  Work around this by abusing the spare fields after
 	 * mcp->mc_fpstate.
 	 *
 	 * XXX unpessimize most cases by only aligning when fxsave might be
 	 * called, although this requires knowing too much about
 	 * npxgetregs()'s internals.
 	 */
 	addr = (union savefpu *)&mcp->mc_fpstate;
 	if (td == PCPU_GET(fpcurthread) &&
 #ifdef CPU_ENABLE_SSE
 	    cpu_fxsr &&
 #endif
 	    ((uintptr_t)(void *)addr & 0xF)) {
 		/* Advance in 4-byte steps to the next 16-byte boundary. */
 		do
 			addr = (void *)((char *)addr + 4);
 		while ((uintptr_t)(void *)addr & 0xF);
 	}
 	mcp->mc_ownedfp = npxgetregs(td, addr);
 	/*
 	 * If the state was saved at the shifted address, move it back to
 	 * mc_fpstate and clear the spare area.
 	 */
 	if (addr != (union savefpu *)&mcp->mc_fpstate) {
 		bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
 		bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2));
 	}
 	mcp->mc_fpformat = npxformat();
 #endif
 }
 
 /*
  * Install the FPU state described by `mcp' for thread `td'.
  *
  * Returns 0 without doing anything when the context has format
  * _MC_FPFMT_NODEV, and EINVAL for any format other than 387 or XMM or for
  * an unrecognized ownership value.  A context that owns no FPU state
  * causes the thread's FPU state to be dropped; otherwise the state is
  * loaded with npxsetregs() when DEV_NPX is configured.
  */
 static int
 set_fpcontext(struct thread *td, const mcontext_t *mcp)
 {
 	union savefpu *addr;
 
 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
 		return (0);
 	else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
 	    mcp->mc_fpformat != _MC_FPFMT_XMM)
 		return (EINVAL);
 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
 		/* We don't care what state is left in the FPU or PCB. */
 		fpstate_drop(td);
 	else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
 		/* XXX align as above. */
 		/* NOTE(review): this cast discards the const on mcp. */
 		addr = (union savefpu *)&mcp->mc_fpstate;
 		if (td == PCPU_GET(fpcurthread) &&
 #ifdef CPU_ENABLE_SSE
 		    cpu_fxsr &&
 #endif
 		    ((uintptr_t)(void *)addr & 0xF)) {
 			/* Advance in 4-byte steps to a 16-byte boundary. */
 			do
 				addr = (void *)((char *)addr + 4);
 			while ((uintptr_t)(void *)addr & 0xF);
 			bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate));
 		}
 #ifdef DEV_NPX
 #ifdef CPU_ENABLE_SSE
 		/* Keep only the MXCSR bits present in cpu_mxcsr_mask. */
 		if (cpu_fxsr)
 			addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask;
 #endif
 		/*
 		 * XXX we violate the dubious requirement that npxsetregs()
 		 * be called with interrupts disabled.
 		 */
 		npxsetregs(td, addr);
 #endif
 		/*
 		 * Don't bother putting things back where they were in the
 		 * misaligned case, since we know that the caller won't use
 		 * them again.
 		 */
 	} else
 		return (EINVAL);
 	return (0);
 }
 
 /*
  * Discard FPU state with interrupts disabled: if `td' currently owns the
  * FPU it is dropped (DEV_NPX only), and PCB_NPXINITDONE is cleared.
  *
  * NOTE(review): the flag is cleared in curthread's pcb rather than td's;
  * confirm that callers only ever pass curthread.
  */
 static void
 fpstate_drop(struct thread *td)
 {
 	register_t saved_flags;
 
 	saved_flags = intr_disable();
 #ifdef DEV_NPX
 	if (PCPU_GET(fpcurthread) == td)
 		npxdrop();
 #endif
 	/*
 	 * XXX force a full drop of the npx.  The above only drops it if we
 	 * owned it.  npxgetregs() has the same bug in the !cpu_fxsr case.
 	 *
 	 * XXX npxgetregs()'s full-drop semantics are questionable;
 	 * dropping only to the pcb would match fnsave's behaviour.
 	 * sendsig() only needs the drop to !PCB_INITDONE, but sendsig() is
 	 * the sole caller of npxgetregs()... perhaps there are simply too
 	 * many layers.
 	 */
 	curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE;
 	intr_restore(saved_flags);
 }
 
 /*
  * Report debug registers in `dbregs'.  With a thread, the values saved in
  * its pcb are returned and dr4/dr5 read as zero; with td == NULL the live
  * hardware registers are read instead.  Always returns 0.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 
 	if (td != NULL) {
 		pcb = td->td_pcb;
 		dbregs->dr[0] = pcb->pcb_dr0;
 		dbregs->dr[1] = pcb->pcb_dr1;
 		dbregs->dr[2] = pcb->pcb_dr2;
 		dbregs->dr[3] = pcb->pcb_dr3;
 		dbregs->dr[4] = 0;
 		dbregs->dr[5] = 0;
 		dbregs->dr[6] = pcb->pcb_dr6;
 		dbregs->dr[7] = pcb->pcb_dr7;
 		return (0);
 	}
 
 	dbregs->dr[0] = rdr0();
 	dbregs->dr[1] = rdr1();
 	dbregs->dr[2] = rdr2();
 	dbregs->dr[3] = rdr3();
 	dbregs->dr[4] = rdr4();
 	dbregs->dr[5] = rdr5();
 	dbregs->dr[6] = rdr6();
 	dbregs->dr[7] = rdr7();
 	return (0);
 }
 
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	struct pcb *pcb;
 	int i;
 
 	if (td == NULL) {
 		load_dr0(dbregs->dr[0]);
 		load_dr1(dbregs->dr[1]);
 		load_dr2(dbregs->dr[2]);
 		load_dr3(dbregs->dr[3]);
 		load_dr4(dbregs->dr[4]);
 		load_dr5(dbregs->dr[5]);
 		load_dr6(dbregs->dr[6]);
 		load_dr7(dbregs->dr[7]);
 	} else {
 		/*
 		 * Don't let an illegal value for dr7 get set.	Specifically,
 		 * check for undefined settings.  Setting these bit patterns
 		 * result in undefined behaviour and can lead to an unexpected
 		 * TRCTRAP.
 		 */
 		for (i = 0; i < 4; i++) {
 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 			if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
 				return (EINVAL);
 		}
 		
 		pcb = td->td_pcb;
 		
 		/*
 		 * Don't let a process set a breakpoint that is not within the
 		 * process's address space.  If a process could do this, it
 		 * could halt the system by setting a breakpoint in the kernel
 		 * (if ddb was enabled).  Thus, we need to check to make sure
 		 * that no breakpoints are being enabled for addresses outside
 		 * process's address space.
 		 *
 		 * XXX - what about when the watched area of the user's
 		 * address space is written into from within the kernel
 		 * ... wouldn't that still cause a breakpoint to be generated
 		 * from within kernel mode?
 		 */
 
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
 			/* dr0 is enabled */
 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
 			/* dr1 is enabled */
 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
 			/* dr2 is enabled */
 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 			
 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
 			/* dr3 is enabled */
 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
 				return (EINVAL);
 		}
 
 		pcb->pcb_dr0 = dbregs->dr[0];
 		pcb->pcb_dr1 = dbregs->dr[1];
 		pcb->pcb_dr2 = dbregs->dr[2];
 		pcb->pcb_dr3 = dbregs->dr[3];
 		pcb->pcb_dr6 = dbregs->dr[6];
 		pcb->pcb_dr7 = dbregs->dr[7];
 
 		pcb->pcb_flags |= PCB_DBREGS;
 	}
 
 	return (0);
 }
 
 /*
  * Return > 0 if a hardware breakpoint has been hit, and the
  * breakpoint was in user space.  Return 0, otherwise.
  */
 int
 user_dbreg_trap(void)
 {
         u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
         u_int32_t bp;       /* breakpoint bits extracted from dr6 */
         int nbp;            /* number of breakpoints that triggered */
         caddr_t addr[4];    /* breakpoint addresses */
         int i;
         
         dr7 = rdr7();
         if ((dr7 & 0x000000ff) == 0) {
                 /*
                  * all GE and LE bits in the dr7 register are zero,
                  * thus the trap couldn't have been caused by the
                  * hardware debug registers
                  */
                 return 0;
         }
 
         nbp = 0;
         dr6 = rdr6();
         bp = dr6 & 0x0000000f;
 
         if (!bp) {
                 /*
                  * None of the breakpoint bits are set meaning this
                  * trap was not caused by any of the debug registers
                  */
                 return 0;
         }
 
         /*
          * at least one of the breakpoints were hit, check to see
          * which ones and if any of them are user space addresses
          */
 
         if (bp & 0x01) {
                 addr[nbp++] = (caddr_t)rdr0();
         }
         if (bp & 0x02) {
                 addr[nbp++] = (caddr_t)rdr1();
         }
         if (bp & 0x04) {
                 addr[nbp++] = (caddr_t)rdr2();
         }
         if (bp & 0x08) {
                 addr[nbp++] = (caddr_t)rdr3();
         }
 
         for (i = 0; i < nbp; i++) {
                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
                         /*
                          * addr[i] is in user space
                          */
                         return nbp;
                 }
         }
 
         /*
          * None of the breakpoints are in user space.
          */
         return 0;
 }
 
 #ifdef KDB
 
 /*
  * Provide inb() and outb() as functions.  They are normally only
  * available as macros calling inlined functions, thus cannot be
  * called from the debugger.
  *
  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
  */
 
 #undef inb
 #undef outb
 
 /* silence compiler warnings */
 u_char inb(u_int);
 void outb(u_int, u_char);
 
 /*
  * Out-of-line inb(): read one byte from I/O port `port' and return it.
  */
 u_char
 inb(u_int port)
 {
 	u_char	data;
 	/*
 	 * We use %%dx and not %1 here because i/o is done at %dx and not at
 	 * %edx, while gcc generates inferior code (movw instead of movl)
 	 * if we tell it to load (u_short) port.
 	 */
 	__asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
 	return (data);
 }
 
 /*
  * Out-of-line outb(): write the byte `data' to I/O port `port'.
  */
 void
 outb(u_int port, u_char data)
 {
 	u_char	al;
 	/*
 	 * Use an unnecessary assignment to help gcc's register allocator.
 	 * This make a large difference for gcc-1.40 and a tiny difference
 	 * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
 	 * best results.  gcc-2.6.0 can't handle this.
 	 */
 	al = data;
 	__asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
 }
 
 #endif /* KDB */
Index: head/sys/powerpc/aim/machdep.c
===================================================================
--- head/sys/powerpc/aim/machdep.c	(revision 173360)
+++ head/sys/powerpc/aim/machdep.c	(revision 173361)
@@ -1,990 +1,990 @@
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (C) 2001 Benno Rice
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *	$NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <net/netisr.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #include <machine/bat.h>
 #include <machine/cpu.h>
 #include <machine/elf.h>
 #include <machine/fpu.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mmuvar.h>
 #include <machine/pcb.h>
 #include <machine/powerpc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/trap.h>
 #include <machine/vmparam.h>
 
 #include <ddb/ddb.h>
 
 #include <dev/ofw/openfirm.h>
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 int cold = 1;
 
 struct		pcpu __pcpu[MAXCPU];
 struct		trapframe frame0;
 
 vm_offset_t	kstack0;
 vm_offset_t	kstack0_phys;
 
 char		machine[] = "powerpc";
 SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");
 
 static int cacheline_size = CACHELINESIZE;
 SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size,
 	   CTLFLAG_RD, &cacheline_size, 0, "");
 
 static void	cpu_startup(void *);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 void		powerpc_init(u_int, u_int, u_int, void *);
 
 int		save_ofw_mapping(void);
 int		restore_ofw_mapping(void);
 
 void		install_extint(void (*)(void));
 
 int             setfault(faultbuf);             /* defined in locore.S */
 
 static int	grab_mcontext(struct thread *, mcontext_t *, int);
 
 void		asm_panic(char *);
 
 long		Maxmem = 0;
 long		realmem = 0;
 
 struct pmap	ofw_pmap;
 extern int	ofmsr;
 
 struct bat	battable[16];
 
 struct kva_md_info kmi;
 
 void setPQL2(int *const size, int *const ways);
 
 /*
  * Placeholder implementation: accepts the `size' and `ways' pointers but
  * stores nothing through either of them.
  */
 void
 setPQL2(int *const size, int *const ways)
 {
 }
 
 /*
  * Shutdown handler: calls OF_halt() first when RB_HALT is set in howto,
  * then calls OF_reboot().  The first argument is unused.
  */
 static void
 powerpc_ofw_shutdown(void *junk, int howto)
 {
 	int halt_requested = ((howto & RB_HALT) != 0);
 
 	if (halt_requested)
 		OF_halt();
 
 	OF_reboot();
 }
 
 /*
  * SYSINIT hook: sets up the boot CPU, reports real and available memory,
  * initialises the kernel submaps and the buffer cache, and registers the
  * Open Firmware shutdown handler.  The argument is unused.
  */
 static void
 cpu_startup(void *dummy)
 {
 
 	/* CPU setup for the CPU this code is running on. */
 	cpu_setup(PCPU_GET(cpuid));
 
 	/* startrtclock(); */
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	printf("real memory  = %ld (%ld MB)\n", ptoa(physmem),
 	    ptoa(physmem) / 1048576);
 	realmem = physmem;
 
 	/*
 	 * On a verbose boot, list every start/end pair in phys_avail[];
 	 * the list ends at the first pair whose end address is zero.
 	 */
 	if (bootverbose) {
 		int pair = 0;
 
 		printf("Physical memory chunk(s):\n");
 		while (phys_avail[pair + 1] != 0) {
 			int nbytes = phys_avail[pair + 1] - phys_avail[pair];
 
 			printf("0x%08x - 0x%08x, %d bytes (%d pages)\n",
 			    phys_avail[pair], phys_avail[pair + 1] - 1, nbytes,
 			    nbytes / PAGE_SIZE);
 			pair += 2;
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
 	    ptoa(cnt.v_free_count) / 1048576);
 
 	/*
 	 * Buffers must exist before disk labels can be read.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	EVENTHANDLER_REGISTER(shutdown_final, powerpc_ofw_shutdown, 0,
 	    SHUTDOWN_PRI_LAST);
 
 #ifdef SMP
 	/*
 	 * Enough kmem_alloc/malloc state should be up by now to start
 	 * the secondary CPUs.
 	 */
 	mp_start();			/* fire up the secondaries */
 	mp_announce();
 #endif  /* SMP */
 }
 
 extern char	kernel_text[], _end[];
 
 extern void	*trapcode, *trapsize;
 extern void	*alitrap, *alisize;
 extern void	*dsitrap, *dsisize;
 extern void	*decrint, *decrsize;
 extern void     *extint, *extsize;
 extern void	*dblow, *dbsize;
 extern void	*vectrap, *vectrapsize;
 
 /*
  * Early machine-dependent initialisation.
  *
  * In order: picks up loader metadata (if mdp is non-NULL), initialises
  * tunables, links proc0/thread0, sets up the per-CPU data for CPU 0,
  * initialises mutexes, the console and the debugger, copies the trap
  * handlers into the exception vectors, bootstraps the pmap layer, records
  * the kernel name, finishes thread0's stack/PCB pointers and maps the
  * message buffer.  With KDB configured and RB_KDB set in boothowto, the
  * debugger is entered at the end.
  *
  * startkernel/endkernel are handed to pmap_bootstrap(); basekernel is not
  * referenced in this function.
  */
 void
 powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp)
 {
 	struct		pcpu *pc;
 	vm_offset_t	end, off;
 	void		*kmdp;
         char		*env;
 
 	end = 0;
 	kmdp = NULL;
 
 	/*
 	 * Parse metadata if present and fetch parameters.  Must be done
 	 * before console is inited so cninit gets the right value of
 	 * boothowto.
 	 */
 	if (mdp != NULL) {
 		preload_metadata = mdp;
 		kmdp = preload_search_by_type("elf kernel");
 		if (kmdp != NULL) {
 			boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 			kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 			end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
 #ifdef DDB
 			ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 			ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 #endif
 		}
 	}
 
 	/*
 	 * Init params/tunables that can be overridden by the loader
 	 */
 	init_param1();
 
 	/*
 	 * Start initializing proc0 and thread0.
 	 */
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	thread0.td_frame = &frame0;
 
 	/*
 	 * Set up per-cpu data.
 	 *
 	 * NOTE(review): thread0.td_pcb is only assigned further down in
 	 * this function ("Finish setting up thread0"); confirm that the
 	 * value copied into pc_curpcb here is the intended one.
 	 */
 	pc = &__pcpu[0];
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	pc->pc_curthread = &thread0;
 	pc->pc_curpcb = thread0.td_pcb;
 	pc->pc_cpuid = 0;
 
 	/* Load the pcpu pointer into SPRG0. */
 	__asm __volatile("mtsprg 0, %0" :: "r"(pc));
 
 	mutex_init();
 
 	/*
 	 * Initialize the console before printing anything.
 	 */
 	cninit();
 
 	/*
 	 * Complain if there is no metadata.
 	 */
 	if (mdp == NULL || kmdp == NULL) {
 		printf("powerpc_init: no loader metadata.\n");
 	}
 
 	kdb_init();
 
 	kobj_machdep_init();
 
 	/*
 	 * XXX: Initialize the interrupt tables.
 	 *      Disable translation in case the vector area
 	 *      hasn't been mapped (G5)
 	 *
 	 * For each handler the address of its *size symbol, cast to
 	 * size_t, is used as the number of bytes to copy.
 	 */
 	mtmsr(mfmsr() & ~(PSL_IR | PSL_DR));
 	isync();
 	bcopy(&trapcode, (void *)EXC_RST,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_MCHK, (size_t)&trapsize);
 	bcopy(&dsitrap,  (void *)EXC_DSI,  (size_t)&dsisize);
 	bcopy(&trapcode, (void *)EXC_ISI,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_EXI,  (size_t)&trapsize);
 	bcopy(&alitrap,  (void *)EXC_ALI,  (size_t)&alisize);
 	bcopy(&trapcode, (void *)EXC_PGM,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_FPU,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_DECR, (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_SC,   (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_TRC,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_FPA,  (size_t)&trapsize);
 	bcopy(&vectrap,  (void *)EXC_VEC,  (size_t)&vectrapsize);
 	bcopy(&trapcode, (void *)EXC_VECAST, (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_THRM, (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_BPT,  (size_t)&trapsize);
 #ifdef KDB
 	/* With KDB, these vectors are overwritten with the dblow handler. */
 	bcopy(&dblow,	 (void *)EXC_RST,  (size_t)&dbsize);
 	bcopy(&dblow,	 (void *)EXC_MCHK, (size_t)&dbsize);
 	bcopy(&dblow,   (void *)EXC_PGM,  (size_t)&dbsize);
 	bcopy(&dblow,   (void *)EXC_TRC,  (size_t)&dbsize);
 	bcopy(&dblow,   (void *)EXC_BPT,  (size_t)&dbsize);
 #endif
 	__syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD);
 
 	/*
 	 * Make sure translation has been enabled
 	 */
 	mtmsr(mfmsr() | PSL_IR|PSL_DR|PSL_ME|PSL_RI);
 	isync();
 
 	/*
 	 * Initialise virtual memory.
 	 */
 	pmap_mmu_install(MMU_TYPE_OEA, 0);		/* XXX temporary */
 	pmap_bootstrap(startkernel, endkernel);
 
 	/*
 	 * Initialize params/tunables that are derived from memsize
 	 */
 	init_param2(physmem);
 
 	/*
 	 * Grab booted kernel's name
 	 */
         env = getenv("kernelname");
         if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	/*
 	 * Finish setting up thread0.
 	 *
 	 * The PCB is placed in the last sizeof(struct pcb) bytes of the
 	 * KSTACK_PAGES-page kernel stack.
 	 */
 	thread0.td_kstack = kstack0;
 	thread0.td_pcb = (struct pcb *)
 	    (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 
 	/*
 	 * Map and initialise the message buffer.
 	 */
 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, msgbuf_phys + off);
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 }
 
 void
 bzero(void *buf, size_t len)
 {
 	caddr_t	p;
 
 	p = buf;
 
 	while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) {
 		*p++ = 0;
 		len--;
 	}
 
 	while (len >= sizeof(u_long) * 8) {
 		*(u_long*) p = 0;
 		*((u_long*) p + 1) = 0;
 		*((u_long*) p + 2) = 0;
 		*((u_long*) p + 3) = 0;
 		len -= sizeof(u_long) * 8;
 		*((u_long*) p + 4) = 0;
 		*((u_long*) p + 5) = 0;
 		*((u_long*) p + 6) = 0;
 		*((u_long*) p + 7) = 0;
 		p += sizeof(u_long) * 8;
 	}
 
 	while (len >= sizeof(u_long)) {
 		*(u_long*) p = 0;
 		len -= sizeof(u_long);
 		p += sizeof(u_long);
 	}
 
 	while (len) {
 		*p++ = 0;
 		len--;
 	}
 }
 
 /*
  * Deliver a signal to the current thread by building a signal frame on
  * the user stack and redirecting the thread to the signal trampoline.
  *
  * catcher is the handler address, ksi carries the signal number, code and
  * siginfo, and mask is the signal mask stored in the saved context.
  *
  * Asserts on entry that the proc lock and the sigacts mutex are held.
  * Both are dropped around the copyout() and re-acquired before a normal
  * return.  If the copyout() fails, sigexit(td, SIGILL) is called.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct trapframe *tf;
 	struct sigframe *sfp;
 	struct sigacts *psp;
 	struct sigframe sf;
 	struct thread *td;
 	struct proc *p;
 	int oonstack, rndfsize;
 	int sig;
 	int code;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	oonstack = sigonstack(tf->fixreg[1]);
 
 	/* Frame size rounded up to a multiple of 16 bytes. */
 	rndfsize = ((sizeof(sf) + 15) / 16) * 16;
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	     catcher, sig);
 
 	/*
 	 * Save user context
 	 */
 	memset(&sf, 0, sizeof(sf));
 	grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 
 	/*
 	 * Allocate and validate space for the signal handler context.
 	 *
 	 * The frame goes at the top of the alternate stack when one is
 	 * configured, not already in use, and requested for this signal;
 	 * otherwise just below the current stack pointer (r1).
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct sigframe *)(td->td_sigstk.ss_sp +
 		   td->td_sigstk.ss_size - rndfsize);
 	} else {
 		sfp = (struct sigframe *)(tf->fixreg[1] - rndfsize);
 	}
 
 	/*
 	 * Translate the signal if appropriate (Linux emu ?)
 	 */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/*
 	 * Save the floating-point state, if necessary, then copy it.
 	 */
 	/* XXX */
 
 	/*
 	 * Set up the registers to return to sigcode.
 	 *
 	 *   r1/sp - sigframe ptr
 	 *   lr    - sig function, dispatched to by blrl in trampoline
 	 *   r3    - sig number
 	 *   r4    - SIGINFO ? &siginfo : exception code
 	 *   r5    - user context
 	 *   srr0  - trampoline function addr
 	 */
 	tf->lr = (register_t)catcher;
 	tf->fixreg[1] = (register_t)sfp;
 	tf->fixreg[FIRSTARG] = sig;
 	tf->fixreg[FIRSTARG+2] = (register_t)&sfp->sf_uc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/*
 		 * Signal handler installed with SA_SIGINFO.
 		 */
 		tf->fixreg[FIRSTARG+1] = (register_t)&sfp->sf_si;
 
 		/*
 		 * Fill siginfo structure.
 		 */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_addr = (void *) ((tf->exc == EXC_DSI) ? 
 		                             tf->dar : tf->srr0);
 	} else {
 		/* Old FreeBSD-style arguments. */
 		tf->fixreg[FIRSTARG+1] = code;
 		tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? 
 		                             tf->dar : tf->srr0;
 	}
 	/* Drop both locks before touching user memory with copyout(). */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* The trampoline address is PS_STRINGS minus the sigcode size. */
 	tf->srr0 = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
 
 	/*
 	 * copy the frame out to userland.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 		/*
 		 * Process has trashed its stack. Kill it.
 		 */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td,
 	     tf->srr0, tf->fixreg[1]);
 
 	/* Re-take the locks the caller held on entry. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * sigreturn system call: copy the user context in from uap->sigcntxp,
  * install its machine context with set_mcontext() and restore the signal
  * mask under the proc lock.
  *
  * Returns EFAULT if the copyin fails, the set_mcontext() error if that
  * fails, and EJUSTRETURN otherwise.
  */
 int
 sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	ucontext_t ctx;
 	struct proc *owner;
 	int rv;
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 
 	if (copyin(uap->sigcntxp, &ctx, sizeof(ctx)) != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	if ((rv = set_mcontext(td, &ctx.uc_mcontext)) != 0)
 		return (rv);
 
 	/* Restore the saved mask, minus the signals that cannot be masked. */
 	owner = td->td_proc;
 	PROC_LOCK(owner);
 	td->td_sigmask = ctx.uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(owner);
 
 	CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
 	     td, ctx.uc_mcontext.mc_srr0, ctx.uc_mcontext.mc_gpr[1]);
 
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: forwards to sigreturn() with the
  * argument structure cast to struct sigreturn_args.
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args = (struct sigreturn_args *)uap;
 
 	return (sigreturn(td, args));
 }
 #endif
 
 /*
  * Build a minimal PCB from a trapframe.  kdb_trap() uses this to start a
  * backtrace at the function that entered the debugger: the context is in
  * the trapframe, but the trace is driven from the PCB.  Only the fields a
  * backtrace needs are filled in: the stack pointer (r1) and the link
  * register slot, which receives the interrupted PC (srr0).
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_sp = tf->fixreg[1];
 	pcb->pcb_lr = tf->srr0;
 }
 
 /*
  * Helper shared by get_mcontext() and sendsig(): fill *mcp from td's
  * trapframe and, if the thread has used the FPU, from its saved FPU
  * state.  Does not touch the proc lock.  Always returns 0.
  *
  * With GET_MC_CLEAR_RET set in flags, r3 and r4 are zeroed in the copy.
  */
 static int
 grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *tpcb = td->td_pcb;
 
 	memset(mcp, 0, sizeof(mcontext_t));
 
 	mcp->mc_vers = _MC_VERSION;
 	mcp->mc_flags = 0;
 	memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe));
 	if ((flags & GET_MC_CLEAR_RET) != 0) {
 		mcp->mc_gpr[3] = 0;
 		mcp->mc_gpr[4] = 0;
 	}
 
 	/*
 	 * FP context is assumed to be non-lazy: a thread that has used FP
 	 * has already taken an FP-unavailable exception, which set the
 	 * state up correctly.
 	 */
 	if ((tpcb->pcb_flags & PCB_FPU) != 0) {
 		KASSERT(td == curthread,
 			("get_mcontext: fp save not curthread"));
 		critical_enter();
 		save_fpu(td);
 		critical_exit();
 		mcp->mc_flags |= _MC_FP_VALID;
 		memcpy(&mcp->mc_fpscr, &tpcb->pcb_fpu.fpscr, sizeof(double));
 		memcpy(mcp->mc_fpreg, tpcb->pcb_fpu.fpr, 32*sizeof(double));
 	}
 
 	/* XXX Altivec context ? */
 
 	mcp->mc_len = sizeof(*mcp);
 
 	return (0);
 }
 
 /*
  * Fill *mcp with td's machine context via grab_mcontext() and, on
  * success, record under the proc lock whether the thread's stack pointer
  * is on the signal stack.  Returns grab_mcontext()'s result.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	int error = grab_mcontext(td, mcp, flags);
 
 	if (error != 0)
 		return (error);
 
 	PROC_LOCK(curthread->td_proc);
 	mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
 	PROC_UNLOCK(curthread->td_proc);
 
 	return (0);
 }
 
 /*
  * Install the machine context *mcp into thread td.
  *
  * Returns EINVAL if the context's version or length do not match, or if
  * its srr1 differs from the current trapframe's in any PSL_USERSTATIC
  * bit; otherwise copies the register frame (and the FP state when
  * _MC_FP_VALID is set) and returns 0.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *frame = td->td_frame;
 	struct pcb *tpcb = td->td_pcb;
 
 	/* Reject a context of the wrong version or size. */
 	if (mcp->mc_vers != _MC_VERSION)
 		return (EINVAL);
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 
 	/*
 	 * Userland must not change privileged MSR bits.
 	 */
 	if ((mcp->mc_srr1 & PSL_USERSTATIC) != (frame->srr1 & PSL_USERSTATIC))
 		return (EINVAL);
 
 	memcpy(frame, mcp->mc_frame, sizeof(mcp->mc_frame));
 
 	if ((mcp->mc_flags & _MC_FP_VALID) != 0) {
 		/* Enable the FPU for the thread if it has not used it yet. */
 		if ((tpcb->pcb_flags & PCB_FPU) != PCB_FPU) {
 			critical_enter();
 			enable_fpu(td);
 			critical_exit();
 		}
 		memcpy(&tpcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double));
 		memcpy(tpcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double));
 	}
 
 	/* XXX Altivec context? */
 
 	return (0);
 }
 
 /*
  * Machine-dependent boot hook.  This implementation has an empty body
  * and ignores howto.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Clock initialisation entry point; all of the work is delegated to
  * decr_init().
  */
 void
 cpu_initclocks(void)
 {
 
 	decr_init();
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * This implementation never produces an estimate: it always returns
  * ENXIO and leaves *rate untouched.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	const int unsupported = ENXIO;
 
 	return (unsupported);
 }
 
 /*
  * Shutdown the CPU as much as possible.
  *
  * Implemented here as a single call to OF_exit().
  */
 void
 cpu_halt(void)
 {
 
 	OF_exit();
 }
 
 /*
  * Idle hook.  No halt-until-interrupt is implemented yet; with INVARIANTS
  * it only checks that external interrupts (PSL_EE) are enabled, and
  * panics after printing the thread's saved MSR if they are not.
  */
 void
 cpu_idle(void)
 {
 	/* TODO: Insert code to halt (until next interrupt) */
 
 #ifdef INVARIANTS
 	struct thread *self;
 
 	if ((mfmsr() & PSL_EE) == PSL_EE)
 		return;
 
 	self = curthread;
 	printf("td msr %x\n", self->td_md.md_saved_msr);
 	panic("ints disabled in idleproc!");
 #endif
 }
 
 /*
  * Set up registers on exec.
  *
  * Clears td's trapframe, sets the user stack pointer (r1) from `stack'
  * (8 bytes reserved, rounded down to a 16-byte boundary), loads the
  * _start() arguments from the ps_strings block at PS_STRINGS, and sets
  * srr0/srr1 for entry at `entry' in user mode.  The thread's PCB flags
  * are cleared.  The ps_strings argument is not referenced; the
  * PS_STRINGS constant is used instead.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe	*tf;
 	struct ps_strings	arginfo;
 
 	tf = trapframe(td);
 	bzero(tf, sizeof *tf);
 	tf->fixreg[1] = -roundup(-stack + 8, 16);
 
 	/*
 	 * XXX Machine-independent code has already copied arguments and
 	 * XXX environment to userland.  Get them back here.
 	 *
 	 * The copyin() result is deliberately ignored, so clear arginfo
 	 * first: if the copy fails, the user registers and return values
 	 * set below then hold zeroes rather than stale kernel stack
 	 * contents.
 	 */
 	bzero(&arginfo, sizeof(arginfo));
 	(void)copyin((char *)PS_STRINGS, &arginfo, sizeof(arginfo));
 
 	/*
 	 * Set up arguments for _start():
 	 *	_start(argc, argv, envp, obj, cleanup, ps_strings);
 	 *
 	 * Notes:
 	 *	- obj and cleanup are the auxiliary and termination
 	 *	  vectors.  They are fixed up by ld.elf_so.
 	 *	- ps_strings is a NetBSD extension, and will be
 	 * 	  ignored by executables which are strictly
 	 *	  compliant with the SVR4 ABI.
 	 *
 	 * XXX We have to set both regs and retval here due to different
 	 * XXX calling convention in trap.c and init_main.c.
 	 */
 	/*
 	 * XXX PG: these get overwritten in the syscall return code.
 	 * execve() should return EJUSTRETURN, like it does on NetBSD.
 	 * Emulate by setting the syscall return value cells. The
 	 * registers still have to be set for init's fork trampoline.
 	 */
 	td->td_retval[0] = arginfo.ps_nargvstr;
 	td->td_retval[1] = (register_t)arginfo.ps_argvstr;
 	tf->fixreg[3] = arginfo.ps_nargvstr;
 	tf->fixreg[4] = (register_t)arginfo.ps_argvstr;
 	tf->fixreg[5] = (register_t)arginfo.ps_envstr;
 	tf->fixreg[6] = 0;			/* auxiliary vector */
 	tf->fixreg[7] = 0;			/* termination vector */
 	tf->fixreg[8] = (register_t)PS_STRINGS;	/* NetBSD extension */
 
 	tf->srr0 = entry;
 	tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
 	td->td_pcb->pcb_flags = 0;
 }
 
 /*
  * Copy the first sizeof(struct reg) bytes of td's trapframe into *regs.
  * Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	memcpy(regs, td->td_frame, sizeof(struct reg));
 	return (0);
 }
 
 /*
  * PowerPC has no debug registers: nothing is written to *dbregs and
  * ENOSYS is always returned.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	(void)td;
 	(void)dbregs;
 	return (ENOSYS);
 }
 
 /*
  * Export td's floating-point registers into *fpregs: a copy of the PCB's
  * saved FPU state if PCB_FPU is set in the PCB flags, all zeroes
  * otherwise.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *tpcb = td->td_pcb;
 
 	if ((tpcb->pcb_flags & PCB_FPU) != 0)
 		memcpy(fpregs, &tpcb->pcb_fpu, sizeof(struct fpreg));
 	else
 		memset(fpregs, 0, sizeof(struct fpreg));
 
 	return (0);
 }
 
 /*
  * Overwrite the first sizeof(struct reg) bytes of td's trapframe with
  * *regs.  Always returns 0.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 
 	memcpy(td->td_frame, regs, sizeof(struct reg));
 	return (0);
 }
 
 /*
  * PowerPC has no debug registers: *dbregs is ignored and ENOSYS is
  * always returned.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	(void)td;
 	(void)dbregs;
 	return (ENOSYS);
 }
 
 /*
  * Load *fpregs into td's saved FPU state in the PCB, calling
  * enable_fpu() first if PCB_FPU is not yet set in the PCB flags.
  * Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *tpcb = td->td_pcb;
 
 	if ((tpcb->pcb_flags & PCB_FPU) == 0)
 		enable_fpu(td);
 
 	memcpy(&tpcb->pcb_fpu, fpregs, sizeof(struct fpreg));
 	return (0);
 }
 
 /*
  * Set the saved program counter (srr0) in td's trapframe to addr.
  * Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 
 	td->td_frame->srr0 = (register_t)addr;
 	return (0);
 }
 
 /*
  * Turn single-stepping on for td by setting PSL_SE in the saved srr1 of
  * its trapframe.  Always returns 0.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 
 	td->td_frame->srr1 |= PSL_SE;
 	return (0);
 }
 
 /*
  * Turn single-stepping off for td by clearing PSL_SE in the saved srr1
  * of its trapframe.  Always returns 0.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	td->td_frame->srr1 &= ~PSL_SE;
 	return (0);
 }
 
 /*
  * Initialise a struct pcpu.
  *
  * This implementation has an empty body; none of the arguments are
  * used.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz)
 {
 
 }
 
 /*
  * Enter a spinlock section for the current thread.
  *
  * On the outermost entry (nesting count is zero) interrupts are disabled
  * and the value returned by intr_disable() is saved in the thread's
  * md_saved_msr.  The nesting count is then incremented and a critical
  * section is entered.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0)
 		td->td_md.md_saved_msr = intr_disable();
 	td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section for the current thread.
  *
  * Exits the critical section and decrements the nesting count; when the
  * count reaches zero the MSR value saved by spinlock_enter() is handed
  * back to intr_restore().
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	critical_exit();
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0)
 		intr_restore(td->td_md.md_saved_msr);
 }
 
 /*
  * kcopy(const void *src, void *dst, size_t len);
  *
  * Copy len bytes from src to dst, aborting if we encounter a fatal
  * page fault.
  *
  * kcopy() _must_ save and restore the old fault handler since it is
  * called by uiomove(), which may be in the path of servicing a non-fatal
  * page fault.
  */
 /*
  * Returns 0 after a completed copy, or the non-zero value produced by
  * setfault() otherwise.  The thread's previous pcb_onfault value is
  * restored on both paths.
  */
 int
 kcopy(const void *src, void *dst, size_t len)
 {
 	struct thread	*td;
 	faultbuf	env, *oldfault;
 	int		rv;
 
 	td = PCPU_GET(curthread);
 	/* Remember the handler that was installed before this call. */
 	oldfault = td->td_pcb->pcb_onfault;
 	/*
 	 * NOTE(review): setfault() is defined in locore.S; presumably it
 	 * installs env as the fault handler and returns non-zero when
 	 * re-entered after a fault, setjmp-style -- confirm there.
 	 */
 	if ((rv = setfault(env)) != 0) {
 		td->td_pcb->pcb_onfault = oldfault;
 		return rv;
 	}
 
 	memcpy(dst, src, len);
 
 	td->td_pcb->pcb_onfault = oldfault;
 	return (0);
 }
 
 /*
  * Panic with pstr as the message.
  *
  * pstr is printed through a fixed "%s" format rather than being used as
  * the format string itself, so any '%' characters in the message are
  * printed literally instead of being parsed as conversions.
  */
 void
 asm_panic(char *pstr)
 {
 	panic("%s", pstr);
 }
 
 int db_trap_glue(struct trapframe *);		/* Called from trap_subr.S */
 
 /*
  * Decide whether a trap should be handed to the kernel debugger.
  *
  * Only traps taken with PSL_PR clear in srr1 qualify, and only these
  * exception types: trace, run-mode trace, breakpoint, DSI, and program
  * exceptions with bit 0x20000 set in srr1 (reported to kdb_trap() as
  * T_BREAKPOINT).  Returns kdb_trap()'s result for a qualifying trap and
  * 0 otherwise.
  */
 int
 db_trap_glue(struct trapframe *frame)
 {
 	int pgm_trap, type;
 
 	/* Traps taken with PSL_PR set are never passed to the debugger. */
 	if ((frame->srr1 & PSL_PR) != 0)
 		return (0);
 
 	pgm_trap = (frame->exc == EXC_PGM && (frame->srr1 & 0x20000) != 0);
 
 	if (!pgm_trap &&
 	    frame->exc != EXC_TRC &&
 	    frame->exc != EXC_RUNMODETRC &&
 	    frame->exc != EXC_BPT &&
 	    frame->exc != EXC_DSI)
 		return (0);
 
 	type = frame->exc;
 	if (pgm_trap)
 		type = T_BREAKPOINT;
 
 	return (kdb_trap(type, 0, frame));
 }
Index: head/sys/powerpc/powerpc/machdep.c
===================================================================
--- head/sys/powerpc/powerpc/machdep.c	(revision 173360)
+++ head/sys/powerpc/powerpc/machdep.c	(revision 173361)
@@ -1,990 +1,990 @@
 /*-
  * Copyright (C) 1995, 1996 Wolfgang Solfrank.
  * Copyright (C) 1995, 1996 TooLs GmbH.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Copyright (C) 2001 Benno Rice
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *	$NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cons.h>
 #include <sys/cpu.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/ucontext.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <net/netisr.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #include <machine/bat.h>
 #include <machine/cpu.h>
 #include <machine/elf.h>
 #include <machine/fpu.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/mmuvar.h>
 #include <machine/pcb.h>
 #include <machine/powerpc.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/trap.h>
 #include <machine/vmparam.h>
 
 #include <ddb/ddb.h>
 
 #include <dev/ofw/openfirm.h>
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 int cold = 1;
 
 struct		pcpu __pcpu[MAXCPU];
 struct		trapframe frame0;
 
 vm_offset_t	kstack0;
 vm_offset_t	kstack0_phys;
 
 char		machine[] = "powerpc";
 SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");
 
 static int cacheline_size = CACHELINESIZE;
 SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size,
 	   CTLFLAG_RD, &cacheline_size, 0, "");
 
 static void	cpu_startup(void *);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL)
 
 void		powerpc_init(u_int, u_int, u_int, void *);
 
 int		save_ofw_mapping(void);
 int		restore_ofw_mapping(void);
 
 void		install_extint(void (*)(void));
 
 int             setfault(faultbuf);             /* defined in locore.S */
 
 static int	grab_mcontext(struct thread *, mcontext_t *, int);
 
 void		asm_panic(char *);
 
 long		Maxmem = 0;
 long		realmem = 0;
 
 struct pmap	ofw_pmap;
 extern int	ofmsr;
 
 struct bat	battable[16];
 
 struct kva_md_info kmi;
 
 void setPQL2(int *const size, int *const ways);
 
 /*
  * Placeholder implementation: accepts the `size' and `ways' pointers but
  * stores nothing through either of them.
  */
 void
 setPQL2(int *const size, int *const ways)
 {
 }
 
 /*
  * Shutdown handler: calls OF_halt() first when RB_HALT is set in howto,
  * then calls OF_reboot().  The first argument is unused.
  */
 static void
 powerpc_ofw_shutdown(void *junk, int howto)
 {
 	int halt_requested = ((howto & RB_HALT) != 0);
 
 	if (halt_requested)
 		OF_halt();
 
 	OF_reboot();
 }
 
 /*
  * SYSINIT hook: sets up the boot CPU, reports real and available memory,
  * initialises the kernel submaps and the buffer cache, and registers the
  * Open Firmware shutdown handler.  The argument is unused.
  */
 static void
 cpu_startup(void *dummy)
 {
 
 	/* CPU setup for the CPU this code is running on. */
 	cpu_setup(PCPU_GET(cpuid));
 
 	/* startrtclock(); */
 #ifdef PERFMON
 	perfmon_init();
 #endif
 	printf("real memory  = %ld (%ld MB)\n", ptoa(physmem),
 	    ptoa(physmem) / 1048576);
 	realmem = physmem;
 
 	/*
 	 * On a verbose boot, list every start/end pair in phys_avail[];
 	 * the list ends at the first pair whose end address is zero.
 	 */
 	if (bootverbose) {
 		int pair = 0;
 
 		printf("Physical memory chunk(s):\n");
 		while (phys_avail[pair + 1] != 0) {
 			int nbytes = phys_avail[pair + 1] - phys_avail[pair];
 
 			printf("0x%08x - 0x%08x, %d bytes (%d pages)\n",
 			    phys_avail[pair], phys_avail[pair + 1] - 1, nbytes,
 			    nbytes / PAGE_SIZE);
 			pair += 2;
 		}
 	}
 
 	vm_ksubmap_init(&kmi);
 
 	printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
 	    ptoa(cnt.v_free_count) / 1048576);
 
 	/*
 	 * Buffers must exist before disk labels can be read.
 	 */
 	bufinit();
 	vm_pager_bufferinit();
 
 	EVENTHANDLER_REGISTER(shutdown_final, powerpc_ofw_shutdown, 0,
 	    SHUTDOWN_PRI_LAST);
 
 #ifdef SMP
 	/*
 	 * Enough kmem_alloc/malloc state should be up by now to start
 	 * the secondary CPUs.
 	 */
 	mp_start();			/* fire up the secondaries */
 	mp_announce();
 #endif  /* SMP */
 }
 
 extern char	kernel_text[], _end[];
 
 extern void	*trapcode, *trapsize;
 extern void	*alitrap, *alisize;
 extern void	*dsitrap, *dsisize;
 extern void	*decrint, *decrsize;
 extern void     *extint, *extsize;
 extern void	*dblow, *dbsize;
 extern void	*vectrap, *vectrapsize;
 
 /*
  * Early machine-dependent initialisation for the boot CPU.
  *
  * In order: fetch loader metadata (boothowto, environment, kernel end and,
  * with DDB, the symbol table bounds), link proc0/thread0, set up pcpu 0 and
  * load its address into SPRG0, initialise mutexes, the console, kdb and
  * kobj, copy the exception handlers to their vectors with translation
  * disabled, turn translation back on, bootstrap the pmap, finish thread0's
  * kernel stack and PCB, and map and initialise the message buffer.
  *
  * mdp is the loader metadata pointer and may be NULL; startkernel and
  * endkernel are handed to pmap_bootstrap().  basekernel is not referenced
  * in this function.
  */
 void
 powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp)
 {
 	struct		pcpu *pc;
 	vm_offset_t	end, off;
 	void		*kmdp;
         char		*env;
 
 	end = 0;
 	kmdp = NULL;
 
 	/*
 	 * Parse metadata if present and fetch parameters.  Must be done
 	 * before console is inited so cninit gets the right value of
 	 * boothowto.
 	 */
 	if (mdp != NULL) {
 		preload_metadata = mdp;
 		kmdp = preload_search_by_type("elf kernel");
 		if (kmdp != NULL) {
 			boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 			kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 			end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
 #ifdef DDB
 			ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
 			ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
 #endif
 		}
 	}
 
 	/*
 	 * Init params/tunables that can be overridden by the loader
 	 */
 	init_param1();
 
 	/*
 	 * Start initializing proc0 and thread0.
 	 */
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	thread0.td_frame = &frame0;
 
 	/*
 	 * Set up per-cpu data.
 	 */
 	pc = &__pcpu[0];
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	pc->pc_curthread = &thread0;
 	/*
 	 * NOTE(review): thread0.td_pcb is only assigned further down, under
 	 * "Finish setting up thread0", and pc_curpcb is not refreshed after
 	 * that.  Unless the proc linkup call above already sets td_pcb, this
 	 * stores a stale value -- confirm.
 	 */
 	pc->pc_curpcb = thread0.td_pcb;
 	pc->pc_cpuid = 0;
 
 	/* Publish the pcpu pointer in SPRG0. */
 	__asm __volatile("mtsprg 0, %0" :: "r"(pc));
 
 	mutex_init();
 
 	/*
 	 * Initialize the console before printing anything.
 	 */
 	cninit();
 
 	/*
 	 * Complain if there is no metadata.
 	 */
 	if (mdp == NULL || kmdp == NULL) {
 		printf("powerpc_init: no loader metadata.\n");
 	}
 
 	kdb_init();
 
 	kobj_machdep_init();
 
 	/*
 	 * XXX: Initialize the interrupt tables.
 	 *      Disable translation in case the vector area
 	 *      hasn't been mapped (G5)
 	 *
 	 * Each bcopy() length is the *address* of the matching xxxsize
 	 * symbol cast to size_t, not the value stored at it; presumably
 	 * these are absolute symbols emitted by the assembly source --
 	 * TODO confirm.
 	 */
 	mtmsr(mfmsr() & ~(PSL_IR | PSL_DR));
 	isync();
 	bcopy(&trapcode, (void *)EXC_RST,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_MCHK, (size_t)&trapsize);
 	bcopy(&dsitrap,  (void *)EXC_DSI,  (size_t)&dsisize);
 	bcopy(&trapcode, (void *)EXC_ISI,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_EXI,  (size_t)&trapsize);
 	bcopy(&alitrap,  (void *)EXC_ALI,  (size_t)&alisize);
 	bcopy(&trapcode, (void *)EXC_PGM,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_FPU,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_DECR, (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_SC,   (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_TRC,  (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_FPA,  (size_t)&trapsize);
 	bcopy(&vectrap,  (void *)EXC_VEC,  (size_t)&vectrapsize);
 	bcopy(&trapcode, (void *)EXC_VECAST, (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_THRM, (size_t)&trapsize);
 	bcopy(&trapcode, (void *)EXC_BPT,  (size_t)&trapsize);
 #ifdef KDB
 	/* With KDB these five vectors are overwritten with dblow. */
 	bcopy(&dblow,	 (void *)EXC_RST,  (size_t)&dbsize);
 	bcopy(&dblow,	 (void *)EXC_MCHK, (size_t)&dbsize);
 	bcopy(&dblow,   (void *)EXC_PGM,  (size_t)&dbsize);
 	bcopy(&dblow,   (void *)EXC_TRC,  (size_t)&dbsize);
 	bcopy(&dblow,   (void *)EXC_BPT,  (size_t)&dbsize);
 #endif
 	__syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD);
 
 	/*
 	 * Make sure translation has been enabled
 	 */
 	mtmsr(mfmsr() | PSL_IR|PSL_DR|PSL_ME|PSL_RI);
 	isync();
 
 	/*
 	 * Initialise virtual memory.
 	 */
 	pmap_mmu_install(MMU_TYPE_OEA, 0);		/* XXX temporary */
 	pmap_bootstrap(startkernel, endkernel);
 
 	/*
 	 * Initialize params/tunables that are derived from memsize
 	 */
 	init_param2(physmem);
 
 	/*
 	 * Grab booted kernel's name
 	 */
         env = getenv("kernelname");
         if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	/*
 	 * Finish setting up thread0.
 	 * The PCB is placed in the last sizeof(struct pcb) bytes of the
 	 * KSTACK_PAGES-page kernel stack.
 	 */
 	thread0.td_kstack = kstack0;
 	thread0.td_pcb = (struct pcb *)
 	    (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 
 	/*
 	 * Map and initialise the message buffer.
 	 */
 	for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
 		pmap_kenter((vm_offset_t)msgbufp + off, msgbuf_phys + off);
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 }
 
 /*
  * Zero len bytes starting at buf.
  *
  * Works in four phases: single bytes until the pointer is u_long aligned,
  * then eight u_long stores per pass, then one u_long per pass, and finally
  * any remaining tail bytes.
  */
 void
 bzero(void *buf, size_t len)
 {
 	caddr_t	cp;
 	u_long	*wp;
 
 	cp = buf;
 
 	/* Leading bytes up to the first u_long boundary. */
 	for (; len != 0 && ((vm_offset_t)cp & (sizeof(u_long) - 1)) != 0;
 	    len--)
 		*cp++ = 0;
 
 	/* Bulk of the buffer, eight words at a time. */
 	for (; len >= sizeof(u_long) * 8; len -= sizeof(u_long) * 8) {
 		wp = (u_long *)cp;
 		wp[0] = 0;
 		wp[1] = 0;
 		wp[2] = 0;
 		wp[3] = 0;
 		wp[4] = 0;
 		wp[5] = 0;
 		wp[6] = 0;
 		wp[7] = 0;
 		cp += sizeof(u_long) * 8;
 	}
 
 	/* Remaining whole words. */
 	for (; len >= sizeof(u_long); len -= sizeof(u_long)) {
 		*(u_long *)cp = 0;
 		cp += sizeof(u_long);
 	}
 
 	/* Trailing bytes. */
 	for (; len != 0; len--)
 		*cp++ = 0;
 }
 
 /*
  * Deliver a signal to the current thread by building a struct sigframe,
  * copying it out to the user stack (or the alternate signal stack) and
  * rewriting the thread's trapframe so that it resumes in the signal
  * trampoline with the handler's arguments in place.
  *
  *   catcher - user handler address, loaded into lr
  *   ksi     - signal information (signal number, code, siginfo)
  *   mask    - signal mask saved into the frame's ucontext
  *
  * Entered with the proc lock and p_sigacts->ps_mtx held (both asserted).
  * Both are dropped around the copyout() and re-taken before returning.
  * If the copyout() fails, sigexit(td, SIGILL) is called with the proc lock
  * held.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct trapframe *tf;
 	struct sigframe *sfp;
 	struct sigacts *psp;
 	struct sigframe sf;
 	struct thread *td;
 	struct proc *p;
 	int oonstack, rndfsize;
 	int sig;
 	int code;
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	/* r1 is the user stack pointer at the time of the signal. */
 	oonstack = sigonstack(tf->fixreg[1]);
 
 	/* Frame size rounded up to a multiple of 16 bytes. */
 	rndfsize = ((sizeof(sf) + 15) / 16) * 16;
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	     catcher, sig);
 
 	/*
 	 * Save user context
 	 */
 	memset(&sf, 0, sizeof(sf));
 	grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
 
 	/*
 	 * Allocate and validate space for the signal handler context.
 	 * The frame goes at the top of the alternate stack when one is
 	 * configured, not already in use, and requested for this signal;
 	 * otherwise directly below the current user stack pointer.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		sfp = (struct sigframe *)(td->td_sigstk.ss_sp +
 		   td->td_sigstk.ss_size - rndfsize);
 	} else {
 		sfp = (struct sigframe *)(tf->fixreg[1] - rndfsize);
 	}
 
 	/*
 	 * Translate the signal if appropriate (Linux emu ?)
 	 */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/*
 	 * Save the floating-point state, if necessary, then copy it.
 	 */
 	/* XXX */
 
 	/*
 	 * Set up the registers to return to sigcode.
 	 *
 	 *   r1/sp - sigframe ptr
 	 *   lr    - sig function, dispatched to by blrl in trampoline
 	 *   r3    - sig number
 	 *   r4    - SIGINFO ? &siginfo : exception code
 	 *   r5    - user context
 	 *   srr0  - trampoline function addr
 	 */
 	tf->lr = (register_t)catcher;
 	tf->fixreg[1] = (register_t)sfp;
 	tf->fixreg[FIRSTARG] = sig;
 	tf->fixreg[FIRSTARG+2] = (register_t)&sfp->sf_uc;
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/*
 		 * Signal handler installed with SA_SIGINFO.
 		 */
 		tf->fixreg[FIRSTARG+1] = (register_t)&sfp->sf_si;
 
 		/*
 		 * Fill siginfo structure.
 		 * The fault address is dar for a DSI exception and srr0
 		 * for everything else.
 		 */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig;
 		sf.sf_si.si_addr = (void *) ((tf->exc == EXC_DSI) ? 
 		                             tf->dar : tf->srr0);
 	} else {
 		/* Old FreeBSD-style arguments. */
 		tf->fixreg[FIRSTARG+1] = code;
 		tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? 
 		                             tf->dar : tf->srr0;
 	}
 	/* Drop both locks before touching user memory with copyout(). */
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Resume at PS_STRINGS minus the size of the signal trampoline. */
 	tf->srr0 = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
 
 	/*
 	 * copy the frame out to userland.
 	 */
 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
 		/*
 		 * Process has trashed its stack. Kill it.
 		 */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td,
 	     tf->srr0, tf->fixreg[1]);
 
 	/* Re-take the locks the caller expects to still hold. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * sigreturn system call: copy the ucontext at uap->sigcntxp in from user
  * space, install its machine context with set_mcontext() and restore the
  * thread's signal mask from it.
  *
  * Returns EFAULT if the copyin fails, the set_mcontext() error if that
  * fails, and EJUSTRETURN on success.
  */
 int
 sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	ucontext_t ctx;
 	struct proc *p;
 	int rv;
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 
 	rv = copyin(uap->sigcntxp, &ctx, sizeof(ctx));
 	if (rv != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	rv = set_mcontext(td, &ctx.uc_mcontext);
 	if (rv != 0)
 		return (rv);
 
 	/* Restore the mask under the proc lock, minus unmaskable signals. */
 	p = td->td_proc;
 	PROC_LOCK(p);
 	td->td_sigmask = ctx.uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
 	     td, ctx.uc_mcontext.mc_srr0, ctx.uc_mcontext.mc_gpr[1]);
 
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: reinterprets its argument structure
  * as a struct sigreturn_args and defers to sigreturn().
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args;
 
 	args = (struct sigreturn_args *)uap;
 	return (sigreturn(td, args));
 }
 #endif
 
 /*
  * Construct a PCB from a trapframe. This is called from kdb_trap() where
  * we want to start a backtrace from the function that caused us to enter
  * the debugger. We have the context in the trapframe, but base the trace
  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
  * enough for a backtrace.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 	/* Only two fields are filled: sp from r1 and lr from srr0. */
 	pcb->pcb_sp = tf->fixreg[1];
 	pcb->pcb_lr = tf->srr0;
 }
 
 /*
  * get_mcontext/sendsig helper routine that doesn't touch the
  * proc lock
  */
 static int
 grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct pcb *const pcb = td->td_pcb;
 
 	/* Start from a zeroed context, then stamp version and flags. */
 	memset(mcp, 0, sizeof(*mcp));
 	mcp->mc_vers = _MC_VERSION;
 	mcp->mc_flags = 0;
 
 	/* Register state comes straight from the thread's trapframe. */
 	memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe));
 	if ((flags & GET_MC_CLEAR_RET) != 0) {
 		/* Caller asked for r3/r4 to be cleared in the copy. */
 		mcp->mc_gpr[3] = 0;
 		mcp->mc_gpr[4] = 0;
 	}
 
 	/*
 	 * Floating-point context is assumed *not* to be lazy: a thread
 	 * that has used FP will have taken an FP-unavailable exception,
 	 * which set things up so that PCB_FPU is a reliable indicator.
 	 */
 	if ((pcb->pcb_flags & PCB_FPU) != 0) {
 		KASSERT(td == curthread,
 			("get_mcontext: fp save not curthread"));
 		critical_enter();
 		save_fpu(td);
 		critical_exit();
 		mcp->mc_flags |= _MC_FP_VALID;
 		memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double));
 		memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double));
 	}
 
 	/* XXX Altivec context ? */
 
 	mcp->mc_len = sizeof(*mcp);
 
 	return (0);
 }
 
 /*
  * Fill *mcp with td's machine context via grab_mcontext() and, on success,
  * record under the proc lock whether the thread's stack pointer (r1) is on
  * the signal stack.  Returns grab_mcontext()'s result.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
 {
 	struct proc *p;
 	int rv;
 
 	rv = grab_mcontext(td, mcp, flags);
 	if (rv != 0)
 		return (rv);
 
 	p = curthread->td_proc;
 	PROC_LOCK(p);
 	mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
 	PROC_UNLOCK(p);
 
 	return (rv);
 }
 
 /*
  * Install the machine context *mcp into td.
  *
  * Returns EINVAL when the version or length field does not match this
  * kernel's mcontext_t, or when the PSL_USERSTATIC bits of mc_srr1 differ
  * from those in the thread's current srr1; returns 0 otherwise.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mcp)
 {
 	struct trapframe *const tf = td->td_frame;
 	struct pcb *const pcb = td->td_pcb;
 
 	if (mcp->mc_vers != _MC_VERSION)
 		return (EINVAL);
 	if (mcp->mc_len != sizeof(*mcp))
 		return (EINVAL);
 
 	/* Don't let the user set privileged MSR bits. */
 	if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC))
 		return (EINVAL);
 
 	memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame));
 
 	/* Nothing more to do unless the context carries FP state. */
 	if ((mcp->mc_flags & _MC_FP_VALID) == 0)
 		return (0);
 
 	if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) {
 		critical_enter();
 		enable_fpu(td);
 		critical_exit();
 	}
 	memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double));
 	memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double));
 
 	/* XXX Altivec context? */
 
 	return (0);
 }
 
 /*
  * Boot hook taking the howto flags; this implementation is empty and
  * ignores its argument.
  */
 void
 cpu_boot(int howto)
 {
 }
 
 /*
  * Clock initialisation hook; the only action here is to call decr_init().
  */
 void
 cpu_initclocks(void)
 {
 
 	decr_init();
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * This implementation never writes *rate and always returns ENXIO.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 
 	return (ENXIO);
 }
 
 /*
  * Shutdown the CPU as much as possible.
  */
 void
 cpu_halt(void)
 {
 
 	/* The whole implementation is a call to OF_exit(). */
 	OF_exit();
 }
 
 /*
  * Idle hook.  Nothing is done to halt the CPU yet (see the TODO); with
  * INVARIANTS the function panics if PSL_EE is clear in the current MSR.
  */
 void
 cpu_idle(void)
 {
 	/* TODO: Insert code to halt (until next interrupt) */
 
 #ifdef INVARIANTS
 	if ((mfmsr() & PSL_EE) != PSL_EE) {
 		struct thread *td = curthread;
 		/* Print the MSR saved by spinlock_enter() before panicking. */
 		printf("td msr %x\n", td->td_md.md_saved_msr);
 		panic("ints disabled in idleproc!");
 	}
 #endif
 }
 
 /*
  * Set up registers on exec.
  *
  * Clears td's trapframe and then loads the initial user register state for
  * a freshly exec'd image:
  *
  *   entry      - user entry point, stored in srr0
  *   stack      - initial user stack address; r1 is derived from it
  *   ps_strings - not consulted here; the PS_STRINGS constant is used
  *
  * The argument and environment vectors are read back from the struct
  * ps_strings at PS_STRINGS.  The thread's pcb_flags are reset to 0.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe	*tf;
 	struct ps_strings	arginfo;
 
 	tf = trapframe(td);
 	bzero(tf, sizeof *tf);
 	/* Derive r1 from stack, adjusted by 8 and rounded with roundup(). */
 	tf->fixreg[1] = -roundup(-stack + 8, 16);
 
 	/*
 	 * XXX Machine-independent code has already copied arguments and
 	 * XXX environment to userland.  Get them back here.
 	 *
 	 * The copyin() result is deliberately ignored, so clear arginfo
 	 * first: if the copy fails, the registers below receive zeroes
 	 * instead of leftover kernel stack contents.
 	 */
 	bzero(&arginfo, sizeof(arginfo));
 	(void)copyin((char *)PS_STRINGS, &arginfo, sizeof(arginfo));
 
 	/*
 	 * Set up arguments for _start():
 	 *	_start(argc, argv, envp, obj, cleanup, ps_strings);
 	 *
 	 * Notes:
 	 *	- obj and cleanup are the auxiliary and termination
 	 *	  vectors.  They are fixed up by ld.elf_so.
 	 *	- ps_strings is a NetBSD extension, and will be
 	 *	  ignored by executables which are strictly
 	 *	  compliant with the SVR4 ABI.
 	 *
 	 * XXX We have to set both regs and retval here due to different
 	 * XXX calling convention in trap.c and init_main.c.
 	 */
 	/*
 	 * XXX PG: these get overwritten in the syscall return code.
 	 * execve() should return EJUSTRETURN, like it does on NetBSD.
 	 * Emulate by setting the syscall return value cells. The
 	 * registers still have to be set for init's fork trampoline.
 	 */
 	td->td_retval[0] = arginfo.ps_nargvstr;
 	td->td_retval[1] = (register_t)arginfo.ps_argvstr;
 	tf->fixreg[3] = arginfo.ps_nargvstr;
 	tf->fixreg[4] = (register_t)arginfo.ps_argvstr;
 	tf->fixreg[5] = (register_t)arginfo.ps_envstr;
 	tf->fixreg[6] = 0;			/* auxiliary vector */
 	tf->fixreg[7] = 0;			/* termination vector */
 	tf->fixreg[8] = (register_t)PS_STRINGS;	/* NetBSD extension */
 
 	tf->srr0 = entry;
 	tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
 	td->td_pcb->pcb_flags = 0;
 }
 
 /*
  * Copy sizeof(struct reg) bytes from the start of td's trapframe into
  * *regs.  Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 
 	memcpy(regs, td->td_frame, sizeof(struct reg));
 	return (0);
 }
 
 /*
  * Debug-register read hook.  Neither argument is touched; the function
  * always returns ENOSYS.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	/* No debug registers on PowerPC */
 	return (ENOSYS);
 }
 
 /*
  * Copy td's saved floating-point state into *fpregs.  When PCB_FPU is not
  * set in the thread's pcb_flags, *fpregs is zero-filled instead.
  * Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *const pcb = td->td_pcb;
 
 	if ((pcb->pcb_flags & PCB_FPU) != 0)
 		memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg));
 	else
 		memset(fpregs, 0, sizeof(struct fpreg));
 
 	return (0);
 }
 
 /*
  * Overwrite the first sizeof(struct reg) bytes of td's trapframe with
  * *regs.  Always returns 0.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 
 	memcpy(td->td_frame, regs, sizeof(struct reg));
 	return (0);
 }
 
 /*
  * Debug-register write hook.  Neither argument is touched; the function
  * always returns ENOSYS.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	/* No debug registers on PowerPC */
 	return (ENOSYS);
 }
 
 /*
  * Replace td's saved floating-point state with *fpregs, calling
  * enable_fpu() first if PCB_FPU is not yet set.  Always returns 0.
  *
  * NOTE(review): set_mcontext() wraps its enable_fpu() call in
  * critical_enter()/critical_exit(); this function does not -- confirm
  * that the difference is intentional.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *const pcb = td->td_pcb;
 	const int fpu_enabled = (pcb->pcb_flags & PCB_FPU) != 0;
 
 	if (!fpu_enabled)
 		enable_fpu(td);
 	memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg));
 
 	return (0);
 }
 
 /*
  * Set the resume address of td by storing addr in the trapframe's srr0.
  * Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, unsigned long addr)
 {
 
 	td->td_frame->srr0 = (register_t)addr;
 	return (0);
 }
 
 /*
  * Turn single-stepping on for td by setting PSL_SE in the trapframe's
  * srr1.  Always returns 0.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 
 	td->td_frame->srr1 |= PSL_SE;
 	return (0);
 }
 
 /*
  * Turn single-stepping off for td by clearing PSL_SE in the trapframe's
  * srr1.  Always returns 0.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	td->td_frame->srr1 &= ~PSL_SE;
 	return (0);
 }
 
 /*
  * Initialise a struct pcpu.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz)
 {
 
 	/* Intentionally empty: no fields are initialised here. */
 }
 
 /*
  * Enter a spinlock section on the current thread.
  *
  * Only the outermost call (md_spinlock_count == 0) disables interrupts,
  * saving the value returned by intr_disable() in md_saved_msr; nested
  * calls just increment the count.  Every call also enters a critical
  * section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0)
 		td->td_md.md_saved_msr = intr_disable();
 	td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spinlock section on the current thread.
  *
  * Exits the critical section, decrements md_spinlock_count and, when the
  * count reaches zero, restores the MSR value that was saved in
  * md_saved_msr.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	critical_exit();
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0)
 		intr_restore(td->td_md.md_saved_msr);
 }
 
 /*
  * kcopy(const void *src, void *dst, size_t len);
  *
  * Copy len bytes from src to dst, aborting if we encounter a fatal
  * page fault.
  *
  * kcopy() _must_ save and restore the old fault handler since it is
  * called by uiomove(), which may be in the path of servicing a non-fatal
  * page fault.
  */
 int
 kcopy(const void *src, void *dst, size_t len)
 {
 	struct thread	*td;
 	faultbuf	env, *oldfault;
 	int		rv;
 
 	td = PCPU_GET(curthread);
 	/* Remember the current handler so it can be restored on every path. */
 	oldfault = td->td_pcb->pcb_onfault;
 	/*
 	 * A non-zero result from setfault() is treated as the fault path:
 	 * the old handler is restored and the value is returned as the
 	 * error.  Presumably setfault() behaves like setjmp() and installs
 	 * env as pcb_onfault -- TODO confirm against locore.S.
 	 */
 	if ((rv = setfault(env)) != 0) {
 		td->td_pcb->pcb_onfault = oldfault;
 		return rv;
 	}
 
 	memcpy(dst, src, len);
 
 	td->td_pcb->pcb_onfault = oldfault;
 	return (0);
 }
 
 /*
  * Panic with the message pstr.
  *
  * The message is passed as the argument of a "%s" format rather than as
  * the format string itself, so any '%' characters in pstr are printed
  * literally instead of being interpreted as conversions.
  */
 void
 asm_panic(char *pstr)
 {
 	panic("%s", pstr);
 }
 
 int db_trap_glue(struct trapframe *);		/* Called from trap_subr.S */
 
 /*
  * Decide whether a trap should be handed to the kernel debugger.
  *
  * Only traps taken with PSL_PR clear in srr1 qualify, and only for the
  * trace, run-mode trace, breakpoint and DSI exceptions, or for a program
  * exception with srr1 bit 0x20000 set (reported as T_BREAKPOINT).
  * Returns kdb_trap()'s result for those, and 0 for everything else.
  */
 int
 db_trap_glue(struct trapframe *frame)
 {
 	int pgm_breakpoint, type;
 
 	if ((frame->srr1 & PSL_PR) != 0)
 		return (0);
 
 	pgm_breakpoint = (frame->exc == EXC_PGM &&
 	    (frame->srr1 & 0x20000) != 0);
 
 	if (!pgm_breakpoint && frame->exc != EXC_TRC &&
 	    frame->exc != EXC_RUNMODETRC && frame->exc != EXC_BPT &&
 	    frame->exc != EXC_DSI)
 		return (0);
 
 	type = frame->exc;
 	if (pgm_breakpoint)
 		type = T_BREAKPOINT;
 
 	return (kdb_trap(type, 0, frame));
 }
Index: head/sys/powerpc/powerpc/pmap_dispatch.c
===================================================================
--- head/sys/powerpc/powerpc/pmap_dispatch.c	(revision 173360)
+++ head/sys/powerpc/powerpc/pmap_dispatch.c	(revision 173361)
@@ -1,382 +1,383 @@
 /*-
  * Copyright (c) 2005 Peter Grehan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Dispatch MI pmap calls to the appropriate MMU implementation
  * through a previously registered kernel object.
  *
  * Before pmap_bootstrap() can be called, a CPU module must have
  * called pmap_mmu_install(). This may be called multiple times:
  * the highest priority call will be installed as the default
  * MMU handler when pmap_bootstrap() is called.
  *
  * It is required that kobj_machdep_init() be called before
  * pmap_bootstrap() to allow the kobj subsystem to initialise. This
  * in turn requires that mutex_init() has been called.
  */
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 
 #include <machine/mmuvar.h>
 
 #include "mmu_if.h"
 
 static mmu_def_t	*mmu_def_impl;
 static mmu_t		mmu_obj;
 static struct mmu_kobj	mmu_kernel_obj;
 static struct kobj_ops	mmu_kernel_kops;
 
 /*
  * pmap globals
  */
 struct pmap kernel_pmap_store;
 
 struct msgbuf *msgbufp;
 vm_offset_t    msgbuf_phys;
 
 vm_offset_t kernel_vm_end;
 vm_offset_t phys_avail[PHYS_AVAIL_SZ];
 vm_offset_t virtual_avail;
 vm_offset_t virtual_end;
 
 int pmap_bootstrapped;
 
 /* Dispatch to MMU_CHANGE_WIRING() on the installed MMU object (mmu_obj). */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	MMU_CHANGE_WIRING(mmu_obj, pmap, va, wired);
 }
 
 /* Dispatch to MMU_CLEAR_MODIFY() on the installed MMU object (mmu_obj). */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	MMU_CLEAR_MODIFY(mmu_obj, m);
 }
 
 /* Dispatch to MMU_CLEAR_REFERENCE() on the installed MMU object (mmu_obj). */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	MMU_CLEAR_REFERENCE(mmu_obj, m);
 }
 
 /* Dispatch to MMU_COPY() on the installed MMU object (mmu_obj). */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 	MMU_COPY(mmu_obj, dst_pmap, src_pmap, dst_addr, len, src_addr);
 }
 
 /* Dispatch to MMU_COPY_PAGE() on the installed MMU object (mmu_obj). */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 	MMU_COPY_PAGE(mmu_obj, src, dst);
 }
 
 /* Dispatch to MMU_ENTER() on the installed MMU object (mmu_obj). */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t p, vm_prot_t prot,
     boolean_t wired)
 {
 	MMU_ENTER(mmu_obj, pmap, va, p, prot, wired);
 }
 
 /* Dispatch to MMU_ENTER_OBJECT() on the installed MMU object (mmu_obj). */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	MMU_ENTER_OBJECT(mmu_obj, pmap, start, end, m_start, prot);
 }
 
 /* Dispatch to MMU_ENTER_QUICK() on the installed MMU object (mmu_obj). */
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	MMU_ENTER_QUICK(mmu_obj, pmap, va, m, prot);
 }
 
 /* Return the result of MMU_EXTRACT() on the installed MMU object. */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	return (MMU_EXTRACT(mmu_obj, pmap, va));
 }
 
 /* Return the result of MMU_EXTRACT_AND_HOLD() on the installed MMU object. */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	return (MMU_EXTRACT_AND_HOLD(mmu_obj, pmap, va, prot));
 }
 
 /* Dispatch to MMU_GROWKERNEL() on the installed MMU object (mmu_obj). */
 void
 pmap_growkernel(vm_offset_t va)
 {
 	MMU_GROWKERNEL(mmu_obj, va);
 }
 
 /* Dispatch to MMU_INIT() on the installed MMU object (mmu_obj). */
 void
 pmap_init(void)
 {
 	MMU_INIT(mmu_obj);
 }
 
 /* Return the result of MMU_IS_MODIFIED() on the installed MMU object. */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	return (MMU_IS_MODIFIED(mmu_obj, m));
 }
 
 /* Return the result of MMU_IS_PREFAULTABLE() on the installed MMU object. */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va)
 {
 	return (MMU_IS_PREFAULTABLE(mmu_obj, pmap, va));
 }
 
 /* Return the result of MMU_TS_REFERENCED() on the installed MMU object. */
 boolean_t
 pmap_ts_referenced(vm_page_t m)
 {
 	return (MMU_TS_REFERENCED(mmu_obj, m));
 }
 
 /* Return the result of MMU_MAP() on the installed MMU object. */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	return (MMU_MAP(mmu_obj, virt, start, end, prot));
 }
 
 /* Dispatch to MMU_OBJECT_INIT_PT() on the installed MMU object (mmu_obj). */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 	MMU_OBJECT_INIT_PT(mmu_obj, pmap, addr, object, pindex, size);
 }
 
 /* Return the result of MMU_PAGE_EXISTS_QUICK() on the installed MMU object. */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	return (MMU_PAGE_EXISTS_QUICK(mmu_obj, pmap, m));
 }
 
 /* Dispatch to MMU_PAGE_INIT() on the installed MMU object (mmu_obj). */
 void
 pmap_page_init(vm_page_t m)
 {
 	MMU_PAGE_INIT(mmu_obj, m);
 }
 
-void
+int
 pmap_pinit(pmap_t pmap)
 {
 	/*
 	 * Dispatch to MMU_PINIT() on the installed MMU object.  The new
 	 * side of this hunk then returns 1 unconditionally.
 	 */
 	MMU_PINIT(mmu_obj, pmap);
+	return (1);
 }
 
 /* Dispatch to MMU_PINIT0() on the installed MMU object (mmu_obj). */
 void
 pmap_pinit0(pmap_t pmap)
 {
 	MMU_PINIT0(mmu_obj, pmap);
 }
 
 /* Dispatch to MMU_PROTECT() on the installed MMU object (mmu_obj). */
 void
 pmap_protect(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_prot_t prot)
 {
 	MMU_PROTECT(mmu_obj, pmap, start, end, prot);
 }
 
 /* Dispatch to MMU_QENTER() on the installed MMU object (mmu_obj). */
 void
 pmap_qenter(vm_offset_t start, vm_page_t *m, int count)
 {
 	MMU_QENTER(mmu_obj, start, m, count);
 }
 
 /* Dispatch to MMU_QREMOVE() on the installed MMU object (mmu_obj). */
 void
 pmap_qremove(vm_offset_t start, int count)
 {
 	MMU_QREMOVE(mmu_obj, start, count);
 }
 
 /* Dispatch to MMU_RELEASE() on the installed MMU object (mmu_obj). */
 void
 pmap_release(pmap_t pmap)
 {
 	MMU_RELEASE(mmu_obj, pmap);
 }
 
 /* Dispatch to MMU_REMOVE() on the installed MMU object (mmu_obj). */
 void
 pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end)
 {
 	MMU_REMOVE(mmu_obj, pmap, start, end);
 }
 
 /* Dispatch to MMU_REMOVE_ALL() on the installed MMU object (mmu_obj). */
 void
 pmap_remove_all(vm_page_t m)
 {
 	MMU_REMOVE_ALL(mmu_obj, m);
 }
 
 /* Dispatch to MMU_REMOVE_PAGES() on the installed MMU object (mmu_obj). */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	MMU_REMOVE_PAGES(mmu_obj, pmap);
 }
 
 /* Dispatch to MMU_REMOVE_WRITE() on the installed MMU object (mmu_obj). */
 void
 pmap_remove_write(vm_page_t m)
 {
 	MMU_REMOVE_WRITE(mmu_obj, m);
 }
 
 /* Dispatch to MMU_ZERO_PAGE() on the installed MMU object (mmu_obj). */
 void
 pmap_zero_page(vm_page_t m)
 {
 	MMU_ZERO_PAGE(mmu_obj, m);
 }
 
 /* Dispatch to MMU_ZERO_PAGE_AREA() on the installed MMU object (mmu_obj). */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	MMU_ZERO_PAGE_AREA(mmu_obj, m, off, size);
 }
 
 /* Dispatch to MMU_ZERO_PAGE_IDLE() on the installed MMU object (mmu_obj). */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 	MMU_ZERO_PAGE_IDLE(mmu_obj, m);
 }
 
 /* Return the result of MMU_MINCORE() on the installed MMU object. */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	return (MMU_MINCORE(mmu_obj, pmap, addr));
 }
 
 /* Dispatch to MMU_ACTIVATE() on the installed MMU object (mmu_obj). */
 void
 pmap_activate(struct thread *td)
 {
 	MMU_ACTIVATE(mmu_obj, td);
 }
 
 /* Dispatch to MMU_DEACTIVATE() on the installed MMU object (mmu_obj). */
 void
 pmap_deactivate(struct thread *td)
 {
 	MMU_DEACTIVATE(mmu_obj, td);
 }
 
 /* Return the result of MMU_ADDR_HINT() on the installed MMU object. */
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {
 	return (MMU_ADDR_HINT(mmu_obj, obj, addr, size));
 }
 
 
 
 /*
  * Routines used in machine-dependent code
  */
 /*
  * Bring up the MMU dispatch object and bootstrap the selected MMU.
  *
  * Points mmu_obj at the static mmu_kernel_obj, compiles the class chosen
  * by pmap_mmu_install() (mmu_def_impl) into mmu_kernel_kops, initialises
  * the object with that class, and then calls MMU_BOOTSTRAP() with the
  * start and end arguments unchanged.
  */
 void
 pmap_bootstrap(vm_offset_t start, vm_offset_t end)
 {
 	mmu_obj = &mmu_kernel_obj;
 
 	/*
 	 * Take care of compiling the selected class, and
 	 * then statically initialise the MMU object
 	 */
 	kobj_class_compile_static(mmu_def_impl, &mmu_kernel_kops);
 	kobj_init((kobj_t)mmu_obj, mmu_def_impl);
 
 	MMU_BOOTSTRAP(mmu_obj, start, end);
 }
 
 /* Return the result of MMU_MAPDEV() on the installed MMU object. */
 void *
 pmap_mapdev(vm_offset_t pa, vm_size_t size)
 {
 	return (MMU_MAPDEV(mmu_obj, pa, size));
 }
 
 /* Dispatch to MMU_UNMAPDEV() on the installed MMU object (mmu_obj). */
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	MMU_UNMAPDEV(mmu_obj, va, size);
 }
 
 /* Return the result of MMU_KEXTRACT() on the installed MMU object. */
 vm_offset_t
 pmap_kextract(vm_offset_t va)
 {
 	return (MMU_KEXTRACT(mmu_obj, va));
 }
 
 /* Dispatch to MMU_KENTER() on the installed MMU object (mmu_obj). */
 void
 pmap_kenter(vm_offset_t va, vm_offset_t pa)
 {
 	MMU_KENTER(mmu_obj, va, pa);
 }
 
 /* Return the result of MMU_DEV_DIRECT_MAPPED() on the installed MMU object. */
 boolean_t
 pmap_dev_direct_mapped(vm_offset_t pa, vm_size_t size)
 {
 	return (MMU_DEV_DIRECT_MAPPED(mmu_obj, pa, size));
 }
 
 /* Return the result of MMU_PAGE_EXECUTABLE() on the installed MMU object. */
 boolean_t
 pmap_page_executable(vm_page_t pg)
 {
 	return (MMU_PAGE_EXECUTABLE(mmu_obj, pg));
 }
 
 /*
  * MMU install routines. Highest priority wins, equal priority also
  * overrides allowing last-set to win.
  */
 SET_DECLARE(mmu_set, mmu_def_t);
 
 /*
  * Select the MMU implementation called name at priority prio.
  *
  * Scans mmu_set for an entry whose name matches.  The first match whose
  * prio is at least the priority of the current selection becomes the new
  * default (mmu_def_impl) and TRUE is returned; otherwise FALSE.
  */
 boolean_t
 pmap_mmu_install(char *name, int prio)
 {
 	static int	curr_prio = 0;
 	mmu_def_t	**mmupp, *mmup;
 
 	SET_FOREACH(mmupp, mmu_set) {
 		mmup = *mmupp;
 
 		/* Skip unnamed entries and entries with a different name. */
 		if (mmup->name == NULL)
 			continue;
 		if (strcmp(mmup->name, name) != 0)
 			continue;
 
 		/* A lower priority never displaces the current choice. */
 		if (prio < curr_prio)
 			continue;
 
 		curr_prio = prio;
 		mmu_def_impl = mmup;
 		return (TRUE);
 	}
 
 	return (FALSE);
 }
Index: head/sys/sparc64/sparc64/machdep.c
===================================================================
--- head/sys/sparc64/sparc64/machdep.c	(revision 173360)
+++ head/sys/sparc64/sparc64/machdep.c	(revision 173361)
@@ -1,912 +1,912 @@
 /*-
  * Copyright (c) 2001 Jake Burkholder.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  * 	from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/cons.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/interrupt.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/timetc.h>
 #include <sys/ucontext.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #include <ddb/ddb.h>
 
 #include <machine/bus.h>
 #include <machine/cache.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/fp.h>
 #include <machine/fsr.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/ofw_machdep.h>
 #include <machine/ofw_mem.h>
 #include <machine/pcb.h>
 #include <machine/pmap.h>
 #include <machine/pstate.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/smp.h>
 #include <machine/tick.h>
 #include <machine/tlb.h>
 #include <machine/tstate.h>
 #include <machine/upa.h>
 #include <machine/ver.h>
 
 /* Type of the Open Firmware entry vector handed to sparc64_init(). */
 typedef int ofw_vec_t(void *);
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /* Filled in by sparc64_init() from the loader's DTLB metadata. */
 struct tlb_entry *kernel_tlbs;
 int kernel_tlb_slots;
 
 int cold = 1;
 long Maxmem;
 /* Set by cpu_startup() to the summed memory-region size in pages. */
 long realmem;
 
 /*
  * Storage for the boot CPU's per-CPU area; sparc64_init() places the
  * struct pcpu at the very end of it.
  */
 char pcpu0[PCPU_PAGES * PAGE_SIZE];
 /* Trapframe installed as thread0.td_frame by sparc64_init(). */
 struct trapframe frame0;
 
 vm_offset_t kstack0;
 vm_paddr_t kstack0_phys;
 
 struct kva_md_info kmi;
 
 /* Recorded by set_openfirm_callback(). */
 u_long ofw_vec;
 u_long ofw_tba;
 
 /*
  * Note: timer quality for CPU's is set low to try and prevent them from
  * being chosen as the primary timecounter.  The CPU counters are not
  * synchronized among the CPU's so in MP machines this causes problems
  * when calculating the time.  With this value the CPU's should only be
  * chosen as the primary timecounter as a last resort.
  */
 
 #define	UP_TICK_QUALITY	1000
 #define	MP_TICK_QUALITY	-100
 static struct timecounter tick_tc;
 
 /* Root node "name" property, read in sparc64_init(). */
 char sparc64_model[32];
 
 /* Overridable through the "machdep.use_vis" environment variable. */
 static int cpu_use_vis = 1;
 
 /* Block copy/zero routines chosen in sparc64_init(). */
 cpu_block_copy_t *cpu_block_copy;
 cpu_block_zero_t *cpu_block_zero;
 
 static timecounter_get_t tick_get_timecount;
 void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3,
 		  ofw_vec_t *vec);
 void sparc64_shutdown_final(void *dummy, int howto);
 
 static void cpu_startup(void *);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* Compile-time checks of type sizes and structure layout. */
 CTASSERT((1 << INT_SHIFT) == sizeof(int));
 CTASSERT((1 << PTR_SHIFT) == sizeof(char *));
 
 CTASSERT(sizeof(struct reg) == 256);
 CTASSERT(sizeof(struct fpreg) == 272);
 CTASSERT(sizeof(struct __mcontext) == 512);
 
 /* The pcb and its FP save areas must be 64-byte aligned/sized. */
 CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0);
 CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0);
 CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0);
 CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8));
 
 CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2));
 
 /*
  * SYSINIT hook (SI_SUB_CPU, SI_ORDER_FIRST).  Registers the "tick"
  * timecounter, reports real and available memory, initializes the
  * kernel submaps and buffer subsystem, registers the final shutdown
  * handler and identifies the CPU.  The argument is unused.
  */
 static void
 cpu_startup(void *arg)
 {
 	vm_paddr_t physsz;
 	int i;
 
 	tick_tc.tc_get_timecount = tick_get_timecount;
 	tick_tc.tc_poll_pps = NULL;
 	tick_tc.tc_counter_mask = ~0u;
 	tick_tc.tc_frequency = tick_freq;
 	tick_tc.tc_name = "tick";
 	tick_tc.tc_quality = UP_TICK_QUALITY;
 #ifdef SMP
 	/*
 	 * We do not know if each CPU's tick counter is synchronized.
 	 */
 	if (cpu_mp_probe())
 		tick_tc.tc_quality = MP_TICK_QUALITY;
 #endif
 
 	tc_init(&tick_tc);
 
 	/* Sum the sizes of all recorded memory regions. */
 	physsz = 0;
 	for (i = 0; i < sparc64_nmemreg; i++)
 		physsz += sparc64_memreg[i].mr_size;
 	printf("real memory  = %lu (%lu MB)\n", physsz,
 	    physsz / (1024 * 1024));
 	realmem = (long)physsz / PAGE_SIZE;
 
 	vm_ksubmap_init(&kmi);
 
 	bufinit();
 	vm_pager_bufferinit();
 
 	EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL,
 	    SHUTDOWN_PRI_LAST);
 
 	printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE,
 	    cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE));
 
 	if (bootverbose)
 		printf("machine: %s\n", sparc64_model);
 
 	cpu_identify(rdpr(ver), tick_freq, PCPU_GET(cpuid));
 }
 
 /*
  * Machine-dependent per-CPU setup: point the interrupt request tail at
  * the (empty) head and push each of the first IR_FREE entries of the
  * request pool onto the free list.  `cpuid' and `size' are not used.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 	struct intr_request *slot, *pool_end;
 
 	pcpu->pc_irtail = &pcpu->pc_irhead;
 
 	/* Each slot is linked in front of the previous free-list head. */
 	pool_end = &pcpu->pc_irpool[IR_FREE];
 	for (slot = &pcpu->pc_irpool[0]; slot < pool_end; slot++) {
 		slot->ir_next = pcpu->pc_irfree;
 		pcpu->pc_irfree = slot;
 	}
 }
 
 /*
  * Enter a spin-lock section for the current thread.  On the outermost
  * entry (nesting count zero) the current %pil value is read, %pil is
  * rewritten via wrpr(pil, 0, PIL_TICK) and the old value is stashed in
  * the thread's machine-dependent data.  Every entry bumps the nesting
  * count and enters a critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *td;
 	register_t pil;
 
 	td = curthread;
 	if (td->td_md.md_spinlock_count == 0) {
 		pil = rdpr(pil);
 		wrpr(pil, 0, PIL_TICK);
 		/* Saved only after %pil has been changed. */
 		td->td_md.md_saved_pil = pil;
 	}
 	td->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spin-lock section: exit the critical section, drop the
  * nesting count and, when the outermost level is left, write the %pil
  * value saved by spinlock_enter() back to the register.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *td;
 
 	td = curthread;
 	critical_exit();
 	td->td_md.md_spinlock_count--;
 	if (td->td_md.md_spinlock_count == 0)
 		wrpr(pil, td->td_md.md_saved_pil, 0);
 }
 
 unsigned
 tick_get_timecount(struct timecounter *tc)
 {
 	return ((unsigned)rd(tick));
 }
 
 /*
  * Early machine-dependent initialization for the boot CPU.
  *
  * mdp is the loader metadata pointer (may be NULL, in which case the
  * function panics once the console is up), vec is the Open Firmware
  * entry vector; o1, o2 and o3 are not used here.  In order: identifies
  * the CPU, initializes Open Firmware, parses loader metadata, sets up
  * the tick counter and console, the caches and block copy routines,
  * the pmap, tunables, interrupt tables, proc0/thread0, the per-CPU
  * area, the message buffer, mutexes and finally the kernel debugger.
  */
 void
 sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
 {
 	phandle_t child;
 	phandle_t root;
 	struct pcpu *pc;
 	vm_offset_t end;
 	caddr_t kmdp;
 	u_int clock;
 	char *env;
 	char type[8];
 
 	end = 0;
 	kmdp = NULL;
 
 	/*
 	 * Find out what kind of cpu we have first, for anything that changes
 	 * behaviour.
 	 */
 	cpu_impl = VER_IMPL(rdpr(ver));
 
 	/*
 	 * Initialize Open Firmware (needed for console).
 	 */
 	OF_init(vec);
 
 	/*
 	 * Parse metadata if present and fetch parameters.  Must be before the
 	 * console is inited so cninit gets the right value of boothowto.
 	 */
 	if (mdp != NULL) {
 		preload_metadata = mdp;
 		kmdp = preload_search_by_type("elf kernel");
 		if (kmdp != NULL) {
 			boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 			kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 			end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
 			kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS,
 			    int);
 			kernel_tlbs = (void *)preload_search_info(kmdp,
 			    MODINFO_METADATA | MODINFOMD_DTLB);
 		}
 	}
 
 	init_param1();
 
 	/*
 	 * Locate the first child of the root node whose device_type is
 	 * "cpu".
 	 *
 	 * NOTE(review): the OF_getprop() result is not checked, so `type'
 	 * is compared uninitialized (or stale) when a node lacks the
 	 * property; and if no cpu node exists the loop ends with child == 0
 	 * which is then used below -- confirm whether firmware guarantees
 	 * both.
 	 */
 	root = OF_peer(0);
 	for (child = OF_child(root); child != 0; child = OF_peer(child)) {
 		OF_getprop(child, "device_type", type, sizeof(type));
 		if (strcmp(type, "cpu") == 0)
 			break;
 	}
 
 	/*
 	 * Initialize the tick counter.  Must be before the console is inited
 	 * in order to provide the low-level console drivers with a working
 	 * DELAY().
 	 */
 	/* NOTE(review): `clock' stays uninitialized if this lookup fails. */
 	OF_getprop(child, "clock-frequency", &clock, sizeof(clock));
 	tick_init(clock);
 
 	/*
 	 * Initialize the console before printing anything.
 	 */
 	cninit();
 
 	/*
 	 * Panic if there is no metadata.  Most likely the kernel was booted
 	 * directly, instead of through loader(8).
 	 */
 	if (mdp == NULL || kmdp == NULL) {
 		printf("sparc64_init: no loader metadata.\n"
 		       "This probably means you are not using loader(8).\n");
 		panic("sparc64_init");
 	}
 
 	/*
 	 * Sanity check the kernel end, which is important.
 	 */
 	if (end == 0) {
 		printf("sparc64_init: warning, kernel end not specified.\n"
 		       "Attempting to continue anyway.\n");
 		end = (vm_offset_t)_end;
 	}
 
 	cache_init(child);
 	uma_set_align(cache.dc_linesize - 1);
 
 	/*
 	 * Default to the generic routines; switch to the spitfire block
 	 * routines on the listed CPU implementations unless disabled
 	 * through machdep.use_vis.
 	 */
 	cpu_block_copy = bcopy;
 	cpu_block_zero = bzero;
 	getenv_int("machdep.use_vis", &cpu_use_vis);
 	if (cpu_use_vis) {
 		switch (cpu_impl) {
 		case CPU_IMPL_SPARC64:
 		case CPU_IMPL_ULTRASPARCI:
 		case CPU_IMPL_ULTRASPARCII:
 		case CPU_IMPL_ULTRASPARCIIi:
 		case CPU_IMPL_ULTRASPARCIIe:
 			cpu_block_copy = spitfire_block_copy;
 			cpu_block_zero = spitfire_block_zero;
 			break;
 		}
 	}
 
 #ifdef SMP
 	mp_init();
 #endif
 
 	/*
 	 * Initialize virtual memory and calculate physmem.
 	 */
 	pmap_bootstrap(end);
 
 	/*
 	 * Initialize tunables.
 	 */
 	init_param2(physmem);
 	env = getenv("kernelname");
 	if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	/*
 	 * Initialize the interrupt tables.
 	 */
 	intr_init1();
 
 	/*
 	 * Initialize proc0 stuff (p_contested needs to be done early).
 	 */
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	proc0.p_md.md_sigtramp = NULL;
 	proc0.p_md.md_utrap = NULL;
 	thread0.td_kstack = kstack0;
 	/* The pcb occupies the last struct pcb slot of the kernel stack. */
 	thread0.td_pcb = (struct pcb *)
 	    (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV;
 	thread0.td_frame = &frame0;
 
 	/*
 	 * Prime our per-cpu data page for use.  Note, we are using it for our
 	 * stack, so don't pass the real size (PAGE_SIZE) to pcpu_init or
 	 * it'll zero it out from under us.
 	 */
 	pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	pc->pc_curthread = &thread0;
 	pc->pc_curpcb = thread0.td_pcb;
 	pc->pc_mid = UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG));
 	pc->pc_addr = (vm_offset_t)pcpu0;
 	pc->pc_node = child;
 	pc->pc_tlb_ctx = TLB_CTX_USER_MIN;
 	pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN;
 	pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX;
 
 	/*
 	 * Initialize global registers.
 	 */
 	cpu_setregs(pc);
 
 	/*
 	 * Initialize the message buffer (after setting trap table).
 	 */
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 	mutex_init();
 	intr_init2();
 
 	/*
 	 * Finish pmap initialization now that we're ready for mutexes.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	/*
 	 * Read at most sizeof - 1 bytes so the statically zeroed last byte
 	 * keeps the model string terminated.
 	 */
 	OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1);
 
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 }
 
 /*
  * Record the Open Firmware entry vector in ofw_vec together with the
  * current value of the trap base address register in ofw_tba.
  */
 void
 set_openfirm_callback(ofw_vec_t *vec)
 {
 	ofw_tba = rdpr(tba);
 	ofw_vec = (u_long)vec;
 }
 
 /*
  * Deliver a signal to the current thread: build a struct sigframe
  * holding the saved user context, copy it to the user stack (or the
  * alternate signal stack) and redirect the trapframe to the process's
  * signal trampoline with the handler arguments in the out registers.
  *
  * Called with the process lock and the sigacts mutex held; both are
  * dropped around the copyout and re-acquired before returning.  If the
  * process has no trampoline, or the frame cannot be written, the
  * process is killed via sigexit().
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct trapframe *tf;
 	struct sigframe *sfp;
 	struct sigacts *psp;
 	struct sigframe sf;
 	struct thread *td;
 	struct frame *fp;
 	struct proc *p;
 	int oonstack;
 	u_long sp;
 	int sig;
 
 	oonstack = 0;
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	/* The saved stack pointer is biased by SPOFF. */
 	sp = tf->tf_sp + SPOFF;
 	oonstack = sigonstack(sp);
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	    catcher, sig);
 
 	/* Make sure we have a signal trampoline to return to. */
 	if (p->p_md.md_sigtramp == NULL) {
 		/*
 		 * No signal trampoline... kill the process.
 		 */
 		CTR0(KTR_SIG, "sendsig: no sigtramp");
 		printf("sendsig: %s is too old, rebuild it\n", p->p_comm);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 	/* Allocate and validate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Frame goes at the top of the alternate signal stack. */
 		sfp = (struct sigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe));
 	} else
 		sfp = (struct sigframe *)sp - 1;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* Leave room for one register-window frame below the sigframe. */
 	fp = (struct frame *)sfp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	tf->tf_out[0] = sig;
 	tf->tf_out[2] = (register_t)&sfp->sf_uc;
 	tf->tf_out[4] = (register_t)catcher;
 	/*
 	 * NOTE(review): ps_siginfo is examined here after ps_mtx was
 	 * released and with the possibly translated signal number --
 	 * confirm this is intended.
 	 */
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		tf->tf_out[1] = (register_t)&sfp->sf_si;
 
 		/* Fill in POSIX parts. */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		tf->tf_out[1] = ksi->ksi_code;
 		tf->tf_out[3] = (register_t)ksi->ksi_addr;
 	}
 
 	/* Copy the sigframe out to the user's stack. */
 	if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    suword(&fp->fr_in[6], tf->tf_out[6]) != 0) {
 		/*
 		 * Something is wrong with the stack pointer.
 		 * ...Kill the process.
 		 */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 		/* NOTREACHED */
 	}
 
 	/* Resume in the trampoline, on the newly built frame. */
 	tf->tf_tpc = (u_long)p->p_md.md_sigtramp;
 	tf->tf_tnpc = tf->tf_tpc + 4;
 	tf->tf_sp = (u_long)fp - SPOFF;
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc,
 	    tf->tf_sp);
 
 	/* Restore the lock state the caller expects. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 #ifndef	_SYS_SYSPROTO_H_
 /*
  * Fallback declaration of the sigreturn() argument structure, used only
  * when <sys/sysproto.h> has not been included.  The member is named
  * sigcntxp because that is the name sigreturn() reads (uap->sigcntxp);
  * it is const-qualified because the context is only copied in.
  */
 struct sigreturn_args {
 	const ucontext_t *sigcntxp;
 };
 #endif
 
 /*
  * MPSAFE
  *
  * sigreturn system call: restore the user context saved at signal
  * delivery.  Copies the ucontext in from uap->sigcntxp, installs its
  * machine context with set_mcontext() and restores the signal mask.
  *
  * Returns EFAULT if the context cannot be copied in, the error from
  * set_mcontext() if the context is rejected, and EJUSTRETURN on
  * success.  If the register windows cannot be saved the process is
  * killed with SIGILL.
  */
 int
 sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	struct proc *p;
 	mcontext_t *mc;
 	ucontext_t uc;
 	int error;
 
 	p = td->td_proc;
 	if (rwindow_save(td)) {
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 	if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	mc = &uc.uc_mcontext;
 	error = set_mcontext(td, mc);
 	if (error != 0)
 		return (error);
 
 	/* Restore the signal mask under the process lock. */
 	PROC_LOCK(p);
 	td->td_sigmask = uc.uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx",
 	    td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: reinterprets its argument
  * structure as a struct sigreturn_args and defers to sigreturn().
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 	struct sigreturn_args *args;
 
 	args = (struct sigreturn_args *)uap;
 	return (sigreturn(td, args));
 }
 #endif
 
 /*
  * Fill in a PCB from a trapframe so that a stack trace can be started
  * from the trapped context (kdb_trap() uses this when entering the
  * debugger).  Only the stack pointer and program counter are copied;
  * that is all a backtrace needs, the rest of the PCB is left alone.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_sp = tf->tf_sp;
 	pcb->pcb_pc = tf->tf_tpc;
 }
 
 /*
  * Capture the thread's user machine context into *mc.  With
  * GET_MC_CLEAR_RET set in flags the first two out registers are
  * zeroed in the copy.  Floating point state is included when the
  * thread has any.  Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mc, int flags)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 	/*
 	 * Copies sizeof(*tf) bytes -- presumably mcontext_t starts with
 	 * the same layout as struct trapframe; confirm against the type
 	 * definitions.
 	 */
 	bcopy(tf, mc, sizeof(*tf));
 	if (flags & GET_MC_CLEAR_RET) {
 		mc->mc_out[0] = 0;
 		mc->mc_out[1] = 0;
 	}
 	mc->mc_flags = _MC_VERSION;
 	critical_enter();
 	/* FP enabled in the trapframe: save it to the pcb and disable. */
 	if ((tf->tf_fprs & FPRS_FEF) != 0) {
 		savefpctx(pcb->pcb_ufp);
 		tf->tf_fprs &= ~FPRS_FEF;
 		pcb->pcb_flags |= PCB_FEF;
 	}
 	/* The pcb holds saved FP state: export it with the context. */
 	if ((pcb->pcb_flags & PCB_FEF) != 0) {
 		bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp));
 		mc->mc_fprs |= FPRS_FEF;
 	}
 	critical_exit();
 	return (0);
 }
 
 /*
  * Install the machine context *mc into the thread's trapframe.
  * Returns EINVAL when the tstate value fails TSTATE_SECURE() or the
  * version bits of mc_flags are not _MC_VERSION; otherwise 0.  The
  * trapframe's wstate is preserved across the copy.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mc)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 	uint64_t wstate;
 
 	if (!TSTATE_SECURE(mc->mc_tstate) ||
 	    (mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION)
 		return (EINVAL);
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 	/* Make sure the windows are spilled first. */
 	flushw();
 	/* Keep the kernel's wstate; the caller may not replace it. */
 	wstate = tf->tf_wstate;
 	bcopy(mc, tf, sizeof(*tf));
 	tf->tf_wstate = wstate;
 	if ((mc->mc_fprs & FPRS_FEF) != 0) {
 		/* Stash the supplied FP registers in the pcb. */
 		tf->tf_fprs = 0;
 		bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
 		pcb->pcb_flags |= PCB_FEF;
 	}
 	return (0);
 }
 
 /*
  * Exit the kernel and execute a firmware call that will not return, as
  * specified by the arguments.
  *
  * On SMP kernels cpu_mp_shutdown() is called first; `args' is then
  * passed unchanged to openfirmware_exit().
  */
 void
 cpu_shutdown(void *args)
 {
 
 #ifdef SMP
 	cpu_mp_shutdown();
 #endif
 	openfirmware_exit(args);
 }
 
 /*
  * Report the current clock frequency of the given cpu id.  No estimate
  * is implemented here: the call always fails with ENXIO and neither
  * argument is consulted, so *rate is left untouched.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 	int error;
 
 	error = ENXIO;
 	return (error);
 }
 
 /*
  * Return to the firmware by issuing its "exit" service.  This mirrors
  * OF_exit() but goes through cpu_shutdown(), whose firmware call path
  * restores the trap table first; without that at least some firmware
  * versions raise a RED state exception.
  */
 void
 cpu_halt(void)
 {
 	/* Firmware call descriptor: service name, no args, no returns. */
 	static struct {
 		cell_t name;
 		cell_t nargs;
 		cell_t nreturns;
 	} exit_call = {
 		(cell_t)"exit",
 		0,
 		0
 	};
 
 	cpu_shutdown(&exit_call);
 }
 
 /*
  * shutdown_final event handler (registered by cpu_startup()).  A
  * power-off request is tried first through the "SUNW,power-off"
  * firmware service; a halt request returns to the firmware through
  * cpu_halt().  `dummy' is not used.
  */
 void
 sparc64_shutdown_final(void *dummy, int howto)
 {
 	/* Firmware call descriptor: service name, no args, no returns. */
 	static struct {
 		cell_t name;
 		cell_t nargs;
 		cell_t nreturns;
 	} poweroff_call = {
 		(cell_t)"SUNW,power-off",
 		0,
 		0
 	};
 
 	/* Power-off requested? */
 	if (howto & RB_POWEROFF)
 		cpu_shutdown(&poweroff_call);
 
 	/* Halt requested: drop back into the firmware. */
 	if (howto & RB_HALT)
 		cpu_halt();
 }
 
 /*
  * Idle hook.  Currently a no-op: nothing is done to halt the CPU, so
  * the function returns immediately.
  */
 void
 cpu_idle(void)
 {
 	/* Insert code to halt (until next interrupt) for the idle loop */
 }
 
 /*
  * Point the thread's saved program counter at `addr' and its saved
  * next program counter at the following instruction word (addr + 4).
  * Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *tf;
 
 	tf = td->td_frame;
 	tf->tf_tpc = addr;
 	tf->tf_tnpc = addr + 4;
 	return (0);
 }
 
 /*
  * Enable single-stepping for the thread.  Not implemented (TODO): the
  * thread is left untouched and success (0) is reported.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 
 	(void)td;	/* TODO: unused until this is implemented */
 	return (0);
 }
 
 /*
  * Disable single-stepping for the thread.  Not implemented (TODO): the
  * thread is left untouched and success (0) is reported.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 
 	(void)td;	/* TODO: unused until this is implemented */
 	return (0);
 }
 
 /*
  * Reset the thread's register state for a freshly exec'ed image:
  * drops the process's signal trampoline and user trap table, zeroes
  * the pcb and trapframe, and sets up the entry point, stack pointer
  * and initial arguments.
  *
  * NOTE(review): the ps_strings parameter is not used; %o3 is loaded
  * from p->p_sysent->sv_psstrings instead.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 	struct proc *p;
 	u_long sp;
 
 	/* XXX no cpu_exec */
 	p = td->td_proc;
 	p->p_md.md_sigtramp = NULL;
 	if (p->p_md.md_utrap != NULL) {
 		utrap_free(p->p_md.md_utrap);
 		p->p_md.md_utrap = NULL;
 	}
 
 	pcb = td->td_pcb;
 	tf = td->td_frame;
 	/* Align the initial stack pointer down to 16 bytes. */
 	sp = rounddown(stack, 16);
 	bzero(pcb, sizeof(*pcb));
 	bzero(tf, sizeof(*tf));
 	tf->tf_out[0] = stack;
 	tf->tf_out[3] = p->p_sysent->sv_psstrings;
 	/* Biased stack pointer with room for one initial frame. */
 	tf->tf_out[6] = sp - SPOFF - sizeof(struct frame);
 	tf->tf_tnpc = entry + 4;
 	tf->tf_tpc = entry;
 	tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO;
 
 	td->td_retval[0] = tf->tf_out[0];
 	td->td_retval[1] = tf->tf_out[1];
 }
 
 /*
  * Export the thread's general registers: copies sizeof(struct reg)
  * bytes from the start of its trapframe into *regs.  Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tf;
 
 	tf = td->td_frame;
 	bcopy(tf, regs, sizeof(struct reg));
 	return (0);
 }
 
 /*
  * Install the general registers in *regs into the thread's trapframe.
  * Returns EINVAL if r_tstate fails TSTATE_SECURE(), otherwise 0.
  *
  * Note that *regs is modified: its r_wstate is overwritten with the
  * trapframe's current wstate so that the copy cannot change it.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *tf;
 
 	if (!TSTATE_SECURE(regs->r_tstate))
 		return (EINVAL);
 	tf = td->td_frame;
 	regs->r_wstate = tf->tf_wstate;
 	bcopy(regs, tf, sizeof(*regs));
 	return (0);
 }
 
 /*
  * Export the thread's debug registers.  Not supported here: always
  * fails with ENOSYS without touching either argument.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	int error;
 
 	error = ENOSYS;
 	return (error);
 }
 
 /*
  * Install debug registers into the thread.  Not supported here: always
  * fails with ENOSYS without touching either argument.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 	int error;
 
 	error = ENOSYS;
 	return (error);
 }
 
 /*
  * Export the thread's floating point state: the register file comes
  * from the user FP save area in the pcb, fsr and gsr from the
  * trapframe.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *frame;
 	struct pcb *save;
 
 	frame = td->td_frame;
 	save = td->td_pcb;
 
 	fpregs->fr_gsr = frame->tf_gsr;
 	fpregs->fr_fsr = frame->tf_fsr;
 	bcopy(save->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs));
 	return (0);
 }
 
 /*
  * Install floating point state into the thread: the register file goes
  * to the user FP save area in the pcb, fsr and gsr to the trapframe.
  * Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 	tf = td->td_frame;
 	/* Clear the FP-enable bit before replacing the saved registers. */
 	tf->tf_fprs &= ~FPRS_FEF;
 	bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
 	tf->tf_fsr = fpregs->fr_fsr;
 	tf->tf_gsr = fpregs->fr_gsr;
 	return (0);
 }
 
 /*
  * Allocate a zero-filled user trap table holding a single reference.
  * The allocation uses M_WAITOK.
  */
 struct md_utrap *
 utrap_alloc(void)
 {
 	struct md_utrap *table;
 
 	table = malloc(sizeof(*table), M_SUBPROC, M_ZERO | M_WAITOK);
 	table->ut_refcnt = 1;
 	return (table);
 }
 
 /*
  * Drop one reference on a user trap table and free the table when the
  * count reaches zero.  A NULL table is ignored.  The count is updated
  * under the pool mutex keyed on the table; the free happens after the
  * mutex has been released.
  */
 void
 utrap_free(struct md_utrap *ut)
 {
 	int remaining;
 
 	if (ut != NULL) {
 		mtx_pool_lock(mtxpool_sleep, ut);
 		remaining = --ut->ut_refcnt;
 		mtx_pool_unlock(mtxpool_sleep, ut);
 
 		if (remaining == 0)
 			free(ut, M_SUBPROC);
 	}
 }
 
 /*
  * Take an additional reference on a user trap table and return the
  * table.  A NULL table is passed through as NULL.  The count is bumped
  * under the pool mutex keyed on the table.
  */
 struct md_utrap *
 utrap_hold(struct md_utrap *ut)
 {
 
 	if (ut != NULL) {
 		mtx_pool_lock(mtxpool_sleep, ut);
 		ut->ut_refcnt += 1;
 		mtx_pool_unlock(mtxpool_sleep, ut);
 	}
 	return (ut);
 }
Index: head/sys/sparc64/sparc64/pmap.c
===================================================================
--- head/sys/sparc64/sparc64/pmap.c	(revision 173360)
+++ head/sys/sparc64/sparc64/pmap.c	(revision 173361)
@@ -1,1955 +1,1960 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by the University of
  *      California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  * $FreeBSD$
  */
 
 /*
  * Manages physical address maps.
  *
  * In addition to hardware address maps, this module is called upon to
  * provide software-use-only maps which may or may not be stored in the
  * same form as hardware maps.  These pseudo-maps are used to store
  * intermediate results from copy operations to and from address spaces.
  *
  * Since the information managed by this module is also stored by the
  * logical address mapping module, this module may throw away valid virtual
  * to physical mappings at almost any time.  However, invalidations of
  * mappings must be done as requested.
  *
  * In order to cope with hardware architectures which make virtual to
  * physical map invalidates expensive, this module may delay invalidate
  * reduced protection operations until such time as they are actually
  * necessary.  This module is given full information as to which processors
  * are currently using which maps, and to when physical maps must be made
  * correct.
  */
 
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 #include "opt_pmap.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/vmmeter.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h> 
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 
 #include <machine/cache.h>
 #include <machine/frame.h>
 #include <machine/instr.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/ofw_mem.h>
 #include <machine/smp.h>
 #include <machine/tlb.h>
 #include <machine/tte.h>
 #include <machine/tsb.h>
 
 #define	PMAP_DEBUG
 
 #ifndef	PMAP_SHPGPERPROC
 #define	PMAP_SHPGPERPROC	200
 #endif
 
 /* XXX */
 #include "opt_sched.h"
 #ifndef SCHED_4BSD
 #error "sparc64 only works with SCHED_4BSD which uses a global scheduler lock."
 #endif
 extern struct mtx sched_lock;
 
 /*
  * Virtual and physical address of message buffer.
  */
 struct msgbuf *msgbufp;
 vm_paddr_t msgbuf_phys;
 
 /*
  * Map of physical memory regions.
  */
 vm_paddr_t phys_avail[128];
 static struct ofw_mem_region mra[128];
 struct ofw_mem_region sparc64_memreg[128];
 int sparc64_nmemreg;
 static struct ofw_map translations[128];
 static int translations_size;
 
 static vm_offset_t pmap_idle_map;
 static vm_offset_t pmap_temp_map_1;
 static vm_offset_t pmap_temp_map_2;
 
 /*
  * First and last available kernel virtual addresses.
  */
 vm_offset_t virtual_avail;
 vm_offset_t virtual_end;
 vm_offset_t kernel_vm_end;
 
 vm_offset_t vm_max_kernel_address;
 
 /*
  * Kernel pmap.
  */
 struct pmap kernel_pmap_store;
 
 /*
  * Allocate physical memory for use in pmap_bootstrap.
  */
 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size);
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  *
  * The page queues and pmap must be locked.
  */
 static void pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, boolean_t wired);
 
 /*
  * Instruction locations defined outside this file; pmap_bootstrap
  * patches them (see its PATCH() macro).
  */
 extern int tl1_immu_miss_patch_1[];
 extern int tl1_immu_miss_patch_2[];
 extern int tl1_dmmu_miss_patch_1[];
 extern int tl1_dmmu_miss_patch_2[];
 extern int tl1_dmmu_prot_patch_1[];
 extern int tl1_dmmu_prot_patch_2[];
 
 /*
  * If a user pmap is processed with pmap_remove and the resident count
  * drops to 0, there are no more pages to remove, so we need not
  * continue.  This never applies to the kernel pmap.
  */
 #define	PMAP_REMOVE_DONE(pm) \
 	((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
 
 /*
  * The threshold (in bytes) above which tsb_foreach() is used in pmap_remove()
  * and pmap_protect() instead of trying each virtual address.
  */
 #define	PMAP_TSB_THRESH	((TSB_SIZE / 2) * PAGE_SIZE)
 
 /*
  * debug.pmap_stats sysctl node, followed by the pmap statistics
  * variables.  PMAP_STATS_VAR is defined elsewhere -- presumably it
  * declares one counter per name under this node; confirm against its
  * definition.
  */
 SYSCTL_NODE(_debug, OID_AUTO, pmap_stats, CTLFLAG_RD, 0, "");
 
 PMAP_STATS_VAR(pmap_nenter);
 PMAP_STATS_VAR(pmap_nenter_update);
 PMAP_STATS_VAR(pmap_nenter_replace);
 PMAP_STATS_VAR(pmap_nenter_new);
 PMAP_STATS_VAR(pmap_nkenter);
 PMAP_STATS_VAR(pmap_nkenter_oc);
 PMAP_STATS_VAR(pmap_nkenter_stupid);
 PMAP_STATS_VAR(pmap_nkremove);
 PMAP_STATS_VAR(pmap_nqenter);
 PMAP_STATS_VAR(pmap_nqremove);
 PMAP_STATS_VAR(pmap_ncache_enter);
 PMAP_STATS_VAR(pmap_ncache_enter_c);
 PMAP_STATS_VAR(pmap_ncache_enter_oc);
 PMAP_STATS_VAR(pmap_ncache_enter_cc);
 PMAP_STATS_VAR(pmap_ncache_enter_coc);
 PMAP_STATS_VAR(pmap_ncache_enter_nc);
 PMAP_STATS_VAR(pmap_ncache_enter_cnc);
 PMAP_STATS_VAR(pmap_ncache_remove);
 PMAP_STATS_VAR(pmap_ncache_remove_c);
 PMAP_STATS_VAR(pmap_ncache_remove_oc);
 PMAP_STATS_VAR(pmap_ncache_remove_cc);
 PMAP_STATS_VAR(pmap_ncache_remove_coc);
 PMAP_STATS_VAR(pmap_ncache_remove_nc);
 PMAP_STATS_VAR(pmap_nzero_page);
 PMAP_STATS_VAR(pmap_nzero_page_c);
 PMAP_STATS_VAR(pmap_nzero_page_oc);
 PMAP_STATS_VAR(pmap_nzero_page_nc);
 PMAP_STATS_VAR(pmap_nzero_page_area);
 PMAP_STATS_VAR(pmap_nzero_page_area_c);
 PMAP_STATS_VAR(pmap_nzero_page_area_oc);
 PMAP_STATS_VAR(pmap_nzero_page_area_nc);
 PMAP_STATS_VAR(pmap_nzero_page_idle);
 PMAP_STATS_VAR(pmap_nzero_page_idle_c);
 PMAP_STATS_VAR(pmap_nzero_page_idle_oc);
 PMAP_STATS_VAR(pmap_nzero_page_idle_nc);
 PMAP_STATS_VAR(pmap_ncopy_page);
 PMAP_STATS_VAR(pmap_ncopy_page_c);
 PMAP_STATS_VAR(pmap_ncopy_page_oc);
 PMAP_STATS_VAR(pmap_ncopy_page_nc);
 PMAP_STATS_VAR(pmap_ncopy_page_dc);
 PMAP_STATS_VAR(pmap_ncopy_page_doc);
 PMAP_STATS_VAR(pmap_ncopy_page_sc);
 PMAP_STATS_VAR(pmap_ncopy_page_soc);
 
 PMAP_STATS_VAR(pmap_nnew_thread);
 PMAP_STATS_VAR(pmap_nnew_thread_oc);
 
 /*
  * Quick sort callout for comparing memory regions.
  */
 static int mr_cmp(const void *a, const void *b);
 static int om_cmp(const void *a, const void *b);
 
 /*
  * qsort() comparator ordering struct ofw_mem_region entries by
  * ascending mr_start.  Returns -1, 0 or 1.
  */
 static int
 mr_cmp(const void *a, const void *b)
 {
 	const struct ofw_mem_region *lhs;
 	const struct ofw_mem_region *rhs;
 
 	lhs = a;
 	rhs = b;
 	if (lhs->mr_start == rhs->mr_start)
 		return (0);
 	return (lhs->mr_start < rhs->mr_start ? -1 : 1);
 }
 /*
  * qsort() comparator ordering struct ofw_map entries by ascending
  * om_start.  Returns -1, 0 or 1.
  */
 static int
 om_cmp(const void *a, const void *b)
 {
 	const struct ofw_map *lhs;
 	const struct ofw_map *rhs;
 
 	lhs = a;
 	rhs = b;
 	if (lhs->om_start == rhs->om_start)
 		return (0);
 	return (lhs->om_start < rhs->om_start ? -1 : 1);
 }
 
 /*
  * Bootstrap the system enough to run with virtual memory.
  *
  * In order, this routine:
  *  - reads /memory/available from the firmware into phys_avail[] (sorted,
  *    optionally clipped by the hw.physmem tunable);
  *  - sizes, allocates, maps and zeroes the kernel tsb;
  *  - allocates the message buffer;
  *  - patches the tsb address and mask into the trap table;
  *  - enters 8k ttes for the 4MB kernel pages and for thread0's stack;
  *  - copies the firmware translations that lie in the PROM address window
  *    into the kernel tsb;
  *  - records /memory/reg in sparc64_memreg[];
  *  - initializes the statically allocated kernel pmap.
  *
  * ekva is rounded up to the next 4MB boundary and becomes the first
  * available kernel virtual address (virtual_avail).
  */
 void
 pmap_bootstrap(vm_offset_t ekva)
 {
 	struct pmap *pm;
 	struct tte *tp;
 	vm_offset_t off;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	vm_size_t physsz;
 	vm_size_t virtsz;
 	ihandle_t pmem;
 	ihandle_t vmem;
 	int sz;
 	int i;
 	int j;
 
 	/*
 	 * Find out what physical memory is available from the prom and
 	 * initialize the phys_avail array.  This must be done before
 	 * pmap_bootstrap_alloc is called.
 	 */
 	if ((pmem = OF_finddevice("/memory")) == -1)
 		panic("pmap_bootstrap: finddevice /memory");
 	if ((sz = OF_getproplen(pmem, "available")) == -1)
 		panic("pmap_bootstrap: getproplen /memory/available");
 	if (sizeof(phys_avail) < sz)
 		panic("pmap_bootstrap: phys_avail too small");
 	if (sizeof(mra) < sz)
 		panic("pmap_bootstrap: mra too small");
 	bzero(mra, sz);
 	if (OF_getprop(pmem, "available", mra, sz) == -1)
 		panic("pmap_bootstrap: getprop /memory/available");
 	/* From here on sz counts regions rather than bytes. */
 	sz /= sizeof(*mra);
 	CTR0(KTR_PMAP, "pmap_bootstrap: physical memory");
 	qsort(mra, sz, sizeof (*mra), mr_cmp);
 	physsz = 0;
 	getenv_quad("hw.physmem", &physmem);
 	physmem = btoc(physmem);
 	/*
 	 * Each region becomes a (start, end) pair in phys_avail, with end
 	 * exclusive.  If physmem is non-zero, stop once that many pages
 	 * have been accumulated, truncating the last region if needed.
 	 */
 	for (i = 0, j = 0; i < sz; i++, j += 2) {
 		CTR2(KTR_PMAP, "start=%#lx size=%#lx", mra[i].mr_start,
 		    mra[i].mr_size);
 		if (physmem != 0 && btoc(physsz + mra[i].mr_size) >= physmem) {
 			if (btoc(physsz) < physmem) {
 				phys_avail[j] = mra[i].mr_start;
 				phys_avail[j + 1] = mra[i].mr_start +
 				    (ctob(physmem) - physsz);
 				physsz = ctob(physmem);
 			}
 			break;
 		}
 		phys_avail[j] = mra[i].mr_start;
 		phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
 		physsz += mra[i].mr_size;
 	}
 	physmem = btoc(physsz);
 
 	/*
 	 * Calculate the size of kernel virtual memory, and the size and mask
 	 * for the kernel tsb.
 	 */
 	virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
 	vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
 	tsb_kernel_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
 	tsb_kernel_mask = (tsb_kernel_size >> TTE_SHIFT) - 1;
 
 	/*
 	 * Allocate the kernel tsb and lock it in the tlb.
 	 */
 	pa = pmap_bootstrap_alloc(tsb_kernel_size);
 	if (pa & PAGE_MASK_4M)
 		panic("pmap_bootstrap: tsb unaligned");
 	tsb_kernel_phys = pa;
 	/* The tsb is mapped immediately below VM_MIN_KERNEL_ADDRESS. */
 	tsb_kernel = (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size);
 	pmap_map_tsb();
 	bzero(tsb_kernel, tsb_kernel_size);
 
 	/*
 	 * Allocate and map the message buffer.
 	 */
 	msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE);
 	msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys);
 
 	/*
 	 * Patch the virtual address and the tsb mask into the trap table.
 	 */
 
 #define	SETHI(rd, imm22) \
 	(EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) | \
 	    EIF_IMM((imm22) >> 10, 22))
 #define	OR_R_I_R(rd, imm13, rs1) \
 	(EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) | \
 	    EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13))
 
 /*
  * Each patch site must still be a sethi/or/sethi triple with zero
  * immediates; the immediates are then filled in with the tsb mask
  * (addr[0], addr[1]) and the tsb address (addr[2]) and each patched
  * word is flushed.
  */
 #define	PATCH(addr) do { \
 	if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) || \
 	    addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0, IF_F3_RS1(addr[1])) || \
 	    addr[2] != SETHI(IF_F2_RD(addr[2]), 0x0)) \
 		panic("pmap_bootstrap: patched instructions have changed"); \
 	addr[0] |= EIF_IMM((tsb_kernel_mask) >> 10, 22); \
 	addr[1] |= EIF_IMM(tsb_kernel_mask, 10); \
 	addr[2] |= EIF_IMM(((vm_offset_t)tsb_kernel) >> 10, 22); \
 	flush(addr); \
 	flush(addr + 1); \
 	flush(addr + 2); \
 } while (0)
 
 	PATCH(tl1_immu_miss_patch_1);
 	PATCH(tl1_immu_miss_patch_2);
 	PATCH(tl1_dmmu_miss_patch_1);
 	PATCH(tl1_dmmu_miss_patch_2);
 	PATCH(tl1_dmmu_prot_patch_1);
 	PATCH(tl1_dmmu_prot_patch_2);
 
 	/*
 	 * Enter fake 8k pages for the 4MB kernel pages, so that
 	 * pmap_kextract() will work for them.
 	 */
 	for (i = 0; i < kernel_tlb_slots; i++) {
 		pa = kernel_tlbs[i].te_pa;
 		va = kernel_tlbs[i].te_va;
 		for (off = 0; off < PAGE_SIZE_4M; off += PAGE_SIZE) {
 			tp = tsb_kvtotte(va + off);
 			tp->tte_vpn = TV_VPN(va + off, TS_8K);
 			tp->tte_data = TD_V | TD_8K | TD_PA(pa + off) |
 			    TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W;
 		}
 	}
 
 	/*
 	 * Set the start and end of kva.  The kernel is loaded at the first
 	 * available 4 meg super page, so round up to the end of the page.
 	 */
 	virtual_avail = roundup2(ekva, PAGE_SIZE_4M);
 	virtual_end = vm_max_kernel_address;
 	kernel_vm_end = vm_max_kernel_address;
 
 	/*
 	 * Allocate kva space for temporary mappings.
 	 */
 	pmap_idle_map = virtual_avail;
 	virtual_avail += PAGE_SIZE * DCACHE_COLORS;
 	pmap_temp_map_1 = virtual_avail;
 	virtual_avail += PAGE_SIZE * DCACHE_COLORS;
 	pmap_temp_map_2 = virtual_avail;
 	virtual_avail += PAGE_SIZE * DCACHE_COLORS;
 
 	/*
 	 * Allocate a kernel stack with guard page for thread0 and map it into
 	 * the kernel tsb.  We must ensure that the virtual address is coloured
 	 * properly, since we're allocating from phys_avail so the memory won't
 	 * have an associated vm_page_t.
 	 */
 	pa = pmap_bootstrap_alloc(roundup(KSTACK_PAGES, DCACHE_COLORS) *
 	    PAGE_SIZE);
 	kstack0_phys = pa;
 	/* Skip the guard pages; no ttes are entered for them. */
 	virtual_avail += roundup(KSTACK_GUARD_PAGES, DCACHE_COLORS) *
 	    PAGE_SIZE;
 	kstack0 = virtual_avail;
 	virtual_avail += roundup(KSTACK_PAGES, DCACHE_COLORS) * PAGE_SIZE;
 	KASSERT(DCACHE_COLOR(kstack0) == DCACHE_COLOR(kstack0_phys),
 	    ("pmap_bootstrap: kstack0 miscoloured"));
 	for (i = 0; i < KSTACK_PAGES; i++) {
 		pa = kstack0_phys + i * PAGE_SIZE;
 		va = kstack0 + i * PAGE_SIZE;
 		tp = tsb_kvtotte(va);
 		tp->tte_vpn = TV_VPN(va, TS_8K);
 		tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW |
 		    TD_CP | TD_CV | TD_P | TD_W;
 	}
 
 	/*
 	 * Calculate the last available physical address.
 	 */
 	for (i = 0; phys_avail[i + 2] != 0; i += 2)
 		;
 	Maxmem = sparc64_btop(phys_avail[i + 1]);
 
 	/*
 	 * Add the prom mappings to the kernel tsb.
 	 */
 	if ((vmem = OF_finddevice("/virtual-memory")) == -1)
 		panic("pmap_bootstrap: finddevice /virtual-memory");
 	if ((sz = OF_getproplen(vmem, "translations")) == -1)
 		panic("pmap_bootstrap: getproplen translations");
 	if (sizeof(translations) < sz)
 		panic("pmap_bootstrap: translations too small");
 	bzero(translations, sz);
 	if (OF_getprop(vmem, "translations", translations, sz) == -1)
 		panic("pmap_bootstrap: getprop /virtual-memory/translations");
 	/* From here on sz counts translation entries rather than bytes. */
 	sz /= sizeof(*translations);
 	translations_size = sz;
 	CTR0(KTR_PMAP, "pmap_bootstrap: translations");
 	qsort(translations, sz, sizeof (*translations), om_cmp);
 	for (i = 0; i < sz; i++) {
 		CTR3(KTR_PMAP,
 		    "translation: start=%#lx size=%#lx tte=%#lx",
 		    translations[i].om_start, translations[i].om_size,
 		    translations[i].om_tte);
 		/* Only translations starting in the PROM window are kept. */
 		if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
 		    translations[i].om_start > VM_MAX_PROM_ADDRESS)
 			continue;
 		for (off = 0; off < translations[i].om_size;
 		    off += PAGE_SIZE) {
 			va = translations[i].om_start + off;
 			tp = tsb_kvtotte(va);
 			tp->tte_vpn = TV_VPN(va, TS_8K);
 			/*
 			 * Reuse the firmware's tte data with the software
 			 * bits cleared and TD_EXEC set; adding off advances
 			 * the physical address page by page.
 			 */
 			tp->tte_data =
 			    ((translations[i].om_tte &
 			      ~(TD_SOFT_MASK << TD_SOFT_SHIFT)) | TD_EXEC) +
 			    off;
 		}
 	}
 
 	/*
 	 * Get the available physical memory ranges from /memory/reg. These
 	 * are only used for kernel dumps, but it may not be wise to do prom
 	 * calls in that situation.
 	 */
 	if ((sz = OF_getproplen(pmem, "reg")) == -1)
 		panic("pmap_bootstrap: getproplen /memory/reg");
 	if (sizeof(sparc64_memreg) < sz)
 		panic("pmap_bootstrap: sparc64_memreg too small");
 	if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
 		panic("pmap_bootstrap: getprop /memory/reg");
 	sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 * NOTE: PMAP_LOCK_INIT() is needed as part of the initialization
 	 * but sparc64 start up is not ready to initialize mutexes yet.
 	 * It is called in machdep.c.
 	 */
 	pm = kernel_pmap;
 	for (i = 0; i < MAXCPU; i++)
 		pm->pm_context[i] = TLB_CTX_KERNEL;
 	pm->pm_active = ~0;
 
 	/* XXX flush all non-locked tlb entries */
 }
 
 /*
  * Enter locked (TD_L) 4MB data-TLB mappings that cover the kernel tsb,
  * from tsb_kernel_phys at virtual address tsb_kernel, and set the
  * secondary MMU context register to the kernel context.  Interrupts are
  * disabled for the duration of the register writes.
  */
 void
 pmap_map_tsb(void)
 {
 	vm_offset_t va;
 	vm_paddr_t pa;
 	u_long data;
 	u_long s;
 	int i;
 
 	s = intr_disable();
 
 	/*
 	 * Map the 4mb tsb pages.
 	 */
 	for (i = 0; i < tsb_kernel_size; i += PAGE_SIZE_4M) {
 		va = (vm_offset_t)tsb_kernel + i;
 		pa = tsb_kernel_phys + i;
 		data = TD_V | TD_4M | TD_PA(pa) | TD_L | TD_CP | TD_CV |
 		    TD_P | TD_W;
 		/* XXX - cheetah */
 		/* Load the tag access register first, then the tte data. */
 		stxa(AA_DMMU_TAR, ASI_DMMU, TLB_TAR_VA(va) |
 		    TLB_TAR_CTX(TLB_CTX_KERNEL));
 		stxa_sync(0, ASI_DTLB_DATA_IN_REG, data);
 	}
 
 	/*
 	 * Set the secondary context to be the kernel context (needed for
 	 * fp block operations in the kernel and the cache code).
 	 */
 	stxa(AA_DMMU_SCXR, ASI_DMMU, TLB_CTX_KERNEL);
 	membar(Sync);
 
 	intr_restore(s);
 }
 
 /*
  * First-fit allocator over the phys_avail map for use during bootstrap.
  * The request is rounded up to whole pages and carved off the front of the
  * first region that is large enough; the physical address of the carved
  * range is returned.  Panics if no region can satisfy the request.
  * Can only be called from pmap_bootstrap before avail start and end are
  * calculated.
  */
 static vm_paddr_t
 pmap_bootstrap_alloc(vm_size_t size)
 {
 	vm_size_t need;
 	vm_paddr_t base;
 	int idx;
 
 	need = round_page(size);
 	idx = 0;
 	while (phys_avail[idx + 1] != 0) {
 		if (phys_avail[idx + 1] - phys_avail[idx] >= need) {
 			base = phys_avail[idx];
 			phys_avail[idx] = base + need;
 			return (base);
 		}
 		idx += 2;
 	}
 	panic("pmap_bootstrap_alloc");
 }
 
 /*
  * Set up the machine-dependent part of a vm_page: no owning pmap, no
  * flags, an empty tte list, and the data cache colour derived from the
  * page's physical address.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pmap = NULL;
 	m->md.flags = 0;
 	m->md.color = DCACHE_COLOR(VM_PAGE_TO_PHYS(m));
 	TAILQ_INIT(&m->md.tte_list);
 }
 
 /*
  * Initialize the pmap module.
  *
  * Every saved firmware translation whose start lies within
  * [VM_MIN_PROM_ADDRESS, VM_MAX_PROM_ADDRESS] is entered into kernel_map
  * with vm_map_find(); it is a fatal error if the entry cannot be made at
  * exactly the translation's own start address.
  */
 void
 pmap_init(void)
 {
 	vm_offset_t start;
 	vm_offset_t addr;
 	int rv;
 	int n;
 
 	for (n = 0; n < translations_size; n++) {
 		start = translations[n].om_start;
 		if (start >= VM_MIN_PROM_ADDRESS &&
 		    start <= VM_MAX_PROM_ADDRESS) {
 			addr = start;
 			rv = vm_map_find(kernel_map, NULL, 0, &addr,
 			    translations[n].om_size, FALSE, VM_PROT_ALL,
 			    VM_PROT_ALL, 0);
 			if (rv != KERN_SUCCESS || addr != start)
 				panic("pmap_init: vm_map_find");
 		}
 	}
 }
 
 /*
  * Translate a virtual address in the given pmap to a physical address.
  * Kernel addresses are handed to pmap_kextract(); for any other pmap the
  * tsb is searched under the pmap lock.  Returns 0 when no tte is found.
  */
 vm_paddr_t
 pmap_extract(pmap_t pm, vm_offset_t va)
 {
 	struct tte *tte;
 	vm_paddr_t phys;
 
 	if (pm == kernel_pmap)
 		return (pmap_kextract(va));
 
 	phys = 0;
 	PMAP_LOCK(pm);
 	tte = tsb_tte_lookup(pm, va);
 	if (tte != NULL)
 		phys = TTE_GET_PA(tte) | (va & TTE_GET_PAGE_MASK(tte));
 	PMAP_UNLOCK(pm);
 	return (phys);
 }
 
 /*
  * Atomically extract and hold the physical page with the given
  * pmap and virtual address pair if that mapping permits the given
  * protection.
  *
  * Returns the held page, or NULL if there is no valid mapping or the
  * mapping is not software-writable (TD_SW) while VM_PROT_WRITE was
  * requested.  The page queues lock is held across the lookup and the
  * hold; for non-kernel pmaps the pmap lock is held as well.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pm, vm_offset_t va, vm_prot_t prot)
 {
 	struct tte *tp;
 	vm_page_t m;
 
 	m = NULL;
 	vm_page_lock_queues();
 	if (pm == kernel_pmap) {
 		if (va >= VM_MIN_DIRECT_ADDRESS) {
 			/*
 			 * Direct-mapped address: there is no tte to consult,
 			 * so the page is held here without a prot check.
 			 */
 			tp = NULL;
 			m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS(va));
 			vm_page_hold(m);
 		} else {
 			tp = tsb_kvtotte(va);
 			if ((tp->tte_data & TD_V) == 0)
 				tp = NULL;
 		}
 	} else {
 		PMAP_LOCK(pm);
 		tp = tsb_tte_lookup(pm, va);
 	}
 	/* Hold only if writable, or if write access was not requested. */
 	if (tp != NULL && ((tp->tte_data & TD_SW) ||
 	    (prot & VM_PROT_WRITE) == 0)) {
 		m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp));
 		vm_page_hold(m);
 	}
 	vm_page_unlock_queues();
 	if (pm != kernel_pmap)
 		PMAP_UNLOCK(pm);
 	return (m);
 }
 
 /*
  * Translate a kernel virtual address to a physical address.  Addresses at
  * or above VM_MIN_DIRECT_ADDRESS are converted arithmetically; anything
  * else is looked up in the kernel tsb.  Returns 0 if the tte is not valid.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	struct tte *tte;
 
 	if (va >= VM_MIN_DIRECT_ADDRESS)
 		return (TLB_DIRECT_TO_PHYS(va));
 	tte = tsb_kvtotte(va);
 	if ((tte->tte_data & TD_V) != 0)
 		return (TTE_GET_PA(tte) | (va & TTE_GET_PAGE_MASK(tte)));
 	return (0);
 }
 
 /*
  * Account for a new mapping of page m at virtual address va in the page's
  * per-colour mapping counts and decide whether the mapping may be data
  * cacheable.
  *
  * Returns 1 if the new mapping is cacheable and 0 if it is not (the
  * caller, e.g. pmap_kenter(), sets TD_CV only for a non-zero return).
  * When a mapping of a conflicting colour is added to a cacheable page,
  * all existing mappings are made uncacheable and the page colour is set
  * to -1.  The page queues lock must be held.
  */
 int
 pmap_cache_enter(vm_page_t m, vm_offset_t va)
 {
 	struct tte *tp;
 	int color;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_cache_enter: fake page"));
 	PMAP_STATS_INC(pmap_ncache_enter);
 
 	/*
 	 * Find the color for this virtual address and note the added mapping.
 	 */
 	color = DCACHE_COLOR(va);
 	m->md.colors[color]++;
 
 	/*
 	 * If all existing mappings have the same color, the mapping is
 	 * cacheable.
 	 */
 	if (m->md.color == color) {
 		KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] == 0,
 		    ("pmap_cache_enter: cacheable, mappings of other color"));
 		if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 			PMAP_STATS_INC(pmap_ncache_enter_c);
 		else
 			PMAP_STATS_INC(pmap_ncache_enter_oc);
 		return (1);
 	}
 
 	/*
 	 * If there are no mappings of the other color, and the page still has
 	 * the wrong color, this must be a new mapping.  Change the color to
 	 * match the new mapping, which is cacheable.  We must flush the page
 	 * from the cache now.
 	 */
 	if (m->md.colors[DCACHE_OTHER_COLOR(color)] == 0) {
 		KASSERT(m->md.colors[color] == 1,
 		    ("pmap_cache_enter: changing color, not new mapping"));
 		dcache_page_inval(VM_PAGE_TO_PHYS(m));
 		m->md.color = color;
 		if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 			PMAP_STATS_INC(pmap_ncache_enter_cc);
 		else
 			PMAP_STATS_INC(pmap_ncache_enter_coc);
 		return (1);
 	}
 
 	/*
 	 * If the mapping is already non-cacheable, just return.
 	 */	
 	if (m->md.color == -1) {
 		PMAP_STATS_INC(pmap_ncache_enter_nc);
 		return (0);
 	}
 
 	PMAP_STATS_INC(pmap_ncache_enter_cnc);
 
 	/*
 	 * Mark all mappings as uncacheable, flush any lines with the other
 	 * color out of the dcache, and set the color to none (-1).
 	 */
 	TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) {
 		atomic_clear_long(&tp->tte_data, TD_CV);
 		tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp));
 	}
 	dcache_page_inval(VM_PAGE_TO_PHYS(m));
 	m->md.color = -1;
 	return (0);
 }
 
 /*
  * Account for the removal of a mapping of page m at virtual address va
  * in the page's per-colour mapping counts.  If the page was uncacheable
  * (colour -1) and the last mapping of this colour has just gone away, the
  * remaining mappings are made cacheable again and the page takes on the
  * other colour.  The page queues lock must be held.
  */
 void
 pmap_cache_remove(vm_page_t m, vm_offset_t va)
 {
 	struct tte *tp;
 	int color;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	CTR3(KTR_PMAP, "pmap_cache_remove: m=%p va=%#lx c=%d", m, va,
 	    m->md.colors[DCACHE_COLOR(va)]);
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_cache_remove: fake page"));
 	KASSERT(m->md.colors[DCACHE_COLOR(va)] > 0,
 	    ("pmap_cache_remove: no mappings %d <= 0",
 	    m->md.colors[DCACHE_COLOR(va)]));
 	PMAP_STATS_INC(pmap_ncache_remove);
 
 	/*
 	 * Find the color for this virtual address and note the removal of
 	 * the mapping.
 	 */
 	color = DCACHE_COLOR(va);
 	m->md.colors[color]--;
 
 	/*
 	 * If the page is cacheable, just return and keep the same color, even
 	 * if there are no longer any mappings.
 	 */
 	if (m->md.color != -1) {
 		if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 			PMAP_STATS_INC(pmap_ncache_remove_c);
 		else
 			PMAP_STATS_INC(pmap_ncache_remove_oc);
 		return;
 	}
 
 	KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] != 0,
 	    ("pmap_cache_remove: uncacheable, no mappings of other color"));
 
 	/*
 	 * If the page is not cacheable (color is -1), and the number of
 	 * mappings for this color is not zero, just return.  There are
 	 * mappings of the other color still, so remain non-cacheable.
 	 */
 	if (m->md.colors[color] != 0) {
 		PMAP_STATS_INC(pmap_ncache_remove_nc);
 		return;
 	}
 
 	/*
 	 * The number of mappings for this color is now zero.  Recache the
 	 * other colored mappings, and change the page color to the other
 	 * color.  There should be no lines in the data cache for this page,
 	 * so flushing should not be needed.
 	 */
 	TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) {
 		atomic_set_long(&tp->tte_data, TD_CV);
 		tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp));
 	}
 	m->md.color = DCACHE_OTHER_COLOR(color);
 
 	if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 		PMAP_STATS_INC(pmap_ncache_remove_cc);
 	else
 		PMAP_STATS_INC(pmap_ncache_remove_coc);
 }
 
 /*
  * Map a wired page into kernel virtual address space.
  *
  * Installs an 8k kernel tte for page m at va.  If the tsb slot already
  * holds a valid tte it is unlinked from its page first (and its old
  * virtual address demapped when that differs from va); re-entering the
  * identical mapping is a no-op.  The new mapping is marked cacheable
  * (TD_CV) only if pmap_cache_enter() allows it.  This function does not
  * demap va itself.  The page queues lock must be held.
  */
 void
 pmap_kenter(vm_offset_t va, vm_page_t m)
 {
 	vm_offset_t ova;
 	struct tte *tp;
 	vm_page_t om;
 	u_long data;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_STATS_INC(pmap_nkenter);
 	tp = tsb_kvtotte(va);
 	CTR4(KTR_PMAP, "pmap_kenter: va=%#lx pa=%#lx tp=%p data=%#lx",
 	    va, VM_PAGE_TO_PHYS(m), tp, tp->tte_data);
 	/* Trace and count mappings whose va colour differs from the page's. */
 	if (DCACHE_COLOR(VM_PAGE_TO_PHYS(m)) != DCACHE_COLOR(va)) {
 		CTR6(KTR_CT2,
 	"pmap_kenter: off colour va=%#lx pa=%#lx o=%p oc=%#lx ot=%d pi=%#lx",
 		    va, VM_PAGE_TO_PHYS(m), m->object,
 		    m->object ? m->object->pg_color : -1,
 		    m->object ? m->object->type : -1,
 		    m->pindex);
 		PMAP_STATS_INC(pmap_nkenter_oc);
 	}
 	/* The slot is occupied: evict the old mapping before reusing it. */
 	if ((tp->tte_data & TD_V) != 0) {
 		om = PHYS_TO_VM_PAGE(TTE_GET_PA(tp));
 		ova = TTE_GET_VA(tp);
 		if (m == om && va == ova) {
 			PMAP_STATS_INC(pmap_nkenter_stupid);
 			return;
 		}
 		TAILQ_REMOVE(&om->md.tte_list, tp, tte_link);
 		pmap_cache_remove(om, ova);
 		if (va != ova)
 			tlb_page_demap(kernel_pmap, ova);
 	}
 	data = TD_V | TD_8K | VM_PAGE_TO_PHYS(m) | TD_REF | TD_SW | TD_CP |
 	    TD_P | TD_W;
 	if (pmap_cache_enter(m, va) != 0)
 		data |= TD_CV;
 	tp->tte_vpn = TV_VPN(va, TS_8K);
 	tp->tte_data = data;
 	TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link);
 }
 
 /*
  * Map a wired page into kernel virtual address space. This additionally
  * takes a flag argument which is or'ed to the TTE data. This is used by
  * bus_space_map().
  * NOTE: if the mapping is non-cacheable, it's the caller's responsibility
  * to flush entries that might still be in the cache, if applicable.
  *
  * Unlike pmap_kenter(), this takes a raw physical address, overwrites the
  * tsb slot unconditionally and does no vm_page or cache colour
  * bookkeeping.
  */
 void
 pmap_kenter_flags(vm_offset_t va, vm_paddr_t pa, u_long flags)
 {
 	struct tte *tp;
 
 	tp = tsb_kvtotte(va);
 	CTR4(KTR_PMAP, "pmap_kenter_flags: va=%#lx pa=%#lx tp=%p data=%#lx",
 	    va, pa, tp, tp->tte_data);
 	tp->tte_vpn = TV_VPN(va, TS_8K);
 	tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_P | flags;
 }
 
 /*
  * Undo a pmap_kenter(): if the kernel tte for va is valid, unlink it from
  * its page's tte list, update the page's cache colour accounting and zero
  * the tte.  An invalid tte is left alone.  The page queues lock must be
  * held.
  */
 void
 pmap_kremove(vm_offset_t va)
 {
 	struct tte *tte;
 	vm_page_t pg;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_STATS_INC(pmap_nkremove);
 	tte = tsb_kvtotte(va);
 	CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tte,
 	    tte->tte_data);
 	if ((tte->tte_data & TD_V) != 0) {
 		pg = PHYS_TO_VM_PAGE(TTE_GET_PA(tte));
 		TAILQ_REMOVE(&pg->md.tte_list, tte, tte_link);
 		pmap_cache_remove(pg, va);
 		TTE_ZERO(tte);
 	}
 }
 
 /*
  * Inverse of pmap_kenter_flags, used by bus_space_unmap().
  *
  * Zeroes the kernel tte for va unconditionally; no vm_page or cache
  * colour bookkeeping is done, mirroring pmap_kenter_flags().
  */
 void
 pmap_kremove_flags(vm_offset_t va)
 {
 	struct tte *tp;
 
 	tp = tsb_kvtotte(va);
 	/* Tag the trace with this function's own name, not pmap_kremove's. */
 	CTR3(KTR_PMAP, "pmap_kremove_flags: va=%#lx tp=%p data=%#lx", va, tp,
 	    tp->tte_data);
 	TTE_ZERO(tp);
 }
 
 /*
  * Map a range of physical addresses into kernel virtual address space.
  *
  * This implementation always answers with the direct-mapped address of
  * 'start'; '*virt', 'end' and 'prot' are not consulted and '*virt' is
  * left unchanged.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	vm_offset_t direct_va;
 
 	direct_va = TLB_PHYS_TO_DIRECT(start);
 	return (direct_va);
 }
 
 /*
  * Enter 'count' wired pages from the array 'm' at consecutive kernel
  * virtual pages starting at sva, replacing whatever was mapped there.
  * Meant for temporary mappings that need no reference or modification
  * tracking.  The page queues lock is taken here unless the caller already
  * owns it; the whole range is demapped from the TLB afterwards.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
 {
 	vm_offset_t va;
 	int held;
 	int i;
 
 	PMAP_STATS_INC(pmap_nqenter);
 	held = mtx_owned(&vm_page_queue_mtx);
 	if (held == 0)
 		vm_page_lock_queues();
 	va = sva;
 	for (i = 0; i < count; i++) {
 		pmap_kenter(va, m[i]);
 		va += PAGE_SIZE;
 	}
 	if (held == 0)
 		vm_page_unlock_queues();
 	tlb_range_demap(kernel_pmap, sva, va);
 }
 
 /*
  * Remove 'count' consecutive kernel page mappings starting at sva, as
  * entered by pmap_qenter.  The page queues lock is taken here unless the
  * caller already owns it; the whole range is demapped from the TLB
  * afterwards.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 	int held;
 	int i;
 
 	PMAP_STATS_INC(pmap_nqremove);
 	held = mtx_owned(&vm_page_queue_mtx);
 	if (held == 0)
 		vm_page_lock_queues();
 	va = sva;
 	for (i = 0; i < count; i++) {
 		pmap_kremove(va);
 		va += PAGE_SIZE;
 	}
 	if (held == 0)
 		vm_page_unlock_queues();
 	tlb_range_demap(kernel_pmap, sva, va);
 }
 
 /*
  * Initialize the pmap associated with process 0: set up its lock, give it
  * no tsb and no tsb object, mark it active on no CPU with context 0
  * everywhere, and clear its statistics.
  */
 void
 pmap_pinit0(pmap_t pm)
 {
 	int cpu;
 
 	PMAP_LOCK_INIT(pm);
 	pm->pm_tsb = NULL;
 	pm->pm_tsb_obj = NULL;
 	pm->pm_active = 0;
 	for (cpu = 0; cpu < MAXCPU; cpu++)
 		pm->pm_context[cpu] = 0;
 	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure, such as one in a
  * vmspace structure.
  *
  * Allocates kva and a backing VM object for the tsb (each only if the
  * pmap does not already have one), grabs and wires TSB_PAGES zeroed pages
  * and maps them at pm_tsb.  Returns 1 on success, or 0 if no kva could be
  * allocated for the tsb, in which case the pmap lock has been destroyed
  * again.
  */
-void
+int
 pmap_pinit(pmap_t pm)
 {
 	vm_page_t ma[TSB_PAGES];
 	vm_page_t m;
 	int i;
 
 	PMAP_LOCK_INIT(pm);
 
 	/*
 	 * Allocate kva space for the tsb.
 	 */
 	if (pm->pm_tsb == NULL) {
 		pm->pm_tsb = (struct tte *)kmem_alloc_nofault(kernel_map,
 		    TSB_BSIZE);
+		if (pm->pm_tsb == NULL) {
+			PMAP_LOCK_DESTROY(pm);
+			return (0);
+		}
 	}
 
 	/*
 	 * Allocate an object for it.
 	 */
 	if (pm->pm_tsb_obj == NULL)
 		pm->pm_tsb_obj = vm_object_allocate(OBJT_DEFAULT, TSB_PAGES);
 
 	/* Collect the wired tsb pages in ma[] so they can be mapped at once. */
 	VM_OBJECT_LOCK(pm->pm_tsb_obj);
 	for (i = 0; i < TSB_PAGES; i++) {
 		m = vm_page_grab(pm->pm_tsb_obj, i, VM_ALLOC_NOBUSY |
 		    VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		m->valid = VM_PAGE_BITS_ALL;
 		m->md.pmap = pm;
 		ma[i] = m;
 	}
 	VM_OBJECT_UNLOCK(pm->pm_tsb_obj);
 	pmap_qenter((vm_offset_t)pm->pm_tsb, ma, TSB_PAGES);
 
 	/* Start with context -1 on every CPU and active on none. */
 	for (i = 0; i < MAXCPU; i++)
 		pm->pm_context[i] = -1;
 	pm->pm_active = 0;
 	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
+	return (1);
 }
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * Clears every per-CPU pc_pmap pointer that still refers to this pmap,
  * unwires and frees all pages of the tsb object, removes the tsb's kernel
  * mappings and destroys the pmap lock.  The tsb kva and the tsb object
  * themselves are not freed here.
  */
 void
 pmap_release(pmap_t pm)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	struct pcpu *pc;
 
 	CTR2(KTR_PMAP, "pmap_release: ctx=%#x tsb=%p",
 	    pm->pm_context[PCPU_GET(cpuid)], pm->pm_tsb);
 	KASSERT(pmap_resident_count(pm) == 0,
 	    ("pmap_release: resident pages %ld != 0",
 	    pmap_resident_count(pm)));
 
 	/*
 	 * After the pmap was freed, it might be reallocated to a new process.
 	 * When switching, this might lead us to wrongly assume that we need
 	 * not switch contexts because old and new pmap pointer are equal.
 	 * Therefore, make sure that this pmap is not referenced by any PCPU
 	 * pointer any more. This could happen in two cases:
 	 * - A process that referenced the pmap is currently exiting on a CPU.
 	 *   However, it is guaranteed to not switch in any more after setting
 	 *   its state to PRS_ZOMBIE.
 	 * - A process that referenced this pmap ran on a CPU, but we switched
 	 *   to a kernel thread, leaving the pmap pointer unchanged.
 	 */
 	mtx_lock_spin(&sched_lock);
 	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
 		if (pc->pc_pmap == pm)
 			pc->pc_pmap = NULL;
 	}
 	mtx_unlock_spin(&sched_lock);
 
 	obj = pm->pm_tsb_obj;
 	VM_OBJECT_LOCK(obj);
 	KASSERT(obj->ref_count == 1, ("pmap_release: tsbobj ref count != 1"));
 	/* Free the tsb pages one at a time, retrying after any sleep. */
 	while (!TAILQ_EMPTY(&obj->memq)) {
 		m = TAILQ_FIRST(&obj->memq);
 		vm_page_lock_queues();
 		/*
 		 * NOTE(review): the continue re-takes the page queues lock at
 		 * the top of the loop, so this relies on
 		 * vm_page_sleep_if_busy() having dropped it when it returns
 		 * non-zero -- confirm.
 		 */
 		if (vm_page_sleep_if_busy(m, FALSE, "pmaprl"))
 			continue;
 		KASSERT(m->hold_count == 0,
 		    ("pmap_release: freeing held tsb page"));
 		m->md.pmap = NULL;
 		m->wire_count--;
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		vm_page_free_zero(m);
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(obj);
 	pmap_qremove((vm_offset_t)pm->pm_tsb, TSB_PAGES);
 	PMAP_LOCK_DESTROY(pm);
 }
 
 /*
  * Grow the number of kernel page table entries.  Unneeded.
  *
  * This implementation cannot grow the kernel map: any call panics and
  * addr is ignored.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 
 	panic("pmap_growkernel: can't grow kernel");
 }
 
 /*
  * Tear down a single tte belonging to pmap pm that maps va.  Used as the
  * tsb_foreach() callback by pmap_remove() and called directly for single
  * pages; pm2 is unused.
  *
  * For ttes that are not TD_FAKE the tte is unlinked from its page, the
  * wired/resident counters are adjusted, modified/referenced state is
  * passed on to the vm_page for TD_PV mappings and the cache colour
  * accounting is updated.  The tte is zeroed in all cases.
  *
  * Returns 0 once PMAP_REMOVE_DONE(pm) is true, which tells the caller to
  * stop iterating, and 1 otherwise.  The page queues lock must be held.
  */
 int
 pmap_remove_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp,
 		vm_offset_t va)
 {
 	vm_page_t m;
 	u_long data;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/* Fetch the old tte data and clear it in one atomic step. */
 	data = atomic_readandclear_long(&tp->tte_data);
 	if ((data & TD_FAKE) == 0) {
 		m = PHYS_TO_VM_PAGE(TD_PA(data));
 		TAILQ_REMOVE(&m->md.tte_list, tp, tte_link);
 		if ((data & TD_WIRED) != 0)
 			pm->pm_stats.wired_count--;
 		if ((data & TD_PV) != 0) {
 			if ((data & TD_W) != 0)
 				vm_page_dirty(m);
 			if ((data & TD_REF) != 0)
 				vm_page_flag_set(m, PG_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.tte_list))
 				vm_page_flag_clear(m, PG_WRITEABLE);
 			pm->pm_stats.resident_count--;
 		}
 		pmap_cache_remove(m, va);
 	}
 	TTE_ZERO(tp);
 	if (PMAP_REMOVE_DONE(pm))
 		return (0);
 	return (1);
 }
 
 /*
  * Remove the given range of addresses from the specified map.
  *
  * Ranges larger than PMAP_TSB_THRESH are handled by walking the whole tsb
  * with tsb_foreach() and demapping the entire context; smaller ranges are
  * looked up page by page and demapped as a range.  Returns immediately if
  * PMAP_REMOVE_DONE(pm) is already true.
  */
 void
 pmap_remove(pmap_t pm, vm_offset_t start, vm_offset_t end)
 {
 	struct tte *tp;
 	vm_offset_t va;
 
 	CTR3(KTR_PMAP, "pmap_remove: ctx=%#lx start=%#lx end=%#lx",
 	    pm->pm_context[PCPU_GET(cpuid)], start, end);
 	if (PMAP_REMOVE_DONE(pm))
 		return;
 	vm_page_lock_queues();
 	PMAP_LOCK(pm);
 	if (end - start > PMAP_TSB_THRESH) {
 		tsb_foreach(pm, NULL, start, end, pmap_remove_tte);
 		tlb_context_demap(pm);
 	} else {
 		for (va = start; va < end; va += PAGE_SIZE) {
 			if ((tp = tsb_tte_lookup(pm, va)) != NULL) {
 				/* A zero return means nothing is left. */
 				if (!pmap_remove_tte(pm, NULL, tp, va))
 					break;
 			}
 		}
 		tlb_range_demap(pm, start, end - 1);
 	}
 	PMAP_UNLOCK(pm);
 	vm_page_unlock_queues();
 }
 
 /*
  * Remove every TD_PV mapping of page m, in whichever pmap it lives.
  * Referenced and modified state found in each tte is transferred to the
  * vm_page before the tte is zeroed, and PG_WRITEABLE is cleared at the
  * end.  Mappings without TD_PV are left in place.  The page queues lock
  * must be held.
  */
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct pmap *pm;
 	struct tte *tpn;
 	struct tte *tp;
 	vm_offset_t va;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/* Fetch the successor first: tp is unlinked inside the loop. */
 	for (tp = TAILQ_FIRST(&m->md.tte_list); tp != NULL; tp = tpn) {
 		tpn = TAILQ_NEXT(tp, tte_link);
 		if ((tp->tte_data & TD_PV) == 0)
 			continue;
 		pm = TTE_GET_PMAP(tp);
 		va = TTE_GET_VA(tp);
 		PMAP_LOCK(pm);
 		if ((tp->tte_data & TD_WIRED) != 0)
 			pm->pm_stats.wired_count--;
 		if ((tp->tte_data & TD_REF) != 0)
 			vm_page_flag_set(m, PG_REFERENCED);
 		if ((tp->tte_data & TD_W) != 0)
 			vm_page_dirty(m);
 		/* Invalidate the tte before demapping it from the TLB. */
 		tp->tte_data &= ~TD_V;
 		tlb_page_demap(pm, va);
 		TAILQ_REMOVE(&m->md.tte_list, tp, tte_link);
 		pm->pm_stats.resident_count--;
 		pmap_cache_remove(m, va);
 		TTE_ZERO(tp);
 		PMAP_UNLOCK(pm);
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  * Write-protect a single tte by clearing TD_REF, TD_SW and TD_W.  Used as
  * the tsb_foreach() callback by pmap_protect() and called directly for
  * single pages; pm, pm2 and va are unused.  For TD_PV mappings the
  * referenced/modified bits seen in 'data' are passed on to the vm_page.
  * Always returns 1.
  *
  * NOTE(review): 'data' is treated as the tte contents from before the
  * bits were cleared -- confirm against the sparc64 atomic_clear_long().
  */
 int
 pmap_protect_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp,
 		 vm_offset_t va)
 {
 	u_long data;
 	vm_page_t m;
 
 	data = atomic_clear_long(&tp->tte_data, TD_REF | TD_SW | TD_W);
 	if ((data & TD_PV) != 0) {
 		m = PHYS_TO_VM_PAGE(TD_PA(data));
 		if ((data & TD_REF) != 0)
 			vm_page_flag_set(m, PG_REFERENCED);
 		if ((data & TD_W) != 0)
 			vm_page_dirty(m);
 	}
 	return (1);
 }
 
 /*
  * Set the physical protection on the specified range of this map as requested.
  *
  * Without VM_PROT_READ the range is removed outright.  With
  * VM_PROT_WRITE nothing is done.  Otherwise every tte in [sva, eva) is
  * write-protected, using a full tsb walk plus context demap for ranges
  * larger than PMAP_TSB_THRESH and a per-page lookup plus range demap for
  * smaller ones.
  */
 void
 pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t va;
 	struct tte *tp;
 
 	CTR4(KTR_PMAP, "pmap_protect: ctx=%#lx sva=%#lx eva=%#lx prot=%#lx",
 	    pm->pm_context[PCPU_GET(cpuid)], sva, eva, prot);
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pm, sva, eva);
 		return;
 	}
 
 	/* This routine only ever takes write permission away. */
 	if (prot & VM_PROT_WRITE)
 		return;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pm);
 	if (eva - sva > PMAP_TSB_THRESH) {
 		tsb_foreach(pm, NULL, sva, eva, pmap_protect_tte);
 		tlb_context_demap(pm);
 	} else {
 		for (va = sva; va < eva; va += PAGE_SIZE) {
 			if ((tp = tsb_tte_lookup(pm, va)) != NULL)
 				pmap_protect_tte(pm, NULL, tp, va);
 		}
 		tlb_range_demap(pm, sva, eva - 1);
 	}
 	PMAP_UNLOCK(pm);
 	vm_page_unlock_queues();
 }
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  *
  * Locking wrapper: takes the page queues lock and then the pmap lock
  * around pmap_enter_locked(), which does the actual work.
  */
 void
 pmap_enter(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	   boolean_t wired)
 {
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pm);
 	pmap_enter_locked(pm, va, m, prot, wired);
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pm);
 }
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  *
  * If va is already mapped to the same physical address only the wiring
  * and protection bits of the existing tte are updated; otherwise any
  * existing mapping is removed and a new 8k tte is entered.
  *
  * The page queues and pmap must be locked.
  */
 static void
 pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     boolean_t wired)
 {
 	struct tte *tp;
 	vm_paddr_t pa;
 	u_long data;
 	int i;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 	PMAP_STATS_INC(pmap_nenter);
 	pa = VM_PAGE_TO_PHYS(m);
 
 	/*
 	 * If this is a fake page from the device_pager, but it covers actual
 	 * physical memory, convert to the real backing page.
 	 *
 	 * phys_avail holds (start, end) pairs with an exclusive end, so an
 	 * address equal to the end of a range lies outside that range.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 			if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) {
 				m = PHYS_TO_VM_PAGE(pa);
 				break;
 			}
 		}
 	}
 
 	CTR6(KTR_PMAP,
 	    "pmap_enter: ctx=%p m=%p va=%#lx pa=%#lx prot=%#x wired=%d",
 	    pm->pm_context[PCPU_GET(cpuid)], m, va, pa, prot, wired);
 
 	/*
 	 * If there is an existing mapping, and the physical address has not
 	 * changed, must be protection or wiring change.
 	 */
 	if ((tp = tsb_tte_lookup(pm, va)) != NULL && TTE_GET_PA(tp) == pa) {
 		CTR0(KTR_PMAP, "pmap_enter: update");
 		PMAP_STATS_INC(pmap_nenter_update);
 
 		/*
 		 * Wiring change, just update stats.
 		 */
 		if (wired) {
 			if ((tp->tte_data & TD_WIRED) == 0) {
 				tp->tte_data |= TD_WIRED;
 				pm->pm_stats.wired_count++;
 			}
 		} else {
 			if ((tp->tte_data & TD_WIRED) != 0) {
 				tp->tte_data &= ~TD_WIRED;
 				pm->pm_stats.wired_count--;
 			}
 		}
 
 		/*
 		 * Save the old bits and clear the ones we're interested in.
 		 */
 		data = tp->tte_data;
 		tp->tte_data &= ~(TD_EXEC | TD_SW | TD_W);
 
 		/*
 		 * If we're turning off write permissions, sense modify status.
 		 */
 		if ((prot & VM_PROT_WRITE) != 0) {
 			tp->tte_data |= TD_SW;
 			if (wired) {
 				tp->tte_data |= TD_W;
 			}
 			vm_page_flag_set(m, PG_WRITEABLE);
 		} else if ((data & TD_W) != 0) {
 			vm_page_dirty(m);
 		}
 
 		/*
 		 * If we're turning on execute permissions, flush the icache.
 		 */
 		if ((prot & VM_PROT_EXECUTE) != 0) {
 			if ((data & TD_EXEC) == 0) {
 				icache_page_inval(pa);
 			}
 			tp->tte_data |= TD_EXEC;
 		}
 
 		/*
 		 * Delete the old mapping.
 		 */
 		tlb_page_demap(pm, TTE_GET_VA(tp));
 	} else {
 		/*
 		 * If there is an existing mapping, but it's for a different
 		 * physical address, delete the old mapping.
 		 */
 		if (tp != NULL) {
 			CTR0(KTR_PMAP, "pmap_enter: replace");
 			PMAP_STATS_INC(pmap_nenter_replace);
 			pmap_remove_tte(pm, NULL, tp, va);
 			tlb_page_demap(pm, va);
 		} else {
 			CTR0(KTR_PMAP, "pmap_enter: new");
 			PMAP_STATS_INC(pmap_nenter_new);
 		}
 
 		/*
 		 * Now set up the data and install the new mapping.
 		 */
 		data = TD_V | TD_8K | TD_PA(pa);
 		if (pm == kernel_pmap)
 			data |= TD_P;
 		if ((prot & VM_PROT_WRITE) != 0) {
 			data |= TD_SW;
 			vm_page_flag_set(m, PG_WRITEABLE);
 		}
 		if (prot & VM_PROT_EXECUTE) {
 			data |= TD_EXEC;
 			icache_page_inval(pa);
 		}
 
 		/*
 		 * If it's wired update stats.  We also don't need reference or
 		 * modify tracking for wired mappings, so set the bits now.
 		 */
 		if (wired) {
 			pm->pm_stats.wired_count++;
 			data |= TD_REF | TD_WIRED;
 			if ((prot & VM_PROT_WRITE) != 0)
 				data |= TD_W;
 		}
 
 		tsb_tte_enter(pm, m, va, TS_8K, data);
 	}
 }
 
 /*
  * Maps a sequence of resident pages belonging to the same object.
  * The sequence begins with the given page m_start.  This page is
  * mapped at the given virtual address start.  Each subsequent page is
  * mapped at a virtual address that is offset from start by the same
  * amount as the page is offset from m_start within the object.  The
  * last page in the sequence is the page with the largest offset from
  * m_start that can be mapped at a virtual address less than the given
  * virtual address end.  Not every virtual page between start and end
  * is mapped; only those for which a resident page exists with the
  * corresponding offset from m_start are mapped.
  *
  * Mappings are entered unwired and with at most read and execute
  * permission.  Only the pmap lock is taken here; pmap_enter_locked()
  * asserts that the page queues lock is owned, so the caller must already
  * hold it.
  */
 void
 pmap_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t m;
 	vm_pindex_t diff, psize;
 
 	psize = atop(end - start);
 	m = m_start;
 	PMAP_LOCK(pm);
 	/* Follow the listq links until a page falls outside the range. */
 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
 		pmap_enter_locked(pm, start + ptoa(diff), m, prot &
 		    (VM_PROT_READ | VM_PROT_EXECUTE), FALSE);
 		m = TAILQ_NEXT(m, listq);
 	}
 	PMAP_UNLOCK(pm);
 }
 
 /*
  * Enter a single unwired mapping of page m at va with at most read and
  * execute permission.  Only the pmap lock is taken here;
  * pmap_enter_locked() asserts that the page queues lock is owned, so the
  * caller must already hold it.
  */
 void
 pmap_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	PMAP_LOCK(pm);
 	pmap_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
 	    FALSE);
 	PMAP_UNLOCK(pm);
 }
 
 /*
  * No mappings are pre-loaded here: this implementation only asserts that
  * the object is locked and is a device object.  pm, addr, pindex and
  * size are unused.
  */
 void
 pmap_object_init_pt(pmap_t pm, vm_offset_t addr, vm_object_t object,
 		    vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_DEVICE,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  * Change the wiring attribute for a map/virtual-address pair.
  * The mapping must already exist in the pmap.
  *
  * If no tte is found for va the call is silently a no-op.  The pmap's
  * wired_count is adjusted only when the TD_WIRED bit actually changes.
  *
  * NOTE(review): 'data' is treated as the tte contents from before the
  * atomic update -- confirm against the sparc64 atomic_set_long() and
  * atomic_clear_long().
  */
 void
 pmap_change_wiring(pmap_t pm, vm_offset_t va, boolean_t wired)
 {
 	struct tte *tp;
 	u_long data;
 
 	PMAP_LOCK(pm);
 	if ((tp = tsb_tte_lookup(pm, va)) != NULL) {
 		if (wired) {
 			data = atomic_set_long(&tp->tte_data, TD_WIRED);
 			if ((data & TD_WIRED) == 0)
 				pm->pm_stats.wired_count++;
 		} else {
 			data = atomic_clear_long(&tp->tte_data, TD_WIRED);
 			if ((data & TD_WIRED) != 0)
 				pm->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pm);
 }
 
 /*
  * Per-entry worker for pmap_copy(): duplicate the mapping described by
  * tp into dst_pmap at va unless the entry is marked TD_FAKE or dst_pmap
  * already has a mapping there.  Always returns 1.  src_pmap is unused.
  */
 static int
 pmap_copy_tte(pmap_t src_pmap, pmap_t dst_pmap, struct tte *tp, vm_offset_t va)
 {
 	vm_page_t pg;
 	u_long bits;
 
 	/* Entries flagged TD_FAKE are never copied. */
 	if ((tp->tte_data & TD_FAKE) != 0)
 		return (1);
 
 	/* Leave an existing destination mapping alone. */
 	if (tsb_tte_lookup(dst_pmap, va) != NULL)
 		return (1);
 
 	/* The copy starts without the PV, REF, SW, CV and W bits. */
 	bits = tp->tte_data & ~(TD_PV | TD_REF | TD_SW | TD_CV | TD_W);
 	pg = PHYS_TO_VM_PAGE(TTE_GET_PA(tp));
 	tsb_tte_enter(dst_pmap, pg, va, TS_8K, bits);
 	return (1);
 }
 
 /*
  * Copy the mappings for [src_addr, src_addr + len) from src_pmap into
  * dst_pmap.  Nothing is done unless the source and destination
  * addresses are identical.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
 	  vm_size_t len, vm_offset_t src_addr)
 {
 	struct tte *tp;
 	vm_offset_t va;
 
 	if (dst_addr != src_addr)
 		return;
 	vm_page_lock_queues();
 	/*
 	 * Take the two pmap locks in ascending address order so that the
 	 * acquisition order does not depend on which one is the source.
 	 */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	if (len > PMAP_TSB_THRESH) {
 		/* Large range: walk the TSB and demap the whole context. */
 		tsb_foreach(src_pmap, dst_pmap, src_addr, src_addr + len,
 		    pmap_copy_tte);
 		tlb_context_demap(dst_pmap);
 	} else {
 		/* Small range: look up each page and demap just the range. */
 		for (va = src_addr; va < src_addr + len; va += PAGE_SIZE) {
 			if ((tp = tsb_tte_lookup(src_pmap, va)) != NULL)
 				pmap_copy_tte(src_pmap, dst_pmap, tp, va);
 		}
 		tlb_range_demap(dst_pmap, src_addr, src_addr + len - 1);
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  * Zero the whole physical page m.  The method depends on m->md.color:
  *  - color == -1: zero through the physical address with
  *    ASI_PHYS_USE_EC (NOTE(review): -1 presumably means the page has no
  *    data cache color assigned -- confirm);
  *  - color matches DCACHE_COLOR(pa): zero through the direct mapping;
  *  - otherwise: zero through a temporary kernel mapping chosen by color.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	struct tte *tp;
 	vm_offset_t va;
 	vm_paddr_t pa;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_zero_page: fake page"));
 	PMAP_STATS_INC(pmap_nzero_page);
 	pa = VM_PAGE_TO_PHYS(m);
 	if (m->md.color == -1) {
 		PMAP_STATS_INC(pmap_nzero_page_nc);
 		aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE);
 	} else if (m->md.color == DCACHE_COLOR(pa)) {
 		PMAP_STATS_INC(pmap_nzero_page_c);
 		va = TLB_PHYS_TO_DIRECT(pa);
 		cpu_block_zero((void *)va, PAGE_SIZE);
 	} else {
 		PMAP_STATS_INC(pmap_nzero_page_oc);
 		/* The temporary mapping slot is used under the kernel pmap lock. */
 		PMAP_LOCK(kernel_pmap);
 		va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(va);
 		/* Install a valid, writable, cacheable 8K entry for pa at va. */
 		tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(va, TS_8K);
 		cpu_block_zero((void *)va, PAGE_SIZE);
 		/* Drop the translation for the temporary address again. */
 		tlb_page_demap(kernel_pmap, va);
 		PMAP_UNLOCK(kernel_pmap);
 	}
 }
 
 /*
  * Zero size bytes starting at byte offset off within physical page m.
  * off + size must not exceed PAGE_SIZE (asserted).  The access method
  * is selected by m->md.color exactly as in pmap_zero_page(); the two
  * virtually addressed cases use bzero() rather than cpu_block_zero().
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	struct tte *tp;
 	vm_offset_t va;
 	vm_paddr_t pa;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_zero_page_area: fake page"));
 	KASSERT(off + size <= PAGE_SIZE, ("pmap_zero_page_area: bad off/size"));
 	PMAP_STATS_INC(pmap_nzero_page_area);
 	pa = VM_PAGE_TO_PHYS(m);
 	if (m->md.color == -1) {
 		/* Zero through the physical address. */
 		PMAP_STATS_INC(pmap_nzero_page_area_nc);
 		aszero(ASI_PHYS_USE_EC, pa + off, size);
 	} else if (m->md.color == DCACHE_COLOR(pa)) {
 		/* Page color matches its physical color: use the direct map. */
 		PMAP_STATS_INC(pmap_nzero_page_area_c);
 		va = TLB_PHYS_TO_DIRECT(pa);
 		bzero((void *)(va + off), size);
 	} else {
 		/* Colors differ: go through a temporary mapping of that color. */
 		PMAP_STATS_INC(pmap_nzero_page_area_oc);
 		PMAP_LOCK(kernel_pmap);
 		va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(va);
 		tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(va, TS_8K);
 		bzero((void *)(va + off), size);
 		tlb_page_demap(kernel_pmap, va);
 		PMAP_UNLOCK(kernel_pmap);
 	}
 }
 
 /*
  * Variant of pmap_zero_page() with its own statistics counters.  The
  * differences visible here are that the mismatched-color case maps the
  * page in the pmap_idle_map window instead of pmap_temp_map_1 and that
  * the kernel pmap lock is not taken around that mapping.
  * NOTE(review): presumably intended for the idle-time page zeroing
  * thread, which would own pmap_idle_map exclusively -- confirm.
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 	struct tte *tp;
 	vm_offset_t va;
 	vm_paddr_t pa;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_zero_page_idle: fake page"));
 	PMAP_STATS_INC(pmap_nzero_page_idle);
 	pa = VM_PAGE_TO_PHYS(m);
 	if (m->md.color == -1) {
 		/* Zero through the physical address. */
 		PMAP_STATS_INC(pmap_nzero_page_idle_nc);
 		aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE);
 	} else if (m->md.color == DCACHE_COLOR(pa)) {
 		/* Page color matches its physical color: use the direct map. */
 		PMAP_STATS_INC(pmap_nzero_page_idle_c);
 		va = TLB_PHYS_TO_DIRECT(pa);
 		cpu_block_zero((void *)va, PAGE_SIZE);
 	} else {
 		/* Colors differ: map the page in the idle window (no lock). */
 		PMAP_STATS_INC(pmap_nzero_page_idle_oc);
 		va = pmap_idle_map + (m->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(va);
 		tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(va, TS_8K);
 		cpu_block_zero((void *)va, PAGE_SIZE);
 		tlb_page_demap(kernel_pmap, va);
 	}
 }
 
 /*
  * Copy the contents of physical page msrc to physical page mdst.
  *
  * Each page is accessed in one of three ways depending on its md.color:
  * by physical address (color == -1), through the direct mapping (color
  * equals the page's DCACHE_COLOR), or through a temporary kernel
  * mapping selected by color.  The branches below cover the
  * combinations of those methods for source and destination.
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	vm_offset_t vdst;
 	vm_offset_t vsrc;
 	vm_paddr_t pdst;
 	vm_paddr_t psrc;
 	struct tte *tp;
 
 	KASSERT((mdst->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_copy_page: fake dst page"));
 	KASSERT((msrc->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_copy_page: fake src page"));
 	PMAP_STATS_INC(pmap_ncopy_page);
 	pdst = VM_PAGE_TO_PHYS(mdst);
 	psrc = VM_PAGE_TO_PHYS(msrc);
 	if (msrc->md.color == -1 && mdst->md.color == -1) {
 		/* Both by physical address. */
 		PMAP_STATS_INC(pmap_ncopy_page_nc);
 		ascopy(ASI_PHYS_USE_EC, psrc, pdst, PAGE_SIZE);
 	} else if (msrc->md.color == DCACHE_COLOR(psrc) &&
 	    mdst->md.color == DCACHE_COLOR(pdst)) {
 		/* Both through the direct mapping. */
 		PMAP_STATS_INC(pmap_ncopy_page_c);
 		vdst = TLB_PHYS_TO_DIRECT(pdst);
 		vsrc = TLB_PHYS_TO_DIRECT(psrc);
 		cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE);
 	} else if (msrc->md.color == -1) {
 		/* Source by physical address, destination by virtual address. */
 		if (mdst->md.color == DCACHE_COLOR(pdst)) {
 			PMAP_STATS_INC(pmap_ncopy_page_dc);
 			vdst = TLB_PHYS_TO_DIRECT(pdst);
 			ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst,
 			    PAGE_SIZE);
 		} else {
 			/* Destination needs a temporary mapping of its color. */
 			PMAP_STATS_INC(pmap_ncopy_page_doc);
 			PMAP_LOCK(kernel_pmap);
 			vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE);
 			tp = tsb_kvtotte(vdst);
 			tp->tte_data =
 			    TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W;
 			tp->tte_vpn = TV_VPN(vdst, TS_8K);
 			ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst,
 			    PAGE_SIZE);
 			tlb_page_demap(kernel_pmap, vdst);
 			PMAP_UNLOCK(kernel_pmap);
 		}
 	} else if (mdst->md.color == -1) {
 		/* Destination by physical address, source by virtual address. */
 		if (msrc->md.color == DCACHE_COLOR(psrc)) {
 			PMAP_STATS_INC(pmap_ncopy_page_sc);
 			vsrc = TLB_PHYS_TO_DIRECT(psrc);
 			ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst,
 			    PAGE_SIZE);
 		} else {
 			/* Source needs a temporary mapping of its color. */
 			PMAP_STATS_INC(pmap_ncopy_page_soc);
 			PMAP_LOCK(kernel_pmap);
 			vsrc = pmap_temp_map_1 + (msrc->md.color * PAGE_SIZE);
 			tp = tsb_kvtotte(vsrc);
 			tp->tte_data =
 			    TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W;
 			tp->tte_vpn = TV_VPN(vsrc, TS_8K);
 			ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst,
 			    PAGE_SIZE);
 			tlb_page_demap(kernel_pmap, vsrc);
 			PMAP_UNLOCK(kernel_pmap);
 		}
 	} else {
 		/*
 		 * Remaining combinations: map both pages temporarily, the
 		 * destination in window 1 and the source in window 2.
 		 */
 		PMAP_STATS_INC(pmap_ncopy_page_oc);
 		PMAP_LOCK(kernel_pmap);
 		vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(vdst);
 		tp->tte_data =
 		    TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(vdst, TS_8K);
 		vsrc = pmap_temp_map_2 + (msrc->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(vsrc);
 		tp->tte_data =
 		    TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(vsrc, TS_8K);
 		cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE);
 		tlb_page_demap(kernel_pmap, vdst);
 		tlb_page_demap(kernel_pmap, vsrc);
 		PMAP_UNLOCK(kernel_pmap);
 	}
 }
 
 /*
  * Report whether pm owns one of the first 16 managed (TD_PV) mappings
  * on the page's tte list.  Mappings beyond that limit are not examined,
  * so FALSE does not prove that pm has no mapping of the page.  The
  * limit may be tuned; callers only need TRUE for a small subset of
  * pmaps for page aging to work.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pm, vm_page_t m)
 {
 	struct tte *entry;
 	int seen;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return (FALSE);
 
 	seen = 0;
 	TAILQ_FOREACH(entry, &m->md.tte_list, tte_link) {
 		/* Only managed mappings are checked and counted. */
 		if ((entry->tte_data & TD_PV) != 0) {
 			if (TTE_GET_PMAP(entry) == pm)
 				return (TRUE);
 			if (++seen >= 16)
 				break;
 		}
 	}
 	return (FALSE);
 }
 
 /*
  * Remove all pages from the specified address space; this aids process
  * exit speeds.  This is much faster than pmap_remove in the case of
  * running down an entire address space.  Only works for the current pmap.
  *
  * This implementation has an empty body: it removes nothing.
  */
 void
 pmap_remove_pages(pmap_t pm)
 {
 }
 
 /*
  * Report whether any entry on the page's tte list is a managed (TD_PV)
  * mapping.  Fictitious and unmanaged pages always yield FALSE.
  */
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
 	struct tte *entry;
 	boolean_t mapped;
 
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return (FALSE);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	mapped = FALSE;
 	TAILQ_FOREACH(entry, &m->md.tte_list, tte_link) {
 		if ((entry->tte_data & TD_PV) != 0) {
 			mapped = TRUE;
 			break;
 		}
 	}
 	return (mapped);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	XXX: The exact number of bits to check and clear is a matter that
  *	should be tested and standardized at some point in the future for
  *	optimal aging of shared pages.
  *
  *	As implemented here, every visited entry is moved to the tail of
  *	the page's tte list, and the walk ends once more than four set
  *	reference bits have been found or the original first entry comes
  *	around again.
  */
 
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct tte *tpf;
 	struct tte *tpn;
 	struct tte *tp;
 	u_long data;
 	int count;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return (0);
 	count = 0;
 	if ((tp = TAILQ_FIRST(&m->md.tte_list)) != NULL) {
 		/* Remember the starting entry to detect a full rotation. */
 		tpf = tp;
 		do {
 			/* Fetch the successor before tp is moved to the tail. */
 			tpn = TAILQ_NEXT(tp, tte_link);
 			TAILQ_REMOVE(&m->md.tte_list, tp, tte_link);
 			TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link);
 			/*
 			 * Unmanaged entries are rotated but not counted; the
 			 * continue jumps to the loop condition below.
 			 */
 			if ((tp->tte_data & TD_PV) == 0)
 				continue;
 			data = atomic_clear_long(&tp->tte_data, TD_REF);
 			if ((data & TD_REF) != 0 && ++count > 4)
 				break;
 		} while ((tp = tpn) != NULL && tp != tpf);
 	}
 	return (count);
 }
 
 /*
  * Report whether any managed (TD_PV) mapping of the page has its TD_W
  * bit set.  Fictitious and unmanaged pages always yield FALSE.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	struct tte *entry;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return (FALSE);
 
 	TAILQ_FOREACH(entry, &m->md.tte_list, tte_link) {
 		if ((entry->tte_data & TD_PV) != 0 &&
 		    (entry->tte_data & TD_W) != 0)
 			return (TRUE);
 	}
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.  This implementation always answers FALSE and
  *	ignores both arguments.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 
 	return (FALSE);
 }
 
 /*
  * Clear the TD_W bit in every managed (TD_PV) mapping of the page and
  * demap the translation wherever the bit was actually set.  Fictitious
  * and unmanaged pages are left untouched.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct tte *entry;
 	u_long old;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return;
 
 	TAILQ_FOREACH(entry, &m->md.tte_list, tte_link) {
 		if ((entry->tte_data & TD_PV) != 0) {
 			old = atomic_clear_long(&entry->tte_data, TD_W);
 			if ((old & TD_W) != 0)
 				tlb_page_demap(TTE_GET_PMAP(entry),
 				    TTE_GET_VA(entry));
 		}
 	}
 }
 
 /*
  * Clear the TD_REF bit in every managed (TD_PV) mapping of the page and
  * demap the translation wherever the bit was actually set.  Fictitious
  * and unmanaged pages are left untouched.
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	struct tte *entry;
 	u_long old;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return;
 
 	TAILQ_FOREACH(entry, &m->md.tte_list, tte_link) {
 		if ((entry->tte_data & TD_PV) != 0) {
 			old = atomic_clear_long(&entry->tte_data, TD_REF);
 			if ((old & TD_REF) != 0)
 				tlb_page_demap(TTE_GET_PMAP(entry),
 				    TTE_GET_VA(entry));
 		}
 	}
 }
 
 /*
  * Revoke write access from every managed (TD_PV) mapping of the page.
  * A mapping whose TD_W bit was set marks the page dirty and has its
  * translation demapped.  PG_WRITEABLE is cleared on the page afterwards.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct tte *entry;
 	u_long old;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return;
 	/* Nothing to do for a page that is not flagged writeable. */
 	if ((m->flags & PG_WRITEABLE) == 0)
 		return;
 
 	TAILQ_FOREACH(entry, &m->md.tte_list, tte_link) {
 		if ((entry->tte_data & TD_PV) != 0) {
 			old = atomic_clear_long(&entry->tte_data,
 			    TD_SW | TD_W);
 			if ((old & TD_W) != 0) {
 				vm_page_dirty(m);
 				tlb_page_demap(TTE_GET_PMAP(entry),
 				    TTE_GET_VA(entry));
 			}
 		}
 	}
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  * Residency query for addr in pm.  Not implemented: both arguments are
  * ignored and 0 is always returned.
  */
 int
 pmap_mincore(pmap_t pm, vm_offset_t addr)
 {
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Activate a user pmap.  The pmap must be activated before its address space
  * can be accessed in any way.
  *
  * The pmap activated is the one belonging to td's process.  All of the
  * per-CPU and MMU state is updated while holding sched_lock.
  */
 void
 pmap_activate(struct thread *td)
 {
 	struct vmspace *vm;
 	struct pmap *pm;
 	int context;
 
 	vm = td->td_proc->p_vmspace;
 	pm = vmspace_pmap(vm);
 
 	mtx_lock_spin(&sched_lock);
 
 	/*
 	 * Take the next context number for this CPU.  When the per-CPU
 	 * maximum has been reached, flush the user TLB entries and start
 	 * over from the per-CPU minimum.
 	 */
 	context = PCPU_GET(tlb_ctx);
 	if (context == PCPU_GET(tlb_ctx_max)) {
 		tlb_flush_user();
 		context = PCPU_GET(tlb_ctx_min);
 	}
 	PCPU_SET(tlb_ctx, context + 1);
 
 	/* Record the context for this CPU and mark the pmap active on it. */
 	pm->pm_context[PCPU_GET(cpuid)] = context;
 	pm->pm_active |= PCPU_GET(cpumask);
 	PCPU_SET(pmap, pm);
 
 	/* Load the TSB address and the new context into the MMU registers. */
 	stxa(AA_DMMU_TSB, ASI_DMMU, pm->pm_tsb);
 	stxa(AA_IMMU_TSB, ASI_IMMU, pm->pm_tsb);
 	stxa(AA_DMMU_PCXR, ASI_DMMU, context);
 	membar(Sync);
 
 	mtx_unlock_spin(&sched_lock);
 }
 
 /*
  * Suggest a virtual address for mapping object.  This implementation
  * returns va unchanged; object and size are ignored.
  */
 vm_offset_t
 pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size)
 {
 
 	return (va);
 }
Index: head/sys/sun4v/sun4v/machdep.c
===================================================================
--- head/sys/sun4v/sun4v/machdep.c	(revision 173360)
+++ head/sys/sun4v/sun4v/machdep.c	(revision 173361)
@@ -1,1001 +1,1001 @@
 /*-
  * Copyright (c) 2001 Jake Burkholder.
  * Copyright (c) 1992 Terrence R. Lambert.
  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
  * 	from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 
 #include <sys/param.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
 #include <sys/cons.h>
 #include <sys/eventhandler.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/interrupt.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/pcpu.h>
 #include <sys/ptrace.h>
 #include <sys/reboot.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/timetc.h>
 #include <sys/ucontext.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 
 #include <ddb/ddb.h>
 
 #include <machine/bus.h>
 #include <machine/clock.h>
 #include <machine/cpu.h>
 #include <machine/fp.h>
 #include <machine/fsr.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/ofw_machdep.h>
 #include <machine/ofw_mem.h>
 #include <machine/pcb.h>
 #include <machine/pmap.h>
 #include <machine/pstate.h>
 #include <machine/reg.h>
 #include <machine/sigframe.h>
 #include <machine/smp.h>
 #include <machine/tick.h>
 #include <machine/tlb.h>
 #include <machine/tstate.h>
 #include <machine/asm.h>
 #include <machine/hv_api.h>
 #include <machine/wstate.h>
 
 #include <machine/md_var.h>
 #include <machine/hypervisorvar.h>
 #include <dev/ofw/openfirm.h>
 
 /* XXX move this to a header */
 extern void mdesc_init(void);
 
 typedef int ofw_vec_t(void *);
 
 #ifdef DDB
 extern vm_offset_t ksym_start, ksym_end;
 #endif
 
 /* Filled in by sparc64_init() from the loader's DTLB metadata. */
 struct tlb_entry *kernel_tlbs;
 int kernel_tlb_slots;
 
 int cold = 1;
 long Maxmem;
 /* Total size of the physical memory regions; set in cpu_startup(). */
 long realmem;
 
 /* Backing store for the boot CPU's pcpu area (see sparc64_init()). */
 char pcpu0[PCPU_PAGES * PAGE_SIZE];
 /* Trapframe assigned to thread0 in sparc64_init(). */
 struct trapframe frame0;
 /* Trap type translation table; populated in sparc64_init(). */
 int trap_conversion[256];
 vm_paddr_t mmu_fault_status_area;
 
 /* Kernel stack assigned to thread0 in sparc64_init(). */
 vm_offset_t kstack0;
 vm_paddr_t kstack0_phys;
 
 struct kva_md_info kmi;
 
 /* Recorded by set_openfirm_callback(). */
 u_long ofw_vec;
 u_long ofw_tba;
 
 /*
  * Note: timer quality for CPUs is set low to try to prevent them from
  * being chosen as the primary timecounter.  The CPU counters are not
  * synchronized among the CPUs, so on MP machines this causes problems
  * when calculating the time.  With this value the CPUs should only be
  * chosen as the primary timecounter as a last resort.
  */
 
 #define	UP_TICK_QUALITY	1000
 #ifdef SUN4V
 #define	MP_TICK_QUALITY	1000
 #else
 #define	MP_TICK_QUALITY	-100
 #endif
 
 
 
 
 
 static struct timecounter tick_tc;
 
 char sparc64_model[32];
 
 cpu_block_copy_t *cpu_block_copy;
 cpu_block_zero_t *cpu_block_zero;
 
 static timecounter_get_t tick_get_timecount;
 void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3,
 		  ofw_vec_t *vec);
 void sparc64_shutdown_final(void *dummy, int howto);
 
 static void cpu_startup(void *);
 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
 
 /* The shift constants must agree with the actual type sizes. */
 CTASSERT((1 << INT_SHIFT) == sizeof(int));
 CTASSERT((1 << PTR_SHIFT) == sizeof(char *));
 
 /* Fixed sizes of the register and machine context structures. */
 CTASSERT(sizeof(struct reg) == 256);
 CTASSERT(sizeof(struct fpreg) == 272);
 CTASSERT(sizeof(struct __mcontext) == 512);
 
 /*
  * The pcb and its two floating point save areas must be multiples of /
  * aligned to 64 bytes, and the pcb may use at most an eighth of the
  * kernel stack.
  */
 CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0);
 CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0);
 CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0);
 CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8));
 
 /*
  * struct pcpu may use at most half of the pcpu area and must be a
  * multiple of 64 bytes in size.
  */
 CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2));
 CTASSERT((sizeof(struct pcpu) & ((1<<6)-1)) == 0);
 
 
 /*
  * Print the message x only when bootverbose is set.  x is used as the
  * printf format string, so callers must pass a string literal without
  * conversion specifiers.
  *
  * The body is wrapped in do { } while (0) so that the macro expands to
  * exactly one statement: "BVPRINTF(...);" can then be used as the body
  * of an if that has an else, and cannot capture a following else.
  */
 #define BVPRINTF(x) do {						\
 	if (bootverbose)						\
 		printf(x);						\
 } while (0)
 
 /*
  * SYSINIT hook run at SI_SUB_CPU: registers the tick timecounter,
  * reports the amount of memory, initializes the kernel submaps and
  * buffer subsystem, and registers the final shutdown handler.  arg is
  * unused.
  */
 static void
 cpu_startup(void *arg)
 {
 	vm_paddr_t physsz;
 	int i;
 
 	/* Describe the tick counter as a timecounter. */
 	tick_tc.tc_get_timecount = tick_get_timecount;
 	tick_tc.tc_poll_pps = NULL;
 	tick_tc.tc_counter_mask = ~0u;
 	tick_tc.tc_frequency = tick_freq;
 	tick_tc.tc_name = "tick";
 	tick_tc.tc_quality = UP_TICK_QUALITY;
 #ifdef SMP
 	/*
 	 * We do not know if each CPU's tick counter is synchronized.
 	 */
 	if (cpu_mp_probe())
 		tick_tc.tc_quality = MP_TICK_QUALITY;
 #endif
 
 	tc_init(&tick_tc);
 
 	/* Sum the sizes of all physical memory regions. */
 	physsz = 0;
 	for (i = 0; i < sparc64_nmemreg; i++)
 		physsz += sparc64_memreg[i].mr_size;
 	printf("real memory  = %lu (%lu MB)\n", physsz,
 	    physsz / (1024 * 1024));
 	realmem = (long)physsz;
 
 	vm_ksubmap_init(&kmi);
 
 	bufinit();
 	vm_pager_bufferinit();
 
 	EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL,
 	    SHUTDOWN_PRI_LAST);
 
 	/* Free memory is derived from the free page count. */
 	printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE,
 	    cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE));
 
 	if (bootverbose)
 		printf("machine: %s\n", sparc64_model);
 
 #ifdef notyet
 	cpu_identify(rdpr(ver), tick_freq, PCPU_GET(cpuid));
 #endif 
 }
 
 /*
  * Machine-dependent pcpu setup: point the interrupt request tail
  * pointer at the head slot and chain every entry of the request pool
  * onto the free list.  cpuid and size are unused.
  */
 void
 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
 {
 	struct intr_request *req;
 	int n;
 
 	pcpu->pc_irtail = &pcpu->pc_irhead;
 
 	/* Push each pool entry onto the front of the free list. */
 	n = 0;
 	while (n < IR_FREE) {
 		req = pcpu->pc_irpool + n;
 		req->ir_next = pcpu->pc_irfree;
 		pcpu->pc_irfree = req;
 		n++;
 	}
 }
 
 /*
  * Enter a spin lock section for the current thread.  The first
  * (outermost) entry disables interrupts and saves the value returned by
  * intr_disable(); every entry bumps the nesting count and enters a
  * critical section.
  */
 void
 spinlock_enter(void)
 {
 	struct thread *self = curthread;
 
 	if (self->td_md.md_spinlock_count == 0)
 		self->td_md.md_saved_pil = intr_disable();
 	self->td_md.md_spinlock_count++;
 	critical_enter();
 }
 
 /*
  * Leave a spin lock section for the current thread.  The critical
  * section is exited first; when the nesting count drops back to zero
  * the interrupt state saved by spinlock_enter() is restored.
  */
 void
 spinlock_exit(void)
 {
 	struct thread *self = curthread;
 
 	critical_exit();
 	if (--self->td_md.md_spinlock_count == 0)
 		intr_restore(self->td_md.md_saved_pil);
 }
 
 /*
  * Timecounter read method: return the tick register truncated to
  * unsigned.  tc is unused.
  *
  * Defined static to match the forward declaration
  * "static timecounter_get_t tick_get_timecount;" earlier in this file.
  */
 static unsigned
 tick_get_timecount(struct timecounter *tc)
 {
 	return ((unsigned)rd(tick));
 }
 
 /*
  * Early machine-dependent kernel initialization.
  *
  * mdp is the loader metadata pointer (may be NULL, in which case the
  * function panics once the console is up) and vec is the Open Firmware
  * entry vector handed to OF_init().  o1, o2 and o3 are unused here.
  *
  * The order of the steps below matters: the console needs Open Firmware
  * and the pcpu area, curthread needs cpu_setregs(), and the message
  * buffer is initialized only after the trap table has been set.
  */
 void
 sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
 {
 	phandle_t child;
 	phandle_t root;
 	struct pcpu *pc;
 	vm_offset_t end;
 	caddr_t kmdp;
 	u_int clock;
 	char *env;
 	char type[8];
 	vm_paddr_t mmfsa;
 	int i;
 
 	end = 0;
 	kmdp = NULL;
 
 	/*
 	 * Initialize Open Firmware (needed for console).
 	 */
 	OF_init(vec);
 
 
 	/*
 	 * XXX
 	 * Verbose boot is forced on unconditionally here, which makes the
 	 * RB_VERBOSE check further down redundant.
 	 */
 	bootverbose = 1;
 
 	/*
 	 * Parse metadata if present and fetch parameters.  Must be before the
 	 * console is inited so cninit gets the right value of boothowto.
 	 */
 	if (mdp != NULL) {
 		preload_metadata = mdp;
 		kmdp = preload_search_by_type("elf kernel");
 		if (kmdp != NULL) {
 			boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
 			kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
 			end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
 			kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS,
 			    int);
 			kernel_tlbs = (void *)preload_search_info(kmdp,
 			    MODINFO_METADATA | MODINFOMD_DTLB);
 		}
 	}
 
         if (boothowto & RB_VERBOSE)
                 bootverbose = 1;
 
 	init_param1();
 
 	/*
 	 * Find the first child of the root node whose device_type is "cpu".
 	 * NOTE(review): the return value of OF_getprop() is not checked, so
 	 * type may hold stale or unterminated data when the property is
 	 * missing or longer than the buffer -- confirm this cannot happen.
 	 */
 	root = OF_peer(0);
 	for (child = OF_child(root); child != 0; child = OF_peer(child)) {
 		OF_getprop(child, "device_type", type, sizeof(type));
 		if (strcmp(type, "cpu") == 0)
 			break;
 	}
 
 	/*
 	 * NOTE(review): if no cpu node was found child is 0 here, and clock
 	 * stays uninitialized whenever this lookup fails.
 	 */
 	OF_getprop(child, "clock-frequency", &clock, sizeof(clock));
 
 	/*
 	 * Initialize the console before printing anything.
 	 * console uses the pcpu area for serialization 
 	 */
 	/* struct pcpu occupies the very end of the pcpu0 buffer. */
 	pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1;
 	cpu_setregs(pc);
 
 	/*
 	 * Initialize proc0 stuff (p_contested needs to be done early).
 	 */
 
-	proc_linkup(&proc0, &thread0);
+	proc_linkup0(&proc0, &thread0);
 	proc0.p_md.md_sigtramp = NULL;
 	proc0.p_md.md_utrap = NULL;
 	frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV;
 	thread0.td_frame = &frame0;
 	/* The trapframe must be 64-byte aligned. */
 	if ((u_long)thread0.td_frame & 0x3f) {
 		panic("unaligned frame0");
 	}
 
 	/*
 	 * Prime our per-cpu data page for use.  Note, we are using it for our
 	 * stack, so don't pass the real size (PAGE_SIZE) to pcpu_init or
 	 * it'll zero it out from under us.
 	 */
 	pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1;
 	pcpu_init(pc, 0, sizeof(struct pcpu));
 	pc->pc_curthread = &thread0;
 	pc->pc_addr = (vm_offset_t)pcpu0;
 
 	cninit();
 	tick_init(clock);
 
 	/* The frequency is printed in MHz with two decimals. */
 	printf("cpu0: UltraSparc T1 Processor (%d.%02d MHz CPU)\n",
 	    (clock + 4999) / 1000000, ((clock + 4999) / 10000) % 100);
 
 	/*
 	 * Panic if there is no metadata.  Most likely the kernel was booted
 	 * directly, instead of through loader(8).
 	 */
 	if (mdp == NULL || kmdp == NULL) {
 		printf("sparc64_init: no loader metadata.\n"
 		       "This probably means you are not using loader(8).\n");
 		panic("sparc64_init");
 	}
 
 	/*
 	 * Sanity check the kernel end, which is important.
 	 */
 	if (end == 0) {
 		printf("sparc64_init: warning, kernel end not specified.\n"
 		       "Attempting to continue anyway.\n");
 		end = (vm_offset_t)_end;
 	}
 
 	/* Use the generic routines for block copy and block zero. */
 	cpu_block_copy = bcopy;
 	cpu_block_zero = bzero;
 
 #ifdef SMP
 	mp_tramp = mp_tramp_alloc();
 #endif
 
 	env = getenv("kernelname");
 	if (env != NULL) {
 		strlcpy(kernelname, env, sizeof(kernelname));
 		freeenv(env);
 	}
 
 	/*
 	 * Initialize global registers.
 	 * needed for curthread to work
 	 */
 	cpu_setregs(pc);
 
 	/*
 	 * Initialize virtual memory and calculate physmem.
 	 */
 	pmap_bootstrap(end);
 
 	/*
 	 * Set up thread0's kernel stack and pcb.  The pcb sits at the top of
 	 * the stack; pcb_kstack is computed from the virtual pcb address
 	 * before td_pcb is switched to its direct-mapped address.
 	 */
 	thread0.td_kstack = kstack0;
 	thread0.td_md.md_saved_pil = 0;
 	thread0.td_pcb = (struct pcb *)
 		(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
 	thread0.td_pcb->pcb_kstack = (uint64_t)(((char *)thread0.td_pcb) - (CCFSZ + SPOFF));
 	thread0.td_pcb = (struct pcb *)TLB_PHYS_TO_DIRECT(vtophys((vm_offset_t)thread0.td_pcb));
 	pc->pc_curpcb = thread0.td_pcb;
 
 	/* A misaligned stack is only reported, not treated as fatal. */
 	if (((thread0.td_pcb->pcb_kstack + SPOFF) & 0x3f) != 0) {
 		printf("unaligned stack pcb_kstack & 0x3f == 0x%lx\n", 
 		       ((thread0.td_pcb->pcb_kstack + SPOFF) & 0x3f));
 	}
 
 	/*
 	 * Update PCPU_REG to point to direct address
 	 * to support easy phys <-> virt translation in trap handler
 	 */
 	pc = (struct pcpu *)TLB_PHYS_TO_DIRECT(vtophys(pc));
 
 	BVPRINTF("initializing cpu regs\n");
 	cpu_setregs(pc);
 	
 	/*
 	 * Initialize tunables.
 	 */
 	BVPRINTF("initialize tunables\n");
 	init_param2(physmem);
 
 	/*
 	 * setup trap table and fault status area
 	 */
 	BVPRINTF("initialize trap tables\n");
 
 	/* The address handed on is MMFSA_SIZE past the base of the area. */
 	mmfsa = mmu_fault_status_area + MMFSA_SIZE;
 	BVPRINTF("setwstate\n");
 	set_wstate(WSTATE_KERN);
 	BVPRINTF("set_mmfsa_scratchpad\n");
 	set_mmfsa_scratchpad(mmfsa);
 
 	BVPRINTF("init_mondo_queue\n");
 	init_mondo_queue();
 	BVPRINTF("set_mmfsa_traptable\n");
 	set_mmfsa_traptable(&tl0_base, mmfsa);
 	BVPRINTF("trap conversion\n");
 	/* Trap types without an entry below translate to 0. */
 	for (i = 0; i < 256; i++)
 		trap_conversion[i] = 0;
 	trap_conversion[TT_INSTRUCTION_EXCEPTION] = T_INSTRUCTION_EXCEPTION;
 	trap_conversion[TT_INSTRUCTION_MISS]      = T_INSTRUCTION_MISS;
 	trap_conversion[TT_ILLEGAL_INSTRUCTION]   = T_ILLEGAL_INSTRUCTION;
 	trap_conversion[TT_PRIVILEGED_OPCODE]     = T_PRIVILEGED_OPCODE;
 	trap_conversion[TT_FP_EXCEPTION_IEEE_754] = T_FP_EXCEPTION_IEEE_754; 
 	trap_conversion[TT_TAG_OVERFLOW]          = T_TAG_OVERFLOW;
 	trap_conversion[TT_DIVISION_BY_ZERO]      = T_DIVISION_BY_ZERO;
 	trap_conversion[TT_DATA_EXCEPTION]        = T_DATA_EXCEPTION;
 	trap_conversion[TT_DATA_MISS]             = T_DATA_MISS;
 	trap_conversion[TT_ALIGNMENT]             = T_ALIGNMENT;
 	trap_conversion[TT_DATA_PROTECTION]       = T_DATA_PROTECTION;
 	
 	/*
 	 * Initialize the message buffer (after setting trap table).
 	 */
 	BVPRINTF("initialize msgbuf\n");
 	msgbufinit(msgbufp, MSGBUF_SIZE);
 
 	BVPRINTF("initialize mutexes\n");
 	mutex_init();
 	
 	BVPRINTF("initialize machine descriptor table\n");
 	mdesc_init();
 
 	/*
 	 * Passing size - 1 leaves the last byte of the zero-initialized
 	 * global untouched, so the model string stays NUL-terminated.
 	 */
 	BVPRINTF("initialize get model name\n");
 	OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1);
 
 	BVPRINTF("initialize kdb\n");
 	kdb_init();
 
 #ifdef KDB
 	if (boothowto & RB_KDB)
 		kdb_enter("Boot flags requested debugger");
 #endif
 	BVPRINTF("sparc64_init done\n");
 }
 
 /*
  * Record the current trap base address register value in ofw_tba and
  * the supplied firmware entry vector in ofw_vec.
  */
 void
 set_openfirm_callback(ofw_vec_t *vec)
 {
 	ofw_tba = rdpr(tba);
 	ofw_vec = (u_long)vec;
 }
 
 /*
  * Deliver a signal to the current thread by building a signal frame on
  * the user stack and redirecting the trapframe to the process's signal
  * trampoline.
  *
  * catcher is the handler address, ksi describes the signal and mask is
  * the signal mask saved in the frame.  The caller must hold the proc
  * lock and the sigacts mutex (both asserted); both are dropped while
  * the frame is copied out and re-acquired before returning.
  */
 void
 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
 {
 	struct trapframe *tf;
 	struct sigframe *sfp;
 	struct sigacts *psp;
 	struct sigframe sf;
 	struct thread *td;
 	struct frame *fp;
 	struct proc *p;
 	int oonstack;
 	u_long sp;
 	int sig;
 	int code;
 
 	oonstack = 0;
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	sig = ksi->ksi_signo;
 	/* NOTE(review): code is assigned here but never read afterwards. */
 	code = ksi->ksi_code;
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	tf = td->td_frame;
 	/* The saved stack pointer is biased by SPOFF. */
 	sp = tf->tf_sp + SPOFF;
 	oonstack = sigonstack(sp);
 
 	CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
 	    catcher, sig);
 
 	/* Make sure we have a signal trampoline to return to. */
 	if (p->p_md.md_sigtramp == NULL) {
 		/*
 		 * No signal trampoline... kill the process.
 		 */
 		CTR0(KTR_SIG, "sendsig: no sigtramp");
 		printf("sendsig: %s is too old, rebuild it\n", p->p_comm);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	}
 
 	/* Save user context. */
 	bzero(&sf, sizeof(sf));
 	get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
 	sf.sf_uc.uc_sigmask = *mask;
 	sf.sf_uc.uc_stack = td->td_sigstk;
 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
 
 	/* Allocate and validate space for the signal handler context. */
 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		/* Place the frame at the top of the alternate stack. */
 		sfp = (struct sigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct sigframe));
 	} else
 		sfp = (struct sigframe *)sp - 1;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/* A register window save frame goes directly below the sigframe. */
 	fp = (struct frame *)sfp - 1;
 
 	/* Translate the signal if appropriate. */
 	if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
 		sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	/* Build the argument list for the signal handler. */
 	tf->tf_out[0] = sig;
 	tf->tf_out[2] = (register_t)&sfp->sf_uc;
 	tf->tf_out[4] = (register_t)catcher;
 
 	/* Fill siginfo structure. */
 	sf.sf_si = ksi->ksi_info;
 	sf.sf_si.si_addr = (void *)tf->tf_tpc;
 	/*
 	 * NOTE(review): psp->ps_siginfo is read here after ps_mtx was
 	 * released above -- confirm that this is safe.
 	 */
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		tf->tf_out[1] = (register_t)&sfp->sf_si;
 
 		/*
 		 * Fill in POSIX parts.
 		 * NOTE(review): this second structure assignment overwrites
 		 * the si_addr value stored just before the if.
 		 */
 		sf.sf_si = ksi->ksi_info;
 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
 	} else {
 		/* Old FreeBSD-style arguments. */
 		tf->tf_out[1] = ksi->ksi_code;
 		tf->tf_out[3] = (register_t)ksi->ksi_addr;
 	}
 
 	/* Copy the sigframe out to the user's stack. */
 	if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
 	    suword(&fp->fr_in[6], tf->tf_out[6]) != 0) {
 		/*
 		 * Something is wrong with the stack pointer.
 		 * ...Kill the process.
 		 */
 		CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 		/* NOTREACHED */
 	}
 
 	/* Resume in the trampoline, running on the newly built frame. */
 	tf->tf_tpc = (u_long)p->p_md.md_sigtramp;
 	tf->tf_tnpc = tf->tf_tpc + 4;
 	tf->tf_sp = (u_long)fp - SPOFF;
 
 	CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc,
 	    tf->tf_sp);
 
 	/* Re-take the locks the caller expects to hold on return. */
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 #ifndef	_SYS_SYSPROTO_H_
 /*
  * Fallback argument structure for sigreturn(), used only when
  * <sys/sysproto.h> has not supplied one.  The member is named sigcntxp
  * because that is the field sigreturn() reads (uap->sigcntxp).
  */
 struct sigreturn_args {
 	const ucontext_t *sigcntxp;
 };
 #endif
 
 /*
  * MPSAFE
  *
  * System call: restore the machine context and signal mask from the
  * user-supplied ucontext at uap->sigcntxp.
  *
  * Returns EFAULT if the context cannot be copied in, the error from
  * set_mcontext() if the context is rejected, and EJUSTRETURN on
  * success.
  */
 int
 sigreturn(struct thread *td, struct sigreturn_args *uap)
 {
 	struct proc *p;
 	mcontext_t *mc;
 	ucontext_t uc;
 	int error;
 
 	p = td->td_proc;
 	/* If the register windows cannot be saved, kill the process. */
 	if (rwindow_save(td)) {
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
 	if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
 		CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
 		return (EFAULT);
 	}
 
 	mc = &uc.uc_mcontext;
 	error = set_mcontext(td, mc);
 	if (error != 0)
 		return (error);
 
 	/* Install the saved mask, minus signals that cannot be masked. */
 	PROC_LOCK(p);
 	td->td_sigmask = uc.uc_sigmask;
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx",
 	    td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate);
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatibility entry point: forwards to sigreturn() with the
  * argument structure cast to struct sigreturn_args.
  */
 int
 freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
 {
 
 	return sigreturn(td, (struct sigreturn_args *)uap);
 }
 #endif
 
 /*
  * Construct a PCB from a trapframe. This is called from kdb_trap() where
  * we want to start a backtrace from the function that caused us to enter
  * the debugger. We have the context in the trapframe, but base the trace
  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
  * enough for a backtrace.
  *
  * Only the program counter and the stack pointer are copied.
  */
 void
 makectx(struct trapframe *tf, struct pcb *pcb)
 {
 
 	pcb->pcb_pc = tf->tf_tpc;
 	pcb->pcb_sp = tf->tf_sp;
 }
 
 /*
  * Capture the machine context of td into mc.  With GET_MC_CLEAR_RET
  * set in flags, the first two output registers are zeroed in the copy.
  * Always returns 0.
  */
 int
 get_mcontext(struct thread *td, mcontext_t *mc, int flags)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 	/*
 	 * The trapframe is copied over the start of the mcontext.
 	 * NOTE(review): this relies on the leading fields of mcontext_t
 	 * matching struct trapframe -- confirm against the headers.
 	 */
 	bcopy(tf, mc, sizeof(*tf));
 	if (flags & GET_MC_CLEAR_RET) {
 		mc->mc_out[0] = 0;
 		mc->mc_out[1] = 0;
 	}
 	mc->mc_flags = _MC_VERSION;
 	critical_enter();
 	/*
 	 * If the floating point unit is enabled in the trapframe, save its
 	 * state into the pcb first and clear the enable bit.
 	 */
 	if ((tf->tf_fprs & FPRS_FEF) != 0) {
 		savefpctx(pcb->pcb_ufp);
 		pcb->pcb_flags |= PCB_FEF;
 		tf->tf_fprs &= ~FPRS_FEF;
 	}
 	/* Export the floating point state kept in the pcb, if any. */
 	if ((pcb->pcb_flags & PCB_FEF) != 0) {
 		bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp));
 		mc->mc_fprs |= FPRS_FEF;
 	}
 	critical_exit();
 	return (0);
 }
 
 /*
  * Install the machine context mc into td.
  *
  * Returns EINVAL when the tstate value fails TSTATE_SECURE() or the
  * version bits in mc_flags are not _MC_VERSION; otherwise returns 0.
  */
 int
 set_mcontext(struct thread *td, const mcontext_t *mc)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 	uint64_t wstate;
 
 	if (!TSTATE_SECURE(mc->mc_tstate) ||
 	    (mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION)
 		return (EINVAL);
 	tf = td->td_frame;
 	pcb = td->td_pcb;
 	/* Make sure the windows are spilled first. */
 	flushw();
 	/* The trapframe's wstate survives the wholesale copy below. */
 	wstate = tf->tf_wstate;
 	bcopy(mc, tf, sizeof(*tf));
 	tf->tf_wstate = wstate;
 	/*
 	 * If the context carries floating point state, store it in the pcb
 	 * and leave the floating point unit disabled in the trapframe.
 	 */
 	if ((mc->mc_fprs & FPRS_FEF) != 0) {
 		tf->tf_fprs = 0;
 		bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
 		pcb->pcb_flags |= PCB_FEF;
 	}
 	return (0);
 }
 
 /*
  * Exit the kernel through the hypervisor.  On SMP kernels
  * cpu_mp_shutdown() is called first, then hv_mach_exit(0).
  *
  * NOTE(review): callers pass a firmware-call descriptor in args, but this
  * implementation does not read it; every call ends in hv_mach_exit(0)
  * regardless of which firmware call was requested.
  */
 void
 cpu_shutdown(void *args)
 {
 
 #ifdef SMP
 	cpu_mp_shutdown();
 #endif
 	hv_mach_exit(0);
 }
 
 /*
  * Get current clock frequency for the given cpu id.
  *
  * Not implemented here: neither cpu_id nor rate is used, and the function
  * always returns ENXIO.
  */
 int
 cpu_est_clockrate(int cpu_id, uint64_t *rate)
 {
 
 	return (ENXIO);
 }
 
 /*
  * Halt the machine by passing an "exit" firmware-call descriptor (no
  * arguments, no return values) to cpu_shutdown().
  *
  * Historical rationale: this duplicates OF_exit() with a different
  * firmware call function that restores the trap table, since otherwise a
  * RED state exception is triggered in at least some firmware versions.
  */
 void
 cpu_halt(void)
 {
 	static struct {
 		cell_t name;
 		cell_t nargs;
 		cell_t nreturns;
 	} exit_args = {
 		(cell_t)"exit",
 		0,
 		0
 	};
 
 	cpu_shutdown(&exit_args);
 }
 
 /*
  * Final shutdown hook.  RB_POWEROFF in howto requests a power-off via
  * cpu_shutdown() with a "SUNW,power-off" descriptor; RB_HALT requests a
  * return to the firmware via cpu_halt().  The dummy argument is unused.
  */
 void
 sparc64_shutdown_final(void *dummy, int howto)
 {
 	static struct {
 		cell_t name;
 		cell_t nargs;
 		cell_t nreturns;
 	} poweroff_args = {
 		(cell_t)"SUNW,power-off",
 		0,
 		0
 	};
 
 	/* Power-off takes precedence; halt is tried afterwards. */
 	if (howto & RB_POWEROFF) {
 		cpu_shutdown(&poweroff_args);
 	}
 	if (howto & RB_HALT) {
 		cpu_halt();
 	}
 }
 
 /*
  * Idle hook: verify the interrupt state is what the idle loop expects and
  * then hand off to cpu_yield().
  */
 void
 cpu_idle(void)
 {
 
 	/* The processor interrupt level must be zero while idling. */
 	if (rdpr(pil) != 0) {
 		panic("pil in cpu_idle not 0 - %ld", rdpr(pil));
 	}
 	/* XXX 0x16 is the only pstate value accepted here. */
 	if (rdpr(pstate) != 0x16) {
 		panic("interrupts disabled in cpu_idle 0x%lx", rdpr(pstate));
 	}
 
 	cpu_yield();
 }
 
 /*
  * Point the traced thread at a new program counter: tf_tpc becomes addr and
  * tf_tnpc the following instruction (addr + 4).  Always returns 0.
  */
 int
 ptrace_set_pc(struct thread *td, u_long addr)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	frame->tf_tpc = addr;
 	frame->tf_tnpc = addr + 4;
 	return (0);
 }
 
 /*
  * Enable single-stepping for a traced thread.
  *
  * Not implemented: the thread is left untouched and 0 (success) is
  * returned.
  */
 int
 ptrace_single_step(struct thread *td)
 {
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Disable single-stepping for a traced thread.
  *
  * Not implemented: the thread is left untouched and 0 (success) is
  * returned.
  */
 int
 ptrace_clear_single_step(struct thread *td)
 {
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Reset a thread's machine state for a newly exec'd image.
  *
  * Clears the process's signal trampoline pointer and releases its user
  * trap table, zeroes the PCB (preserving pcb_kstack) and the trapframe,
  * and then seeds the frame with the entry point, the initial stack and the
  * ps_strings address taken from the process's sysentvec.  The ps_strings
  * argument itself is not used.
  */
 void
 exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings)
 {
 	struct trapframe *frame;
 	struct pcb *pcb;
 	struct proc *proc;
 	uint64_t saved_kstack;
 	u_long aligned_sp;
 
 	/* XXX no cpu_exec */
 
 	/* Drop the per-process machine-dependent signal/utrap state. */
 	proc = td->td_proc;
 	proc->p_md.md_sigtramp = NULL;
 	if (proc->p_md.md_utrap != NULL) {
 		utrap_free(proc->p_md.md_utrap);
 		proc->p_md.md_utrap = NULL;
 	}
 
 	/* Wipe the PCB and trapframe; only pcb_kstack survives. */
 	pcb = td->td_pcb;
 	frame = td->td_frame;
 	saved_kstack = pcb->pcb_kstack;
 	bzero(pcb, sizeof(*pcb));
 	bzero(frame, sizeof(*frame));
 	pcb->pcb_kstack = saved_kstack;
 
 	/* The stack pointer is rounded down to 16 bytes before biasing. */
 	aligned_sp = rounddown(stack, 16);
 	frame->tf_out[0] = stack;
 	frame->tf_out[3] = proc->p_sysent->sv_psstrings;
 	frame->tf_out[6] = aligned_sp - SPOFF - sizeof(struct frame);
 
 	frame->tf_tpc = entry;
 	frame->tf_tnpc = entry + 4;
 	frame->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO;
 
 	td->td_retval[0] = frame->tf_out[0];
 	td->td_retval[1] = frame->tf_out[1];
 }
 
 /*
  * Copy the thread's general registers out of its trapframe into *regs
  * (sizeof(*regs) bytes).  Always returns 0.
  */
 int
 fill_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	frame = td->td_frame;
 	bcopy(frame, regs, sizeof(*regs));
 	return (0);
 }
 
 /*
  * Load the thread's general registers from *regs.
  *
  * Returns EINVAL if r_tstate fails the TSTATE_SECURE() check, 0 otherwise.
  * The thread's current wstate is written into regs->r_wstate before the
  * copy, so the caller's structure is modified and the trapframe keeps its
  * existing wstate.
  */
 int
 set_regs(struct thread *td, struct reg *regs)
 {
 	struct trapframe *frame;
 
 	if (TSTATE_SECURE(regs->r_tstate)) {
 		frame = td->td_frame;
 		regs->r_wstate = frame->tf_wstate;
 		bcopy(regs, frame, sizeof(*regs));
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Read the thread's debug registers.  Not supported: neither argument is
  * used and ENOSYS is always returned.
  */
 int
 fill_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Write the thread's debug registers.  Not supported: neither argument is
  * used and ENOSYS is always returned.
  */
 int
 set_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Copy the thread's floating point state into *fpregs: the register file
  * comes from pcb_ufp, fsr and gsr from the trapframe.  Always returns 0.
  */
 int
 fill_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct trapframe *tf;
 	struct pcb *pcb;
 
 	pcb = td->td_pcb;
 	tf = td->td_frame;
 	/*
 	 * NOTE(review): this assigns ~FPRS_FEF, i.e. every bit except FEF is
 	 * set in tf_fprs, and it does so in a function that otherwise only
 	 * reads thread state.  set_fpregs() uses "&= ~FPRS_FEF" instead.
 	 * Confirm whether plain assignment is intended here.
 	 */
 	tf->tf_fprs = ~FPRS_FEF;
 	bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs));
 	fpregs->fr_fsr = tf->tf_fsr;
 	fpregs->fr_gsr = tf->tf_gsr;
 	return (0);
 }
 
 /*
  * Load the thread's floating point state from *fpregs.  FEF is cleared in
  * the trapframe's fprs, the register file is stored in pcb_ufp, and fsr
  * and gsr are written to the trapframe.  Always returns 0.
  */
 int
 set_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 	struct pcb *pcb = td->td_pcb;
 	struct trapframe *frame = td->td_frame;
 
 	frame->tf_fprs &= ~FPRS_FEF;
 	bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
 	frame->tf_fsr = fpregs->fr_fsr;
 	frame->tf_gsr = fpregs->fr_gsr;
 	return (0);
 }
 
 /*
  * Allocate a zeroed user trap table from M_SUBPROC (sleeping allocation)
  * and return it holding a single reference.
  */
 struct md_utrap *
 utrap_alloc(void)
 {
 	struct md_utrap *table;
 
 	table = malloc(sizeof(*table), M_SUBPROC, M_WAITOK | M_ZERO);
 	table->ut_refcnt = 1;
 	return (table);
 }
 
 /*
  * Drop one reference on a user trap table and free it when the count
  * reaches zero.  A NULL table is ignored.  The count is updated under the
  * pool mutex associated with the table; the free happens after the mutex
  * has been released.
  */
 void
 utrap_free(struct md_utrap *ut)
 {
 	int remaining;
 
 	if (ut != NULL) {
 		mtx_pool_lock(mtxpool_sleep, ut);
 		remaining = --ut->ut_refcnt;
 		mtx_pool_unlock(mtxpool_sleep, ut);
 		if (remaining == 0)
 			free(ut, M_SUBPROC);
 	}
 }
 
 /*
  * Take an additional reference on a user trap table and return the same
  * pointer.  NULL is passed through unchanged.
  */
 struct md_utrap *
 utrap_hold(struct md_utrap *ut)
 {
 
 	if (ut != NULL) {
 		mtx_pool_lock(mtxpool_sleep, ut);
 		ut->ut_refcnt++;
 		mtx_pool_unlock(mtxpool_sleep, ut);
 	}
 	return (ut);
 }
 
 /*
  * Yield the CPU through hv_cpu_yield(), but only while the processor
  * interrupt level is below PIL_TICK.
  */
 void
 cpu_yield(void)
 {
 
 	if (rdpr(pil) >= PIL_TICK)
 		return;
 	hv_cpu_yield();
 }
Index: head/sys/sun4v/sun4v/pmap.c
===================================================================
--- head/sys/sun4v/sun4v/pmap.c	(revision 173360)
+++ head/sys/sun4v/sun4v/pmap.c	(revision 173361)
@@ -1,2233 +1,2234 @@
 /*-
  * Copyright (c) 2006 Kip Macy <kmacy@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kstack_pages.h"
 #include "opt_msgbuf.h"
 #include "opt_pmap.h"
 #include "opt_trap_trace.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/kdb.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/vmmeter.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h> 
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/uma.h>
 
 #include <machine/cpu.h>
 #include <machine/frame.h>
 #include <machine/instr.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/ofw_mem.h>
 #include <machine/mmu.h>
 #include <machine/smp.h>
 #include <machine/tlb.h>
 #include <machine/tte.h>
 #include <machine/tte_hash.h>
 #include <machine/pcb.h>
 #include <machine/pstate.h>
 #include <machine/tsb.h>
 
 #include <machine/hypervisorvar.h>
 #include <machine/hv_api.h>
 
 #ifdef TRAP_TRACING
 void trap_trace_report(int);
 #endif
 
 #if 1
 #define	PMAP_DEBUG
 #endif
 #ifndef	PMAP_SHPGPERPROC
 #define	PMAP_SHPGPERPROC	200
 #endif
 
 /*
  * Virtual and physical address of message buffer.
  */
 struct msgbuf *msgbufp;
 vm_paddr_t msgbuf_phys;
 
 /*
  * Map of physical memory regions.
  */
 vm_paddr_t phys_avail[128];
 vm_paddr_t phys_avail_tmp[128];
 static struct ofw_mem_region mra[128];
 static struct ofw_map translations[128];
 static int translations_size;
 
 
 struct ofw_mem_region sparc64_memreg[128];
 int sparc64_nmemreg;
 
 extern vm_paddr_t mmu_fault_status_area;
 
 /*
  * First and last available kernel virtual addresses.
  */
 vm_offset_t virtual_avail;
 vm_offset_t virtual_end;
 vm_offset_t kernel_vm_end;
 vm_offset_t vm_max_kernel_address;
 
 #ifndef PMAP_SHPGPERPROC
 #define PMAP_SHPGPERPROC 200
 #endif
 /*
  * Data for the pv entry allocation mechanism
  */
 static uma_zone_t pvzone;
 static struct vm_object pvzone_obj;
 static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0;
 int pmap_debug = 0;
 static int pmap_debug_range = 1;
 static int use_256M_pages = 1;
 
 static struct mtx pmap_ctx_lock;
 static uint16_t ctx_stack[PMAP_CONTEXT_MAX];
 static int ctx_stack_top; 
 
 static int permanent_mappings = 0;
 static uint64_t nucleus_memory;
 static uint64_t nucleus_mappings[4];
 /*
  * Kernel pmap.
  */
 struct pmap kernel_pmap_store;
 
 hv_tsb_info_t kernel_td[MAX_TSB_INFO];
 
 /*
  * This should be determined at boot time.
  * With tiny TLBs it doesn't make sense to try to selectively
  * invalidate more than this.
  */
 #define MAX_INVALIDATES   32
 #define MAX_TSB_CLEARS   128
 
 /*
  * Allocate physical memory for use in pmap_bootstrap.
  */
 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size);
 
 /*
  * If a user pmap is processed with pmap_remove and the resident count
  * drops to 0, there are no more pages to remove, so we need not continue.
  */
 #define	PMAP_REMOVE_DONE(pm) \
 	((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
 
 /*
  * Kernel MMU interface
  */
 #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace) 
 
 /*
  * Debug print helpers.
  *
  * With PMAP_DEBUG defined, KDPRINTF prints only when pmap_debug is
  * non-zero.  DPRINTF first panics if the current thread's pmap has a
  * non-zero context but this CPU's bit is missing from its pm_active mask,
  * and then prints under the same pmap_debug condition.
  *
  * Both expand to unbraced "if" statements followed by printf, so they must
  * be used as complete statements; placing them as the body of an if/else
  * without braces changes which branch the else binds to.
  *
  * Without PMAP_DEBUG both macros expand to nothing.
  */
 #ifdef PMAP_DEBUG
 #define KDPRINTF if (pmap_debug) printf
 #define DPRINTF \
 	if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \
    	panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n",  \
 	      PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \
 if (pmap_debug) printf
 
 
 #else
 #define DPRINTF(...)
 #define KDPRINTF(...)
 #endif
 
 
 static void free_pv_entry(pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t locked_pmap);
 
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va);
 static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va);
 static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot);
 static void pmap_tsb_reset(pmap_t pmap);
 static void pmap_tsb_resize(pmap_t pmap);
 static void pmap_tte_hash_resize(pmap_t pmap);
 
 void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap);
 
 struct tsb_resize_info {
 	uint64_t tri_tsbscratch;
 	uint64_t tri_tsb_ra;
 };
 
 /*
  * Quick sort callout for comparing memory regions.
  */
 static int mr_cmp(const void *a, const void *b);
 static int om_cmp(const void *a, const void *b);
 /*
  * qsort() comparator ordering struct ofw_mem_region entries by ascending
  * mr_start.  Returns -1, 0 or 1.
  */
 static int
 mr_cmp(const void *a, const void *b)
 {
 	const struct ofw_mem_region *lhs = a;
 	const struct ofw_mem_region *rhs = b;
 
 	if (lhs->mr_start == rhs->mr_start)
 		return (0);
 	return (lhs->mr_start < rhs->mr_start ? -1 : 1);
 }
 /*
  * qsort() comparator ordering struct ofw_map entries by ascending
  * om_start.  Returns -1, 0 or 1.
  */
 static int
 om_cmp(const void *a, const void *b)
 {
 	const struct ofw_map *lhs = a;
 	const struct ofw_map *rhs = b;
 
 	if (lhs->om_start == rhs->om_start)
 		return (0);
 	return (lhs->om_start < rhs->om_start ? -1 : 1);
 }
 
 /*
  * Return a context number to the free-context stack.
  *
  * The overrun check is made before the slot is written and while
  * pmap_ctx_lock is held, so a full stack is caught before ctx_stack[] is
  * indexed out of bounds and ctx_stack_top is not read racily.  The
  * condition is the same one previously asserted after the push: the top
  * index must still be below PMAP_CONTEXT_MAX once the entry is stored.
  */
 static __inline void
 free_context(uint16_t ctx)
 {
 	mtx_lock_spin(&pmap_ctx_lock);
 	KASSERT(ctx_stack_top + 1 < PMAP_CONTEXT_MAX, 
 		("context stack overrun - system error"));
 	ctx_stack[ctx_stack_top++] = ctx;
 	mtx_unlock_spin(&pmap_ctx_lock);
 }
 
 /*
  * Pop a free context number off the context stack and return it.
  *
  * The underrun check is made before the stack is indexed and while
  * pmap_ctx_lock is held, so an empty stack is caught before
  * ctx_stack[-1] is read and ctx_stack_top is not read racily.  The
  * condition is the same one previously asserted after the pop: the top
  * index must remain above zero once the entry has been taken.
  */
 static __inline uint16_t
 get_context(void)
 {
 	uint16_t ctx;
 
 	mtx_lock_spin(&pmap_ctx_lock);
 	KASSERT(ctx_stack_top > 1,
 		("context stack underrun - need to implement context stealing"));
 	ctx = ctx_stack[--ctx_stack_top];
 	mtx_unlock_spin(&pmap_ctx_lock);
 
 	return ctx;
 }
 
 /*
  * Release a pv entry back to pvzone and account for it in
  * pv_entry_count.
  */
 static __inline void
 free_pv_entry(pv_entry_t pv)
 {
 
 	--pv_entry_count;
 	uma_zfree(pvzone, pv);
 }
 
 /*
  * Get a new pv_entry.
  *
  * A non-sleeping allocation from pvzone is tried first.  If it succeeds
  * and pv_entry_count does not exceed pv_entry_high_water, the entry is
  * returned at once.  Above the high-water mark the page daemon is woken
  * and the reclaim pass below runs even though an entry is already in hand.
  *
  * Reclaim walks the inactive page queue and, if still empty-handed, the
  * active queue, destroying mappings and recycling their pv entries: the
  * first reclaimed entry becomes the return value when the zone allocation
  * failed, every other one is freed.  Panics when nothing can be obtained.
  *
  * The caller must hold locked_pmap's lock and vm_page_queue_mtx; both are
  * asserted on entry.
  */
 static pv_entry_t
 get_pv_entry(pmap_t locked_pmap)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	struct vpgqueues *vpq;
 	uint64_t tte_data;
 	pmap_t pmap;
 	pv_entry_t allocated_pv, next_pv, pv;
 	vm_offset_t va;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	allocated_pv = uma_zalloc(pvzone, M_NOWAIT);
 	if (allocated_pv != NULL) {
 		pv_entry_count++;
 		if (pv_entry_count > pv_entry_high_water)
 			pagedaemon_wakeup();
 		else
 			return (allocated_pv);
 	}
 
 	/*
 	 * Reclaim pv entries: At first, destroy mappings to inactive
 	 * pages.  After that, if a pv entry is still needed, destroy
 	 * mappings to active pages.
 	 */
 	/* The warning is rate-limited by printinterval (60 seconds). */
 	if (ratecheck(&lastprint, &printinterval))
 		printf("Approaching the limit on PV entries, "
 		    "increase the vm.pmap.shpgperproc tunable.\n");
 
 	vpq = &vm_page_queues[PQ_INACTIVE];
 retry:
 	TAILQ_FOREACH(m, &vpq->pl, pageq) {
 		/* Held or busy pages are left alone. */
 		if (m->hold_count || m->busy)
 			continue;
 		TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) {
 			va = pv->pv_va;
 			pmap = pv->pv_pmap;
 			/* Avoid deadlock and lock recursion. */
 			/*
 			 * A pmap at a higher address than locked_pmap is
 			 * locked unconditionally; any other foreign pmap is
 			 * only try-locked and skipped on failure.
 			 */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap))
 				continue;
 			pmap->pm_stats.resident_count--;
 
 			tte_data = tte_hash_delete(pmap->pm_hash, va);
 
 			KASSERT((tte_data & VTD_WIRED) == 0,
 			    ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data));
 			/* Carry referenced/modified state over to the page. */
 			if (tte_data & VTD_REF)
 				vm_page_flag_set(m, PG_REFERENCED);
 			if (tte_data & VTD_W) {
 				KASSERT((tte_data & VTD_SW_W),
 				("get_pv_entry: modified page not writable: va: %lx, tte: %lx",
 				    va, tte_data));
 				vm_page_dirty(m);
 			}
 
 			pmap_invalidate_page(pmap, va, TRUE);
 			TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			if (TAILQ_EMPTY(&m->md.pv_list))
 				vm_page_flag_clear(m, PG_WRITEABLE);
 			m->md.pv_list_count--;
 
 			if (pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 			/* Keep the first reclaimed entry, free the rest. */
 			if (allocated_pv == NULL)
 				allocated_pv = pv;
 			else
 				free_pv_entry(pv);
 		}
 	}
 	if (allocated_pv == NULL) {
 		/* Nothing from the inactive queue: try the active queue. */
 		if (vpq == &vm_page_queues[PQ_INACTIVE]) {
 			vpq = &vm_page_queues[PQ_ACTIVE];
 			goto retry;
 		}
 		panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable");
 	}
 	return (allocated_pv);
 }
 
 /*
  * Carve physical memory straight out of the phys_avail map.  Only valid
  * while pmap_bootstrap is running, before avail start and end have been
  * calculated.
  *
  * The request is rounded up to whole pages and satisfied from the front
  * of the first phys_avail range that is large enough; the range's start
  * is advanced past the allocation and the memory is passed to
  * pmap_scrub_pages() before its physical address is returned.  Panics if
  * no range can hold the request.
  */
 static vm_paddr_t
 pmap_bootstrap_alloc(vm_size_t size)
 {
 	vm_paddr_t base;
 	int idx;
 
 	size = round_page(size);
 
 	for (idx = 0; phys_avail[idx + 1] != 0; idx += 2) {
 		if (phys_avail[idx + 1] - phys_avail[idx] >= size) {
 			base = phys_avail[idx];
 			phys_avail[idx] += size;
 			pmap_scrub_pages(base, size);
 			return (base);
 		}
 	}
 	panic("pmap_bootstrap_alloc");
 }
 
 /*
  * Activate a user pmap.  The pmap must be activated before its address space
  * can be accessed in any way.
  *
  * The whole switch runs inside a critical section.  This CPU's bit is
  * cleared from the previous pmap's pm_active mask and set in the new
  * pmap's pm_active and pm_tlbactive masks, the hash and TSB scratchpad
  * values are loaded, the TSB miss counters are reset, and the pmap is
  * recorded as the per-CPU curpmap.  For a non-zero context the pmap's TSB
  * is registered with the hypervisor (panicking on failure), and finally
  * pm_context is stored through ASI_MMU_CONTEXTID at MMU_CID_S.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t pmap, oldpmap;
 	int err;
 	
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
 #if defined(SMP)
 	atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask));
 	atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask));
 	atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask));
 #else
 	/* Uniprocessor: only bit 0 of the masks is used. */
 	oldpmap->pm_active &= ~1;
 	pmap->pm_active |= 1;
 	pmap->pm_tlbactive |= 1;
 #endif
 
 	pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);
 	pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
 	pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
 
 	PCPU_SET(curpmap, pmap);
 	/* NOTE(review): err is captured but not included in the panic text. */
 	if (pmap->pm_context != 0)
 		if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK)
 			panic("failed to set TSB 0x%lx - context == %ld\n", 
 			      pmap->pm_tsb_ra, pmap->pm_context);
 	stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context);
 	membar(Sync);
 	critical_exit();
 }
 
 /*
  * Suggest a virtual address for mapping an object.  This implementation
  * has no preference: object and size are ignored and va is returned
  * unchanged.
  */
 vm_offset_t 
 pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size)
 {
 	return (va);
 }
 
 /*
  * Bootstrap the system enough to run with virtual memory.
  *
  * ekva is the end of the loaded kernel's virtual address range; kernel
  * virtual space starts at the next 4MB boundary after it.
  *
  * In order, this routine:
  *  - reads and sorts the firmware's /virtual-memory "translations" and
  *    installs permanent 4MB mappings for those starting in
  *    [KERNBASE, KERNBASE + 3 * PAGE_SIZE_4M], recording them as nucleus
  *    memory;
  *  - reads and sorts /memory "available", trims ranges to 4MB alignment,
  *    applies the hw.physmem and hw.physmemstart tunables and builds
  *    real_phys_avail;
  *  - builds phys_avail from real_phys_avail with nucleus memory removed and
  *    ranges split on 4GB boundaries, then merges nucleus memory back into
  *    real_phys_avail and optionally rotates phys_avail;
  *  - sizes kernel virtual memory and allocates the kernel hash table, the
  *    8K and 4M TSBs, the MMU fault status areas, the message buffer and
  *    thread0's kernel stack;
  *  - enters the PROM and kstack mappings in the 8K TSB, hands the TSB
  *    descriptors to the hypervisor, and enters direct mappings in the 4M
  *    TSB;
  *  - reads /memory "reg" into sparc64_memreg;
  *  - initializes kernel_pmap and its TTE hash and inserts the PROM and
  *    kstack mappings there as well.
  *
  * Any failure is fatal (panic).
  */
 void
 pmap_bootstrap(vm_offset_t ekva)
 {
 	struct pmap *pm;
 	vm_offset_t off, va;
 	vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start;
 	vm_size_t physsz, virtsz, kernel_hash_shift;
 	ihandle_t pmem, vmem;
 	int i, j, k, sz;
 	uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable;
 	vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds;
 	
 
 	if ((vmem = OF_finddevice("/virtual-memory")) == -1)
 		panic("pmap_bootstrap: finddevice /virtual-memory");
 	if ((sz = OF_getproplen(vmem, "translations")) == -1)
 		panic("pmap_bootstrap: getproplen translations");
 	if (sizeof(translations) < sz)
 		panic("pmap_bootstrap: translations too small");
 	bzero(translations, sz);
 	if (OF_getprop(vmem, "translations", translations, sz) == -1)
 		panic("pmap_bootstrap: getprop /virtual-memory/translations");
 	sz /= sizeof(*translations);
 	/* Remember the entry count; sz is reused for other properties below. */
 	translations_size = sz;
 	nucleus_memory_start = 0;
 	CTR0(KTR_PMAP, "pmap_bootstrap: translations");
 	qsort(translations, sz, sizeof (*translations), om_cmp);
 
 	for (i = 0; i < sz; i++) {
 		KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
 			translations[i].om_size, translations[i].om_start, 
 			translations[i].om_tte);
 		if ((translations[i].om_start >= KERNBASE) && 
 		    (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) {
 			for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) {
 				KDPRINTF("mapping permanent translation\n");
 				pa = TTE_GET_PA(translations[i].om_tte) + j;
 				va = translations[i].om_start + j;
 				error = hv_mmu_map_perm_addr(va, KCONTEXT, 
 							     pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB);
 				if (error != H_EOK)
 					panic("map_perm_addr returned error=%ld", error);
 				
 				/* Track the lowest physical address of the nucleus. */
 				if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start))
 					nucleus_memory_start = pa;
 				printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa);
 				nucleus_mappings[permanent_mappings++] = pa;
 				nucleus_memory += PAGE_SIZE_4M;
 #ifdef SMP
 				mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M);
 #endif
 			}
 		}  
 	}
 
 	/*
 	 * Find out what physical memory is available from the prom and
 	 * initialize the phys_avail array.  This must be done before
 	 * pmap_bootstrap_alloc is called.
 	 */
 	if ((pmem = OF_finddevice("/memory")) == -1)
 		panic("pmap_bootstrap: finddevice /memory");
 	if ((sz = OF_getproplen(pmem, "available")) == -1)
 		panic("pmap_bootstrap: getproplen /memory/available");
 	if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */
 		panic("pmap_bootstrap: phys_avail too small");
 	if (sizeof(mra) < sz)
 		panic("pmap_bootstrap: mra too small");
 	bzero(mra, sz);
 	if (OF_getprop(pmem, "available", mra, sz) == -1)
 		panic("pmap_bootstrap: getprop /memory/available");
 
 	sz /= sizeof(*mra);
 	CTR0(KTR_PMAP, "pmap_bootstrap: physical memory");
 
 	qsort(mra, sz, sizeof (*mra), mr_cmp);
 	physmemstart_tunable = physmem_tunable = physmem = physsz = 0;
 	
         if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) {
 		KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable);
 	}
         if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) {
                 physmem = atop(physmem_tunable);
 		KDPRINTF("desired physmem=0x%lx\n", physmem_tunable);
 	}
 	if ((physmem_tunable != 0) && (physmemstart_tunable != 0))
 		physmem_tunable += physmemstart_tunable;
 	
 	bzero(real_phys_avail, sizeof(real_phys_avail));
 	bzero(tmp_phys_avail, sizeof(tmp_phys_avail));
 
 	/*
 	 * Build real_phys_avail from the sorted regions: ranges smaller than
 	 * 4MB are dropped, the rest are trimmed to 4MB alignment, and the
 	 * total is capped at physmem_tunable when that is set.
 	 */
 	for (i = 0, j = 0; i < sz; i++) {
 		uint64_t size;
 		KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size);
 		if (mra[i].mr_size < PAGE_SIZE_4M)
 			continue;
 
 		if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) {
 			uint64_t newstart, roundup;
 			newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M);
 			roundup = newstart - mra[i].mr_start;
 			size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M;
 			mra[i].mr_start = newstart;
 			if (size < PAGE_SIZE_4M)
 				continue;
 			mra[i].mr_size = size;
 		}
 		real_phys_avail[j] = mra[i].mr_start;
 		if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) {
 			mra[i].mr_size = physmem_tunable - physsz;
 			physsz = physmem_tunable;
 			real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
 			break;
 		}
 		physsz += mra[i].mr_size;
 		real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
 		j += 2;
 	}
 	physmem = btoc(physsz - physmemstart_tunable);
 
 	/*
 	 * This is needed for versions of OFW that would allocate us memory
 	 * and then forget to remove it from the available ranges ...
 	 * as well as for compensating for the above move of nucleus pages
 	 */
 	for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) {
 		vm_paddr_t start = real_phys_avail[i];
 		uint64_t end = real_phys_avail[i + 1];
 		CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end);
 		KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end);
 		/* 
 		 * Is kernel memory at the beginning of range?
 		 */
 		if (nucleus_memory_start == start) {
 			start += nucleus_memory;
 		}
 		/* 
 		 * Is kernel memory at the end of range?
 		 */
 		if (nucleus_memory_start == (end - nucleus_memory)) 
 			end -= nucleus_memory;
 
 		if (physmemstart_tunable != 0 && 
 		    (end < physmemstart_tunable))
 			continue;
 
 		if (physmemstart_tunable != 0 && 
 		    ((start < physmemstart_tunable))) {
 			start = physmemstart_tunable;
 		}
 
 		/* 
 		 * Is kernel memory in the middle somewhere?		 
 		 */
 		if ((nucleus_memory_start > start) && 
 		    (nucleus_memory_start < end)) {
 			phys_avail[j] = start;
 			phys_avail[j+1] = nucleus_memory_start;
 			start =  nucleus_memory_start + nucleus_memory;
 			j += 2;
 		}
 		/*
 		 * Break phys_avail up on 4GB boundaries to try
 		 * to work around PCI-e allocation bug
 		 * we rely on the fact that kernel memory is allocated 
 		 * from the first 4GB of physical memory
 		 */ 
 		while (bounds < start)
 			bounds += (1UL<<32);
 
 		while (bounds < end) {
 			phys_avail[j] = start;
 			phys_avail[j + 1] = bounds;
 			start = bounds;
 			bounds += (1UL<<32);
 			j += 2;
 		}
 		phys_avail[j] = start; 
 		phys_avail[j + 1] = end;
 		j += 2;
 	}
 
 	/*
 	 * Merge nucleus memory in to real_phys_avail
 	 *
 	 */
 	for (i = 0; real_phys_avail[i] != 0; i += 2) {
 		if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory)
 			real_phys_avail[i] -= nucleus_memory;
 		
 		if (real_phys_avail[i + 1] == nucleus_memory_start)
 			real_phys_avail[i + 1] += nucleus_memory;
 		
 		/* Coalesce with the next range when the two are now adjacent. */
 		if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) {
 			real_phys_avail[i + 1] = real_phys_avail[i + 3];
 			for (k = i + 2; real_phys_avail[k] != 0; k += 2) {
 				real_phys_avail[k] = real_phys_avail[k + 2];
 				real_phys_avail[k + 1] = real_phys_avail[k + 3];
 			}
 		}
 	}
 	for (i = 0; phys_avail[i] != 0; i += 2)
 		if (pmap_debug_range || pmap_debug)
 			printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
 			i, phys_avail[i], i+1, phys_avail[i+1]);
 
 	/*
 	 * Shuffle the memory range containing the 256MB page with 
 	 * nucleus_memory to the beginning of the phys_avail array
 	 * so that physical memory from that page is preferentially
 	 * allocated first
 	 */
 	for (j = 0; phys_avail[j] != 0; j += 2) 
 		if (nucleus_memory_start < phys_avail[j])
 			break;
 	/*
 	 * Don't shuffle unless we have a full 256M page in the range
 	 * our kernel malloc appears to be horribly brittle
 	 */
 	if ((phys_avail[j + 1] - phys_avail[j]) < 
 	    (PAGE_SIZE_256M - nucleus_memory))
 		goto skipshuffle;
 
 	/* Rotate phys_avail so that the entries from index j come first. */
 	for (i = j, k = 0; phys_avail[i] != 0; k++, i++)
 		tmp_phys_avail[k] = phys_avail[i];
 	for (i = 0; i < j; i++)
 		tmp_phys_avail[k + i] = phys_avail[i];
 	for (i = 0; i < 128; i++)
 		phys_avail[i] = tmp_phys_avail[i];
 
 skipshuffle:
 	for (i = 0; real_phys_avail[i] != 0; i += 2)
 		if (pmap_debug_range || pmap_debug)
 			printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n",
 			i, real_phys_avail[i], i+1, real_phys_avail[i+1]);
 
 	for (i = 0; phys_avail[i] != 0; i += 2)
 		if (pmap_debug_range || pmap_debug)
 			printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
 			i, phys_avail[i], i+1, phys_avail[i+1]);
 	/*
 	 * Calculate the size of kernel virtual memory, and the size and mask
 	 * for the kernel tsb.
 	 */
 	virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT));
 	vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
 
 	/*
 	 * Set the start and end of kva.  The kernel is loaded at the first
 	 * available 4 meg super page, so round up to the end of the page.
 	 */
 	virtual_avail = roundup2(ekva, PAGE_SIZE_4M);
 	virtual_end = vm_max_kernel_address;
 	kernel_vm_end = vm_max_kernel_address;
 
 	/*
 	 * Allocate and map a 4MB page for the kernel hashtable 
 	 *
 	 */
 #ifndef SIMULATOR
 	kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */
 #else
 	kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */
 #endif
 
 	kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT)));
 	if (kernel_hash_pa & PAGE_MASK_4M)
 		panic("pmap_bootstrap: hashtable pa unaligned\n");
 	/*
 	 * Set up TSB descriptors for the hypervisor
 	 *
 	 */
 #ifdef notyet
 	tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
 #else
 	/* avoid alignment complaints from the hypervisor */
 	tsb_8k_size = PAGE_SIZE_4M;
 #endif
 
 	tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size);
 	if (tsb_8k_pa & PAGE_MASK_4M)
 		panic("pmap_bootstrap: tsb unaligned\n");
 	KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa);
 
 	tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3;
 	tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size);
 
 	kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K;
 	kernel_td[TSB8K_INDEX].hti_assoc = 1;
 	kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT);
 	kernel_td[TSB8K_INDEX].hti_ctx_index = 0;
 	kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K;
 	kernel_td[TSB8K_INDEX].hti_rsvd = 0;
 	kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa;
 
 	/*
 	 * Initialize kernel's private TSB from 8K page TSB
 	 *
 	 */
 	kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K;
 	kernel_pmap->pm_tsb.hti_assoc = 1;
 	kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT);
 	kernel_pmap->pm_tsb.hti_ctx_index = 0;
 	kernel_pmap->pm_tsb.hti_pgszs = TSB8K;
 	kernel_pmap->pm_tsb.hti_rsvd = 0;
 	kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa;
 	
 	kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb);
 	tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb);
 	
 	/*
 	 * Initialize kernel TSB for 4M pages
 	 * currently (not by design) used for permanent mappings
 	 */
 	
 
 	KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size);
 	kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M;
 	kernel_td[TSB4M_INDEX].hti_assoc = 1;
 	kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT);
 	kernel_td[TSB4M_INDEX].hti_ctx_index = 0;
 	kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M;
 	kernel_td[TSB4M_INDEX].hti_rsvd = 0;
 	kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa;
 	/*
 	 * allocate MMU fault status areas for all CPUS
 	 */
 	mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU);
 
 	/*
 	 * Allocate and map the message buffer.
 	 */
 	msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE);
 	msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys);
 
 	/*
 	 * Allocate a kernel stack with guard page for thread0 and map it into
 	 * the kernel tsb.  
 	 */
 	pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE);
 	kstack0_phys = pa;
 	virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	kstack0 = virtual_avail;
 	virtual_avail += KSTACK_PAGES * PAGE_SIZE;
 	for (i = 0; i < KSTACK_PAGES; i++) {
 		pa = kstack0_phys + i * PAGE_SIZE;
 		va = kstack0 + i * PAGE_SIZE;
 		tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va,
 			    pa | TTE_KERNEL | VTD_8K, 0);
 	}
 	/*
 	 * Calculate the last available physical address.
 	 */
 	for (i = 0; phys_avail[i + 2] != 0; i += 2)
 		KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
 			i, phys_avail[i], i+1, phys_avail[i+1]);
 	KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n",
 			i, phys_avail[i], i+1, phys_avail[i+1]);
 
 	Maxmem = sparc64_btop(phys_avail[i + 1]);
 	
 	/*
 	 * Add the prom mappings to the kernel tsb.
 	 *
 	 * Iterate over translations_size entries: sz was reused above for
 	 * the /memory "available" property and no longer holds the number
 	 * of translations.
 	 */
 	for (i = 0; i < translations_size; i++) {
 		CTR3(KTR_PMAP,
 		    "translation: start=%#lx size=%#lx tte=%#lx",
 		    translations[i].om_start, translations[i].om_size,
 		    translations[i].om_tte);
 		KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
 		       translations[i].om_size, translations[i].om_start, 
 		       translations[i].om_tte);
 
 		if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
 		    translations[i].om_start > VM_MAX_PROM_ADDRESS) 
 			continue;
 
 		for (off = 0; off < translations[i].om_size;
 		     off += PAGE_SIZE) {
 			va = translations[i].om_start + off;
 			pa = TTE_GET_PA(translations[i].om_tte) + off;
 			tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va);
 			tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | 
 				    TTE_KERNEL | VTD_8K, 0);
 		}
 	}
 
 	if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO, 
 				     vtophys((vm_offset_t)kernel_td))) != H_EOK)
 		panic("failed to set ctx0 TSBs error: %ld", error);
 
 #ifdef SMP
 	mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td));
 #endif
 	/*
 	 * setup direct mappings
 	 * 
 	 */
 	for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) {
 		vm_paddr_t tag_pa = 0, next_pa = 0;
 		uint64_t size_bits = VTD_4M;
 		while (pa < real_phys_avail[i + 1]) {
 			/*
 			 * Start a 256MB mapping when pa is 256MB-aligned and
 			 * the whole page fits in the range; 4MB steps inside
 			 * it keep the 256MB tag.  Otherwise, once past the
 			 * last 256MB page, map with individual 4MB pages.
 			 */
 			if (use_256M_pages &&
 			    (pa & PAGE_MASK_256M) == 0 && 
 			    ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) {
 				tag_pa = pa;
 				size_bits = VTD_256M;
 				next_pa = pa + PAGE_SIZE_256M;
 			} else if (next_pa <= pa) {
 				tag_pa = pa;
 				size_bits = VTD_4M;
 			}
 			tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa));
 			tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa), 
 					 TLB_PHYS_TO_DIRECT(pa), 
 					 tag_pa | TTE_KERNEL | size_bits, 0);
 			pa += PAGE_SIZE_4M;
 		}
 	}
 
 	/*
 	 * Get the available physical memory ranges from /memory/reg. These
 	 * are only used for kernel dumps, but it may not be wise to do prom
 	 * calls in that situation.
 	 */
 	if ((sz = OF_getproplen(pmem, "reg")) == -1)
 		panic("pmap_bootstrap: getproplen /memory/reg");
 	if (sizeof(sparc64_memreg) < sz)
 		panic("pmap_bootstrap: sparc64_memreg too small");
 	if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
 		panic("pmap_bootstrap: getprop /memory/reg");
 	sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
 
 	pm = kernel_pmap;
 	pm->pm_active = ~0;
 	pm->pm_tlbactive = ~0;
 
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	TAILQ_INIT(&kernel_pmap->pm_pvlist);
 
 	/* 
 	 * This could happen earlier - but I put it here to avoid 
 	 * attempts to do updates until they're legal
 	 */
 	pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift, 
 					     pmap_bootstrap_alloc(PAGE_SIZE));
 	pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash);
 
 	/* Mirror the PROM mappings entered in the TSB into the kernel hash. */
 	for (i = 0; i < translations_size; i++) {
 		KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", 
 		       translations[i].om_size, translations[i].om_start, 
 		       translations[i].om_tte);
 
 		if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
 		    translations[i].om_start > VM_MAX_PROM_ADDRESS) {
 			KDPRINTF("skipping\n");
 			continue;
 		}
 		for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) {
 			va = translations[i].om_start + off;
 			pa = TTE_GET_PA(translations[i].om_tte) + off;
 			tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
 		}
 		KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n", 
 		       translations[i].om_size, translations[i].om_start, 
 		       translations[i].om_tte);
 	}
 	for (i = 0; i < KSTACK_PAGES; i++) {
 		pa = kstack0_phys + i * PAGE_SIZE;
 		va = kstack0 + i * PAGE_SIZE;
 		tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K);
 	}
 	/*
 	 * Add direct mappings to hash
 	 *
 	 */
 #ifdef notyet
 	/* hash only supports 8k pages */
 	for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M)
 		tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa), 
 				pa | TTE_KERNEL | VTD_4M);
 #endif
 
 
 	if (bootverbose)
 		printf("pmap_bootstrap done\n");
 }
 
 
 
 /*
  *	Routine:	pmap_change_wiring
  *	Function:	Bring the VTD_WIRED bit of the mapping for
  *			(pmap, va) in line with the requested state and
  *			keep pm_stats.wired_count in step.  Nothing is
  *			modified when the mapping is already in the
  *			requested state.
  *	In/out conditions:
  *			The mapping must already exist in the pmap.
  *			The pmap lock is taken and dropped internally.
  */
 void
 pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 {
 	boolean_t cur;
 
 	PMAP_LOCK(pmap);
 	cur = tte_get_virt_bit(pmap, va, VTD_WIRED);
 
 	if (wired) {
 		if (!cur) {
 			/* unwired -> wired */
 			pmap->pm_stats.wired_count++;
 			tte_set_virt_bit(pmap, va, VTD_WIRED);
 		}
 	} else if (cur) {
 		/* wired -> unwired */
 		pmap->pm_stats.wired_count--;
 		tte_clear_virt_bit(pmap, va, VTD_WIRED);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Clear the VTD_W bit for page m by way of tte_clear_phys_bit().
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m));
 	tte_clear_phys_bit(m, VTD_W);
 }
 
 /*
  * Clear the VTD_REF bit for page m by way of tte_clear_phys_bit().
  */
 void
 pmap_clear_reference(vm_page_t m)
 {
 	KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m));
 	tte_clear_phys_bit(m, VTD_REF);
 }
 
 /*
  * Optionally pre-populate dst_pmap with the managed mappings that
  * src_pmap holds in [src_addr, src_addr + len).
  *
  * Each copied entry has VTD_W, VTD_REF and VTD_WIRED stripped, gets a
  * pv entry, and bumps dst_pmap's resident count.  Addresses that are
  * already mapped in dst_pmap are left alone.
  *
  * The routine silently does nothing when:
  *  - dst_addr differs from src_addr (entries are installed at the
  *    address they were found at, so a relocated destination range
  *    cannot be served), or
  *  - free pages are below the reserve or pv entries are above their
  *    high water mark.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
 	  vm_size_t len, vm_offset_t src_addr)
 {
 	vm_offset_t addr, end_addr;
 
 	/*
 	 * The loop below looks up and inserts at the same virtual address
 	 * in both pmaps.  Copying to a different destination address would
 	 * therefore install mappings at the wrong place, so decline.
 	 */
 	if (dst_addr != src_addr)
 		return;
 
 	end_addr = src_addr + len;
 	/*
 	 * Don't let optional prefaulting of pages make us go
 	 * way below the low water mark of free pages or way
 	 * above high water mark of used pv entries.
 	 */
 	if (cnt.v_free_count < cnt.v_free_reserved ||
 	    pv_entry_count > pv_entry_high_water)
 		return;
 
 	vm_page_lock_queues();
 	/* Always lock the pmap at the lower address first. */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) {
 		tte_t tte_data;
 		vm_page_t m;
 
 		tte_data = tte_hash_lookup(src_pmap->pm_hash, addr);
 
 		/* Only managed source mappings are copied. */
 		if ((tte_data & VTD_MANAGED) != 0) {
 			if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) {
 				m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
 
 				tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED));
 				dst_pmap->pm_stats.resident_count++;
 				pmap_insert_entry(dst_pmap, addr, m);
 			}
 		}
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  * Copy PAGE_SIZE bytes from page src to page dst, addressing both
  * through their TLB_PHYS_TO_DIRECT() virtual addresses.
  */
 void
 pmap_copy_page(vm_page_t src, vm_page_t dst)
 {
 	char *from, *to;
 
 	from = (char *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(src));
 	to = (char *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(dst));
 
 	novbcopy(from, to, PAGE_SIZE);
 }
 
 /*
  * Bookkeeping for a mapping that is about to be installed: count it as
  * wired if requested and, unless the page is fictitious or unmanaged,
  * put it on the pv lists and flag the TTE as VTD_MANAGED.
  */
 static __inline void
 pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired)
 {
 
 	if (wired)
 		pmap->pm_stats.wired_count++;
 
 	/* Fictitious and unmanaged pages get no pv entry. */
 	if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0)
 		return;
 
 	pmap_insert_entry(pmap, va, m);
 	*tte_data |= VTD_MANAGED;
 }
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  *
  * Outline:
  *  1. Remove whatever the hash currently holds for va and keep the old
  *     TTE (otte_data).
  *  2. Depending on whether there was no mapping, a mapping to another
  *     page, or a mapping to the same page, update the resident/wired
  *     counters and the pv lists.
  *  3. Build the new TTE bits from prot/wired, propagate the old
  *     referenced/modified state to the vm_page, and invalidate the old
  *     translation where the code decides it is needed.
  *  4. Insert the new TTE into the hash and the TSB, then resize the
  *     hash if it asks for it.
  *
  * Takes the page queues lock and the pmap lock for the whole operation.
  */
 void
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	   boolean_t wired)
 {
 	vm_paddr_t pa, opa;
 	uint64_t tte_data, otte_data;
 	vm_page_t om;
 	int invlva;
 
 	if (pmap->pm_context)
 		DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va, 
 			VM_PAGE_TO_PHYS(m), prot);
 
 	om = NULL;
 	
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 
 	/* The new TTE starts out as the bare physical address of m. */
 	tte_data = pa = VM_PAGE_TO_PHYS(m);
 	/* Pull the previous entry (if any) out of the hash. */
 	otte_data = tte_hash_delete(pmap->pm_hash, va);
 	opa = TTE_GET_PA(otte_data);
 
 	if (opa == 0) {
 		/*
 		 * This is a new mapping
 		 */
 		pmap->pm_stats.resident_count++;
 		pmap_add_tte(pmap, va, m, &tte_data, wired);
 
 	} else if (pa != opa) {
 		/*
 		 * Mapping has changed, handle validating new mapping.
 		 * 
 		 */
 		if (otte_data & VTD_WIRED)
 			pmap->pm_stats.wired_count--;
 
 		if (otte_data & VTD_MANAGED) {
 			om = PHYS_TO_VM_PAGE(opa);
 			pmap_remove_entry(pmap, om, va);
 		}
 
 		pmap_add_tte(pmap, va, m, &tte_data, wired);
 
 	} else /* (pa == opa) */ {
 		/*
 		 * Mapping has not changed, must be protection or wiring change.
 		 */
 
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if (wired && ((otte_data & VTD_WIRED) == 0))
 			pmap->pm_stats.wired_count++;
 		else if (!wired && (otte_data & VTD_WIRED))
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * We might be turning off write access to the page,
 		 * so we go ahead and sense modify status.
 		 */
 		if (otte_data & VTD_MANAGED) {
 			om = m;
 			tte_data |= VTD_MANAGED;
 		}
 	} 
 
 	/*
 	 * Now validate mapping with desired protection/wiring.
 	 */
 	if ((prot & VM_PROT_WRITE) != 0) {
 		tte_data |= VTD_SW_W; 
 		vm_page_flag_set(m, PG_WRITEABLE);
 	}
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		tte_data |= VTD_X;
 	if (wired)
 		tte_data |= VTD_WIRED;
 	if (pmap == kernel_pmap)
 		tte_data |= VTD_P;
 	
 	/*
 	 * If the old entry was valid and differs from the new one, push
 	 * its referenced/modified state to the old page and decide
 	 * whether the stale translation has to be shot down.
 	 */
 	invlva = FALSE;
 	if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) {
 		if (otte_data & VTD_V) {
 			if (otte_data & VTD_REF) {
 				if (otte_data & VTD_MANAGED) 
 					vm_page_flag_set(om, PG_REFERENCED);
 				/*
 				 * NOTE(review): the VTD_X comparison is made
 				 * on the physical addresses opa/pa rather
 				 * than on otte_data/tte_data; it looks like
 				 * the TTEs were intended -- confirm.
 				 */
 				if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X)))
 					invlva = TRUE;
 			}
 			if (otte_data & VTD_W) {
 				if (otte_data & VTD_MANAGED) 
 					vm_page_dirty(om);
 				/*
 				 * NOTE(review): this tests VTD_SW_W in the
 				 * physical address pa, not in tte_data;
 				 * presumably the intent was "the new mapping
 				 * is not writable" -- confirm.
 				 */
 				if ((pa & VTD_SW_W) != 0) 
 					invlva = TRUE;
 			}
 			if (invlva)
 				pmap_invalidate_page(pmap, va, TRUE);
 		}
 	} 
 
 
 	/* Publish the new entry; it is always inserted with VTD_REF set. */
 	tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF);
 	/*
 	 * XXX this needs to be locked for the threaded / kernel case 
 	 */
 	tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF, 
 		    pmap->pm_context);
 
 	if (tte_hash_needs_resize(pmap->pm_hash))
 		pmap_tte_hash_resize(pmap);
 
 	/*
 	 * 512 is an arbitrary number of tsb misses
 	 *
 	 * The leading "0 &&" keeps this TSB resize permanently disabled.
 	 */
 	if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512)
 		pmap_tsb_resize(pmap);
 
 	vm_page_unlock_queues();
 
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Enter a run of resident pages of one object into the pmap.
  *
  * m_start is mapped at `start'; every following page on the object's
  * list is mapped at start plus the distance (in pages) between its
  * pindex and m_start's pindex.  The walk stops at the end of the list
  * or at the first page whose address would reach `end'.  Holes in the
  * resident set simply stay unmapped.
  *
  * The object lock must be held by the caller; the pmap lock is taken
  * here for the duration of the walk.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, 
 		  vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t pg;
 	vm_pindex_t idx, npages;
 
 	VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED);
 	npages = atop(end - start);
 
 	PMAP_LOCK(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		idx = pg->pindex - m_start->pindex;
 		if (idx >= npages)
 			break;
 		pmap_enter_quick_locked(pmap, start + ptoa(idx), pg, prot);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Locking wrapper: take the pmap lock, hand the request to
  * pmap_enter_quick_locked(), and drop the lock again.
  */
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	PMAP_LOCK(pmap);
 	pmap_enter_quick_locked(pmap, va, m, prot);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Install a mapping for page m at va if, and only if, the hash has no
  * entry for va yet.  The entry is never made writable here: the only
  * protection bit honoured is VM_PROT_EXECUTE (VTD_X).  Managed pages
  * are also put on the pv lists.  The caller must hold the pmap lock.
  */
 static void
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	tte_t new_tte;
 
 	if (pmap->pm_context)
 		KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n", 
 			pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Never replace an existing mapping. */
 	if (tte_hash_lookup(pmap->pm_hash, va) != 0)
 		return;
 
 	new_tte = VM_PAGE_TO_PHYS(m);
 
 	/* Pages that are part of managed memory are tracked on pv lists. */
 	if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
 		pmap_insert_entry(pmap, va, m);
 		new_tte |= VTD_MANAGED;
 	}
 
 	pmap->pm_stats.resident_count++;
 
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		new_tte |= VTD_X;
 
 	tte_hash_insert(pmap->pm_hash, va, new_tte | TTE_MINFLAGS);
 }
 
 /*
  * Translate (pmap, va) to a physical address: the page frame comes
  * from the hash entry for va, the offset within the page from va
  * itself, masked by the page size recorded in the entry.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	tte_t tte;
 
 	tte = tte_hash_lookup(pmap->pm_hash, va);
 
 	return (TTE_GET_PA(tte) | (va & TTE_GET_PAGE_MASK(tte)));
 }
 
 /*
  * Look up the page mapped at (pmap, va) and place a hold on it, but
  * only if the mapping allows the requested access: a request that
  * includes VM_PROT_WRITE needs VTD_SW_W in the entry.  Returns the
  * held page, or NULL when there is no mapping or the access is not
  * permitted.  Runs under the page queues lock and the pmap lock.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	vm_page_t pg;
 	tte_t tte;
 
 	pg = NULL;
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	tte = tte_hash_lookup(pmap->pm_hash, va);
 	if (tte != 0) {
 		if ((prot & VM_PROT_WRITE) == 0 || (tte & VTD_SW_W) != 0) {
 			pg = PHYS_TO_VM_PAGE(TTE_GET_PA(tte));
 			vm_page_hold(pg);
 		}
 	}
 	vm_page_unlock_queues();
 	PMAP_UNLOCK(pmap);
 
 	return (pg);
 }
 
 /*
  * Allocate npages physically contiguous pages with the given
  * alignment, wire them, make sure they are zeroed, and return the
  * direct-mapped address of the first one.
  *
  * Every phys_avail[] segment is tried in turn; when none can satisfy
  * the request a message is printed and the caller sleeps in VM_WAIT
  * before trying again, so this function does not return on failure.
  */
 void *
 pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment)
 {
 	vm_page_t first, pg;
 	int seg, n;
 
 	first = NULL;
 	for (;;) {
 		for (seg = 0; phys_avail[seg + 1] != 0; seg += 2) {
 			first = vm_phys_alloc_contig(npages, phys_avail[seg], 
 						 phys_avail[seg + 1], alignment, (1UL<<34));
 			if (first != NULL)
 				break;
 		}
 		if (first != NULL)
 			break;
 
 		printf("vm_phys_alloc_contig failed - waiting to retry\n");
 		VM_WAIT;
 	}
 
 	/* Wire each page and zero the ones not already marked PG_ZERO. */
 	pg = first;
 	for (n = 0; n < npages; n++) {
 		pg->wire_count++;
 		if ((pg->flags & PG_ZERO) == 0)
 			pmap_zero_page(pg);
 		pg++;
 	}
 
 	return ((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(first)));
 }
 
 /*
  * Give back npages consecutive pages starting at the direct-mapped
  * address ptr: each page is unwired, the global wire count is
  * decremented, and the page is freed.
  */
 void
 pmap_free_contig_pages(void *ptr, int npages)
 {
 	vm_page_t pg;
 	int left;
 
 	pg = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr));
 	left = npages;
 	while (left-- > 0) {
 		pg->wire_count--;
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		vm_page_free(pg);
 		pg++;
 	}
 }
 
 /*
  * No-op in this pmap implementation: addr is ignored.
  */
 void 
 pmap_growkernel(vm_offset_t addr)
 {
 	return;
 }
 
 /*
  * Second-stage pmap initialization: fill the context stack, create
  * the context spin lock, set up the pv entry zone and its limits, and
  * initialize the TTE hash subsystem.
  */
 void 
 pmap_init(void)
 {
 
 	/* allocate pv_entry zones */
 	int shpgperproc = PMAP_SHPGPERPROC;
 
 	/*
 	 * Slot i of the context stack holds context number i; slot 0 is
 	 * skipped.  The loop leaves ctx_stack_top at PMAP_CONTEXT_MAX.
 	 */
 	for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++) 
 		ctx_stack[ctx_stack_top] = ctx_stack_top;
 
 	mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN);
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
 	pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, 
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	/* Both limits can be overridden by loader tunables. */
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	/* High water mark is 90% of the maximum. */
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 	uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max);
 
 	tte_hash_init();
 
 }
 
 /*
  * Create a pv entry for page at pa for
  * (pmap, va).
  *
  * The entry is linked onto both the pmap's pv list and the page's pv
  * list, and the page's pv count is incremented.  The pmap lock and the
  * page queues mutex must both be held (asserted below).
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t pv;
 
 	KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
 	pv = get_pv_entry(pmap);
 	pv->pv_va = va;
 	pv->pv_pmap = pmap;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count++;
 }
 
 #ifdef TRAP_TRACING
 /* Set once pmap_ipi() has dumped trap traces, so it is done only once. */
 static int trap_trace_report_done;
 #endif
 
 #ifdef SMP
 /*
  * Send the cross-call `func' with arguments arg1/arg2 to the other
  * CPUs and spin until every targeted CPU has acknowledged.
  *
  * Returns the mask of CPUs that were targeted (0 when SMP has not been
  * started or there are no other CPUs).  Must be entered with pil == 14;
  * anything else panics.
  *
  * If a target fails to acknowledge within the spin budget, diagnostics
  * are printed, the IPI is re-sent once, and a second timeout panics.
  */
 static cpumask_t
 pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2)
 {
 
 	int i, cpu_count, retried;
 	u_int cpus;
 	cpumask_t cpumask, active, curactive;
 	cpumask_t active_total, ackmask;
 	uint16_t *cpulist;
 
 	retried = 0;
 
 	if (!smp_started)
 		return (0);
 
 	cpumask = PCPU_GET(cpumask);
 	cpulist = PCPU_GET(cpulist);
 	curactive = 0;
 
 	if (rdpr(pil) != 14)
 		panic("pil %ld != 14", rdpr(pil));
 
 #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED
 	/* by definition cpumask should have curcpu's bit set */
 	if (cpumask != (1 << curcpu)) 
 		panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n", 
 		      cpumask, (1 << curcpu));
 
 #endif
 	/*
 	 * With "notyet" undefined the block below is compiled out, so the
 	 * initial target set is always every other CPU regardless of
 	 * pm_tlbactive.
 	 */
 #ifdef notyet
 	if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0)
 		goto done;
 
 	if (pmap->pm_context != 0)
 		active_total = active = (pmap->pm_tlbactive & ~cpumask);
 	else 
 #endif
 		active_total = active = PCPU_GET(other_cpus);
 
 	if (active == 0)
 		goto done;
 	
  retry:
 	
 	/*
 	 * Expand the `active' bitmask into cpulist[] and record the
 	 * targeted bits in curactive.
 	 */
 	for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) {
 		if ((cpus & 0x1) == 0)
 			continue;
 		
 		curactive |= (1 << i);
 		cpulist[cpu_count] = (uint16_t)i;
 		cpu_count++;
 	}
 
 	ackmask = 0;
 	cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1, 
 			 (uint64_t)arg2, (uint64_t *)&ackmask);
 
 	/*
 	 * Spin until ackmask equals the targeted set.  `i' is reused as
 	 * the spin counter and continues from the value the loop above
 	 * left it at.
 	 * NOTE(review): presumably each target sets its own bit in
 	 * ackmask -- confirm against cpu_ipi_selected().
 	 */
 	while (ackmask != curactive) {
 		membar(Sync);
 		i++;
 		if (i > 10000000) {
 #ifdef TRAP_TRACING
 			int j;
 #endif
 			uint64_t cpu_state;
 			printf("cpu with cpumask=0x%x appears to not be responding to ipis\n",
 			       curactive & ~ackmask);
 
 #ifdef TRAP_TRACING
 			if (!trap_trace_report_done) {
 				trap_trace_report_done = 1;
 				for (j = 0; j < MAXCPU; j++)
 					if (((1 << j) & curactive & ~ackmask) != 0) {
 						struct pcpu *pc = pcpu_find(j);
 						printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n",
 						    pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3],
 						    pc->pad[4], pc->pad[5], pc->pad[6]);
 						trap_trace_report(j);
 					}
 			}
 #endif
 
 			hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state);
 			printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state);
 			if (!retried) {
 				printf("I'm going to send off another ipi just to confirm that it isn't a memory barrier bug\n"
 			       "and then I'm going to panic\n");
 
 				retried = 1;
 				goto retry;
 			}
 
 			panic(" ackmask=0x%x active=0x%x\n", ackmask, curactive);
 		}
 	}
 
 	/*
 	 * CPUs that became TLB-active for this pmap while we were waiting
 	 * have not been covered yet; go round again for just those.
 	 */
 	active_total |= curactive;
 	if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) {
 		printf("pmap_ipi: retrying");
 		goto retry;
 	}
  done:
 	return (active_total);
 }
 #endif
 
 /*
  * Invalidate the translation for va in pmap's context.
  *
  * When cleartsb is TRUE the TSB entry for va is cleared first.  The
  * local invlpg() and, on SMP, the tl_invlpg cross-call are issued
  * inside a spinlock_enter()/spinlock_exit() section.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb)
 {
 
 	if (cleartsb == TRUE)
 		tsb_clear_tte(&pmap->pm_tsb, va);
 
 	DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va);
 	spinlock_enter();
 	invlpg(va, pmap->pm_context);
 #ifdef SMP
 	pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context);
 #endif
 	spinlock_exit();
 }
 
 /*
  * Invalidate the translations for [sva, eva) in pmap's context.
  *
  * A one-page range is handed to pmap_invalidate_page().  Otherwise,
  * when cleartsb is TRUE the matching TSB entries are cleared, and then:
  *  - ranges shorter than 64 pages are invalidated page by page
  *    (remote CPUs get tl_invlrng);
  *  - longer ranges flush the whole context (tl_invlctx), or the whole
  *    TLB when the context is 0 (tl_invltlb).
  *
  * On SMP the range is passed to the other CPUs packed as
  * sva | page-count, and CPUs that were IPI'd but are not in pm_active
  * are removed from pm_tlbactive afterwards.
  *
  * sva must be below eva (asserted).
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb)
 {
 	vm_offset_t tva, invlrngva;
 	char *func;
 #ifdef SMP
 	cpumask_t active;
 #endif
 	if ((eva - sva) == PAGE_SIZE) {
 		pmap_invalidate_page(pmap, sva, cleartsb);
 		return;
 	}
 	
 
 	KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva));
 
 	if (cleartsb == TRUE) 
 		tsb_clear_range(&pmap->pm_tsb, sva, eva);
 
 	spinlock_enter();
 	/*
 	 * The length is eva - sva.  The operands used to be the other way
 	 * round, which wraps to a huge unsigned value for every valid
 	 * range and made this per-page branch unreachable.
 	 */
 	if ((eva - sva) < PAGE_SIZE*64) {
 		for (tva = sva; tva < eva; tva += PAGE_SIZE_8K)
 			invlpg(tva, pmap->pm_context);
 		func = tl_invlrng;
 	} else if (pmap->pm_context) {
 		func = tl_invlctx;
 		invlctx(pmap->pm_context);
 
 	} else {
 		func = tl_invltlb;
 		invltlb();
 	}
 #ifdef SMP
 	/* Page count rides in the low (page-offset) bits of sva. */
 	invlrngva = sva | ((eva - sva) >> PAGE_SHIFT);
 	active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva);
 	active &= ~pmap->pm_active;
 	atomic_clear_int(&pmap->pm_tlbactive, active);
 #endif
 	spinlock_exit();
 }
 
 /*
  * Flush every translation belonging to pmap's context.
  *
  * Not valid for kernel_pmap (asserted).  The pmap's TSB is cleared,
  * the context is invalidated locally and, on SMP, on the other CPUs
  * via tl_invlctx; pm_tlbactive is then reset to pm_active.
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap"));
 
 	tsb_clear(&pmap->pm_tsb);
 
 	spinlock_enter();
 	invlctx(pmap->pm_context);
 #ifdef SMP
 	pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0);
 	pmap->pm_tlbactive = pmap->pm_active;
 #endif
 	spinlock_exit();
 }
 
 /*
  * Report the result of tte_get_phys_bit() for VTD_W on page m.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	return (tte_get_phys_bit(m, VTD_W));
 }
 
 
 /*
  * A va is prefaultable when the pmap's hash has no entry for it.
  */
 boolean_t 
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va)
 {
 	return (tte_hash_lookup(pmap->pm_hash, va) == 0);
 }
 
 /*
  * Translate a kernel virtual address to a physical address.
  *
  * Three sources are consulted in order, and the first one that yields
  * a non-zero physical address wins:
  *  1. the nucleus_mappings[] table, for addresses above KERNBASE and
  *     below KERNBASE + nucleus_memory (4M granularity);
  *  2. the TSB, via tsb_lookup_tte(va, 0);
  *  3. the kernel pmap's TTE hash.
  * Returns 0 when none of them has a translation.
  */
 
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	tte_t tte;
 	vm_paddr_t pa;
 
 	if (va > KERNBASE && va < KERNBASE + nucleus_memory) {
 		uint64_t nucleus_off;
 
 		nucleus_off = va - KERNBASE;
 		pa = nucleus_mappings[nucleus_off >> 22] | (va & PAGE_MASK_4M);
 		if (pa != 0)
 			return pa;
 	}
 
 	tte = tsb_lookup_tte(va, 0);
 	if (tte != 0) {
 		pa = TTE_GET_PA(tte) | (va & TTE_GET_PAGE_MASK(tte));
 		if (pa != 0)
 			return pa;
 	}
 
 	tte = tte_hash_lookup(kernel_pmap->pm_hash, va);
 	if (tte == 0)
 		return 0;
 
 	pa = TTE_GET_PA(tte) | (va & TTE_GET_PAGE_MASK(tte));
 	return pa;
 }
 
 /*
  * Map a range of physical addresses into kernel virtual address space.
  *
  * The value passed in *virt is a suggested virtual address for the mapping.
  * Architectures which can support a direct-mapped physical to virtual region
  * can return the appropriate address within that region, leaving '*virt'
  * unchanged.
  *
  * This implementation takes that route: it returns the direct-map
  * address of `start' and does not use virt, end or prot.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	return TLB_PHYS_TO_DIRECT(start);
 }
 
 /*
  * Stub: always returns 0; pmap and addr are ignored.
  */
 int 
 pmap_mincore(pmap_t pmap, vm_offset_t addr)
 {
 	return (0);
 }
 
 /*
  * Stub: prints its own name and creates no mappings; all arguments
  * are ignored.
  */
 void 
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, 
 		    vm_pindex_t index, vm_size_t size)
 {
 	printf("pmap_object_init_pt\n");
 	return;
 }
 
 /*
  * Cheap membership test: does `pmap' own one of the first 16 pv
  * entries on page m's pv list?  Only a bounded prefix of the list is
  * examined, so FALSE does not prove the page is unmapped in pmap; the
  * bound may be tuned, it only has to keep page aging working.
  * Fictitious pages always yield FALSE.  The page queues mutex must be
  * held.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	pv_entry_t pv;
 	int budget;
 
 	if (m->flags & PG_FICTITIOUS)
 		return FALSE;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	budget = 16;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 		if (pv->pv_pmap == pmap)
 			return TRUE;
 		if (--budget <= 0)
 			break;
 	}
 	return (FALSE);
 }
 
 /*
  * Initialize a vm_page's machine-dependent fields.
  *
  * The page starts with an empty pv list and a pv count of zero.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
 	m->md.pv_list_count = 0;
 }
 /*
  * Revoke write permission from every mapping of page m.
  *
  * Pages not flagged PG_WRITEABLE are left untouched.  For the others
  * both VTD_SW_W and VTD_W are cleared through tte_clear_phys_bit() and
  * PG_WRITEABLE is dropped; the page queues mutex must be held in that
  * case.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	if ((m->flags & PG_WRITEABLE) != 0) {
 		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 		tte_clear_phys_bit(m, VTD_SW_W|VTD_W);
 		vm_page_flag_clear(m, PG_WRITEABLE);
 	}
 }
 /*
  * Initialize the pmap associated with process 0.
  *
  * This pmap uses context 0 and shares the kernel pmap's TSB real
  * address and TTE hash.  It is marked active on all CPUs and installed
  * as the current CPU's curpmap.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_active = pmap->pm_tlbactive = ~0;
 	pmap->pm_context = 0;
 	pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra;
 	pmap->pm_hash = kernel_pmap->pm_hash;
 	/* The per-CPU update is done inside a critical section. */
 	critical_enter();
 	PCPU_SET(curpmap, pmap);
 	critical_exit();
 	TAILQ_INIT(&pmap->pm_pvlist);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure, such as one in a
  * vmspace structure.
  *
  * A fresh context is obtained, the TTE hash and the initial TSB are
  * created under the page queues lock, and the counters, CPU masks,
  * old-TSB slots, pv list, lock and statistics are reset.  In the
  * revision added by this diff the function returns 1 unconditionally.
  */
-void
+int
 pmap_pinit(pmap_t pmap)
 {
 	int i;
 
 	pmap->pm_context = get_context();
 	pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb);
 
 	vm_page_lock_queues();
 	pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch);
 	tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT);
 	vm_page_unlock_queues();
 	pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0;
 	pmap->pm_active = pmap->pm_tlbactive = 0;
 	/* No superseded TSBs yet. */
 	for (i = 0; i < TSB_MAX_RESIZE; i++)
 		pmap->pm_old_tsb_ra[i] = 0;
 
 	TAILQ_INIT(&pmap->pm_pvlist);
 	PMAP_LOCK_INIT(pmap);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+	return (1);
 }
 
 /*
  * Set the physical protection on the specified range of this map as requested.
  *
  * - Without VM_PROT_READ the range is removed altogether.
  * - With both VM_PROT_WRITE and VM_PROT_EXECUTE there is nothing to
  *   take away, so the function returns at once.
  * - Otherwise the write bits (VTD_W|VTD_SW_W) and/or VTD_X are cleared
  *   from every entry in [sva, eva), the old referenced/modified state
  *   of managed pages is passed on to the vm_page, and the range is
  *   invalidated if any entry had VTD_W set.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 
 	int anychanged;
 	vm_offset_t tva;
 	uint64_t clearbits;
 
 	DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot);
 	
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 	
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == 
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 
 	clearbits = anychanged = 0;
 	
 	/* Collect the TTE bits that the new protection no longer allows. */
 	if ((prot & VM_PROT_WRITE) == 0)
 		clearbits |= (VTD_W|VTD_SW_W);
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		clearbits |= VTD_X;
 
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	for (tva = sva; tva < eva; tva += PAGE_SIZE) {
 		uint64_t otte_data;
 		vm_page_t m;
 
 		/* A zero return is treated as "no mapping at tva". */
 		if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva, 
 						     clearbits)) == 0)
 			continue;
 		/*
 		 * XXX technically we should do a shootdown if it 
 		 * was referenced and was executable - but is not now
 		 */
 		if (!anychanged && (otte_data & VTD_W))
 			anychanged = 1;
 		
 		if (otte_data & VTD_MANAGED) {
 			m = NULL;
 
 			if (otte_data & VTD_REF) {
 				m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
 				vm_page_flag_set(m, PG_REFERENCED);
 			}
 			if (otte_data & VTD_W) {
 				m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data));
 				vm_page_dirty(m);
 			}
 		} 
 	}
 
 	vm_page_unlock_queues();
 	/* The invalidation is issued with the pmap lock still held. */
 	if (anychanged)
 		pmap_invalidate_range(pmap, sva, eva, TRUE);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Enter `count' wired pages into consecutive kernel virtual pages
  * starting at sva.  Meant for short-lived mappings that need no
  * modified/referenced tracking; whatever was mapped there before is
  * replaced.  The range is invalidated only if at least one of the
  * replaced entries had VTD_REF set.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
 {
 	vm_offset_t va;
 	tte_t prev;
 	int i;
 
 	prev = 0;
 	va = sva;
 	for (i = 0; i < count; i++) {
 		prev |= tte_hash_update(kernel_pmap->pm_hash, va,  
 					VM_PAGE_TO_PHYS(m[i]) | TTE_KERNEL | VTD_8K);
 		va += PAGE_SIZE;
 	}
 	if ((prev & VTD_REF) != 0)
 		pmap_invalidate_range(kernel_pmap, sva, va, FALSE);
 }
 
 /*
  * Tear down `count' consecutive kernel mappings starting at sva, as
  * set up by pmap_qenter.  The range (including its TSB entries) is
  * invalidated only if at least one removed entry had VTD_REF set.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 	tte_t prev;
 	int i;
 
 	prev = 0;
 	va = sva;
 	for (i = 0; i < count; i++) {
 		prev |= tte_hash_delete(kernel_pmap->pm_hash, va);
 		va += PAGE_SIZE;
 	}
 	if ((prev & VTD_REF) != 0)
 		pmap_invalidate_range(kernel_pmap, sva, va, TRUE);
 }
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * Tears down the TSB, the TTE hash, the context number and the pmap
  * lock, in that order.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 
 	tsb_deinit(&pmap->pm_tsb);
 	tte_hash_destroy(pmap->pm_hash);
 	free_context(pmap->pm_context);
 	PMAP_LOCK_DESTROY(pmap);
 }
 
 /*
  * Remove every mapping in [start, end) from the given pmap.
  *
  * Each page-sized slot is deleted from the hash; slots that held an
  * entry go through pmap_remove_tte() for the bookkeeping.  The range
  * is invalidated afterwards if any removed entry had been referenced
  * or modified.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end)
 {
 	vm_offset_t va;
 	uint64_t old_tte;
 	int need_flush;
 
 	/*
 	 * Unlocked peek at the resident count; an empty pmap needs no work.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n", 
 		start, end);
 	need_flush = 0;
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	for (va = start; va < end; va += PAGE_SIZE) {
 		old_tte = tte_hash_delete(pmap->pm_hash, va);
 		if (old_tte != 0) {
 			pmap_remove_tte(pmap, old_tte, va);
 			if ((old_tte & (VTD_REF|VTD_W)) != 0)
 				need_flush = 1;
 		}
 	}
 	vm_page_unlock_queues();
 	if (need_flush)
 		pmap_invalidate_range(pmap, start, end, TRUE);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page queues mutex must be held on entry; each
  *		owning pmap is locked only while its mapping is
  *		being torn down.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	pv_entry_t pv;
 	uint64_t tte_data;
 	DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m));
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/* Consume the page's pv list from the head until it is empty. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		PMAP_LOCK(pv->pv_pmap);
 		pv->pv_pmap->pm_stats.resident_count--;
 
 		tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va);
 
 		if (tte_data & VTD_WIRED)
 			pv->pv_pmap->pm_stats.wired_count--;
 		if (tte_data & VTD_REF)
 			vm_page_flag_set(m, PG_REFERENCED);
 		
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (tte_data & VTD_W) {
 			KASSERT((tte_data & VTD_SW_W),
 	("pmap_remove_all: modified page not writable: va: %lx, tte: %lx",
 			    pv->pv_va, tte_data));
 			vm_page_dirty(m);
 		}
 	
 		pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE);
 		/* Unlink the pv entry from both lists before freeing it. */
 		TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		m->md.pv_list_count--;
 		PMAP_UNLOCK(pv->pv_pmap);
 		free_pv_entry(pv);
 	}
 	/* No mappings remain, so the page cannot be writeable any more. */
 	vm_page_flag_clear(m, PG_WRITEABLE);
 }
 
 /*
  * Find and free the pv entry that records the mapping of page m at
  * (pmap, va), unlinking it from both the page's and the pmap's pv
  * list.  PG_WRITEABLE is cleared on the page once its pv list is
  * empty.  The pmap lock and the page queues mutex must be held.
  */
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
 	pv_entry_t pv;
 	if (pmap != kernel_pmap)
 		DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/*
 	 * Choose which list to search by comparing the page's pv count
 	 * with the pmap's resident count -- presumably to walk the shorter
 	 * one.  The per-pmap search matches on va alone.
 	 */
 	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
 		TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
 			if (pmap == pv->pv_pmap && va == pv->pv_va) 
 				break;
 		}
 	} else {
 		TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
 			if (va == pv->pv_va) 
 				break;
 		}
 	}
 	KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m)));
 	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 	m->md.pv_list_count--;
 	if (TAILQ_EMPTY(&m->md.pv_list))
 		vm_page_flag_clear(m, PG_WRITEABLE);
 	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 	free_pv_entry(pv);
 }
 
 
 /*
  * Remove every mapping recorded on the pmap's pv list.
  *
  * Each pv entry's hash entry is deleted, the modified state is passed
  * to the page, and the pv entry is unlinked and freed.  Afterwards the
  * TTE hash is reset and the whole context is invalidated.  A pv entry
  * without a hash entry, or a wired mapping, is a panic.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	
 	vm_page_t m;
 	pv_entry_t pv, npv;
 	tte_t tte_data;
 	
 	DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context);
 	vm_page_lock_queues();
 	PMAP_LOCK(pmap);
 	/* npv is fetched before pv is freed so the walk can continue. */
 	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
 		tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va);
 
 		if (tte_data == 0) {
 			printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va);
 			panic("bad tte");
 		}
 		if (tte_data & VTD_WIRED) {
 			panic("wired page in process not handled correctly");
 			/* Not reached: the panic above precedes it. */
 			pmap->pm_stats.wired_count--;
 		}
 		m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
 
 		pmap->pm_stats.resident_count--;
 		
 		if (tte_data & VTD_W) {
 			vm_page_dirty(m);
 		}
 		
 		npv = TAILQ_NEXT(pv, pv_plist);
 		TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
 		
 		m->md.pv_list_count--;
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 		if (TAILQ_EMPTY(&m->md.pv_list))
 			vm_page_flag_clear(m, PG_WRITEABLE);
 
 		free_pv_entry(pv);
 	}
 	pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch);
 	/* The TSB reset is disabled by the constant-false condition. */
 	if (0)
 		pmap_tsb_reset(pmap);
 
 	vm_page_unlock_queues();
 	pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Shrink the pmap's TSB back to its initial size.
  *
  * The superseded TSBs recorded in pm_old_tsb_ra[1..] are freed.  If
  * slot 0 holds an address, that TSB is reinstated as the current one
  * (with the initial entry count and scratchpad value) and the TSB that
  * was current until now is freed.
  */
 static void
 pmap_tsb_reset(pmap_t pmap)
 {
 	int i;
 
 	/* Slot i is freed as 1 << (TSB_INIT_SHIFT + i) pages. */
 	for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) {
 		pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]), 
 				       (1 << (TSB_INIT_SHIFT + i)));
 		pmap->pm_old_tsb_ra[i] = 0;
 	}
 	if (pmap->pm_old_tsb_ra[0] != 0) {
 		/* Remember the current TSB so it can be freed after the swap. */
 		vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra;
 		int size = tsb_size(&pmap->pm_tsb);
 		pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT));
 		pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0];
 		pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size);
 		pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT;
 		pmap->pm_old_tsb_ra[0] = 0;
 	}
 }
 
 /*
  * Scrub `size' bytes of physical memory starting at pa by calling
  * hv_mem_scrub() repeatedly, advancing by the byte count each call
  * reports until the whole range is covered.
  *
  * NOTE(review): the return value of hv_mem_scrub() is ignored and
  * bytes_zeroed is not initialized; if a call can fail without writing
  * bytes_zeroed, the loop advances by an indeterminate amount -- confirm
  * against the hypervisor API and decide on a failure policy.
  */
 void
 pmap_scrub_pages(vm_paddr_t pa, int64_t size)
 {
 	uint64_t bytes_zeroed;
 	while (size > 0) {
 		hv_mem_scrub(pa, size, &bytes_zeroed);
 		pa += bytes_zeroed;
 		size -= bytes_zeroed;
 	}
 }
 
 /*
  * Bookkeeping for a TTE that has already been taken out of the hash:
  * adjust the wired and resident counters and, for managed pages, pass
  * the modified/referenced state to the vm_page and drop the pv entry.
  * The page queues mutex and the pmap lock must be held.
  */
 static void
 pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va)
 {
 	vm_page_t pg;
 
 	if (pmap != kernel_pmap)
 		DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data));
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if ((tte_data & VTD_WIRED) != 0)
 		pmap->pm_stats.wired_count--;
 	pmap->pm_stats.resident_count--;
 
 	/* Unmanaged mappings have no vm_page state or pv entry to update. */
 	if ((tte_data & VTD_MANAGED) == 0)
 		return;
 
 	pg = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data));
 	if ((tte_data & VTD_W) != 0)
 		vm_page_dirty(pg);
 	if ((tte_data & VTD_REF) != 0)
 		vm_page_flag_set(pg, PG_REFERENCED);
 	pmap_remove_entry(pmap, pg, va);
 }
 
 /*
  * Resize the tsb if the number of capacity misses is greater than 1/2
  * of the total misses (cap_miss_count > miss_count >> 1) and the TSB
  * has not yet reached TSB_INIT_SHIFT + TSB_MAX_RESIZE.
  *
  * The current TSB's real address is remembered in pm_old_tsb_ra[], a
  * TSB of twice the size is initialized and installed with the
  * hypervisor, and on SMP the other CPUs are told through tl_tsbupdate.
  * Both miss counters are reset whether or not a resize took place.
  * Must be called on the current thread's pmap (asserted).
  */  
 static void
 pmap_tsb_resize(pmap_t pmap)
 {
 	uint32_t miss_count;
 	uint32_t cap_miss_count;
 	struct tsb_resize_info info;
 	hv_tsb_info_t hvtsb;
 	uint64_t tsbscratch;
 
 	KASSERT(pmap == curthread_pmap, ("operating on non-current pmap"));
 	miss_count = pmap->pm_tsb_miss_count;
 	cap_miss_count = pmap->pm_tsb_cap_miss_count;
 	int npages_shift = tsb_page_shift(pmap);
 
 	if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) && 
 	    cap_miss_count > (miss_count >> 1)) {
 		DPRINTF("resizing tsb for proc=%s pid=%d\n", 
 			curthread->td_proc->p_comm, curthread->td_proc->p_pid);
 		/* Keep the outgoing TSB so pmap_tsb_reset() can find it. */
 		pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra;
 
 		/* double TSB size */
 		tsb_init(&hvtsb, &tsbscratch, npages_shift + 1);
 #ifdef SMP
 		spinlock_enter();
 		/* reset tsb */
 		bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t));
 		pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
 
 		if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
 			panic("failed to set TSB 0x%lx - context == %ld\n", 
 			      pmap->pm_tsb_ra, pmap->pm_context);
 		/*
 		 * `info' lives on this stack frame and is handed to the
 		 * other CPUs by physical address.
 		 */
 		info.tri_tsbscratch = pmap->pm_tsbscratch;
 		info.tri_tsb_ra = pmap->pm_tsb_ra;
 		pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info));
 		pmap->pm_tlbactive = pmap->pm_active;
 		spinlock_exit();
 #else 
 		bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb));
 		if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK)
 			panic("failed to set TSB 0x%lx - context == %ld\n", 
 			      pmap->pm_tsb_ra, pmap->pm_context);
 		pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb);
 #endif		
 	}
 	pmap->pm_tsb_miss_count = 0;
 	pmap->pm_tsb_cap_miss_count = 0;
 }
 
 /*
  * Replace the pmap's TTE hash with the one returned by
  * tte_hash_resize() and destroy the old hash afterwards.  Other CPUs
  * are notified through tl_ttehashupdate only when the current process
  * has more than one thread.
  *
  * NOTE(review): the IPI passes pm_hashscratch before the assignment
  * that follows it updates that field -- confirm that tte_hash_resize()
  * has already stored the new scratchpad value by then.
  */
 static void
 pmap_tte_hash_resize(pmap_t pmap)
 {
 	tte_hash_t old_th = pmap->pm_hash;
 	
 	pmap->pm_hash = tte_hash_resize(pmap->pm_hash);
 	spinlock_enter();
 	if (curthread->td_proc->p_numthreads != 1) 
 		pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch);
 
 	pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context);	
 	spinlock_exit();
 	tte_hash_destroy(old_th);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	XXX: The exact number of bits to check and clear is a matter that
  *	should be tested and standardized at some point in the future for
  *	optimal aging of shared pages.
  *
  *	In this implementation the walk ends once the count exceeds 4,
  *	so at most 5 is returned.  Fictitious pages always yield 0.  The
  *	page queues mutex must be held.
  */
 
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	
 	int rv;
 	pv_entry_t pv, pvf, pvn;
 	pmap_t pmap;
 	tte_t otte_data;
 
 	rv = 0;
 	if (m->flags & PG_FICTITIOUS)
 		return (rv);
 
         mtx_assert(&vm_page_queue_mtx, MA_OWNED);
         if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		
 		/* Remember the first entry so the walk stops after one lap. */
 		pvf = pv;
 
 		do {
                         pvn = TAILQ_NEXT(pv, pv_list);
 			
 			/* Rotate the visited entry to the tail of the list. */
                         TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
 			
                         TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
 			
                         pmap = pv->pv_pmap;
                         PMAP_LOCK(pmap);
 			otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF);
 			if ((otte_data & VTD_REF) != 0) {
                                 pmap_invalidate_page(pmap, pv->pv_va, TRUE);
 				
                                 rv++;
                                 if (rv > 4) {
                                         PMAP_UNLOCK(pmap);
                                         break;
                                 }
 			}
 		
 			PMAP_UNLOCK(pmap);
 		} while ((pv = pvn) != NULL && pv != pvf);
 	}
 	return (rv);
 }
 
 /*
  * Zero the whole of page m through its direct-mapped address using
  * hwblkclr().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
 }
 
 /*
  * Zero `size' bytes of page m starting at byte offset `off'.  A
  * request that covers the whole page goes through hwblkclr(); every
  * other request is done with bzero() on the direct-mapped address.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	vm_offset_t base;
 
 	base = TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE) {
 		bzero((char *)(base + off), size);
 		return;
 	}
 	hwblkclr((void *)base, PAGE_SIZE);
 }
 
 /*
  * Zero the whole of page m with hwblkclr(); the body is the same as
  * pmap_zero_page().
  */
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 	hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE);
 }
 
 /*
  * Panic with a message that reports the pmap's context, the TSB real
  * address, the pmap's TSB scratchpad value and the error code passed
  * in.
  */
 void
 pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap)
 {
 	panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx",
 	      pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error);
 	
 }
Index: head/sys/sys/proc.h
===================================================================
--- head/sys/sys/proc.h	(revision 173360)
+++ head/sys/sys/proc.h	(revision 173361)
@@ -1,916 +1,917 @@
 /*-
  * Copyright (c) 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)proc.h	8.15 (Berkeley) 5/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PROC_H_
 #define	_SYS_PROC_H_
 
 #include <sys/callout.h>		/* For struct callout. */
 #include <sys/event.h>			/* For struct klist. */
 #ifndef _KERNEL
 #include <sys/filedesc.h>
 #endif
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/priority.h>
 #include <sys/rtprio.h>			/* XXX. */
 #include <sys/runq.h>
 #include <sys/resource.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #ifndef _KERNEL
 #include <sys/time.h>			/* For structs itimerval, timeval. */
 #else
 #include <sys/pcpu.h>
 #endif
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <machine/proc.h>		/* Machine-dependent proc substruct. */
 
 /*
  * One structure allocated per session.
  *
  * List of locks
  * (m)		locked by s_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  *
  * A field tagged with two letters, e.g. "(m + e)", names both locks.
  */
 struct session {
 	int		s_count;	/* (m) Ref cnt; pgrps in session. */
 	struct proc	*s_leader;	/* (m + e) Session leader. */
 	struct vnode	*s_ttyvp;	/* (m) Vnode of controlling tty. */
 	struct tty	*s_ttyp;	/* (m) Controlling tty. */
 	pid_t		s_sid;		/* (c) Session ID. */
 					/* (m) Setlogin() name: */
 	/* Buffer size is MAXLOGNAME rounded up to a multiple of sizeof(long). */
 	char		s_login[roundup(MAXLOGNAME, sizeof(long))];
 	struct mtx	s_mtx;		/* Mutex to protect members; the (m) lock. */
 };
 
 /*
  * One structure allocated per process group.
  *
  * List of locks
  * (m)		locked by pg_mtx mtx
  * (e)		locked by proctree_lock sx
  * (c)		const until freeing
  *
  * A field tagged with two letters, e.g. "(m + e)", names both locks.
  */
 struct pgrp {
 	LIST_ENTRY(pgrp) pg_hash;	/* (e) Hash chain. */
 	LIST_HEAD(, proc) pg_members;	/* (m + e) Pointer to pgrp members. */
 	struct session	*pg_session;	/* (c) Pointer to session. */
 	struct sigiolst	pg_sigiolst;	/* (m) List of sigio sources. */
 	pid_t		pg_id;		/* (c) Process group id. */
 	int		pg_jobc;	/* (m) Job control process count. */
 	struct mtx	pg_mtx;		/* Mutex to protect members; the (m) lock. */
 };
 
 /*
  * pargs, used to hold a copy of the command line, if it had a sane length.
  *
  * NOTE(review): ar_args is declared with a single element; presumably the
  * structure is allocated with extra trailing space so that ar_args can hold
  * ar_length bytes -- confirm against the allocator.
  */
 struct pargs {
 	u_int	ar_ref;		/* Reference count. */
 	u_int	ar_length;	/* Length. */
 	u_char	ar_args[1];	/* Arguments. */
 };
 
 /*-
  * Description of a process.
  *
  * This structure contains the information needed to manage a thread of
  * control, known in UN*X as a process; it has references to substructures
  * containing descriptions of things that the process uses, but may share
  * with related processes.  The process structure and the substructures
  * are always addressable except for those marked "(CPU)" below,
  * which might be addressable only on a processor on which the process
  * is running.
  *
  * Below is a key of locks used to protect each member of struct proc.  The
  * lock is indicated by a reference to a specific character in parens in the
  * associated comment.
  *      * - not yet protected
  *      a - only touched by curproc or parent during fork/wait
  *      b - created at fork, never changes
  *		(exception aiods switch vmspaces, but they are also
  *		marked 'P_SYSTEM' so hopefully it will be left alone)
  *      c - locked by proc mtx
  *      d - locked by allproc_lock lock
  *      e - locked by proctree_lock lock
  *      f - session mtx
  *      g - process group mtx
  *      h - callout_lock mtx
  *      i - by curproc or the master session mtx
  *      j - locked by proc slock
  *      k - only accessed by curthread
  *	k*- only accessed by curthread and from an interrupt
  *      l - the attaching proc or attaching proc parent
  *      m - Giant
  *      n - not locked, lazy
  *      o - ktrace lock
  *      p - select lock (sellock)
  *      q - td_contested lock
  *      r - p_peers lock
  *      t - thread lock
  *      x - created at fork, only changes during single threading in exec
  *      z - zombie threads lock
  *
  * If the locking key specifies two identifiers (for example, p_pptr) then
  * either lock is sufficient for read access, but both locks must be held
  * for write access.
  */
 /*
  * Forward declarations of structure types that the definitions below
  * refer to (by pointer or as a list element type) without needing their
  * full layout.
  */
 struct kaudit_record;
 struct td_sched;
 struct nlminfo;
 struct kaioinfo;
 struct p_sched;
 struct proc;
 struct sleepqueue;
 struct thread;
 struct trapframe;
 struct turnstile;
 struct mqueue_notifier;
 
 /*
  * Here we define the two structures used for process information.
  *
  * The first is the thread. It might be thought of as a "Kernel
  * Schedulable Entity Context".
  * This structure contains all the information as to where a thread of
  * execution is now, or was when it was suspended, why it was suspended,
  * and anything else that will be needed to restart it when it is
  * rescheduled. It includes a scheduler specific substructure that is different
  * for each scheduler.
  *
  * M:N notes.
  * It is important to remember that when using M:N threading, 
  * a particular thread structure may only exist as long as
  * the system call or kernel entrance (e.g. by pagefault)
  * which it is currently executing. It should therefore NEVER be referenced
  * by pointers in long lived structures that live longer than a single
  * request. If several threads complete their work at the same time,
  * they will all rewind their stacks to the user boundary, report their
  * completion state, and all but one will be freed. That last one will
  * be kept to provide a kernel stack and pcb for the NEXT syscall or kernel
  * entrance (basically to save freeing and then re-allocating it).  The existing
  * thread keeps a cached spare thread available to allow it to quickly
  * get one when it needs a new one. There is also a system
  * cache of free threads. Threads have priority and partake in priority
  * inheritance schemes.
  *
  * The second is the proc (process) which owns all the resources of a process
  * other than CPU cycles, which are parceled out to the threads.
  */
 
 /*
  * Kernel runnable context (thread).
  * This is what is put to sleep and reactivated.
  * Thread context.  Processes may have multiple threads.
  *
  * The letters in parentheses in the field comments refer to the locking
  * key given earlier in this file.
  *
  * Field order matters: td_startzero/td_endzero and td_startcopy/td_endcopy
  * are aliases for specific members and delimit the ranges described as
  * cleared and copied below.  Each range presumably runs from its start
  * marker up to, but not including, its end marker (td_startcopy is defined
  * as td_endzero) -- confirm against fork1().
  */
 struct thread {
 	struct mtx	*volatile td_lock; /* replaces sched lock */
 	struct proc	*td_proc;	/* (*) Associated process. */
 	TAILQ_ENTRY(thread) td_plist;	/* (*) All threads in this proc. */
 
 	/* The two queues below should someday be merged. */
 	TAILQ_ENTRY(thread) td_slpq;	/* (t) Sleep queue. */
 	TAILQ_ENTRY(thread) td_lockq;	/* (t) Lock queue. */
 
 	TAILQ_HEAD(, selinfo) td_selq;	/* (p) List of selinfos. */
 	struct sleepqueue *td_sleepqueue; /* (k) Associated sleep queue. */
 	struct turnstile *td_turnstile;	/* (k) Associated turnstile. */
 	struct umtx_q   *td_umtxq;	/* (c?) Link for when we're blocked. */
 	lwpid_t		td_tid;		/* (b) Thread ID. */
 	sigqueue_t	td_sigqueue;	/* (c) Sigs arrived, not delivered. */
 /* Shorthand for the signal set inside td_sigqueue. */
 #define	td_siglist	td_sigqueue.sq_signals
 
 /* Cleared during fork1() or thread_schedule_upcall(). */
 #define	td_startzero td_flags
 	int		td_flags;	/* (t) TDF_* flags. */
 	int		td_inhibitors;	/* (t) Why can not run. */
 	int		td_pflags;	/* (k) Private thread (TDP_*) flags. */
 	int		td_dupfd;	/* (k) Ret value from fdopen. XXX */
 	int		td_sqqueue;	/* (t) Sleepqueue queue blocked on. */
 	void		*td_wchan;	/* (t) Sleep address. */
 	const char	*td_wmesg;	/* (t) Reason for sleep. */
 	u_char		td_lastcpu;	/* (t) Last cpu we were on. */
 	u_char		td_oncpu;	/* (t) Which cpu we are on. */
 	volatile u_char td_owepreempt;  /* (k*) Preempt on last critical_exit */
 	short		td_locks;	/* (k) Count of non-spin locks. */
 	u_char		td_tsqueue;	/* (t) Turnstile queue blocked on. */
 	struct turnstile *td_blocked;	/* (t) Lock thread is blocked on. */
 	const char	*td_lockname;	/* (t) Name of lock blocked on. */
 	LIST_HEAD(, turnstile) td_contested;	/* (q) Contested locks. */
 	struct lock_list_entry *td_sleeplocks; /* (k) Held sleep locks. */
 	int		td_intr_nesting_level; /* (k) Interrupt recursion. */
 	int		td_pinned;	/* (k) Temporary cpu pin count. */
 	struct kse_thr_mailbox *td_mailbox; /* (*) Userland mailbox address. */
 	struct ucred	*td_ucred;	/* (k) Reference to credentials. */
 	struct thread	*td_standin;	/* (k + a) Use this for an upcall. */
 	struct kse_upcall *td_upcall;	/* (k + t) Upcall structure. */
 	u_int		td_estcpu;	/* (t) estimated cpu utilization */
 	u_int		td_slptick;	/* (t) Time at sleep. */
 	struct rusage	td_ru;		/* (t) rusage information */
 	uint64_t	td_runtime;	/* (t) How many cpu ticks we've run. */
 	u_int 		td_pticks;	/* (t) Statclock hits for profiling */
 	u_int		td_sticks;	/* (t) Statclock hits in system mode. */
 	u_int		td_iticks;	/* (t) Statclock hits in intr mode. */
 	u_int		td_uticks;	/* (t) Statclock hits in user mode. */
 	u_int		td_uuticks;	/* (k) Statclock hits (usr), for UTS. */
 	u_int		td_usticks;	/* (k) Statclock hits (sys), for UTS. */
 	int		td_intrval;	/* (t) Return value of TDF_INTERRUPT. */
 	sigset_t	td_oldsigmask;	/* (k) Saved mask from pre sigpause. */
 	sigset_t	td_sigmask;	/* (c) Current signal mask. */
 	volatile u_int	td_generation;	/* (k) For detection of preemption */
 	stack_t		td_sigstk;	/* (k) Stack ptr and on-stack flag. */
 	int		td_kflags;	/* (c) Flags for KSE threading. */
 	int		td_xsig;	/* (c) Signal for ptrace */
 	u_long		td_profil_addr;	/* (k) Temporary addr until AST. */
 	u_int		td_profil_ticks; /* (k) Temporary ticks until AST. */
 	char		td_name[MAXCOMLEN + 1];	/* (*) Thread name. */
 #define	td_endzero td_base_pri
 
 /* Copied during fork1() or thread_sched_upcall(). */
 #define	td_startcopy td_endzero
 	u_char		td_base_pri;	/* (t) Thread base kernel priority. */
 	u_char		td_priority;	/* (t) Thread active priority. */
 	u_char		td_pri_class;	/* (t) Scheduling class. */
 	u_char		td_user_pri;	/* (t) User pri from estcpu and nice. */
 	u_char		td_base_user_pri; /* (t) Base user pri */
 #define	td_endcopy td_pcb
 
 /*
  * Fields that must be manually set in fork1() or thread_sched_upcall()
  * or already have been set in the allocator, constructor, etc.
  */
 	struct pcb	*td_pcb;	/* (k) Kernel VA of pcb and kstack. */
 	/* TDS_INACTIVE is 0; the remaining states count up from it. */
 	enum {
 		TDS_INACTIVE = 0x0,
 		TDS_INHIBITED,
 		TDS_CAN_RUN,
 		TDS_RUNQ,
 		TDS_RUNNING
 	} td_state;			/* (t) thread state */
 	register_t	td_retval[2];	/* (k) Syscall aux returns. */
 	struct callout	td_slpcallout;	/* (h) Callout for sleep. */
 	struct trapframe *td_frame;	/* (k) Pointer to a struct trapframe. */
 	struct vm_object *td_kstack_obj;/* (a) Kstack object. */
 	vm_offset_t	td_kstack;	/* (a) Kernel VA of kstack. */
 	int		td_kstack_pages; /* (a) Size of the kstack. */
 	struct vm_object *td_altkstack_obj;/* (a) Alternate kstack object. */
 	vm_offset_t	td_altkstack;	/* (a) Kernel VA of alternate kstack. */
 	int		td_altkstack_pages; /* (a) Size of alternate kstack. */
 	volatile u_int	td_critnest;	/* (k*) Critical section nest level. */
 	struct mdthread td_md;		/* (k) Any machine-dependent fields. */
 	struct td_sched	*td_sched;	/* (*) Scheduler-specific data. */
 	struct kaudit_record	*td_ar;	/* (k) Active audit record, if any. */
 	int		td_syscalls;	/* Per-thread syscall count (used by NFS). */
 };
 
 /* Operations on a thread's td_lock pointer; defined outside this header. */
 struct mtx *thread_lock_block(struct thread *);
 void thread_lock_unblock(struct thread *, struct mtx *);
 void thread_lock_set(struct thread *, struct mtx *);
 /*
  * Assert 'type' (an mtx_assert() argument) on the mutex that td_lock
  * currently points to.  td_lock is read once into a local; the assertion is
  * skipped when that value is &blocked_lock.
  */
 #define	THREAD_LOCK_ASSERT(td, type)					\
 do {									\
 	struct mtx *__m = (td)->td_lock;				\
 	if (__m != &blocked_lock)					\
 		mtx_assert(__m, (type));				\
 } while (0)
 
 /*
  * Flags kept in td_flags:
  * To change these you MUST have the scheduler lock.
  *
  * Each flag occupies one bit; 0x80000000 is not assigned in this list.
  */
 #define	TDF_BORROWING	0x00000001 /* Thread is borrowing pri from another. */
 #define	TDF_INPANIC	0x00000002 /* Caused a panic, let it drive crashdump. */
 #define	TDF_INMEM	0x00000004 /* Thread's stack is in memory. */
 #define	TDF_SINTR	0x00000008 /* Sleep is interruptible. */
 #define	TDF_TIMEOUT	0x00000010 /* Timing out during sleep. */
 #define	TDF_IDLETD	0x00000020 /* This is a per-CPU idle thread. */
 #define	TDF_SELECT	0x00000040 /* Selecting; wakeup/waiting danger. */
 #define	TDF_SLEEPABORT	0x00000080 /* sleepq_abort was called. */
 #define	TDF_KTH_SUSP	0x00000100 /* kthread is suspended */
 #define	TDF_UBORROWING	0x00000200 /* Thread is borrowing user pri. */
 #define	TDF_BOUNDARY	0x00000400 /* Thread suspended at user boundary */
 #define	TDF_ASTPENDING	0x00000800 /* Thread has some asynchronous events. */
 #define	TDF_TIMOFAIL	0x00001000 /* Timeout from sleep after we were awake. */
 #define	TDF_INTERRUPT	0x00002000 /* Thread is marked as interrupted. */
 #define	TDF_UPIBLOCKED	0x00004000 /* Thread blocked on user PI mutex. */
 #define	TDF_UNUSED15	0x00008000 /* --available-- */
 #define	TDF_NEEDRESCHED	0x00010000 /* Thread needs to yield. */
 #define	TDF_NEEDSIGCHK	0x00020000 /* Thread may need signal delivery. */
 #define	TDF_XSIG	0x00040000 /* Thread is exchanging signal under trace */
 #define	TDF_UNUSED19	0x00080000 /* --available-- (stale text said: sleeping on a umtx) */
 #define	TDF_THRWAKEUP	0x00100000 /* Libthr thread must not suspend itself. */
 #define	TDF_DBSUSPEND	0x00200000 /* Thread is suspended by debugger */
 #define	TDF_SWAPINREQ	0x00400000 /* Swapin request due to wakeup. */
 #define	TDF_UNUSED23	0x00800000 /* --available-- */
 #define	TDF_SCHED0	0x01000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED1	0x02000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED2	0x04000000 /* Reserved for scheduler private use */
 #define	TDF_SCHED3	0x08000000 /* Reserved for scheduler private use */
 #define	TDF_ALRMPEND	0x10000000 /* Pending SIGVTALRM needs to be posted. */
 #define	TDF_PROFPEND	0x20000000 /* Pending SIGPROF needs to be posted. */
 #define	TDF_MACPEND	0x40000000 /* AST-based MAC event pending. */
 
 /*
  * "Private" flags kept in td_pflags:
  * These are only written by curthread and thus need no locking.
  *
  * Each flag occupies one bit; the highest bit assigned here is 0x00200000.
  */
 #define	TDP_OLDMASK	0x00000001 /* Need to restore mask after suspend. */
 #define	TDP_INKTR	0x00000002 /* Thread is currently in KTR code. */
 #define	TDP_INKTRACE	0x00000004 /* Thread is currently in KTRACE code. */
 #define	TDP_UPCALLING	0x00000008 /* This thread is doing an upcall. */
 #define	TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x00000020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x00000040 /* Lock acquisition - deadlock treatment. */
 #define	TDP_SA		0x00000080 /* A scheduler activation based thread. */
 #define	TDP_NOSLEEPING	0x00000100 /* Thread is not allowed to sleep on a sq. */
 #define	TDP_OWEUPC	0x00000200 /* Call addupc() at next AST. */
 #define	TDP_ITHREAD	0x00000400 /* Thread is an interrupt thread. */
 #define	TDP_CAN_UNBIND	0x00000800 /* Only temporarily bound. */
 #define	TDP_SCHED1	0x00001000 /* Reserved for scheduler private use */
 #define	TDP_SCHED2	0x00002000 /* Reserved for scheduler private use */
 #define	TDP_SCHED3	0x00004000 /* Reserved for scheduler private use */
 #define	TDP_SCHED4	0x00008000 /* Reserved for scheduler private use */
 #define	TDP_GEOM	0x00010000 /* Settle GEOM before finishing syscall */
 #define	TDP_SOFTDEP	0x00020000 /* Stuck processing softdep worklist */
 #define	TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
 #define	TDP_WAKEUP	0x00080000 /* Don't sleep in umtx cond_wait */
 #define	TDP_INBDFLUSH	0x00100000 /* Already in BO_BDFLUSH, do not recurse */
 #define	TDP_KTHREAD	0x00200000 /* This is an official kernel thread */
 
 /*
  * Reasons that the current thread can not be run yet.
  * More than one may apply.
  * These bits are tested against td_inhibitors by the TD_IS_* / TD_ON_LOCK /
  * TD_AWAITING_INTR macros.
  */
 #define	TDI_SUSPENDED	0x0001	/* On suspension queue. */
 #define	TDI_SLEEPING	0x0002	/* Actually asleep! (tricky). */
 #define	TDI_SWAPPED	0x0004	/* Stack not in mem.  Bad juju if run. */
 #define	TDI_LOCK	0x0008	/* Stopped on a lock. */
 #define	TDI_IWAIT	0x0010	/* Awaiting interrupt. */
 
 /*
  * flags (in kflags) related to M:N threading.
  * NOTE(review): "kflags" presumably refers to the td_kflags member of
  * struct thread -- confirm.
  */
 #define	TDK_KSEREL	0x0001	/* Blocked in msleep on p->p_completed. */
 #define	TDK_KSERELSIG	0x0002	/* Blocked in msleep on p->p_siglist. */
 #define	TDK_WAKEUP	0x0004	/* Thread has been woken by kse_wakeup. */
 
 /*
  * Predicates on a thread.  The flag tests yield the masked bit (non-zero
  * when set, not necessarily 1); the td_state tests yield the result of an
  * equality comparison.
  */
 
 /* True when TDP_CAN_UNBIND is set and the thread has an upcall structure. */
 #define	TD_CAN_UNBIND(td)			\
     (((td)->td_pflags & TDP_CAN_UNBIND) &&	\
      ((td)->td_upcall != NULL))
 
 /* Tests of td_inhibitors bits, td_wchan, td_state and td_flags. */
 #define	TD_IS_SLEEPING(td)	((td)->td_inhibitors & TDI_SLEEPING)
 #define	TD_ON_SLEEPQ(td)	((td)->td_wchan != NULL)
 #define	TD_IS_SUSPENDED(td)	((td)->td_inhibitors & TDI_SUSPENDED)
 #define	TD_IS_SWAPPED(td)	((td)->td_inhibitors & TDI_SWAPPED)
 #define	TD_ON_LOCK(td)		((td)->td_inhibitors & TDI_LOCK)
 #define	TD_AWAITING_INTR(td)	((td)->td_inhibitors & TDI_IWAIT)
 #define	TD_IS_RUNNING(td)	((td)->td_state == TDS_RUNNING)
 #define	TD_ON_RUNQ(td)		((td)->td_state == TDS_RUNQ)
 #define	TD_CAN_RUN(td)		((td)->td_state == TDS_CAN_RUN)
 #define	TD_IS_INHIBITED(td)	((td)->td_state == TDS_INHIBITED)
 #define	TD_ON_UPILOCK(td)	((td)->td_flags & TDF_UPIBLOCKED)
 /*
  * The pcpu(idlethread) comparison is compiled out; the active definition
  * tests the TDF_IDLETD flag instead.
  */
 #if 0
 #define TD_IS_IDLETHREAD(td)	((td) == pcpu(idlethread))
 #else
 #define TD_IS_IDLETHREAD(td)	((td)->td_flags & TDF_IDLETD)
 #endif
 
 
 /*
  * Thread state changes.
  *
  * TD_SET_INHIB(td, inhib) marks the thread TDS_INHIBITED and ORs 'inhib'
  * into td_inhibitors.
  */
 #define	TD_SET_INHIB(td, inhib) do {			\
 	(td)->td_state = TDS_INHIBITED;			\
 	(td)->td_inhibitors |= (inhib);			\
 } while (0)
 
 /*
  * TD_CLR_INHIB(td, inhib) clears 'inhib' from td_inhibitors, but only if at
  * least one of those bits was set; when that leaves no inhibitor bits at
  * all, the thread becomes TDS_CAN_RUN.  If none of the 'inhib' bits were
  * set, nothing is modified.
  */
 #define	TD_CLR_INHIB(td, inhib) do {			\
 	if (((td)->td_inhibitors & (inhib)) &&		\
 	    (((td)->td_inhibitors &= ~(inhib)) == 0))	\
 		(td)->td_state = TDS_CAN_RUN;		\
 } while (0)
 
 #define	TD_SET_SLEEPING(td)	TD_SET_INHIB((td), TDI_SLEEPING)
 #define	TD_SET_SWAPPED(td)	TD_SET_INHIB((td), TDI_SWAPPED)
 #define	TD_SET_LOCK(td)		TD_SET_INHIB((td), TDI_LOCK)
 #define	TD_SET_SUSPENDED(td)	TD_SET_INHIB((td), TDI_SUSPENDED)
 #define	TD_SET_IWAIT(td)	TD_SET_INHIB((td), TDI_IWAIT)
 /*
  * NOTE(review): TDI_EXITING is not among the TDI_* values defined next to
  * the other inhibitor bits in this header, so any use of TD_SET_EXITING()
  * will not compile unless it is defined elsewhere -- confirm, then define
  * the bit or retire this macro.
  */
 #define	TD_SET_EXITING(td)	TD_SET_INHIB((td), TDI_EXITING)
 
 #define	TD_CLR_SLEEPING(td)	TD_CLR_INHIB((td), TDI_SLEEPING)
 #define	TD_CLR_SWAPPED(td)	TD_CLR_INHIB((td), TDI_SWAPPED)
 #define	TD_CLR_LOCK(td)		TD_CLR_INHIB((td), TDI_LOCK)
 #define	TD_CLR_SUSPENDED(td)	TD_CLR_INHIB((td), TDI_SUSPENDED)
 #define	TD_CLR_IWAIT(td)	TD_CLR_INHIB((td), TDI_IWAIT)
 
 /*
  * Direct td_state assignments.  The expansion is fully parenthesized so the
  * macro behaves as a single expression wherever it is used (for example as
  * either arm of a conditional expression).
  */
 #define	TD_SET_RUNNING(td)	((td)->td_state = TDS_RUNNING)
 #define	TD_SET_RUNQ(td)		((td)->td_state = TDS_RUNQ)
 #define	TD_SET_CAN_RUN(td)	((td)->td_state = TDS_CAN_RUN)
 
 /*
  * An upcall is used when returning to userland.  If a thread does not have
  * an upcall on return to userland the thread exports its context and exits.
  *
  * Upcall structures are linked into their process through ku_link (see
  * p_upcalls in struct proc).
  */
 struct kse_upcall {
 	TAILQ_ENTRY(kse_upcall) ku_link;	/* List of upcalls in proc. */
 	struct proc		*ku_proc;	/* Associated proc. */
 	struct thread		*ku_owner;	/* Owning thread. */
 	int			ku_flags;	/* KUF_* flags. */
 	struct kse_mailbox	*ku_mailbox;	/* Userland mailbox address. */
 	stack_t			ku_stack;	/* Userland upcall stack. */
 	void			*ku_func;	/* Userland upcall function. */
 	unsigned int		ku_mflags;	/* Cached upcall mbox flags. */
 };
 
 /* Flags kept in ku_flags. */
 #define	KUF_DOUPCALL	0x00001		/* Do upcall now; don't wait. */
 #define	KUF_EXITING	0x00002		/* Upcall structure is exiting. */
 
 /*
  * XXX: Does this belong in resource.h or resourcevar.h instead?
  * Resource usage extension.  The times in rusage structs in the kernel are
  * never up to date.  The actual times are kept as runtimes and tick counts
  * (with control info in the "previous" times), and are converted when
  * userland asks for rusage info.  Backwards compatibility prevents putting
  * this directly in the user-visible rusage struct.
  *
  * Locking: (cj) means (j) for p_rux and (c) for p_crux.
  *
  * Every member is a 64-bit unsigned value.
  */
 struct rusage_ext {
 	u_int64_t	rux_runtime;    /* (cj) Real time. */
 	u_int64_t	rux_uticks;     /* (cj) Statclock hits in user mode. */
 	u_int64_t	rux_sticks;     /* (cj) Statclock hits in sys mode. */
 	u_int64_t	rux_iticks;     /* (cj) Statclock hits in intr mode. */
 	u_int64_t	rux_uu;         /* (c) Previous user time in usec. */
 	u_int64_t	rux_su;         /* (c) Previous sys time in usec. */
 	u_int64_t	rux_tu;         /* (c) Previous total time in usec. */
 };
 
 /*
  * The old fashioned process. May have multiple threads.
  *  Starts off with a single embedded THREAD.
  *
  * The letters in parentheses in the field comments refer to the locking
  * key given earlier in this file.
  *
  * Field order matters: p_startzero/p_endzero and p_startcopy/p_endcopy are
  * aliases for specific members and delimit the ranges described below as
  * zeroed and copied when a process is created in fork.
  */
 struct proc {
 	LIST_ENTRY(proc) p_list;	/* (d) List of all processes. */
 	TAILQ_HEAD(, thread) p_threads;	/* (j) all threads. */
 	TAILQ_HEAD(, kse_upcall) p_upcalls; /* (j) All upcalls in the proc. */
 	struct mtx	p_slock;	/* process spin lock */
 	struct ucred	*p_ucred;	/* (c) Process owner's identity. */
 	struct filedesc	*p_fd;		/* (b) Open files. */
 	struct filedesc_to_leader *p_fdtol; /* (b) Tracking node */
 					/* Accumulated stats for all threads? */
 	struct pstats	*p_stats;	/* (b) Accounting/statistics (CPU). */
 	struct plimit	*p_limit;	/* (c) Process limits. */
 	struct callout	p_limco;	/* (c) Limit callout handle */
 	struct sigacts	*p_sigacts;	/* (x) Signal actions, state (CPU). */
 
 	/*
 	 * The following don't make too much sense.
 	 * See the td_ or ke_ versions of the same flags.
 	 */
 	int		p_flag;		/* (c) P_* flags. */
 	enum {
 		PRS_NEW = 0,		/* In creation */
 		PRS_NORMAL,		/* threads can be run. */
 		PRS_ZOMBIE
 	} p_state;			/* (j/c) S* process status. */
 	pid_t		p_pid;		/* (b) Process identifier. */
 	LIST_ENTRY(proc) p_hash;	/* (d) Hash chain. */
 	LIST_ENTRY(proc) p_pglist;	/* (g + e) List of processes in pgrp. */
 	struct proc	*p_pptr;	/* (c + e) Pointer to parent process. */
 	LIST_ENTRY(proc) p_sibling;	/* (e) List of sibling processes. */
 	LIST_HEAD(, proc) p_children;	/* (e) Pointer to list of children. */
 	struct mtx	p_mtx;		/* (n) Lock for this struct. */
 	struct ksiginfo *p_ksi;	/* Locked by parent proc lock */
 	sigqueue_t	p_sigqueue;	/* (c) Sigs not delivered to a td. */
 /* Shorthand for the signal set inside p_sigqueue. */
 #define p_siglist	p_sigqueue.sq_signals
 
 /* The following fields are all zeroed upon creation in fork. */
 #define	p_startzero	p_oppid
 	pid_t		p_oppid;	/* (c + e) Save ppid in ptrace. XXX */
 	struct vmspace	*p_vmspace;	/* (b) Address space. */
 	u_int		p_swtick;	/* (j) Tick when swapped in or out. */
 	struct itimerval p_realtimer;	/* (c) Alarm timer. */
 	struct rusage	p_ru;		/* (a) Exit information. */
 	struct rusage_ext p_rux;	/* (cj) Internal resource usage. */
 	struct rusage_ext p_crux;	/* (c) Internal child resource usage. */
 	int		p_profthreads;	/* (c) Num threads in addupc_task. */
 	volatile int	p_exitthreads;	/* (j) Number of threads exiting */
 	int		p_traceflag;	/* (o) Kernel trace points. */
 	struct vnode	*p_tracevp;	/* (c + o) Trace to vnode. */
 	struct ucred	*p_tracecred;	/* (o) Credentials to trace with. */
 	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
 	char		p_lock;		/* (c) Proclock (prevent swap) count. */
 	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
 	int		p_sigparent;	/* (c) Signal to parent on exit. */
 	int		p_sig;		/* (n) For core dump/debugger XXX. */
 	u_long		p_code;		/* (n) For core dump/debugger XXX. */
 	u_int		p_stops;	/* (c) Stop event bitmask. */
 	u_int		p_stype;	/* (c) Stop event type. */
 	char		p_step;		/* (c) Process is stopped. */
 	u_char		p_pfsflags;	/* (c) Procfs flags. */
 	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
 	struct kaioinfo	*p_aioinfo;	/* (c) ASYNC I/O info. */
 	struct thread	*p_singlethread;/* (c + j) If single threading this is it */
 	int		p_suspcount;	/* (j) Num threads in suspended mode. */
 	struct thread	*p_xthread;	/* (c) Trap thread */
 	int		p_boundary_count;/* (c) Num threads at user boundary */
 	int		p_pendingcnt;	/* how many signals are pending */
 	struct itimers	*p_itimers;	/* (c) POSIX interval timers. */
 	int		p_numupcalls;	/* (j) Num upcalls. */
 	int		p_upsleeps;	/* (c) Num threads in kse_release(). */
 	struct kse_thr_mailbox *p_completed; /* (c) Completed thread mboxes. */
 	int		p_nextupcall;	/* (n) Next upcall time. */
 	int		p_upquantum;	/* (n) Quantum to schedule an upcall. */
 /* End area that is zeroed on creation. */
 #define	p_endzero	p_magic
 
 /* The following fields are all copied upon creation in fork. */
 #define	p_startcopy	p_endzero
 	u_int		p_magic;	/* (b) Magic number. */
 	char		p_comm[MAXCOMLEN + 1];	/* (b) Process name. */
 	struct pgrp	*p_pgrp;	/* (c + e) Pointer to process group. */
 	struct sysentvec *p_sysent;	/* (b) Syscall dispatch info. */
 	struct pargs	*p_args;	/* (c) Process arguments. */
 	rlim_t		p_cpulimit;	/* (c) Current CPU limit in seconds. */
 	signed char	p_nice;		/* (c + j) Process "nice" value. */
 /* End area that is copied on creation. */
 #define	p_endcopy	p_xstat
 
 	u_short		p_xstat;	/* (c) Exit status; also stop sig. */
 	struct knlist	p_klist;	/* (c) Knotes attached to this proc. */
 	int		p_numthreads;	/* (j) Number of threads. */
 	struct mdproc	p_md;		/* Any machine-dependent fields. */
 	struct callout	p_itcallout;	/* (h + c) Interval timer callout. */
 	u_short		p_acflag;	/* (c) Accounting flags. */
 	struct proc	*p_peers;	/* (r) */
 	struct proc	*p_leader;	/* (b) */
 	void		*p_emuldata;	/* (c) Emulator state data. */
 	struct label	*p_label;	/* (*) Proc (not subject) MAC label. */
 	struct p_sched	*p_sched;	/* (*) Scheduler-specific data. */
 	STAILQ_HEAD(, ktr_request)	p_ktr;	/* (o) KTR event queue. */
 	LIST_HEAD(, mqueue_notifier)	p_mqnotifier; /* (c) mqueue notifiers.*/
 };
 
 /*
  * Member-name shorthands that reach through p_pgrp; they only make sense
  * after a struct proc access, e.g. p->p_session.
  */
 #define	p_session	p_pgrp->pg_session
 #define	p_pgid		p_pgrp->pg_id
 
 #define	NOCPU	0xff		/* For when we aren't on a CPU. */
 
 /* Acquire, release and assert the per-process spin mutex p_slock. */
 #define	PROC_SLOCK(p)	mtx_lock_spin(&(p)->p_slock)
 #define	PROC_SUNLOCK(p)	mtx_unlock_spin(&(p)->p_slock)
 #define	PROC_SLOCK_ASSERT(p, type)	mtx_assert(&(p)->p_slock, (type))
 
 /* These flags are kept in p_flag. */
 #define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
 #define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
 #define	P_KTHREAD	0x00004	/* Kernel thread (*). */
 #define	P_NOLOAD	0x00008	/* Ignore during load avg calculations. */
 #define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
 #define	P_PROFIL	0x00020	/* Has started profiling. */
 #define	P_STOPPROF	0x00040	/* Has thread requesting to stop profiling. */
 #define	P_HADTHREADS	0x00080	/* Has had threads (no cleanup shortcuts) */
 #define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
 #define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
 #define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
 #define	P_TRACED	0x00800	/* Debugged process being traced. */
 #define	P_WAITED	0x01000	/* Someone is waiting for us. */
 #define	P_WEXIT		0x02000	/* Working on exiting. */
 #define	P_EXEC		0x04000	/* Process called exec. */
 #define	P_SA		0x08000	/* Using scheduler activations. */
 #define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
 #define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
 #define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
 #define	P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */
 #define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
 #define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
 #define	P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */
 #define	P_HWPMC		0x800000 /* Process is using HWPMCs */
 
 #define	P_JAILED	0x1000000 /* Process is in jail. */
 #define	P_INEXEC	0x4000000 /* Process is in execve(). */
 #define	P_STATCHILD	0x8000000 /* Child process stopped or exited. */
 #define	P_INMEM		0x10000000 /* Loaded into memory. */
 #define	P_SWAPPINGOUT	0x20000000 /* Process is being swapped out. */
 #define	P_SWAPPINGIN	0x40000000 /* Process is being swapped in. */
 
 #define	P_STOPPED	(P_STOPPED_SIG|P_STOPPED_SINGLE|P_STOPPED_TRACE)
 #define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)
 
 /*
  * These were process status values (p_stat), now they are only used in
  * legacy conversion code.
  * The values are consecutive integers starting at 1.
  */
 #define	SIDL	1		/* Process being created by fork. */
 #define	SRUN	2		/* Currently runnable. */
 #define	SSLEEP	3		/* Sleeping on an address. */
 #define	SSTOP	4		/* Process debugging or suspension. */
 #define	SZOMB	5		/* Awaiting collection by parent. */
 #define	SWAIT	6		/* Waiting for interrupt. */
 #define	SLOCK	7		/* Blocked on a lock. */
 
 /* NOTE(review): presumably the value stored in p_magic -- confirm. */
 #define	P_MAGIC		0xbeefface
 
 #ifdef _KERNEL
 
 /* Flags for mi_switch(); each is a distinct bit. */
 #define	SW_VOL		0x0001		/* Voluntary switch. */
 #define	SW_INVOL	0x0002		/* Involuntary switch. */
 #define SW_PREEMPT	0x0004		/* The invol switch is a preemption */
 
 /* How values for thread_single(); plain enumerated values, not bits. */
 #define	SINGLE_NO_EXIT	0
 #define	SINGLE_EXIT	1
 #define	SINGLE_BOUNDARY	2
 
 /* XXXKSE: Missing values for thread_suspend_check(). */
 
 /*
  * Malloc type declarations; emitted only when the including file has
  * MALLOC_DECLARE defined.
  */
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_PARGS);
 MALLOC_DECLARE(M_PGRP);
 MALLOC_DECLARE(M_SESSION);
 MALLOC_DECLARE(M_SUBPROC);
 MALLOC_DECLARE(M_ZOMBIE);
 #endif
 
 /*
  * Iteration helpers.  Each expands to a queue(3) FOREACH loop header and
  * must be followed by the loop body:
  *   FOREACH_PROC_IN_SYSTEM  - every proc on the allproc list (via p_list)
  *   FOREACH_THREAD_IN_PROC  - every thread on p->p_threads (via td_plist)
  *   FOREACH_UPCALL_IN_PROC  - every upcall on p->p_upcalls (via ku_link)
  */
 #define	FOREACH_PROC_IN_SYSTEM(p)					\
 	LIST_FOREACH((p), &allproc, p_list)
 #define	FOREACH_THREAD_IN_PROC(p, td)					\
 	TAILQ_FOREACH((td), &(p)->p_threads, td_plist)
 #define	FOREACH_UPCALL_IN_PROC(p, ku)					\
 	TAILQ_FOREACH((ku), &(p)->p_upcalls, ku_link)
 
 /* XXXKSE the following lines should probably only be used in 1:1 code: */
 /* First entry of p->p_threads. */
 #define	FIRST_THREAD_IN_PROC(p)	TAILQ_FIRST(&(p)->p_threads)
 
 /*
  * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t,
  * as it is used to represent "no process group".
  * NO_PID is that PID_MAX + 1 value.
  */
 #define	PID_MAX		99999
 #define	NO_PID		100000
 
 /* True when 'p' is the leader of its own session. */
 #define	SESS_LEADER(p)	((p)->p_session->s_leader == (p))
 /* Take a reference: a plain post-increment of s_count, no locking here. */
 #define	SESSHOLD(s)	((s)->s_count++)
 /* Drop a reference through the sessrele() function. */
 #define	SESSRELE(s)	sessrele(s)
 
 
 /*
  * STOPEVENT(p, e, v): if any bit of 'e' is set in p->p_stops, take the proc
  * lock, call stopevent(p, e, v) and drop the lock again.  The p_stops test
  * itself is made before the lock is taken.
  */
 #define	STOPEVENT(p, e, v) do {						\
 	if ((p)->p_stops & (e))	{					\
 		PROC_LOCK(p);						\
 		stopevent((p), (e), (v));				\
 		PROC_UNLOCK(p);						\
 	}								\
 } while (0)
 /*
  * _STOPEVENT(p, e, v): variant for callers that already hold the proc lock
  * (asserted with MA_OWNED).  Issues a WITNESS_WARN() check against the proc
  * mutex and then calls stopevent(p, e, v) if any bit of 'e' is set in
  * p->p_stops.  The 'p' argument is parenthesized at every use so that any
  * pointer expression may be passed.
  */
 #define	_STOPEVENT(p, e, v) do {					\
 	PROC_LOCK_ASSERT(p, MA_OWNED);					\
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,			\
 	    &(p)->p_mtx.lock_object,					\
  	    "checking stopevent %d", (e));				\
 	if ((p)->p_stops & (e))						\
 		stopevent((p), (e), (v));				\
 } while (0)
 
 /*
  * Lock and unlock a process.
  * All of these operate on the p_mtx mutex embedded in struct proc.
  */
 #define	PROC_LOCK(p)	mtx_lock(&(p)->p_mtx)
 #define	PROC_TRYLOCK(p)	mtx_trylock(&(p)->p_mtx)
 #define	PROC_UNLOCK(p)	mtx_unlock(&(p)->p_mtx)
 #define	PROC_LOCKED(p)	mtx_owned(&(p)->p_mtx)
 #define	PROC_LOCK_ASSERT(p, type)	mtx_assert(&(p)->p_mtx, (type))
 
 /*
  * Lock and unlock a process group.
  * All of these operate on the pg_mtx mutex embedded in struct pgrp.
  */
 #define	PGRP_LOCK(pg)	mtx_lock(&(pg)->pg_mtx)
 #define	PGRP_UNLOCK(pg)	mtx_unlock(&(pg)->pg_mtx)
 #define	PGRP_LOCKED(pg)	mtx_owned(&(pg)->pg_mtx)
 #define	PGRP_LOCK_ASSERT(pg, type)	mtx_assert(&(pg)->pg_mtx, (type))
 
 /* NULL-tolerant variants: do nothing when 'pg' is NULL. */
 #define	PGRP_LOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_LOCK(pg);						\
 } while (0)
 #define	PGRP_UNLOCK_PGSIGNAL(pg) do {					\
 	if ((pg) != NULL)						\
 		PGRP_UNLOCK(pg);					\
 } while (0)
 
 /*
  * Lock and unlock a session.
  * All of these operate on the s_mtx mutex embedded in struct session.
  */
 #define	SESS_LOCK(s)	mtx_lock(&(s)->s_mtx)
 #define	SESS_UNLOCK(s)	mtx_unlock(&(s)->s_mtx)
 #define	SESS_LOCKED(s)	mtx_owned(&(s)->s_mtx)
 #define	SESS_LOCK_ASSERT(s, type)	mtx_assert(&(s)->s_mtx, (type))
 
 /* Hold process U-area in memory, normally for ptrace/procfs work. */
 #define	PHOLD(p) do {							\
 	PROC_LOCK(p);							\
 	_PHOLD(p);							\
 	PROC_UNLOCK(p);							\
 } while (0)
 #define	_PHOLD(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	KASSERT(!((p)->p_flag & P_WEXIT) || (p) == curproc,		\
 	    ("PHOLD of exiting process"));				\
 	(p)->p_lock++;							\
 	if (((p)->p_flag & P_INMEM) == 0)				\
 		faultin((p));						\
 } while (0)
 #define PROC_ASSERT_HELD(p) do {					\
 	KASSERT((p)->p_lock > 0, ("process not held"));			\
 } while (0)
 
 #define	PRELE(p) do {							\
 	PROC_LOCK((p));							\
 	_PRELE((p));							\
 	PROC_UNLOCK((p));						\
 } while (0)
 #define	_PRELE(p) do {							\
 	PROC_LOCK_ASSERT((p), MA_OWNED);				\
 	(--(p)->p_lock);						\
 	if (((p)->p_flag & P_WEXIT) && (p)->p_lock == 0)		\
 		wakeup(&(p)->p_lock);					\
 } while (0)
 #define PROC_ASSERT_NOT_HELD(p) do {					\
 	KASSERT((p)->p_lock == 0, ("process held"));			\
 } while (0)
 
 /* Check whether a thread is safe to be swapped out. */
 #define	thread_safetoswapout(td) (TD_IS_SLEEPING(td) || TD_IS_SUSPENDED(td))
 
 /* Control whether or not it is safe for curthread to sleep. */
 #define	THREAD_NO_SLEEPING() do {					\
 	KASSERT(!(curthread->td_pflags & TDP_NOSLEEPING),		\
 	    ("nested no sleeping"));					\
 	curthread->td_pflags |= TDP_NOSLEEPING;				\
 } while (0)
 
 #define	THREAD_SLEEPING_OK() do {					\
 	KASSERT((curthread->td_pflags & TDP_NOSLEEPING),		\
 	    ("nested sleeping ok"));					\
 	curthread->td_pflags &= ~TDP_NOSLEEPING;			\
 } while (0)
 
 #define	PIDHASH(pid)	(&pidhashtbl[(pid) & pidhash])
 extern LIST_HEAD(pidhashhead, proc) *pidhashtbl;
 extern u_long pidhash;
 
 #define	PGRPHASH(pgid)	(&pgrphashtbl[(pgid) & pgrphash])
 extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl;
 extern u_long pgrphash;
 
 extern struct sx allproc_lock;
 extern struct sx proctree_lock;
 extern struct mtx ppeers_lock;
 extern struct proc proc0;		/* Process slot for swapper. */
 extern struct thread thread0;		/* Primary thread in proc0. */
 extern struct vmspace vmspace0;		/* VM space for proc0. */
 extern int hogticks;			/* Limit on kernel cpu hogs. */
 extern int lastpid;
 extern int nprocs, maxproc;		/* Current and max number of procs. */
 extern int maxprocperuid;		/* Max procs per uid. */
 extern u_long ps_arg_cache_limit;
 
 LIST_HEAD(proclist, proc);
 TAILQ_HEAD(procqueue, proc);
 TAILQ_HEAD(threadqueue, thread);
 extern struct proclist allproc;		/* List of all processes. */
 extern struct proclist zombproc;	/* List of zombie processes. */
 extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
 
 extern struct uma_zone *proc_zone;
 
 struct	proc *pfind(pid_t);		/* Find process by id. */
 struct	pgrp *pgfind(pid_t);		/* Find process group by id. */
 struct	proc *zpfind(pid_t);		/* Find zombie process by id. */
 
 void	ast(struct trapframe *framep);
 struct	thread *choosethread(void);
 int	cr_cansignal(struct ucred *cred, struct proc *proc, int signum);
 int	enterpgrp(struct proc *p, pid_t pgid, struct pgrp *pgrp,
 	    struct session *sess);
 int	enterthispgrp(struct proc *p, struct pgrp *pgrp);
 void	faultin(struct proc *p);
 void	fixjobc(struct proc *p, struct pgrp *pgrp, int entering);
 int	fork1(struct thread *, int, int, struct proc **);
 void	fork_exit(void (*)(void *, struct trapframe *), void *,
 	    struct trapframe *);
 void	fork_return(struct thread *, struct trapframe *);
 int	inferior(struct proc *p);
 void 	kick_proc0(void);
 int	leavepgrp(struct proc *p);
 int	maybe_preempt(struct thread *td);
 void	mi_switch(int flags, struct thread *newtd);
 int	p_candebug(struct thread *td, struct proc *p);
 int	p_cansee(struct thread *td, struct proc *p);
 int	p_cansched(struct thread *td, struct proc *p);
 int	p_cansignal(struct thread *td, struct proc *p, int signum);
 int	p_canwait(struct thread *td, struct proc *p);
 struct	pargs *pargs_alloc(int len);
 void	pargs_drop(struct pargs *pa);
 void	pargs_free(struct pargs *pa);
 void	pargs_hold(struct pargs *pa);
 void	procinit(void);
+void	proc_linkup0(struct proc *p, struct thread *td);
 void	proc_linkup(struct proc *p, struct thread *td);
 void	proc_reparent(struct proc *child, struct proc *newparent);
 struct	pstats *pstats_alloc(void);
 void	pstats_fork(struct pstats *src, struct pstats *dst);
 void	pstats_free(struct pstats *ps);
 int	securelevel_ge(struct ucred *cr, int level);
 int	securelevel_gt(struct ucred *cr, int level);
 void	sessrele(struct session *);
 void	setrunnable(struct thread *);
 void	setsugid(struct proc *p);
 int	sigonstack(size_t sp);
 void	sleepinit(void);
 void	stopevent(struct proc *, u_int, u_int);
 void	threadinit(void);
 void	cpu_idle(void);
 extern	void (*cpu_idle_hook)(void);	/* Hook to machdep CPU idler. */
 void	cpu_switch(struct thread *, struct thread *, struct mtx *);
 void	cpu_throw(struct thread *, struct thread *) __dead2;
 void	unsleep(struct thread *);
 void	userret(struct thread *, struct trapframe *);
 
 void	cpu_exit(struct thread *);
 void	exit1(struct thread *, int) __dead2;
 void	cpu_fork(struct thread *, struct proc *, struct thread *, int);
 void	cpu_set_fork_handler(struct thread *, void (*)(void *), void *);
 
 /* New in KSE. */
 #ifdef KSE
 void	kse_unlink(struct thread *);
 void	kseinit(void);
 void	upcall_reap(void);
 void	upcall_remove(struct thread *td);
 #endif
 void	cpu_set_upcall(struct thread *td, struct thread *td0);
 void	cpu_set_upcall_kse(struct thread *, void (*)(void *), void *, stack_t *);
 int	cpu_set_user_tls(struct thread *, void *tls_base);
 void	cpu_thread_clean(struct thread *);
 void	cpu_thread_exit(struct thread *);
 void	cpu_thread_setup(struct thread *td);
 void	cpu_thread_swapin(struct thread *);
 void	cpu_thread_swapout(struct thread *);
 struct	thread *thread_alloc(void);
 void	thread_continued(struct proc *p);
 void	thread_exit(void) __dead2;
 int	thread_export_context(struct thread *td, int willexit);
 void	thread_free(struct thread *td);
 void	thread_link(struct thread *td, struct proc *p);
 void	thread_reap(void);
 void	thread_signal_add(struct thread *td, ksiginfo_t *);
 int	thread_single(int how);
 void	thread_single_end(void);
 void	thread_stash(struct thread *td);
 int	thread_statclock(int user);
 void	thread_stopped(struct proc *p);
 void	childproc_stopped(struct proc *child, int reason);
 void	childproc_continued(struct proc *child);
 void	childproc_exited(struct proc *child);
 int	thread_suspend_check(int how);
 void	thread_suspend_switch(struct thread *);
 void	thread_suspend_one(struct thread *td);
 struct thread *thread_switchout(struct thread *td, int flags,
 	    struct thread *newtd);
 void	thread_unlink(struct thread *td);
 void	thread_unsuspend(struct proc *p);
 void	thread_unsuspend_one(struct thread *td);
 void	thread_unthread(struct thread *td);
 int	thread_userret(struct thread *td, struct trapframe *frame);
 void	thread_user_enter(struct thread *td);
 void	thread_wait(struct proc *p);
 struct thread	*thread_find(struct proc *p, lwpid_t tid);
 void	thr_exit1(void);
 
 #endif	/* _KERNEL */
 
 #endif	/* !_SYS_PROC_H_ */
Index: head/sys/sys/signalvar.h
===================================================================
--- head/sys/sys/signalvar.h	(revision 173360)
+++ head/sys/sys/signalvar.h	(revision 173361)
@@ -1,363 +1,364 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)signalvar.h	8.6 (Berkeley) 2/19/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_SIGNALVAR_H_
 #define	_SYS_SIGNALVAR_H_
 
 #include <sys/queue.h>
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
 #include <sys/signal.h>
 
 /*
  * Kernel signal definitions and data structures,
  * not exported to user programs.
  */
 
 /*
  * Logical process signal actions and state, needed only within the process
  * The mapping between sigacts and proc structures is 1:1 except for rfork()
  * processes masquerading as threads which use one structure for the whole
  * group.  All members are locked by the included mutex.  The reference count
  * and mutex must be last for the bcopy in sigacts_copy() to work.
  */
 struct sigacts {
 	sig_t	ps_sigact[_SIG_MAXSIG];	/* Disposition of signals. */
 	sigset_t ps_catchmask[_SIG_MAXSIG];	/* Signals to be blocked. */
 	sigset_t ps_sigonstack;		/* Signals to take on sigstack. */
 	sigset_t ps_sigintr;		/* Signals that interrupt syscalls. */
 	sigset_t ps_sigreset;		/* Signals that reset when caught. */
 	sigset_t ps_signodefer;		/* Signals not masked while handled. */
 	sigset_t ps_siginfo;		/* Signals that want SA_SIGINFO args. */
 	sigset_t ps_sigignore;		/* Signals being ignored. */
 	sigset_t ps_sigcatch;		/* Signals being caught by user. */
 	sigset_t ps_freebsd4;		/* signals using freebsd4 ucontext. */
 	sigset_t ps_osigset;		/* Signals using <= 3.x osigset_t. */
 	sigset_t ps_usertramp;		/* SunOS compat; libc sigtramp. XXX */
 	int	ps_flag;		/* Flags; presumably PS_* below. */
 	int	ps_refcnt;		/* Reference count; keep last. */
 	struct mtx ps_mtx;		/* Locks all members; keep last. */
 };
 
 #define	PS_NOCLDWAIT	0x0001	/* No zombies if child dies */
 #define	PS_NOCLDSTOP	0x0002	/* No SIGCHLD when children stop. */
 #define	PS_CLDSIGIGN	0x0004	/* The SIGCHLD handler is SIG_IGN. */
 
 #if defined(_KERNEL) && defined(COMPAT_43)
 /*
  * Compatibility.
  */
 typedef struct {
 	struct osigcontext si_sc;
 	int		si_signo;
 	int		si_code;
 	union sigval	si_value;
 } osiginfo_t;
 
 struct osigaction {
 	union {
 		void    (*__sa_handler)(int);
 		void    (*__sa_sigaction)(int, osiginfo_t *, void *);
 	} __sigaction_u;		/* signal handler */
 	osigset_t	sa_mask;	/* signal mask to apply */
 	int		sa_flags;	/* see signal options below */
 };
 
 typedef void __osiginfohandler_t(int, osiginfo_t *, void *);
 #endif /* _KERNEL && COMPAT_43 */
 
 /* additional signal action values, used only temporarily/internally */
 #define	SIG_CATCH	((__sighandler_t *)2)
 #define SIG_HOLD        ((__sighandler_t *)3)
 
 /*
  * Get the signal action for process p and signal sig; currently only used
  * for the current process.  Expands to an lvalue inside p's sigacts.
  */
 #define SIGACTION(p, sig)	((p)->p_sigacts->ps_sigact[_SIG_IDX(sig)])
 
 /*
  * sigset_t manipulation macros
  */
 #define SIGADDSET(set, signo)						\
 	((set).__bits[_SIG_WORD(signo)] |= _SIG_BIT(signo))
 
 #define SIGDELSET(set, signo)						\
 	((set).__bits[_SIG_WORD(signo)] &= ~_SIG_BIT(signo))
 
 #define SIGEMPTYSET(set)						\
 	do {								\
 		int __i;						\
 		for (__i = 0; __i < _SIG_WORDS; __i++)			\
 			(set).__bits[__i] = 0;				\
 	} while (0)
 
 #define SIGFILLSET(set)							\
 	do {								\
 		int __i;						\
 		for (__i = 0; __i < _SIG_WORDS; __i++)			\
 			(set).__bits[__i] = ~0U;			\
 	} while (0)
 
 #define SIGISMEMBER(set, signo)						\
 	((set).__bits[_SIG_WORD(signo)] & _SIG_BIT(signo))
 
 #define SIGISEMPTY(set)		(__sigisempty(&(set)))
 #define SIGNOTEMPTY(set)	(!__sigisempty(&(set)))
 
 #define SIGSETEQ(set1, set2)	(__sigseteq(&(set1), &(set2)))
 #define SIGSETNEQ(set1, set2)	(!__sigseteq(&(set1), &(set2)))
 
 #define SIGSETOR(set1, set2)						\
 	do {								\
 		int __i;						\
 		for (__i = 0; __i < _SIG_WORDS; __i++)			\
 			(set1).__bits[__i] |= (set2).__bits[__i];	\
 	} while (0)
 
 #define SIGSETAND(set1, set2)						\
 	do {								\
 		int __i;						\
 		for (__i = 0; __i < _SIG_WORDS; __i++)			\
 			(set1).__bits[__i] &= (set2).__bits[__i];	\
 	} while (0)
 
 #define SIGSETNAND(set1, set2)						\
 	do {								\
 		int __i;						\
 		for (__i = 0; __i < _SIG_WORDS; __i++)			\
 			(set1).__bits[__i] &= ~(set2).__bits[__i];	\
 	} while (0)
 
 #define SIGSETLO(set1, set2)	((set1).__bits[0] = (set2).__bits[0])
 #define SIGSETOLD(set, oset)	((set).__bits[0] = (oset))
 
 /*
  * Remove SIGKILL and SIGSTOP from set.  The comma expression is wrapped
  * in parentheses so the macro behaves as a single expression wherever it
  * is used (initializer, function argument, conditional operand).
  */
 #define SIG_CANTMASK(set)						\
 	(SIGDELSET(set, SIGKILL), SIGDELSET(set, SIGSTOP))
 
 /*
  * Remove the stop signals (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU) from set.
  * Parenthesized for the same reason as SIG_CANTMASK.
  */
 #define SIG_STOPSIGMASK(set)						\
 	(SIGDELSET(set, SIGSTOP), SIGDELSET(set, SIGTSTP),		\
 	SIGDELSET(set, SIGTTIN), SIGDELSET(set, SIGTTOU))
 
 #define SIG_CONTSIGMASK(set)						\
 	SIGDELSET(set, SIGCONT)
 
 #define sigcantmask	(sigmask(SIGKILL) | sigmask(SIGSTOP))
 
 /* Store the first word of the signal set sig into the old-style mask osig. */
 #define SIG2OSIG(sig, osig)	((osig) = (sig).__bits[0])
 /*
  * Clear the signal set sig and load the old-style mask osig into its first
  * word.  Wrapped in do { } while (0) so that both steps stay together when
  * the macro is the body of an unbraced if/else or loop.
  */
 #define OSIG2SIG(osig, sig)	do {					\
 	SIGEMPTYSET(sig);						\
 	(sig).__bits[0] = (osig);					\
 } while (0)
 
 /*
  * Report whether a signal set has no members: returns 0 as soon as a
  * nonzero word is found among the _SIG_WORDS words, and 1 otherwise.
  */
 static __inline int
 __sigisempty(sigset_t *set)
 {
 	int word;
 
 	word = 0;
 	while (word < _SIG_WORDS) {
 		if (set->__bits[word] != 0)
 			return (0);
 		word++;
 	}
 	return (1);
 }
 
 /*
  * Compare two signal sets word by word: returns 1 when all _SIG_WORDS
  * words match and 0 at the first word that differs.
  */
 static __inline int
 __sigseteq(sigset_t *set1, sigset_t *set2)
 {
 	int word;
 
 	word = 0;
 	while (word < _SIG_WORDS) {
 		if (set1->__bits[word] != set2->__bits[word])
 			return (0);
 		word++;
 	}
 	return (1);
 }
 
 struct osigevent {
 	int	sigev_notify;		/* Notification type */
 	union {
 		int	__sigev_signo;	/* Signal number */
 		int	__sigev_notify_kqueue;
 	} __sigev_u;
 	union sigval sigev_value;	/* Signal value */
 };
 
 typedef struct ksiginfo {
 	TAILQ_ENTRY(ksiginfo)	ksi_link;
 	siginfo_t		ksi_info;
 	int			ksi_flags;
 	struct sigqueue		*ksi_sigq;
 } ksiginfo_t;
 
 #define ksi_signo	ksi_info.si_signo
 #define ksi_errno	ksi_info.si_errno
 #define ksi_code	ksi_info.si_code
 #define ksi_pid		ksi_info.si_pid
 #define ksi_uid		ksi_info.si_uid
 #define ksi_status      ksi_info.si_status
 #define ksi_addr        ksi_info.si_addr
 #define ksi_value	ksi_info.si_value
 #define ksi_band	ksi_info.si_band
 #define ksi_trapno	ksi_info.si_trapno
 #define ksi_overrun	ksi_info.si_overrun
 #define ksi_timerid	ksi_info.si_timerid
 #define ksi_mqd		ksi_info.si_mqd
 
 /* bits for ksi_flags */
 #define KSI_TRAP	0x01	/* Generated by trap. */
 #define	KSI_EXT		0x02	/* Externally managed ksi. */
 #define KSI_INS		0x04	/* Directly insert ksi, not the copy */
 #define	KSI_COPYMASK	KSI_TRAP
 
 #define	KSI_ONQ(ksi)	((ksi)->ksi_sigq != NULL)
 
 typedef struct sigqueue {
 	sigset_t	sq_signals;	/* All pending signals. */
 	sigset_t	sq_kill;	/* Legacy depth 1 queue. */
 	TAILQ_HEAD(, ksiginfo)	sq_list;/* Queued signal info. */
 	struct proc	*sq_proc;
 	int		sq_flags;
 } sigqueue_t;
 
 /* Flags for sq_flags */
 #define	SQ_INIT	0x01
 
 #ifdef _KERNEL
 
 /* Return nonzero if thread td has an unmasked pending signal. */
 #define	SIGPENDING(td)							\
 	(!SIGISEMPTY((td)->td_siglist) &&				\
 	    !sigsetmasked(&(td)->td_siglist, &(td)->td_sigmask))
 
 /*
  * Return the value of the pseudo-expression ((*set & ~*mask) == 0), i.e.
  * 1 when every signal in *set is also in *mask and 0 otherwise.  This
  * is an optimized version of SIGISEMPTY() on a temporary variable
  * containing SIGSETNAND(*set, *mask).
  */
 static __inline int
 sigsetmasked(sigset_t *set, sigset_t *mask)
 {
 	int i;
 
 	for (i = 0; i < _SIG_WORDS; i++) {
 		/* A bit set in *set but clear in *mask is unmasked. */
 		if (set->__bits[i] & ~mask->__bits[i])
 			return (0);
 	}
 	return (1);
 }
 
 #define ksiginfo_init(ksi)			\
 do {						\
 	bzero(ksi, sizeof(ksiginfo_t));		\
 } while(0)
 
 /*
  * Zero a ksiginfo_t and set KSI_TRAP in its flags.  The argument is
  * evaluated exactly once.  The temporary is named __kp rather than kp so
  * that a caller passing its own variable "kp" does not expand to the
  * self-initialization "ksiginfo_t *kp = kp".
  */
 #define ksiginfo_init_trap(ksi)			\
 do {						\
 	ksiginfo_t *__kp = (ksi);		\
 	bzero(__kp, sizeof(ksiginfo_t));	\
 	__kp->ksi_flags |= KSI_TRAP;		\
 } while(0)
 
 /*
  * Copy the siginfo payload from src to dst.  Of src's flag bits, only
  * those in KSI_COPYMASK are carried over; no other member of dst is
  * written.
  */
 static __inline void
 ksiginfo_copy(ksiginfo_t *src, ksiginfo_t *dst)
 {
 	int kept_flags;
 
 	dst->ksi_info = src->ksi_info;
 	kept_flags = src->ksi_flags & KSI_COPYMASK;
 	dst->ksi_flags = kept_flags;
 }
 
 struct pgrp;
 struct thread;
 struct proc;
 struct sigio;
 struct mtx;
 
 extern int sugid_coredump;	/* Sysctl variable kern.sugid_coredump */
 extern struct mtx	sigio_lock;
+extern int kern_logsigexit;	/* Sysctl variable kern.logsigexit */
 
 /*
  * Lock the pointers for a sigio object in the underlying objects of
  * a file descriptor.
  */
 #define SIGIO_LOCK()	mtx_lock(&sigio_lock)
 #define SIGIO_TRYLOCK()	mtx_trylock(&sigio_lock)
 #define SIGIO_UNLOCK()	mtx_unlock(&sigio_lock)
 #define SIGIO_LOCKED()	mtx_owned(&sigio_lock)
 #define SIGIO_ASSERT(type)	mtx_assert(&sigio_lock, type)
 
 /*
  * Machine-independent functions:
  */
 int	cursig(struct thread *td);
 void	execsigs(struct proc *p);
 void	gsignal(int pgid, int sig);
 void	killproc(struct proc *p, char *why);
 void	pgsigio(struct sigio **, int signum, int checkctty);
 void	pgsignal(struct pgrp *pgrp, int sig, int checkctty);
 void	postsig(int sig);
 void	psignal(struct proc *p, int sig);
 int	psignal_event(struct proc *p, struct sigevent *, ksiginfo_t *);
 struct sigacts *sigacts_alloc(void);
 void	sigacts_copy(struct sigacts *dest, struct sigacts *src);
 void	sigacts_free(struct sigacts *ps);
 struct sigacts *sigacts_hold(struct sigacts *ps);
 int	sigacts_shared(struct sigacts *ps);
 void	sigexit(struct thread *td, int signum) __dead2;
 int	sig_ffs(sigset_t *set);
 void	siginit(struct proc *p);
 void	signotify(struct thread *td);
 int	tdsignal(struct proc *p, struct thread *td, int sig,
 	    ksiginfo_t *ksi);
 void	trapsignal(struct thread *td, ksiginfo_t *);
 int	ptracestop(struct thread *td, int sig);
 ksiginfo_t * ksiginfo_alloc(int);
 void	ksiginfo_free(ksiginfo_t *);
 void	sigqueue_init(struct sigqueue *queue, struct proc *p);
 void	sigqueue_flush(struct sigqueue *queue);
 void	sigqueue_delete_proc(struct proc *p, int sig);
 void	sigqueue_delete_set(struct sigqueue *queue, sigset_t *set);
 void	sigqueue_delete(struct sigqueue *queue, int sig);
 void	sigqueue_move_set(struct sigqueue *src, sigqueue_t *dst, sigset_t *);
 int	sigqueue_get(struct sigqueue *queue, int sig, ksiginfo_t *info);
 int	sigqueue_add(struct sigqueue *queue, int sig, ksiginfo_t *info);
 void	sigqueue_collect_set(struct sigqueue *queue, sigset_t *set);
 void	sigqueue_move(struct sigqueue *, struct sigqueue *, int sig);
 void	sigqueue_delete_set_proc(struct proc *, sigset_t *);
 void	sigqueue_delete_stopmask_proc(struct proc *);
 void	sigqueue_take(ksiginfo_t *ksi);
 int	kern_sigtimedwait(struct thread *, sigset_t,
 		ksiginfo_t *, struct timespec *);
 
 /*
  * Machine-dependent functions:
  */
 void	sendsig(sig_t, ksiginfo_t *, sigset_t *retmask);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_SIGNALVAR_H_ */
Index: head/sys/vm/pmap.h
===================================================================
--- head/sys/vm/pmap.h	(revision 173360)
+++ head/sys/vm/pmap.h	(revision 173361)
@@ -1,138 +1,138 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)pmap.h	8.1 (Berkeley) 6/11/93
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Author: Avadis Tevanian, Jr.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Machine address mapping definitions -- machine-independent
  *	section.  [For machine-dependent section, see "machine/pmap.h".]
  */
 
 #ifndef	_PMAP_VM_
 #define	_PMAP_VM_
 /*
  * Each machine dependent implementation is expected to
  * keep certain statistics.  They may do this any way they
  * choose, but are expected to return the statistics
  * in the following structure.
  */
 struct pmap_statistics {
 	long resident_count;	/* # of pages mapped (total) */
 	long wired_count;	/* # of pages wired */
 };
 typedef struct pmap_statistics *pmap_statistics_t;
 
 #include <machine/pmap.h>
 
 #ifdef _KERNEL
 struct proc;
 struct thread;
 
 /*
  * Updates to kernel_vm_end are synchronized by the kernel_map's system mutex.
  */
 extern vm_offset_t kernel_vm_end;
 
 void		 pmap_change_wiring(pmap_t, vm_offset_t, boolean_t);
 void		 pmap_clear_modify(vm_page_t m);
 void		 pmap_clear_reference(vm_page_t m);
 void		 pmap_copy(pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t);
 void		 pmap_copy_page(vm_page_t, vm_page_t);
 void		 pmap_enter(pmap_t, vm_offset_t, vm_page_t, vm_prot_t,
 		    boolean_t);
 void	 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m,
 		    vm_prot_t prot);
 void		 pmap_enter_object(pmap_t pmap, vm_offset_t start,
 		    vm_offset_t end, vm_page_t m_start, vm_prot_t prot);
 vm_paddr_t	 pmap_extract(pmap_t pmap, vm_offset_t va);
 vm_page_t	 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va,
 		    vm_prot_t prot);
 void		 pmap_growkernel(vm_offset_t);
 void		 pmap_init(void);
 boolean_t	 pmap_is_modified(vm_page_t m);
 boolean_t	 pmap_is_prefaultable(pmap_t pmap, vm_offset_t va);
 boolean_t	 pmap_ts_referenced(vm_page_t m);
 vm_offset_t	 pmap_map(vm_offset_t *, vm_paddr_t, vm_paddr_t, int);
 void		 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr,
 		    vm_object_t object, vm_pindex_t pindex, vm_size_t size);
 boolean_t	 pmap_page_exists_quick(pmap_t pmap, vm_page_t m);
 void		 pmap_page_init(vm_page_t m);
-void		 pmap_pinit(pmap_t);
+int		 pmap_pinit(pmap_t);
 void		 pmap_pinit0(pmap_t);
 void		 pmap_protect(pmap_t, vm_offset_t, vm_offset_t, vm_prot_t);
 void		 pmap_qenter(vm_offset_t, vm_page_t *, int);
 void		 pmap_qremove(vm_offset_t, int);
 void		 pmap_release(pmap_t);
 void		 pmap_remove(pmap_t, vm_offset_t, vm_offset_t);
 void		 pmap_remove_all(vm_page_t m);
 void		 pmap_remove_pages(pmap_t);
 void		 pmap_remove_write(vm_page_t m);
 void		 pmap_zero_page(vm_page_t);
 void		 pmap_zero_page_area(vm_page_t, int off, int size);
 void		 pmap_zero_page_idle(vm_page_t);
 int		 pmap_mincore(pmap_t pmap, vm_offset_t addr);
 void		 pmap_activate(struct thread *td);
 vm_offset_t	 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size);
 
 #define	pmap_resident_count(pm)	((pm)->pm_stats.resident_count)
 #define	pmap_wired_count(pm)	((pm)->pm_stats.wired_count)
 
 #endif /* _KERNEL */
 #endif /* _PMAP_VM_ */
Index: head/sys/vm/vm_extern.h
===================================================================
--- head/sys/vm/vm_extern.h	(revision 173360)
+++ head/sys/vm/vm_extern.h	(revision 173361)
@@ -1,100 +1,100 @@
 /*-
  * Copyright (c) 1992, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vm_extern.h	8.2 (Berkeley) 1/12/94
  * $FreeBSD$
  */
 
 #ifndef _VM_EXTERN_H_
 #define	_VM_EXTERN_H_
 
 struct buf;
 struct proc;
 struct vmspace;
 struct vmtotal;
 struct mount;
 struct vnode;
 
 #ifdef _KERNEL
 
 #ifdef TYPEDEF_FOR_UAP
 int getpagesize(struct thread *, void *, int *);
 int madvise(struct thread *, void *, int *);
 int mincore(struct thread *, void *, int *);
 int mprotect(struct thread *, void *, int *);
 int msync(struct thread *, void *, int *);
 int munmap(struct thread *, void *, int *);
 int obreak(struct thread *, void *, int *);
 int sbrk(struct thread *, void *, int *);
 int sstk(struct thread *, void *, int *);
 int swapon(struct thread *, void *, int *);
 #endif			/* TYPEDEF_FOR_UAP */
 
 int kernacc(void *, int, int);
 vm_offset_t kmem_alloc(vm_map_t, vm_size_t);
 vm_offset_t kmem_alloc_nofault(vm_map_t, vm_size_t);
 vm_offset_t kmem_alloc_wait(vm_map_t, vm_size_t);
 void kmem_free(vm_map_t, vm_offset_t, vm_size_t);
 void kmem_free_wakeup(vm_map_t, vm_offset_t, vm_size_t);
 void kmem_init(vm_offset_t, vm_offset_t);
 vm_offset_t kmem_malloc(vm_map_t, vm_size_t, boolean_t);
 vm_map_t kmem_suballoc(vm_map_t, vm_offset_t *, vm_offset_t *, vm_size_t);
 void swapout_procs(int);
 int useracc(void *, int, int);
 int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int);
 void vm_fault_copy_entry(vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t);
 void vm_fault_unwire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t);
 int vm_fault_wire(vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t);
-void vm_forkproc(struct thread *, struct proc *, struct thread *, int);
+int vm_forkproc(struct thread *, struct proc *, struct thread *, struct vmspace *, int);
 void vm_waitproc(struct proc *);
 int vm_mmap(vm_map_t, vm_offset_t *, vm_size_t, vm_prot_t, vm_prot_t, int, objtype_t, void *, vm_ooffset_t);
 void vm_set_page_size(void);
 struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t);
 struct vmspace *vmspace_fork(struct vmspace *);
-void vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
-void vmspace_unshare(struct proc *);
+int vmspace_exec(struct proc *, vm_offset_t, vm_offset_t);
+int vmspace_unshare(struct proc *);
 void vmspace_exit(struct thread *);
 struct vmspace *vmspace_acquire_ref(struct proc *);
 void vmspace_free(struct vmspace *);
 void vmspace_exitfree(struct proc *);
 void vnode_pager_setsize(struct vnode *, vm_ooffset_t);
 int vslock(void *, size_t);
 void vsunlock(void *, size_t);
 void vm_object_print(/* db_expr_t */ long, boolean_t, /* db_expr_t */ long,
 			  char *);
 int vm_fault_quick(caddr_t v, int prot);
 struct sf_buf *vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset);
 void vm_imgact_unmap_page(struct sf_buf *sf);
 void vm_thread_dispose(struct thread *td);
 void vm_thread_dispose_altkstack(struct thread *td);
-void vm_thread_new(struct thread *td, int pages);
-void vm_thread_new_altkstack(struct thread *td, int pages);
+int vm_thread_new(struct thread *td, int pages);
+int vm_thread_new_altkstack(struct thread *td, int pages);
 void vm_thread_swapin(struct thread *td);
 void vm_thread_swapout(struct thread *td);
 #endif				/* _KERNEL */
 #endif				/* !_VM_EXTERN_H_ */
Index: head/sys/vm/vm_glue.c
===================================================================
--- head/sys/vm/vm_glue.c	(revision 173360)
+++ head/sys/vm/vm_glue.c	(revision 173361)
@@ -1,1049 +1,1060 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 extern int maxslp;
 
 /*
  * System initialization
  *
  * Note: proc0 from proc.h
  */
 static void vm_init_limits(void *);
 SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0)
 
 /*
  * THIS MUST BE THE LAST INITIALIZATION ITEM!!!
  *
  * Note: run scheduling should be divorced from the vm system.
  */
 static void scheduler(void *);
 SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL)
 
 #ifndef NO_SWAPPING
 static int swapout(struct proc *);
 static void swapclear(struct proc *);
 #endif
 
 
 static volatile int proc0_rescan;
 
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  In most cases
  * just checking the vm_map_entry is sufficient within the kernel's address
  * space.
  */
 int
 kernacc(addr, len, rw)
 	void *addr;
 	int len, rw;
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
 	vm_prot_t prot;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
 	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
 	prot = rw;
 	saddr = trunc_page((vm_offset_t)addr);
 	eaddr = round_page((vm_offset_t)addr + len);
 	vm_map_lock_read(kernel_map);
 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
 	vm_map_unlock_read(kernel_map);
 	return (rv == TRUE);
 }
 
 /*
  * MPSAFE
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be
  * used in conjuction with this call.
  */
 int
 useracc(addr, len, rw)
 	void *addr;
 	int len, rw;
 {
 	boolean_t rv;
 	vm_prot_t prot;
 	vm_map_t map;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
 	prot = rw;
 	map = &curproc->p_vmspace->vm_map;
 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
 		return (FALSE);
 	}
 	vm_map_lock_read(map);
 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
 	    round_page((vm_offset_t)addr + len), prot);
 	vm_map_unlock_read(map);
 	return (rv == TRUE);
 }
 
 int
 vslock(void *addr, size_t len)
 {
 	vm_offset_t end, last, start;
 	vm_size_t npages;
 	int error;
 
 	last = (vm_offset_t)addr + len;
 	start = trunc_page((vm_offset_t)addr);
 	end = round_page(last);
 	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
 		return (EINVAL);
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
 	PROC_LOCK(curproc);
 	if (ptoa(npages +
 	    pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) >
 	    lim_cur(curproc, RLIMIT_MEMLOCK)) {
 		PROC_UNLOCK(curproc);
 		return (ENOMEM);
 	}
 	PROC_UNLOCK(curproc);
 #if 0
 	/*
 	 * XXX - not yet
 	 *
 	 * The limit for transient usage of wired pages should be
 	 * larger than for "permanent" wired pages (mlock()).
 	 *
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
 	if (npages + cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 	/*
 	 * Return EFAULT on error to match copy{in,out}() behaviour
 	 * rather than returning ENOMEM like mlock() would.
 	 */
 	return (error == KERN_SUCCESS ? 0 : EFAULT);
 }
 
 void
 vsunlock(void *addr, size_t len)
 {
 
 	/* Rely on the parameter sanity checks performed by vslock(). */
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map,
 	    trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len),
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 }
 
 /*
  * Pin the page contained within the given object at the given offset.  If the
  * page is not resident, allocate and load it using the given object's pager.
  * Return the pinned page if successful; otherwise, return NULL.
  */
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m, ma[1];
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_LOCK(object);
 	pindex = OFF_TO_IDX(offset);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
 		ma[0] = m;
 		rv = vm_pager_get_pages(object, ma, 1, 0);
 		m = vm_page_lookup(object, pindex);
 		if (m == NULL)
 			goto out;
 		if (m->valid == 0 || rv != VM_PAGER_OK) {
 			vm_page_lock_queues();
 			vm_page_free(m);
 			vm_page_unlock_queues();
 			m = NULL;
 			goto out;
 		}
 	}
 	vm_page_lock_queues();
 	vm_page_hold(m);
 	vm_page_unlock_queues();
 	vm_page_wakeup(m);
 out:
 	VM_OBJECT_UNLOCK(object);
 	return (m);
 }
 
 /*
  * Return a CPU private mapping to the page at the given offset within the
  * given object.  The page is pinned before it is mapped.
  */
 struct sf_buf *
 vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 
 	m = vm_imgact_hold_page(object, offset);
 	if (m == NULL)
 		return (NULL);
 	sched_pin();
 	return (sf_buf_alloc(m, SFB_CPUPRIVATE));
 }
 
 /*
  * Destroy the given CPU private mapping and unpin the page that it mapped.
  */
 void
 vm_imgact_unmap_page(struct sf_buf *sf)
 {
 	vm_page_t m;
 
 	m = sf_buf_page(sf);
 	sf_buf_free(sf);
 	sched_unpin();
 	vm_page_lock_queues();
 	vm_page_unhold(m);
 	vm_page_unlock_queues();
 }
 
 #ifndef KSTACK_MAX_PAGES
 #define KSTACK_MAX_PAGES 32
 #endif
 
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
  * create performance for a thread.
  */
-void
+int
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	vm_page_t m, ma[KSTACK_MAX_PAGES];
 	int i;
 
 	/* Bounds check */
 	if (pages <= 1)
 		pages = KSTACK_PAGES;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 	/*
 	 * Allocate an object for the kstack.
 	 */
 	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
-	td->td_kstack_obj = ksobj;
 	/*
 	 * Get a kernel virtual address for this thread's kstack.
 	 */
 	ks = kmem_alloc_nofault(kernel_map,
 	   (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
-	if (ks == 0)
-		panic("vm_thread_new: kstack allocation failed");
+	if (ks == 0) {
+		printf("vm_thread_new: kstack allocation failed\n");
+		vm_object_deallocate(ksobj);
+		return (0);
+	}
+	
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
+	td->td_kstack_obj = ksobj;
 	td->td_kstack = ks;
 	/*
 	 * Knowing the number of pages allocated is useful when you
 	 * want to deallocate them.
 	 */
 	td->td_kstack_pages = pages;
 	/* 
 	 * For the length of the stack, link in a real page of ram for each
 	 * page of stack.
 	 */
 	VM_OBJECT_LOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		/*
 		 * Get a kernel stack page.
 		 */
 		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED);
 		ma[i] = m;
 		m->valid = VM_PAGE_BITS_ALL;
 	}
 	VM_OBJECT_UNLOCK(ksobj);
 	pmap_qenter(ks, ma, pages);
+	return (1);
 }
 
 /*
  * Dispose of a thread's kernel stack.
  */
 void
 vm_thread_dispose(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	vm_page_t m;
 	int i, pages;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	ks = td->td_kstack;
 	pmap_qremove(ks, pages);
 	VM_OBJECT_LOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock_queues();
 		vm_page_unwire(m, 0);
 		vm_page_free(m);
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(ksobj);
 	vm_object_deallocate(ksobj);
 	kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
+	td->td_kstack = 0;
 }
 
 /*
  * Allow a thread's kernel stack to be paged out.
  */
 void
 vm_thread_swapout(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t m;
 	int i, pages;
 
 	cpu_thread_swapout(td);
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	pmap_qremove(td->td_kstack, pages);
 	VM_OBJECT_LOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_swapout: kstack already missing?");
 		vm_page_lock_queues();
 		vm_page_dirty(m);
 		vm_page_unwire(m, 0);
 		vm_page_unlock_queues();
 	}
 	VM_OBJECT_UNLOCK(ksobj);
 }
 
 /*
  * Bring the kernel stack for a specified thread back in.
  */
 void
 vm_thread_swapin(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t m, ma[KSTACK_MAX_PAGES];
 	int i, pages, rv;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	VM_OBJECT_LOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 		if (m->valid != VM_PAGE_BITS_ALL) {
 			rv = vm_pager_get_pages(ksobj, &m, 1, 0);
 			if (rv != VM_PAGER_OK)
 				panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid);
 			m = vm_page_lookup(ksobj, i);
 			m->valid = VM_PAGE_BITS_ALL;
 		}
 		ma[i] = m;
 		vm_page_lock_queues();
 		vm_page_wire(m);
 		vm_page_unlock_queues();
 		vm_page_wakeup(m);
 	}
 	VM_OBJECT_UNLOCK(ksobj);
 	pmap_qenter(td->td_kstack, ma, pages);
 	cpu_thread_swapin(td);
 }
 
 /*
  * Set up a variable-sized alternate kstack.
  */
-void
+int
 vm_thread_new_altkstack(struct thread *td, int pages)
 {
 
 	td->td_altkstack = td->td_kstack;
 	td->td_altkstack_obj = td->td_kstack_obj;
 	td->td_altkstack_pages = td->td_kstack_pages;
 
-	vm_thread_new(td, pages);
+	return (vm_thread_new(td, pages));
 }
 
 /*
  * Restore the original kstack.
  */
 void
 vm_thread_dispose_altkstack(struct thread *td)
 {
 
 	vm_thread_dispose(td);
 
 	td->td_kstack = td->td_altkstack;
 	td->td_kstack_obj = td->td_altkstack_obj;
 	td->td_kstack_pages = td->td_altkstack_pages;
 	td->td_altkstack = 0;
 	td->td_altkstack_obj = NULL;
 	td->td_altkstack_pages = 0;
 }
 
 /*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
  * machine-dependent layer to fill those in and make the new process
  * ready to run.  The new process is set up so that it returns directly
  * to user mode to avoid stack copying and relocation problems.
  */
-void
-vm_forkproc(td, p2, td2, flags)
+int
+vm_forkproc(td, p2, td2, vm2, flags)
 	struct thread *td;
 	struct proc *p2;
 	struct thread *td2;
+	struct vmspace *vm2;
 	int flags;
 {
 	struct proc *p1 = td->td_proc;
+	int error;
 
 	if ((flags & RFPROC) == 0) {
 		/*
 		 * Divorce the memory, if it is shared, essentially
 		 * this changes shared memory amongst threads, into
 		 * COW locally.
 		 */
 		if ((flags & RFMEM) == 0) {
 			if (p1->p_vmspace->vm_refcnt > 1) {
-				vmspace_unshare(p1);
+				error = vmspace_unshare(p1);
+				if (error)
+					return (error);
 			}
 		}
 		cpu_fork(td, p2, td2, flags);
-		return;
+		return (0);
 	}
 
 	if (flags & RFMEM) {
 		p2->p_vmspace = p1->p_vmspace;
 		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
 	}
 
 	while (vm_page_count_severe()) {
 		VM_WAIT;
 	}
 
 	if ((flags & RFMEM) == 0) {
-		p2->p_vmspace = vmspace_fork(p1->p_vmspace);
+		p2->p_vmspace = vm2;
 		if (p1->p_vmspace->vm_shm)
 			shmfork(p1, p2);
 	}
 
 	/*
 	 * cpu_fork will copy and update the pcb, set up the kernel stack,
 	 * and make the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
+	return (0);
 }
 
 /*
  * Called after process has been wait(2)'ed apon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  */
 void
 vm_waitproc(p)
 	struct proc *p;
 {
 
 	vmspace_exitfree(p);		/* and clean-out the vmspace */
 }
 
 /*
  * Set default limits for VM system.
  * Called for proc 0, and then inherited by all others.
  *
  * XXX should probably act directly on proc0.
  */
 static void
 vm_init_limits(udata)
 	void *udata;
 {
 	struct proc *p = udata;
 	struct plimit *limp;
 	int rss_limit;
 
 	/*
 	 * Set up the initial limits on process VM. Set the maximum resident
 	 * set size to be half of (reasonably) available memory.  Since this
 	 * is a soft limit, it comes into effect only when the system is out
 	 * of memory - half of main memory helps to favor smaller processes,
 	 * and reduces thrashing of the object cache.
 	 */
 	limp = p->p_limit;
 	limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
 	limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
 	limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
 	limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
 	/* limit the limit to no less than 2MB */
 	rss_limit = max(cnt.v_free_count, 512);
 	limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit);
 	limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY;
 }
 
 void
 faultin(p)
 	struct proc *p;
 {
 #ifdef NO_SWAPPING
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_INMEM) == 0)
 		panic("faultin: proc swapped out with NO_SWAPPING!");
 #else /* !NO_SWAPPING */
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * If another process is swapping in this process,
 	 * just wait until it finishes.
 	 */
 	if (p->p_flag & P_SWAPPINGIN) {
 		while (p->p_flag & P_SWAPPINGIN)
 			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
 		return;
 	}
 	if ((p->p_flag & P_INMEM) == 0) {
 		/*
 		 * Don't let another thread swap process p out while we are
 		 * busy swapping it in.
 		 */
 		++p->p_lock;
 		p->p_flag |= P_SWAPPINGIN;
 		PROC_UNLOCK(p);
 
 		/*
 		 * We hold no lock here because the list of threads
 		 * can not change while all threads in the process are
 		 * swapped out.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td)
 			vm_thread_swapin(td);
 		PROC_LOCK(p);
 		PROC_SLOCK(p);
 		swapclear(p);
 		p->p_swtick = ticks;
 		PROC_SUNLOCK(p);
 
 		wakeup(&p->p_flag);
 
 		/* Allow other threads to swap p out now. */
 		--p->p_lock;
 	}
 #endif /* NO_SWAPPING */
 }
 
 /*
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
  * time, it will be swapped in anyway.
  *
  *  XXXKSE - process with the thread with highest priority counts..
  *
  * Giant is held on entry.
  */
 /* ARGSUSED*/
 static void
 scheduler(dummy)
 	void *dummy;
 {
 	struct proc *p;
 	struct thread *td;
 	struct proc *pp;
 	int slptime;
 	int swtime;
 	int ppri;
 	int pri;
 
 	mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
 	mtx_unlock(&Giant);
 
 loop:
 	if (vm_page_count_min()) {
 		VM_WAIT;
 		thread_lock(&thread0);
 		proc0_rescan = 0;
 		thread_unlock(&thread0);
 		goto loop;
 	}
 
 	pp = NULL;
 	ppri = INT_MIN;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		swtime = (ticks - p->p_swtick) / hz;
 		PROC_SLOCK(p);
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/*
 			 * An otherwise runnable thread of a process
 			 * swapped out has only the TDI_SWAPPED bit set.
 			 * 
 			 */
 			thread_lock(td);
 			if (td->td_inhibitors == TDI_SWAPPED) {
 				slptime = (ticks - td->td_slptick) / hz;
 				pri = swtime + slptime;
 				if ((td->td_flags & TDF_SWAPINREQ) == 0)
 					pri -= p->p_nice * 8;
 				/*
 				 * if this thread is higher priority
 				 * and there is enough space, then select
 				 * this process instead of the previous
 				 * selection.
 				 */
 				if (pri > ppri) {
 					pp = p;
 					ppri = pri;
 				}
 			}
 			thread_unlock(td);
 		}
 		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 
 	/*
 	 * Nothing to do, back to sleep.
 	 */
 	if ((p = pp) == NULL) {
 		thread_lock(&thread0);
 		if (!proc0_rescan) {
 			TD_SET_IWAIT(&thread0);
 			mi_switch(SW_VOL, NULL);
 		}
 		proc0_rescan = 0;
 		thread_unlock(&thread0);
 		goto loop;
 	}
 	PROC_LOCK(p);
 
 	/*
 	 * Another process may be bringing or may have already
 	 * brought this process in while we traverse all threads.
 	 * Or, this process may even be being swapped out again.
 	 */
 	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
 		PROC_UNLOCK(p);
 		thread_lock(&thread0);
 		proc0_rescan = 0;
 		thread_unlock(&thread0);
 		goto loop;
 	}
 
 	/*
 	 * We would like to bring someone in. (only if there is space).
 	 * [What checks the space? ]
 	 */
 	faultin(p);
 	PROC_UNLOCK(p);
 	thread_lock(&thread0);
 	proc0_rescan = 0;
 	thread_unlock(&thread0);
 	goto loop;
 }
 
 void kick_proc0(void)
 {
 	struct thread *td = &thread0;
 
 	/* XXX This will probably cause a LOR in some cases */
 	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0);
 		TD_CLR_IWAIT(td);
 		sched_add(td, SRQ_INTR);
 	} else {
 		proc0_rescan = 1;
 		CTR2(KTR_INTR, "%s: state %d",
 		    __func__, td->td_state);
 	}
 	thread_unlock(td);
 	
 }
 
 
 #ifndef NO_SWAPPING
 
 /*
  * Swap_idle_threshold1 is the guaranteed swapped in time for a process
  */
 static int swap_idle_threshold1 = 2;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
     &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
 
 /*
  * Swap_idle_threshold2 is the time that a process can be idle before
  * it will be swapped out, if idle swapping is enabled.
  */
 static int swap_idle_threshold2 = 10;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
     &swap_idle_threshold2, 0, "Time before a process will be swapped out");
 
 /*
  * Swapout is driven by the pageout daemon.  Very simple, we find eligible
  * procs and swap out their stacks.  We try to always "swap" at least one
  * process in case we need the room for a swapin.
  * If any procs have been sleeping/stopped for at least maxslp seconds,
  * they are swapped.  Else, we swap the longest-sleeping or stopped process,
  * if any, otherwise the longest-resident process.
  */
 void
 swapout_procs(action)
 int action;
 {
 	struct proc *p;
 	struct thread *td;
 	int didswap = 0;
 
 retry:
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		struct vmspace *vm;
 		int minslptime = 100000;
 		int slptime;
 		
 		/*
 		 * Watch out for a process in
 		 * creation.  It may have no
 		 * address space or lock yet.
 		 */
 		if (p->p_state == PRS_NEW)
 			continue;
 		/*
 		 * An aio daemon switches its
 		 * address space while running.
 		 * Perform a quick check whether
 		 * a process has P_SYSTEM.
 		 */
 		if ((p->p_flag & P_SYSTEM) != 0)
 			continue;
 		/*
 		 * Do not swapout a process that
 		 * is waiting for VM data
 		 * structures as there is a possible
 		 * deadlock.  Test this first as
 		 * this may block.
 		 *
 		 * Lock the map until swapout
 		 * finishes, or a thread of this
 		 * process may attempt to alter
 		 * the map.
 		 */
 		vm = vmspace_acquire_ref(p);
 		if (vm == NULL)
 			continue;
 		if (!vm_map_trylock(&vm->vm_map))
 			goto nextproc1;
 
 		PROC_LOCK(p);
 		if (p->p_lock != 0 ||
 		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
 		    ) != 0) {
 			goto nextproc2;
 		}
 		/*
 		 * only aiod changes vmspace, however it will be
 		 * skipped because of the if statement above checking 
 		 * for P_SYSTEM
 		 */
 		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
 			goto nextproc2;
 
 		switch (p->p_state) {
 		default:
 			/* Don't swap out processes in any sort
 			 * of 'special' state. */
 			break;
 
 		case PRS_NORMAL:
 			PROC_SLOCK(p);
 			/*
 			 * do not swapout a realtime process
 			 * Check all the thread groups..
 			 */
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (PRI_IS_REALTIME(td->td_pri_class)) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 				slptime = (ticks - td->td_slptick) / hz;
 				/*
 				 * Guarantee swap_idle_threshold1
 				 * time in memory.
 				 */
 				if (slptime < swap_idle_threshold1) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 
 				/*
 				 * Do not swapout a process if it is
 				 * waiting on a critical event of some
 				 * kind or there is a thread whose
 				 * pageable memory may be accessed.
 				 *
 				 * This could be refined to support
 				 * swapping out a thread.
 				 */
 				if ((td->td_priority) < PSOCK ||
 				    !thread_safetoswapout(td)) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 				/*
 				 * If the system is under memory stress,
 				 * or if we are swapping
 				 * idle processes >= swap_idle_threshold2,
 				 * then swap the process out.
 				 */
 				if (((action & VM_SWAP_NORMAL) == 0) &&
 				    (((action & VM_SWAP_IDLE) == 0) ||
 				    (slptime < swap_idle_threshold2))) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 
 				if (minslptime > slptime)
 					minslptime = slptime;
 				thread_unlock(td);
 			}
 
 			/*
 			 * If the pageout daemon didn't free enough pages,
 			 * or if this process is idle and the system is
 			 * configured to swap proactively, swap it out.
 			 */
 			if ((action & VM_SWAP_NORMAL) ||
 				((action & VM_SWAP_IDLE) &&
 				 (minslptime > swap_idle_threshold2))) {
 				if (swapout(p) == 0)
 					didswap++;
 				PROC_SUNLOCK(p);
 				PROC_UNLOCK(p);
 				vm_map_unlock(&vm->vm_map);
 				vmspace_free(vm);
 				sx_sunlock(&allproc_lock);
 				goto retry;
 			}
 nextproc:			
 			PROC_SUNLOCK(p);
 		}
 nextproc2:
 		PROC_UNLOCK(p);
 		vm_map_unlock(&vm->vm_map);
 nextproc1:
 		vmspace_free(vm);
 		continue;
 	}
 	sx_sunlock(&allproc_lock);
 	/*
 	 * If we swapped something out, and another process needed memory,
 	 * then wakeup the sched process.
 	 */
 	if (didswap)
 		wakeup(&proc0);
 }
 
 static void
 swapclear(p)
 	struct proc *p;
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		td->td_flags |= TDF_INMEM;
 		td->td_flags &= ~TDF_SWAPINREQ;
 		TD_CLR_SWAPPED(td);
 		if (TD_CAN_RUN(td))
 			setrunnable(td);
 		thread_unlock(td);
 	}
 	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
 	p->p_flag |= P_INMEM;
 }
 
 static int
 swapout(p)
 	struct proc *p;
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED | MA_NOTRECURSED);
 #if defined(SWAP_DEBUG)
 	printf("swapping out %d\n", p->p_pid);
 #endif
 
 	/*
 	 * The states of this process and its threads may have changed
 	 * by now.  Assuming that there is only one pageout daemon thread,
 	 * this process should still be in memory.
 	 */
 	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
 		("swapout: lost a swapout race?"));
 
 	/*
 	 * remember the process resident count
 	 */
 	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
 	/*
 	 * Check and mark all threads before we proceed.
 	 */
 	p->p_flag &= ~P_INMEM;
 	p->p_flag |= P_SWAPPINGOUT;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (!thread_safetoswapout(td)) {
 			thread_unlock(td);
 			swapclear(p);
 			return (EBUSY);
 		}
 		td->td_flags &= ~TDF_INMEM;
 		TD_SET_SWAPPED(td);
 		thread_unlock(td);
 	}
 	td = FIRST_THREAD_IN_PROC(p);
 	++td->td_ru.ru_nswap;
 	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 
 	/*
 	 * This list is stable because all threads are now prevented from
 	 * running.  The list is only modified in the context of a running
 	 * thread in this process.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td)
 		vm_thread_swapout(td);
 
 	PROC_LOCK(p);
 	p->p_flag &= ~P_SWAPPINGOUT;
 	PROC_SLOCK(p);
 	p->p_swtick = ticks;
 	return (0);
 }
 #endif /* !NO_SWAPPING */
Index: head/sys/vm/vm_map.c
===================================================================
--- head/sys/vm/vm_map.c	(revision 173360)
+++ head/sys/vm/vm_map.c	(revision 173361)
@@ -1,3439 +1,3456 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_map.c	8.3 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory mapping module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
 #include <sys/vnode.h>
 #include <sys/resourcevar.h>
 #include <sys/file.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
 /*
  *	Virtual memory maps provide for the mapping, protection,
  *	and sharing of virtual memory objects.  In addition,
  *	this module provides for an efficient virtual copy of
  *	memory from one map to another.
  *
  *	Synchronization is required prior to most operations.
  *
  *	Maps consist of an ordered doubly-linked list of simple
  *	entries; a single hint is used to speed up lookups.
  *
  *	Since portions of maps are specified by start/end addresses,
  *	which may not align with existing map entries, all
  *	routines merely "clip" entries to these start/end values.
  *	[That is, an entry is split into two, bordering at a
  *	start or end value.]  Note that these clippings may not
  *	always be necessary (as the two resulting entries are then
  *	not changed); however, the clipping is done for convenience.
  *
  *	As mentioned above, virtual copy operations are performed
  *	by copying VM object references from one map to
  *	another, and then marking both regions as copy-on-write.
  */
 
 /*
  *	vm_map_startup:
  *
  *	Initialize the vm_map module.  Must be called before
  *	any other vm_map routines.
  *
  *	Map and entry structures are allocated from the general
  *	purpose memory pool with some exceptions:
  *
  *	- The kernel map and kmem submap are allocated statically.
  *	- Kernel map entries are allocated out of a static pool.
  *
  *	These restrictions are necessary since malloc() uses the
  *	maps and requires map entries.
  */
 
 /* Mutex used by vm_map_unlock_and_wait() and vm_map_wakeup(). */
 static struct mtx map_sleep_mtx;
 /* UMA zones for map entries, maps and vmspaces. */
 static uma_zone_t mapentzone;
 static uma_zone_t kmapentzone;
 static uma_zone_t mapzone;
 static uma_zone_t vmspace_zone;
 static struct vm_object kmapentobj;
 /* UMA init/fini hooks for the zones above. */
 static int vmspace_zinit(void *mem, int size, int flags);
 static void vmspace_zfini(void *mem, int size);
 static int vm_map_zinit(void *mem, int size, int flags);
 static void vm_map_zfini(void *mem, int size);
 static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max);
 
 #ifdef INVARIANTS
 static void vm_map_zdtor(void *mem, int size, void *arg);
 static void vmspace_zdtor(void *mem, int size, void *arg);
 #endif
 
 /* 
  * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type
  * stable.
  */
 #define PROC_VMSPACE_LOCK(p) do { } while (0)
 #define PROC_VMSPACE_UNLOCK(p) do { } while (0)
 
 /*
  *	VM_MAP_RANGE_CHECK:	[ internal use only ]
  *
  *	Clamps the starting and ending region addresses so that they
  *	fall within the valid range of the map: start is raised to
  *	vm_map_min(map), end is lowered to vm_map_max(map), and start
  *	is then limited to end.  Nothing is asserted and no error is
  *	reported.
  *
  *	The start and end arguments are assigned to and evaluated more
  *	than once, so they must be side-effect-free lvalues.
  */
 #define	VM_MAP_RANGE_CHECK(map, start, end)		\
 		{					\
 		if (start < vm_map_min(map))		\
 			start = vm_map_min(map);	\
 		if (end > vm_map_max(map))		\
 			end = vm_map_max(map);		\
 		if (start > end)			\
 			start = end;			\
 		}
 
 /*
  * Set up the sleep mutex and the UMA zones for maps ("MAP"), kernel
  * map entries ("KMAP ENTRY") and ordinary map entries ("MAP ENTRY").
  * The map and kernel-map-entry zones are preallocated with MAX_KMAP
  * and MAX_KMAPENT items respectively.
  */
 void
 vm_map_startup(void)
 {
 	mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF);
 	/* The destructor hook only exists to run INVARIANTS checks. */
 	mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL,
 #ifdef INVARIANTS
 	    vm_map_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_prealloc(mapzone, MAX_KMAP);
 	kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    UMA_ZONE_MTXCLASS | UMA_ZONE_VM);
 	uma_prealloc(kmapentzone, MAX_KMAPENT);
 	mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 
 /*
  * UMA fini hook for the vmspace zone: finalizes the vm_map embedded
  * in the vmspace.
  */
 static void
 vmspace_zfini(void *mem, int size)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
-	pmap_release(vmspace_pmap(vm));
 	vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map));
 }
 
 /*
  * UMA init hook for the vmspace zone: initializes the embedded vm_map
  * and marks the pmap as not yet initialized by setting vm_map.pmap to
  * NULL, which vmspace_alloc() tests before calling pmap_pinit().
  * Always returns 0.
  */
 static int
 vmspace_zinit(void *mem, int size, int flags)
 {
 	struct vmspace *vm;
 
 	vm = (struct vmspace *)mem;
 
+	vm->vm_map.pmap = NULL;
 	(void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags);
-	pmap_pinit(vmspace_pmap(vm));
 	return (0);
 }
 
 static void
 vm_map_zfini(void *mem, int size)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	mtx_destroy(&map->system_mtx);
 	sx_destroy(&map->lock);
 }
 
 static int
 vm_map_zinit(void *mem, int size, int flags)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	map->nentries = 0;
 	map->size = 0;
 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "user map");
 	return (0);
 }
 
 #ifdef INVARIANTS
 /*
  * INVARIANTS-only destructor hook for vmspaces: runs the map
  * destructor checks on the embedded vm_map.
  */
 static void
 vmspace_zdtor(void *mem, int size, void *arg)
 {
 	struct vmspace *vmsp = (struct vmspace *)mem;
 
 	vm_map_zdtor(&vmsp->vm_map, sizeof(vmsp->vm_map), arg);
 }
 static void
 vm_map_zdtor(void *mem, int size, void *arg)
 {
 	vm_map_t map;
 
 	map = (vm_map_t)mem;
 	KASSERT(map->nentries == 0,
 	    ("map %p nentries == %d on free.",
 	    map, map->nentries));
 	KASSERT(map->size == 0,
 	    ("map %p size == %lu on free.",
 	    map, (unsigned long)map->size));
 }
 #endif	/* INVARIANTS */
 
 /*
  * Allocate a vmspace structure, including a vm_map and pmap,
  * and initialize those structures.  The refcnt is set to 1.
  *
  * The pmap is initialized with pmap_pinit() only when vm_map.pmap is
  * still NULL, as left by vmspace_zinit().  If pmap_pinit() fails, the
  * vmspace is returned to the zone and NULL is returned, so callers
  * must check the result.
  */
 struct vmspace *
 vmspace_alloc(min, max)
 	vm_offset_t min, max;
 {
 	struct vmspace *vm;
 
 	vm = uma_zalloc(vmspace_zone, M_WAITOK);
+	if (vm->vm_map.pmap == NULL && !pmap_pinit(vmspace_pmap(vm))) {
+		uma_zfree(vmspace_zone, vm);
+		return (NULL);
+	}
 	CTR1(KTR_VM, "vmspace_alloc: %p", vm);
 	_vm_map_init(&vm->vm_map, min, max);
 	vm->vm_map.pmap = vmspace_pmap(vm);		/* XXX */
 	vm->vm_refcnt = 1;
 	vm->vm_shm = NULL;
 	vm->vm_swrss = 0;
 	vm->vm_tsize = 0;
 	vm->vm_dsize = 0;
 	vm->vm_ssize = 0;
 	vm->vm_taddr = 0;
 	vm->vm_daddr = 0;
 	vm->vm_maxsaddr = 0;
 	return (vm);
 }
 
 /*
  * Second-stage initialization: attaches kmapentobj to the kernel map
  * entry zone via uma_zone_set_obj() and creates the "VMSPACE" zone
  * with vmspace_zinit()/vmspace_zfini() as its init/fini hooks.
  */
 void
 vm_init2(void)
 {
 	uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count,
 	    (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8 +
 	     maxproc * 2 + maxfiles);
 	/* The destructor hook only exists to run INVARIANTS checks. */
 	vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL,
 #ifdef INVARIANTS
 	    vmspace_zdtor,
 #else
 	    NULL,
 #endif
 	    vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
 /*
  * Tear down a vmspace whose last reference has been dropped: release
  * any SysV shared memory state, remove every mapping from the map and
  * return the structure to vmspace_zone.
  */
 static inline void
 vmspace_dofree(struct vmspace *vm)
 {
 	CTR1(KTR_VM, "vmspace_free: %p", vm);
 
 	/*
 	 * Make sure any SysV shm is freed, it might not have been in
 	 * exit1().
 	 */
 	shmexit(vm);
 
 	/*
 	 * Lock the map, to wait out all other references to it.
 	 * Delete all of the mappings and pages they hold, then call
 	 * the pmap module to reclaim anything left.
 	 */
 	(void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset,
 	    vm->vm_map.max_offset);
 
+	/*
+	 * XXX Comment out the pmap_release call for now. The
+	 * vmspace_zone is marked as UMA_ZONE_NOFREE, and bugs cause
+	 * pmap.resident_count to be != 0 on exit sometimes.
+	 */
+/* 	pmap_release(vmspace_pmap(vm)); */
 	uma_zfree(vmspace_zone, vm);
 }
 
 /*
  * Drop one reference on a vmspace.  Panics if the reference count is
  * already zero; frees the vmspace when the last reference goes away.
  */
 void
 vmspace_free(struct vmspace *vm)
 {
 	int old;
 
 	if (vm->vm_refcnt == 0)
 		panic("vmspace_free: attempt to free already freed vmspace");
 
 	/* Atomically decrement the count, retrying if the update races. */
 	for (;;) {
 		old = vm->vm_refcnt;
 		if (atomic_cmpset_int(&vm->vm_refcnt, old, old - 1))
 			break;
 	}
 	/* That was the last reference: tear the vmspace down. */
 	if (old == 1)
 		vmspace_dofree(vm);
 }
 
 /*
  * Detach the vmspace from process p (p->p_vmspace becomes NULL) and
  * drop the reference on it.  The vmspace is asserted to be vmspace0.
  */
 void
 vmspace_exitfree(struct proc *p)
 {
 	struct vmspace *vm;
 
 	PROC_VMSPACE_LOCK(p);
 	vm = p->p_vmspace;
 	p->p_vmspace = NULL;
 	PROC_VMSPACE_UNLOCK(p);
 	KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace"));
 	vmspace_free(vm);
 }
 
 /*
  * Drop the exiting process's reference on its vmspace and switch the
  * process over to vmspace0.
  *
  * A reference on vmspace0 is taken first.  The reference on the old
  * vmspace is then released with a compare-and-set loop; if it was the
  * last one, pmap_remove_pages() is run on the old vmspace (after
  * switching back to it if necessary) and the vmspace is torn down
  * with vmspace_dofree().
  */
 void
 vmspace_exit(struct thread *td)
 {
 	int refcnt;
 	struct vmspace *vm;
 	struct proc *p;
 
 	/*
 	 * Release user portion of address space.
 	 * This releases references to vnodes,
 	 * which could cause I/O if the file has been unlinked.
 	 * Need to do this early enough that we can still sleep.
 	 *
 	 * The last exiting process to reach this point releases as
 	 * much of the environment as it can. vmspace_dofree() is the
 	 * slower fallback in case another process had a temporary
 	 * reference to the vmspace.
 	 */
 
 	p = td->td_proc;
 	vm = p->p_vmspace;
 	/* Reference vmspace0 up front; the process ends up running on it. */
 	atomic_add_int(&vmspace0.vm_refcnt, 1);
 	do {
 		refcnt = vm->vm_refcnt;
 		if (refcnt > 1 && p->p_vmspace != &vmspace0) {
 			/* Switch now since other proc might free vmspace */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = &vmspace0;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 	} while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
 	/* refcnt holds the value seen before our decrement. */
 	if (refcnt == 1) {
 		if (p->p_vmspace != vm) {
 			/* vmspace not yet freed, switch back */
 			PROC_VMSPACE_LOCK(p);
 			p->p_vmspace = vm;
 			PROC_VMSPACE_UNLOCK(p);
 			pmap_activate(td);
 		}
 		pmap_remove_pages(vmspace_pmap(vm));
 		/* Switch now since this proc will free vmspace */
 		PROC_VMSPACE_LOCK(p);
 		p->p_vmspace = &vmspace0;
 		PROC_VMSPACE_UNLOCK(p);
 		pmap_activate(td);
 		vmspace_dofree(vm);
 	}
 }
 
 /*
  * Acquire a reference to the vmspace owned by another process.
  *
  * Returns the vmspace with its reference count raised by one, or NULL
  * when the process has no vmspace, the count is already zero or
  * below, or p->p_vmspace changed while the reference was being taken.
  */
 
 struct vmspace *
 vmspace_acquire_ref(struct proc *p)
 {
 	struct vmspace *vmsp;
 	int old;
 
 	PROC_VMSPACE_LOCK(p);
 	vmsp = p->p_vmspace;
 	if (vmsp == NULL) {
 		PROC_VMSPACE_UNLOCK(p);
 		return (NULL);
 	}
 	for (;;) {
 		old = vmsp->vm_refcnt;
 		if (old <= 0) {
 			/* Avoid a 0->1 transition on a dying vmspace. */
 			PROC_VMSPACE_UNLOCK(p);
 			return (NULL);
 		}
 		if (atomic_cmpset_int(&vmsp->vm_refcnt, old, old + 1))
 			break;
 	}
 	if (p->p_vmspace != vmsp) {
 		/* The process changed vmspaces; hand the reference back. */
 		PROC_VMSPACE_UNLOCK(p);
 		vmspace_free(vmsp);
 		return (NULL);
 	}
 	PROC_VMSPACE_UNLOCK(p);
 	return (vmsp);
 }
 
 /*
  * Lock the map: the sx lock (exclusive) for ordinary maps, the mutex
  * for system maps.  The map timestamp is bumped once the lock is held.
  */
 void
 _vm_map_lock(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		(void)_sx_xlock(&map->lock, 0, file, line);
 	else
 		_mtx_lock_flags(&map->system_mtx, 0, file, line);
 	map->timestamp++;
 }
 
 /*
  * Release the lock taken by _vm_map_lock(): the sx lock for ordinary
  * maps, the mutex for system maps.
  */
 void
 _vm_map_unlock(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		_sx_xunlock(&map->lock, file, line);
 	else
 		_mtx_unlock_flags(&map->system_mtx, 0, file, line);
 }
 
 /*
  * Lock the map for reading.  This takes the same lock as
  * _vm_map_lock() -- the sx lock is acquired exclusively -- but the
  * map timestamp is left unchanged.
  */
 void
 _vm_map_lock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		(void)_sx_xlock(&map->lock, 0, file, line);
 	else
 		_mtx_lock_flags(&map->system_mtx, 0, file, line);
 }
 
 /*
  * Release the lock taken by _vm_map_lock_read(): the sx lock for
  * ordinary maps, the mutex for system maps.
  */
 void
 _vm_map_unlock_read(vm_map_t map, const char *file, int line)
 {
 
 	if (!map->system_map)
 		_sx_xunlock(&map->lock, file, line);
 	else
 		_mtx_unlock_flags(&map->system_mtx, 0, file, line);
 }
 
 /*
  * Try to lock the map without blocking.  Returns 1 and bumps the map
  * timestamp when the lock was obtained, 0 otherwise.
  */
 int
 _vm_map_trylock(vm_map_t map, const char *file, int line)
 {
 	int locked;
 
 	if (map->system_map)
 		locked = _mtx_trylock(&map->system_mtx, 0, file, line);
 	else
 		locked = _sx_try_xlock(&map->lock, file, line);
 	if (locked)
 		map->timestamp++;
 	return (locked != 0);
 }
 
 /*
  * Try to lock the map for reading without blocking.  Returns 1 when
  * the lock was obtained, 0 otherwise.  The timestamp is not changed.
  */
 int
 _vm_map_trylock_read(vm_map_t map, const char *file, int line)
 {
 	int locked;
 
 	if (map->system_map)
 		locked = _mtx_trylock(&map->system_mtx, 0, file, line);
 	else
 		locked = _sx_try_xlock(&map->lock, file, line);
 	return (locked != 0);
 }
 
 /*
  * "Upgrade" a read lock to a write lock.  No lock operation is
  * performed: under INVARIANTS the lock is asserted to be held, then
  * the timestamp is bumped.  Always returns 0.
  */
 int
 _vm_map_lock_upgrade(vm_map_t map, const char *file, int line)
 {
 
 #ifdef INVARIANTS
 	if (!map->system_map)
 		_sx_assert(&map->lock, SX_XLOCKED, file, line);
 	else
 		_mtx_assert(&map->system_mtx, MA_OWNED, file, line);
 #endif
 	map->timestamp++;
 	return (0);
 }
 
 /*
  * "Downgrade" a write lock to a read lock.  No lock operation is
  * performed; under INVARIANTS the lock is asserted to be held.
  */
 void
 _vm_map_lock_downgrade(vm_map_t map, const char *file, int line)
 {
 
 #ifdef INVARIANTS
 	if (!map->system_map)
 		_sx_assert(&map->lock, SX_XLOCKED, file, line);
 	else
 		_mtx_assert(&map->system_mtx, MA_OWNED, file, line);
 #endif
 }
 
 /*
  *	vm_map_unlock_and_wait:
  *
  *	Unlock the map and sleep on &map->root until woken, e.g. by
  *	vm_map_wakeup().  map_sleep_mtx is taken before the map is
  *	unlocked and is handed to msleep() with PDROP.  Returns the
  *	value returned by msleep().
  *
  *	The user_wait argument is not used by this implementation.
  */
 int
 vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait)
 {
 
 	mtx_lock(&map_sleep_mtx);
 	vm_map_unlock(map);
 	return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 0));
 }
 
 /*
  *	vm_map_wakeup:
  *
  *	Wake up threads sleeping on &map->root, the wait channel used
  *	by vm_map_unlock_and_wait().
  */
 void
 vm_map_wakeup(vm_map_t map)
 {
 
 	/*
 	 * Acquire and release map_sleep_mtx to prevent a wakeup()
 	 * from being performed (and lost) between the vm_map_unlock()
 	 * and the msleep() in vm_map_unlock_and_wait().
 	 */
 	mtx_lock(&map_sleep_mtx);
 	mtx_unlock(&map_sleep_mtx);
 	wakeup(&map->root);
 }
 
 long
 vmspace_resident_count(struct vmspace *vmspace)
 {
 	return pmap_resident_count(vmspace_pmap(vmspace));
 }
 
 long
 vmspace_wired_count(struct vmspace *vmspace)
 {
 	return pmap_wired_count(vmspace_pmap(vmspace));
 }
 
 /*
  *	vm_map_create:
  *
  *	Allocates a map from the map zone, initializes it as an empty
  *	map spanning [min, max) and attaches the given physical map.
  *	Returns the new map.
  */
 vm_map_t
 vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max)
 {
 	vm_map_t map;
 
 	map = uma_zalloc(mapzone, M_WAITOK);
 	CTR1(KTR_VM, "vm_map_create: %p", map);
 	map->pmap = pmap;
 	_vm_map_init(map, min, max);
 	return (map);
 }
 
 /*
  * Reset an existing vm_map structure (for example the one embedded
  * in a vmspace) to an empty, non-system map covering [min, max).
  * Neither the pmap nor the locks are touched here.
  */
 static void
 _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
 {
 
 	/* Address range. */
 	map->min_offset = min;
 	map->max_offset = max;
 
 	/* Empty entry list and empty splay tree. */
 	map->header.prev = &map->header;
 	map->header.next = &map->header;
 	map->root = NULL;
 
 	/* Remaining state. */
 	map->system_map = 0;
 	map->needs_wakeup = FALSE;
 	map->flags = 0;
 	map->timestamp = 0;
 }
 
 /*
  * Initialize a map that did not come from the map zone: resets the
  * map state with _vm_map_init() and then initializes both map locks.
  * The pmap is not set here.
  */
 void
 vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max)
 {
 	_vm_map_init(map, min, max);
 	mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK);
 	sx_init(&map->lock, "user map");
 }
 
 /*
  *	vm_map_entry_dispose:	[ internal use only ]
  *
  *	Inverse of vm_map_entry_create: returns the entry to the
  *	kernel entry zone for system maps, or to the ordinary entry
  *	zone otherwise.
  */
 static void
 vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry)
 {
 	uma_zone_t zone;
 
 	if (map->system_map)
 		zone = kmapentzone;
 	else
 		zone = mapentzone;
 	uma_zfree(zone, entry);
 }
 
 /*
  *	vm_map_entry_create:	[ internal use only ]
  *
  *	Allocates a VM map entry for insertion; no entry fields are
  *	filled in.  System maps allocate from the kernel entry zone
  *	with M_NOWAIT, other maps from the ordinary entry zone with
  *	M_WAITOK.  Panics if the allocation fails.
  */
 static vm_map_entry_t
 vm_map_entry_create(vm_map_t map)
 {
 	vm_map_entry_t ent;
 
 	ent = map->system_map ? uma_zalloc(kmapentzone, M_NOWAIT) :
 	    uma_zalloc(mapentzone, M_WAITOK);
 	if (ent == NULL)
 		panic("vm_map_entry_create: kernel resources exhausted");
 	return (ent);
 }
 
 /*
  *	vm_map_entry_set_behavior:
  *
  *	Replace the behavior bits of the entry's flags with the given
  *	access behavior (normal, random, or sequential).  Bits of
  *	"behavior" outside MAP_ENTRY_BEHAV_MASK are ignored.
  */
 static inline void
 vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior)
 {
 	entry->eflags &= ~MAP_ENTRY_BEHAV_MASK;
 	entry->eflags |= behavior & MAP_ENTRY_BEHAV_MASK;
 }
 
 /*
  *	vm_map_entry_set_max_free:
  *
  *	Set the entry's max_free field to the largest of its own
  *	adj_free and the max_free values of its left and right
  *	children, where present.
  */
 static inline void
 vm_map_entry_set_max_free(vm_map_entry_t entry)
 {
 	vm_map_entry_t child;
 
 	entry->max_free = entry->adj_free;
 
 	child = entry->left;
 	if (child != NULL && child->max_free > entry->max_free)
 		entry->max_free = child->max_free;
 
 	child = entry->right;
 	if (child != NULL && child->max_free > entry->max_free)
 		entry->max_free = child->max_free;
 }
 
 /*
  *	vm_map_entry_splay:
  *
  *	The Sleator and Tarjan top-down splay algorithm with the
  *	following variation.  Max_free must be computed bottom-up, so
  *	on the downward pass, maintain the left and right spines in
  *	reverse order.  Then, make a second pass up each side to fix
  *	the pointers and compute max_free.  The time bound is O(log n)
  *	amortized.
  *
  *	The new root is the vm_map_entry containing "addr", or else an
  *	adjacent entry (lower or higher) if addr is not in the tree.
  *
  *	The map must be locked, and leaves it so.
  *
  *	This function only rearranges the tree it is given; it does
  *	not store the result anywhere.  A NULL root is returned as is.
  *
  *	Returns: the new root.
  */
 static vm_map_entry_t
 vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root)
 {
 	vm_map_entry_t llist, rlist;
 	vm_map_entry_t ltree, rtree;
 	vm_map_entry_t y;
 
 	/* Special case of empty tree. */
 	if (root == NULL)
 		return (root);
 
 	/*
 	 * Pass One: Splay down the tree until we find addr or a NULL
 	 * pointer where addr would go.  llist and rlist are the two
 	 * sides in reverse order (bottom-up), with llist linked by
 	 * the right pointer and rlist linked by the left pointer in
 	 * the vm_map_entry.  Wait until Pass Two to set max_free on
 	 * the two spines.
 	 */
 	llist = NULL;
 	rlist = NULL;
 	for (;;) {
 		/* root is never NULL in here. */
 		if (addr < root->start) {
 			y = root->left;
 			if (y == NULL)
 				break;
 			if (addr < y->start && y->left != NULL) {
 				/* Rotate right and put y on rlist. */
 				root->left = y->right;
 				y->right = root;
 				vm_map_entry_set_max_free(root);
 				root = y->left;
 				y->left = rlist;
 				rlist = y;
 			} else {
 				/* Put root on rlist. */
 				root->left = rlist;
 				rlist = root;
 				root = y;
 			}
 		} else {
 			y = root->right;
 			/* Stop if root contains addr or nothing lies above. */
 			if (addr < root->end || y == NULL)
 				break;
 			if (addr >= y->end && y->right != NULL) {
 				/* Rotate left and put y on llist. */
 				root->right = y->left;
 				y->left = root;
 				vm_map_entry_set_max_free(root);
 				root = y->right;
 				y->right = llist;
 				llist = y;
 			} else {
 				/* Put root on llist. */
 				root->right = llist;
 				llist = root;
 				root = y;
 			}
 		}
 	}
 
 	/*
 	 * Pass Two: Walk back up the two spines, flip the pointers
 	 * and set max_free.  The subtrees of the root go at the
 	 * bottom of llist and rlist.
 	 */
 	ltree = root->left;
 	while (llist != NULL) {
 		y = llist->right;
 		llist->right = ltree;
 		vm_map_entry_set_max_free(llist);
 		ltree = llist;
 		llist = y;
 	}
 	rtree = root->right;
 	while (rlist != NULL) {
 		y = rlist->left;
 		rlist->left = rtree;
 		vm_map_entry_set_max_free(rlist);
 		rtree = rlist;
 		rlist = y;
 	}
 
 	/*
 	 * Final assembly: add ltree and rtree as subtrees of root.
 	 */
 	root->left = ltree;
 	root->right = rtree;
 	vm_map_entry_set_max_free(root);
 
 	return (root);
 }
 
 /*
  *	vm_map_entry_{un,}link:
  *
  *	Insert/remove entries from maps.
  */
 /*
  * Link "entry" into the map immediately after "after_where" in the
  * address-ordered list and make it the root of the splay tree.
  * after_where may be &map->header to insert at the front of the list.
  * adj_free and max_free are recomputed for the entries touched.
  */
 static void
 vm_map_entry_link(vm_map_t map,
 		  vm_map_entry_t after_where,
 		  vm_map_entry_t entry)
 {
 
 	CTR4(KTR_VM,
 	    "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map,
 	    map->nentries, entry, after_where);
 	map->nentries++;
 	/* Doubly-linked list insertion after after_where. */
 	entry->prev = after_where;
 	entry->next = after_where->next;
 	entry->next->prev = entry;
 	after_where->next = entry;
 
 	if (after_where != &map->header) {
 		/*
 		 * NOTE(review): the splay result is not stored; presumably
 		 * after_where becomes the tree root because its start
 		 * address is in the tree -- confirm.
 		 */
 		if (after_where != map->root)
 			vm_map_entry_splay(after_where->start, map->root);
 		/* entry takes over the right subtree; after_where goes left. */
 		entry->right = after_where->right;
 		entry->left = after_where;
 		after_where->right = NULL;
 		after_where->adj_free = entry->start - after_where->end;
 		vm_map_entry_set_max_free(after_where);
 	} else {
 		entry->right = map->root;
 		entry->left = NULL;
 	}
 	/* Free space after entry runs to the next entry or the map end. */
 	entry->adj_free = (entry->next == &map->header ? map->max_offset :
 	    entry->next->start) - entry->end;
 	vm_map_entry_set_max_free(entry);
 	map->root = entry;
 }
 
 /*
  * Remove "entry" from both the splay tree and the address-ordered
  * list of the map.  The entry itself is not freed here.
  */
 static void
 vm_map_entry_unlink(vm_map_t map,
 		    vm_map_entry_t entry)
 {
 	vm_map_entry_t next, prev, root;
 
 	/*
 	 * NOTE(review): the splay result is not stored; presumably entry
 	 * becomes the tree root because its start address is in the
 	 * tree -- confirm.
 	 */
 	if (entry != map->root)
 		vm_map_entry_splay(entry->start, map->root);
 	if (entry->left == NULL)
 		root = entry->right;
 	else {
 		/*
 		 * Splay the left subtree, make its new root the tree
 		 * root, and hang entry's right subtree off it.
 		 */
 		root = vm_map_entry_splay(entry->start, entry->left);
 		root->right = entry->right;
 		root->adj_free = (entry->next == &map->header ? map->max_offset :
 		    entry->next->start) - root->end;
 		vm_map_entry_set_max_free(root);
 	}
 	map->root = root;
 
 	/* Doubly-linked list removal. */
 	prev = entry->prev;
 	next = entry->next;
 	next->prev = prev;
 	prev->next = next;
 	map->nentries--;
 	CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map,
 	    map->nentries, entry);
 }
 
 /*
  *	vm_map_entry_resize_free:
  *
  *	Recompute the free space that follows a vm_map_entry and make
  *	the tree reflect it.  Use this after an entry has been resized
  *	in place, i.e. without going through vm_map_entry_link() or
  *	vm_map_entry_unlink().
  *
  *	The map must be locked, and leaves it so.
  */
 static void
 vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_offset_t gap_end;
 
 	/*
 	 * The splay tree has no parent pointers, so instead of walking
 	 * upward the entry is brought to the root and updated there.
 	 */
 	if (map->root != entry)
 		map->root = vm_map_entry_splay(entry->start, map->root);
 
 	/* The gap ends at the next entry, or at the end of the map. */
 	if (entry->next == &map->header)
 		gap_end = map->max_offset;
 	else
 		gap_end = entry->next->start;
 	entry->adj_free = gap_end - entry->end;
 	vm_map_entry_set_max_free(entry);
 }
 
 /*
  *	vm_map_lookup_entry:	[ internal use only ]
  *
  *	Look up "address" in the map.  On return *entry is the entry
  *	that contains the address or, failing that, the entry just
  *	before it (&map->header when the map is empty).  The result is
  *	TRUE only when the address lies inside *entry.
  *
  *	The lookup splays the tree, so the map root may change.
  */
 boolean_t
 vm_map_lookup_entry(
 	vm_map_t map,
 	vm_offset_t address,
 	vm_map_entry_t *entry)	/* OUT */
 {
 	vm_map_entry_t top;
 
 	top = vm_map_entry_splay(address, map->root);
 	if (top == NULL) {
 		/* Empty map: only the header is available. */
 		*entry = &map->header;
 		return (FALSE);
 	}
 	map->root = top;
 
 	if (address < top->start) {
 		/* The new root lies above the address; report its predecessor. */
 		*entry = top->prev;
 		return (FALSE);
 	}
 	*entry = top;
 	if (top->end > address)
 		return (TRUE);
 	return (FALSE);
 }
 
 /*
  *	vm_map_insert:
  *
  *	Inserts the given whole VM object into the target
  *	map at the specified address range.  The object's
  *	size should match that of the address range.
  *
  *	Requires that the map be locked, and leaves it so.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  *
  *	Returns KERN_INVALID_ADDRESS if the range is empty or lies
  *	outside the map, KERN_NO_SPACE if it overlaps an existing
  *	entry, and KERN_SUCCESS otherwise.
  */
 int
 vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	      vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max,
 	      int cow)
 {
 	vm_map_entry_t new_entry;
 	vm_map_entry_t prev_entry;
 	vm_map_entry_t temp_entry;
 	vm_eflags_t protoeflags;
 
 	/*
 	 * Check that the start and end points are not bogus.
 	 */
 	if ((start < map->min_offset) || (end > map->max_offset) ||
 	    (start >= end))
 		return (KERN_INVALID_ADDRESS);
 
 	/*
 	 * Find the entry prior to the proposed starting address; if it's part
 	 * of an existing entry, this range is bogus.
 	 */
 	if (vm_map_lookup_entry(map, start, &temp_entry))
 		return (KERN_NO_SPACE);
 
 	prev_entry = temp_entry;
 
 	/*
 	 * Assert that the next entry doesn't overlap the end point.
 	 */
 	if ((prev_entry->next != &map->header) &&
 	    (prev_entry->next->start < end))
 		return (KERN_NO_SPACE);
 
 	/* Translate the caller's cow flags into map entry flags. */
 	protoeflags = 0;
 
 	if (cow & MAP_COPY_ON_WRITE)
 		protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY;
 
 	if (cow & MAP_NOFAULT) {
 		protoeflags |= MAP_ENTRY_NOFAULT;
 
 		KASSERT(object == NULL,
 			("vm_map_insert: paradoxical MAP_NOFAULT request"));
 	}
 	if (cow & MAP_DISABLE_SYNCER)
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
 
 	if (object != NULL) {
 		/*
 		 * OBJ_ONEMAPPING must be cleared unless this mapping
 		 * is trivially proven to be the only mapping for any
 		 * of the object's pages.  (Object granularity
 		 * reference counting is insufficient to recognize
 		 * aliases with precision.)
 		 */
 		VM_OBJECT_LOCK(object);
 		if (object->ref_count > 1 || object->shadow_count != 0)
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 		VM_OBJECT_UNLOCK(object);
 	}
 	else if ((prev_entry != &map->header) &&
 		 (prev_entry->eflags == protoeflags) &&
 		 (prev_entry->end == start) &&
 		 (prev_entry->wired_count == 0) &&
 		 ((prev_entry->object.vm_object == NULL) ||
 		  vm_object_coalesce(prev_entry->object.vm_object,
 				     prev_entry->offset,
 				     (vm_size_t)(prev_entry->end - prev_entry->start),
 				     (vm_size_t)(end - prev_entry->end)))) {
 		/*
 		 * We were able to extend the object.  Determine if we
 		 * can extend the previous map entry to include the
 		 * new range as well.
 		 */
 		if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) &&
 		    (prev_entry->protection == prot) &&
 		    (prev_entry->max_protection == max)) {
 			map->size += (end - prev_entry->end);
 			prev_entry->end = end;
 			vm_map_entry_resize_free(map, prev_entry);
 			vm_map_simplify_entry(map, prev_entry);
 			return (KERN_SUCCESS);
 		}
 
 		/*
 		 * If we can extend the object but cannot extend the
 		 * map entry, we have to create a new map entry.  We
 		 * must bump the ref count on the extended object to
 		 * account for it.  object may be NULL.
 		 */
 		object = prev_entry->object.vm_object;
 		offset = prev_entry->offset +
 			(prev_entry->end - prev_entry->start);
 		vm_object_reference(object);
 	}
 
 	/*
 	 * NOTE: if conditionals fail, object can be NULL here.  This occurs
 	 * in things like the buffer map where we manage kva but do not manage
 	 * backing objects.
 	 */
 
 	/*
 	 * Create a new entry
 	 */
 	new_entry = vm_map_entry_create(map);
 	new_entry->start = start;
 	new_entry->end = end;
 
 	new_entry->eflags = protoeflags;
 	new_entry->object.vm_object = object;
 	new_entry->offset = offset;
 	new_entry->avail_ssize = 0;
 
 	new_entry->inheritance = VM_INHERIT_DEFAULT;
 	new_entry->protection = prot;
 	new_entry->max_protection = max;
 	new_entry->wired_count = 0;
 
 	/*
 	 * Insert the new entry into the list
 	 */
 	vm_map_entry_link(map, prev_entry, new_entry);
 	map->size += new_entry->end - new_entry->start;
 
 #if 0
 	/*
 	 * Temporarily removed to avoid MAP_STACK panic, due to
 	 * MAP_STACK being a huge hack.  Will be added back in
 	 * when MAP_STACK (and the user stack mapping) is fixed.
 	 */
 	/*
 	 * It may be possible to simplify the entry
 	 */
 	vm_map_simplify_entry(map, new_entry);
 #endif
 
 	/* Optionally pre-enter pages of the object into the pmap. */
 	if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) {
 		vm_map_pmap_enter(map, start, prot,
 				    object, OFF_TO_IDX(offset), end - start,
 				    cow & MAP_PREFAULT_PARTIAL);
 	}
 
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_findspace:
  *
  *	Find the first fit (lowest VM address) for "length" free bytes
  *	beginning at address >= start in the given map.
  *
  *	In a vm_map_entry, "adj_free" is the amount of free space
  *	adjacent (higher address) to this entry, and "max_free" is the
  *	maximum amount of contiguous free space in its subtree.  This
  *	allows finding a free region in one path down the tree, so
  *	O(log n) amortized with splay trees.
  *
  *	The map must be locked, and leaves it so.
  *
  *	When the map is kernel_map and the range found ends beyond
  *	kernel_vm_end, pmap_growkernel() is called before returning.
  *
  *	Returns: 0 on success, and starting address in *addr,
  *		 1 if insufficient space (*addr is not written).
  */
 int
 vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length,
     vm_offset_t *addr)	/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_offset_t end, st;
 
 	/*
 	 * Request must fit within min/max VM address and must avoid
 	 * address wrap.
 	 */
 	if (start < map->min_offset)
 		start = map->min_offset;
 	if (start + length > map->max_offset || start + length < start)
 		return (1);
 
 	/* Empty tree means wide open address space. */
 	if (map->root == NULL) {
 		*addr = start;
 		goto found;
 	}
 
 	/*
 	 * After splay, if start comes before root node, then there
 	 * must be a gap from start to the root.
 	 */
 	map->root = vm_map_entry_splay(start, map->root);
 	if (start + length <= map->root->start) {
 		*addr = start;
 		goto found;
 	}
 
 	/*
 	 * Root is the last node that might begin its gap before
 	 * start, and this is the last comparison where address
 	 * wrap might be a problem.
 	 */
 	st = (start > map->root->end) ? start : map->root->end;
 	if (length <= map->root->end + map->root->adj_free - st) {
 		*addr = st;
 		goto found;
 	}
 
 	/* With max_free, can immediately tell if no solution. */
 	entry = map->root->right;
 	if (entry == NULL || length > entry->max_free)
 		return (1);
 
 	/*
 	 * Search the right subtree in the order: left subtree, root,
 	 * right subtree (first fit).  The previous splay implies that
 	 * all regions in the right subtree have addresses > start.
 	 */
 	while (entry != NULL) {
 		if (entry->left != NULL && entry->left->max_free >= length)
 			entry = entry->left;
 		else if (entry->adj_free >= length) {
 			*addr = entry->end;
 			goto found;
 		} else
 			entry = entry->right;
 	}
 
 	/* Can't get here, so panic if we do. */
 	panic("vm_map_findspace: max_free corrupt");
 
 found:
 	/* Expand the kernel pmap, if necessary. */
 	if (map == kernel_map) {
 		end = round_page(*addr + length);
 		if (end > kernel_vm_end)
 			pmap_growkernel(end);
 	}
 	return (0);
 }
 
 /*
  * Map "object" at the fixed address *addr for "length" bytes.
  *
  * The map is locked, the range is clamped with VM_MAP_RANGE_CHECK(),
  * anything already mapped in the range is deleted, and the new
  * mapping is inserted.  Returns the result of vm_map_insert().
  *
  * Although addr is annotated IN/OUT, this function only reads *addr.
  */
 int
 vm_map_fixed(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr /* IN/OUT */, vm_size_t length, vm_prot_t prot,
     vm_prot_t max, int cow)
 {
 	vm_offset_t start, end;
 	int result;
 
 	start = *addr;
 	vm_map_lock(map);
 	end = start + length;
 	VM_MAP_RANGE_CHECK(map, start, end);
 	/* The result of the delete is deliberately ignored. */
 	(void) vm_map_delete(map, start, end);
 	result = vm_map_insert(map, object, offset, start, end, prot,
 	    max, cow);
 	vm_map_unlock(map);
 	return (result);
 }
 
 /*
  *	vm_map_find maps "length" bytes of the given object into the
  *	target address map.  With find_space set, a first-fit search
  *	for free space starts at *addr and the address chosen is
  *	written back to *addr; otherwise the mapping is attempted at
  *	*addr itself.
  *
  *	Returns KERN_NO_SPACE when the search fails, and the result
  *	of vm_map_insert() otherwise.
  *
  *	If object is non-NULL, ref count must be bumped by caller
  *	prior to making call to account for the new entry.
  */
 int
 vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
 	    vm_offset_t *addr,	/* IN/OUT */
 	    vm_size_t length, boolean_t find_space, vm_prot_t prot,
 	    vm_prot_t max, int cow)
 {
 	vm_offset_t base;
 	int rv;
 
 	base = *addr;
 	vm_map_lock(map);
 	if (find_space && vm_map_findspace(map, base, length, addr) != 0) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 	/* Picks up the address chosen by the search, if one was made. */
 	base = *addr;
 	rv = vm_map_insert(map, object, offset, base, base + length, prot,
 	    max, cow);
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  *	vm_map_simplify_entry:
  *
  *	Try to coalesce the given map entry with its predecessor and/or
  *	its successor.  A neighbor is absorbed only when it is adjacent in
  *	address space, refers to the same object at a contiguous offset
  *	(or neither has an object), and has identical eflags, protection,
  *	max_protection, inheritance and wired_count.
  *
  *	Entries that are in transition or that are submaps are left alone.
  *
  *	The map must be locked.
  *
  *	This routine guarantees that the passed entry remains valid (though
  *	possibly extended).  When merging, this routine may delete one or
  *	both neighbors.
  */
 void
 vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_map_entry_t after, before;
 
 	if ((entry->eflags &
 	    (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) != 0)
 		return;
 
 	/* Absorb the predecessor, if it is compatible. */
 	before = entry->prev;
 	if (before != &map->header &&
 	    before->end == entry->start &&
 	    before->object.vm_object == entry->object.vm_object &&
 	    (before->object.vm_object == NULL ||
 	    before->offset + (before->end - before->start) ==
 	    entry->offset) &&
 	    before->eflags == entry->eflags &&
 	    before->protection == entry->protection &&
 	    before->max_protection == entry->max_protection &&
 	    before->inheritance == entry->inheritance &&
 	    before->wired_count == entry->wired_count) {
 		vm_map_entry_unlink(map, before);
 		entry->start = before->start;
 		entry->offset = before->offset;
 		/* entry->prev is now the entry that preceded "before". */
 		if (entry->prev != &map->header)
 			vm_map_entry_resize_free(map, entry->prev);
 		/* Drop the reference that the absorbed entry held. */
 		if (before->object.vm_object != NULL)
 			vm_object_deallocate(before->object.vm_object);
 		vm_map_entry_dispose(map, before);
 	}
 
 	/* Absorb the successor, if it is compatible. */
 	after = entry->next;
 	if (after != &map->header &&
 	    entry->end == after->start &&
 	    after->object.vm_object == entry->object.vm_object &&
 	    (entry->object.vm_object == NULL ||
 	    entry->offset + (entry->end - entry->start) ==
 	    after->offset) &&
 	    after->eflags == entry->eflags &&
 	    after->protection == entry->protection &&
 	    after->max_protection == entry->max_protection &&
 	    after->inheritance == entry->inheritance &&
 	    after->wired_count == entry->wired_count) {
 		vm_map_entry_unlink(map, after);
 		entry->end = after->end;
 		vm_map_entry_resize_free(map, entry);
 		if (after->object.vm_object != NULL)
 			vm_object_deallocate(after->object.vm_object);
 		vm_map_entry_dispose(map, after);
 	}
 }
 /*
  *	vm_map_clip_start:	[ internal use only ]
  *
  *	Asserts that the given entry begins at or after
  *	the specified address; if necessary,
  *	it splits the entry into two.
  *
  *	Every macro argument is parenthesized so that expression
  *	arguments (e.g. a conditional expression for the address) are
  *	compared as a whole.  Arguments may be evaluated more than once.
  *	The expansion stays a plain brace block, as before, so existing
  *	call sites keep compiling unchanged.
  */
 #define vm_map_clip_start(map, entry, startaddr) \
 { \
 	if ((startaddr) > (entry)->start) \
 		_vm_map_clip_start((map), (entry), (startaddr)); \
 }
 
 /*
  *	Split "entry" at address "start": a new entry covering the front
  *	part [entry->start, start) is linked in ahead of it, and "entry"
  *	itself is trimmed to begin at "start".
  *
  *	This routine is called only when it is known that
  *	the entry must be split.
  */
 static void
 _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start)
 {
 	vm_map_entry_t front;
 	vm_object_t backing;
 
 	/*
 	 * Give the entry a chance to merge with its neighbors before it
 	 * is cut.  The front piece must be inserted BEFORE this entry so
 	 * that this entry ends up with the requested starting address.
 	 */
 	vm_map_simplify_entry(map, entry);
 
 	/*
 	 * For a non-system map, make sure the entry has a backing object
 	 * before it is split.  Deferring this would let an object be
 	 * created after the clip, giving each fragment its own object.
 	 * This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
 	if (!map->system_map && entry->object.vm_object == NULL) {
 		backing = vm_object_allocate(OBJT_DEFAULT,
 		    atop(entry->end - entry->start));
 		entry->offset = 0;
 		entry->object.vm_object = backing;
 	}
 
 	/* The front piece starts out as an exact copy of the entry. */
 	front = vm_map_entry_create(map);
 	*front = *entry;
 	front->end = start;
 
 	/* Trim the original so that it covers only [start, end). */
 	entry->offset += start - entry->start;
 	entry->start = start;
 
 	vm_map_entry_link(map, entry->prev, front);
 
 	/* The copy shares the object, so it needs its own reference. */
 	if (!(entry->eflags & MAP_ENTRY_IS_SUB_MAP))
 		vm_object_reference(front->object.vm_object);
 }
 
 /*
  *	vm_map_clip_end:	[ internal use only ]
  *
  *	Asserts that the given entry ends at or before
  *	the specified address; if necessary,
  *	it splits the entry into two.
  *
  *	The "entry" argument is parenthesized before it is dereferenced
  *	so that an expression argument works as expected.  Arguments may
  *	be evaluated more than once.  The expansion stays a plain brace
  *	block, as before, so existing call sites keep compiling unchanged.
  */
 #define vm_map_clip_end(map, entry, endaddr) \
 { \
 	if ((endaddr) < ((entry)->end)) \
 		_vm_map_clip_end((map), (entry), (endaddr)); \
 }
 
 /*
  *	Split "entry" at address "end": "entry" is trimmed to stop at
  *	"end", and a new entry covering the remaining tail is linked in
  *	directly after it.
  *
  *	This routine is called only when it is known that
  *	the entry must be split.
  */
 static void
 _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end)
 {
 	vm_map_entry_t tail;
 	vm_object_t backing;
 
 	/*
 	 * For a non-system map, make sure the entry has a backing object
 	 * before it is split.  Deferring this would let an object be
 	 * created after the clip, giving each fragment its own object.
 	 * This is a bit of a hack, but is also about the best place to
 	 * put this improvement.
 	 */
 	if (!map->system_map && entry->object.vm_object == NULL) {
 		backing = vm_object_allocate(OBJT_DEFAULT,
 		    atop(entry->end - entry->start));
 		entry->offset = 0;
 		entry->object.vm_object = backing;
 	}
 
 	/* The tail piece starts out as an exact copy of the entry. */
 	tail = vm_map_entry_create(map);
 	*tail = *entry;
 
 	/* Cut at "end"; the tail's offset advances past the kept part. */
 	entry->end = end;
 	tail->start = end;
 	tail->offset += end - entry->start;
 
 	/* Insert the tail AFTER the entry that was clipped. */
 	vm_map_entry_link(map, entry, tail);
 
 	/* The copy shares the object, so it needs its own reference. */
 	if (!(entry->eflags & MAP_ENTRY_IS_SUB_MAP))
 		vm_object_reference(tail->object.vm_object);
 }
 
 /*
  *	vm_map_submap:		[ kernel use only ]
  *
  *	Mark the given range as handled by a subordinate map.
  *
  *	This range must have been created with vm_map_find,
  *	and no other operations may have been performed on this
  *	range prior to calling vm_map_submap.
  *
  *	Only a limited number of operations can be performed
  *	within this range after calling vm_map_submap:
  *		vm_fault
  *	[Don't try vm_map_copy!]
  *
  *	To remove a submapping, one must first remove the
  *	range from the superior map, and then destroy the
  *	submap (if desired).  [Better yet, don't try it.]
  *
  *	Returns KERN_SUCCESS when the range is covered by exactly one
  *	entry that is not copy-on-write and has no object; otherwise
  *	KERN_INVALID_ARGUMENT is returned and no entry is marked.
  */
 int
 vm_map_submap(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	vm_map_t submap)
 {
 	vm_map_entry_t entry;
 	int rv;
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	/* Find the entry for "start" and trim it to the range. */
 	if (!vm_map_lookup_entry(map, start, &entry)) {
 		entry = entry->next;
 	} else {
 		vm_map_clip_start(map, entry, start);
 	}
 	vm_map_clip_end(map, entry, end);
 
 	if (entry->start != start || entry->end != end ||
 	    (entry->eflags & MAP_ENTRY_COW) != 0 ||
 	    entry->object.vm_object != NULL) {
 		rv = KERN_INVALID_ARGUMENT;
 	} else {
 		entry->eflags |= MAP_ENTRY_IS_SUB_MAP;
 		entry->object.sub_map = submap;
 		rv = KERN_SUCCESS;
 	}
 
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /*
  * Page-count threshold used by vm_map_pmap_enter(): when
  * MAP_PREFAULT_PARTIAL is requested and both the range to be mapped and
  * the object's resident page count exceed this many pages, no mappings
  * are preloaded at all.
  */
 #define	MAX_INIT_PT	96
 
 /*
  *	vm_map_pmap_enter:
  *
  *	Preload read-only mappings for the given object's resident pages into
  *	the given map.  This eliminates the soft faults on process startup and
  *	immediately after an mmap(2).  Unless the given flags include
  *	MAP_PREFAULT_MADVISE, cached pages are not reactivated and mapped.
  *
  *	"addr" is the virtual address that corresponds to page index
  *	"pindex" of "object"; "size" is the length of the range in bytes.
  *	Nothing is done when "prot" allows neither read nor execute, or
  *	when "object" is NULL.  The object is locked for the duration of
  *	the call.
  */
 void
 vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags)
 {
 	vm_offset_t start;
 	vm_page_t p, p_start;
 	vm_pindex_t psize, tmpidx;
 	boolean_t are_queues_locked;
 
 	if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	/* Device objects are handed to the pmap layer in one call. */
 	if (object->type == OBJT_DEVICE) {
 		pmap_object_init_pt(map->pmap, addr, object, pindex, size);
 		goto unlock_return;
 	}
 
 	psize = atop(size);
 
 	/*
 	 * Only vnode objects are preloaded.  A partial prefault is
 	 * skipped entirely when both the range and the object's resident
 	 * page count exceed MAX_INIT_PT.
 	 */
 	if (object->type != OBJT_VNODE ||
 	    ((flags & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
 	     (object->resident_page_count > MAX_INIT_PT))) {
 		goto unlock_return;
 	}
 
 	/* Clamp the page count so the range ends within the object. */
 	if (psize + pindex > object->size) {
 		if (object->size < pindex)
 			goto unlock_return;
 		psize = object->size - pindex;
 	}
 
 	are_queues_locked = FALSE;
 	start = 0;
 	p_start = NULL;
 
 	/*
 	 * If the first resident page lies below pindex, splay the
 	 * object's page tree at pindex and step forward once if the new
 	 * root is still below pindex.
 	 */
 	if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
 		if (p->pindex < pindex) {
 			p = vm_page_splay(pindex, object->root);
 			if ((object->root = p)->pindex < pindex)
 				p = TAILQ_NEXT(p, listq);
 		}
 	}
 	/*
 	 * Assert: the variable p is either (1) the page with the
 	 * least pindex greater than or equal to the parameter pindex
 	 * or (2) NULL.
 	 */
 	/*
 	 * Walk the resident pages that fall inside the range.  p_start and
 	 * start remember where the current run of fully valid, non-busy
 	 * pages began; the run is passed to pmap_enter_object() as soon as
 	 * a page that does not qualify is met.
 	 */
 	for (;
 	     p != NULL && (tmpidx = p->pindex - pindex) < psize;
 	     p = TAILQ_NEXT(p, listq)) {
 		/*
 		 * don't allow an madvise to blow away our really
 		 * free pages allocating pv entries.
 		 */
 		if ((flags & MAP_PREFAULT_MADVISE) &&
 		    cnt.v_free_count < cnt.v_free_reserved) {
 			/* Shrink the range so a pending run ends here. */
 			psize = tmpidx;
 			break;
 		}
 		if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL &&
 		    (p->busy == 0)) {
 			if (p_start == NULL) {
 				start = addr + ptoa(tmpidx);
 				p_start = p;
 			}
 		} else if (p_start != NULL) {
 			/* The page queues lock is taken on first use only. */
 			if (!are_queues_locked) {
 				are_queues_locked = TRUE;
 				vm_page_lock_queues();
 			}
 			pmap_enter_object(map->pmap, start, addr +
 			    ptoa(tmpidx), p_start, prot);
 			p_start = NULL;
 		}
 	}
 	/* Enter the run that was still open when the walk ended. */
 	if (p_start != NULL) {
 		if (!are_queues_locked) {
 			are_queues_locked = TRUE;
 			vm_page_lock_queues();
 		}
 		pmap_enter_object(map->pmap, start, addr + ptoa(psize),
 		    p_start, prot);
 	}
 	if (are_queues_locked)
 		vm_page_unlock_queues();
 unlock_return:
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_map_protect:
  *
  *	Sets the protection of the specified address
  *	region in the target map.  If "set_max" is
  *	specified, the maximum protection is to be set;
  *	otherwise, only the current protection is affected.
  *
  *	Returns KERN_INVALID_ARGUMENT if the region contains a submap
  *	entry, KERN_PROTECTION_FAILURE if "new_prot" exceeds the maximum
  *	protection of some entry, and KERN_SUCCESS otherwise.  Nothing is
  *	changed when an error is returned.
  */
 int
 vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_prot_t new_prot, boolean_t set_max)
 {
 	vm_map_entry_t cur, first;
 	vm_prot_t before, pmap_prot;
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (!vm_map_lookup_entry(map, start, &first)) {
 		first = first->next;
 	} else {
 		vm_map_clip_start(map, first, start);
 	}
 
 	/*
 	 * Pass one: validate every entry in the region before touching
 	 * any of them.
 	 */
 	for (cur = first; cur != &map->header && cur->start < end;
 	    cur = cur->next) {
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if ((new_prot & cur->max_protection) != new_prot) {
 			vm_map_unlock(map);
 			return (KERN_PROTECTION_FAILURE);
 		}
 	}
 
 	/*
 	 * Pass two: apply the new protection.  The start of the region
 	 * was clipped above, so only the end needs clipping here.
 	 */
 	for (cur = first; cur != &map->header && cur->start < end;
 	    cur = cur->next) {
 		vm_map_clip_end(map, cur, end);
 
 		before = cur->protection;
 		if (set_max) {
 			/* The current protection is capped by the new max. */
 			cur->max_protection = new_prot;
 			cur->protection = new_prot & before;
 		} else {
 			cur->protection = new_prot;
 		}
 
 		/*
 		 * Update physical map if necessary. Worry about copy-on-write
 		 * here -- CHECK THIS XXX
 		 *
 		 * A copy-on-write entry never gets write permission in the
 		 * pmap; any other entry is limited to VM_PROT_ALL.
 		 */
 		if (cur->protection != before) {
 			if ((cur->eflags & MAP_ENTRY_COW) != 0)
 				pmap_prot = cur->protection & ~VM_PROT_WRITE;
 			else
 				pmap_prot = cur->protection & VM_PROT_ALL;
 			pmap_protect(map->pmap, cur->start, cur->end,
 			    pmap_prot);
 		}
 		vm_map_simplify_entry(map, cur);
 	}
 
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_madvise:
  *
  *	This routine traverses a process's map handling the madvise
  *	system call.  Advisories are classified as either those affecting
  *	the vm_map_entry structure, or those affecting the underlying
  *	objects.
  *
  *	Returns KERN_INVALID_ARGUMENT for an unrecognized "behav" and 0
  *	otherwise.  Submap entries inside the range are skipped.
  */
 int
 vm_map_madvise(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	int behav)
 {
 	vm_map_entry_t current, entry;
 	int modify_map = 0;
 
 	/*
 	 * Some madvise calls directly modify the vm_map_entry, in which case
 	 * we need to use an exclusive lock on the map and we need to perform
 	 * various clipping operations.  Otherwise we only need a read-lock
 	 * on the map.
 	 */
 	switch (behav) {
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
 	case MADV_NOSYNC:
 	case MADV_AUTOSYNC:
 	case MADV_NOCORE:
 	case MADV_CORE:
 		modify_map = 1;
 		vm_map_lock(map);
 		break;
 	case MADV_WILLNEED:
 	case MADV_DONTNEED:
 	case MADV_FREE:
 		vm_map_lock_read(map);
 		break;
 	default:
 		return (KERN_INVALID_ARGUMENT);
 	}
 
 	/*
 	 * Locate starting entry and clip if necessary.
 	 */
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (vm_map_lookup_entry(map, start, &entry)) {
 		if (modify_map)
 			vm_map_clip_start(map, entry, start);
 	} else {
 		entry = entry->next;
 	}
 
 	if (modify_map) {
 		/*
 		 * madvise behaviors that are implemented in the vm_map_entry.
 		 *
 		 * We clip the vm_map_entry so that behavioral changes are
 		 * limited to the specified address range.
 		 */
 		for (current = entry;
 		     (current != &map->header) && (current->start < end);
 		     current = current->next
 		) {
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
 			vm_map_clip_end(map, current, end);
 
 			switch (behav) {
 			case MADV_NORMAL:
 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL);
 				break;
 			case MADV_SEQUENTIAL:
 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL);
 				break;
 			case MADV_RANDOM:
 				vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM);
 				break;
 			case MADV_NOSYNC:
 				current->eflags |= MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_AUTOSYNC:
 				current->eflags &= ~MAP_ENTRY_NOSYNC;
 				break;
 			case MADV_NOCORE:
 				current->eflags |= MAP_ENTRY_NOCOREDUMP;
 				break;
 			case MADV_CORE:
 				current->eflags &= ~MAP_ENTRY_NOCOREDUMP;
 				break;
 			default:
 				break;
 			}
 			/* The changed entry may now match a neighbor. */
 			vm_map_simplify_entry(map, current);
 		}
 		vm_map_unlock(map);
 	} else {
 		vm_pindex_t pindex;
 		int count;
 
 		/*
 		 * madvise behaviors that are implemented in the underlying
 		 * vm_object.
 		 *
 		 * Since we don't clip the vm_map_entry, we have to clip
 		 * the vm_object pindex and count.
 		 */
 		for (current = entry;
 		     (current != &map->header) && (current->start < end);
 		     current = current->next
 		) {
 			vm_offset_t useStart;
 
 			if (current->eflags & MAP_ENTRY_IS_SUB_MAP)
 				continue;
 
 			pindex = OFF_TO_IDX(current->offset);
 			count = atop(current->end - current->start);
 			useStart = current->start;
 
 			/* Drop the pages that lie below "start". */
 			if (current->start < start) {
 				pindex += atop(start - current->start);
 				count -= atop(start - current->start);
 				useStart = start;
 			}
 			/* Drop the pages that lie at or above "end". */
 			if (current->end > end)
 				count -= atop(current->end - end);
 
 			if (count <= 0)
 				continue;
 
 			vm_object_madvise(current->object.vm_object,
 					  pindex, count, behav);
 			if (behav == MADV_WILLNEED) {
 				/*
 				 * Widen the page count before converting it
 				 * to a byte length: shifting the int itself
 				 * overflows once the range reaches 2^31
 				 * bytes.
 				 */
 				vm_map_pmap_enter(map,
 				    useStart,
 				    current->protection,
 				    current->object.vm_object,
 				    pindex,
 				    ptoa((vm_size_t)count),
 				    MAP_PREFAULT_MADVISE
 				);
 			}
 		}
 		vm_map_unlock_read(map);
 	}
 	return (0);
 }
 
 
 /*
  *	vm_map_inherit:
  *
  *	Sets the inheritance of the specified address
  *	range in the target map.  Inheritance
  *	affects how the map will be shared with
  *	child maps at the time of vm_map_fork.
  *
  *	Returns KERN_INVALID_ARGUMENT when "new_inheritance" is not one of
  *	VM_INHERIT_NONE, VM_INHERIT_COPY or VM_INHERIT_SHARE, and
  *	KERN_SUCCESS otherwise.
  */
 int
 vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end,
 	       vm_inherit_t new_inheritance)
 {
 	vm_map_entry_t cur;
 
 	/* Reject unknown inheritance values before taking the lock. */
 	if (new_inheritance != VM_INHERIT_NONE &&
 	    new_inheritance != VM_INHERIT_COPY &&
 	    new_inheritance != VM_INHERIT_SHARE)
 		return (KERN_INVALID_ARGUMENT);
 
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 
 	if (!vm_map_lookup_entry(map, start, &cur)) {
 		cur = cur->next;
 	} else {
 		vm_map_clip_start(map, cur, start);
 	}
 
 	/* Clip each entry to the range, update it, then try to merge. */
 	for (; cur != &map->header && cur->start < end; cur = cur->next) {
 		vm_map_clip_end(map, cur, end);
 		cur->inheritance = new_inheritance;
 		vm_map_simplify_entry(map, cur);
 	}
 
 	vm_map_unlock(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_unwire:
  *
  *	Implements both kernel and user unwiring.
  *
  *	"flags" selects user unwiring (VM_MAP_WIRE_USER) and whether
  *	unmapped holes in [start, end) are tolerated (VM_MAP_WIRE_HOLESOK).
  *
  *	The work is done in two passes.  The first pass clips the entries
  *	to the range, marks them in-transition and validates them.  The
  *	second pass, starting at "done", drops the wirings (only when the
  *	first pass succeeded) and clears the in-transition marks.
  *
  *	Returns KERN_SUCCESS, KERN_INVALID_ADDRESS when part of the range
  *	is unmapped and holes are not allowed, or KERN_INVALID_ARGUMENT
  *	when a system unwire meets an entry that is not system wired.
  */
 int
 vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	vm_map_entry_t entry, first_entry, tmp_entry;
 	vm_offset_t saved_start;
 	unsigned int last_timestamp;
 	int rv;
 	boolean_t need_wakeup, result, user_unwire;
 
 	user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (flags & VM_MAP_WIRE_HOLESOK)
 			first_entry = first_entry->next;
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			saved_start = (start >= entry->start) ? start :
 			    entry->start;
 			/* Ask for a wakeup, then sleep with the map unlocked. */
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			if (vm_map_unlock_and_wait(map, user_unwire)) {
 				/*
 				 * Allow interruption of user unwiring?
 				 */
 			}
 			vm_map_lock(map);
 			if (last_timestamp+1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry)) {
 					if (flags & VM_MAP_WIRE_HOLESOK)
 						tmp_entry = tmp_entry->next;
 					else {
 						if (saved_start == start) {
 							/*
 							 * First_entry has been deleted.
 							 */
 							vm_map_unlock(map);
 							return (KERN_INVALID_ADDRESS);
 						}
 						/*
 						 * Limit the second pass to the
 						 * part already processed.
 						 */
 						end = saved_start;
 						rv = KERN_INVALID_ADDRESS;
 						goto done;
 					}
 				}
 				/*
 				 * A NULL first_entry makes the second pass
 				 * look the first entry up again.
 				 */
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 			}
 			last_timestamp = map->timestamp;
 			/* Re-examine the (possibly new) entry from the top. */
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
 		    (entry->end < end && (entry->next == &map->header ||
 		    entry->next->start > entry->end))) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		/*
 		 * If system unwiring, require that the entry is system wired.
 		 */
 		if (!user_unwire &&
 		    vm_map_entry_system_wired_count(entry) == 0) {
 			end = entry->end;
 			rv = KERN_INVALID_ARGUMENT;
 			goto done;
 		}
 		entry = entry->next;
 	}
 	rv = KERN_SUCCESS;
 done:
 	/*
 	 * Second pass over [start, end), where "end" may have been pulled
 	 * back by a failure above.
 	 */
 	need_wakeup = FALSE;
 	if (first_entry == NULL) {
 		result = vm_map_lookup_entry(map, start, &first_entry);
 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
 			first_entry = first_entry->next;
 		else
 			KASSERT(result, ("vm_map_unwire: lookup failed"));
 	}
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		/*
 		 * Unwire only on success.  A user unwire touches only
 		 * entries that are marked user wired.
 		 */
 		if (rv == KERN_SUCCESS && (!user_unwire ||
 		    (entry->eflags & MAP_ENTRY_USER_WIRED))) {
 			if (user_unwire)
 				entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 			entry->wired_count--;
 			if (entry->wired_count == 0) {
 				/*
 				 * Retain the map lock.
 				 */
 				vm_fault_unwire(map, entry->start, entry->end,
 				    entry->object.vm_object != NULL &&
 				    entry->object.vm_object->type == OBJT_DEVICE);
 			}
 		}
 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
 			("vm_map_unwire: in-transition flag missing"));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
 		/* Remember whether anyone asked to be woken for this entry. */
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
 		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	/* The wakeup is issued after the map lock has been dropped. */
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  *	vm_map_wire:
  *
  *	Implements both kernel and user wiring.
  *
  *	"flags" selects user wiring (VM_MAP_WIRE_USER) and whether
  *	unmapped holes in [start, end) are tolerated (VM_MAP_WIRE_HOLESOK).
  *
  *	The first pass clips the entries to the range, marks them
  *	in-transition and wires them; the map lock is dropped around each
  *	call to vm_fault_wire().  The second pass, starting at "done",
  *	either records the user wiring (on success) or backs out the
  *	wirings made so far (on failure), and clears the in-transition
  *	marks.
  *
  *	Returns KERN_SUCCESS, KERN_INVALID_ADDRESS when part of the range
  *	is unmapped and holes are not allowed, or the error returned by
  *	vm_fault_wire().
  */
 int
 vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     int flags)
 {
 	vm_map_entry_t entry, first_entry, tmp_entry;
 	vm_offset_t saved_end, saved_start;
 	unsigned int last_timestamp;
 	int rv;
 	boolean_t fictitious, need_wakeup, result, user_wire;
 
 	user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE;
 	vm_map_lock(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first_entry)) {
 		if (flags & VM_MAP_WIRE_HOLESOK)
 			first_entry = first_entry->next;
 		else {
 			vm_map_unlock(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 	}
 	last_timestamp = map->timestamp;
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		if (entry->eflags & MAP_ENTRY_IN_TRANSITION) {
 			/*
 			 * We have not yet clipped the entry.
 			 */
 			saved_start = (start >= entry->start) ? start :
 			    entry->start;
 			/* Ask for a wakeup, then sleep with the map unlocked. */
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			if (vm_map_unlock_and_wait(map, user_wire)) {
 				/*
 				 * Allow interruption of user wiring?
 				 */
 			}
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry)) {
 					if (flags & VM_MAP_WIRE_HOLESOK)
 						tmp_entry = tmp_entry->next;
 					else {
 						if (saved_start == start) {
 							/*
 							 * first_entry has been deleted.
 							 */
 							vm_map_unlock(map);
 							return (KERN_INVALID_ADDRESS);
 						}
 						/*
 						 * Limit the second pass to the
 						 * part already processed.
 						 */
 						end = saved_start;
 						rv = KERN_INVALID_ADDRESS;
 						goto done;
 					}
 				}
 				/*
 				 * A NULL first_entry makes the second pass
 				 * look the first entry up again.
 				 */
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 			}
 			last_timestamp = map->timestamp;
 			/* Re-examine the (possibly new) entry from the top. */
 			continue;
 		}
 		vm_map_clip_start(map, entry, start);
 		vm_map_clip_end(map, entry, end);
 		/*
 		 * Mark the entry in case the map lock is released.  (See
 		 * above.)
 		 */
 		entry->eflags |= MAP_ENTRY_IN_TRANSITION;
 		/*
 		 * An entry that is not wired yet is wired now through
 		 * vm_fault_wire().  An entry that is already wired only has
 		 * its count raised, and not even that for a user wire of an
 		 * entry that is already user wired.
 		 */
 		if (entry->wired_count == 0) {
 			entry->wired_count++;
 			saved_start = entry->start;
 			saved_end = entry->end;
 			fictitious = entry->object.vm_object != NULL &&
 			    entry->object.vm_object->type == OBJT_DEVICE;
 			/*
 			 * Release the map lock, relying on the in-transition
 			 * mark.
 			 */
 			vm_map_unlock(map);
 			rv = vm_fault_wire(map, saved_start, saved_end,
 			    user_wire, fictitious);
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.  The entry
 				 * may have been clipped, but NOT merged or
 				 * deleted.
 				 */
 				result = vm_map_lookup_entry(map, saved_start,
 				    &tmp_entry);
 				KASSERT(result, ("vm_map_wire: lookup failed"));
 				if (entry == first_entry)
 					first_entry = tmp_entry;
 				else
 					first_entry = NULL;
 				entry = tmp_entry;
 				/*
 				 * Walk the pieces the original entry was
 				 * clipped into, up to the one ending at
 				 * saved_end; on failure each piece passed
 				 * over is marked as not wired.
 				 */
 				while (entry->end < saved_end) {
 					if (rv != KERN_SUCCESS) {
 						KASSERT(entry->wired_count == 1,
 						    ("vm_map_wire: bad count"));
 						entry->wired_count = -1;
 					}
 					entry = entry->next;
 				}
 			}
 			last_timestamp = map->timestamp;
 			if (rv != KERN_SUCCESS) {
 				KASSERT(entry->wired_count == 1,
 				    ("vm_map_wire: bad count"));
 				/*
 				 * Assign an out-of-range value to represent
 				 * the failure to wire this entry.
 				 */
 				entry->wired_count = -1;
 				end = entry->end;
 				goto done;
 			}
 		} else if (!user_wire ||
 			   (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) {
 			entry->wired_count++;
 		}
 		/*
 		 * Check the map for holes in the specified region.
 		 * If VM_MAP_WIRE_HOLESOK was specified, skip this check.
 		 */
 		if (((flags & VM_MAP_WIRE_HOLESOK) == 0) &&
 		    (entry->end < end && (entry->next == &map->header ||
 		    entry->next->start > entry->end))) {
 			end = entry->end;
 			rv = KERN_INVALID_ADDRESS;
 			goto done;
 		}
 		entry = entry->next;
 	}
 	rv = KERN_SUCCESS;
 done:
 	/*
 	 * Second pass over [start, end), where "end" may have been pulled
 	 * back by a failure above.
 	 */
 	need_wakeup = FALSE;
 	if (first_entry == NULL) {
 		result = vm_map_lookup_entry(map, start, &first_entry);
 		if (!result && (flags & VM_MAP_WIRE_HOLESOK))
 			first_entry = first_entry->next;
 		else
 			KASSERT(result, ("vm_map_wire: lookup failed"));
 	}
 	entry = first_entry;
 	while (entry != &map->header && entry->start < end) {
 		if (rv == KERN_SUCCESS) {
 			if (user_wire)
 				entry->eflags |= MAP_ENTRY_USER_WIRED;
 		} else if (entry->wired_count == -1) {
 			/*
 			 * Wiring failed on this entry.  Thus, unwiring is
 			 * unnecessary.
 			 */
 			entry->wired_count = 0;
 		} else {
 			/*
 			 * Back out the wiring taken in the first pass,
 			 * under the same condition that decided whether
 			 * the count was raised for an already wired entry.
 			 */
 			if (!user_wire ||
 			    (entry->eflags & MAP_ENTRY_USER_WIRED) == 0)
 				entry->wired_count--;
 			if (entry->wired_count == 0) {
 				/*
 				 * Retain the map lock.
 				 */
 				vm_fault_unwire(map, entry->start, entry->end,
 				    entry->object.vm_object != NULL &&
 				    entry->object.vm_object->type == OBJT_DEVICE);
 			}
 		}
 		KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION,
 			("vm_map_wire: in-transition flag missing"));
 		entry->eflags &= ~MAP_ENTRY_IN_TRANSITION;
 		/* Remember whether anyone asked to be woken for this entry. */
 		if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) {
 			entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP;
 			need_wakeup = TRUE;
 		}
 		vm_map_simplify_entry(map, entry);
 		entry = entry->next;
 	}
 	vm_map_unlock(map);
 	/* The wakeup is issued after the map lock has been dropped. */
 	if (need_wakeup)
 		vm_map_wakeup(map);
 	return (rv);
 }
 
 /*
  * vm_map_sync
  *
  * Push any dirty cached pages in the address range to their pager.
  * If syncio is TRUE, dirty pages are written synchronously.
  * If invalidate is TRUE, any cached pages are freed as well.
  *
  * If the size of the region from start to end is zero, we are
  * supposed to flush all modified pages within the region containing
  * start.  Unfortunately, a region can be split or coalesced with
  * neighboring regions, making it difficult to determine what the
  * original region was.  Therefore, we approximate this requirement by
  * flushing the current region containing start.
  *
  * Returns KERN_INVALID_ADDRESS if any part of the specified range is
  * not mapped, KERN_INVALID_ARGUMENT if invalidate is requested on a
  * range that contains user-wired memory, and KERN_SUCCESS otherwise.
  */
 int
 vm_map_sync(
 	vm_map_t map,
 	vm_offset_t start,
 	vm_offset_t end,
 	boolean_t syncio,
 	boolean_t invalidate)
 {
 	vm_map_entry_t cur, first, sub_entry;
 	vm_map_t sub;
 	vm_object_t obj;
 	vm_ooffset_t off;
 	vm_size_t len, sub_len;
 
 	vm_map_lock_read(map);
 	VM_MAP_RANGE_CHECK(map, start, end);
 	if (!vm_map_lookup_entry(map, start, &first)) {
 		vm_map_unlock_read(map);
 		return (KERN_INVALID_ADDRESS);
 	}
 	if (start == end) {
 		/* Empty range: use the whole entry containing start. */
 		start = first->start;
 		end = first->end;
 	}
 
 	/*
 	 * Pass one: refuse to invalidate user-wired memory, and make sure
 	 * that the range has no holes.
 	 */
 	cur = first;
 	while (cur != &map->header && cur->start < end) {
 		if (invalidate && (cur->eflags & MAP_ENTRY_USER_WIRED) != 0) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ARGUMENT);
 		}
 		if (end > cur->end && (cur->next == &map->header ||
 		    cur->end != cur->next->start)) {
 			vm_map_unlock_read(map);
 			return (KERN_INVALID_ADDRESS);
 		}
 		cur = cur->next;
 	}
 
 	if (invalidate)
 		pmap_remove(map->pmap, start, end);
 
 	/*
 	 * Pass two: sync the object behind each entry.  "start" advances
 	 * by the amount handled in each iteration.
 	 */
 	cur = first;
 	while (cur != &map->header && cur->start < end) {
 		off = cur->offset + (start - cur->start);
 		len = cur->end;
 		if (end <= cur->end)
 			len = end;
 		len -= start;
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = cur->object.vm_object;
 		} else {
 			/*
 			 * Resolve the offset through the submap, and limit
 			 * the length to what the submap entry covers.
 			 */
 			sub = cur->object.sub_map;
 			vm_map_lock_read(sub);
 			(void) vm_map_lookup_entry(sub, off, &sub_entry);
 			sub_len = sub_entry->end - off;
 			if (sub_len < len)
 				len = sub_len;
 			obj = sub_entry->object.vm_object;
 			off = sub_entry->offset + (off - sub_entry->start);
 			vm_map_unlock_read(sub);
 		}
 		vm_object_sync(obj, off, len, syncio, invalidate);
 		start += len;
 		cur = cur->next;
 	}
 
 	vm_map_unlock_read(map);
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_entry_unwire:	[ internal use only ]
  *
  *	Make the region specified by this entry pageable, and reset the
  *	entry's wired count to zero.
  *
  *	The map in question should be locked.
  *	[This is the reason for this routine's existence.]
  */
 static void
 vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t obj;
 	boolean_t fictitious;
 
 	/* vm_fault_unwire() is told whether a device object backs the range. */
 	obj = entry->object.vm_object;
 	fictitious = (obj != NULL && obj->type == OBJT_DEVICE);
 
 	vm_fault_unwire(map, entry->start, entry->end, fictitious);
 	entry->wired_count = 0;
 }
 
 /*
  *	vm_map_entry_delete:	[ internal use only ]
  *
  *	Deallocate the given entry from the target map: unlink it,
  *	subtract its length from the map's size, drop its object
  *	reference (if it has an object and is not a submap), and dispose
  *	of the entry itself.
  */
 static void
 vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry)
 {
 	vm_object_t obj;
 	vm_pindex_t first, last, npages;
 
 	vm_map_entry_unlink(map, entry);
 	map->size -= entry->end - entry->start;
 
 	/* Submaps and object-less entries need no object work. */
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
 	    (obj = entry->object.vm_object) == NULL) {
 		vm_map_entry_dispose(map, entry);
 		return;
 	}
 
 	npages = OFF_TO_IDX(entry->end - entry->start);
 	first = OFF_TO_IDX(entry->offset);
 	last = first + npages;
 
 	VM_OBJECT_LOCK(obj);
 	/*
 	 * When other references to the object remain, and the object is
 	 * either one of the kernel's objects or marked OBJ_ONEMAPPING
 	 * without OBJ_NOSPLIT, the pages and swap space behind this entry
 	 * are released here.
 	 */
 	if (obj->ref_count != 1 &&
 	    (obj == kernel_object || obj == kmem_object ||
 	    (obj->flags & (OBJ_NOSPLIT | OBJ_ONEMAPPING)) ==
 	    OBJ_ONEMAPPING)) {
 		vm_object_collapse(obj);
 		vm_object_page_remove(obj, first, last, FALSE);
 		if (obj->type == OBJT_SWAP)
 			swap_pager_freespace(obj, first, npages);
 		/* Shrink the object if its tail was just removed. */
 		if (first < obj->size && last >= obj->size)
 			obj->size = first;
 	}
 	VM_OBJECT_UNLOCK(obj);
 	vm_object_deallocate(obj);
 
 	vm_map_entry_dispose(map, entry);
 }
 
 /*
  *	vm_map_delete:	[ internal use only ]
  *
  *	Deallocates the given address range from the target
  *	map.
  *
  *	The map lock may be dropped and retaken while waiting for an
  *	entry that is in transition or, on a non-kernel pmap, system
  *	wired.  Always returns KERN_SUCCESS.
  */
 int
 vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t first_entry;
 
 	/*
 	 * Find the start of the region, and clip it
 	 */
 	if (!vm_map_lookup_entry(map, start, &first_entry))
 		entry = first_entry->next;
 	else {
 		entry = first_entry;
 		vm_map_clip_start(map, entry, start);
 	}
 
 	/*
 	 * Step through all entries in this region
 	 */
 	while ((entry != &map->header) && (entry->start < end)) {
 		vm_map_entry_t next;
 
 		/*
 		 * Wait for wiring or unwiring of an entry to complete.
 		 * Also wait for any system wirings to disappear on
 		 * user maps.
 		 */
 		if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 ||
 		    (vm_map_pmap(map) != kernel_pmap &&
 		    vm_map_entry_system_wired_count(entry) != 0)) {
 			unsigned int last_timestamp;
 			vm_offset_t saved_start;
 			vm_map_entry_t tmp_entry;
 
 			saved_start = entry->start;
 			/* Ask for a wakeup, then sleep with the map unlocked. */
 			entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			last_timestamp = map->timestamp;
 			(void) vm_map_unlock_and_wait(map, FALSE);
 			vm_map_lock(map);
 			if (last_timestamp + 1 != map->timestamp) {
 				/*
 				 * Look again for the entry because the map was
 				 * modified while it was unlocked.
 				 * Specifically, the entry may have been
 				 * clipped, merged, or deleted.
 				 */
 				if (!vm_map_lookup_entry(map, saved_start,
 							 &tmp_entry))
 					entry = tmp_entry->next;
 				else {
 					entry = tmp_entry;
 					vm_map_clip_start(map, entry,
 							  saved_start);
 				}
 			}
 			/* Re-examine the (possibly new) entry from the top. */
 			continue;
 		}
 		vm_map_clip_end(map, entry, end);
 
 		/* Save the successor; the entry is destroyed below. */
 		next = entry->next;
 
 		/*
 		 * Unwire before removing addresses from the pmap; otherwise,
 		 * unwiring will put the entries back in the pmap.
 		 */
 		if (entry->wired_count != 0) {
 			vm_map_entry_unwire(map, entry);
 		}
 
 		pmap_remove(map->pmap, entry->start, entry->end);
 
 		/*
 		 * Delete the entry (which may delete the object) only after
 		 * removing all pmap entries pointing to its pages.
 		 * (Otherwise, its page frames may be reallocated, and any
 		 * modify bits will be set in the wrong object!)
 		 */
 		vm_map_entry_delete(map, entry);
 		entry = next;
 	}
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_remove:
  *
  *	Remove the given address range from the target map.
  *	This is the exported form of vm_map_delete.
  *
  *	Locks the map around the call and returns vm_map_delete's
  *	result unchanged.
  */
 int
 vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end)
 {
 	int result;
 
 	vm_map_lock(map);
 	/*
 	 * NOTE(review): VM_MAP_RANGE_CHECK is defined elsewhere; presumably
 	 * it adjusts start/end to the map's bounds -- confirm.
 	 */
 	VM_MAP_RANGE_CHECK(map, start, end);
 	result = vm_map_delete(map, start, end);
 	vm_map_unlock(map);
 	return (result);
 }
 
 /*
  *	vm_map_check_protection:
  *
  *	Assert that the target map allows the specified privilege on the
  *	entire address region given.  The entire region must be allocated.
  *
  *	WARNING!  This code does not and should not check whether the
  *	contents of the region is accessible.  For example a smaller file
  *	might be mapped into a larger address space.
  *
  *	NOTE!  This code is also called by munmap().
  *
  *	The map must be locked.  A read lock is sufficient.
  *
  *	Returns TRUE when every entry covering [start, end) grants all
  *	bits in "protection"; returns FALSE if "start" is not mapped, if
  *	the range contains a hole, or if any entry lacks a requested bit.
  */
 boolean_t
 vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end,
 			vm_prot_t protection)
 {
 	vm_map_entry_t entry;
 	vm_map_entry_t tmp_entry;
 
 	if (!vm_map_lookup_entry(map, start, &tmp_entry))
 		return (FALSE);
 	entry = tmp_entry;
 
 	/* Walk forward entry by entry until the range is fully covered. */
 	while (start < end) {
 		/* Ran off the end of the entry list before reaching "end". */
 		if (entry == &map->header)
 			return (FALSE);
 		/*
 		 * No holes allowed!
 		 */
 		if (start < entry->start)
 			return (FALSE);
 		/*
 		 * Check protection associated with entry.
 		 */
 		if ((entry->protection & protection) != protection)
 			return (FALSE);
 		/* go to next entry */
 		start = entry->end;
 		entry = entry->next;
 	}
 	return (TRUE);
 }
 
 /*
  *	vm_map_copy_entry:
  *
  *	Copies the contents of the source entry to the destination
  *	entry.  The entries *must* be aligned properly.
  *
  *	Does nothing if either entry is a submap.  For an unwired source
  *	entry the backing object is shared and both entries are marked
  *	COW/NEEDS_COPY; for a wired source entry the pages are copied via
  *	vm_fault_copy_entry().
  */
 static void
 vm_map_copy_entry(
 	vm_map_t src_map,
 	vm_map_t dst_map,
 	vm_map_entry_t src_entry,
 	vm_map_entry_t dst_entry)
 {
 	vm_object_t src_object;
 
 	/* Submap entries are not copied here. */
 	if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP)
 		return;
 
 	if (src_entry->wired_count == 0) {
 
 		/*
 		 * If the source entry is marked needs_copy, it is already
 		 * write-protected.
 		 */
 		if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
 			pmap_protect(src_map->pmap,
 			    src_entry->start,
 			    src_entry->end,
 			    src_entry->protection & ~VM_PROT_WRITE);
 		}
 
 		/*
 		 * Make a copy of the object.
 		 */
 		if ((src_object = src_entry->object.vm_object) != NULL) {
 			VM_OBJECT_LOCK(src_object);
 			/*
 			 * Only handle-less default/swap objects are
 			 * collapsed, and split when they have the
 			 * ONEMAPPING flag set without NOSPLIT.
 			 */
 			if ((src_object->handle == NULL) &&
 				(src_object->type == OBJT_DEFAULT ||
 				 src_object->type == OBJT_SWAP)) {
 				vm_object_collapse(src_object);
 				if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) {
 					vm_object_split(src_entry);
 					/* Re-read: the entry's object is reloaded after the split. */
 					src_object = src_entry->object.vm_object;
 				}
 			}
 			/* The destination entry takes its own reference. */
 			vm_object_reference_locked(src_object);
 			vm_object_clear_flag(src_object, OBJ_ONEMAPPING);
 			VM_OBJECT_UNLOCK(src_object);
 			dst_entry->object.vm_object = src_object;
 			src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->offset = src_entry->offset;
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
 		}
 
 		pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
 		    dst_entry->end - dst_entry->start, src_entry->start);
 	} else {
 		/*
 		 * Of course, wired down pages can't be set copy-on-write.
 		 * Cause wired pages to be copied into the new map by
 		 * simulating faults (the new pages are pageable)
 		 */
 		vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry);
 	}
 }
 
 /*
  * vmspace_map_entry_forked:
  * Update the newly-forked vmspace each time a map entry is inherited
  * or copied.  The values for vm_dsize and vm_tsize are approximate
  * (and mostly-obsolete ideas in the face of mmap(2) et al.)
  *
  * vm1 is only read (its data/text base addresses and sizes classify the
  * entry); vm2's map size and one of vm_ssize/vm_dsize/vm_tsize are
  * increased.
  */
 static void
 vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2,
     vm_map_entry_t entry)
 {
 	vm_size_t entrysize;
 	vm_offset_t newend;
 
 	entrysize = entry->end - entry->start;
 	vm2->vm_map.size += entrysize;
 	if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) {
 		/* Growable entries are accounted as stack. */
 		vm2->vm_ssize += btoc(entrysize);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_daddr &&
 	    entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) {
 		/* Starts inside the data range: count only the part within it. */
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize));
 		vm2->vm_dsize += btoc(newend - entry->start);
 	} else if (entry->start >= (vm_offset_t)vm1->vm_taddr &&
 	    entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) {
 		/* Starts inside the text range: count only the part within it. */
 		newend = MIN(entry->end,
 		    (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize));
 		vm2->vm_tsize += btoc(newend - entry->start);
 	}
 }
 
 /*
  * vmspace_fork:
  * Create a new process vmspace structure and vm_map
  * based on those of an existing process.  The new map
  * is based on the old map, according to the inheritance
  * values on the regions in that map.
  *
  * XXX It might be worth coalescing the entries added to the new vmspace.
  *
  * The source map must not be locked.
  *
  * Returns the new vmspace, or NULL when vmspace_alloc() fails; the
  * source map is unlocked on both paths.
  */
 struct vmspace *
 vmspace_fork(struct vmspace *vm1)
 {
 	struct vmspace *vm2;
 	vm_map_t old_map = &vm1->vm_map;
 	vm_map_t new_map;
 	vm_map_entry_t old_entry;
 	vm_map_entry_t new_entry;
 	vm_object_t object;
 
 	vm_map_lock(old_map);
 
 	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
+	if (vm2 == NULL)
+		goto unlock_and_return;
 	vm2->vm_taddr = vm1->vm_taddr;
 	vm2->vm_daddr = vm1->vm_daddr;
 	vm2->vm_maxsaddr = vm1->vm_maxsaddr;
 	new_map = &vm2->vm_map;	/* XXX */
 	new_map->timestamp = 1;
 
 	old_entry = old_map->header.next;
 
 	/* Visit every entry of the old map in list order. */
 	while (old_entry != &old_map->header) {
 		if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 			panic("vm_map_fork: encountered a submap");
 
 		switch (old_entry->inheritance) {
 		case VM_INHERIT_NONE:
 			/* Not inherited: the new map gets no entry for it. */
 			break;
 
 		case VM_INHERIT_SHARE:
 			/*
 			 * Clone the entry, creating the shared object if necessary.
 			 */
 			object = old_entry->object.vm_object;
 			if (object == NULL) {
 				object = vm_object_allocate(OBJT_DEFAULT,
 					atop(old_entry->end - old_entry->start));
 				old_entry->object.vm_object = object;
 				old_entry->offset = 0;
 			}
 
 			/*
 			 * Add the reference before calling vm_object_shadow
 			 * to ensure that a shadow object is created.
 			 */
 			vm_object_reference(object);
 			if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 				vm_object_shadow(&old_entry->object.vm_object,
 					&old_entry->offset,
 					atop(old_entry->end - old_entry->start));
 				old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 				/* Transfer the second reference too. */
 				vm_object_reference(
 				    old_entry->object.vm_object);
 				vm_object_deallocate(object);
 				object = old_entry->object.vm_object;
 			}
 			VM_OBJECT_LOCK(object);
 			vm_object_clear_flag(object, OBJ_ONEMAPPING);
 			VM_OBJECT_UNLOCK(object);
 
 			/*
 			 * Clone the entry, referencing the shared object.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			/* Wiring is not carried over to the new entry. */
 			new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 			new_entry->wired_count = 0;
 
 			/*
 			 * Insert the entry into the new map -- we know we're
 			 * inserting at the end of the new map.
 			 */
 			vm_map_entry_link(new_map, new_map->header.prev,
 			    new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 
 			/*
 			 * Update the physical map
 			 */
 			pmap_copy(new_map->pmap, old_map->pmap,
 			    new_entry->start,
 			    (old_entry->end - old_entry->start),
 			    old_entry->start);
 			break;
 
 		case VM_INHERIT_COPY:
 			/*
 			 * Clone the entry and link into the map.
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
 			new_entry->eflags &= ~MAP_ENTRY_USER_WIRED;
 			new_entry->wired_count = 0;
 			/* The object pointer is filled in by vm_map_copy_entry(). */
 			new_entry->object.vm_object = NULL;
 			vm_map_entry_link(new_map, new_map->header.prev,
 			    new_entry);
 			vmspace_map_entry_forked(vm1, vm2, new_entry);
 			vm_map_copy_entry(old_map, new_map, old_entry,
 			    new_entry);
 			break;
 		}
 		old_entry = old_entry->next;
 	}
-
+unlock_and_return:
 	vm_map_unlock(old_map);
 
 	return (vm2);
 }
 
 /*
  *	vm_map_stack:
  *
  *	Map the initial part of a stack whose reserved range is
  *	[addrbos, addrbos + max_ssize).  Only init_ssize bytes (the
  *	smaller of max_ssize and sgrowsiz) are mapped now; the rest is
  *	recorded in the new entry's avail_ssize.  The grow direction is
  *	passed in the MAP_STACK_GROWS_* bits of "cow".
  *
  *	Returns KERN_NO_SPACE if addrbos is outside the map, is already
  *	mapped, the RLIMIT_VMEM limit would be exceeded, or the next
  *	entry starts inside the reserved range; otherwise returns the
  *	result of vm_map_insert().
  */
 int
 vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize,
     vm_prot_t prot, vm_prot_t max, int cow)
 {
 	vm_map_entry_t new_entry, prev_entry;
 	vm_offset_t bot, top;
 	vm_size_t init_ssize;
 	int orient, rv;
 	rlim_t vmemlim;
 
 	/*
 	 * The stack orientation is piggybacked with the cow argument.
 	 * Extract it into orient and mask the cow argument so that we
 	 * don't pass it around further.
 	 * NOTE: We explicitly allow bi-directional stacks.
 	 */
 	orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP);
 	cow &= ~orient;
 	KASSERT(orient != 0, ("No stack grow direction"));
 
 	if (addrbos < vm_map_min(map) || addrbos > map->max_offset)
 		return (KERN_NO_SPACE);
 
 	init_ssize = (max_ssize < sgrowsiz) ? max_ssize : sgrowsiz;
 
 	/* The resource limit is read under the process lock. */
 	PROC_LOCK(curthread->td_proc);
 	vmemlim = lim_cur(curthread->td_proc, RLIMIT_VMEM);
 	PROC_UNLOCK(curthread->td_proc);
 
 	vm_map_lock(map);
 
 	/* If addr is already mapped, no go */
 	if (vm_map_lookup_entry(map, addrbos, &prev_entry)) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + init_ssize > vmemlim) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * If we can't accommodate max_ssize in the current mapping, no go.
 	 * However, we need to be aware that subsequent user mappings might
 	 * map into the space we have reserved for stack, and currently this
 	 * space is not protected.
 	 *
 	 * Hopefully we will at least detect this condition when we try to
 	 * grow the stack.
 	 */
 	if ((prev_entry->next != &map->header) &&
 	    (prev_entry->next->start < addrbos + max_ssize)) {
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * We initially map a stack of only init_ssize.  We will grow as
 	 * needed later.  Depending on the orientation of the stack (i.e.
 	 * the grow direction) we either map at the top of the range, the
 	 * bottom of the range or in the middle.
 	 *
 	 * Note: we would normally expect prot and max to be VM_PROT_ALL,
 	 * and cow to be 0.  Possibly we should eliminate these as input
 	 * parameters, and just pass these values here in the insert call.
 	 */
 	if (orient == MAP_STACK_GROWS_DOWN)
 		bot = addrbos + max_ssize - init_ssize;
 	else if (orient == MAP_STACK_GROWS_UP)
 		bot = addrbos;
 	else
 		bot = round_page(addrbos + max_ssize/2 - init_ssize/2);
 	top = bot + init_ssize;
 	rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow);
 
 	/* Now set the avail_ssize amount. */
 	if (rv == KERN_SUCCESS) {
 		if (prev_entry != &map->header)
 			vm_map_clip_end(map, prev_entry, bot);
 		/* The entry just inserted is expected to follow prev_entry. */
 		new_entry = prev_entry->next;
 		if (new_entry->end != top || new_entry->start != bot)
 			panic("Bad entry start/end for new stack entry");
 
 		new_entry->avail_ssize = max_ssize - init_ssize;
 		if (orient & MAP_STACK_GROWS_DOWN)
 			new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
 		if (orient & MAP_STACK_GROWS_UP)
 			new_entry->eflags |= MAP_ENTRY_GROWS_UP;
 	}
 
 	vm_map_unlock(map);
 	return (rv);
 }
 
 /* Attempts to grow a vm stack entry.  Returns KERN_SUCCESS if the
  * desired address is already mapped, or if we successfully grow
  * the stack.  Also returns KERN_SUCCESS if addr is outside the
  * stack range (this is strange, but preserves compatibility with
  * the grow function in vm_machdep.c).
  *
  * Returns KERN_NO_SPACE when the growth would exceed the entry's
  * avail_ssize, the gap to the neighbouring entry, or the
  * RLIMIT_STACK / RLIMIT_VMEM limits, and KERN_FAILURE when an
  * upward-growing entry's object cannot be coalesced.
  */
 int
 vm_map_growstack(struct proc *p, vm_offset_t addr)
 {
 	vm_map_entry_t next_entry, prev_entry;
 	vm_map_entry_t new_entry, stack_entry;
 	struct vmspace *vm = p->p_vmspace;
 	vm_map_t map = &vm->vm_map;
 	vm_offset_t end;
 	size_t grow_amount, max_grow;
 	rlim_t stacklim, vmemlim;
 	int is_procstack, rv;
 
 	/*
 	 * Everything from here on is redone whenever upgrading the read
 	 * lock to a write lock fails.
 	 */
 Retry:
 	PROC_LOCK(p);
 	stacklim = lim_cur(p, RLIMIT_STACK);
 	vmemlim = lim_cur(p, RLIMIT_VMEM);
 	PROC_UNLOCK(p);
 
 	vm_map_lock_read(map);
 
 	/* If addr is already in the entry range, no need to grow.*/
 	if (vm_map_lookup_entry(map, addr, &prev_entry)) {
 		vm_map_unlock_read(map);
 		return (KERN_SUCCESS);
 	}
 
 	next_entry = prev_entry->next;
 	if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) {
 		/*
 		 * This entry does not grow upwards. Since the address lies
 		 * beyond this entry, the next entry (if one exists) has to
 		 * be a downward growable entry. The entry list header is
 		 * never a growable entry, so it suffices to check the flags.
 		 */
 		if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) {
 			vm_map_unlock_read(map);
 			return (KERN_SUCCESS);
 		}
 		stack_entry = next_entry;
 	} else {
 		/*
 		 * This entry grows upward. If the next entry does not at
 		 * least grow downwards, this is the entry we need to grow.
 		 * otherwise we have two possible choices and we have to
 		 * select one.
 		 */
 		if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) {
 			/*
 			 * We have two choices; grow the entry closest to
 			 * the address to minimize the amount of growth.
 			 */
 			if (addr - prev_entry->end <= next_entry->start - addr)
 				stack_entry = prev_entry;
 			else
 				stack_entry = next_entry;
 		} else
 			stack_entry = prev_entry;
 	}
 
 	/*
 	 * Compute the limit address ("end"), the page-rounded amount
 	 * needed to reach addr, and the room available up to the
 	 * neighbouring entry.
 	 */
 	if (stack_entry == next_entry) {
 		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo"));
 		KASSERT(addr < stack_entry->start, ("foo"));
 		end = (prev_entry != &map->header) ? prev_entry->end :
 		    stack_entry->start - stack_entry->avail_ssize;
 		grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE);
 		max_grow = stack_entry->start - end;
 	} else {
 		KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo"));
 		KASSERT(addr >= stack_entry->end, ("foo"));
 		end = (next_entry != &map->header) ? next_entry->start :
 		    stack_entry->end + stack_entry->avail_ssize;
 		grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE);
 		max_grow = end - stack_entry->end;
 	}
 
 	if (grow_amount > stack_entry->avail_ssize) {
 		vm_map_unlock_read(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/*
 	 * If there is no longer enough space between the entries nogo, and
 	 * adjust the available space.  Note: this  should only happen if the
 	 * user has mapped into the stack area after the stack was created,
 	 * and is probably an error.
 	 *
 	 * This also effectively destroys any guard page the user might have
 	 * intended by limiting the stack size.
 	 */
 	if (grow_amount > max_grow) {
 		/* A write lock is taken before modifying the entry. */
 		if (vm_map_lock_upgrade(map))
 			goto Retry;
 
 		stack_entry->avail_ssize = max_grow;
 
 		vm_map_unlock(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/* Addresses at or above vm_maxsaddr count as the main process stack. */
 	is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0;
 
 	/*
 	 * If this is the main process stack, see if we're over the stack
 	 * limit.
 	 */
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		vm_map_unlock_read(map);
 		return (KERN_NO_SPACE);
 	}
 
 	/* Round up the grow amount modulo SGROWSIZ */
 	grow_amount = roundup (grow_amount, sgrowsiz);
 	/* Then cap it by the remaining room and by the stack limit. */
 	if (grow_amount > stack_entry->avail_ssize)
 		grow_amount = stack_entry->avail_ssize;
 	if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) {
 		grow_amount = stacklim - ctob(vm->vm_ssize);
 	}
 
 	/* If we would blow our VMEM resource limit, no go */
 	if (map->size + grow_amount > vmemlim) {
 		vm_map_unlock_read(map);
 		return (KERN_NO_SPACE);
 	}
 
 	if (vm_map_lock_upgrade(map))
 		goto Retry;
 
 	if (stack_entry == next_entry) {
 		/*
 		 * Growing downward.
 		 */
 		/* Get the preliminary new entry start value */
 		addr = stack_entry->start - grow_amount;
 
 		/*
 		 * If this puts us into the previous entry, cut back our
 		 * growth to the available space. Also, see the note above.
 		 */
 		if (addr < end) {
 			stack_entry->avail_ssize = max_grow;
 			addr = end;
 		}
 
 		rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start,
 		    p->p_sysent->sv_stackprot, VM_PROT_ALL, 0);
 
 		/* Adjust the available stack space by the amount we grew. */
 		if (rv == KERN_SUCCESS) {
 			if (prev_entry != &map->header)
 				vm_map_clip_end(map, prev_entry, addr);
 			new_entry = prev_entry->next;
 			KASSERT(new_entry == stack_entry->prev, ("foo"));
 			KASSERT(new_entry->end == stack_entry->start, ("foo"));
 			KASSERT(new_entry->start == addr, ("foo"));
 			grow_amount = new_entry->end - new_entry->start;
 			new_entry->avail_ssize = stack_entry->avail_ssize -
 			    grow_amount;
 			/* The new, lower entry becomes the growable one. */
 			stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN;
 			new_entry->eflags |= MAP_ENTRY_GROWS_DOWN;
 		}
 	} else {
 		/*
 		 * Growing upward.
 		 */
 		addr = stack_entry->end + grow_amount;
 
 		/*
 		 * If this puts us into the next entry, cut back our growth
 		 * to the available space. Also, see the note above.
 		 */
 		if (addr > end) {
 			stack_entry->avail_ssize = end - stack_entry->end;
 			addr = end;
 		}
 
 		grow_amount = addr - stack_entry->end;
 
 		/* Grow the underlying object if applicable. */
 		if (stack_entry->object.vm_object == NULL ||
 		    vm_object_coalesce(stack_entry->object.vm_object,
 		    stack_entry->offset,
 		    (vm_size_t)(stack_entry->end - stack_entry->start),
 		    (vm_size_t)grow_amount)) {
 			map->size += (addr - stack_entry->end);
 			/* Update the current entry. */
 			stack_entry->end = addr;
 			stack_entry->avail_ssize -= grow_amount;
 			vm_map_entry_resize_free(map, stack_entry);
 			rv = KERN_SUCCESS;
 
 			if (next_entry != &map->header)
 				vm_map_clip_start(map, next_entry, addr);
 		} else
 			rv = KERN_FAILURE;
 	}
 
 	if (rv == KERN_SUCCESS && is_procstack)
 		vm->vm_ssize += btoc(grow_amount);
 
 	vm_map_unlock(map);
 
 	/*
 	 * Heed the MAP_WIREFUTURE flag if it was set for this process.
 	 *
 	 * NOTE(review): stack_entry is dereferenced below after the map
 	 * lock has been released -- confirm this is safe.
 	 */
 	if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) {
 		vm_map_wire(map,
 		    (stack_entry == next_entry) ? addr : addr - grow_amount,
 		    (stack_entry == next_entry) ? stack_entry->start : addr,
 		    (p->p_flag & P_SYSTEM)
 		    ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES
 		    : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
 	}
 
 	return (rv);
 }
 
 /*
  * Unshare the specified VM space for exec.  If other processes are
  * mapped to it, then create a new one.  The new vmspace is null.
  *
  * After this change the function returns ENOMEM when vmspace_alloc()
  * fails (the process keeps its old vmspace) and 0 on success.
  */
-void
+int
 vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	newvmspace = vmspace_alloc(minuser, maxuser);
+	if (newvmspace == NULL)
+		return (ENOMEM);
 	newvmspace->vm_swrss = oldvmspace->vm_swrss;
 	/*
 	 * This code is written like this for prototype purposes.  The
 	 * goal is to avoid running down the vmspace here, but let the
 	 * other processes that are still using the vmspace finally
 	 * run it down.  Even though there is little or no chance of blocking
 	 * here, it is a good idea to keep this form for future mods.
 	 */
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	/* Switch the pmap only when p is the calling thread's process. */
 	if (p == curthread->td_proc)		/* XXXKSE ? */
 		pmap_activate(curthread);
 	/* Drop this process's reference on the old vmspace. */
 	vmspace_free(oldvmspace);
+	return (0);
 }
 
 /*
  * Unshare the specified VM space for forcing COW.  This
  * is called by rfork, for the (RFMEM|RFPROC) == 0 case.
  *
  * After this change the function returns 0 when the vmspace has a
  * single reference (nothing to do) or was replaced by a fork of
  * itself, and ENOMEM when vmspace_fork() returns NULL.
  */
-void
+int
 vmspace_unshare(struct proc *p)
 {
 	struct vmspace *oldvmspace = p->p_vmspace;
 	struct vmspace *newvmspace;
 
 	/* Only one reference: the vmspace is not shared. */
 	if (oldvmspace->vm_refcnt == 1)
-		return;
+		return (0);
 	newvmspace = vmspace_fork(oldvmspace);
+	if (newvmspace == NULL)
+		return (ENOMEM);
 	PROC_VMSPACE_LOCK(p);
 	p->p_vmspace = newvmspace;
 	PROC_VMSPACE_UNLOCK(p);
 	/* Switch the pmap only when p is the calling thread's process. */
 	if (p == curthread->td_proc)		/* XXXKSE ? */
 		pmap_activate(curthread);
 	vmspace_free(oldvmspace);
+	return (0);
 }
 
 /*
  *	vm_map_lookup:
  *
  *	Finds the VM object, offset, and
  *	protection for a given virtual address in the
  *	specified map, assuming a page fault of the
  *	type specified.
  *
  *	Leaves the map in question locked for read; return
  *	values are guaranteed until a vm_map_lookup_done
  *	call is performed.  Note that the map argument
  *	is in/out; the returned map must be used in
  *	the call to vm_map_lookup_done.
  *
  *	A handle (out_entry) is returned for use in
  *	vm_map_lookup_done, to make that fast.
  *
  *	If a lookup is requested with "write protection"
  *	specified, the map may be changed to perform virtual
  *	copying operations, although the data referenced will
  *	remain the same.
  *
  *	Returns KERN_SUCCESS with the map read-locked, or
  *	KERN_INVALID_ADDRESS / KERN_PROTECTION_FAILURE with the
  *	map unlocked (see the RETURN macro below).
  */
 int
 vm_map_lookup(vm_map_t *var_map,		/* IN/OUT */
 	      vm_offset_t vaddr,
 	      vm_prot_t fault_typea,
 	      vm_map_entry_t *out_entry,	/* OUT */
 	      vm_object_t *object,		/* OUT */
 	      vm_pindex_t *pindex,		/* OUT */
 	      vm_prot_t *out_prot,		/* OUT */
 	      boolean_t *wired)			/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 
 RetryLookup:;
 	/*
 	 * Lookup the faulting address.
 	 */
 
 	vm_map_lock_read(map);
 	/* Error exit: release the read lock, then return the given code. */
 #define	RETURN(why) \
 		{ \
 		vm_map_unlock_read(map); \
 		return (why); \
 		}
 
 	/*
 	 * If the map has an interesting hint, try it before calling full
 	 * blown lookup routine.
 	 */
 	entry = map->root;
 	*out_entry = entry;
 	if (entry == NULL ||
 	    (vaddr < entry->start) || (vaddr >= entry->end)) {
 		/*
 		 * Entry was either not a valid hint, or the vaddr was not
 		 * contained in the entry, so do a full lookup.
 		 */
 		if (!vm_map_lookup_entry(map, vaddr, out_entry))
 			RETURN(KERN_INVALID_ADDRESS);
 
 		entry = *out_entry;
 	}
 
 	/*
 	 * Handle submaps.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		vm_map_t old_map = map;
 
 		/* Switch to the submap and restart the lookup there. */
 		*var_map = map = entry->object.sub_map;
 		vm_map_unlock_read(old_map);
 		goto RetryLookup;
 	}
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 * Note the special case for MAP_ENTRY_COW
 	 * pages with an override.  This is to implement a forced
 	 * COW for debuggers.
 	 */
 	if (fault_type & VM_PROT_OVERRIDE_WRITE)
 		prot = entry->max_protection;
 	else
 		prot = entry->protection;
 	fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
 	if ((fault_type & prot) != fault_type) {
 			RETURN(KERN_PROTECTION_FAILURE);
 	}
 	/*
 	 * A write fault on a user-wired COW entry is refused unless the
 	 * caller asked for the write override.
 	 */
 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
 	    (entry->eflags & MAP_ENTRY_COW) &&
 	    (fault_type & VM_PROT_WRITE) &&
 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) {
 		RETURN(KERN_PROTECTION_FAILURE);
 	}
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		prot = fault_type = entry->protection;
 
 	/*
 	 * If the entry was copy-on-write, we either ...
 	 */
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * If we want to write the page, we may as well handle that
 		 * now since we've got the map locked.
 		 *
 		 * If we don't need to write the page, we just demote the
 		 * permissions allowed.
 		 */
 		if (fault_type & VM_PROT_WRITE) {
 			/*
 			 * Make a new object, and place it in the object
 			 * chain.  Note that no new references have appeared
 			 * -- one just moved from the map to the new
 			 * object.
 			 */
 			if (vm_map_lock_upgrade(map))
 				goto RetryLookup;
 
 			vm_object_shadow(
 			    &entry->object.vm_object,
 			    &entry->offset,
 			    atop(entry->end - entry->start));
 			entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
 
 			vm_map_lock_downgrade(map);
 		} else {
 			/*
 			 * We're attempting to read a copy-on-write page --
 			 * don't allow writes.
 			 */
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * Create an object if necessary.
 	 */
 	if (entry->object.vm_object == NULL &&
 	    !map->system_map) {
 		if (vm_map_lock_upgrade(map))
 			goto RetryLookup;
 		entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT,
 		    atop(entry->end - entry->start));
 		entry->offset = 0;
 		vm_map_lock_downgrade(map);
 	}
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 
 #undef	RETURN
 }
 
 /*
  *	vm_map_lookup_locked:
  *
  *	Lookup the faulting address.  A version of vm_map_lookup that returns 
  *      KERN_FAILURE instead of blocking on map lock or memory allocation.
  *
  *	This function takes and releases no map lock itself.  It returns
  *	KERN_FAILURE for submap entries, for write faults on NEEDS_COPY
  *	entries, and when an object would have to be created.
  */
 int
 vm_map_lookup_locked(vm_map_t *var_map,		/* IN/OUT */
 		     vm_offset_t vaddr,
 		     vm_prot_t fault_typea,
 		     vm_map_entry_t *out_entry,	/* OUT */
 		     vm_object_t *object,	/* OUT */
 		     vm_pindex_t *pindex,	/* OUT */
 		     vm_prot_t *out_prot,	/* OUT */
 		     boolean_t *wired)		/* OUT */
 {
 	vm_map_entry_t entry;
 	vm_map_t map = *var_map;
 	vm_prot_t prot;
 	vm_prot_t fault_type = fault_typea;
 
 	/*
 	 * If the map has an interesting hint, try it before calling full
 	 * blown lookup routine.
 	 */
 	entry = map->root;
 	*out_entry = entry;
 	if (entry == NULL ||
 	    (vaddr < entry->start) || (vaddr >= entry->end)) {
 		/*
 		 * Entry was either not a valid hint, or the vaddr was not
 		 * contained in the entry, so do a full lookup.
 		 */
 		if (!vm_map_lookup_entry(map, vaddr, out_entry))
 			return (KERN_INVALID_ADDRESS);
 
 		entry = *out_entry;
 	}
 
 	/*
 	 * Fail if the entry refers to a submap.
 	 */
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
 		return (KERN_FAILURE);
 
 	/*
 	 * Check whether this task is allowed to have this page.
 	 * Note the special case for MAP_ENTRY_COW
 	 * pages with an override.  This is to implement a forced
 	 * COW for debuggers.
 	 */
 	if (fault_type & VM_PROT_OVERRIDE_WRITE)
 		prot = entry->max_protection;
 	else
 		prot = entry->protection;
 	fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE;
 	if ((fault_type & prot) != fault_type)
 		return (KERN_PROTECTION_FAILURE);
 	/*
 	 * A write fault on a user-wired COW entry is refused unless the
 	 * caller asked for the write override.
 	 */
 	if ((entry->eflags & MAP_ENTRY_USER_WIRED) &&
 	    (entry->eflags & MAP_ENTRY_COW) &&
 	    (fault_type & VM_PROT_WRITE) &&
 	    (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0)
 		return (KERN_PROTECTION_FAILURE);
 
 	/*
 	 * If this page is not pageable, we have to get it for all possible
 	 * accesses.
 	 */
 	*wired = (entry->wired_count != 0);
 	if (*wired)
 		prot = fault_type = entry->protection;
 
 	if (entry->eflags & MAP_ENTRY_NEEDS_COPY) {
 		/*
 		 * Fail if the entry was copy-on-write for a write fault.
 		 */
 		if (fault_type & VM_PROT_WRITE)
 			return (KERN_FAILURE);
 		/*
 		 * We're attempting to read a copy-on-write page --
 		 * don't allow writes.
 		 */
 		prot &= ~VM_PROT_WRITE;
 	}
 
 	/*
 	 * Fail if an object should be created.
 	 */
 	if (entry->object.vm_object == NULL && !map->system_map)
 		return (KERN_FAILURE);
 
 	/*
 	 * Return the object/offset from this entry.  If the entry was
 	 * copy-on-write or empty, it has been fixed up.
 	 */
 	*pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset);
 	*object = entry->object.vm_object;
 
 	*out_prot = prot;
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_map_lookup_done:
  *
  *	Releases locks acquired by a vm_map_lookup
  *	(according to the handle returned by that lookup).
  *
  *	The "entry" argument is not used by this implementation; only
  *	the map's read lock is dropped.
  */
 void
 vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry)
 {
 	/*
 	 * Unlock the main-level map
 	 */
 	vm_map_unlock_read(map);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 /*
  *	vm_map_print:	[ debug ]
  *
  *	DDB "show map" command.  Prints a summary line for the map at
  *	"addr" and, in full mode (have_addr set) or at indent level 0,
  *	one line per entry, followed by the entry's submap or object
  *	when it differs from the previous entry's.
  */
 DB_SHOW_COMMAND(map, vm_map_print)
 {
 	/*
 	 * Line counter kept across the recursive calls below; it is only
 	 * updated, never read, within this function.
 	 */
 	static int nlines;
 	/* XXX convert args. */
 	vm_map_t map = (vm_map_t)addr;
 	boolean_t full = have_addr;
 
 	vm_map_entry_t entry;
 
 	db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n",
 	    (void *)map,
 	    (void *)map->pmap, map->nentries, map->timestamp);
 	nlines++;
 
 	/* Nested, non-full invocations print the summary line only. */
 	if (!full && db_indent)
 		return;
 
 	db_indent += 2;
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		db_iprintf("map entry %p: start=%p, end=%p\n",
 		    (void *)entry, (void *)entry->start, (void *)entry->end);
 		nlines++;
 		{
 			/*
 			 * Indexed by entry->inheritance.
 			 * NOTE(review): the index is not range-checked
 			 * against the four names -- confirm that values
 			 * are always in 0..3.
 			 */
 			static char *inheritance_name[4] =
 			{"share", "copy", "none", "donate_copy"};
 
 			db_iprintf(" prot=%x/%x/%s",
 			    entry->protection,
 			    entry->max_protection,
 			    inheritance_name[(int)(unsigned char)entry->inheritance]);
 			if (entry->wired_count != 0)
 				db_printf(", wired");
 		}
 		if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 			db_printf(", share=%p, offset=0x%jx\n",
 			    (void *)entry->object.sub_map,
 			    (uintmax_t)entry->offset);
 			nlines++;
 			/* Recurse only when the submap differs from the previous entry's. */
 			if ((entry->prev == &map->header) ||
 			    (entry->prev->object.sub_map !=
 				entry->object.sub_map)) {
 				db_indent += 2;
 				vm_map_print((db_expr_t)(intptr_t)
 					     entry->object.sub_map,
 					     full, 0, (char *)0);
 				db_indent -= 2;
 			}
 		} else {
 			db_printf(", object=%p, offset=0x%jx",
 			    (void *)entry->object.vm_object,
 			    (uintmax_t)entry->offset);
 			if (entry->eflags & MAP_ENTRY_COW)
 				db_printf(", copy (%s)",
 				    (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done");
 			db_printf("\n");
 			nlines++;
 
 			/* Print the object only when it differs from the previous entry's. */
 			if ((entry->prev == &map->header) ||
 			    (entry->prev->object.vm_object !=
 				entry->object.vm_object)) {
 				db_indent += 2;
 				vm_object_print((db_expr_t)(intptr_t)
 						entry->object.vm_object,
 						full, 0, (char *)0);
 				nlines += 4;
 				db_indent -= 2;
 			}
 		}
 	}
 	db_indent -= 2;
 	/* Back at the outermost invocation: reset the counter. */
 	if (db_indent == 0)
 		nlines = 0;
 }
 
 
 /*
  * DDB "show procvm" command: prints the process pointer, its vmspace,
  * map and pmap, then dumps the map via vm_map_print() with have_addr
  * set to 1 (full output).  Uses the process at "addr" when an address
  * was given, otherwise curproc.
  */
 DB_SHOW_COMMAND(procvm, procvm)
 {
 	struct proc *p;
 
 	if (have_addr) {
 		p = (struct proc *) addr;
 	} else {
 		p = curproc;
 	}
 
 	db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n",
 	    (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map,
 	    (void *)vmspace_pmap(p->p_vmspace));
 
 	vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL);
 }
 
 #endif /* DDB */