Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c	(revision 169666)
+++ head/sys/amd64/amd64/machdep.c	(revision 169667)
@@ -1,1907 +1,1907 @@
/*-
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include
__FBSDID("$FreeBSD$");

#include "opt_atalk.h"
#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_msgbuf.h"
#include "opt_perfmon.h"

#include #include #include #include #include #include #include #include
#include #include #include #include #include #include #include #include
#include #include #include #include #include #include #include #include
#include #include #include #include #include #include #include #include
#include #include #include #include #include #include #include #include

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #include #ifdef SMP #include #endif #ifdef DEV_ATPIC #include #else #include #endif #include #include /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern u_int64_t hammer_time(u_int64_t, u_int64_t); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void identify_cpu(void); extern void panicifcpuunsupported(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup(void *); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel, _ucode32sel; int cold = 1; long Maxmem = 0; long realmem = 0; #define PHYSMAP_SIZE (2 * 30) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; struct region_descriptor r_gdt, r_idt; struct pcpu __pcpu[MAXCPU]; struct mtx icu_lock; struct mem_range_softc mem_range_softc; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("usable memory = %ju (%ju MB)\n", ptoa((uintmax_t)physmem), ptoa((uintmax_t)physmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - ptoa((uintmax_t)cnt.v_free_count), - ptoa((uintmax_t)cnt.v_free_count) / 1048576); + ptoa((uintmax_t)VMCNT_GET(free_count)), + ptoa((uintmax_t)VMCNT_GET(free_count)) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_rsp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? 
SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128; /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned long)sp & ~0xFul); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ regs->tf_rdi = sig; /* arg 1 in %rdi */ regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */ if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ } else { /* Old FreeBSD-style arguments. */ regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */ regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */ sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_rsp = (long)sfp; regs->tf_rip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_rflags &= ~PSL_T; regs->tf_cs = _ucodesel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; const ucontext_t *ucp; long rflags; int cs, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; rflags = ucp->uc_mcontext.mc_rflags; /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_rflags for faults. Debuggers * should sometimes set it there too. tf_rflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { printf("sigreturn: rflags = 0x%lx\n", rflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. 
Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_rip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs)); PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); td->td_pcb->pcb_flags |= PCB_FULLCTX; return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { register_t reg; uint64_t tsc1, tsc2; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); /* If we're booting, trust the rate calibrated moments ago. */ if (cold) { *rate = tsc_freq; return (0); } #ifdef SMP /* Schedule ourselves on the indicated cpu. */ mtx_lock_spin(&sched_lock); sched_bind(curthread, cpu_id); mtx_unlock_spin(&sched_lock); #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); #ifdef SMP mtx_lock_spin(&sched_lock); sched_unbind(curthread); mtx_unlock_spin(&sched_lock); #endif /* * Calculate the difference in readings, convert to Mhz, and * subtract 0.5% of the total. Empirical testing has shown that * overhead in DELAY() works out to approximately this value. */ tsc2 -= tsc1; *rate = tsc2 * 1000 - tsc2 * 5; return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. In the SMP case we default to * off because a halted cpu will not currently pick up a new thread in the * run queue until the next timer tick. If turned on this will result in * approximately a 4.2% loss in real time performance in buildworld tests * (but improves user and sys times oddly enough), and saves approximately * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). * * XXX we need to have a cpu mask of idle cpus and generate an IPI or * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. * Then we can have our cake and eat it too. * * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ static int cpu_idle_hlt = 1; TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt); SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) { /* * we must absolutely guarentee that hlt is the * absolute next instruction after sti or we * introduce a timing window. */ __asm __volatile("sti; hlt"); } /* * Note that we have to be careful here to avoid a race between checking * sched_runnable() and actually halting. 
If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. */ void cpu_idle(void) { #ifdef SMP if (mp_grab_cpu_hlt()) return; #endif if (cpu_idle_hlt) { disable_intr(); if (sched_runnable()) enable_intr(); else (*cpu_idle_hook)(); } } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; /* * Clear registers on exec */ void exec_setregs(td, entry, stack, ps_strings) struct thread *td; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; critical_enter(); wrmsr(MSR_FSBASE, 0); wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcb->pcb_fsbase = 0; pcb->pcb_gsbase = 0; critical_exit(); load_ds(_udatasel); load_es(_udatasel); load_fs(_udatasel); load_gs(_udatasel); pcb->pcb_ds = _udatasel; pcb->pcb_es = _udatasel; pcb->pcb_fs = _udatasel; pcb->pcb_gs = _udatasel; bzero((char *)regs, sizeof(struct trapframe)); regs->tf_rip = entry; regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; regs->tf_rdi = stack; /* argv */ regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_cs = _ucodesel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); } void cpu_setregs(void) { register_t cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the * BSP. See the comments there about why we set them. 
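 * CR0_WP makes supervisor-mode writes respect page-level write protection,
 * CR0_AM permits EFLAGS.AC-based alignment checking, CR0_NE selects native
 * FPU error reporting via #MF, and CR0_MP together with CR0_TS makes FPU
 * use trap with #NM so the FPU context can be switched lazily.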
*/ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); } /* * Initialize amd64 and configure to run kernel */ /* * Initialize segments & interrupt table */ struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ static char dblfault_stack[PAGE_SIZE] __aligned(16); struct amd64tss common_tss[MAXCPU]; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, /* long */ 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 1, /* long */ 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 1, /* long */ 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE32_SEL 3 32 bit Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUDATA_SEL 4 32/64 bit Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE_SEL 5 64 bit Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 1, /* long */ 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct amd64tss)-1,/* length - all address space */ SDT_SYSTSS, /* segment type */ SEL_KPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Actually, the TSS is a system descriptor which is double size */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, /* long */ 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUGS32_SEL 8 32 bit GS Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, /* long */ 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, 
dpl, ist) int idx; inthand_t *func; int typ; int dpl; int ist; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (uintptr_t)func; ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL); ip->gd_ist = ist; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((uintptr_t)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(fast_syscall), IDTVEC(fast_syscall32); void sdtossd(sd, ssd) struct user_segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_long = sd->sd_long; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void ssdtosd(ssd, sd) struct soft_segment_descriptor *ssd; struct user_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_long = ssd->ssd_long; sd->sd_def32 = ssd->ssd_def32; sd->sd_gran = ssd->ssd_gran; } void ssdtosyssd(ssd, sd) struct soft_segment_descriptor *ssd; struct system_segment_descriptor *sd; { sd->sd_lobase = (ssd->ssd_base) & 0xffffff; sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful; sd->sd_lolimit = (ssd->ssd_limit) & 0xffff; sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf; sd->sd_type = ssd->ssd_type; sd->sd_dpl = ssd->ssd_dpl; sd->sd_p = ssd->ssd_p; sd->sd_gran = ssd->ssd_gran; } #if !defined(DEV_ATPIC) && defined(DEV_ISA) #include u_int isa_irq_pending(void) { return (0); } #endif u_int basemem; /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(caddr_t kmdp, u_int64_t first) { int i, off, physmap_idx, pa_indx, da_indx; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; u_long physmem_tunable; pt_entry_t *pte; struct bios_smap *smapbase, *smap, *smapend; u_int32_t smapsize; quad_t dcons_addr, dcons_size; bzero(physmap, sizeof(physmap)); basemem = 0; physmap_idx = 0; /* * get memory map from INT 15:E820, kindly supplied by the loader. * * subr_module.c says: * "Consumer may safely assume that size value precedes data." * ie: an int32_t immediately precedes smap. 
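 * Each bios_smap entry mirrors a BIOS INT 15h/E820 descriptor: a 64-bit
 * base address, a 64-bit length and a 32-bit type; type 0x01 marks RAM
 * the OS may use, and entries of any other type are skipped below.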
*/ smapbase = (struct bios_smap *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_SMAP); if (smapbase == NULL) panic("No BIOS smap info from loader!"); smapsize = *((u_int32_t *)smapbase - 1); smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize); for (smap = smapbase; smap < smapend; smap++) { if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016lx len=%016lx\n", smap->type, smap->base, smap->length); if (smap->type != 0x01) continue; if (smap->length == 0) continue; for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-monotonic memory region, ignoring second region\n"); continue; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; continue; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; } /* * Find the 'base memory' segment for SMP */ basemem = 0; for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } if (basemem == 0) panic("BIOS smap did not include a basemem segment!"); #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * Don't allow MAXMEM or hw.physmem to extend the amount of memory * in the system. */ if (Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(&first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. 
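 * Pages from 1MB up to 'first' hold the kernel image and the early
 * bootstrap allocations; they are still recorded in dump_avail so crash
 * dumps cover them, but they are kept out of phys_avail.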
*/ if (pa >= 0x100000 && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } u_int64_t hammer_time(u_int64_t modulep, u_int64_t physfree) { caddr_t kmdp; int gsel_tss, x; struct pcpu *pc; u_int64_t msr; char *env; thread0.td_kstack = physfree + KERNBASE; bzero((void *)thread0.td_kstack, KSTACK_PAGES * PAGE_SIZE); physfree += KSTACK_PAGES * PAGE_SIZE; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. 
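 * For now proc_linkup() simply ties thread0 into proc0 so the two
 * bootstrap structures reference each other before anything else uses
 * them.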
*/ proc_linkup(&proc0, &thread0); preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE); preload_bootstrap_relocate(KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE; #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif /* Init basic tunables, hz etc */ init_param1(); /* * make gdt memory segments */ gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0]; for (x = 0; x < NGDT; x++) { if (x != GPROC0_SEL && x != (GPROC0_SEL + 1)) ssdtosd(&gdt_segs[x], &gdt[x]); } ssdtosyssd(&gdt_segs[GPROC0_SEL], (struct system_segment_descriptor *)&gdt[GPROC0_SEL]); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (long) gdt; lgdt(&r_gdt); pc = &__pcpu[0]; wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); PCPU_SET(curtid, thread0.td_tid); PCPU_SET(tssp, &common_tss[0]); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. */ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; lidt(&r_idt); /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. */ cninit(); #ifdef DEV_ISA #ifdef DEV_ATPIC elcr_probe(); atpic_startup(); #else /* Reset and mask the atpics and leave them shut down. */ atpic_reset(); /* * Point the ICU spurious interrupt vectors at the APIC spurious * interrupt handler. 
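 * The master 8259 reports a spurious interrupt as IRQ 7 and the slave
 * as IRQ 15, hence the two vectors redirected below.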
*/ setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0); #endif #else #error "have you forgotten the isa device?"; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif identify_cpu(); /* Final stage of CPU initialization */ initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss[0].tss_rsp0 = thread0.td_kstack + \ KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb); /* Ensure the stack is aligned to 16 bytes */ common_tss[0].tss_rsp0 &= ~0xFul; PCPU_SET(rsp0, common_tss[0].tss_rsp0); /* doublefault stack space, runs on ist1 */ common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; /* Set the IO permission bitmap (empty due to tss seg limit) */ common_tss[0].tss_iobase = sizeof(struct amd64tss); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); ltr(gsel_tss); /* Set up the fast syscall stuff */ msr = rdmsr(MSR_EFER) | EFER_SCE; wrmsr(MSR_EFER, msr); wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall)); wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32)); msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) | ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48); wrmsr(MSR_STAR, msr); wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D); getmemsize(kmdp, physfree); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, MSGBUF_SIZE); fpuinit(); /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; /* XXXKSE */ thread0.td_pcb->pcb_cr3 = KPML4phys; thread0.td_frame = &proc0_tf; env = getenv("kernelname"); if (env != NULL) strlcpy(kernelname, env, sizeof(kernelname)); /* Location of kernel stack for locore */ return ((u_int64_t)thread0.td_pcb); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_flags = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_flags); } /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_r12 = tf->tf_r12; pcb->pcb_r13 = tf->tf_r13; pcb->pcb_r14 = tf->tf_r14; pcb->pcb_r15 = tf->tf_r15; pcb->pcb_rbp = tf->tf_rbp; pcb->pcb_rbx = tf->tf_rbx; pcb->pcb_rip = tf->tf_rip; pcb->pcb_rsp = (ISPL(tf->tf_cs)) ? 
tf->tf_rsp : (long)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_rip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_rflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_rflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; tp = td->td_frame; regs->r_r15 = tp->tf_r15; regs->r_r14 = tp->tf_r14; regs->r_r13 = tp->tf_r13; regs->r_r12 = tp->tf_r12; regs->r_r11 = tp->tf_r11; regs->r_r10 = tp->tf_r10; regs->r_r9 = tp->tf_r9; regs->r_r8 = tp->tf_r8; regs->r_rdi = tp->tf_rdi; regs->r_rsi = tp->tf_rsi; regs->r_rbp = tp->tf_rbp; regs->r_rbx = tp->tf_rbx; regs->r_rdx = tp->tf_rdx; regs->r_rcx = tp->tf_rcx; regs->r_rax = tp->tf_rax; regs->r_rip = tp->tf_rip; regs->r_cs = tp->tf_cs; regs->r_rflags = tp->tf_rflags; regs->r_rsp = tp->tf_rsp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tp; register_t rflags; tp = td->td_frame; rflags = regs->r_rflags & 0xffffffff; if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_r15 = regs->r_r15; tp->tf_r14 = regs->r_r14; tp->tf_r13 = regs->r_r13; tp->tf_r12 = regs->r_r12; tp->tf_r11 = regs->r_r11; tp->tf_r10 = regs->r_r10; tp->tf_r9 = regs->r_r9; tp->tf_r8 = regs->r_r8; tp->tf_rdi = regs->r_rdi; tp->tf_rsi = regs->r_rsi; tp->tf_rbp = regs->r_rbp; tp->tf_rbx = regs->r_rbx; tp->tf_rdx = regs->r_rdx; tp->tf_rcx = regs->r_rcx; tp->tf_rax = regs->r_rax; tp->tf_rip = regs->r_rip; tp->tf_cs = regs->r_cs; tp->tf_rflags = rflags; tp->tf_rsp = regs->r_rsp; tp->tf_ss = regs->r_ss; td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } /* XXX check all this stuff! */ /* externalize from sv_xmm */ static void fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs) { struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* pcb -> fpregs */ bzero(fpregs, sizeof(*fpregs)); /* FPU control/status */ penv_fpreg->en_cw = penv_xmm->en_cw; penv_fpreg->en_sw = penv_xmm->en_sw; penv_fpreg->en_tw = penv_xmm->en_tw; penv_fpreg->en_opcode = penv_xmm->en_opcode; penv_fpreg->en_rip = penv_xmm->en_rip; penv_fpreg->en_rdp = penv_xmm->en_rdp; penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr; penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16); } /* internalize from fpregs into sv_xmm */ static void set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm) { struct envxmm *penv_xmm = &sv_xmm->sv_env; struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env; int i; /* fpregs -> pcb */ /* FPU control/status */ penv_xmm->en_cw = penv_fpreg->en_cw; penv_xmm->en_sw = penv_fpreg->en_sw; penv_xmm->en_tw = penv_fpreg->en_tw; penv_xmm->en_opcode = penv_fpreg->en_opcode; penv_xmm->en_rip = penv_fpreg->en_rip; penv_xmm->en_rdp = penv_fpreg->en_rdp; penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr; penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask; /* FPU registers */ for (i = 0; i < 8; ++i) bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10); /* SSE registers */ for (i = 0; i < 16; ++i) bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16); } /* externalize from td->pcb */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { 
fill_fpregs_xmm(&td->td_pcb->pcb_save, fpregs); return (0); } /* internalize to td->pcb */ int set_fpregs(struct thread *td, struct fpreg *fpregs) { set_fpregs_xmm(fpregs, &td->td_pcb->pcb_save); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_rsp); PROC_UNLOCK(curthread->td_proc); mcp->mc_r15 = tp->tf_r15; mcp->mc_r14 = tp->tf_r14; mcp->mc_r13 = tp->tf_r13; mcp->mc_r12 = tp->tf_r12; mcp->mc_r11 = tp->tf_r11; mcp->mc_r10 = tp->tf_r10; mcp->mc_r9 = tp->tf_r9; mcp->mc_r8 = tp->tf_r8; mcp->mc_rdi = tp->tf_rdi; mcp->mc_rsi = tp->tf_rsi; mcp->mc_rbp = tp->tf_rbp; mcp->mc_rbx = tp->tf_rbx; mcp->mc_rcx = tp->tf_rcx; mcp->mc_rflags = tp->tf_rflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_rax = 0; mcp->mc_rdx = 0; mcp->mc_rflags &= ~PSL_C; } else { mcp->mc_rax = tp->tf_rax; mcp->mc_rdx = tp->tf_rdx; } mcp->mc_rip = tp->tf_rip; mcp->mc_cs = tp->tf_cs; mcp->mc_rsp = tp->tf_rsp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; long rflags; int ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); rflags = (mcp->mc_rflags & PSL_USERCHANGE) | (tp->tf_rflags & ~PSL_USERCHANGE); ret = set_fpcontext(td, mcp); if (ret != 0) return (ret); tp->tf_r15 = mcp->mc_r15; tp->tf_r14 = mcp->mc_r14; tp->tf_r13 = mcp->mc_r13; tp->tf_r12 = mcp->mc_r12; tp->tf_r11 = mcp->mc_r11; tp->tf_r10 = mcp->mc_r10; tp->tf_r9 = mcp->mc_r9; tp->tf_r8 = mcp->mc_r8; tp->tf_rdi = mcp->mc_rdi; tp->tf_rsi = mcp->mc_rsi; tp->tf_rbp = mcp->mc_rbp; tp->tf_rbx = mcp->mc_rbx; tp->tf_rdx = mcp->mc_rdx; tp->tf_rcx = mcp->mc_rcx; tp->tf_rax = mcp->mc_rax; tp->tf_rip = mcp->mc_rip; tp->tf_rflags = rflags; tp->tf_rsp = mcp->mc_rsp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_flags |= PCB_FULLCTX; return (0); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { mcp->mc_ownedfp = fpugetregs(td, (struct savefpu *)&mcp->mc_fpstate); mcp->mc_fpformat = fpuformat(); } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { struct savefpu *fpstate; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { /* * XXX we violate the dubious requirement that fpusetregs() * be called with interrupts disabled. * XXX obsolete on trap-16 systems? */ fpstate = (struct savefpu *)&mcp->mc_fpstate; fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask; fpusetregs(td, fpstate); } else return (EINVAL); return (0); } void fpstate_drop(struct thread *td) { register_t s; s = intr_disable(); if (PCPU_GET(fpcurthread) == td) fpudrop(); /* * XXX force a full drop of the fpu. The above only drops it if we * owned it. * * XXX I don't much like fpugetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of fpugetregs()... perhaps we just * have too many layers. 
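 * Clearing PCB_FPUINITDONE here means the next FPU access traps and is
 * handed freshly initialized state rather than the stale contents.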
*/ curthread->td_pcb->pcb_flags &= ~PCB_FPUINITDONE; intr_restore(s); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[8] = 0; dbregs->dr[9] = 0; dbregs->dr[10] = 0; dbregs->dr[11] = 0; dbregs->dr[12] = 0; dbregs->dr[13] = 0; dbregs->dr[14] = 0; dbregs->dr[15] = 0; return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP or a general protection fault right here. * Upper bits of dr6 and dr7 must not be set */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (td->td_frame->tf_cs == _ucode32sel && DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8) return (EINVAL); } if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 || (dbregs->dr[7] & 0xffffffff00000000ul) != 0) return (EINVAL); pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } void reset_dbregs(void) { load_dr7(0); /* Turn off the control bits first */ load_dr0(0); load_dr1(0); load_dr2(0); load_dr3(0); load_dr6(0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. 
*/ int user_dbreg_trap(void) { u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int64_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called from the debugger. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* KDB */ Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 169666) +++ head/sys/amd64/amd64/pmap.c (revision 169667) @@ -1,3461 +1,3460 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2003 Peter Wemm * All rights reserved. * Copyright (c) 2005 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include "opt_msgbuf.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static int nkpt; static int ndmpdp; static vm_paddr_t dmaplimit; vm_offset_t kernel_vm_end; pt_entry_t pg_nx; static u_int64_t KPTphys; /* phys addr of kernel level 1 */ static u_int64_t KPDphys; /* phys addr of kernel level 2 */ u_int64_t KPDPphys; /* phys addr of kernel level 3 */ u_int64_t KPML4phys; /* phys addr of kernel level 4 */ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */ static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */ /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static int shpgperproc = PMAP_SHPGPERPROC; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1 = 0; caddr_t CADDR1 = 0; struct msgbuf *msgbufp = 0; /* * Crashdump maps. */ static caddr_t crashdumpmap; static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, pd_entry_t ptepde, vm_page_t *free); static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags); static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t* free); static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, vm_page_t *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 2MB. 
This is used to help improve performance * by using a large (2MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return newaddr; } /********************/ /* Inline functions */ /********************/ /* Return a non-clipped PD index for a given VA */ static __inline vm_pindex_t pmap_pde_pindex(vm_offset_t va) { return va >> PDRSHIFT; } /* Return various clipped indexes for a given VA */ static __inline vm_pindex_t pmap_pte_index(vm_offset_t va) { return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pde_index(vm_offset_t va) { return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pdpe_index(vm_offset_t va) { return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); } static __inline vm_pindex_t pmap_pml4e_index(vm_offset_t va) { return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); } /* Return a pointer to the PML4 slot that corresponds to a VA */ static __inline pml4_entry_t * pmap_pml4e(pmap_t pmap, vm_offset_t va) { if (!pmap) return NULL; return (&pmap->pm_pml4[pmap_pml4e_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME); return (&pdpe[pmap_pdpe_index(va)]); } /* Return a pointer to the PDP slot that corresponds to a VA */ static __inline pdp_entry_t * pmap_pdpe(pmap_t pmap, vm_offset_t va) { pml4_entry_t *pml4e; pml4e = pmap_pml4e(pmap, va); if (pml4e == NULL || (*pml4e & PG_V) == 0) return NULL; return (pmap_pml4e_to_pdpe(pml4e, va)); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va) { pd_entry_t *pde; pde = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME); return (&pde[pmap_pde_index(va)]); } /* Return a pointer to the PD slot that corresponds to a VA */ static __inline pd_entry_t * pmap_pde(pmap_t pmap, vm_offset_t va) { pdp_entry_t *pdpe; pdpe = pmap_pdpe(pmap, va); if (pdpe == NULL || (*pdpe & PG_V) == 0) return NULL; return (pmap_pdpe_to_pde(pdpe, va)); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va) { pt_entry_t *pte; pte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME); return (&pte[pmap_pte_index(va)]); } /* Return a pointer to the PT slot that corresponds to a VA */ static __inline pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } static __inline pt_entry_t * pmap_pte_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *ptepde) { pd_entry_t *pde; pde = pmap_pde(pmap, va); if (pde == NULL || (*pde & PG_V) == 0) return NULL; *ptepde = *pde; if ((*pde & PG_PS) != 0) /* compat with i386 pmap_pte() */ return ((pt_entry_t *)pde); return (pmap_pde_to_pte(pde, va)); } PMAP_INLINE pt_entry_t * vtopte(vm_offset_t va) { u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return (PTmap + ((va >> PAGE_SHIFT) & mask)); } static __inline pd_entry_t * vtopde(vm_offset_t va) { u_int64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1); return 
(PDmap + ((va >> PDRSHIFT) & mask)); } static u_int64_t allocpages(vm_paddr_t *firstaddr, int n) { u_int64_t ret; ret = *firstaddr; bzero((void *)ret, n * PAGE_SIZE); *firstaddr += n * PAGE_SIZE; return (ret); } static void create_pagetables(vm_paddr_t *firstaddr) { int i; /* Allocate pages */ KPTphys = allocpages(firstaddr, NKPT); KPML4phys = allocpages(firstaddr, 1); KPDPphys = allocpages(firstaddr, NKPML4E); KPDphys = allocpages(firstaddr, NKPDPE); ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT; if (ndmpdp < 4) /* Minimum 4GB of dirmap */ ndmpdp = 4; DMPDPphys = allocpages(firstaddr, NDMPML4E); DMPDphys = allocpages(firstaddr, ndmpdp); dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT; /* Fill in the underlying page table pages */ /* Read-only from zero to physfree */ /* XXX not fully used, underneath 2M pages */ for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) { ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT; ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G; } /* Now map the page tables at their location within PTmap */ for (i = 0; i < NKPT; i++) { ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT); ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V; } /* Map from zero to end of allocations under 2M pages */ /* This replaces some of the KPTphys entries above */ for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) { ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT; ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; } /* And connect up the PD to the PDP */ for (i = 0; i < NKPDPE; i++) { ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U; } /* Now set up the direct map space using 2MB pages */ for (i = 0; i < NPDEPG * ndmpdp; i++) { ((pd_entry_t *)DMPDphys)[i] = (vm_paddr_t)i << PDRSHIFT; ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G; } /* And the direct map space's PDP */ for (i = 0; i < ndmpdp; i++) { ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (i << PAGE_SHIFT); ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U; } /* And recursively map PML4 to itself in order to get PTmap */ ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys; ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U; /* Connect the Direct Map slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[DMPML4I] = DMPDPphys; ((pdp_entry_t *)KPML4phys)[DMPML4I] |= PG_RW | PG_V | PG_U; /* Connect the KVA slot up to the PML4 */ ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys; ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U; } /* * Bootstrap the system enough to run with virtual memory. * * On amd64 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t *firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; /* * Create an initial set of page tables to run the kernel in. */ create_pagetables(firstaddr); virtual_avail = (vm_offset_t) KERNBASE + *firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* XXX do %cr0 as well */ load_cr4(rcr4() | CR4_PGE | CR4_PSE); load_cr3(KPML4phys); /* * Initialize the kernel pmap (which is statically allocated). 
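 * Its page tables are the bootstrap set built by create_pagetables(): pm_pml4
 * points at KPML4phys through KERNBASE, and pm_active is set to -1 so the
 * kernel pmap is treated as active on every CPU and is never deactivated.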
*/ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pml4 = (pdp_entry_t *) (KERNBASE + KPML4phys); kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1 is only used for the memory test. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) virtual_avail = va; *CMAP1 = 0; invltlb(); /* Initialize the PAT MSR. */ pmap_init_pat(); } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; /* Bail if this CPU doesn't implement PAT. */ if (!(cpu_feature & CPUID_PAT)) panic("no PAT??"); #ifdef PAT_WORKS /* * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. * Program 4 and 5 as WP and WC. * Leave 6 and 7 as UC and UC-. */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | PAT_VALUE(5, PAT_WRITE_COMBINING); #else /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. Thus, just replace PAT Index 2 with WC instead * of UC-. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); #endif wrmsr(MSR_PAT, pat_msr); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. 
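 * By default pv_entry_max is shpgperproc (PMAP_SHPGPERPROC, 200) * maxproc
 * plus the number of physical pages, and pv_entry_high_water is 90% of that
 * limit.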
*/ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_max = shpgperproc * maxproc + VMCNT_GET(page_count); TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); } SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); static int pmap_pventry_proc(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0 && req->newptr) { - shpgperproc = (pv_entry_max - cnt.v_page_count) / maxproc; + shpgperproc = (pv_entry_max - VMCNT_GET(page_count)) / maxproc; pv_entry_high_water = 9 * (pv_entry_max / 10); } return (error); } SYSCTL_PROC(_vm_pmap, OID_AUTO, pv_entry_max, CTLTYPE_INT|CTLFLAG_RW, &pv_entry_max, 0, pmap_pventry_proc, "IU", "Max number of PV entries"); static int pmap_shpgperproc_proc(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (error == 0 && req->newptr) { - pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_max = shpgperproc * maxproc + VMCNT_GET(page_count); pv_entry_high_water = 9 * (pv_entry_max / 10); } return (error); } SYSCTL_PROC(_vm_pmap, OID_AUTO, shpgperproc, CTLTYPE_INT|CTLFLAG_RW, &shpgperproc, 0, pmap_shpgperproc_proc, "IU", "Page share factor per proc"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(int mode, boolean_t is_pde) { int pat_flag, pat_index, cache_bits; /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* If we don't support PAT, map extended modes to older ones. */ if (!(cpu_feature & CPUID_PAT)) { switch (mode) { case PAT_UNCACHEABLE: case PAT_WRITE_THROUGH: case PAT_WRITE_BACK: break; case PAT_UNCACHED: case PAT_WRITE_COMBINING: case PAT_WRITE_PROTECTED: mode = PAT_UNCACHEABLE; break; } } /* Map the caching mode to a PAT index. */ switch (mode) { #ifdef PAT_WORKS case PAT_UNCACHEABLE: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_UNCACHED: pat_index = 2; break; case PAT_WRITE_COMBINING: pat_index = 5; break; case PAT_WRITE_PROTECTED: pat_index = 4; break; #else case PAT_UNCACHED: case PAT_UNCACHEABLE: case PAT_WRITE_PROTECTED: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_WRITE_COMBINING: pat_index = 2; break; #endif default: panic("Unknown caching mode %d\n", mode); } /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_index & 0x4) cache_bits |= pat_flag; if (pat_index & 0x2) cache_bits |= PG_NC_PCD; if (pat_index & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. 
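 * Each routine flushes the local TLB (invlpg/invltlb) and then sends a
 * shootdown IPI, either to all CPUs when the pmap is active everywhere or
 * only to the CPUs in pm_active; sched_pin() keeps the thread from migrating
 * between the local flush and the IPI.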
*/ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } sched_unpin(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } #endif /* !SMP */ /* * Are we current address space or kernel? */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap->pm_pml4[PML4PML4I] & PG_FRAME) == (PML4pml4e[0] & PG_FRAME)); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde, *pdep; rtval = 0; PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL) { pde = *pdep; if (pde) { if ((pde & PG_PS) != 0) { rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pde_to_pte(pdep, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); } } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. 
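 * For example, a VM_PROT_WRITE request succeeds only if the PTE (or the 2MB
 * PDE) has PG_RW set; otherwise NULL is returned and no page is held.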
*/ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde, *pdep; pt_entry_t pte; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); pdep = pmap_pde(pmap, va); if (pdep != NULL && (pde = *pdep)) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { pte = *pmap_pde_to_pte(pdep, va); if ((pte & PG_V) && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } vm_paddr_t pmap_kextract(vm_offset_t va) { pd_entry_t *pde; vm_paddr_t pa; if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { pa = DMAP_TO_PHYS(va); } else { pde = vtopde(va); if (*pde & PG_PS) { pa = (*pde & PG_PS_FRAME) | (va & PDRMASK); } else { pa = *vtopte(va); pa = (pa & PG_FRAME) | (va & PAGE_MASK); } } return pa; } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G); } PMAP_INLINE void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | PG_G | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return PHYS_TO_DMAP(start); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, *pte; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { oldpte |= *pte; pte_store(pte, VM_PAGE_TO_PHYS(*ma) | PG_G | PG_RW | PG_V); pte++; ma++; } if ((oldpte & PG_V) != 0) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... 
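 *
 * Page table pages are reference counted through their wire_count; when the
 * count drops to zero the entry in the parent table is cleared, the page is
 * queued on a caller-supplied free list, and it is only released after the
 * TLB shootdown has completed.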
***************************************************/ static PMAP_INLINE void pmap_free_zero_pages(vm_page_t free) { vm_page_t m; while (free != NULL) { m = free; free = m->right; vm_page_free_zero(m); } } /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) { --m->wire_count; if (m->wire_count == 0) return _pmap_unwire_pte_hold(pmap, va, m, free); else return 0; } static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *free) { vm_offset_t pteva; /* * unmap the page table page */ if (m->pindex >= (NUPDE + NUPDPE)) { /* PDP page */ pml4_entry_t *pml4; pml4 = pmap_pml4e(pmap, va); pteva = (vm_offset_t) PDPmap + amd64_ptob(m->pindex - (NUPDE + NUPDPE)); *pml4 = 0; } else if (m->pindex >= NUPDE) { /* PD page */ pdp_entry_t *pdp; pdp = pmap_pdpe(pmap, va); pteva = (vm_offset_t) PDmap + amd64_ptob(m->pindex - NUPDE); *pdp = 0; } else { /* PTE page */ pd_entry_t *pd; pd = pmap_pde(pmap, va); pteva = (vm_offset_t) PTmap + amd64_ptob(m->pindex); *pd = 0; } --pmap->pm_stats.resident_count; if (m->pindex < NUPDE) { /* We just released a PT, unhold the matching PD */ vm_page_t pdpg; pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME); pmap_unwire_pte_hold(pmap, va, pdpg, free); } if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) { /* We just released a PD, unhold the matching PDP */ vm_page_t pdppg; pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME); pmap_unwire_pte_hold(pmap, va, pdppg, free); } /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ m->right = *free; *free = m; - - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); return 1; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. */ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free) { vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return 0; KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0")); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return pmap_unwire_pte_hold(pmap, va, mpte, free); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pml4 = (pml4_entry_t *)(KERNBASE + KPML4phys); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap_t pmap) { vm_page_t pml4pg; static vm_pindex_t color; PMAP_LOCK_INIT(pmap); /* * allocate the page directory page */ while ((pml4pg = vm_page_alloc(NULL, color++, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) VM_WAIT; pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg)); if ((pml4pg->flags & PG_ZERO) == 0) pagezero(pmap->pm_pml4); /* Wire in kernel global address entries. 
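 * The KVA (KPML4I) and direct map (DMPML4I) PDP pages are shared with the
 * kernel pmap, so the kernel half of the address space is identical in every
 * pmap.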
*/ pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U; pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U; /* install self-referential address mapping entry(s) */ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | PG_V | PG_RW | PG_A | PG_M; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * this routine is called if the page table page is not * mapped correctly. * * Note: If a page allocation fails at page table level two or three, * one or two pages may be held during the wait, only to be released * afterwards. This conservative approach is easily argued to avoid * race conditions. */ static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, int flags) { vm_page_t m, pdppg, pdpg; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. */ pmap->pm_stats.resident_count++; if (ptepindex >= (NUPDE + NUPDPE)) { pml4_entry_t *pml4; vm_pindex_t pml4index; /* Wire up a new PDPE page */ pml4index = ptepindex - (NUPDE + NUPDPE); pml4 = &pmap->pm_pml4[pml4index]; *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else if (ptepindex >= NUPDE) { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; /* Wire up a new PDE page */ pdpindex = ptepindex - NUPDE; pml4index = pdpindex >> NPML4EPGSHIFT; pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pdp, recurse */ if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index, flags) == NULL) { --m->wire_count; vm_page_free(m); return (NULL); } } else { /* Add reference to pdp page */ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME); pdppg->wire_count++; } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); /* Now find the pdp page */ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } else { vm_pindex_t pml4index; vm_pindex_t pdpindex; pml4_entry_t *pml4; pdp_entry_t *pdp; pd_entry_t *pd; /* Wire up a new PTE page */ pdpindex = ptepindex >> NPDPEPGSHIFT; pml4index = pdpindex >> NPML4EPGSHIFT; /* First, find the pdp and check that its valid. 
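 * If the PML4 or PDP slot is empty, recurse to allocate the missing
 * intermediate page table page first; otherwise just add a reference to the
 * existing page by bumping its wire_count.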
*/ pml4 = &pmap->pm_pml4[pml4index]; if ((*pml4 & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, flags) == NULL) { --m->wire_count; vm_page_free(m); return (NULL); } pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; } else { pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME); pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)]; if ((*pdp & PG_V) == 0) { /* Have to allocate a new pd, recurse */ if (_pmap_allocpte(pmap, NUPDE + pdpindex, flags) == NULL) { --m->wire_count; vm_page_free(m); return (NULL); } } else { /* Add reference to the pd page */ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); pdpg->wire_count++; } } pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME); /* Now we know where the page directory page is */ pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)]; *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M; } return m; } static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va, int flags) { vm_pindex_t pdpindex, ptepindex; pdp_entry_t *pdpe; vm_page_t pdpg; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpde: flags is neither M_NOWAIT nor M_WAITOK")); retry: pdpe = pmap_pdpe(pmap, va); if (pdpe != NULL && (*pdpe & PG_V) != 0) { /* Add a reference to the pd page. */ pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME); pdpg->wire_count++; } else { /* Allocate a pd page. */ ptepindex = pmap_pde_pindex(va); pdpindex = ptepindex >> NPDPEPGSHIFT; pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, flags); if (pdpg == NULL && (flags & M_WAITOK)) goto retry; } return (pdpg); } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) { vm_pindex_t ptepindex; pd_entry_t *pd; vm_page_t m, free; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); retry: /* * Get the page directory entry */ pd = pmap_pde(pmap, va); /* * This supports switching from a 2MB page to a * normal 4K page. */ if (pd != 0 && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V)) { *pd = 0; pd = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; free = NULL; pmap_unuse_pt(pmap, va, *pmap_pdpe(pmap, va), &free); pmap_invalidate_all(kernel_pmap); pmap_free_zero_pages(free); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (pd != 0 && (*pd & PG_V) != 0) { m = PHYS_TO_VM_PAGE(*pd & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has been * deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & M_WAITOK)) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
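 * Only the three PML4 entries installed by pmap_pinit() (KVA, direct map, and
 * the recursive slot) are cleared before the PML4 page itself is unwired and
 * freed.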
*/ void pmap_release(pmap_t pmap) { vm_page_t m; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME); pmap->pm_pml4[KPML4I] = 0; /* KVA */ pmap->pm_pml4[DMPML4I] = 0; /* Direct Map */ pmap->pm_pml4[PML4PML4I] = 0; /* Recursive Mapping */ m->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); vm_page_free_zero(m); PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "LU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "LU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { vm_paddr_t paddr; vm_page_t nkpg; pd_entry_t *pde, newpdir; pdp_entry_t newpdp; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while ((*pmap_pde(kernel_pmap, kernel_vm_end) & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { pde = pmap_pde(kernel_pmap, kernel_vm_end); if (pde == NULL) { /* We need a new PDP entry */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdp = (pdp_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pdpe(kernel_pmap, kernel_vm_end) = newpdp; continue; /* try again */ } if ((*pde & PG_V) != 0) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); paddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (paddr | PG_V | PG_RW | PG_A | PG_M); *pmap_pde(kernel_pmap, kernel_vm_end) = newpdir; kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. 
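 *
 * pv entries are allocated from page-sized chunks (struct pv_chunk).  Each
 * chunk holds _NPCPV (168) entries tracked by a three-word free bitmap: two
 * full 64-bit words plus 40 bits of the third (64 + 64 + 40 = 168), which is
 * why PC_FREE2 is 0x000000fffffffffful.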
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 3); CTASSERT(_NPCPV == 168); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0 0xfffffffffffffffful #define PC_FREE1 0xfffffffffffffffful #define PC_FREE2 0x000000fffffffffful static uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); static int pmap_collect_inactive, pmap_collect_active; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, "Current number times pmap_collect called on inactive queue"); SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, "Current number times pmap_collect called on active queue"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. This is normally called to * unmap inactive pages, and if necessary, active pages. */ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { pd_entry_t ptepde; pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m, free; TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. 
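 * Locks are taken in pointer order: a pmap that sorts above the one already
 * held may be locked unconditionally, while any other pmap is only tried with
 * PMAP_TRYLOCK() and the entry is skipped if that fails.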
*/ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; pte = pmap_pte_pde(pmap, va, &ptepde); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#lx", tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_collect: modified page not writable: va: %#lx, pte: %#lx", va, tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, va, ptepde, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); m->md.pv_list_count--; free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } } } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { vm_page_t m; struct pv_chunk *pc; int idx, field, bit; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 64; bit = idx % 64; pc->pc_map[field] |= 1ul << bit; /* move to head of list */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 || pc->pc_map[2] != PC_FREE2) return; PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_free(m); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, int try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; static vm_pindex_t colour; int bit, field, page_req; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfq(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 64 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) { TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); } PV_STAT(pv_entry_spare--); return (pv); } } /* No free items, allocate another chunk */ page_req = try ? VM_ALLOC_NORMAL : VM_ALLOC_SYSTEM; m = vm_page_alloc(NULL, colour, page_req | VM_ALLOC_NOOBJ); if (m == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); return (NULL); } /* * Reclaim pv entries: At first, destroy mappings to inactive * pages. After that, if a pv chunk entry is still needed, * destroy mappings to active pages. 
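 * A rate-limited console message first suggests raising vm.pmap.shpgperproc
 * or vm.pmap.pv_entry_max; if reclaiming from the inactive and then the
 * active queue still yields no page, the allocation panics.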
*/ if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, consider " "increasing sysctl vm.pmap.shpgperproc or " "vm.pmap.pv_entry_max\n"); PV_STAT(pmap_collect_inactive++); pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ); if (m == NULL) { PV_STAT(pmap_collect_active++); pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ); if (m == NULL) panic("get_pv_entry: increase vm.pmap.shpgperproc"); } } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); colour++; dump_add_page(m->phys_addr); pc = (void *)PHYS_TO_DMAP(m->phys_addr); pc->pc_pmap = pmap; pc->pc_map[0] = PC_FREE0 & ~1ul; /* preallocated bit 0 */ pc->pc_map[1] = PC_FREE1; pc->pc_map[2] = PC_FREE2; pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) break; } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); free_pv_entry(pmap, pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, pd_entry_t ptepde, vm_page_t *free) { pt_entry_t oldpte; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. */ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if (oldpte & PG_M) { KASSERT((oldpte & PG_RW), ("pmap_remove_pte: modified page not writable: va: %#lx, pte: %#lx", va, oldpte)); vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, ptepde, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde) { pt_entry_t *pte; vm_page_t free = NULL; PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((*pde & PG_V) == 0) return; pte = pmap_pde_to_pte(pde, va); if ((*pte & PG_V) == 0) return; pmap_remove_pte(pmap, pte, va, *pde, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } /* * Remove the given range of addresses from the specified map. 
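 * A single-page range is special-cased, 2MB mappings are torn down one PDE at
 * a time, and a full TLB invalidation is issued at the end only when a
 * non-global mapping was removed (global entries are invalidated individually
 * as they are cleared).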
* * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; vm_page_t free = NULL; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; vm_page_lock_queues(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pde = pmap_pde(pmap, sva); if (pde && (*pde & PG_PS) == 0) { pmap_remove_page(pmap, sva, pde); goto out; } } for (; sva < eva; sva = va_next) { if (pmap->pm_stats.resident_count == 0) break; pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; continue; } /* * Calculate index for next page table. */ va_next = (sva + NBPDR) & ~PDRMASK; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { *pde = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pmap_unuse_pt(pmap, sva, *pdpe, &free); anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, ptpaddr, &free)) break; } } out: if (anyvalid) { pmap_invalidate_all(pmap); pmap_free_zero_pages(free); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; pd_entry_t ptepde; vm_page_t free; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! */ if (m->flags & PG_FICTITIOUS) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte = pmap_pte_pde(pmap, pv->pv_va, &ptepde); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_remove_all: modified page not writable: va: %#lx, pte: %#lx", pv->pv_va, tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. 
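 * Removing VM_PROT_READ removes the mappings entirely, removing VM_PROT_WRITE
 * clears PG_RW and PG_M, and removing VM_PROT_EXECUTE sets the pg_nx bit in
 * each entry.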
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va_next; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t ptpaddr, *pde; pt_entry_t *pte; int anychanged; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; anychanged = 0; vm_page_lock_queues(); PMAP_LOCK(pmap); for (; sva < eva; sva = va_next) { pml4e = pmap_pml4e(pmap, sva); if ((*pml4e & PG_V) == 0) { va_next = (sva + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, sva); if ((*pdpe & PG_V) == 0) { va_next = (sva + NBPDP) & ~PDPMASK; continue; } va_next = (sva + NBPDR) & ~PDRMASK; pde = pmap_pdpe_to_pde(pdpe, sva); ptpaddr = *pde; /* * Weed out invalid mappings. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { if ((prot & VM_PROT_WRITE) == 0) *pde &= ~(PG_M|PG_RW); if ((prot & VM_PROT_EXECUTE) == 0) *pde |= pg_nx; anychanged = 1; continue; } if (va_next > eva) va_next = eva; for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++, sva += PAGE_SIZE) { pt_entry_t obits, pbits; vm_page_t m; retry: obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } } if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; if (pbits != obits) { if (!atomic_cmpset_long(pte, obits, pbits)) goto retry; if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = 1; } } } if (anychanged) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte, om; boolean_t invlva; va = trunc_page(va); #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)", va); #endif mpte = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); /* * In the case that a page table page is not * resident, we are creating it here. 
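 * This only applies to user addresses; kernel page table pages are grown
 * separately by pmap_growkernel(), so a missing kernel page directory entry
 * is a panic below.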
*/ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pde=%p, va=%p\n", origpte, va); } } #endif pde = pmap_pde(pmap, va); if (pde != NULL) { if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 2MB page"); pte = pmap_pde_to_pte(pde, va); } else pte = NULL; /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) panic("pmap_enter: invalid page directory va=%#lx\n", va); pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%lx", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); pa |= PG_MANAGED; } /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; vm_page_flag_set(m, PG_WRITEABLE); } if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= PG_G; /* * if the mapping or permission bits are different, we need * to update the pte. */ if ((origpte & ~(PG_M|PG_A)) != newpte) { if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte | PG_A); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m) || ((origpte & PG_NX) == 0 && (newpte & PG_NX))) invlva = TRUE; } if (origpte & PG_M) { KASSERT((origpte & PG_RW), ("pmap_enter: modified page not writable: va: %#lx, pte: %#lx", va, origpte)); if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((newpte & PG_RW) == 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte | PG_A); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. 
Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); mpte = NULL; m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { vm_page_t free; pt_entry_t *pte; vm_paddr_t pa; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { vm_pindex_t ptepindex; pd_entry_t *ptepa; /* * Calculate pagetable page index */ ptepindex = pmap_pde_pindex(va); if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap_pde(pmap, va); /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa && (*ptepa & PG_V) != 0) { if (*ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 2MB page"); mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, M_NOWAIT); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. In order to support * quick entry into any pmap, one would likely use pmap_pte. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { free = NULL; if (pmap_unwire_pte_hold(pmap, va, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. 
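 * The address is entered at the i-th page of the crashdumpmap window, so at
 * most MAXDUMPPGS pages may be mapped at once.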
This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_offset_t va; vm_page_t p, pdpg; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { vm_page_t m[1]; pd_entry_t ptepa, *pde; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != 0 && (*pde & PG_V) != 0) goto out; PMAP_UNLOCK(pmap); retry: p = vm_page_lookup(object, pindex); if (p != NULL) { if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); } ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; PMAP_LOCK(pmap); for (va = addr; va < addr + size; va += NBPDR) { while ((pdpg = pmap_allocpde(pmap, va, M_NOWAIT)) == NULL) { PMAP_UNLOCK(pmap); vm_page_lock_queues(); vm_page_busy(p); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); PMAP_LOCK(pmap); } pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg)); pde = &pde[pmap_pde_index(va)]; if ((*pde & PG_V) == 0) { pde_store(pde, ptepa | PG_PS | PG_M | PG_A | PG_U | PG_RW | PG_V); pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } else { pdpg->wire_count--; KASSERT(pdpg->wire_count > 0, ("pmap_object_init_pt: missing reference " "to page directory page, va: 0x%lx", va)); } ptepa += NBPDR; } pmap_invalidate_all(pmap); out: PMAP_UNLOCK(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { pt_entry_t *pte; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (wired && (*pte & PG_W) == 0) { pmap->pm_stats.wired_count++; atomic_set_long(pte, PG_W); } else if (!wired && (*pte & PG_W) != 0) { pmap->pm_stats.wired_count--; atomic_clear_long(pte, PG_W); } PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
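 * On amd64 it copies mappings only when dst_addr == src_addr (the fork case),
 * duplicating 2MB PDEs directly and 4KB entries only for managed pages, with
 * the wired (and, for 4KB entries, modified and accessed) bits cleared in the
 * copies.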
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_page_t free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t va_next; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr = va_next) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpde, dstmpte, srcmpte; pml4_entry_t *pml4e; pdp_entry_t *pdpe; pd_entry_t srcptepaddr, *pde; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables"); pml4e = pmap_pml4e(src_pmap, addr); if ((*pml4e & PG_V) == 0) { va_next = (addr + NBPML4) & ~PML4MASK; continue; } pdpe = pmap_pml4e_to_pdpe(pml4e, addr); if ((*pdpe & PG_V) == 0) { va_next = (addr + NBPDP) & ~PDPMASK; continue; } va_next = (addr + NBPDR) & ~PDRMASK; pde = pmap_pdpe_to_pde(pdpe, addr); srcptepaddr = *pde; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { dstmpde = pmap_allocpde(dst_pmap, addr, M_NOWAIT); if (dstmpde == NULL) break; pde = (pd_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpde)); pde = &pde[pmap_pde_index(addr)]; if (*pde == 0) { *pde = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } else dstmpde->wire_count--; continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); if (srcmpte->wire_count == 0) panic("pmap_copy: source page table page is unused"); if (va_next > end_addr) va_next = end_addr; src_pte = vtopte(addr); while (addr < va_next) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, M_NOWAIT); if (dstmpte == NULL) break; dst_pte = (pt_entry_t *) PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte)); dst_pte = &dst_pte[pmap_pte_index(addr)]; if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { free = NULL; if (pmap_unwire_pte_hold(dst_pmap, addr, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(free); } } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); if (off == 0 && size == PAGE_SIZE) pagezero((void *)va); else bzero((char *)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. 
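 * On amd64 no temporary mapping is actually needed: the page is zeroed
 * through its permanent direct map (PHYS_TO_DMAP) address.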
*/ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)); pagezero((void *)va); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst)); pagecopy((void *)src, (void *)dst); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, free = NULL; pv_entry_t pv; struct pv_chunk *pc, *npc; int field, idx; int64_t bit; uint64_t inuse, bitmask; int allfree; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = bsfq(inuse); bitmask = 1UL << bit; idx = field * 64 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = vtopte(pv->pv_va); tpte = *pte; if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08lx\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (tpte & PG_M) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, *vtopde(pv->pv_va), &free); } } if (allfree) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc)); dump_drop_page(m->phys_addr); vm_page_free(m); } } pmap_invalidate_all(pmap); pmap_free_zero_pages(free); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. 
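 * The page's pv list is walked and the scan stops at the first mapping whose
 * PTE has PG_M set.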
*/ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rv = FALSE; if (m->flags & PG_FICTITIOUS) return (rv); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); rv = (*pte & PG_M) != 0; PMAP_UNLOCK(pmap); if (rv) break; } return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); pde = pmap_pde(pmap, addr); if (pde != NULL && (*pde & PG_V)) { pte = vtopte(addr); rv = (*pte & PG_V) == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t oldpte, *pte; if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); retry: oldpte = *pte; if (oldpte & PG_RW) { if (!atomic_cmpset_long(pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv, pvf, pvn; pmap_t pmap; pt_entry_t *pte; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) pvn = NULL; } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; if ((m->flags & PG_FICTITIOUS) != 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (*pte & PG_M) { atomic_clear_long(pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. 
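[Editor's note: illustrative sketch, not part of the diff.] pmap_remove_write, above, demotes each writable mapping with a compare-and-set retry loop so a concurrent hardware update of the PTE (for example the CPU setting the modified bit) is never lost. The same pattern sketched with C11 atomics and illustrative flag bits; the kernel itself uses atomic_cmpset_long on the live PTE:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define X_PG_RW (1ULL << 1)   /* illustrative write-enable bit */
#define X_PG_M  (1ULL << 6)   /* illustrative modified bit */

int main(void)
{
	_Atomic uint64_t pte = 0xabcde000ULL | X_PG_RW | X_PG_M;
	uint64_t oldpte, newpte;

	do {
		oldpte = atomic_load(&pte);
		if ((oldpte & X_PG_RW) == 0)
			break;                      /* already read-only */
		newpte = oldpte & ~(X_PG_RW | X_PG_M);
		/* Retry if the entry changed underneath us. */
	} while (!atomic_compare_exchange_weak(&pte, &oldpte, newpte));

	printf("final pte %#llx\n", (unsigned long long)atomic_load(&pte));
	return 0;
}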
*/ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; if ((m->flags & PG_FICTITIOUS) != 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte(pmap, pv->pv_va); if (*pte & PG_A) { atomic_clear_long(pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } } /* * Miscellaneous support routines follow */ /* Adjust the cache mode for a 4KB page mapped via a PTE. */ static __inline void pmap_pte_attr(vm_offset_t va, int mode) { pt_entry_t *pte; u_int opte, npte; pte = vtopte(va); /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); npte |= pmap_cache_bits(mode, 0); } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); } /* Adjust the cache mode for a 2MB page mapped via a PDE. */ static __inline void pmap_pde_attr(vm_offset_t va, int mode) { pd_entry_t *pde; u_int opde, npde; pde = pmap_pde(kernel_pmap, va); /* * The cache mode bits are all in the low 32-bits of the * PDE, so we can just spin on updating the low 32-bits. */ do { opde = *(u_int *)pde; npde = opde & ~(PG_PDE_PAT | PG_NC_PCD | PG_NC_PWT); npde |= pmap_cache_bits(mode, 1); } while (npde != opde && !atomic_cmpset_int((u_int *)pde, opde, npde)); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { vm_offset_t va, tmpva, offset; /* * If this fits within the direct map window and use WB caching * mode, use the direct map. */ if (pa < dmaplimit && (pa + size) < dmaplimit && mode == PAT_WRITE_BACK) return ((void *)PHYS_TO_DMAP(pa)); offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); pa = trunc_page(pa); for (tmpva = va; size > 0; ) { pmap_kenter_attr(tmpva, pa, mode); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); pmap_invalidate_cache(); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset, tmpva; /* If we gave a direct map region in pmap_mapdev, do nothing */ if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } int pmap_change_attr(va, size, mode) vm_offset_t va; vm_size_t size; int mode; { vm_offset_t base, offset, tmpva; pd_entry_t *pde; pt_entry_t *pte; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* Only supported on kernel virtual addresses. */ if (base <= VM_MAXUSER_ADDRESS) return (EINVAL); /* * XXX: We have to support tearing 2MB pages down into 4k pages if * needed here. */ /* Pages that aren't mapped aren't supported. 
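[Editor's note: illustrative sketch, not part of the diff.] pmap_mapdev_attr, above, separates the physical address into a page-aligned base plus in-page offset, rounds the mapping length up to whole pages, and hands the caller back the mapped base plus the original offset. The arithmetic in isolation, assuming 4 KB pages:

#include <stdint.h>
#include <stdio.h>

#define X_PAGE_SIZE 4096UL
#define X_PAGE_MASK (X_PAGE_SIZE - 1)

int main(void)
{
	uintptr_t pa = 0xfee00c10UL;      /* unaligned device address */
	uintptr_t size = 0x120;

	uintptr_t offset = pa & X_PAGE_MASK;
	uintptr_t map_size = (offset + size + X_PAGE_MASK) & ~X_PAGE_MASK;
	uintptr_t base = pa & ~X_PAGE_MASK;

	/* The caller maps [base, base + map_size) and gets va + offset back. */
	printf("map %#lx bytes at pa %#lx, return va + %#lx\n",
	    (unsigned long)map_size, (unsigned long)base, (unsigned long)offset);
	return 0;
}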
*/ for (tmpva = base; tmpva < (base + size); ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde == 0) return (EINVAL); if (*pde & PG_PS) { /* Handle 2MB pages that are completely contained. */ if (size >= NBPDR) { tmpva += NBPDR; continue; } return (EINVAL); } pte = vtopte(va); if (*pte == 0) return (EINVAL); tmpva += PAGE_SIZE; } /* * Ok, all the pages exist, so run through them updating their * cache mode. */ for (tmpva = base; size > 0; ) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) { pmap_pde_attr(tmpva, mode); tmpva += NBPDR; size -= NBPDR; } else { pmap_pte_attr(tmpva, mode); tmpva += PAGE_SIZE; size -= PAGE_SIZE; } } /* * Flush CPU caches to make sure any data isn't cached that shouldn't * be, etc. */ pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache(); return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? *ptep : 0; PMAP_UNLOCK(pmap); if (pte != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int64_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #ifdef SMP if (oldpmap) /* XXX FIXME */ atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else if (oldpmap) /* XXX FIXME */ oldpmap->pm_active &= ~PCPU_GET(cpumask); pmap->pm_active |= PCPU_GET(cpumask); #endif cr3 = vtophys(pmap->pm_pml4); td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1); return addr; } Index: head/sys/arm/arm/machdep.c =================================================================== --- head/sys/arm/arm/machdep.c (revision 169666) +++ head/sys/arm/arm/machdep.c (revision 169667) @@ -1,633 +1,633 @@ /* $NetBSD: arm32_machdep.c,v 1.44 2004/03/24 15:34:47 atatat Exp $ */ /*- * Copyright (c) 2004 Olivier Houchard * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
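[Editor's note: illustrative sketch, not part of the diff.] Earlier in this hunk, pmap_addr_hint rounds a suggested mapping address up to the next NBPDR boundary so device objects can be mapped with 2 MB superpages. The rounding idiom by itself, with NBPDR assumed to be 2 MB:

#include <stdint.h>
#include <stdio.h>

#define X_NBPDR (2UL * 1024 * 1024)   /* assumed 2 MB superpage size */

int main(void)
{
	uintptr_t addr = 0x00423456UL;
	uintptr_t hint = (addr + (X_NBPDR - 1)) & ~(X_NBPDR - 1);

	printf("%#lx rounds up to %#lx\n",
	    (unsigned long)addr, (unsigned long)hint);
	return 0;
}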
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe * for the NetBSD Project. * 4. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Machine dependant functions for kernel setup * * Created : 17/09/94 * Updated : 18/04/01 updated for new wscons */ #include "opt_compat.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include uint32_t cpu_reset_address = 0; int cold = 1; vm_offset_t vector_page; long realmem = 0; int (*_arm_memcpy)(void *, void *, int, int) = NULL; int (*_arm_bzero)(void *, int, int) = NULL; int _min_memcpy_size = 0; int _min_bzero_size = 0; void sendsig(catcher, ksi, mask) sig_t catcher; ksiginfo_t *ksi; sigset_t *mask; { struct thread *td; struct proc *p; struct trapframe *tf; struct sigframe *fp, frame; struct sigacts *psp; int onstack; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; onstack = sigonstack(tf->tf_usr_sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Allocate and validate space for the signal handler context. */ if ((td->td_flags & TDP_ALTSTACK) != 0 && !(onstack) && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct sigframe *)td->td_frame->tf_usr_sp; /* make room on the stack */ fp--; /* make the stack aligned */ fp = (struct sigframe *)STACKALIGN(fp); /* Populate the siginfo frame. */ get_mcontext(td, &frame.sf_uc.uc_mcontext, 0); frame.sf_si = ksi->ksi_info; frame.sf_uc.uc_sigmask = *mask; frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK ) ? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE; frame.sf_uc.uc_stack = td->td_sigstk; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(td->td_proc); /* Copy the sigframe out to the user's stack. */ if (copyout(&frame, fp, sizeof(*fp)) != 0) { /* Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp); PROC_LOCK(p); sigexit(td, SIGILL); } /* Translate the signal if appropriate. 
*/ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* * Build context to run handler in. We invoke the handler * directly, only returning via the trampoline. Note the * trampoline version numbers are coordinated with machine- * dependent code in libc. */ tf->tf_r0 = sig; tf->tf_r1 = (register_t)&fp->sf_si; tf->tf_r2 = (register_t)&fp->sf_uc; /* the trampoline uses r5 as the uc address */ tf->tf_r5 = (register_t)&fp->sf_uc; tf->tf_pc = (register_t)catcher; tf->tf_usr_sp = (register_t)fp; tf->tf_usr_lr = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_usr_lr, tf->tf_usr_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } struct kva_md_info kmi; /* * arm32_vector_init: * * Initialize the vector page, and select whether or not to * relocate the vectors. * * NOTE: We expect the vector page to be mapped at its expected * destination. */ extern unsigned int page0[], page0_data[]; void arm_vector_init(vm_offset_t va, int which) { unsigned int *vectors = (int *) va; unsigned int *vectors_data = vectors + (page0_data - page0); int vec; /* * Loop through the vectors we're taking over, and copy the * vector's insn and data word. */ for (vec = 0; vec < ARM_NVEC; vec++) { if ((which & (1 << vec)) == 0) { /* Don't want to take over this vector. */ continue; } vectors[vec] = page0[vec]; vectors_data[vec] = page0_data[vec]; } /* Now sync the vectors. */ cpu_icache_sync_range(va, (ARM_NVEC * 2) * sizeof(u_int)); vector_page = va; if (va == ARM_VECTORS_HIGH) { /* * Assume the MD caller knows what it's doing here, and * really does want the vector page relocated. * * Note: This has to be done here (and not just in * cpu_setup()) because the vector page needs to be * accessible *before* cpu_startup() is called. * Think ddb(9) ... * * NOTE: If the CPU control register is not readable, * this will totally fail! We'll just assume that * any system that has high vector support has a * readable CPU control register, for now. If we * ever encounter one that does not, we'll have to * rethink this. */ cpu_control(CPU_CONTROL_VECRELOC, CPU_CONTROL_VECRELOC); } } static void cpu_startup(void *dummy) { struct pcb *pcb = thread0.td_pcb; #ifndef ARM_CACHE_LOCK_ENABLE vm_page_t m; #endif cpu_setup(""); identify_arm_cpu(); printf("real memory = %ju (%ju MB)\n", (uintmax_t)ptoa(physmem), (uintmax_t)ptoa(physmem) / 1048576); realmem = physmem; /* * Display the RAM layout. 
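[Editor's note: illustrative sketch, not part of the diff.] arm_vector_init, above, copies only the exception vectors selected by the `which' bitmask into the vector page (the data words are handled the same way) before syncing the instruction cache. The bitmask selection on its own, assuming the usual eight ARM exception vectors:

#include <stdio.h>

#define X_NVEC 8   /* the eight ARM exception vectors */

int main(void)
{
	unsigned int page0[X_NVEC]   = { 10, 11, 12, 13, 14, 15, 16, 17 };
	unsigned int vectors[X_NVEC] = { 0 };
	int which = (1 << 1) | (1 << 4);   /* take over vectors 1 and 4 only */

	for (int vec = 0; vec < X_NVEC; vec++) {
		if ((which & (1 << vec)) == 0)
			continue;                  /* leave this vector alone */
		vectors[vec] = page0[vec];
	}
	for (int vec = 0; vec < X_NVEC; vec++)
		printf("vector %d = %u\n", vec, vectors[vec]);
	return 0;
}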
*/ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - (uintmax_t)ptoa(cnt.v_free_count), - (uintmax_t)ptoa(cnt.v_free_count) / 1048576); + (uintmax_t)ptoa(VMCNT_GET(free_count)), + (uintmax_t)ptoa(VMCNT_GET(free_count)) / 1048576); bufinit(); vm_pager_bufferinit(); pcb->un_32.pcb32_und_sp = (u_int)thread0.td_kstack + USPACE_UNDEF_STACK_TOP; pcb->un_32.pcb32_sp = (u_int)thread0.td_kstack + USPACE_SVC_STACK_TOP; vector_page_setprot(VM_PROT_READ); pmap_set_pcb_pagedir(pmap_kernel(), pcb); thread0.td_frame = (struct trapframe *)pcb->un_32.pcb32_sp - 1; pmap_postinit(); #ifdef ARM_CACHE_LOCK_ENABLE pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS); arm_lock_cache_line(ARM_TP_ADDRESS); #else m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_ZERO); pmap_kenter_user(ARM_TP_ADDRESS, VM_PAGE_TO_PHYS(m)); #endif } SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } void cpu_idle(void) { cpu_sleep(0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf = td->td_frame; bcopy(&tf->tf_r0, regs->r, sizeof(regs->r)); regs->r_sp = tf->tf_usr_sp; regs->r_lr = tf->tf_usr_lr; regs->r_pc = tf->tf_pc; regs->r_cpsr = tf->tf_spsr; return (0); } int fill_fpregs(struct thread *td, struct fpreg *regs) { bzero(regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf = td->td_frame; bcopy(regs->r, &tf->tf_r0, sizeof(regs->r)); tf->tf_usr_sp = regs->r_sp; tf->tf_usr_lr = regs->r_lr; tf->tf_pc = regs->r_pc; tf->tf_spsr &= ~PSR_FLAGS; tf->tf_spsr |= regs->r_cpsr & PSR_FLAGS; return (0); } int set_fpregs(struct thread *td, struct fpreg *regs) { return (0); } int fill_dbregs(struct thread *td, struct dbreg *regs) { return (0); } int set_dbregs(struct thread *td, struct dbreg *regs) { return (0); } static int ptrace_read_int(struct thread *td, vm_offset_t addr, u_int32_t *v) { struct iovec iov; struct uio uio; PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED); iov.iov_base = (caddr_t) v; iov.iov_len = sizeof(u_int32_t); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)addr; uio.uio_resid = sizeof(u_int32_t); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; return proc_rwmem(td->td_proc, &uio); } static int ptrace_write_int(struct thread *td, vm_offset_t addr, u_int32_t v) { struct iovec iov; struct uio uio; PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED); iov.iov_base = (caddr_t) &v; iov.iov_len = sizeof(u_int32_t); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = (off_t)addr; uio.uio_resid = sizeof(u_int32_t); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; return proc_rwmem(td->td_proc, &uio); } int ptrace_single_step(struct thread *td) { struct proc *p; int error; KASSERT(td->td_md.md_ptrace_instr == 0, ("Didn't clear single step")); p = td->td_proc; PROC_UNLOCK(p); error = ptrace_read_int(td, td->td_frame->tf_pc + 4, &td->td_md.md_ptrace_instr); if (error) goto out; error = ptrace_write_int(td, td->td_frame->tf_pc + 4, PTRACE_BREAKPOINT); if (error) td->td_md.md_ptrace_instr = 0; 
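[Editor's note: illustrative sketch, not part of the diff.] The cpu_startup routines touched by this revision print the free page count converted to bytes and megabytes; the only change is that the count now comes from VMCNT_GET(free_count) instead of cnt.v_free_count. The unit conversion itself, sketched with a 4 KB page size assumed:

#include <stdint.h>
#include <stdio.h>

#define X_PAGE_SHIFT 12   /* assumed 4 KB pages */

/* ptoa(): convert a page count to a byte count. */
static uintmax_t x_ptoa(uintmax_t pages) { return pages << X_PAGE_SHIFT; }

int main(void)
{
	uintmax_t free_count = 123456;   /* stand-in for VMCNT_GET(free_count) */

	printf("avail memory = %ju (%ju MB)\n",
	    x_ptoa(free_count), x_ptoa(free_count) / 1048576);
	return 0;
}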
td->td_md.md_ptrace_addr = td->td_frame->tf_pc + 4; out: PROC_LOCK(p); return (error); } int ptrace_clear_single_step(struct thread *td) { struct proc *p; if (td->td_md.md_ptrace_instr) { p = td->td_proc; PROC_UNLOCK(p); ptrace_write_int(td, td->td_md.md_ptrace_addr, td->td_md.md_ptrace_instr); PROC_LOCK(p); td->td_md.md_ptrace_instr = 0; } return (0); } int ptrace_set_pc(struct thread *td, unsigned long addr) { td->td_frame->tf_pc = addr; return (0); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_cspr = disable_interrupts(I32_bit | F32_bit); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) restore_interrupts(td->td_md.md_saved_cspr); } /* * Clear registers on exec */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf = td->td_frame; memset(tf, 0, sizeof(*tf)); tf->tf_usr_sp = stack; tf->tf_usr_lr = entry; tf->tf_svc_lr = 0x77777777; tf->tf_pc = entry; tf->tf_spsr = PSR_USR32_MODE; } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret) { struct trapframe *tf = td->td_frame; __greg_t *gr = mcp->__gregs; if (clear_ret & GET_MC_CLEAR_RET) gr[_REG_R0] = 0; else gr[_REG_R0] = tf->tf_r0; gr[_REG_R1] = tf->tf_r1; gr[_REG_R2] = tf->tf_r2; gr[_REG_R3] = tf->tf_r3; gr[_REG_R4] = tf->tf_r4; gr[_REG_R5] = tf->tf_r5; gr[_REG_R6] = tf->tf_r6; gr[_REG_R7] = tf->tf_r7; gr[_REG_R8] = tf->tf_r8; gr[_REG_R9] = tf->tf_r9; gr[_REG_R10] = tf->tf_r10; gr[_REG_R11] = tf->tf_r11; gr[_REG_R12] = tf->tf_r12; gr[_REG_SP] = tf->tf_usr_sp; gr[_REG_LR] = tf->tf_usr_lr; gr[_REG_PC] = tf->tf_pc; gr[_REG_CPSR] = tf->tf_spsr; return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tf = td->td_frame; __greg_t *gr = mcp->__gregs; tf->tf_r0 = gr[_REG_R0]; tf->tf_r1 = gr[_REG_R1]; tf->tf_r2 = gr[_REG_R2]; tf->tf_r3 = gr[_REG_R3]; tf->tf_r4 = gr[_REG_R4]; tf->tf_r5 = gr[_REG_R5]; tf->tf_r6 = gr[_REG_R6]; tf->tf_r7 = gr[_REG_R7]; tf->tf_r8 = gr[_REG_R8]; tf->tf_r9 = gr[_REG_R9]; tf->tf_r10 = gr[_REG_R10]; tf->tf_r11 = gr[_REG_R11]; tf->tf_r12 = gr[_REG_R12]; tf->tf_usr_sp = gr[_REG_SP]; tf->tf_usr_lr = gr[_REG_LR]; tf->tf_pc = gr[_REG_PC]; tf->tf_spsr = gr[_REG_CPSR]; return (0); } /* * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { struct proc *p = td->td_proc; struct sigframe sf; struct trapframe *tf; int spsr; if (uap == NULL) return (EFAULT); if (copyin(uap->sigcntxp, &sf, sizeof(sf))) return (EFAULT); /* * Make sure the processor mode has not been tampered with and * interrupts have not been disabled. */ spsr = sf.sf_uc.uc_mcontext.__gregs[_REG_CPSR]; if ((spsr & PSR_MODE) != PSR_USR32_MODE || (spsr & (I32_bit | F32_bit)) != 0) return (EINVAL); /* Restore register context. */ tf = td->td_frame; set_mcontext(td, &sf.sf_uc.uc_mcontext); /* Restore signal mask. */ PROC_LOCK(p); td->td_sigmask = sf.sf_uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Construct a PCB from a trapframe. 
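[Editor's note: illustrative sketch, not part of the diff.] spinlock_enter and spinlock_exit, above, save and restore the interrupt state only at the outermost nesting level, using md_spinlock_count as the depth counter. A reduced user-space sketch of that pattern (critical_enter/critical_exit omitted; the interrupt primitives are stand-ins):

#include <stdio.h>

static int spinlock_count;    /* like td_md.md_spinlock_count */
static int saved_state;
static int irq_enabled = 1;

static int x_disable_interrupts(void) { int old = irq_enabled; irq_enabled = 0; return old; }
static void x_restore_interrupts(int old) { irq_enabled = old; }

static void x_spinlock_enter(void)
{
	if (spinlock_count == 0)
		saved_state = x_disable_interrupts();  /* outermost entry only */
	spinlock_count++;
}

static void x_spinlock_exit(void)
{
	spinlock_count--;
	if (spinlock_count == 0)
		x_restore_interrupts(saved_state);     /* outermost exit only */
}

int main(void)
{
	x_spinlock_enter();
	x_spinlock_enter();                            /* nested */
	x_spinlock_exit();
	printf("after inner exit: irq_enabled=%d\n", irq_enabled);  /* still 0 */
	x_spinlock_exit();
	printf("after outer exit: irq_enabled=%d\n", irq_enabled);  /* back to 1 */
	return 0;
}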
This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->un_32.pcb32_r8 = tf->tf_r8; pcb->un_32.pcb32_r9 = tf->tf_r9; pcb->un_32.pcb32_r10 = tf->tf_r10; pcb->un_32.pcb32_r11 = tf->tf_r11; pcb->un_32.pcb32_r12 = tf->tf_r12; pcb->un_32.pcb32_pc = tf->tf_pc; pcb->un_32.pcb32_lr = tf->tf_usr_lr; pcb->un_32.pcb32_sp = tf->tf_usr_sp; } Index: head/sys/arm/arm/pmap.c =================================================================== --- head/sys/arm/arm/pmap.c (revision 169666) +++ head/sys/arm/arm/pmap.c (revision 169667) @@ -1,4849 +1,4849 @@ /* From: $NetBSD: pmap.c,v 1.148 2004/04/03 04:35:48 bsh Exp $ */ /*- * Copyright 2004 Olivier Houchard. * Copyright 2003 Wasabi Systems, Inc. * All rights reserved. * * Written by Steve C. Woodford for Wasabi Systems, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed for the NetBSD Project by * Wasabi Systems, Inc. * 4. The name of Wasabi Systems, Inc. may not be used to endorse * or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2002-2003 Wasabi Systems, Inc. * Copyright (c) 2001 Richard Earnshaw * Copyright (c) 2001-2002 Christopher Gilbert * All rights reserved. * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the company nor the name of the author may be used to * endorse or promote products derived from this software without specific * prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /*- * Copyright (c) 1999 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Charles M. Hannum. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the NetBSD * Foundation, Inc. and its contributors. * 4. Neither the name of The NetBSD Foundation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 1994-1998 Mark Brinicombe. * Copyright (c) 1994 Brini. * All rights reserved. * * This code is derived from software written for Brini by Mark Brinicombe * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Mark Brinicombe. * 4. 
The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * * RiscBSD kernel project * * pmap.c * * Machine dependant vm stuff * * Created : 20/09/94 */ /* * Special compilation symbols * PMAP_DEBUG - Build in pmap_debug_level code */ /* Include header files */ #include "opt_vm.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PMAP_DEBUG #define PDEBUG(_lev_,_stat_) \ if (pmap_debug_level >= (_lev_)) \ ((_stat_)) #define dprintf printf int pmap_debug_level = 0; #define PMAP_INLINE #else /* PMAP_DEBUG */ #define PDEBUG(_lev_,_stat_) /* Nothing */ #define dprintf(x, arg...) #define PMAP_INLINE __inline #endif /* PMAP_DEBUG */ extern struct pv_addr systempage; /* * Internal function prototypes */ static void pmap_free_pv_entry (pv_entry_t); static pv_entry_t pmap_get_pv_entry(void); static void pmap_enter_locked(pmap_t, vm_offset_t, vm_page_t, vm_prot_t, boolean_t, int); static void pmap_vac_me_harder(struct vm_page *, pmap_t, vm_offset_t); static void pmap_vac_me_kpmap(struct vm_page *, pmap_t, vm_offset_t); static void pmap_vac_me_user(struct vm_page *, pmap_t, vm_offset_t); static void pmap_alloc_l1(pmap_t); static void pmap_free_l1(pmap_t); static void pmap_use_l1(pmap_t); static int pmap_clearbit(struct vm_page *, u_int); static struct l2_bucket *pmap_get_l2_bucket(pmap_t, vm_offset_t); static struct l2_bucket *pmap_alloc_l2_bucket(pmap_t, vm_offset_t); static void pmap_free_l2_bucket(pmap_t, struct l2_bucket *, u_int); static vm_offset_t kernel_pt_lookup(vm_paddr_t); static MALLOC_DEFINE(M_VMPMAP, "pmap", "PMAP L1"); vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ vm_offset_t pmap_curmaxkvaddr; vm_paddr_t kernel_l1pa; extern void *end; vm_offset_t kernel_vm_end = 0; struct pmap kernel_pmap_store; pmap_t kernel_pmap; static pt_entry_t *csrc_pte, *cdst_pte; static vm_offset_t csrcp, cdstp; static struct mtx cmtx; static void pmap_init_l1(struct l1_ttable *, pd_entry_t *); /* * These routines are called when the CPU type is identified to set up * the PTE prototypes, cache modes, etc. * * The variables are always here, just in case LKMs need to reference * them (though, they shouldn't). 
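[Editor's note: illustrative sketch, not part of the diff.] The PMAP_DEBUG block above compiles the PDEBUG diagnostics down to nothing unless the option is set, and gates them at run time on pmap_debug_level otherwise. A minimal sketch of that level-gated debug macro:

#include <stdio.h>

#define X_PMAP_DEBUG 1   /* flip to 0 to compile the prints away entirely */

#if X_PMAP_DEBUG
static int x_debug_level = 2;
#define X_PDEBUG(lev, stat) do { if (x_debug_level >= (lev)) stat; } while (0)
#else
#define X_PDEBUG(lev, stat) do { } while (0)
#endif

int main(void)
{
	X_PDEBUG(1, printf("level 1: printed\n"));
	X_PDEBUG(5, printf("level 5: suppressed\n"));
	return 0;
}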
*/ pt_entry_t pte_l1_s_cache_mode; pt_entry_t pte_l1_s_cache_mode_pt; pt_entry_t pte_l1_s_cache_mask; pt_entry_t pte_l2_l_cache_mode; pt_entry_t pte_l2_l_cache_mode_pt; pt_entry_t pte_l2_l_cache_mask; pt_entry_t pte_l2_s_cache_mode; pt_entry_t pte_l2_s_cache_mode_pt; pt_entry_t pte_l2_s_cache_mask; pt_entry_t pte_l2_s_prot_u; pt_entry_t pte_l2_s_prot_w; pt_entry_t pte_l2_s_prot_mask; pt_entry_t pte_l1_s_proto; pt_entry_t pte_l1_c_proto; pt_entry_t pte_l2_s_proto; void (*pmap_copy_page_func)(vm_paddr_t, vm_paddr_t); void (*pmap_zero_page_func)(vm_paddr_t, int, int); /* * Which pmap is currently 'live' in the cache * * XXXSCW: Fix for SMP ... */ union pmap_cache_state *pmap_cache_state; /* static pt_entry_t *msgbufmap;*/ struct msgbuf *msgbufp = 0; extern void bcopy_page(vm_offset_t, vm_offset_t); extern void bzero_page(vm_offset_t); extern vm_offset_t alloc_firstaddr; char *_tmppt; /* * Metadata for L1 translation tables. */ struct l1_ttable { /* Entry on the L1 Table list */ SLIST_ENTRY(l1_ttable) l1_link; /* Entry on the L1 Least Recently Used list */ TAILQ_ENTRY(l1_ttable) l1_lru; /* Track how many domains are allocated from this L1 */ volatile u_int l1_domain_use_count; /* * A free-list of domain numbers for this L1. * We avoid using ffs() and a bitmap to track domains since ffs() * is slow on ARM. */ u_int8_t l1_domain_first; u_int8_t l1_domain_free[PMAP_DOMAINS]; /* Physical address of this L1 page table */ vm_paddr_t l1_physaddr; /* KVA of this L1 page table */ pd_entry_t *l1_kva; }; /* * Convert a virtual address into its L1 table index. That is, the * index used to locate the L2 descriptor table pointer in an L1 table. * This is basically used to index l1->l1_kva[]. * * Each L2 descriptor table represents 1MB of VA space. */ #define L1_IDX(va) (((vm_offset_t)(va)) >> L1_S_SHIFT) /* * L1 Page Tables are tracked using a Least Recently Used list. * - New L1s are allocated from the HEAD. * - Freed L1s are added to the TAIl. * - Recently accessed L1s (where an 'access' is some change to one of * the userland pmaps which owns this L1) are moved to the TAIL. */ static TAILQ_HEAD(, l1_ttable) l1_lru_list; /* * A list of all L1 tables */ static SLIST_HEAD(, l1_ttable) l1_list; static struct mtx l1_lru_lock; /* * The l2_dtable tracks L2_BUCKET_SIZE worth of L1 slots. * * This is normally 16MB worth L2 page descriptors for any given pmap. * Reference counts are maintained for L2 descriptors so they can be * freed when empty. */ struct l2_dtable { /* The number of L2 page descriptors allocated to this l2_dtable */ u_int l2_occupancy; /* List of L2 page descriptors */ struct l2_bucket { pt_entry_t *l2b_kva; /* KVA of L2 Descriptor Table */ vm_paddr_t l2b_phys; /* Physical address of same */ u_short l2b_l1idx; /* This L2 table's L1 index */ u_short l2b_occupancy; /* How many active descriptors */ } l2_bucket[L2_BUCKET_SIZE]; }; /* pmap_kenter_internal flags */ #define KENTER_CACHE 0x1 #define KENTER_USER 0x2 /* * Given an L1 table index, calculate the corresponding l2_dtable index * and bucket index within the l2_dtable. */ #define L2_IDX(l1idx) (((l1idx) >> L2_BUCKET_LOG2) & \ (L2_SIZE - 1)) #define L2_BUCKET(l1idx) ((l1idx) & (L2_BUCKET_SIZE - 1)) /* * Given a virtual address, this macro returns the * virtual address required to drop into the next L2 bucket. */ #define L2_NEXT_BUCKET(va) (((va) & L1_S_FRAME) + L1_S_SIZE) /* * L2 allocation. 
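[Editor's note: illustrative sketch, not part of the diff.] The L1_IDX, L2_IDX and L2_BUCKET macros defined above slice a virtual address into the indices used to reach its l2_dtable and bucket. The same arithmetic stand-alone, with illustrative constants (1 MB sections, 16 L1 slots per l2_dtable; the real values come from the ARM pmap headers):

#include <stdio.h>

/* Illustrative constants: 1 MB L1 sections, 16 L1 slots per l2_dtable. */
#define X_L1_S_SHIFT     20
#define X_L2_BUCKET_LOG2 4
#define X_L2_BUCKET_SIZE (1 << X_L2_BUCKET_LOG2)
#define X_L2_SIZE        256

#define X_L1_IDX(va)       ((va) >> X_L1_S_SHIFT)
#define X_L2_IDX(l1idx)    (((l1idx) >> X_L2_BUCKET_LOG2) & (X_L2_SIZE - 1))
#define X_L2_BUCKET(l1idx) ((l1idx) & (X_L2_BUCKET_SIZE - 1))

int main(void)
{
	unsigned int va = 0xc1234567u;        /* some virtual address */
	unsigned int l1idx = X_L1_IDX(va);    /* which 1 MB slot in the L1 */

	printf("va %#x: l1idx %u, l2_dtable slot %u, bucket %u\n",
	    va, l1idx, X_L2_IDX(l1idx), X_L2_BUCKET(l1idx));
	return 0;
}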
*/ #define pmap_alloc_l2_dtable() \ (void*)uma_zalloc(l2table_zone, M_NOWAIT|M_USE_RESERVE) #define pmap_free_l2_dtable(l2) \ uma_zfree(l2table_zone, l2) /* * We try to map the page tables write-through, if possible. However, not * all CPUs have a write-through cache mode, so on those we have to sync * the cache when we frob page tables. * * We try to evaluate this at compile time, if possible. However, it's * not always possible to do that, hence this run-time var. */ int pmap_needs_pte_sync; /* * Macro to determine if a mapping might be resident in the * instruction cache and/or TLB */ #define PV_BEEN_EXECD(f) (((f) & (PVF_REF | PVF_EXEC)) == (PVF_REF | PVF_EXEC)) /* * Macro to determine if a mapping might be resident in the * data cache and/or TLB */ #define PV_BEEN_REFD(f) (((f) & PVF_REF) != 0) #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #define pmap_is_current(pm) ((pm) == pmap_kernel() || \ curproc->p_vmspace->vm_map.pmap == (pm)) static uma_zone_t pvzone; uma_zone_t l2zone; static uma_zone_t l2table_zone; static vm_offset_t pmap_kernel_l2dtable_kva; static vm_offset_t pmap_kernel_l2ptp_kva; static vm_paddr_t pmap_kernel_l2ptp_phys; static struct vm_object pvzone_obj; static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0; /* * This list exists for the benefit of pmap_map_chunk(). It keeps track * of the kernel L2 tables during bootstrap, so that pmap_map_chunk() can * find them as necessary. * * Note that the data on this list MUST remain valid after initarm() returns, * as pmap_bootstrap() uses it to contruct L2 table metadata. */ SLIST_HEAD(, pv_addr) kernel_pt_list = SLIST_HEAD_INITIALIZER(kernel_pt_list); static void pmap_init_l1(struct l1_ttable *l1, pd_entry_t *l1pt) { int i; l1->l1_kva = l1pt; l1->l1_domain_use_count = 0; l1->l1_domain_first = 0; for (i = 0; i < PMAP_DOMAINS; i++) l1->l1_domain_free[i] = i + 1; /* * Copy the kernel's L1 entries to each new L1. */ if (l1pt != pmap_kernel()->pm_l1->l1_kva) memcpy(l1pt, pmap_kernel()->pm_l1->l1_kva, L1_TABLE_SIZE); if ((l1->l1_physaddr = pmap_extract(pmap_kernel(), (vm_offset_t)l1pt)) == 0) panic("pmap_init_l1: can't get PA of L1 at %p", l1pt); SLIST_INSERT_HEAD(&l1_list, l1, l1_link); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); } static vm_offset_t kernel_pt_lookup(vm_paddr_t pa) { struct pv_addr *pv; SLIST_FOREACH(pv, &kernel_pt_list, pv_list) { if (pv->pv_pa == pa) return (pv->pv_va); } return (0); } #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 void pmap_pte_init_generic(void) { pte_l1_s_cache_mode = L1_S_B|L1_S_C; pte_l1_s_cache_mask = L1_S_CACHE_MASK_generic; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK_generic; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK_generic; /* * If we have a write-through cache, set B and C. If * we have a write-back cache, then we assume setting * only C will make those pages write-through. 
*/ if (cpufuncs.cf_dcache_wb_range == (void *) cpufunc_nullop) { pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; } else { pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } pte_l2_s_prot_u = L2_S_PROT_U_generic; pte_l2_s_prot_w = L2_S_PROT_W_generic; pte_l2_s_prot_mask = L2_S_PROT_MASK_generic; pte_l1_s_proto = L1_S_PROTO_generic; pte_l1_c_proto = L1_C_PROTO_generic; pte_l2_s_proto = L2_S_PROTO_generic; pmap_copy_page_func = pmap_copy_page_generic; pmap_zero_page_func = pmap_zero_page_generic; } #if defined(CPU_ARM8) void pmap_pte_init_arm8(void) { /* * ARM8 is compatible with generic, but we need to use * the page tables uncached. */ pmap_pte_init_generic(); pte_l1_s_cache_mode_pt = 0; pte_l2_l_cache_mode_pt = 0; pte_l2_s_cache_mode_pt = 0; } #endif /* CPU_ARM8 */ #if defined(CPU_ARM9) && defined(ARM9_CACHE_WRITE_THROUGH) void pmap_pte_init_arm9(void) { /* * ARM9 is compatible with generic, but we want to use * write-through caching for now. */ pmap_pte_init_generic(); pte_l1_s_cache_mode = L1_S_C; pte_l2_l_cache_mode = L2_C; pte_l2_s_cache_mode = L2_C; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } #endif /* CPU_ARM9 */ #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if defined(CPU_ARM10) void pmap_pte_init_arm10(void) { /* * ARM10 is compatible with generic, but we want to use * write-through caching for now. */ pmap_pte_init_generic(); pte_l1_s_cache_mode = L1_S_B | L1_S_C; pte_l2_l_cache_mode = L2_B | L2_C; pte_l2_s_cache_mode = L2_B | L2_C; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; } #endif /* CPU_ARM10 */ #if ARM_MMU_SA1 == 1 void pmap_pte_init_sa1(void) { /* * The StrongARM SA-1 cache does not have a write-through * mode. So, do the generic initialization, then reset * the page table cache mode to B=1,C=1, and note that * the PTEs need to be sync'd. */ pmap_pte_init_generic(); pte_l1_s_cache_mode_pt = L1_S_B|L1_S_C; pte_l2_l_cache_mode_pt = L2_B|L2_C; pte_l2_s_cache_mode_pt = L2_B|L2_C; pmap_needs_pte_sync = 1; } #endif /* ARM_MMU_SA1 == 1*/ #if ARM_MMU_XSCALE == 1 #if (ARM_NMMUS > 1) || defined (CPU_XSCALE_CORE3) static u_int xscale_use_minidata; #endif void pmap_pte_init_xscale(void) { uint32_t auxctl; int write_through = 0; pte_l1_s_cache_mode = L1_S_B|L1_S_C|L1_S_XSCALE_P; pte_l1_s_cache_mask = L1_S_CACHE_MASK_xscale; pte_l2_l_cache_mode = L2_B|L2_C; pte_l2_l_cache_mask = L2_L_CACHE_MASK_xscale; pte_l2_s_cache_mode = L2_B|L2_C; pte_l2_s_cache_mask = L2_S_CACHE_MASK_xscale; pte_l1_s_cache_mode_pt = L1_S_C; pte_l2_l_cache_mode_pt = L2_C; pte_l2_s_cache_mode_pt = L2_C; #ifdef XSCALE_CACHE_READ_WRITE_ALLOCATE /* * The XScale core has an enhanced mode where writes that * miss the cache cause a cache line to be allocated. This * is significantly faster than the traditional, write-through * behavior of this case. */ #ifndef CPU_XSCALE_CORE3 pte_l1_s_cache_mode |= L1_S_XSCALE_TEX(TEX_XSCALE_X); pte_l2_l_cache_mode |= L2_XSCALE_L_TEX(TEX_XSCALE_X); pte_l2_s_cache_mode |= L2_XSCALE_T_TEX(TEX_XSCALE_X); #endif #endif /* XSCALE_CACHE_READ_WRITE_ALLOCATE */ #ifdef XSCALE_CACHE_WRITE_THROUGH /* * Some versions of the XScale core have various bugs in * their cache units, the work-around for which is to run * the cache in write-through mode. Unfortunately, this * has a major (negative) impact on performance. 
So, we * go ahead and run fast-and-loose, in the hopes that we * don't line up the planets in a way that will trip the * bugs. * * However, we give you the option to be slow-but-correct. */ write_through = 1; #elif defined(XSCALE_CACHE_WRITE_BACK) /* force write back cache mode */ write_through = 0; #elif defined(CPU_XSCALE_PXA2X0) /* * Intel PXA2[15]0 processors are known to have a bug in * write-back cache on revision 4 and earlier (stepping * A[01] and B[012]). Fixed for C0 and later. */ { uint32_t id, type; id = cpufunc_id(); type = id & ~(CPU_ID_XSCALE_COREREV_MASK|CPU_ID_REVISION_MASK); if (type == CPU_ID_PXA250 || type == CPU_ID_PXA210) { if ((id & CPU_ID_REVISION_MASK) < 5) { /* write through for stepping A0-1 and B0-2 */ write_through = 1; } } } #endif /* XSCALE_CACHE_WRITE_THROUGH */ if (write_through) { pte_l1_s_cache_mode = L1_S_C; pte_l2_l_cache_mode = L2_C; pte_l2_s_cache_mode = L2_C; } #if (ARM_NMMUS > 1) xscale_use_minidata = 1; #endif pte_l2_s_prot_u = L2_S_PROT_U_xscale; pte_l2_s_prot_w = L2_S_PROT_W_xscale; pte_l2_s_prot_mask = L2_S_PROT_MASK_xscale; pte_l1_s_proto = L1_S_PROTO_xscale; pte_l1_c_proto = L1_C_PROTO_xscale; pte_l2_s_proto = L2_S_PROTO_xscale; #ifdef CPU_XSCALE_CORE3 pmap_copy_page_func = pmap_copy_page_generic; pmap_zero_page_func = pmap_zero_page_generic; xscale_use_minidata = 0; pte_l1_s_cache_mode_pt = pte_l2_l_cache_mode_pt = pte_l2_s_cache_mode_pt = 0; #else pmap_copy_page_func = pmap_copy_page_xscale; pmap_zero_page_func = pmap_zero_page_xscale; #endif /* * Disable ECC protection of page table access, for now. */ __asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl)); auxctl &= ~XSCALE_AUXCTL_P; __asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl)); } /* * xscale_setup_minidata: * * Set up the mini-data cache clean area. We require the * caller to allocate the right amount of physically and * virtually contiguous space. */ extern vm_offset_t xscale_minidata_clean_addr; extern vm_size_t xscale_minidata_clean_size; /* already initialized */ void xscale_setup_minidata(vm_offset_t l1pt, vm_offset_t va, vm_paddr_t pa) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t *pte; vm_size_t size; uint32_t auxctl; xscale_minidata_clean_addr = va; /* Round it to page size. */ size = (xscale_minidata_clean_size + L2_S_OFFSET) & L2_S_FRAME; for (; size != 0; va += L2_S_SIZE, pa += L2_S_SIZE, size -= L2_S_SIZE) { pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("xscale_setup_minidata: can't find L2 table for " "VA 0x%08x", (u_int32_t) va); pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); } /* * Configure the mini-data cache for write-back with * read/write-allocate. * * NOTE: In order to reconfigure the mini-data cache, we must * make sure it contains no valid data! In order to do that, * we must issue a global data cache invalidate command! * * WE ASSUME WE ARE RUNNING UN-CACHED WHEN THIS ROUTINE IS CALLED! * THIS IS VERY IMPORTANT! */ /* Invalidate data and mini-data. */ __asm __volatile("mcr p15, 0, %0, c7, c6, 0" : : "r" (0)); __asm __volatile("mrc p15, 0, %0, c1, c0, 1" : "=r" (auxctl)); auxctl = (auxctl & ~XSCALE_AUXCTL_MD_MASK) | XSCALE_AUXCTL_MD_WB_RWA; __asm __volatile("mcr p15, 0, %0, c1, c0, 1" : : "r" (auxctl)); } #endif /* * Allocate an L1 translation table for the specified pmap. * This is called at pmap creation time. 
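[Editor's note: illustrative sketch, not part of the diff.] The PXA2x0 check earlier in this block masks the revision bits out of the CPU ID register, compares the remaining type field, and enables the write-through workaround only for early steppings. The field-masking pattern with made-up masks and values:

#include <stdio.h>

#define X_REVISION_MASK 0x0000000fu   /* made-up field layout */
#define X_COREREV_MASK  0x000000f0u
#define X_ID_PXA250     0x69052100u   /* illustrative type value */

int main(void)
{
	unsigned int id = 0x69052103u;    /* pretend: early PXA250 stepping */
	unsigned int type = id & ~(X_COREREV_MASK | X_REVISION_MASK);
	int write_through = 0;

	if (type == X_ID_PXA250 && (id & X_REVISION_MASK) < 5)
		write_through = 1;        /* early stepping: use the workaround */

	printf("type %#x rev %u -> write_through=%d\n",
	    type, id & X_REVISION_MASK, write_through);
	return 0;
}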
*/ static void pmap_alloc_l1(pmap_t pm) { struct l1_ttable *l1; u_int8_t domain; /* * Remove the L1 at the head of the LRU list */ mtx_lock(&l1_lru_lock); l1 = TAILQ_FIRST(&l1_lru_list); TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Pick the first available domain number, and update * the link to the next number. */ domain = l1->l1_domain_first; l1->l1_domain_first = l1->l1_domain_free[domain]; /* * If there are still free domain numbers in this L1, * put it back on the TAIL of the LRU list. */ if (++l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); /* * Fix up the relevant bits in the pmap structure */ pm->pm_l1 = l1; pm->pm_domain = domain; } /* * Free an L1 translation table. * This is called at pmap destruction time. */ static void pmap_free_l1(pmap_t pm) { struct l1_ttable *l1 = pm->pm_l1; mtx_lock(&l1_lru_lock); /* * If this L1 is currently on the LRU list, remove it. */ if (l1->l1_domain_use_count < PMAP_DOMAINS) TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); /* * Free up the domain number which was allocated to the pmap */ l1->l1_domain_free[pm->pm_domain] = l1->l1_domain_first; l1->l1_domain_first = pm->pm_domain; l1->l1_domain_use_count--; /* * The L1 now must have at least 1 free domain, so add * it back to the LRU list. If the use count is zero, * put it at the head of the list, otherwise it goes * to the tail. */ if (l1->l1_domain_use_count == 0) { TAILQ_INSERT_HEAD(&l1_lru_list, l1, l1_lru); } else TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } static PMAP_INLINE void pmap_use_l1(pmap_t pm) { struct l1_ttable *l1; /* * Do nothing if we're in interrupt context. * Access to an L1 by the kernel pmap must not affect * the LRU list. */ if (pm == pmap_kernel()) return; l1 = pm->pm_l1; /* * If the L1 is not currently on the LRU list, just return */ if (l1->l1_domain_use_count == PMAP_DOMAINS) return; mtx_lock(&l1_lru_lock); /* * Check the use count again, now that we've acquired the lock */ if (l1->l1_domain_use_count == PMAP_DOMAINS) { mtx_unlock(&l1_lru_lock); return; } /* * Move the L1 to the back of the LRU list */ TAILQ_REMOVE(&l1_lru_list, l1, l1_lru); TAILQ_INSERT_TAIL(&l1_lru_list, l1, l1_lru); mtx_unlock(&l1_lru_lock); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA, or NULL if no L2 bucket exists for the address. */ static PMAP_INLINE struct l2_bucket * pmap_get_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL || (l2b = &l2->l2_bucket[L2_BUCKET(l1idx)])->l2b_kva == NULL) return (NULL); return (l2b); } /* * Returns a pointer to the L2 bucket associated with the specified pmap * and VA. * * If no L2 bucket exists, perform the necessary allocations to put an L2 * bucket/page table in place. * * Note that if a new L2 bucket/page was allocated, the caller *must* * increment the bucket occupancy counter appropriately *before* * releasing the pmap's lock to ensure no other thread or cpu deallocates * the bucket/page in the meantime. */ static struct l2_bucket * pmap_alloc_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; u_short l1idx; l1idx = L1_IDX(va); PMAP_ASSERT_LOCKED(pm); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. 
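[Editor's note: illustrative sketch, not part of the diff.] pmap_alloc_l1 and pmap_free_l1, above, hand out ARM domain numbers from a free list embedded in each L1: l1_domain_first names the next free number and each l1_domain_free[] slot names the one after it. A stand-alone sketch of that allocator (the kernel additionally bounds it with l1_domain_use_count):

#include <stdio.h>

#define X_NDOMAINS 15   /* illustrative stand-in for PMAP_DOMAINS */

static unsigned char dom_first;
static unsigned char dom_free[X_NDOMAINS];

static void dom_init(void)
{
	dom_first = 0;
	for (int i = 0; i < X_NDOMAINS; i++)
		dom_free[i] = i + 1;     /* each slot names the next free number */
}

/* Caller must not allocate more than X_NDOMAINS numbers at once. */
static int dom_alloc(void)
{
	int d = dom_first;
	dom_first = dom_free[d];         /* unlink the head of the free list */
	return d;
}

static void dom_release(int d)
{
	dom_free[d] = dom_first;         /* push back on the front */
	dom_first = d;
}

int main(void)
{
	dom_init();
	int a = dom_alloc(), b = dom_alloc();
	dom_release(a);
	printf("got %d and %d, then %d again after releasing %d\n",
	    a, b, dom_alloc(), a);
	return 0;
}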
*/ again_l2table: PMAP_UNLOCK(pm); vm_page_unlock_queues(); if ((l2 = pmap_alloc_l2_dtable()) == NULL) { vm_page_lock_queues(); PMAP_LOCK(pm); return (NULL); } vm_page_lock_queues(); PMAP_LOCK(pm); if (pm->pm_l2[L2_IDX(l1idx)] != NULL) { PMAP_UNLOCK(pm); vm_page_unlock_queues(); uma_zfree(l2table_zone, l2); vm_page_lock_queues(); PMAP_LOCK(pm); l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL) goto again_l2table; /* * Someone already allocated the l2_dtable while * we were doing the same. */ } else { bzero(l2, sizeof(*l2)); /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; } } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above. */ again_ptep: PMAP_UNLOCK(pm); vm_page_unlock_queues(); ptep = (void*)uma_zalloc(l2zone, M_NOWAIT|M_USE_RESERVE); vm_page_lock_queues(); PMAP_LOCK(pm); if (l2b->l2b_kva != 0) { /* We lost the race. */ PMAP_UNLOCK(pm); vm_page_unlock_queues(); uma_zfree(l2zone, ptep); vm_page_lock_queues(); PMAP_LOCK(pm); if (l2b->l2b_kva == 0) goto again_ptep; return (l2b); } l2b->l2b_phys = vtophys(ptep); if (ptep == NULL) { /* * Oops, no more L2 page tables available at this * time. We may need to deallocate the l2_dtable * if we allocated a new one above. */ if (l2->l2_occupancy == 0) { pm->pm_l2[L2_IDX(l1idx)] = NULL; pmap_free_l2_dtable(l2); } return (NULL); } l2->l2_occupancy++; l2b->l2b_kva = ptep; l2b->l2b_l1idx = l1idx; } return (l2b); } static PMAP_INLINE void #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(pt_entry_t *l2) #else pmap_free_l2_ptp(boolean_t need_sync, pt_entry_t *l2) #endif { #ifdef PMAP_INCLUDE_PTE_SYNC /* * Note: With a write-back cache, we may need to sync this * L2 table before re-using it. * This is because it may have belonged to a non-current * pmap, in which case the cache syncs would have been * skipped when the pages were being unmapped. If the * L2 table were then to be immediately re-allocated to * the *current* pmap, it may well contain stale mappings * which have not yet been cleared by a cache write-back * and so would still be visible to the mmu. */ if (need_sync) PTE_SYNC_RANGE(l2, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); #endif uma_zfree(l2zone, l2); } /* * One or more mappings in the specified L2 descriptor table have just been * invalidated. * * Garbage collect the metadata and descriptor table itself if necessary. * * The pmap lock must be acquired when this is called (not necessary * for the kernel pmap). */ static void pmap_free_l2_bucket(pmap_t pm, struct l2_bucket *l2b, u_int count) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; /* * Update the bucket's reference count according to how many * PTEs the caller has just invalidated. */ l2b->l2b_occupancy -= count; /* * Note: * * Level 2 page tables allocated to the kernel pmap are never freed * as that would require checking all Level 1 page tables and * removing any references to the Level 2 page table. See also the * comment elsewhere about never freeing bootstrap L2 descriptors. * * We make do with just invalidating the mapping in the L2 table. * * This isn't really a big deal in practice and, in fact, leads * to a performance win over time as we don't need to continually * alloc/free. */ if (l2b->l2b_occupancy > 0 || pm == pmap_kernel()) return; /* * There are no more valid mappings in this level 2 page table. 
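[Editor's note: illustrative sketch, not part of the diff.] pmap_alloc_l2_bucket, above, must drop its locks around the zone allocation and then re-check whether another thread installed the table first, discarding its own copy if it lost the race. A reduced sketch of that drop-allocate-recheck pattern using a pthread mutex:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
static void *l2_table;                /* protected by pm_lock */

static void *get_l2_table(void)
{
	void *p;

	pthread_mutex_lock(&pm_lock);
	if (l2_table == NULL) {
		/* The allocation may sleep, so drop the lock around it. */
		pthread_mutex_unlock(&pm_lock);
		p = calloc(1, 1024);      /* error handling omitted in this sketch */
		pthread_mutex_lock(&pm_lock);
		if (l2_table == NULL)
			l2_table = p;     /* we won the race */
		else
			free(p);          /* lost the race: discard our copy */
	}
	p = l2_table;
	pthread_mutex_unlock(&pm_lock);
	return p;
}

int main(void)
{
	printf("l2 table at %p\n", get_l2_table());
	return 0;
}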
* Go ahead and NULL-out the pointer in the bucket, then * free the page table. */ l1idx = l2b->l2b_l1idx; ptep = l2b->l2b_kva; l2b->l2b_kva = NULL; pl1pd = &pm->pm_l1->l1_kva[l1idx]; /* * If the L1 slot matches the pmap's domain * number, then invalidate it. */ l1pd = *pl1pd & (L1_TYPE_MASK | L1_C_DOM_MASK); if (l1pd == (L1_C_DOM(pm->pm_domain) | L1_TYPE_C)) { *pl1pd = 0; PTE_SYNC(pl1pd); } /* * Release the L2 descriptor table back to the pool cache. */ #ifndef PMAP_INCLUDE_PTE_SYNC pmap_free_l2_ptp(ptep); #else pmap_free_l2_ptp(!pmap_is_current(pm), ptep); #endif /* * Update the reference count in the associated l2_dtable */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (--l2->l2_occupancy > 0) return; /* * There are no more valid mappings in any of the Level 1 * slots managed by this l2_dtable. Go ahead and NULL-out * the pointer in the parent pmap and free the l2_dtable. */ pm->pm_l2[L2_IDX(l1idx)] = NULL; pmap_free_l2_dtable(l2); } /* * Pool cache constructors for L2 descriptor tables, metadata and pmap * structures. */ static int pmap_l2ptp_ctor(void *mem, int size, void *arg, int flags) { #ifndef PMAP_INCLUDE_PTE_SYNC struct l2_bucket *l2b; pt_entry_t *ptep, pte; #ifdef ARM_USE_SMALL_ALLOC pd_entry_t *pde; #endif vm_offset_t va = (vm_offset_t)mem & ~PAGE_MASK; /* * The mappings for these page tables were initially made using * pmap_kenter() by the pool subsystem. Therefore, the cache- * mode will not be right for page table mappings. To avoid * polluting the pmap_kenter() code with a special case for * page tables, we simply fix up the cache-mode here if it's not * correct. */ #ifdef ARM_USE_SMALL_ALLOC pde = &kernel_pmap->pm_l1->l1_kva[L1_IDX(va)]; if (!l1pte_section_p(*pde)) { #endif l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { /* * Page tables must have the cache-mode set to * Write-Thru. */ *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); cpu_cpwait(); } #ifdef ARM_USE_SMALL_ALLOC } #endif #endif memset(mem, 0, L2_TABLE_SIZE_REAL); PTE_SYNC_RANGE(mem, L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); return (0); } /* * A bunch of routines to conditionally flush the caches/TLB depending * on whether the specified pmap actually needs to be flushed at any * given time. */ static PMAP_INLINE void pmap_tlb_flushID_SE(pmap_t pm, vm_offset_t va) { if (pmap_is_current(pm)) cpu_tlb_flushID_SE(va); } static PMAP_INLINE void pmap_tlb_flushD_SE(pmap_t pm, vm_offset_t va) { if (pmap_is_current(pm)) cpu_tlb_flushD_SE(va); } static PMAP_INLINE void pmap_tlb_flushID(pmap_t pm) { if (pmap_is_current(pm)) cpu_tlb_flushID(); } static PMAP_INLINE void pmap_tlb_flushD(pmap_t pm) { if (pmap_is_current(pm)) cpu_tlb_flushD(); } static PMAP_INLINE void pmap_idcache_wbinv_range(pmap_t pm, vm_offset_t va, vm_size_t len) { if (pmap_is_current(pm)) cpu_idcache_wbinv_range(va, len); } static PMAP_INLINE void pmap_dcache_wb_range(pmap_t pm, vm_offset_t va, vm_size_t len, boolean_t do_inv, boolean_t rd_only) { if (pmap_is_current(pm)) { if (do_inv) { if (rd_only) cpu_dcache_inv_range(va, len); else cpu_dcache_wbinv_range(va, len); } else if (!rd_only) cpu_dcache_wb_range(va, len); } } static PMAP_INLINE void pmap_idcache_wbinv_all(pmap_t pm) { if (pmap_is_current(pm)) cpu_idcache_wbinv_all(); } static PMAP_INLINE void pmap_dcache_wbinv_all(pmap_t pm) { if (pmap_is_current(pm)) cpu_dcache_wbinv_all(); } /* * PTE_SYNC_CURRENT: * * Make sure the pte is written out to RAM. 
* We need to do this for one of two cases: * - We're dealing with the kernel pmap * - There is no pmap active in the cache/tlb. * - The specified pmap is 'active' in the cache/tlb. */ #ifdef PMAP_INCLUDE_PTE_SYNC #define PTE_SYNC_CURRENT(pm, ptep) \ do { \ if (PMAP_NEEDS_PTE_SYNC && \ pmap_is_current(pm)) \ PTE_SYNC(ptep); \ } while (/*CONSTCOND*/0) #else #define PTE_SYNC_CURRENT(pm, ptep) /* nothing */ #endif /* * Since we have a virtually indexed cache, we may need to inhibit caching if * there is more than one mapping and at least one of them is writable. * Since we purge the cache on every context switch, we only need to check for * other mappings within the same pmap, or kernel_pmap. * This function is also called when a page is unmapped, to possibly reenable * caching on any remaining mappings. * * The code implements the following logic, where: * * KW = # of kernel read/write pages * KR = # of kernel read only pages * UW = # of user read/write pages * UR = # of user read only pages * * KC = kernel mapping is cacheable * UC = user mapping is cacheable * * KW=0,KR=0 KW=0,KR>0 KW=1,KR=0 KW>1,KR>=0 * +--------------------------------------------- * UW=0,UR=0 | --- KC=1 KC=1 KC=0 * UW=0,UR>0 | UC=1 KC=1,UC=1 KC=0,UC=0 KC=0,UC=0 * UW=1,UR=0 | UC=1 KC=0,UC=0 KC=0,UC=0 KC=0,UC=0 * UW>1,UR>=0 | UC=0 KC=0,UC=0 KC=0,UC=0 KC=0,UC=0 */ static const int pmap_vac_flags[4][4] = { {-1, 0, 0, PVF_KNC}, {0, 0, PVF_NC, PVF_NC}, {0, PVF_NC, PVF_NC, PVF_NC}, {PVF_UNC, PVF_NC, PVF_NC, PVF_NC} }; static PMAP_INLINE int pmap_get_vac_flags(const struct vm_page *pg) { int kidx, uidx; kidx = 0; if (pg->md.kro_mappings || pg->md.krw_mappings > 1) kidx |= 1; if (pg->md.krw_mappings) kidx |= 2; uidx = 0; if (pg->md.uro_mappings || pg->md.urw_mappings > 1) uidx |= 1; if (pg->md.urw_mappings) uidx |= 2; return (pmap_vac_flags[uidx][kidx]); } static __inline void pmap_vac_me_harder(struct vm_page *pg, pmap_t pm, vm_offset_t va) { int nattr; mtx_assert(&vm_page_queue_mtx, MA_OWNED); nattr = pmap_get_vac_flags(pg); if (nattr < 0) { pg->md.pvh_attrs &= ~PVF_NC; return; } if (nattr == 0 && (pg->md.pvh_attrs & PVF_NC) == 0) { return; } if (pm == pmap_kernel()) pmap_vac_me_kpmap(pg, pm, va); else pmap_vac_me_user(pg, pm, va); pg->md.pvh_attrs = (pg->md.pvh_attrs & ~PVF_NC) | nattr; } static void pmap_vac_me_kpmap(struct vm_page *pg, pmap_t pm, vm_offset_t va) { u_int u_cacheable, u_entries; struct pv_entry *pv; pmap_t last_pmap = pm; /* * Pass one, see if there are both kernel and user pmaps for * this page. Calculate whether there are user-writable or * kernel-writable pages. */ u_cacheable = 0; TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { if (pv->pv_pmap != pm && (pv->pv_flags & PVF_NC) == 0) u_cacheable++; } u_entries = pg->md.urw_mappings + pg->md.uro_mappings; /* * We know we have just been updating a kernel entry, so if * all user pages are already cacheable, then there is nothing * further to do. */ if (pg->md.k_mappings == 0 && u_cacheable == u_entries) return; if (u_entries) { /* * Scan over the list again, for each entry, if it * might not be set correctly, call pmap_vac_me_user * to recalculate the settings. */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* * We know kernel mappings will get set * correctly in other calls. We also know * that if the pmap is the same as last_pmap * then we've just handled this entry. */ if (pv->pv_pmap == pm || pv->pv_pmap == last_pmap) continue; /* * If there are kernel entries and this page * is writable but non-cacheable, then we can * skip this entry also. 
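 *
 * For illustration (worked example, not part of the original comment):
 * with one kernel r/w mapping and one user r/o mapping of a page,
 * pmap_get_vac_flags() above computes
 *
 *	kidx = 2;	(krw_mappings != 0)
 *	uidx = 1;	(uro_mappings != 0, urw_mappings == 0)
 *	pmap_vac_flags[1][2] == PVF_NC
 *
 * which matches the "UW=0,UR>0 / KW=1,KR=0" cell of the table above:
 * both the kernel and the user mapping are made non-cacheable.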
*/ if (pg->md.k_mappings && (pv->pv_flags & (PVF_NC | PVF_WRITE)) == (PVF_NC | PVF_WRITE)) continue; /* * Similarly if there are no kernel-writable * entries and the page is already * read-only/cacheable. */ if (pg->md.krw_mappings == 0 && (pv->pv_flags & (PVF_NC | PVF_WRITE)) == 0) continue; /* * For some of the remaining cases, we know * that we must recalculate, but for others we * can't tell if they are correct or not, so * we recalculate anyway. */ pmap_vac_me_user(pg, (last_pmap = pv->pv_pmap), 0); } if (pg->md.k_mappings == 0) return; } pmap_vac_me_user(pg, pm, va); } static void pmap_vac_me_user(struct vm_page *pg, pmap_t pm, vm_offset_t va) { pmap_t kpmap = pmap_kernel(); struct pv_entry *pv, *npv; struct l2_bucket *l2b; pt_entry_t *ptep, pte; u_int entries = 0; u_int writable = 0; u_int cacheable_entries = 0; u_int kern_cacheable = 0; u_int other_writable = 0; /* * Count mappings and writable mappings in this pmap. * Include kernel mappings as part of our own. * Keep a pointer to the first one. */ npv = TAILQ_FIRST(&pg->md.pv_list); TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { /* Count mappings in the same pmap */ if (pm == pv->pv_pmap || kpmap == pv->pv_pmap) { if (entries++ == 0) npv = pv; /* Cacheable mappings */ if ((pv->pv_flags & PVF_NC) == 0) { cacheable_entries++; if (kpmap == pv->pv_pmap) kern_cacheable++; } /* Writable mappings */ if (pv->pv_flags & PVF_WRITE) ++writable; } else if (pv->pv_flags & PVF_WRITE) other_writable = 1; } /* * Enable or disable caching as necessary. * Note: the first entry might be part of the kernel pmap, * so we can't assume this is indicative of the state of the * other (maybe non-kpmap) entries. */ if ((entries > 1 && writable) || (entries > 0 && pm == kpmap && other_writable)) { if (cacheable_entries == 0) return; for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) { if ((pm != pv->pv_pmap && kpmap != pv->pv_pmap) || (pv->pv_flags & PVF_NC)) continue; pv->pv_flags |= PVF_NC; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = *ptep & ~L2_S_CACHE_MASK; if ((va != pv->pv_va || pm != pv->pv_pmap) && l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_idcache_wbinv_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE); pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, (pv->pv_flags & PVF_WRITE) == 0); pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } cpu_cpwait(); } else if (entries > cacheable_entries) { /* * Turn cacheing back on for some pages. If it is a kernel * page, only do so if there are no other writable pages. */ for (pv = npv; pv; pv = TAILQ_NEXT(pv, pv_list)) { if (!(pv->pv_flags & PVF_NC) || (pm != pv->pv_pmap && (kpmap != pv->pv_pmap || other_writable))) continue; pv->pv_flags &= ~PVF_NC; l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = (*ptep & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode; if (l2pte_valid(pte)) { if (PV_BEEN_EXECD(pv->pv_flags)) { pmap_tlb_flushID_SE(pv->pv_pmap, pv->pv_va); } else if (PV_BEEN_REFD(pv->pv_flags)) { pmap_tlb_flushD_SE(pv->pv_pmap, pv->pv_va); } } *ptep = pte; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); } } } /* * Modify pte bits for all ptes corresponding to the given physical address. * We use `maskbits' rather than `clearbits' because we're always passing * constants and the latter would require an extra inversion at run-time. 
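 *
 * Hypothetical callers (not shown in this file) would pass the bit(s)
 * to clear directly, for example:
 *
 *	pmap_clearbit(pg, PVF_MOD);	/* clear the modified attribute   */
 *	pmap_clearbit(pg, PVF_REF);	/* clear the referenced attribute */
 *	pmap_clearbit(pg, PVF_WRITE);	/* revoke write permission        */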
*/ static int pmap_clearbit(struct vm_page *pg, u_int maskbits) { struct l2_bucket *l2b; struct pv_entry *pv; pt_entry_t *ptep, npte, opte; pmap_t pm; vm_offset_t va; u_int oflags; int count = 0; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Clear saved attributes (modify, reference) */ pg->md.pvh_attrs &= ~(maskbits & (PVF_MOD | PVF_REF)); if (TAILQ_EMPTY(&pg->md.pv_list)) { return (0); } /* * Loop over all current mappings setting/clearing as appropos */ TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) { va = pv->pv_va; pm = pv->pv_pmap; oflags = pv->pv_flags; pv->pv_flags &= ~maskbits; PMAP_LOCK(pm); l2b = pmap_get_l2_bucket(pm, va); ptep = &l2b->l2b_kva[l2pte_index(va)]; npte = opte = *ptep; if (maskbits & (PVF_WRITE|PVF_MOD)) { if ((pv->pv_flags & PVF_NC)) { /* * Entry is not cacheable: * * Don't turn caching on again if this is a * modified emulation. This would be * inconsitent with the settings created by * pmap_vac_me_harder(). Otherwise, it's safe * to re-enable cacheing. * * There's no need to call pmap_vac_me_harder() * here: all pages are losing their write * permission. */ if (maskbits & PVF_WRITE) { npte |= pte_l2_s_cache_mode; pv->pv_flags &= ~PVF_NC; } } else if (opte & L2_S_PROT_W) { vm_page_dirty(pg); /* * Entry is writable/cacheable: check if pmap * is current if it is flush it, otherwise it * won't be in the cache */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, (maskbits & PVF_REF) ? TRUE : FALSE, FALSE); } /* make the pte read only */ npte &= ~L2_S_PROT_W; if (maskbits & PVF_WRITE) { /* * Keep alias accounting up to date */ if (pv->pv_pmap == pmap_kernel()) { if (oflags & PVF_WRITE) { pg->md.krw_mappings--; pg->md.kro_mappings++; } } else if (oflags & PVF_WRITE) { pg->md.urw_mappings--; pg->md.uro_mappings++; } } } if (maskbits & PVF_REF) { if ((pv->pv_flags & PVF_NC) == 0 && (maskbits & (PVF_WRITE|PVF_MOD)) == 0) { /* * Check npte here; we may have already * done the wbinv above, and the validity * of the PTE is the same for opte and * npte. */ if (npte & L2_S_PROT_W) { if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, FALSE); } else if ((npte & L2_TYPE_MASK) != L2_TYPE_INV) { /* XXXJRT need idcache_inv_range */ if (PV_BEEN_EXECD(oflags)) pmap_idcache_wbinv_range(pm, pv->pv_va, PAGE_SIZE); else if (PV_BEEN_REFD(oflags)) pmap_dcache_wb_range(pm, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } } /* * Make the PTE invalid so that we will take a * page fault the next time the mapping is * referenced. */ npte &= ~L2_TYPE_MASK; npte |= L2_TYPE_INV; } if (npte != opte) { count++; *ptep = npte; PTE_SYNC(ptep); /* Flush the TLB entry if a current pmap. 
*/ if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pm, pv->pv_va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pm, pv->pv_va); } PMAP_UNLOCK(pm); } if (maskbits & PVF_WRITE) vm_page_flag_clear(pg, PG_WRITEABLE); return (count); } /* * main pv_entry manipulation functions: * pmap_enter_pv: enter a mapping onto a vm_page list * pmap_remove_pv: remove a mappiing from a vm_page list * * NOTE: pmap_enter_pv expects to lock the pvh itself * pmap_remove_pv expects te caller to lock the pvh before calling */ /* * pmap_enter_pv: enter a mapping onto a vm_page lst * * => caller should hold the proper lock on pmap_main_lock * => caller should have pmap locked * => we will gain the lock on the vm_page and allocate the new pv_entry * => caller should adjust ptp's wire_count before calling * => caller should not adjust pmap's wire_count */ static void pmap_enter_pv(struct vm_page *pg, struct pv_entry *pve, pmap_t pm, vm_offset_t va, u_int flags) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_ASSERT_LOCKED(pm); pve->pv_pmap = pm; pve->pv_va = va; pve->pv_flags = flags; TAILQ_INSERT_HEAD(&pg->md.pv_list, pve, pv_list); TAILQ_INSERT_HEAD(&pm->pm_pvlist, pve, pv_plist); pg->md.pvh_attrs |= flags & (PVF_REF | PVF_MOD); if (pm == pmap_kernel()) { if (flags & PVF_WRITE) pg->md.krw_mappings++; else pg->md.kro_mappings++; } if (flags & PVF_WRITE) pg->md.urw_mappings++; else pg->md.uro_mappings++; pg->md.pv_list_count++; if (pve->pv_flags & PVF_WIRED) ++pm->pm_stats.wired_count; vm_page_flag_set(pg, PG_REFERENCED); } /* * * pmap_find_pv: Find a pv entry * * => caller should hold lock on vm_page */ static PMAP_INLINE struct pv_entry * pmap_find_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pv; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &pg->md.pv_list, pv_list) if (pm == pv->pv_pmap && va == pv->pv_va) break; return (pv); } /* * vector_page_setprot: * * Manipulate the protection of the vector page. 
*/ void vector_page_setprot(int prot) { struct l2_bucket *l2b; pt_entry_t *ptep; l2b = pmap_get_l2_bucket(pmap_kernel(), vector_page); ptep = &l2b->l2b_kva[l2pte_index(vector_page)]; *ptep = (*ptep & ~L1_S_PROT_MASK) | L2_S_PROT(PTE_KERNEL, prot); PTE_SYNC(ptep); cpu_tlb_flushD_SE(vector_page); cpu_cpwait(); } /* * pmap_remove_pv: try to remove a mapping from a pv_list * * => caller should hold proper lock on pmap_main_lock * => pmap should be locked * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should adjust ptp's wire_count and free PTP if needed * => caller should NOT adjust pmap's wire_count * => we return the removed pve */ static void pmap_nuke_pv(struct vm_page *pg, pmap_t pm, struct pv_entry *pve) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_ASSERT_LOCKED(pm); TAILQ_REMOVE(&pg->md.pv_list, pve, pv_list); TAILQ_REMOVE(&pm->pm_pvlist, pve, pv_plist); if (pve->pv_flags & PVF_WIRED) --pm->pm_stats.wired_count; pg->md.pv_list_count--; if (pg->md.pvh_attrs & PVF_MOD) vm_page_dirty(pg); if (pm == pmap_kernel()) { if (pve->pv_flags & PVF_WRITE) pg->md.krw_mappings--; else pg->md.kro_mappings--; } else if (pve->pv_flags & PVF_WRITE) pg->md.urw_mappings--; else pg->md.uro_mappings--; if (TAILQ_FIRST(&pg->md.pv_list) == NULL || (pg->md.krw_mappings == 0 && pg->md.urw_mappings == 0)) { pg->md.pvh_attrs &= ~PVF_MOD; if (TAILQ_FIRST(&pg->md.pv_list) == NULL) pg->md.pvh_attrs &= ~PVF_REF; vm_page_flag_clear(pg, PG_WRITEABLE); } if (TAILQ_FIRST(&pg->md.pv_list)) vm_page_flag_set(pg, PG_REFERENCED); if (pve->pv_flags & PVF_WRITE) pmap_vac_me_harder(pg, pm, 0); } static struct pv_entry * pmap_remove_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va) { struct pv_entry *pve; mtx_assert(&vm_page_queue_mtx, MA_OWNED); pve = TAILQ_FIRST(&pg->md.pv_list); while (pve) { if (pve->pv_pmap == pm && pve->pv_va == va) { /* match? */ pmap_nuke_pv(pg, pm, pve); break; } pve = TAILQ_NEXT(pve, pv_list); } return(pve); /* return removed pve */ } /* * * pmap_modify_pv: Update pv flags * * => caller should hold lock on vm_page [so that attrs can be adjusted] * => caller should NOT adjust pmap's wire_count * => caller must call pmap_vac_me_harder() if writable status of a page * may have changed. * => we return the old flags * * Modify a physical-virtual mapping in the pv table */ static u_int pmap_modify_pv(struct vm_page *pg, pmap_t pm, vm_offset_t va, u_int clr_mask, u_int set_mask) { struct pv_entry *npv; u_int flags, oflags; PMAP_ASSERT_LOCKED(pm); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((npv = pmap_find_pv(pg, pm, va)) == NULL) return (0); /* * There is at least one VA mapping this page. 
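 *
 * The update below boils down to (sketch of the code that follows):
 *
 *	flags = (oflags & ~clr_mask) | set_mask;
 *
 * where a PVF_WIRED transition adjusts pm->pm_stats.wired_count and a
 * PVF_WRITE transition moves the page between the krw/kro (kernel) or
 * urw/uro (user) mapping counters.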
*/ if (clr_mask & (PVF_REF | PVF_MOD)) pg->md.pvh_attrs |= set_mask & (PVF_REF | PVF_MOD); oflags = npv->pv_flags; npv->pv_flags = flags = (oflags & ~clr_mask) | set_mask; if ((flags ^ oflags) & PVF_WIRED) { if (flags & PVF_WIRED) ++pm->pm_stats.wired_count; else --pm->pm_stats.wired_count; } if ((flags ^ oflags) & PVF_WRITE) { if (pm == pmap_kernel()) { if (flags & PVF_WRITE) { pg->md.krw_mappings++; pg->md.kro_mappings--; } else { pg->md.kro_mappings++; pg->md.krw_mappings--; } } else if (flags & PVF_WRITE) { pg->md.urw_mappings++; pg->md.uro_mappings--; } else { pg->md.uro_mappings++; pg->md.urw_mappings--; } if (pg->md.krw_mappings == 0 && pg->md.urw_mappings == 0) { pg->md.pvh_attrs &= ~PVF_MOD; vm_page_flag_clear(pg, PG_WRITEABLE); } pmap_vac_me_harder(pg, pm, 0); } return (oflags); } /* Function to set the debug level of the pmap code */ #ifdef PMAP_DEBUG void pmap_debug(int level) { pmap_debug_level = level; dprintf("pmap_debug: level=%d\n", pmap_debug_level); } #endif /* PMAP_DEBUG */ void pmap_pinit0(struct pmap *pmap) { PDEBUG(1, printf("pmap_pinit0: pmap = %08x\n", (u_int32_t) pmap)); dprintf("pmap_pinit0: pmap = %08x, pm_pdir = %08x\n", (u_int32_t) pmap, (u_int32_t) pmap->pm_pdir); bcopy(kernel_pmap, pmap, sizeof(*pmap)); bzero(&pmap->pm_mtx, sizeof(pmap->pm_mtx)); PMAP_LOCK_INIT(pmap); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { int shpgperproc = PMAP_SHPGPERPROC; PDEBUG(1, printf("pmap_init: phys_start = %08x\n")); /* * init the pv free list */ pvzone = uma_zcreate("PV ENTRY", sizeof (struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); /* * Now it is safe to enable pv_table recording. */ PDEBUG(1, printf("pmap_init: done!\n")); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_max = shpgperproc * maxproc + VMCNT_GET(page_count); pv_entry_high_water = 9 * (pv_entry_max / 10); l2zone = uma_zcreate("L2 Table", L2_TABLE_SIZE_REAL, pmap_l2ptp_ctor, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); l2table_zone = uma_zcreate("L2 Table", sizeof(struct l2_dtable), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); } int pmap_fault_fixup(pmap_t pm, vm_offset_t va, vm_prot_t ftype, int user) { struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; int rv = 0; l1idx = L1_IDX(va); vm_page_lock_queues(); PMAP_LOCK(pm); /* * If there is no l2_dtable for this address, then the process * has no business accessing it. * * Note: This will catch userland processes trying to access * kernel addresses. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL) goto out; /* * Likewise if there is no L2 descriptor table */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; if (l2b->l2b_kva == NULL) goto out; /* * Check the PTE itself. 
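 *
 * In outline, the two emulation cases handled below are:
 *
 *	write fault, PTE lacks L2_S_PROT_W, pv entry has PVF_WRITE
 *		-> "modified" emulation: set PVF_REF|PVF_MOD, dirty the
 *		   page and rewrite the PTE as a valid small page
 *		   (L2_S_PROTO) with L2_S_PROT_W set
 *	any fault, PTE type is L2_TYPE_INV
 *		-> "referenced" emulation: set PVF_REF and rewrite the
 *		   PTE as a valid small page (L2_S_PROTO)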
*/ ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; if (pte == 0) goto out; /* * Catch a userland access to the vector page mapped at 0x0 */ if (user && (pte & L2_S_PROT_U) == 0) goto out; if (va == vector_page) goto out; pa = l2pte_pa(pte); if ((ftype & VM_PROT_WRITE) && (pte & L2_S_PROT_W) == 0) { /* * This looks like a good candidate for "page modified" * emulation... */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) { goto out; } /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) { goto out; } /* * Do the flags say this page is writable? If not then it * is a genuine write fault. If yes then the write fault is * our fault as we did not reflect the write access in the * PTE. Now we know a write has occurred we can correct this * and also set the modified bit */ if ((pv->pv_flags & PVF_WRITE) == 0) { goto out; } pg->md.pvh_attrs |= PVF_REF | PVF_MOD; vm_page_dirty(pg); pv->pv_flags |= PVF_REF | PVF_MOD; /* * Re-enable write permissions for the page. No need to call * pmap_vac_me_harder(), since this is just a * modified-emulation fault, and the PVF_WRITE bit isn't * changing. We've already set the cacheable bits based on * the assumption that we can write to this page. */ *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO | L2_S_PROT_W; PTE_SYNC(ptep); rv = 1; } else if ((pte & L2_TYPE_MASK) == L2_TYPE_INV) { /* * This looks like a good candidate for "page referenced" * emulation. */ struct pv_entry *pv; struct vm_page *pg; /* Extract the physical address of the page */ if ((pg = PHYS_TO_VM_PAGE(pa)) == NULL) goto out; /* Get the current flags for this page. */ pv = pmap_find_pv(pg, pm, va); if (pv == NULL) goto out; pg->md.pvh_attrs |= PVF_REF; pv->pv_flags |= PVF_REF; *ptep = (pte & ~L2_TYPE_MASK) | L2_S_PROTO; PTE_SYNC(ptep); rv = 1; } /* * We know there is a valid mapping here, so simply * fix up the L1 if necessary. */ pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = l2b->l2b_phys | L1_C_DOM(pm->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); rv = 1; } #ifdef CPU_SA110 /* * There are bugs in the rev K SA110. This is a check for one * of them. */ if (rv == 0 && curcpu()->ci_arm_cputype == CPU_ID_SA110 && curcpu()->ci_arm_cpurev < 3) { /* Always current pmap */ if (l2pte_valid(pte)) { extern int kernel_debug; if (kernel_debug & 1) { struct proc *p = curlwp->l_proc; printf("prefetch_abort: page is already " "mapped - pte=%p *pte=%08x\n", ptep, pte); printf("prefetch_abort: pc=%08lx proc=%p " "process=%s\n", va, p, p->p_comm); printf("prefetch_abort: far=%08x fs=%x\n", cpu_faultaddress(), cpu_faultstatus()); } #ifdef DDB if (kernel_debug & 2) Debugger(); #endif rv = 1; } } #endif /* CPU_SA110 */ #ifdef DEBUG /* * If 'rv == 0' at this point, it generally indicates that there is a * stale TLB entry for the faulting address. This happens when two or * more processes are sharing an L1. Since we don't flush the TLB on * a context switch between such processes, we can take domain faults * for mappings which exist at the same VA in both processes. EVEN IF * WE'VE RECENTLY FIXED UP THE CORRESPONDING L1 in pmap_enter(), for * example. * * This is extremely likely to happen if pmap_enter() updated the L1 * entry for a recently entered mapping. In this case, the TLB is * flushed for the new mapping, but there may still be TLB entries for * other mappings belonging to other processes in the 1MB range * covered by the L1 entry. 
* * Since 'rv == 0', we know that the L1 already contains the correct * value, so the fault must be due to a stale TLB entry. * * Since we always need to flush the TLB anyway in the case where we * fixed up the L1, or frobbed the L2 PTE, we effectively deal with * stale TLB entries dynamically. * * However, the above condition can ONLY happen if the current L1 is * being shared. If it happens when the L1 is unshared, it indicates * that other parts of the pmap are not doing their job WRT managing * the TLB. */ if (rv == 0 && pm->pm_l1->l1_domain_use_count == 1) { extern int last_fault_code; printf("fixup: pm %p, va 0x%lx, ftype %d - nothing to do!\n", pm, va, ftype); printf("fixup: l2 %p, l2b %p, ptep %p, pl1pd %p\n", l2, l2b, ptep, pl1pd); printf("fixup: pte 0x%x, l1pd 0x%x, last code 0x%x\n", pte, l1pd, last_fault_code); #ifdef DDB Debugger(); #endif } #endif cpu_tlb_flushID_SE(va); cpu_cpwait(); rv = 1; out: vm_page_unlock_queues(); PMAP_UNLOCK(pm); return (rv); } void pmap_postinit(void) { struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pt; pt_entry_t *ptep, pte; vm_offset_t va, eva; u_int loop, needed; needed = (maxproc / PMAP_DOMAINS) + ((maxproc % PMAP_DOMAINS) ? 1 : 0); needed -= 1; l1 = malloc(sizeof(*l1) * needed, M_VMPMAP, M_WAITOK); for (loop = 0; loop < needed; loop++, l1++) { /* Allocate a L1 page table */ va = (vm_offset_t)contigmalloc(L1_TABLE_SIZE, M_VMPMAP, 0, 0x0, 0xffffffff, L1_TABLE_SIZE, 0); if (va == 0) panic("Cannot allocate L1 KVM"); eva = va + L1_TABLE_SIZE; pl1pt = (pd_entry_t *)va; while (va < eva) { l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pte = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; *ptep = pte; PTE_SYNC(ptep); cpu_tlb_flushD_SE(va); va += PAGE_SIZE; } pmap_init_l1(l1, pl1pt); } #ifdef DEBUG printf("pmap_postinit: Allocated %d static L1 descriptor tables\n", needed); #endif } /* * This is used to stuff certain critical values into the PCB where they * can be accessed quickly from cpu_switch() et al. */ void pmap_set_pcb_pagedir(pmap_t pm, struct pcb *pcb) { struct l2_bucket *l2b; pcb->pcb_pagedir = pm->pm_l1->l1_physaddr; pcb->pcb_dacr = (DOMAIN_CLIENT << (PMAP_DOMAIN_KERNEL * 2)) | (DOMAIN_CLIENT << (pm->pm_domain * 2)); if (vector_page < KERNBASE) { pcb->pcb_pl1vec = &pm->pm_l1->l1_kva[L1_IDX(vector_page)]; l2b = pmap_get_l2_bucket(pm, vector_page); pcb->pcb_l1vec = l2b->l2b_phys | L1_C_PROTO | L1_C_DOM(pm->pm_domain) | L1_C_DOM(PMAP_DOMAIN_KERNEL); } else pcb->pcb_pl1vec = NULL; } void pmap_activate(struct thread *td) { pmap_t pm; struct pcb *pcb; pm = vmspace_pmap(td->td_proc->p_vmspace); pcb = td->td_pcb; critical_enter(); pmap_set_pcb_pagedir(pm, pcb); if (td == curthread) { u_int cur_dacr, cur_ttb; __asm __volatile("mrc p15, 0, %0, c2, c0, 0" : "=r"(cur_ttb)); __asm __volatile("mrc p15, 0, %0, c3, c0, 0" : "=r"(cur_dacr)); cur_ttb &= ~(L1_TABLE_SIZE - 1); if (cur_ttb == (u_int)pcb->pcb_pagedir && cur_dacr == pcb->pcb_dacr) { /* * No need to switch address spaces. */ critical_exit(); return; } /* * We MUST, I repeat, MUST fix up the L1 entry corresponding * to 'vector_page' in the incoming L1 table before switching * to it otherwise subsequent interrupts/exceptions (including * domain faults!) will jump into hyperspace. */ if (pcb->pcb_pl1vec) { *pcb->pcb_pl1vec = pcb->pcb_l1vec; /* * Don't need to PTE_SYNC() at this point since * cpu_setttb() is about to flush both the cache * and the TLB. 
*/ } cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); } critical_exit(); } static int pmap_set_pt_cache_mode(pd_entry_t *kl1, vm_offset_t va) { pd_entry_t *pdep, pde; pt_entry_t *ptep, pte; vm_offset_t pa; int rv = 0; /* * Make sure the descriptor itself has the correct cache mode */ pdep = &kl1[L1_IDX(va)]; pde = *pdep; if (l1pte_section_p(pde)) { if ((pde & L1_S_CACHE_MASK) != pte_l1_s_cache_mode_pt) { *pdep = (pde & ~L1_S_CACHE_MASK) | pte_l1_s_cache_mode_pt; PTE_SYNC(pdep); cpu_dcache_wbinv_range((vm_offset_t)pdep, sizeof(*pdep)); rv = 1; } } else { pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) panic("pmap_bootstrap: No L2 for L2 @ va %p\n", ptep); ptep = &ptep[l2pte_index(va)]; pte = *ptep; if ((pte & L2_S_CACHE_MASK) != pte_l2_s_cache_mode_pt) { *ptep = (pte & ~L2_S_CACHE_MASK) | pte_l2_s_cache_mode_pt; PTE_SYNC(ptep); cpu_dcache_wbinv_range((vm_offset_t)ptep, sizeof(*ptep)); rv = 1; } } return (rv); } static void pmap_alloc_specials(vm_offset_t *availp, int pages, vm_offset_t *vap, pt_entry_t **ptep) { vm_offset_t va = *availp; struct l2_bucket *l2b; if (ptep) { l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (l2b == NULL) panic("pmap_alloc_specials: no l2b for 0x%x", va); *ptep = &l2b->l2b_kva[l2pte_index(va)]; } *vap = va; *availp = va + (PAGE_SIZE * pages); } /* * Bootstrap the system enough to run with virtual memory. * * On the arm this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ #define PMAP_STATIC_L2_SIZE 16 #ifdef ARM_USE_SMALL_ALLOC extern struct mtx smallalloc_mtx; #endif void pmap_bootstrap(vm_offset_t firstaddr, vm_offset_t lastaddr, struct pv_addr *l1pt) { static struct l1_ttable static_l1; static struct l2_dtable static_l2[PMAP_STATIC_L2_SIZE]; struct l1_ttable *l1 = &static_l1; struct l2_dtable *l2; struct l2_bucket *l2b; pd_entry_t pde; pd_entry_t *kernel_l1pt = (pd_entry_t *)l1pt->pv_va; pt_entry_t *ptep; vm_paddr_t pa; vm_offset_t va; vm_size_t size; int l1idx, l2idx, l2next = 0; PDEBUG(1, printf("firstaddr = %08x, loadaddr = %08x\n", firstaddr, loadaddr)); virtual_avail = firstaddr; kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_l1 = l1; kernel_l1pa = l1pt->pv_pa; /* * Scan the L1 translation table created by initarm() and create * the required metadata for all valid mappings found in it. */ for (l1idx = 0; l1idx < (L1_TABLE_SIZE / sizeof(pd_entry_t)); l1idx++) { pde = kernel_l1pt[l1idx]; /* * We're only interested in Coarse mappings. * pmap_extract() can deal with section mappings without * recourse to checking L2 metadata. */ if ((pde & L1_TYPE_MASK) != L1_TYPE_C) continue; /* * Lookup the KVA of this L2 descriptor table */ pa = (vm_paddr_t)(pde & L1_C_ADDR_MASK); ptep = (pt_entry_t *)kernel_pt_lookup(pa); if (ptep == NULL) { panic("pmap_bootstrap: No L2 for va 0x%x, pa 0x%lx", (u_int)l1idx << L1_S_SHIFT, (long unsigned int)pa); } /* * Fetch the associated L2 metadata structure. * Allocate a new one if necessary. */ if ((l2 = kernel_pmap->pm_l2[L2_IDX(l1idx)]) == NULL) { if (l2next == PMAP_STATIC_L2_SIZE) panic("pmap_bootstrap: out of static L2s"); kernel_pmap->pm_l2[L2_IDX(l1idx)] = l2 = &static_l2[l2next++]; } /* * One more L1 slot tracked... 
*/ l2->l2_occupancy++; /* * Fill in the details of the L2 descriptor in the * appropriate bucket. */ l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; l2b->l2b_kva = ptep; l2b->l2b_phys = pa; l2b->l2b_l1idx = l1idx; /* * Establish an initial occupancy count for this descriptor */ for (l2idx = 0; l2idx < (L2_TABLE_SIZE_REAL / sizeof(pt_entry_t)); l2idx++) { if ((ptep[l2idx] & L2_TYPE_MASK) != L2_TYPE_INV) { l2b->l2b_occupancy++; } } /* * Make sure the descriptor itself has the correct cache mode. * If not, fix it, but whine about the problem. Port-meisters * should consider this a clue to fix up their initarm() * function. :) */ if (pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)ptep)) { printf("pmap_bootstrap: WARNING! wrong cache mode for " "L2 pte @ %p\n", ptep); } } /* * Ensure the primary (kernel) L1 has the correct cache mode for * a page table. Bitch if it is not correctly set. */ for (va = (vm_offset_t)kernel_l1pt; va < ((vm_offset_t)kernel_l1pt + L1_TABLE_SIZE); va += PAGE_SIZE) { if (pmap_set_pt_cache_mode(kernel_l1pt, va)) printf("pmap_bootstrap: WARNING! wrong cache mode for " "primary L1 @ 0x%x\n", va); } cpu_dcache_wbinv_all(); cpu_tlb_flushID(); cpu_cpwait(); PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_active = -1; kernel_pmap->pm_domain = PMAP_DOMAIN_KERNEL; TAILQ_INIT(&kernel_pmap->pm_pvlist); /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); pmap_alloc_specials(&virtual_avail, 1, &csrcp, &csrc_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)csrc_pte); pmap_alloc_specials(&virtual_avail, 1, &cdstp, &cdst_pte); pmap_set_pt_cache_mode(kernel_l1pt, (vm_offset_t)cdst_pte); size = ((lastaddr - pmap_curmaxkvaddr) + L1_S_OFFSET) / L1_S_SIZE; pmap_alloc_specials(&virtual_avail, round_page(size * L2_TABLE_SIZE_REAL) / PAGE_SIZE, &pmap_kernel_l2ptp_kva, NULL); size = (size + (L2_BUCKET_SIZE - 1)) / L2_BUCKET_SIZE; pmap_alloc_specials(&virtual_avail, round_page(size * sizeof(struct l2_dtable)) / PAGE_SIZE, &pmap_kernel_l2dtable_kva, NULL); pmap_alloc_specials(&virtual_avail, 1, (vm_offset_t*)&_tmppt, NULL); SLIST_INIT(&l1_list); TAILQ_INIT(&l1_lru_list); mtx_init(&l1_lru_lock, "l1 list lock", NULL, MTX_DEF); pmap_init_l1(l1, kernel_l1pt); cpu_dcache_wbinv_all(); virtual_avail = round_page(virtual_avail); virtual_end = lastaddr; kernel_vm_end = pmap_curmaxkvaddr; arm_nocache_startaddr = lastaddr; mtx_init(&cmtx, "TMP mappings mtx", NULL, MTX_DEF); #ifdef ARM_USE_SMALL_ALLOC mtx_init(&smallalloc_mtx, "Small alloc page list", NULL, MTX_DEF); arm_init_smallalloc(); #endif pmap_set_pcb_pagedir(kernel_pmap, thread0.td_pcb); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { struct pcb *pcb; pmap_idcache_wbinv_all(pmap); pmap_tlb_flushID(pmap); cpu_cpwait(); if (vector_page < KERNBASE) { struct pcb *curpcb = PCPU_GET(curpcb); pcb = thread0.td_pcb; if (pmap_is_current(pmap)) { /* * Frob the L1 entry corresponding to the vector * page so that it contains the kernel pmap's domain * number. This will ensure pmap_remove() does not * pull the current vector page out from under us. 
*/ critical_enter(); *pcb->pcb_pl1vec = pcb->pcb_l1vec; cpu_domains(pcb->pcb_dacr); cpu_setttb(pcb->pcb_pagedir); critical_exit(); } pmap_remove(pmap, vector_page, vector_page + PAGE_SIZE); /* * Make sure cpu_switch(), et al, DTRT. This is safe to do * since this process has no remaining mappings of its own. */ curpcb->pcb_pl1vec = pcb->pcb_pl1vec; curpcb->pcb_l1vec = pcb->pcb_l1vec; curpcb->pcb_dacr = pcb->pcb_dacr; curpcb->pcb_pagedir = pcb->pcb_pagedir; } pmap_free_l1(pmap); PMAP_LOCK_DESTROY(pmap); dprintf("pmap_release()\n"); } /* * Helper function for pmap_grow_l2_bucket() */ static __inline int pmap_grow_map(vm_offset_t va, pt_entry_t cache_mode, vm_paddr_t *pap) { struct l2_bucket *l2b; pt_entry_t *ptep; vm_paddr_t pa; struct vm_page *pg; pg = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pg == NULL) return (1); pa = VM_PAGE_TO_PHYS(pg); if (pap) *pap = pa; l2b = pmap_get_l2_bucket(pmap_kernel(), va); ptep = &l2b->l2b_kva[l2pte_index(va)]; *ptep = L2_S_PROTO | pa | cache_mode | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); PTE_SYNC(ptep); return (0); } /* * This is the same as pmap_alloc_l2_bucket(), except that it is only * used by pmap_growkernel(). */ static __inline struct l2_bucket * pmap_grow_l2_bucket(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; struct l2_bucket *l2b; struct l1_ttable *l1; pd_entry_t *pl1pd; u_short l1idx; vm_offset_t nva; l1idx = L1_IDX(va); if ((l2 = pm->pm_l2[L2_IDX(l1idx)]) == NULL) { /* * No mapping at this address, as there is * no entry in the L1 table. * Need to allocate a new l2_dtable. */ nva = pmap_kernel_l2dtable_kva; if ((nva & PAGE_MASK) == 0) { /* * Need to allocate a backing page */ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL)) return (NULL); } l2 = (struct l2_dtable *)nva; nva += sizeof(struct l2_dtable); if ((nva & PAGE_MASK) < (pmap_kernel_l2dtable_kva & PAGE_MASK)) { /* * The new l2_dtable straddles a page boundary. * Map in another page to cover it. */ if (pmap_grow_map(nva, pte_l2_s_cache_mode, NULL)) return (NULL); } pmap_kernel_l2dtable_kva = nva; /* * Link it into the parent pmap */ pm->pm_l2[L2_IDX(l1idx)] = l2; memset(l2, 0, sizeof(*l2)); } l2b = &l2->l2_bucket[L2_BUCKET(l1idx)]; /* * Fetch pointer to the L2 page table associated with the address. */ if (l2b->l2b_kva == NULL) { pt_entry_t *ptep; /* * No L2 page table has been allocated. Chances are, this * is because we just allocated the l2_dtable, above. */ nva = pmap_kernel_l2ptp_kva; ptep = (pt_entry_t *)nva; if ((nva & PAGE_MASK) == 0) { /* * Need to allocate a backing page */ if (pmap_grow_map(nva, pte_l2_s_cache_mode_pt, &pmap_kernel_l2ptp_phys)) return (NULL); PTE_SYNC_RANGE(ptep, PAGE_SIZE / sizeof(pt_entry_t)); } memset(ptep, 0, L2_TABLE_SIZE_REAL); l2->l2_occupancy++; l2b->l2b_kva = ptep; l2b->l2b_l1idx = l1idx; l2b->l2b_phys = pmap_kernel_l2ptp_phys; pmap_kernel_l2ptp_kva += L2_TABLE_SIZE_REAL; pmap_kernel_l2ptp_phys += L2_TABLE_SIZE_REAL; } /* Distribute new L1 entry to all other L1s */ SLIST_FOREACH(l1, &l1_list, l1_link) { pl1pd = &l1->l1_kva[L1_IDX(va)]; *pl1pd = l2b->l2b_phys | L1_C_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO; PTE_SYNC(pl1pd); } return (l2b); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { pmap_t kpm = pmap_kernel(); if (addr <= pmap_curmaxkvaddr) return; /* we are OK */ /* * whoops! 
we need to add kernel PTPs */ /* Map 1MB at a time */ for (; pmap_curmaxkvaddr < addr; pmap_curmaxkvaddr += L1_S_SIZE) pmap_grow_l2_bucket(kpm, pmap_curmaxkvaddr); /* * flush out the cache, expensive but growkernel will happen so * rarely */ cpu_dcache_wbinv_all(); cpu_tlb_flushD(); cpu_cpwait(); kernel_vm_end = pmap_curmaxkvaddr; } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { struct pv_entry *pv, *npv; struct l2_bucket *l2b = NULL; vm_page_t m; pt_entry_t *pt; vm_page_lock_queues(); PMAP_LOCK(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { if (pv->pv_flags & PVF_WIRED) { /* The page is wired, cannot remove it now. */ npv = TAILQ_NEXT(pv, pv_plist); continue; } pmap->pm_stats.resident_count--; l2b = pmap_get_l2_bucket(pmap, pv->pv_va); KASSERT(l2b != NULL, ("No L2 bucket in pmap_remove_pages")); pt = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; m = PHYS_TO_VM_PAGE(*pt & L2_ADDR_MASK); #ifdef ARM_USE_SMALL_ALLOC KASSERT((vm_offset_t)m >= alloc_firstaddr, ("Trying to access non-existent page va %x pte %x", pv->pv_va, *pt)); #else KASSERT((vm_offset_t)m >= KERNBASE, ("Trying to access non-existent page va %x pte %x", pv->pv_va, *pt)); #endif *pt = 0; PTE_SYNC(pt); npv = TAILQ_NEXT(pv, pv_plist); pmap_nuke_pv(m, pmap, pv); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_free_pv_entry(pv); pmap_free_l2_bucket(pmap, l2b, 1); } vm_page_unlock_queues(); cpu_idcache_wbinv_all(); cpu_tlb_flushID(); cpu_cpwait(); PMAP_UNLOCK(pmap); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* Map a section into the KVA. */ void pmap_kenter_section(vm_offset_t va, vm_offset_t pa, int flags) { pd_entry_t pd = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, VM_PROT_READ|VM_PROT_WRITE) | L1_S_DOM(PMAP_DOMAIN_KERNEL); struct l1_ttable *l1; KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("Not a valid section mapping")); if (flags & SECTION_CACHE) pd |= pte_l1_s_cache_mode; else if (flags & SECTION_PT) pd |= pte_l1_s_cache_mode_pt; SLIST_FOREACH(l1, &l1_list, l1_link) { l1->l1_kva[L1_IDX(va)] = pd; PTE_SYNC(&l1->l1_kva[L1_IDX(va)]); } } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a invltlb after doing the pmap_kenter... 
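 *
 * Hypothetical usage of the pair defined below (not taken from this
 * file):
 *
 *	pmap_kenter(va, pa);		/* wire pa into the KVA at va */
 *	... access the page through va ...
 *	pmap_kremove(va);		/* and tear the mapping down  */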
*/ static PMAP_INLINE void pmap_kenter_internal(vm_offset_t va, vm_offset_t pa, int flags) { struct l2_bucket *l2b; pt_entry_t *pte; pt_entry_t opte; PDEBUG(1, printf("pmap_kenter: va = %08x, pa = %08x\n", (uint32_t) va, (uint32_t) pa)); l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (l2b == NULL) l2b = pmap_grow_l2_bucket(pmap_kernel(), va); KASSERT(l2b != NULL, ("No L2 Bucket")); pte = &l2b->l2b_kva[l2pte_index(va)]; opte = *pte; PDEBUG(1, printf("pmap_kenter: pte = %08x, opte = %08x, npte = %08x\n", (uint32_t) pte, opte, *pte)); if (l2pte_valid(opte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); cpu_cpwait(); } else { if (opte == 0) l2b->l2b_occupancy++; } *pte = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, VM_PROT_READ | VM_PROT_WRITE); if (flags & KENTER_CACHE) *pte |= pte_l2_s_cache_mode; if (flags & KENTER_USER) *pte |= L2_S_PROT_U; PTE_SYNC(pte); } void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, KENTER_CACHE); } void pmap_kenter_nocache(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, 0); } void pmap_kenter_user(vm_offset_t va, vm_paddr_t pa) { pmap_kenter_internal(va, pa, KENTER_CACHE|KENTER_USER); /* * Call pmap_fault_fixup now, to make sure we'll have no exception * at the first use of the new address, or bad things will happen, * as we use one of these addresses in the exception handlers. */ pmap_fault_fixup(pmap_kernel(), va, VM_PROT_READ|VM_PROT_WRITE, 1); } /* * remove a page rom the kernel pagetables */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { struct l2_bucket *l2b; pt_entry_t *pte, opte; l2b = pmap_get_l2_bucket(pmap_kernel(), va); if (!l2b) return; KASSERT(l2b != NULL, ("No L2 Bucket")); pte = &l2b->l2b_kva[l2pte_index(va)]; opte = *pte; if (l2pte_valid(opte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); cpu_cpwait(); *pte = 0; } } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { #ifdef ARM_USE_SMALL_ALLOC return (arm_ptovirt(start)); #else vm_offset_t sva = *virt; vm_offset_t va = sva; PDEBUG(1, printf("pmap_map: virt = %08x, start = %08x, end = %08x, " "prot = %d\n", (uint32_t) *virt, (uint32_t) start, (uint32_t) end, prot)); while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } *virt = va; return (sva); #endif } static void pmap_wb_page(vm_page_t m) { struct pv_entry *pv; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, FALSE, (pv->pv_flags & PVF_WRITE) == 0); } static void pmap_inv_page(vm_page_t m) { struct pv_entry *pv; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) pmap_dcache_wb_range(pv->pv_pmap, pv->pv_va, PAGE_SIZE, TRUE, TRUE); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
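 *
 * Hypothetical usage (m[] holds 'count' wired pages and va spans
 * count * PAGE_SIZE of KVA):
 *
 *	pmap_qenter(va, m, count);	/* map the pages back to back  */
 *	... temporary access through va ...
 *	pmap_qremove(va, count);	/* undo the temporary mappings */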
*/ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { int i; for (i = 0; i < count; i++) { pmap_wb_page(m[i]); pmap_kenter_internal(va, VM_PAGE_TO_PHYS(m[i]), KENTER_CACHE); va += PAGE_SIZE; } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { vm_paddr_t pa; int i; for (i = 0; i < count; i++) { pa = vtophys(va); if (pa) { pmap_inv_page(PHYS_TO_VM_PAGE(pa)); pmap_kremove(va); } va += PAGE_SIZE; } } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pd_entry_t *pde; pt_entry_t *pte; if (!pmap_get_pde_pte(pmap, addr, &pde, &pte)) return (FALSE); KASSERT(pte != NULL, ("Valid mapping but no pte ?")); if (*pte == 0) return (TRUE); return (FALSE); } /* * Fetch pointers to the PDE/PTE for the given pmap/VA pair. * Returns TRUE if the mapping exists, else FALSE. * * NOTE: This function is only used by a couple of arm-specific modules. * It is not safe to take any pmap locks here, since we could be right * in the middle of debugging the pmap anyway... * * It is possible for this routine to return FALSE even though a valid * mapping does exist. This is because we don't lock, so the metadata * state may be inconsistent. * * NOTE: We can return a NULL *ptp in the case where the L1 pde is * a "section" mapping. */ boolean_t pmap_get_pde_pte(pmap_t pm, vm_offset_t va, pd_entry_t **pdp, pt_entry_t **ptp) { struct l2_dtable *l2; pd_entry_t *pl1pd, l1pd; pt_entry_t *ptep; u_short l1idx; if (pm->pm_l1 == NULL) return (FALSE); l1idx = L1_IDX(va); *pdp = pl1pd = &pm->pm_l1->l1_kva[l1idx]; l1pd = *pl1pd; if (l1pte_section_p(l1pd)) { *ptp = NULL; return (TRUE); } if (pm->pm_l2 == NULL) return (FALSE); l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { return (FALSE); } *ptp = &ptep[l2pte_index(va)]; return (TRUE); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pt_entry_t *ptep, pte; struct l2_bucket *l2b; boolean_t flush = FALSE; pmap_t curpm; int flags = 0; #if defined(PMAP_DEBUG) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! 
*/ if (m->flags & PG_FICTITIOUS) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif if (TAILQ_EMPTY(&m->md.pv_list)) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); curpm = vmspace_pmap(curproc->p_vmspace); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { if (flush == FALSE && (pv->pv_pmap == curpm || pv->pv_pmap == pmap_kernel())) flush = TRUE; PMAP_LOCK(pv->pv_pmap); l2b = pmap_get_l2_bucket(pv->pv_pmap, pv->pv_va); KASSERT(l2b != NULL, ("No l2 bucket")); ptep = &l2b->l2b_kva[l2pte_index(pv->pv_va)]; pte = *ptep; *ptep = 0; PTE_SYNC_CURRENT(pv->pv_pmap, ptep); pmap_free_l2_bucket(pv->pv_pmap, l2b, 1); if (pv->pv_flags & PVF_WIRED) pv->pv_pmap->pm_stats.wired_count--; pv->pv_pmap->pm_stats.resident_count--; flags |= pv->pv_flags; pmap_nuke_pv(m, pv->pv_pmap, pv); PMAP_UNLOCK(pv->pv_pmap); pmap_free_pv_entry(pv); } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(curpm); else pmap_tlb_flushD(curpm); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_offset_t next_bucket; u_int flags; int flush; if ((prot & VM_PROT_READ) == 0) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) { /* * If this is a read->write transition, just ignore it and let * vm_fault() take care of it later. */ return; } vm_page_lock_queues(); PMAP_LOCK(pm); /* * OK, at this point, we know we're doing write-protect operation. * If the pmap is active, write-back the range. */ pmap_dcache_wb_range(pm, sva, eva - sva, FALSE, FALSE); flush = ((eva - sva) >= (PAGE_SIZE * 4)) ? 0 : -1; flags = 0; while (sva < eva) { next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; while (sva < next_bucket) { if ((pte = *ptep) != 0 && (pte & L2_S_PROT_W) != 0) { struct vm_page *pg; u_int f; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); pte &= ~L2_S_PROT_W; *ptep = pte; PTE_SYNC(ptep); if (pg != NULL) { f = pmap_modify_pv(pg, pm, sva, PVF_WRITE, 0); pmap_vac_me_harder(pg, pm, sva); vm_page_dirty(pg); } else f = PVF_REF | PVF_EXEC; if (flush >= 0) { flush++; flags |= f; } else if (PV_BEEN_EXECD(f)) pmap_tlb_flushID_SE(pm, sva); else if (PV_BEEN_REFD(f)) pmap_tlb_flushD_SE(pm, sva); } sva += PAGE_SIZE; ptep++; } } if (flush) { if (PV_BEEN_EXECD(flags)) pmap_tlb_flushID(pm); else if (PV_BEEN_REFD(flags)) pmap_tlb_flushD(pm); } vm_page_unlock_queues(); PMAP_UNLOCK(pm); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_page_lock_queues(); PMAP_LOCK(pmap); pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * The page queues and pmap must be locked. 
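 *
 * This is the protocol pmap_enter() above follows:
 *
 *	vm_page_lock_queues();
 *	PMAP_LOCK(pmap);
 *	pmap_enter_locked(pmap, va, m, prot, wired, M_WAITOK);
 *	vm_page_unlock_queues();
 *	PMAP_UNLOCK(pmap);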
*/ static void pmap_enter_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired, int flags) { struct l2_bucket *l2b = NULL; struct vm_page *opg; struct pv_entry *pve = NULL; pt_entry_t *ptep, npte, opte; u_int nflags; u_int oflags; vm_paddr_t pa; PMAP_ASSERT_LOCKED(pmap); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (va == vector_page) { pa = systempage.pv_pa; m = NULL; } else pa = VM_PAGE_TO_PHYS(m); nflags = 0; if (prot & VM_PROT_WRITE) nflags |= PVF_WRITE; if (prot & VM_PROT_EXECUTE) nflags |= PVF_EXEC; if (wired) nflags |= PVF_WIRED; PDEBUG(1, printf("pmap_enter: pmap = %08x, va = %08x, m = %08x, prot = %x, " "wired = %x\n", (uint32_t) pmap, va, (uint32_t) m, prot, wired)); if (pmap == pmap_kernel()) { l2b = pmap_get_l2_bucket(pmap, va); if (l2b == NULL) l2b = pmap_grow_l2_bucket(pmap, va); } else { do_l2b_alloc: l2b = pmap_alloc_l2_bucket(pmap, va); if (l2b == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); goto do_l2b_alloc; } return; } } ptep = &l2b->l2b_kva[l2pte_index(va)]; opte = *ptep; npte = pa; oflags = 0; if (opte) { /* * There is already a mapping at this address. * If the physical address is different, lookup the * vm_page. */ if (l2pte_pa(opte) != pa) opg = PHYS_TO_VM_PAGE(l2pte_pa(opte)); else opg = m; } else opg = NULL; if ((prot & (VM_PROT_ALL)) || (!m || m->md.pvh_attrs & PVF_REF)) { /* * - The access type indicates that we don't need * to do referenced emulation. * OR * - The physical page has already been referenced * so no need to re-do referenced emulation here. */ npte |= L2_S_PROTO; nflags |= PVF_REF; if (m && ((prot & VM_PROT_WRITE) != 0 || (m->md.pvh_attrs & PVF_MOD))) { /* * This is a writable mapping, and the * page's mod state indicates it has * already been modified. Make it * writable from the outset. */ nflags |= PVF_MOD; if (!(m->md.pvh_attrs & PVF_MOD)) vm_page_dirty(m); } if (m && opte) vm_page_flag_set(m, PG_REFERENCED); } else { /* * Need to do page referenced emulation. */ npte |= L2_TYPE_INV; } if (prot & VM_PROT_WRITE) { npte |= L2_S_PROT_W; if (m != NULL) vm_page_flag_set(m, PG_WRITEABLE); } npte |= pte_l2_s_cache_mode; if (m && m == opg) { /* * We're changing the attrs of an existing mapping. */ oflags = pmap_modify_pv(m, pmap, va, PVF_WRITE | PVF_EXEC | PVF_WIRED | PVF_MOD | PVF_REF, nflags); /* * We may need to flush the cache if we're * doing rw-ro... */ if (pmap_is_current(pmap) && (oflags & PVF_NC) == 0 && (opte & L2_S_PROT_W) != 0 && (prot & VM_PROT_WRITE) == 0) cpu_dcache_wb_range(va, PAGE_SIZE); } else { /* * New mapping, or changing the backing page * of an existing mapping. */ if (opg) { /* * Replacing an existing mapping with a new one. * It is part of our managed memory so we * must remove it from the PV list */ pve = pmap_remove_pv(opg, pmap, va); if (m && (m->flags & (PG_UNMANAGED | PG_FICTITIOUS)) && pve) pmap_free_pv_entry(pve); else if (!pve && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) pve = pmap_get_pv_entry(); KASSERT(pve != NULL || m->flags & (PG_UNMANAGED | PG_FICTITIOUS), ("No pv")); oflags = pve->pv_flags; /* * If the old mapping was valid (ref/mod * emulation creates 'invalid' mappings * initially) then make sure to frob * the cache. 
*/ if ((oflags & PVF_NC) == 0 && l2pte_valid(opte)) { if (PV_BEEN_EXECD(oflags)) { pmap_idcache_wbinv_range(pmap, va, PAGE_SIZE); } else if (PV_BEEN_REFD(oflags)) { pmap_dcache_wb_range(pmap, va, PAGE_SIZE, TRUE, (oflags & PVF_WRITE) == 0); } } } else if (m && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) if ((pve = pmap_get_pv_entry()) == NULL) { panic("pmap_enter: no pv entries"); } if (m && !(m->flags & (PG_UNMANAGED | PG_FICTITIOUS))) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_enter_pv(m, pve, pmap, va, nflags); } } /* * Make sure userland mappings get the right permissions */ if (pmap != pmap_kernel() && va != vector_page) { npte |= L2_S_PROT_U; } /* * Keep the stats up to date */ if (opte == 0) { l2b->l2b_occupancy++; pmap->pm_stats.resident_count++; } /* * If this is just a wiring change, the two PTEs will be * identical, so there's no need to update the page table. */ if (npte != opte) { boolean_t is_cached = pmap_is_current(pmap); *ptep = npte; if (is_cached) { /* * We only need to frob the cache/tlb if this pmap * is current */ PTE_SYNC(ptep); if (L1_IDX(va) != L1_IDX(vector_page) && l2pte_valid(npte)) { /* * This mapping is likely to be accessed as * soon as we return to userland. Fix up the * L1 entry to avoid taking another * page/domain fault. */ pd_entry_t *pl1pd, l1pd; pl1pd = &pmap->pm_l1->l1_kva[L1_IDX(va)]; l1pd = l2b->l2b_phys | L1_C_DOM(pmap->pm_domain) | L1_C_PROTO; if (*pl1pd != l1pd) { *pl1pd = l1pd; PTE_SYNC(pl1pd); } } } if (PV_BEEN_EXECD(oflags)) pmap_tlb_flushID_SE(pmap, va); else if (PV_BEEN_REFD(oflags)) pmap_tlb_flushD_SE(pmap, va); if (m) pmap_vac_me_harder(m, pmap, va); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; psize = atop(end - start); m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_locked(pmap, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); pmap_enter_locked(pmap, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE, M_NOWAIT); PMAP_UNLOCK(pmap); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. 
*/ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { struct l2_bucket *l2b; pt_entry_t *ptep, pte; vm_page_t pg; vm_page_lock_queues(); PMAP_LOCK(pmap); l2b = pmap_get_l2_bucket(pmap, va); KASSERT(l2b, ("No l2b bucket in pmap_change_wiring")); ptep = &l2b->l2b_kva[l2pte_index(va)]; pte = *ptep; pg = PHYS_TO_VM_PAGE(l2pte_pa(pte)); if (pg) pmap_modify_pv(pg, pmap, va, PVF_WIRED, wired); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pm, vm_offset_t va) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; u_int l1idx; l1idx = L1_IDX(va); PMAP_LOCK(pm); l1pd = pm->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { /* * These should only happen for pmap_kernel() */ KASSERT(pm == pmap_kernel(), ("huh")); pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pm->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { PMAP_UNLOCK(pm); return (0); } ptep = &ptep[l2pte_index(va)]; pte = *ptep; if (pte == 0) { PMAP_UNLOCK(pm); return (0); } switch (pte & L2_TYPE_MASK) { case L2_TYPE_L: pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); break; default: pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); break; } } PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. * */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct l2_dtable *l2; pd_entry_t l1pd; pt_entry_t *ptep, pte; vm_paddr_t pa; vm_page_t m = NULL; u_int l1idx; l1idx = L1_IDX(va); vm_page_lock_queues(); PMAP_LOCK(pmap); l1pd = pmap->pm_l1->l1_kva[l1idx]; if (l1pte_section_p(l1pd)) { /* * These should only happen for pmap_kernel() */ KASSERT(pmap == pmap_kernel(), ("huh")); pa = (l1pd & L1_S_FRAME) | (va & L1_S_OFFSET); if (l1pd & L1_S_PROT_W || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } else { /* * Note that we can't rely on the validity of the L1 * descriptor as an indication that a mapping exists. * We have to look it up in the L2 dtable. */ l2 = pmap->pm_l2[L2_IDX(l1idx)]; if (l2 == NULL || (ptep = l2->l2_bucket[L2_BUCKET(l1idx)].l2b_kva) == NULL) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); return (NULL); } ptep = &ptep[l2pte_index(va)]; pte = *ptep; if (pte == 0) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); return (NULL); } if (pte & L2_S_PROT_W || (prot & VM_PROT_WRITE) == 0) { switch (pte & L2_TYPE_MASK) { case L2_TYPE_L: pa = (pte & L2_L_FRAME) | (va & L2_L_OFFSET); break; default: pa = (pte & L2_S_FRAME) | (va & L2_S_OFFSET); break; } m = PHYS_TO_VM_PAGE(pa); vm_page_hold(m); } } PMAP_UNLOCK(pmap); vm_page_unlock_queues(); return (m); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. 
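 *
 * Note that, besides allocating an L1 and resetting the statistics,
 * pmap_pinit() below pre-enters the system vector page read-only
 * whenever vector_page lies below KERNBASE, so the exception vectors
 * stay mapped while the new address space is active:
 *
 *	pmap_enter(pmap, vector_page,
 *	    PHYS_TO_VM_PAGE(systempage.pv_pa), VM_PROT_READ, 1);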
*/ void pmap_pinit(pmap_t pmap) { PDEBUG(1, printf("pmap_pinit: pmap = %08x\n", (uint32_t) pmap)); PMAP_LOCK_INIT(pmap); pmap_alloc_l1(pmap); bzero(pmap->pm_l2, sizeof(pmap->pm_l2)); pmap->pm_count = 1; pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); pmap->pm_stats.resident_count = 1; if (vector_page < KERNBASE) { pmap_enter(pmap, vector_page, PHYS_TO_VM_PAGE(systempage.pv_pa), VM_PROT_READ, 1); } } /*************************************************** * page management routines. ***************************************************/ static void pmap_free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. */ static pv_entry_t pmap_get_pv_entry(void) { pv_entry_t ret_value; pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); ret_value = uma_zalloc(pvzone, M_NOWAIT); return ret_value; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ #define PMAP_REMOVE_CLEAN_LIST_SIZE 3 void pmap_remove(pmap_t pm, vm_offset_t sva, vm_offset_t eva) { struct l2_bucket *l2b; vm_offset_t next_bucket; pt_entry_t *ptep; u_int cleanlist_idx, total, cnt; struct { vm_offset_t va; pt_entry_t *pte; } cleanlist[PMAP_REMOVE_CLEAN_LIST_SIZE]; u_int mappings, is_exec, is_refd; int flushall = 0; /* * we lock in the pmap => pv_head direction */ vm_page_lock_queues(); PMAP_LOCK(pm); if (!pmap_is_current(pm)) { cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1; } else cleanlist_idx = 0; total = 0; while (sva < eva) { /* * Do one L2 bucket's worth at a time. */ next_bucket = L2_NEXT_BUCKET(sva); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pm, sva); if (l2b == NULL) { sva = next_bucket; continue; } ptep = &l2b->l2b_kva[l2pte_index(sva)]; mappings = 0; while (sva < next_bucket) { struct vm_page *pg; pt_entry_t pte; vm_paddr_t pa; pte = *ptep; if (pte == 0) { /* * Nothing here, move along */ sva += PAGE_SIZE; ptep++; continue; } pm->pm_stats.resident_count--; pa = l2pte_pa(pte); is_exec = 0; is_refd = 1; /* * Update flags. In a number of circumstances, * we could cluster a lot of these and do a * number of sequential pages in one go. */ if ((pg = PHYS_TO_VM_PAGE(pa)) != NULL) { struct pv_entry *pve; pve = pmap_remove_pv(pg, pm, sva); if (pve) { is_exec = PV_BEEN_EXECD(pve->pv_flags); is_refd = PV_BEEN_REFD(pve->pv_flags); pmap_free_pv_entry(pve); } } if (!l2pte_valid(pte)) { *ptep = 0; PTE_SYNC_CURRENT(pm, ptep); sva += PAGE_SIZE; ptep++; mappings++; continue; } if (cleanlist_idx < PMAP_REMOVE_CLEAN_LIST_SIZE) { /* Add to the clean list. */ cleanlist[cleanlist_idx].pte = ptep; cleanlist[cleanlist_idx].va = sva | (is_exec & 1); cleanlist_idx++; } else if (cleanlist_idx == PMAP_REMOVE_CLEAN_LIST_SIZE) { /* Nuke everything if needed. */ pmap_idcache_wbinv_all(pm); pmap_tlb_flushID(pm); /* * Roll back the previous PTE list, * and zero out the current PTE. 
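 * Since the whole cache and TLB for this pmap have just been written
 * back and flushed above, the entries already gathered on the clean
 * list no longer need individual cache or TLB maintenance; their PTEs
 * are simply zeroed below and the clean list is bypassed from here on.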
*/ for (cnt = 0; cnt < PMAP_REMOVE_CLEAN_LIST_SIZE; cnt++) { *cleanlist[cnt].pte = 0; } *ptep = 0; PTE_SYNC(ptep); cleanlist_idx++; flushall = 1; } else { *ptep = 0; PTE_SYNC(ptep); if (is_exec) pmap_tlb_flushID_SE(pm, sva); else if (is_refd) pmap_tlb_flushD_SE(pm, sva); } sva += PAGE_SIZE; ptep++; mappings++; } /* * Deal with any left overs */ if (cleanlist_idx <= PMAP_REMOVE_CLEAN_LIST_SIZE) { total += cleanlist_idx; for (cnt = 0; cnt < cleanlist_idx; cnt++) { vm_offset_t clva = cleanlist[cnt].va & ~1; if (cleanlist[cnt].va & 1) { pmap_idcache_wbinv_range(pm, clva, PAGE_SIZE); pmap_tlb_flushID_SE(pm, clva); } else { pmap_dcache_wb_range(pm, clva, PAGE_SIZE, TRUE, FALSE); pmap_tlb_flushD_SE(pm, clva); } *cleanlist[cnt].pte = 0; PTE_SYNC_CURRENT(pm, cleanlist[cnt].pte); } if (total <= PMAP_REMOVE_CLEAN_LIST_SIZE) cleanlist_idx = 0; else { /* * We are removing so much entries it's just * easier to flush the whole cache. */ cleanlist_idx = PMAP_REMOVE_CLEAN_LIST_SIZE + 1; pmap_idcache_wbinv_all(pm); flushall = 1; } } pmap_free_l2_bucket(pm, l2b, mappings); } vm_page_unlock_queues(); if (flushall) cpu_tlb_flushID(); PMAP_UNLOCK(pm); } /* * pmap_zero_page() * * Zero a given physical page by mapping it at a page hook point. * In doing the zero page op, the page we zero is mapped cachable, as with * StrongARM accesses to non-cached pages are non-burst making writing * _any_ bulk data very slow. */ #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined(CPU_XSCALE_CORE3) void pmap_zero_page_generic(vm_paddr_t phys, int off, int size) { #ifdef ARM_USE_SMALL_ALLOC char *dstpg; #endif #ifdef DEBUG struct vm_page *pg = PHYS_TO_VM_PAGE(phys); if (pg->md.pvh_list != NULL) panic("pmap_zero_page: page has mappings"); #endif if (_arm_bzero && _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0) return; #ifdef ARM_USE_SMALL_ALLOC dstpg = (char *)arm_ptovirt(phys); if (off || size != PAGE_SIZE) { bzero(dstpg + off, size); cpu_dcache_wbinv_range((vm_offset_t)(dstpg + off), size); } else { bzero_page((vm_offset_t)dstpg); cpu_dcache_wbinv_range((vm_offset_t)dstpg, PAGE_SIZE); } #else mtx_lock(&cmtx); /* * Hook in the page, zero it, and purge the cache for that * zeroed page. Invalidate the TLB as needed. */ *cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); if (off || size != PAGE_SIZE) { bzero((void *)(cdstp + off), size); cpu_dcache_wbinv_range(cdstp + off, size); } else { bzero_page(cdstp); cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); } mtx_unlock(&cmtx); #endif } #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if ARM_MMU_XSCALE == 1 void pmap_zero_page_xscale(vm_paddr_t phys, int off, int size) { if (_arm_bzero && _arm_bzero((void *)(phys + off), size, IS_PHYSICAL) == 0) return; mtx_lock(&cmtx); /* * Hook in the page, zero it, and purge the cache for that * zeroed page. Invalidate the TLB as needed. */ *cdst_pte = L2_S_PROTO | phys | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); if (off || size != PAGE_SIZE) bzero((void *)(cdstp + off), size); else bzero_page(cdstp); mtx_unlock(&cmtx); xscale_cache_clean_minidata(); } /* * Change the PTEs for the specified kernel mappings such that they * will use the mini data cache instead of the main data cache. 
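 */

/*
 * A stand-alone sketch of the batching policy used by pmap_remove()
 * above: up to PMAP_REMOVE_CLEAN_LIST_SIZE (3) removed mappings are
 * cleaned and flushed page by page; past that the whole cache and TLB
 * are flushed instead, and the "was executable" flag is carried in
 * bit 0 of the recorded (page-aligned) virtual address.  The EX_*
 * names and sample values are assumptions for the illustration.
 */
#if 0	/* illustration only, not part of the pmap implementation */
#include <stdio.h>

#define	EX_CLEAN_LIST_SIZE	3

static void
example_flush_policy(unsigned int npages)
{
	if (npages <= EX_CLEAN_LIST_SIZE)
		printf("%u page(s): per-page cache clean + per-page TLB flush\n",
		    npages);
	else
		printf("%u pages: whole-cache wbinv + whole-TLB flush\n",
		    npages);
}

int
main(void)
{
	unsigned long va = 0xc0010000UL;
	unsigned long rec = va | 1;	/* bit 0 set: mapping was executable */

	example_flush_policy(2);
	example_flush_policy(8);
	printf("recorded va = 0x%lx, exec = %lu\n", rec & ~1UL, rec & 1UL);
	return (0);
}
#endif

/*
 * pmap_use_minicache() follows.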
*/ void pmap_use_minicache(vm_offset_t va, vm_size_t size) { struct l2_bucket *l2b; pt_entry_t *ptep, *sptep, pte; vm_offset_t next_bucket, eva; #if (ARM_NMMUS > 1) || defined(CPU_XSCALE_CORE3) if (xscale_use_minidata == 0) return; #endif eva = va + size; while (va < eva) { next_bucket = L2_NEXT_BUCKET(va); if (next_bucket > eva) next_bucket = eva; l2b = pmap_get_l2_bucket(pmap_kernel(), va); sptep = ptep = &l2b->l2b_kva[l2pte_index(va)]; while (va < next_bucket) { pte = *ptep; if (!l2pte_minidata(pte)) { cpu_dcache_wbinv_range(va, PAGE_SIZE); cpu_tlb_flushD_SE(va); *ptep = pte & ~L2_B; } ptep++; va += PAGE_SIZE; } PTE_SYNC_RANGE(sptep, (u_int)(ptep - sptep)); } cpu_cpwait(); } #endif /* ARM_MMU_XSCALE == 1 */ /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { pmap_zero_page_func(VM_PAGE_TO_PHYS(m), 0, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { pmap_zero_page_func(VM_PAGE_TO_PHYS(m), off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { pmap_zero_page(m); } #if 0 /* * pmap_clean_page() * * This is a local function used to work out the best strategy to clean * a single page referenced by its entry in the PV table. It's used by * pmap_copy_page, pmap_zero page and maybe some others later on. * * Its policy is effectively: * o If there are no mappings, we don't bother doing anything with the cache. * o If there is one mapping, we clean just that page. * o If there are multiple mappings, we clean the entire cache. * * So that some functions can be further optimised, it returns 0 if it didn't * clean the entire cache, or 1 if it did. * * XXX One bug in this routine is that if the pv_entry has a single page * mapped at 0x00000000 a whole cache clean will be performed rather than * just the 1 page. Since this should not occur in everyday use and if it does * it will just result in not the most efficient clean for the page. */ static int pmap_clean_page(struct pv_entry *pv, boolean_t is_src) { pmap_t pm, pm_to_clean = NULL; struct pv_entry *npv; u_int cache_needs_cleaning = 0; u_int flags = 0; vm_offset_t page_to_clean = 0; if (pv == NULL) { /* nothing mapped in so nothing to flush */ return (0); } /* * Since we flush the cache each time we change to a different * user vmspace, we only need to flush the page if it is in the * current pmap. */ if (curthread) pm = vmspace_pmap(curproc->p_vmspace); else pm = pmap_kernel(); for (npv = pv; npv; npv = TAILQ_NEXT(npv, pv_list)) { if (npv->pv_pmap == pmap_kernel() || npv->pv_pmap == pm) { flags |= npv->pv_flags; /* * The page is mapped non-cacheable in * this map. No need to flush the cache. 
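 * (A non-cacheable mapping cannot be holding dirty cache lines, so
 * finding one after a cacheable mapping of the same page has already
 * been queued for cleaning would indicate an inconsistency; that is
 * what the DIAGNOSTIC panic below checks for.)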
*/ if (npv->pv_flags & PVF_NC) { #ifdef DIAGNOSTIC if (cache_needs_cleaning) panic("pmap_clean_page: " "cache inconsistency"); #endif break; } else if (is_src && (npv->pv_flags & PVF_WRITE) == 0) continue; if (cache_needs_cleaning) { page_to_clean = 0; break; } else { page_to_clean = npv->pv_va; pm_to_clean = npv->pv_pmap; } cache_needs_cleaning = 1; } } if (page_to_clean) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_range(pm_to_clean, page_to_clean, PAGE_SIZE); else pmap_dcache_wb_range(pm_to_clean, page_to_clean, PAGE_SIZE, !is_src, (flags & PVF_WRITE) == 0); } else if (cache_needs_cleaning) { if (PV_BEEN_EXECD(flags)) pmap_idcache_wbinv_all(pm); else pmap_dcache_wbinv_all(pm); return (1); } return (0); } #endif /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ /* * pmap_copy_page() * * Copy one physical page into another, by mapping the pages into * hook points. The same comment regarding cachability as in * pmap_zero_page also applies here. */ #if (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 || defined (CPU_XSCALE_CORE3) void pmap_copy_page_generic(vm_paddr_t src, vm_paddr_t dst) { #if 0 struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #endif #ifdef DEBUG struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst); if (dst_pg->md.pvh_list != NULL) panic("pmap_copy_page: dst page has mappings"); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ #if 0 /* * XXX: Not needed while we call cpu_dcache_wbinv_all() in * pmap_copy_page(). */ (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); #endif /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. */ mtx_lock(&cmtx); *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | pte_l2_s_cache_mode; PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | pte_l2_s_cache_mode; PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); mtx_unlock(&cmtx); cpu_dcache_inv_range(csrcp, PAGE_SIZE); cpu_dcache_wbinv_range(cdstp, PAGE_SIZE); } #endif /* (ARM_MMU_GENERIC + ARM_MMU_SA1) != 0 */ #if ARM_MMU_XSCALE == 1 void pmap_copy_page_xscale(vm_paddr_t src, vm_paddr_t dst) { #if 0 /* XXX: Only needed for pmap_clean_page(), which is commented out. */ struct vm_page *src_pg = PHYS_TO_VM_PAGE(src); #endif #ifdef DEBUG struct vm_page *dst_pg = PHYS_TO_VM_PAGE(dst); if (dst_pg->md.pvh_list != NULL) panic("pmap_copy_page: dst page has mappings"); #endif /* * Clean the source page. Hold the source page's lock for * the duration of the copy so that no other mappings can * be created while we have a potentially aliased mapping. */ #if 0 /* * XXX: Not needed while we call cpu_dcache_wbinv_all() in * pmap_copy_page(). */ (void) pmap_clean_page(TAILQ_FIRST(&src_pg->md.pv_list), TRUE); #endif /* * Map the pages into the page hook points, copy them, and purge * the cache for the appropriate page. Invalidate the TLB * as required. 
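 * The "hook points" are two reserved kernel virtual addresses, csrcp
 * and cdstp, whose PTEs (csrc_pte and cdst_pte) are rewritten for each
 * copy; the cmtx mutex serializes their use.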
*/ mtx_lock(&cmtx); *csrc_pte = L2_S_PROTO | src | L2_S_PROT(PTE_KERNEL, VM_PROT_READ) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(csrc_pte); *cdst_pte = L2_S_PROTO | dst | L2_S_PROT(PTE_KERNEL, VM_PROT_WRITE) | L2_C | L2_XSCALE_T_TEX(TEX_XSCALE_X); /* mini-data */ PTE_SYNC(cdst_pte); cpu_tlb_flushD_SE(csrcp); cpu_tlb_flushD_SE(cdstp); cpu_cpwait(); bcopy_page(csrcp, cdstp); mtx_unlock(&cmtx); xscale_cache_clean_minidata(); } #endif /* ARM_MMU_XSCALE == 1 */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { #ifdef ARM_USE_SMALL_ALLOC vm_offset_t srcpg, dstpg; #endif cpu_dcache_wbinv_all(); if (_arm_memcpy && _arm_memcpy((void *)VM_PAGE_TO_PHYS(dst), (void *)VM_PAGE_TO_PHYS(src), PAGE_SIZE, IS_PHYSICAL) == 0) return; #ifdef ARM_USE_SMALL_ALLOC srcpg = arm_ptovirt(VM_PAGE_TO_PHYS(src)); dstpg = arm_ptovirt(VM_PAGE_TO_PHYS(dst)); bcopy_page(srcpg, dstpg); cpu_dcache_wbinv_range(dstpg, PAGE_SIZE); #else pmap_copy_page_func(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst)); #endif } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return (FALSE); /* * Not found, check current mappings returning immediately */ for (pv = TAILQ_FIRST(&m->md.pv_list); pv; pv = TAILQ_NEXT(pv, pv_list)) { if (pv->pv_pmap == pmap) { return (TRUE); } loops++; if (loops >= 16) break; } return (FALSE); } /* * pmap_ts_referenced: * * Return the count of reference bits for a page, clearing all of them. */ int pmap_ts_referenced(vm_page_t m) { if (m->flags & PG_FICTITIOUS) return (0); return (pmap_clearbit(m, PVF_REF)); } boolean_t pmap_is_modified(vm_page_t m) { if (m->md.pvh_attrs & PVF_MOD) return (TRUE); return(FALSE); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { if (m->md.pvh_attrs & PVF_MOD) pmap_clearbit(m, PVF_MOD); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { if (m->md.pvh_attrs & PVF_REF) pmap_clearbit(m, PVF_REF); } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { if (m->flags & PG_WRITEABLE) pmap_clearbit(m, PVF_WRITE); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { printf("pmap_mincore()\n"); return (0); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return(addr); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(size, PAGE_SIZE); GIANT_REQUIRED; va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pmap_kenter_internal(tmpva, pa, 0); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } return ((void *)(va + offset)); } #define BOOTSTRAP_DEBUG /* * pmap_map_section: * * Create a single section mapping. 
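 */

/*
 * A stand-alone sketch of the arithmetic behind pmap_mapdev() above:
 * the size is rounded up to whole pages, and the returned pointer is
 * the allocated kernel VA plus the sub-page offset of the physical
 * address, so the caller lands on the exact device byte even when
 * "pa" is not page aligned.  The EX_* names, the 4 KB page size and
 * the addresses are assumptions for the illustration.
 */
#if 0	/* illustration only, not part of the pmap implementation */
#include <stdio.h>

#define	EX_PAGE_SIZE	4096UL			/* assumed page size */
#define	EX_PAGE_MASK	(EX_PAGE_SIZE - 1)
#define	EX_ROUND_PAGE(x) (((x) + EX_PAGE_MASK) & ~EX_PAGE_MASK)

int
main(void)
{
	unsigned long pa = 0x48000030UL;	/* hypothetical register block */
	unsigned long size = 0x100UL;
	unsigned long kva = 0xd0000000UL;	/* pretend kmem allocation */
	unsigned long offset = pa & EX_PAGE_MASK;

	printf("map 0x%lx bytes of KVA, hand back 0x%lx\n",
	    EX_ROUND_PAGE(size), kva + offset);
	return (0);
}
#endif

/*
 * pmap_map_section() follows.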
*/ void pmap_map_section(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pd_entry_t fl; KASSERT(((va | pa) & L1_S_OFFSET) == 0, ("ouin2")); switch (cache) { case PTE_NOCACHE: default: fl = 0; break; case PTE_CACHE: fl = pte_l1_s_cache_mode; break; case PTE_PAGETABLE: fl = pte_l1_s_cache_mode_pt; break; } pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | fl | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); } /* * pmap_link_l2pt: * * Link the L2 page table specified by l2pv.pv_pa into the L1 * page table at the slot for "va". */ void pmap_link_l2pt(vm_offset_t l1pt, vm_offset_t va, struct pv_addr *l2pv) { pd_entry_t *pde = (pd_entry_t *) l1pt, proto; u_int slot = va >> L1_S_SHIFT; proto = L1_S_DOM(PMAP_DOMAIN_KERNEL) | L1_C_PROTO; #ifdef VERBOSE_INIT_ARM printf("pmap_link_l2pt: pa=0x%x va=0x%x\n", l2pv->pv_pa, l2pv->pv_va); #endif pde[slot + 0] = proto | (l2pv->pv_pa + 0x000); PTE_SYNC(&pde[slot]); SLIST_INSERT_HEAD(&kernel_pt_list, l2pv, pv_list); } /* * pmap_map_entry * * Create a single page mapping. */ void pmap_map_entry(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t fl; pt_entry_t *pte; KASSERT(((va | pa) & PAGE_MASK) == 0, ("ouin")); switch (cache) { case PTE_NOCACHE: default: fl = 0; break; case PTE_CACHE: fl = pte_l2_s_cache_mode; break; case PTE_PAGETABLE: fl = pte_l2_s_cache_mode_pt; break; } if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_entry: no L2 table for VA 0x%08x", va); pte = (pt_entry_t *) kernel_pt_lookup(pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("pmap_map_entry: can't find L2 table for VA 0x%08x", va); pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | fl; PTE_SYNC(&pte[l2pte_index(va)]); } /* * pmap_map_chunk: * * Map a chunk of memory using the most efficient mappings * possible (section. large page, small page) into the * provided L1 and L2 tables at the specified virtual address. */ vm_size_t pmap_map_chunk(vm_offset_t l1pt, vm_offset_t va, vm_offset_t pa, vm_size_t size, int prot, int cache) { pd_entry_t *pde = (pd_entry_t *) l1pt; pt_entry_t *pte, f1, f2s, f2l; vm_size_t resid; int i; resid = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); if (l1pt == 0) panic("pmap_map_chunk: no L1 table provided"); #ifdef VERBOSE_INIT_ARM printf("pmap_map_chunk: pa=0x%x va=0x%x size=0x%x resid=0x%x " "prot=0x%x cache=%d\n", pa, va, size, resid, prot, cache); #endif switch (cache) { case PTE_NOCACHE: default: f1 = 0; f2l = 0; f2s = 0; break; case PTE_CACHE: f1 = pte_l1_s_cache_mode; f2l = pte_l2_l_cache_mode; f2s = pte_l2_s_cache_mode; break; case PTE_PAGETABLE: f1 = pte_l1_s_cache_mode_pt; f2l = pte_l2_l_cache_mode_pt; f2s = pte_l2_s_cache_mode_pt; break; } size = resid; while (resid > 0) { /* See if we can use a section mapping. */ if (L1_S_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("S"); #endif pde[va >> L1_S_SHIFT] = L1_S_PROTO | pa | L1_S_PROT(PTE_KERNEL, prot) | f1 | L1_S_DOM(PMAP_DOMAIN_KERNEL); PTE_SYNC(&pde[va >> L1_S_SHIFT]); va += L1_S_SIZE; pa += L1_S_SIZE; resid -= L1_S_SIZE; continue; } /* * Ok, we're going to use an L2 table. Make sure * one is actually in the corresponding L1 slot * for the current VA. 
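 * The L2 table must already have been allocated and linked into the
 * L1 (see pmap_link_l2pt() above); this routine only consumes existing
 * tables and panics rather than allocating one itself.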
*/ if ((pde[va >> L1_S_SHIFT] & L1_TYPE_MASK) != L1_TYPE_C) panic("pmap_map_chunk: no L2 table for VA 0x%08x", va); pte = (pt_entry_t *) kernel_pt_lookup( pde[L1_IDX(va)] & L1_C_ADDR_MASK); if (pte == NULL) panic("pmap_map_chunk: can't find L2 table for VA" "0x%08x", va); /* See if we can use a L2 large page mapping. */ if (L2_L_MAPPABLE_P(va, pa, resid)) { #ifdef VERBOSE_INIT_ARM printf("L"); #endif for (i = 0; i < 16; i++) { pte[l2pte_index(va) + i] = L2_L_PROTO | pa | L2_L_PROT(PTE_KERNEL, prot) | f2l; PTE_SYNC(&pte[l2pte_index(va) + i]); } va += L2_L_SIZE; pa += L2_L_SIZE; resid -= L2_L_SIZE; continue; } /* Use a small page mapping. */ #ifdef VERBOSE_INIT_ARM printf("P"); #endif pte[l2pte_index(va)] = L2_S_PROTO | pa | L2_S_PROT(PTE_KERNEL, prot) | f2s; PTE_SYNC(&pte[l2pte_index(va)]); va += PAGE_SIZE; pa += PAGE_SIZE; resid -= PAGE_SIZE; } #ifdef VERBOSE_INIT_ARM printf("\n"); #endif return (size); } /********************** Static device map routines ***************************/ static const struct pmap_devmap *pmap_devmap_table; /* * Register the devmap table. This is provided in case early console * initialization needs to register mappings created by bootstrap code * before pmap_devmap_bootstrap() is called. */ void pmap_devmap_register(const struct pmap_devmap *table) { pmap_devmap_table = table; } /* * Map all of the static regions in the devmap table, and remember * the devmap table so other parts of the kernel can look up entries * later. */ void pmap_devmap_bootstrap(vm_offset_t l1pt, const struct pmap_devmap *table) { int i; pmap_devmap_table = table; for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) { #ifdef VERBOSE_INIT_ARM printf("devmap: %08x -> %08x @ %08x\n", pmap_devmap_table[i].pd_pa, pmap_devmap_table[i].pd_pa + pmap_devmap_table[i].pd_size - 1, pmap_devmap_table[i].pd_va); #endif pmap_map_chunk(l1pt, pmap_devmap_table[i].pd_va, pmap_devmap_table[i].pd_pa, pmap_devmap_table[i].pd_size, pmap_devmap_table[i].pd_prot, pmap_devmap_table[i].pd_cache); } } const struct pmap_devmap * pmap_devmap_find_pa(vm_paddr_t pa, vm_size_t size) { int i; if (pmap_devmap_table == NULL) return (NULL); for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) { if (pa >= pmap_devmap_table[i].pd_pa && pa + size <= pmap_devmap_table[i].pd_pa + pmap_devmap_table[i].pd_size) return (&pmap_devmap_table[i]); } return (NULL); } const struct pmap_devmap * pmap_devmap_find_va(vm_offset_t va, vm_size_t size) { int i; if (pmap_devmap_table == NULL) return (NULL); for (i = 0; pmap_devmap_table[i].pd_size != 0; i++) { if (va >= pmap_devmap_table[i].pd_va && va + size <= pmap_devmap_table[i].pd_va + pmap_devmap_table[i].pd_size) return (&pmap_devmap_table[i]); } return (NULL); } Index: head/sys/compat/linprocfs/linprocfs.c =================================================================== --- head/sys/compat/linprocfs/linprocfs.c (revision 169666) +++ head/sys/compat/linprocfs/linprocfs.c (revision 169667) @@ -1,1264 +1,1264 @@ /*- * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav * Copyright (c) 1999 Pierre Beyssac * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_status.c 8.4 (Berkeley) 6/15/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__i386__) || defined(__amd64__) #include #include #endif /* __i386__ || __amd64__ */ #include "opt_compat.h" #ifdef COMPAT_LINUX32 /* XXX */ #include #else #include #endif #include #include #include #include #include /* * Various conversion macros */ #define T2J(x) (((x) * 100UL) / (stathz ? stathz : hz)) /* ticks to jiffies */ #define T2S(x) ((x) / (stathz ? stathz : hz)) /* ticks to seconds */ #define B2K(x) ((x) >> 10) /* bytes to kbytes */ #define B2P(x) ((x) >> PAGE_SHIFT) /* bytes to pages */ #define P2B(x) ((x) << PAGE_SHIFT) /* pages to bytes */ #define P2K(x) ((x) << (PAGE_SHIFT - 10)) /* pages to kbytes */ /** * @brief Mapping of ki_stat in struct kinfo_proc to the linux state * * The linux procfs state field displays one of the characters RSDZTW to * denote running, sleeping in an interruptible wait, waiting in an * uninteruptible disk sleep, a zombie process, process is being traced * or stopped, or process is paging respectively. * * Our struct kinfo_proc contains the variable ki_stat which contains a * value out of SIDL, SRUN, SSLEEP, SSTOP, SZOMB, SWAIT and SLOCK. * * This character array is used with ki_stati-1 as an index and tries to * map our states to suitable linux states. */ static char linux_state[] = "RRSTZDD"; /* * Filler function for proc/meminfo */ static int linprocfs_domeminfo(PFS_FILL_ARGS) { unsigned long memtotal; /* total memory in bytes */ unsigned long memused; /* used memory in bytes */ unsigned long memfree; /* free memory in bytes */ unsigned long memshared; /* shared memory ??? */ unsigned long buffers, cached; /* buffer / cache memory ??? 
*/ unsigned long long swaptotal; /* total swap space in bytes */ unsigned long long swapused; /* used swap space in bytes */ unsigned long long swapfree; /* free swap space in bytes */ vm_object_t object; int i, j; memtotal = physmem * PAGE_SIZE; /* * The correct thing here would be: * - memfree = cnt.v_free_count * PAGE_SIZE; + memfree = VMCNT_GET(free_count) * PAGE_SIZE; memused = memtotal - memfree; * * but it might mislead linux binaries into thinking there * is very little memory left, so we cheat and tell them that * all memory that isn't wired down is free. */ - memused = cnt.v_wire_count * PAGE_SIZE; + memused = VMCNT_GET(wire_count) * PAGE_SIZE; memfree = memtotal - memused; swap_pager_status(&i, &j); swaptotal = (unsigned long long)i * PAGE_SIZE; swapused = (unsigned long long)j * PAGE_SIZE; swapfree = swaptotal - swapused; memshared = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) memshared += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); memshared *= PAGE_SIZE; /* * We'd love to be able to write: * buffers = bufspace; * * but bufspace is internal to vfs_bio.c and we don't feel * like unstaticizing it just for linprocfs's sake. */ buffers = 0; - cached = cnt.v_cache_count * PAGE_SIZE; + cached = VMCNT_GET(cache_count) * PAGE_SIZE; sbuf_printf(sb, " total: used: free: shared: buffers: cached:\n" "Mem: %lu %lu %lu %lu %lu %lu\n" "Swap: %llu %llu %llu\n" "MemTotal: %9lu kB\n" "MemFree: %9lu kB\n" "MemShared:%9lu kB\n" "Buffers: %9lu kB\n" "Cached: %9lu kB\n" "SwapTotal:%9llu kB\n" "SwapFree: %9llu kB\n", memtotal, memused, memfree, memshared, buffers, cached, swaptotal, swapused, swapfree, B2K(memtotal), B2K(memfree), B2K(memshared), B2K(buffers), B2K(cached), B2K(swaptotal), B2K(swapfree)); return (0); } #if defined(__i386__) || defined(__amd64__) /* * Filler function for proc/cpuinfo (i386 & amd64 version) */ static int linprocfs_docpuinfo(PFS_FILL_ARGS) { int hw_model[2]; char model[128]; size_t size; int class, fqmhz, fqkhz; int i; /* * We default the flags to include all non-conflicting flags, * and the Intel versions of conflicting flags. */ static char *flags[] = { "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", "sep", "sep", "mtrr", "pge", "mca", "cmov", "pat", "pse36", "pn", "b19", "b20", "b21", "mmxext", "mmx", "fxsr", "xmm", "b26", "b27", "b28", "b29", "3dnowext", "3dnow" }; switch (cpu_class) { #ifdef __i386__ case CPUCLASS_286: class = 2; break; case CPUCLASS_386: class = 3; break; case CPUCLASS_486: class = 4; break; case CPUCLASS_586: class = 5; break; case CPUCLASS_686: class = 6; break; default: class = 0; break; #else /* __amd64__ */ default: class = 15; break; #endif } hw_model[0] = CTL_HW; hw_model[1] = HW_MODEL; model[0] = '\0'; size = sizeof(model); if (kernel_sysctl(td, hw_model, 2, &model, &size, 0, 0, 0, 0) != 0) strcpy(model, "unknown"); for (i = 0; i < mp_ncpus; ++i) { sbuf_printf(sb, "processor\t: %d\n" "vendor_id\t: %.20s\n" "cpu family\t: %d\n" "model\t\t: %d\n" "model name\t: %s\n" "stepping\t: %d\n", i, cpu_vendor, class, cpu, model, cpu_id & 0xf); /* XXX per-cpu vendor / class / model / id? 
*/ } sbuf_cat(sb, "flags\t\t:"); if (!strcmp(cpu_vendor, "AuthenticAMD") && (class < 6)) { flags[16] = "fcmov"; } else if (!strcmp(cpu_vendor, "CyrixInstead")) { flags[24] = "cxmmx"; } for (i = 0; i < 32; i++) if (cpu_feature & (1 << i)) sbuf_printf(sb, " %s", flags[i]); sbuf_cat(sb, "\n"); if (class >= 5) { fqmhz = (tsc_freq + 4999) / 1000000; fqkhz = ((tsc_freq + 4999) / 10000) % 100; sbuf_printf(sb, "cpu MHz\t\t: %d.%02d\n" "bogomips\t: %d.%02d\n", fqmhz, fqkhz, fqmhz, fqkhz); } return (0); } #endif /* __i386__ || __amd64__ */ /* * Filler function for proc/mtab * * This file doesn't exist in Linux' procfs, but is included here so * users can symlink /compat/linux/etc/mtab to /proc/mtab */ static int linprocfs_domtab(PFS_FILL_ARGS) { struct nameidata nd; struct mount *mp; const char *lep; char *dlep, *flep, *mntto, *mntfrom, *fstype; size_t lep_len; int error; /* resolve symlinks etc. in the emulation tree prefix */ NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, linux_emul_path, td); flep = NULL; error = namei(&nd); VFS_UNLOCK_GIANT(NDHASGIANT(&nd)); if (error != 0 || vn_fullpath(td, nd.ni_vp, &dlep, &flep) != 0) lep = linux_emul_path; else lep = dlep; lep_len = strlen(lep); mtx_lock(&mountlist_mtx); error = 0; TAILQ_FOREACH(mp, &mountlist, mnt_list) { /* determine device name */ mntfrom = mp->mnt_stat.f_mntfromname; /* determine mount point */ mntto = mp->mnt_stat.f_mntonname; if (strncmp(mntto, lep, lep_len) == 0 && mntto[lep_len] == '/') mntto += lep_len; /* determine fs type */ fstype = mp->mnt_stat.f_fstypename; if (strcmp(fstype, pn->pn_info->pi_name) == 0) mntfrom = fstype = "proc"; else if (strcmp(fstype, "procfs") == 0) continue; if (strcmp(fstype, "linsysfs") == 0) { sbuf_printf(sb, "/sys %s sysfs %s", mntto, mp->mnt_stat.f_flags & MNT_RDONLY ? "ro" : "rw"); } else { sbuf_printf(sb, "%s %s %s %s", mntfrom, mntto, fstype, mp->mnt_stat.f_flags & MNT_RDONLY ? 
"ro" : "rw"); } #define ADD_OPTION(opt, name) \ if (mp->mnt_stat.f_flags & (opt)) sbuf_printf(sb, "," name); ADD_OPTION(MNT_SYNCHRONOUS, "sync"); ADD_OPTION(MNT_NOEXEC, "noexec"); ADD_OPTION(MNT_NOSUID, "nosuid"); ADD_OPTION(MNT_UNION, "union"); ADD_OPTION(MNT_ASYNC, "async"); ADD_OPTION(MNT_SUIDDIR, "suiddir"); ADD_OPTION(MNT_NOSYMFOLLOW, "nosymfollow"); ADD_OPTION(MNT_NOATIME, "noatime"); #undef ADD_OPTION /* a real Linux mtab will also show NFS options */ sbuf_printf(sb, " 0 0\n"); } mtx_unlock(&mountlist_mtx); if (flep != NULL) free(flep, M_TEMP); return (error); } /* * Filler function for proc/stat */ static int linprocfs_dostat(PFS_FILL_ARGS) { int i; sbuf_printf(sb, "cpu %ld %ld %ld %ld\n", T2J(cp_time[CP_USER]), T2J(cp_time[CP_NICE]), T2J(cp_time[CP_SYS] /*+ cp_time[CP_INTR]*/), T2J(cp_time[CP_IDLE])); for (i = 0; i < mp_ncpus; ++i) sbuf_printf(sb, "cpu%d %ld %ld %ld %ld\n", i, T2J(cp_time[CP_USER]) / mp_ncpus, T2J(cp_time[CP_NICE]) / mp_ncpus, T2J(cp_time[CP_SYS]) / mp_ncpus, T2J(cp_time[CP_IDLE]) / mp_ncpus); sbuf_printf(sb, "disk 0 0 0 0\n" "page %u %u\n" "swap %u %u\n" "intr %u\n" "ctxt %u\n" "btime %lld\n", - cnt.v_vnodepgsin, - cnt.v_vnodepgsout, - cnt.v_swappgsin, - cnt.v_swappgsout, - cnt.v_intr, - cnt.v_swtch, + VMCNT_GET(vnodepgsin), + VMCNT_GET(vnodepgsout), + VMCNT_GET(swappgsin), + VMCNT_GET(swappgsout), + VMCNT_GET(intr), + VMCNT_GET(swtch), (long long)boottime.tv_sec); return (0); } /* * Filler function for proc/uptime */ static int linprocfs_douptime(PFS_FILL_ARGS) { struct timeval tv; getmicrouptime(&tv); sbuf_printf(sb, "%lld.%02ld %ld.%02ld\n", (long long)tv.tv_sec, tv.tv_usec / 10000, T2S(cp_time[CP_IDLE]), T2J(cp_time[CP_IDLE]) % 100); return (0); } /* * Get OS build date */ static void linprocfs_osbuild(struct thread *td, struct sbuf *sb) { #if 0 char osbuild[256]; char *cp1, *cp2; strncpy(osbuild, version, 256); osbuild[255] = '\0'; cp1 = strstr(osbuild, "\n"); cp2 = strstr(osbuild, ":"); if (cp1 && cp2) { *cp1 = *cp2 = '\0'; cp1 = strstr(osbuild, "#"); } else cp1 = NULL; if (cp1) sbuf_printf(sb, "%s%s", cp1, cp2 + 1); else #endif sbuf_cat(sb, "#4 Sun Dec 18 04:30:00 CET 1977"); } /* * Get OS builder */ static void linprocfs_osbuilder(struct thread *td, struct sbuf *sb) { #if 0 char builder[256]; char *cp; cp = strstr(version, "\n "); if (cp) { strncpy(builder, cp + 5, 256); builder[255] = '\0'; cp = strstr(builder, ":"); if (cp) *cp = '\0'; } if (cp) sbuf_cat(sb, builder); else #endif sbuf_cat(sb, "des@freebsd.org"); } /* * Filler function for proc/version */ static int linprocfs_doversion(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s version %s (", osname, osrelease); linprocfs_osbuilder(td, sb); sbuf_cat(sb, ") (gcc version " __VERSION__ ") "); linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/loadavg */ static int linprocfs_doloadavg(PFS_FILL_ARGS) { sbuf_printf(sb, "%d.%02d %d.%02d %d.%02d %d/%d %d\n", (int)(averunnable.ldavg[0] / averunnable.fscale), (int)(averunnable.ldavg[0] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[1] / averunnable.fscale), (int)(averunnable.ldavg[1] * 100 / averunnable.fscale % 100), (int)(averunnable.ldavg[2] / averunnable.fscale), (int)(averunnable.ldavg[2] * 100 / averunnable.fscale % 100), 1, /* number of running tasks */ nprocs, /* number of tasks */ lastpid /* the last pid */ ); return (0); } /* * Filler function for proc/pid/stat */ static int 
linprocfs_doprocstat(PFS_FILL_ARGS) { struct kinfo_proc kp; char state; static int ratelimit = 0; PROC_LOCK(p); fill_kinfo_proc(p, &kp); sbuf_printf(sb, "%d", p->p_pid); #define PS_ADD(name, fmt, arg) sbuf_printf(sb, " " fmt, arg) PS_ADD("comm", "(%s)", p->p_comm); if (kp.ki_stat > sizeof(linux_state)) { state = 'R'; if (ratelimit == 0) { printf("linprocfs: don't know how to handle unknown FreeBSD state %d/%zd, mapping to R\n", kp.ki_stat, sizeof(linux_state)); ++ratelimit; } } else state = linux_state[kp.ki_stat - 1]; PS_ADD("state", "%c", state); PS_ADD("ppid", "%d", p->p_pptr ? p->p_pptr->p_pid : 0); PS_ADD("pgrp", "%d", p->p_pgid); PS_ADD("session", "%d", p->p_session->s_sid); PROC_UNLOCK(p); PS_ADD("tty", "%d", 0); /* XXX */ PS_ADD("tpgid", "%d", kp.ki_tpgid); PS_ADD("flags", "%u", 0); /* XXX */ PS_ADD("minflt", "%lu", kp.ki_rusage.ru_minflt); PS_ADD("cminflt", "%lu", kp.ki_rusage_ch.ru_minflt); PS_ADD("majflt", "%lu", kp.ki_rusage.ru_majflt); PS_ADD("cmajflt", "%lu", kp.ki_rusage_ch.ru_majflt); PS_ADD("utime", "%ld", T2J(tvtohz(&kp.ki_rusage.ru_utime))); PS_ADD("stime", "%ld", T2J(tvtohz(&kp.ki_rusage.ru_stime))); PS_ADD("cutime", "%ld", T2J(tvtohz(&kp.ki_rusage_ch.ru_utime))); PS_ADD("cstime", "%ld", T2J(tvtohz(&kp.ki_rusage_ch.ru_stime))); PS_ADD("priority", "%d", kp.ki_pri.pri_user); PS_ADD("nice", "%d", kp.ki_nice); /* 19 (nicest) to -19 */ PS_ADD("0", "%d", 0); /* removed field */ PS_ADD("itrealvalue", "%d", 0); /* XXX */ /* XXX: starttime is not right, it is the _same_ for _every_ process. It should be the number of jiffies between system boot and process start. */ PS_ADD("starttime", "%lu", T2J(tvtohz(&kp.ki_start))); PS_ADD("vsize", "%ju", P2K((uintmax_t)kp.ki_size)); PS_ADD("rss", "%ju", (uintmax_t)kp.ki_rssize); PS_ADD("rlim", "%lu", kp.ki_rusage.ru_maxrss); PS_ADD("startcode", "%u", (unsigned)0); PS_ADD("endcode", "%u", 0); /* XXX */ PS_ADD("startstack", "%u", 0); /* XXX */ PS_ADD("kstkesp", "%u", 0); /* XXX */ PS_ADD("kstkeip", "%u", 0); /* XXX */ PS_ADD("signal", "%u", 0); /* XXX */ PS_ADD("blocked", "%u", 0); /* XXX */ PS_ADD("sigignore", "%u", 0); /* XXX */ PS_ADD("sigcatch", "%u", 0); /* XXX */ PS_ADD("wchan", "%u", 0); /* XXX */ PS_ADD("nswap", "%lu", kp.ki_rusage.ru_nswap); PS_ADD("cnswap", "%lu", kp.ki_rusage_ch.ru_nswap); PS_ADD("exitsignal", "%d", 0); /* XXX */ PS_ADD("processor", "%u", kp.ki_lastcpu); PS_ADD("rt_priority", "%u", 0); /* XXX */ /* >= 2.5.19 */ PS_ADD("policy", "%u", kp.ki_pri.pri_class); /* >= 2.5.19 */ #undef PS_ADD sbuf_putc(sb, '\n'); return (0); } /* * Filler function for proc/pid/statm */ static int linprocfs_doprocstatm(PFS_FILL_ARGS) { struct kinfo_proc kp; segsz_t lsize; PROC_LOCK(p); fill_kinfo_proc(p, &kp); PROC_UNLOCK(p); /* * See comments in linprocfs_doprocstatus() regarding the * computation of lsize. 
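 * In short, lsize approximates VmLib: the total VM size converted to
 * pages, minus the data, stack and text segment sizes, less one page.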
*/ /* size resident share trs drs lrs dt */ sbuf_printf(sb, "%ju ", B2P((uintmax_t)kp.ki_size)); sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_rssize); sbuf_printf(sb, "%ju ", (uintmax_t)0); /* XXX */ sbuf_printf(sb, "%ju ", (uintmax_t)kp.ki_tsize); sbuf_printf(sb, "%ju ", (uintmax_t)(kp.ki_dsize + kp.ki_ssize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "%ju ", (uintmax_t)lsize); sbuf_printf(sb, "%ju\n", (uintmax_t)0); /* XXX */ return (0); } /* * Filler function for proc/pid/status */ static int linprocfs_doprocstatus(PFS_FILL_ARGS) { struct kinfo_proc kp; char *state; segsz_t lsize; struct thread *td2; struct sigacts *ps; int i; PROC_LOCK(p); td2 = FIRST_THREAD_IN_PROC(p); /* XXXKSE pretend only one thread */ if (P_SHOULDSTOP(p)) { state = "T (stopped)"; } else { mtx_lock_spin(&sched_lock); switch(p->p_state) { case PRS_NEW: state = "I (idle)"; break; case PRS_NORMAL: if (p->p_flag & P_WEXIT) { state = "X (exiting)"; break; } switch(td2->td_state) { case TDS_INHIBITED: state = "S (sleeping)"; break; case TDS_RUNQ: case TDS_RUNNING: state = "R (running)"; break; default: state = "? (unknown)"; break; } break; case PRS_ZOMBIE: state = "Z (zombie)"; break; default: state = "? (unknown)"; break; } mtx_unlock_spin(&sched_lock); } fill_kinfo_proc(p, &kp); sbuf_printf(sb, "Name:\t%s\n", p->p_comm); /* XXX escape */ sbuf_printf(sb, "State:\t%s\n", state); /* * Credentials */ sbuf_printf(sb, "Pid:\t%d\n", p->p_pid); sbuf_printf(sb, "PPid:\t%d\n", p->p_pptr ? p->p_pptr->p_pid : 0); sbuf_printf(sb, "Uid:\t%d %d %d %d\n", p->p_ucred->cr_ruid, p->p_ucred->cr_uid, p->p_ucred->cr_svuid, /* FreeBSD doesn't have fsuid */ p->p_ucred->cr_uid); sbuf_printf(sb, "Gid:\t%d %d %d %d\n", p->p_ucred->cr_rgid, p->p_ucred->cr_gid, p->p_ucred->cr_svgid, /* FreeBSD doesn't have fsgid */ p->p_ucred->cr_gid); sbuf_cat(sb, "Groups:\t"); for (i = 0; i < p->p_ucred->cr_ngroups; i++) sbuf_printf(sb, "%d ", p->p_ucred->cr_groups[i]); PROC_UNLOCK(p); sbuf_putc(sb, '\n'); /* * Memory * * While our approximation of VmLib may not be accurate (I * don't know of a simple way to verify it, and I'm not sure * it has much meaning anyway), I believe it's good enough. * * The same code that could (I think) accurately compute VmLib * could also compute VmLck, but I don't really care enough to * implement it. Submissions are welcome. */ sbuf_printf(sb, "VmSize:\t%8ju kB\n", B2K((uintmax_t)kp.ki_size)); sbuf_printf(sb, "VmLck:\t%8u kB\n", P2K(0)); /* XXX */ sbuf_printf(sb, "VmRss:\t%8ju kB\n", P2K((uintmax_t)kp.ki_rssize)); sbuf_printf(sb, "VmData:\t%8ju kB\n", P2K((uintmax_t)kp.ki_dsize)); sbuf_printf(sb, "VmStk:\t%8ju kB\n", P2K((uintmax_t)kp.ki_ssize)); sbuf_printf(sb, "VmExe:\t%8ju kB\n", P2K((uintmax_t)kp.ki_tsize)); lsize = B2P(kp.ki_size) - kp.ki_dsize - kp.ki_ssize - kp.ki_tsize - 1; sbuf_printf(sb, "VmLib:\t%8ju kB\n", P2K((uintmax_t)lsize)); /* * Signal masks * * We support up to 128 signals, while Linux supports 32, * but we only define 32 (the same 32 as Linux, to boot), so * just show the lower 32 bits of each mask. XXX hack. * * NB: on certain platforms (Sparc at least) Linux actually * supports 64 signals, but this code is a long way from * running on anything but i386, so ignore that for now. */ PROC_LOCK(p); sbuf_printf(sb, "SigPnd:\t%08x\n", p->p_siglist.__bits[0]); /* * I can't seem to find out where the signal mask is in * relation to struct proc, so SigBlk is left unimplemented. 
*/ sbuf_printf(sb, "SigBlk:\t%08x\n", 0); /* XXX */ ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); sbuf_printf(sb, "SigIgn:\t%08x\n", ps->ps_sigignore.__bits[0]); sbuf_printf(sb, "SigCgt:\t%08x\n", ps->ps_sigcatch.__bits[0]); mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); /* * Linux also prints the capability masks, but we don't have * capabilities yet, and when we do get them they're likely to * be meaningless to Linux programs, so we lie. XXX */ sbuf_printf(sb, "CapInh:\t%016x\n", 0); sbuf_printf(sb, "CapPrm:\t%016x\n", 0); sbuf_printf(sb, "CapEff:\t%016x\n", 0); return (0); } /* * Filler function for proc/pid/cwd */ static int linprocfs_doproccwd(PFS_FILL_ARGS) { char *fullpath = "unknown"; char *freepath = NULL; vn_fullpath(td, p->p_fd->fd_cdir, &fullpath, &freepath); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/root */ static int linprocfs_doprocroot(PFS_FILL_ARGS) { struct vnode *rvp; char *fullpath = "unknown"; char *freepath = NULL; rvp = jailed(p->p_ucred) ? p->p_fd->fd_jdir : p->p_fd->fd_rdir; vn_fullpath(td, rvp, &fullpath, &freepath); sbuf_printf(sb, "%s", fullpath); if (freepath) free(freepath, M_TEMP); return (0); } /* * Filler function for proc/pid/cmdline */ static int linprocfs_doproccmdline(PFS_FILL_ARGS) { struct ps_strings pstr; char **ps_argvstr; int error, i; /* * If we are using the ps/cmdline caching, use that. Otherwise * revert back to the old way which only implements full cmdline * for the currept process and just p->p_comm for all other * processes. * Note that if the argv is no longer available, we deliberately * don't fall back on p->p_comm or return an error: the authentic * Linux behaviour is to return zero-length in this case. */ PROC_LOCK(p); if (p->p_args && p_cansee(td, p) == 0) { sbuf_bcpy(sb, p->p_args->ar_args, p->p_args->ar_length); PROC_UNLOCK(p); } else if (p != td->td_proc) { PROC_UNLOCK(p); sbuf_printf(sb, "%.*s", MAXCOMLEN, p->p_comm); } else { PROC_UNLOCK(p); error = copyin((void *)p->p_sysent->sv_psstrings, &pstr, sizeof(pstr)); if (error) return (error); if (pstr.ps_nargvstr > ARG_MAX) return (E2BIG); ps_argvstr = malloc(pstr.ps_nargvstr * sizeof(char *), M_TEMP, M_WAITOK); error = copyin((void *)pstr.ps_argvstr, ps_argvstr, pstr.ps_nargvstr * sizeof(char *)); if (error) { free(ps_argvstr, M_TEMP); return (error); } for (i = 0; i < pstr.ps_nargvstr; i++) { sbuf_copyin(sb, ps_argvstr[i], 0); sbuf_printf(sb, "%c", '\0'); } free(ps_argvstr, M_TEMP); } return (0); } /* * Filler function for proc/pid/environ */ static int linprocfs_doprocenviron(PFS_FILL_ARGS) { sbuf_printf(sb, "doprocenviron\n%c", '\0'); return (0); } /* * Filler function for proc/pid/maps */ static int linprocfs_doprocmaps(PFS_FILL_ARGS) { char mebuffer[512]; vm_map_t map = &p->p_vmspace->vm_map; vm_map_entry_t entry, tmp_entry; vm_object_t obj, tobj, lobj; vm_offset_t saved_end; vm_ooffset_t off = 0; char *name = "", *freename = NULL; size_t len; ino_t ino; unsigned int last_timestamp; int ref_count, shadow_count, flags; int error; struct vnode *vp; struct vattr vat; int locked; PROC_LOCK(p); error = p_candebug(td, p); PROC_UNLOCK(p); if (error) return (error); if (uio->uio_rw != UIO_READ) return (EOPNOTSUPP); if (uio->uio_offset != 0) return (0); error = 0; vm_map_lock_read(map); for (entry = map->header.next; ((uio->uio_resid > 0) && (entry != &map->header)); entry = entry->next) { name = ""; freename = NULL; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) continue; saved_end = entry->end; obj = 
entry->object.vm_object; for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) { VM_OBJECT_LOCK(tobj); if (lobj != obj) VM_OBJECT_UNLOCK(lobj); lobj = tobj; } ino = 0; if (lobj) { off = IDX_TO_OFF(lobj->size); if (lobj->type == OBJT_VNODE) { vp = lobj->handle; if (vp) vref(vp); } else vp = NULL; if (lobj != obj) VM_OBJECT_UNLOCK(lobj); flags = obj->flags; ref_count = obj->ref_count; shadow_count = obj->shadow_count; VM_OBJECT_UNLOCK(obj); if (vp) { vn_fullpath(td, vp, &name, &freename); locked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_SHARED | LK_RETRY, td); VOP_GETATTR(vp, &vat, td->td_ucred, td); ino = vat.va_fileid; vput(vp); VFS_UNLOCK_GIANT(locked); } } else { flags = 0; ref_count = 0; shadow_count = 0; } /* * format: * start, end, access, offset, major, minor, inode, name. */ snprintf(mebuffer, sizeof mebuffer, "%08lx-%08lx %s%s%s%s %08lx %02x:%02x %lu%s%s\n", (u_long)entry->start, (u_long)entry->end, (entry->protection & VM_PROT_READ)?"r":"-", (entry->protection & VM_PROT_WRITE)?"w":"-", (entry->protection & VM_PROT_EXECUTE)?"x":"-", "p", (u_long)off, 0, 0, (u_long)ino, *name ? " " : "", name ); if (freename) free(freename, M_TEMP); len = strlen(mebuffer); if (len > uio->uio_resid) len = uio->uio_resid; /* * XXX We should probably return * EFBIG here, as in procfs. */ last_timestamp = map->timestamp; vm_map_unlock_read(map); error = uiomove(mebuffer, len, uio); vm_map_lock_read(map); if (error) break; if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. Specifically, * the entry may have been clipped, merged, or deleted. */ vm_map_lookup_entry(map, saved_end - 1, &tmp_entry); entry = tmp_entry; } } vm_map_unlock_read(map); return (error); } /* * Filler function for proc/net/dev */ static int linprocfs_donetdev(PFS_FILL_ARGS) { char ifname[16]; /* XXX LINUX_IFNAMSIZ */ struct ifnet *ifp; sbuf_printf(sb, "%6s|%58s|%s\n%6s|%58s|%58s\n", "Inter-", " Receive", " Transmit", " face", "bytes packets errs drop fifo frame compressed", "bytes packets errs drop fifo frame compressed"); IFNET_RLOCK(); TAILQ_FOREACH(ifp, &ifnet, if_link) { linux_ifname(ifp, ifname, sizeof ifname); sbuf_printf(sb, "%6.6s:", ifname); sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu ", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); sbuf_printf(sb, "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL, 0UL); } IFNET_RUNLOCK(); return (0); } /* * Filler function for proc/sys/kernel/osrelease */ static int linprocfs_doosrelease(PFS_FILL_ARGS) { char osrelease[LINUX_MAX_UTSNAME]; linux_get_osrelease(td, osrelease); sbuf_printf(sb, "%s\n", osrelease); return (0); } /* * Filler function for proc/sys/kernel/ostype */ static int linprocfs_doostype(PFS_FILL_ARGS) { char osname[LINUX_MAX_UTSNAME]; linux_get_osname(td, osname); sbuf_printf(sb, "%s\n", osname); return (0); } /* * Filler function for proc/sys/kernel/version */ static int linprocfs_doosbuild(PFS_FILL_ARGS) { linprocfs_osbuild(td, sb); sbuf_cat(sb, "\n"); return (0); } /* * Filler function for proc/sys/kernel/msgmni */ static int linprocfs_domsgmni(PFS_FILL_ARGS) { sbuf_printf(sb, "%d\n", msginfo.msgmni); return (0); } /* * Filler function for proc/sys/kernel/pid_max */ static int linprocfs_dopid_max(PFS_FILL_ARGS) { sbuf_printf(sb, "%i\n", PID_MAX); return (0); } /* * Filler function for proc/sys/kernel/sem */ static int linprocfs_dosem(PFS_FILL_ARGS) { sbuf_printf(sb, "%d %d %d %d\n", seminfo.semmsl, seminfo.semmns, seminfo.semopm, seminfo.semmni); 
return (0); } /* * Filler function for proc/scsi/device_info */ static int linprocfs_doscsidevinfo(PFS_FILL_ARGS) { return (0); } /* * Filler function for proc/scsi/scsi */ static int linprocfs_doscsiscsi(PFS_FILL_ARGS) { return (0); } extern struct cdevsw *cdevsw[]; /* * Filler function for proc/devices */ static int linprocfs_dodevices(PFS_FILL_ARGS) { char *char_devices; sbuf_printf(sb, "Character devices:\n"); char_devices = linux_get_char_devices(); sbuf_printf(sb, "%s", char_devices); linux_free_get_char_devices(char_devices); sbuf_printf(sb, "\nBlock devices:\n"); return (0); } /* * Filler function for proc/cmdline */ static int linprocfs_docmdline(PFS_FILL_ARGS) { sbuf_printf(sb, "BOOT_IMAGE=%s", kernelname); sbuf_printf(sb, " ro root=302\n"); return (0); } #if 0 /* * Filler function for proc/modules */ static int linprocfs_domodules(PFS_FILL_ARGS) { struct linker_file *lf; TAILQ_FOREACH(lf, &linker_files, link) { sbuf_printf(sb, "%-20s%8lu%4d\n", lf->filename, (unsigned long)lf->size, lf->refs); } return (0); } #endif /* * Constructor */ static int linprocfs_init(PFS_INIT_ARGS) { struct pfs_node *root; struct pfs_node *dir; root = pi->pi_root; /* /proc/... */ pfs_create_file(root, "cmdline", &linprocfs_docmdline, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "cpuinfo", &linprocfs_docpuinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "devices", &linprocfs_dodevices, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "loadavg", &linprocfs_doloadavg, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "meminfo", &linprocfs_domeminfo, NULL, NULL, NULL, PFS_RD); #if 0 pfs_create_file(root, "modules", &linprocfs_domodules, NULL, NULL, NULL, PFS_RD); #endif pfs_create_file(root, "mounts", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "mtab", &linprocfs_domtab, NULL, NULL, NULL, PFS_RD); pfs_create_link(root, "self", &procfs_docurproc, NULL, NULL, NULL, 0); pfs_create_file(root, "stat", &linprocfs_dostat, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "uptime", &linprocfs_douptime, NULL, NULL, NULL, PFS_RD); pfs_create_file(root, "version", &linprocfs_doversion, NULL, NULL, NULL, PFS_RD); /* /proc/net/... */ dir = pfs_create_dir(root, "net", NULL, NULL, NULL, 0); pfs_create_file(dir, "dev", &linprocfs_donetdev, NULL, NULL, NULL, PFS_RD); /* /proc//... */ dir = pfs_create_dir(root, "pid", NULL, NULL, NULL, PFS_PROCDEP); pfs_create_file(dir, "cmdline", &linprocfs_doproccmdline, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "cwd", &linprocfs_doproccwd, NULL, NULL, NULL, 0); pfs_create_file(dir, "environ", &linprocfs_doprocenviron, NULL, NULL, NULL, PFS_RD); pfs_create_link(dir, "exe", &procfs_doprocfile, NULL, &procfs_notsystem, NULL, 0); pfs_create_file(dir, "maps", &linprocfs_doprocmaps, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "mem", &procfs_doprocmem, &procfs_attr, &procfs_candebug, NULL, PFS_RDWR|PFS_RAW); pfs_create_link(dir, "root", &linprocfs_doprocroot, NULL, NULL, NULL, 0); pfs_create_file(dir, "stat", &linprocfs_doprocstat, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "statm", &linprocfs_doprocstatm, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "status", &linprocfs_doprocstatus, NULL, NULL, NULL, PFS_RD); /* /proc/scsi/... */ dir = pfs_create_dir(root, "scsi", NULL, NULL, NULL, 0); pfs_create_file(dir, "device_info", &linprocfs_doscsidevinfo, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "scsi", &linprocfs_doscsiscsi, NULL, NULL, NULL, PFS_RD); /* /proc/sys/... 
*/ dir = pfs_create_dir(root, "sys", NULL, NULL, NULL, 0); /* /proc/sys/kernel/... */ dir = pfs_create_dir(dir, "kernel", NULL, NULL, NULL, 0); pfs_create_file(dir, "osrelease", &linprocfs_doosrelease, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "ostype", &linprocfs_doostype, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "version", &linprocfs_doosbuild, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "msgmni", &linprocfs_domsgmni, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "pid_max", &linprocfs_dopid_max, NULL, NULL, NULL, PFS_RD); pfs_create_file(dir, "sem", &linprocfs_dosem, NULL, NULL, NULL, PFS_RD); return (0); } /* * Destructor */ static int linprocfs_uninit(PFS_INIT_ARGS) { /* nothing to do, pseudofs will GC */ return (0); } PSEUDOFS(linprocfs, 1); MODULE_DEPEND(linprocfs, linux, 1, 1, 1); MODULE_DEPEND(linprocfs, procfs, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvmsg, 1, 1, 1); MODULE_DEPEND(linprocfs, sysvsem, 1, 1, 1); Index: head/sys/compat/linux/linux_misc.c =================================================================== --- head/sys/compat/linux/linux_misc.c (revision 169666) +++ head/sys/compat/linux/linux_misc.c (revision 169667) @@ -1,1713 +1,1713 @@ /*- * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_mac.h" #include #include #include #if defined(__i386__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #ifdef __i386__ #include #endif #define BSD_TO_LINUX_SIGNAL(sig) \ (((sig) <= LINUX_SIGTBLSZ) ? 
bsd_to_linux_signal[_SIG_IDX(sig)] : sig) static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalbig; l_ulong freebig; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; vm_object_t object; int i, j; struct timespec ts; getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; - sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE; + sysinfo.freeram = sysinfo.totalram - VMCNT_GET(wire_count) * PAGE_SIZE; sysinfo.sharedram = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) sysinfo.sharedram += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); sysinfo.sharedram *= PAGE_SIZE; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* The following are only present in newer Linux kernels. */ sysinfo.totalbig = 0; sysinfo.freebig = 0; sysinfo.mem_unit = 1; return copyout(&sysinfo, args->info, sizeof(sysinfo)); } int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; int error; #ifdef DEBUG if (ldebug(alarm)) printf(ARGS(alarm, "%u"), args->secs); #endif if (args->secs > 100000000) return (EINVAL); it.it_value.tv_sec = (long)args->secs; it.it_value.tv_usec = 0; it.it_interval.tv_sec = 0; it.it_interval.tv_usec = 0; error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); if (error) return (error); if (timevalisset(&old_it.it_value)) { if (old_it.it_value.tv_usec != 0) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; } return (0); } int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old; struct obreak_args /* { char * nsize; } */ tmp; #ifdef DEBUG if (ldebug(brk)) printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend); #endif old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (vm_offset_t)args->dsend; tmp.nsize = (char *)new; if (((caddr_t)new > vm->vm_daddr) && !obreak(td, &tmp)) td->td_retval[0] = (long)new; else td->td_retval[0] = (long)old; return 0; } #if defined(__i386__) /* XXX: what about amd64/linux32? 
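 */

/*
 * A stand-alone sketch of how the fixed-point load averages filled in
 * by linux_sysinfo() above are meant to be consumed: they are scaled
 * by LINUX_SYSINFO_LOADS_SCALE (65536), so a userland reader divides
 * by that constant to recover the familiar floating-point values.
 * The ex_sysinfo structure and the sample numbers are assumptions for
 * the illustration, not the real l_sysinfo layout.
 */
#if 0	/* illustration only, not part of the emulation code */
#include <stdio.h>

#define	EX_LOADS_SCALE	65536.0

struct ex_sysinfo {			/* trimmed-down stand-in for l_sysinfo */
	long uptime;
	unsigned long loads[3];
};

int
main(void)
{
	struct ex_sysinfo si = { 4242, { 31130, 24576, 16384 } };

	printf("load averages: %.2f %.2f %.2f\n",
	    si.loads[0] / EX_LOADS_SCALE,
	    si.loads[1] / EX_LOADS_SCALE,
	    si.loads[2] / EX_LOADS_SCALE);
	return (0);
}
#endif

/*
 * linux_uselib() follows.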
*/ int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; vm_offset_t buffer; unsigned long bss_size; char *library; int error; int locked, vfslocked; LCONVPATHEXIST(td, args->library, &library); #ifdef DEBUG if (ldebug(uselib)) printf(ARGS(uselib, "%s"), library); #endif a_out = NULL; vfslocked = 0; locked = 0; vp = NULL; NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); if (error) goto cleanup; vp = ni.ni_vp; vfslocked = NDHASGIANT(&ni); NDFREE(&ni, NDF_ONLY_PNBUF); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). */ locked = 1; /* Writable? */ if (vp->v_writecount) { error = ETXTBSY; goto cleanup; } /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred, td); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_check_vnode_open(td->td_ucred, vp, FREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1); if (error) goto cleanup; /* Pull in executable header into kernel_map */ error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA)) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. * XXX: Note that if any of the VM operations fail below we don't * clear this flag. */ vp->v_vflag |= VV_TEXT; /* * Lock no longer needed */ locked = 0; VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). 
*/ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("uselib: Non page aligned binary %lu\n", file_offset); #endif /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; /* map file into kernel_map */ error = vm_mmap(kernel_map, &buffer, round_page(a_out->a_text + a_out->a_data + file_offset), VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, trunc_page(file_offset)); if (error) goto cleanup; /* copy from kernel VM space to user space */ error = copyout(PTRIN(buffer + file_offset), (void *)vmaddr, a_out->a_text + a_out->a_data); /* release temporary kernel space */ vm_map_remove(kernel_map, buffer, buffer + round_page(a_out->a_text + a_out->a_data + file_offset)); if (error) goto cleanup; } else { #ifdef DEBUG printf("uselib: Page aligned binary %lu\n", file_offset); #endif /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. */ error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; } #ifdef DEBUG printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0], ((long *)vmaddr)[1]); #endif if (bss_size != 0) { /* Calculate BSS start address */ vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data; /* allocate some 'anon' space */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } cleanup: /* Unlock vnode if needed */ if (locked) { VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); } /* Release the kernel mapping. */ if (a_out) vm_map_remove(kernel_map, (vm_offset_t)a_out, (vm_offset_t)a_out + PAGE_SIZE); return error; } #endif /* __i386__ */ int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; #ifdef DEBUG if (ldebug(select)) printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds, (void *)args->readfds, (void *)args->writefds, (void *)args->exceptfds, (void *)args->timeout); #endif /* * Store current time for computation of the amount of * time left. */ if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; #ifdef DEBUG if (ldebug(select)) printf(LMSG("incoming timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux. */ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp); #ifdef DEBUG if (ldebug(select)) printf(LMSG("real select returns %d"), error); #endif if (error) { /* * See fs/select.c in the Linux kernel. Without this, * Maelstrom doesn't work. 
*/ if (error == ERESTART) error = EINTR; goto select_out; } if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); #ifdef DEBUG if (ldebug(select)) printf(LMSG("outgoing timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: #ifdef DEBUG if (ldebug(select)) printf(LMSG("select_out -> %d"), error); #endif return error; } int linux_mremap(struct thread *td, struct linux_mremap_args *args) { struct munmap_args /* { void *addr; size_t len; } */ bsd_args; int error = 0; #ifdef DEBUG if (ldebug(mremap)) printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"), (void *)(uintptr_t)args->addr, (unsigned long)args->old_len, (unsigned long)args->new_len, (unsigned long)args->flags); #endif args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return ENOMEM; } if (args->new_len < args->old_len) { bsd_args.addr = (caddr_t)((uintptr_t)args->addr + args->new_len); bsd_args.len = args->old_len - args->new_len; error = munmap(td, &bsd_args); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return error; } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { struct msync_args bsd_args; bsd_args.addr = (caddr_t)(uintptr_t)args->addr; bsd_args.len = (uintptr_t)args->len; bsd_args.flags = args->fl & ~LINUX_MS_SYNC; return msync(td, &bsd_args); } int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; #ifdef DEBUG if (ldebug(time)) printf(ARGS(time, "*")); #endif microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return error; td->td_retval[0] = tm; return 0; } struct l_times_argv { l_long tms_utime; l_long tms_stime; l_long tms_cutime; l_long tms_cstime; }; #define CLK_TCK 100 /* Linux uses 100 */ #define CONVTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; #ifdef DEBUG if (ldebug(times)) printf(ARGS(times, "*")); #endif if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); calcru(p, &utime, &stime); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return error; } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return 0; } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; #ifdef DEBUG if (ldebug(newuname)) printf(ARGS(newuname, "*")); #endif linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); 
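The CONVTCK() macro defined just above in linux_times() converts a struct timeval into Linux clock ticks at CLK_TCK = 100, the tick rate a Linux binary expects from times(2). A minimal standalone sketch of the same arithmetic, assuming nothing beyond libc; the LINUX_-prefixed names and the sample values are illustrative, not part of the source:

#include <stdio.h>
#include <sys/time.h>

/* Mirrors CLK_TCK and CONVTCK() from linux_times() above. */
#define LINUX_CLK_TCK           100
#define LINUX_CONVTCK(r)        ((r).tv_sec * LINUX_CLK_TCK + \
                                 (r).tv_usec / (1000000 / LINUX_CLK_TCK))

int
main(void)
{
        struct timeval tv;

        /* 2.5 seconds of CPU time becomes 250 ticks at 100 ticks/second. */
        tv.tv_sec = 2;
        tv.tv_usec = 500000;
        printf("%ld ticks\n", (long)LINUX_CONVTCK(tv));
        return (0);
}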
strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } #ifdef __i386__ { const char *class; switch (cpu_class) { case CPUCLASS_686: class = "i686"; break; case CPUCLASS_586: class = "i586"; break; case CPUCLASS_486: class = "i486"; break; default: class = "i386"; } strlcpy(utsname.machine, class, LINUX_MAX_UTSNAME); } #elif defined(__amd64__) /* XXX: Linux can change 'personality'. */ #ifdef COMPAT_LINUX32 strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME); #else strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME); #endif /* COMPAT_LINUX32 */ #else /* something other than i386 or amd64 - assume we and Linux agree */ strlcpy(utsname.machine, machine, LINUX_MAX_UTSNAME); #endif /* __i386__ */ strlcpy(utsname.domainname, domainname, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utime)) printf(ARGS(utime, "%s, *"), fname); #endif if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut))) { LFREEPATH(fname); return error; } tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utimes)) printf(ARGS(utimes, "%s, *"), fname); #endif if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ #define __WCLONE 0x80000000 int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { int error, options, tmpstat; #ifdef DEBUG if (ldebug(waitpid)) printf(ARGS(waitpid, "%d, %p, %d"), args->pid, (void *)args->status, args->options); #endif /* * this is necessary because the test in kern_wait doesn't work * because we mess with the options here */ if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE)) return (EINVAL); options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; error = kern_wait(td, args->pid, &tmpstat, options, NULL); if (error) return error; if (args->status) { tmpstat &= 0xffff; if (WIFSIGNALED(tmpstat)) tmpstat = (tmpstat & 0xffffff80) | BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat)); else if (WIFSTOPPED(tmpstat)) tmpstat = (tmpstat & 0xffff00ff) | (BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8); return copyout(&tmpstat, args->status, sizeof(int)); } return 0; } int linux_wait4(struct thread *td, struct linux_wait4_args *args) { int error, options, tmpstat; struct rusage ru, *rup; struct proc *p; #ifdef DEBUG if (ldebug(wait4)) 
printf(ARGS(wait4, "%d, %p, %d, %p"), args->pid, (void *)args->status, args->options, (void *)args->rusage); #endif options = (args->options & (WNOHANG | WUNTRACED)); /* WLINUXCLONE should be equal to __WCLONE, but we make sure */ if (args->options & __WCLONE) options |= WLINUXCLONE; if (args->rusage != NULL) rup = &ru; else rup = NULL; error = kern_wait(td, args->pid, &tmpstat, options, rup); if (error) return error; p = td->td_proc; PROC_LOCK(p); sigqueue_delete(&p->p_sigqueue, SIGCHLD); PROC_UNLOCK(p); if (args->status) { tmpstat &= 0xffff; if (WIFSIGNALED(tmpstat)) tmpstat = (tmpstat & 0xffffff80) | BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat)); else if (WIFSTOPPED(tmpstat)) tmpstat = (tmpstat & 0xffff00ff) | (BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8); error = copyout(&tmpstat, args->status, sizeof(int)); } if (args->rusage != NULL && error == 0) error = copyout(&ru, args->rusage, sizeof(ru)); return (error); } int linux_mknod(struct thread *td, struct linux_mknod_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mknod)) printf(ARGS(mknod, "%s, %d, %d"), path, args->mode, args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifo(td, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknod(td, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_open(td, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! */ int linux_personality(struct thread *td, struct linux_personality_args *args) { #ifdef DEBUG if (ldebug(personality)) printf(ARGS(personality, "%lu"), (unsigned long)args->per); #endif if (args->per != 0) return EINVAL; /* Yes Jim, it's still a Linux... 
*/ td->td_retval[0] = 0; return 0; } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; #ifdef DEBUG if (ldebug(setitimer)) printf(ARGS(setitimer, "%p, %p"), (void *)uap->itv, (void *)uap->oitv); #endif if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); #ifdef DEBUG if (ldebug(setitimer)) { printf("setitimer: value: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec); printf("setitimer: interval: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec); } #endif error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; #ifdef DEBUG if (ldebug(getitimer)) printf(ARGS(getitimer, "%p"), (void *)uap->itv); #endif error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } int linux_nice(struct thread *td, struct linux_nice_args *args) { struct setpriority_args bsd_args; bsd_args.which = PRIO_PROCESS; bsd_args.who = 0; /* current process */ bsd_args.prio = args->inc; return setpriority(td, &bsd_args); } int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t linux_gidset[NGROUPS]; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= NGROUPS) return (EINVAL); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) return (error); newcred = crget(); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, SUSER_ALLOWJAIL)) != 0) { PROC_UNLOCK(p); crfree(newcred); return (error); } crcopy(newcred, oldcred); if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); return (0); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t linux_gidset[NGROUPS]; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. 
*/ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } if ((error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)))) return (error); td->td_retval[0] = ngrp; return (0); } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; #ifdef DEBUG if (ldebug(setrlimit)) printf(ARGS(setrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(old_getrlimit)) printf(ARGS(old_getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct proc *p = td->td_proc; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(getrlimit)) printf(ARGS(getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); PROC_LOCK(p); lim_rlimit(p, which, &bsd_rlim); PROC_UNLOCK(p); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { struct sched_setscheduler_args bsd; #ifdef DEBUG if (ldebug(sched_setscheduler)) printf(ARGS(sched_setscheduler, "%d, %d, %p"), args->pid, args->policy, (const void *)args->param); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return EINVAL; } bsd.pid = args->pid; bsd.param = (struct sched_param *)args->param; return sched_setscheduler(td, &bsd); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { struct sched_getscheduler_args bsd; int error; #ifdef DEBUG if (ldebug(sched_getscheduler)) printf(ARGS(sched_getscheduler, "%d"), args->pid); #endif bsd.pid = args->pid; error = sched_getscheduler(td, &bsd); switch (td->td_retval[0]) { case SCHED_OTHER: 
td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } return error; } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_max)) printf(ARGS(sched_get_priority_max, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return EINVAL; } return sched_get_priority_max(td, &bsd); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_min)) printf(ARGS(sched_get_priority_min, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return EINVAL; } return sched_get_priority_min(td, &bsd); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; #ifdef DEBUG if (ldebug(reboot)) printf(ARGS(reboot, "0x%x"), args->cmd); #endif if (args->magic1 != REBOOT_MAGIC1) return EINVAL; switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return EINVAL; } switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return EINVAL; } return reboot(td, &bsd_args); } /* * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that * are assumed to be preserved. The following lightweight syscalls fixes * this. 
See also linux_getgid16() and linux_getuid16() in linux_uid16.c * * linux_getpid() - MP SAFE * linux_getgid() - MP SAFE * linux_getuid() - MP SAFE */ int linux_getpid(struct thread *td, struct linux_getpid_args *args) { struct linux_emuldata *em; #ifdef DEBUG if (ldebug(getpid)) printf(ARGS(getpid, "")); #endif if (linux_use26(td)) { em = em_find(td->td_proc, EMUL_DONTLOCK); KASSERT(em != NULL, ("getpid: emuldata not found.\n")); td->td_retval[0] = em->shared->group_pid; } else { td->td_retval[0] = td->td_proc->p_pid; } return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { #ifdef DEBUG if (ldebug(gettid)) printf(ARGS(gettid, "")); #endif td->td_retval[0] = td->td_proc->p_pid; return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { struct linux_emuldata *em; struct proc *p, *pp; #ifdef DEBUG if (ldebug(getppid)) printf(ARGS(getppid, "")); #endif if (!linux_use26(td)) { PROC_LOCK(td->td_proc); td->td_retval[0] = td->td_proc->p_pptr->p_pid; PROC_UNLOCK(td->td_proc); return (0); } em = em_find(td->td_proc, EMUL_DONTLOCK); KASSERT(em != NULL, ("getppid: process emuldata not found.\n")); /* find the group leader */ p = pfind(em->shared->group_pid); if (p == NULL) { #ifdef DEBUG printf(LMSG("parent process not found.\n")); #endif return (0); } pp = p->p_pptr; /* switch to parent */ PROC_LOCK(pp); PROC_UNLOCK(p); /* if its also linux process */ if (pp->p_sysent == &elf_linux_sysvec) { em = em_find(pp, EMUL_DONTLOCK); KASSERT(em != NULL, ("getppid: parent emuldata not found.\n")); td->td_retval[0] = em->shared->group_pid; } else td->td_retval[0] = pp->p_pid; PROC_UNLOCK(pp); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { #ifdef DEBUG if (ldebug(getgid)) printf(ARGS(getgid, "")); #endif td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { #ifdef DEBUG if (ldebug(getuid)) printf(ARGS(getuid, "")); #endif td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { struct getsid_args bsd; #ifdef DEBUG if (ldebug(getsid)) printf(ARGS(getsid, "%i"), args->pid); #endif bsd.pid = args->pid; return getsid(td, &bsd); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { struct getpriority_args bsd_args; int error; #ifdef DEBUG if (ldebug(getpriority)) printf(ARGS(getpriority, "%i, %i"), args->which, args->who); #endif bsd_args.which = args->which; bsd_args.who = args->who; error = getpriority(td, &bsd_args); td->td_retval[0] = 20 - td->td_retval[0]; return error; } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(sethostname)) printf(ARGS(sethostname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { struct linux_emuldata *em, *td_em, *tmp_em; struct proc *sp; #ifdef DEBUG if (ldebug(exit_group)) printf(ARGS(exit_group, "%i"), args->error_code); #endif if (linux_use26(td)) { td_em = em_find(td->td_proc, EMUL_DONTLOCK); KASSERT(td_em != NULL, ("exit_group: emuldata not found.\n")); EMUL_SHARED_RLOCK(&emul_shared_lock); LIST_FOREACH_SAFE(em, &td_em->shared->threads, threads, tmp_em) { if (em->pid == td_em->pid) continue; 
sp = pfind(em->pid); psignal(sp, SIGKILL); PROC_UNLOCK(sp); #ifdef DEBUG printf(LMSG("linux_sys_exit_group: kill PID %d\n"), em->pid); #endif } EMUL_SHARED_RUNLOCK(&emul_shared_lock); } /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, W_EXITCODE(args->error_code, 0)); return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; struct linux_emuldata *em; int pdeath_signal; #ifdef DEBUG if (ldebug(prctl)) printf(ARGS(prctl, "%d, %d, %d, %d, %d"), args->option, args->arg2, args->arg3, args->arg4, args->arg5); #endif switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); em = em_find(p, EMUL_DOLOCK); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); em->pdeath_signal = args->arg2; EMUL_UNLOCK(&emul_lock); break; case LINUX_PR_GET_PDEATHSIG: em = em_find(p, EMUL_DOLOCK); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); pdeath_signal = em->pdeath_signal; EMUL_UNLOCK(&emul_lock); error = copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal)); break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; default: error = EINVAL; break; } return (error); } Index: head/sys/compat/svr4/svr4_misc.c =================================================================== --- head/sys/compat/svr4/svr4_misc.c (revision 169666) +++ head/sys/compat/svr4/svr4_misc.c (revision 169667) @@ -1,1627 +1,1627 @@ /*- * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * SVR4 compatibility module. * * SVR4 system calls that are implemented differently in BSD are * handled here. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #include #endif #if defined(NetBSD) # if defined(UVM) # include # endif #endif #define BSD_DIRENT(cp) ((struct dirent *)(cp)) static int svr4_mknod(struct thread *, register_t *, char *, svr4_mode_t, svr4_dev_t); static __inline clock_t timeval_to_clock_t(struct timeval *); static int svr4_setinfo (pid_t , struct rusage *, int, svr4_siginfo_t *); struct svr4_hrtcntl_args; static int svr4_hrtcntl (struct thread *, struct svr4_hrtcntl_args *, register_t *); static void bsd_statfs_to_svr4_statvfs(const struct statfs *, struct svr4_statvfs *); static void bsd_statfs_to_svr4_statvfs64(const struct statfs *, struct svr4_statvfs64 *); static struct proc *svr4_pfind(pid_t pid); /* BOGUS noop */ #if defined(BOGUS) int svr4_sys_setitimer(td, uap) register struct thread *td; struct svr4_sys_setitimer_args *uap; { td->td_retval[0] = 0; return 0; } #endif int svr4_sys_wait(td, uap) struct thread *td; struct svr4_sys_wait_args *uap; { int error, st, sig; error = kern_wait(td, WAIT_ANY, &st, 0, NULL); if (error) return (error); if (WIFSIGNALED(st)) { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig); } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8); } /* * It looks like wait(2) on svr4/solaris/2.4 returns * the status in retval[1], and the pid on retval[0]. 
*/ td->td_retval[1] = st; if (uap->status) error = copyout(&st, uap->status, sizeof(st)); return (error); } int svr4_sys_execv(td, uap) struct thread *td; struct svr4_sys_execv_args *uap; { struct image_args eargs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); return (error); } int svr4_sys_execve(td, uap) struct thread *td; struct svr4_sys_execve_args *uap; { struct image_args eargs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, uap->envp); free(path, M_TEMP); if (error == 0) error = kern_execve(td, &eargs, NULL); return (error); } int svr4_sys_time(td, v) struct thread *td; struct svr4_sys_time_args *v; { struct svr4_sys_time_args *uap = v; int error = 0; struct timeval tv; microtime(&tv); if (uap->t) error = copyout(&tv.tv_sec, uap->t, sizeof(*(uap->t))); td->td_retval[0] = (int) tv.tv_sec; return error; } /* * Read SVR4-style directory entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. * * This code is ported from the Linux emulator: Changes to the VFS interface * between FreeBSD and NetBSD have made it simpler to port it from there than * to adapt the NetBSD version. */ int svr4_sys_getdents64(td, uap) struct thread *td; struct svr4_sys_getdents64_args *uap; { register struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4reclen=0; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; off_t off; struct svr4_dirent64 svr4_dirent; int buflen, error, eofflag, nbytes, justone, vfslocked; u_long *cookies = NULL, *cookiep; int ncookies; DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n", uap->fd, uap->nbytes)); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) { return (error); } if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } nbytes = uap->nbytes; if (nbytes == 1) { nbytes = sizeof (struct svr4_dirent64); justone = 1; } else justone = 0; off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error) goto out; #endif error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies); if (error) { goto out; } inp = buf; outp = (caddr_t) uap->dp; resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) { goto eof; } cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). 
*/ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { DPRINTF(("svr4_readdir: reclen=%d\n", reclen)); error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen); if (reclen > len || resid < svr4reclen) { outp++; break; } svr4_dirent.d_ino = (long) bdp->d_fileno; if (justone) { /* * old svr4-style readdir usage. */ svr4_dirent.d_off = (svr4_off_t) svr4reclen; svr4_dirent.d_reclen = (u_short) bdp->d_namlen; } else { svr4_dirent.d_off = (svr4_off_t)(off + reclen); svr4_dirent.d_reclen = (u_short) svr4reclen; } strcpy(svr4_dirent.d_name, bdp->d_name); if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen))) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += svr4reclen; resid -= svr4reclen; len -= reclen; if (justone) break; } if (outp == (caddr_t) uap->dp) goto again; fp->f_offset = off; if (justone) nbytes = resid + svr4reclen; eof: td->td_retval[0] = nbytes - resid; out: VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); if (cookies) free(cookies, M_TEMP); free(buf, M_TEMP); return error; } int svr4_sys_getdents(td, uap) struct thread *td; struct svr4_sys_getdents_args *uap; { struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4_reclen; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; struct svr4_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag, vfslocked; u_long *cookiebuf = NULL, *cookie; int ncookies = 0, *retval = td->td_retval; if (uap->nbytes < 0) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0) return (error); if ((fp->f_flag & FREAD) == 0) { fdrop(fp, td); return (EBADF); } vp = fp->f_vnode; vfslocked = VFS_LOCK_GIANT(vp->v_mount); if (vp->v_type != VDIR) { VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); return (EINVAL); } buflen = min(MAXBSIZE, uap->nbytes); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); off = fp->f_offset; again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; #ifdef MAC error = mac_check_vnode_readdir(td->td_ucred, vp); if (error) goto out; #endif /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. 
*/ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookiebuf); if (error) { goto out; } inp = buf; outp = uap->buf; resid = uap->nbytes; if ((len = buflen - auio.uio_resid) == 0) goto eof; for (cookie = cookiebuf; len > 0; len -= reclen) { bdp = (struct dirent *)inp; reclen = bdp->d_reclen; if (reclen & 3) panic("svr4_sys_getdents64: bad reclen"); off = *cookie++; /* each entry points to the next */ if ((off >> 32) != 0) { uprintf("svr4_sys_getdents64: dir offset too large for emulated program"); error = EINVAL; goto out; } if (bdp->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ continue; } svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen); if (reclen > len || resid < svr4_reclen) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a SVR4-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). */ idb.d_ino = (svr4_ino_t)bdp->d_fileno; idb.d_off = (svr4_off_t)off; idb.d_reclen = (u_short)svr4_reclen; strcpy(idb.d_name, bdp->d_name); if ((error = copyout((caddr_t)&idb, outp, svr4_reclen))) goto out; /* advance past this real entry */ inp += reclen; /* advance output past SVR4-shaped entry */ outp += svr4_reclen; resid -= svr4_reclen; } /* if we squished out the whole block, try again */ if (outp == uap->buf) goto again; fp->f_offset = off; /* update the vnode offset */ eof: *retval = uap->nbytes - resid; out: VOP_UNLOCK(vp, 0, td); VFS_UNLOCK_GIANT(vfslocked); fdrop(fp, td); if (cookiebuf) free(cookiebuf, M_TEMP); free(buf, M_TEMP); return error; } int svr4_sys_mmap(td, uap) struct thread *td; struct svr4_sys_mmap_args *uap; { struct mmap_args mm; int *retval; retval = td->td_retval; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? */ if (uap->len == 0) return EINVAL; mm.prot = uap->prot; mm.len = uap->len; mm.flags = uap->flags & ~_MAP_NEW; mm.fd = uap->fd; mm.addr = uap->addr; mm.pos = uap->pos; return mmap(td, &mm); } int svr4_sys_mmap64(td, uap) struct thread *td; struct svr4_sys_mmap64_args *uap; { struct mmap_args mm; void *rp; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? 
*/ if (uap->len == 0) return EINVAL; mm.prot = uap->prot; mm.len = uap->len; mm.flags = uap->flags & ~_MAP_NEW; mm.fd = uap->fd; mm.addr = uap->addr; mm.pos = uap->pos; rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz)); if ((mm.flags & MAP_FIXED) == 0 && mm.addr != 0 && (void *)mm.addr < rp) mm.addr = rp; return mmap(td, &mm); } int svr4_sys_fchroot(td, uap) struct thread *td; struct svr4_sys_fchroot_args *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp; struct file *fp; int error, vfslocked; if ((error = priv_check_cred(td->td_ucred, PRIV_VFS_FCHROOT, SUSER_ALLOWJAIL)) != 0) return error; if ((error = getvnode(fdp, uap->fd, &fp)) != 0) return error; vp = fp->f_vnode; VREF(vp); fdrop(fp, td); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = change_dir(vp, td); if (error) goto fail; #ifdef MAC error = mac_check_vnode_chroot(td->td_ucred, vp); if (error) goto fail; #endif VOP_UNLOCK(vp, 0, td); error = change_root(vp, td); vrele(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); fail: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } static int svr4_mknod(td, retval, path, mode, dev) struct thread *td; register_t *retval; char *path; svr4_mode_t mode; svr4_dev_t dev; { char *newpath; int error; CHECKALTEXIST(td, path, &newpath); if (S_ISFIFO(mode)) error = kern_mkfifo(td, newpath, UIO_SYSSPACE, mode); else error = kern_mknod(td, newpath, UIO_SYSSPACE, mode, dev); free(newpath, M_TEMP); return (error); } int svr4_sys_mknod(td, uap) register struct thread *td; struct svr4_sys_mknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, uap->path, uap->mode, (svr4_dev_t)svr4_to_bsd_odev_t(uap->dev)); } int svr4_sys_xmknod(td, uap) struct thread *td; struct svr4_sys_xmknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, uap->path, uap->mode, (svr4_dev_t)svr4_to_bsd_dev_t(uap->dev)); } int svr4_sys_vhangup(td, uap) struct thread *td; struct svr4_sys_vhangup_args *uap; { return 0; } int svr4_sys_sysconfig(td, uap) struct thread *td; struct svr4_sys_sysconfig_args *uap; { int *retval; retval = &(td->td_retval[0]); switch (uap->name) { case SVR4_CONFIG_UNUSED: *retval = 0; break; case SVR4_CONFIG_NGROUPS: *retval = NGROUPS_MAX; break; case SVR4_CONFIG_CHILD_MAX: *retval = maxproc; break; case SVR4_CONFIG_OPEN_FILES: *retval = maxfiles; break; case SVR4_CONFIG_POSIX_VER: *retval = 198808; break; case SVR4_CONFIG_PAGESIZE: *retval = PAGE_SIZE; break; case SVR4_CONFIG_CLK_TCK: *retval = 60; /* should this be `hz', ie. 100? */ break; case SVR4_CONFIG_XOPEN_VER: *retval = 2; /* XXX: What should that be? */ break; case SVR4_CONFIG_PROF_TCK: *retval = 60; /* XXX: What should that be? 
*/ break; case SVR4_CONFIG_NPROC_CONF: *retval = 1; /* Only one processor for now */ break; case SVR4_CONFIG_NPROC_ONLN: *retval = 1; /* And it better be online */ break; case SVR4_CONFIG_AIO_LISTIO_MAX: case SVR4_CONFIG_AIO_MAX: case SVR4_CONFIG_AIO_PRIO_DELTA_MAX: *retval = 0; /* No aio support */ break; case SVR4_CONFIG_DELAYTIMER_MAX: *retval = 0; /* No delaytimer support */ break; case SVR4_CONFIG_MQ_OPEN_MAX: *retval = msginfo.msgmni; break; case SVR4_CONFIG_MQ_PRIO_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_RTSIG_MAX: *retval = 0; break; case SVR4_CONFIG_SEM_NSEMS_MAX: *retval = seminfo.semmni; break; case SVR4_CONFIG_SEM_VALUE_MAX: *retval = seminfo.semvmx; break; case SVR4_CONFIG_SIGQUEUE_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_SIGRT_MIN: case SVR4_CONFIG_SIGRT_MAX: *retval = 0; /* No real time signals */ break; case SVR4_CONFIG_TIMER_MAX: *retval = 3; /* XXX: real, virtual, profiling */ break; #if defined(NOTYET) case SVR4_CONFIG_PHYS_PAGES: #if defined(UVM) *retval = uvmexp.free; /* XXX: free instead of total */ #else - *retval = cnt.v_free_count; /* XXX: free instead of total */ + *retval = VMCNT_GET(free_count); /* XXX: free instead of total */ #endif break; case SVR4_CONFIG_AVPHYS_PAGES: #if defined(UVM) *retval = uvmexp.active; /* XXX: active instead of avg */ #else - *retval = cnt.v_active_count; /* XXX: active instead of avg */ + *retval = VMCNT_GET(active_count); /* XXX: active instead of avg */ #endif break; #endif /* NOTYET */ default: return EINVAL; } return 0; } /* ARGSUSED */ int svr4_sys_break(td, uap) struct thread *td; struct svr4_sys_break_args *uap; { struct obreak_args ap; ap.nsize = uap->nsize; return (obreak(td, &ap)); } static __inline clock_t timeval_to_clock_t(tv) struct timeval *tv; { return tv->tv_sec * hz + tv->tv_usec / (1000000 / hz); } int svr4_sys_times(td, uap) struct thread *td; struct svr4_sys_times_args *uap; { struct timeval tv, utime, stime, cutime, cstime; struct tms tms; struct proc *p; int error; p = td->td_proc; PROC_LOCK(p); calcru(p, &utime, &stime); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = timeval_to_clock_t(&utime); tms.tms_stime = timeval_to_clock_t(&stime); tms.tms_cutime = timeval_to_clock_t(&cutime); tms.tms_cstime = timeval_to_clock_t(&cstime); error = copyout(&tms, uap->tp, sizeof(tms)); if (error) return (error); microtime(&tv); td->td_retval[0] = (int)timeval_to_clock_t(&tv); return (0); } int svr4_sys_ulimit(td, uap) struct thread *td; struct svr4_sys_ulimit_args *uap; { int *retval = td->td_retval; int error; switch (uap->cmd) { case SVR4_GFILLIM: PROC_LOCK(td->td_proc); *retval = lim_cur(td->td_proc, RLIMIT_FSIZE) / 512; PROC_UNLOCK(td->td_proc); if (*retval == -1) *retval = 0x7fffffff; return 0; case SVR4_SFILLIM: { struct rlimit krl; krl.rlim_cur = uap->newlimit * 512; PROC_LOCK(td->td_proc); krl.rlim_max = lim_max(td->td_proc, RLIMIT_FSIZE); PROC_UNLOCK(td->td_proc); error = kern_setrlimit(td, RLIMIT_FSIZE, &krl); if (error) return error; PROC_LOCK(td->td_proc); *retval = lim_cur(td->td_proc, RLIMIT_FSIZE); PROC_UNLOCK(td->td_proc); if (*retval == -1) *retval = 0x7fffffff; return 0; } case SVR4_GMEMLIM: { struct vmspace *vm = td->td_proc->p_vmspace; register_t r; PROC_LOCK(td->td_proc); r = lim_cur(td->td_proc, RLIMIT_DATA); PROC_UNLOCK(td->td_proc); if (r == -1) r = 0x7fffffff; mtx_lock(&Giant); /* XXX */ r += (long) vm->vm_daddr; mtx_unlock(&Giant); if (r < 0) r = 0x7fffffff; *retval = r; return 0; } case SVR4_GDESLIM: PROC_LOCK(td->td_proc); *retval = 
lim_cur(td->td_proc, RLIMIT_NOFILE); PROC_UNLOCK(td->td_proc); if (*retval == -1) *retval = 0x7fffffff; return 0; default: return EINVAL; } } static struct proc * svr4_pfind(pid) pid_t pid; { struct proc *p; /* look in the live processes */ if ((p = pfind(pid)) == NULL) /* look in the zombies */ p = zpfind(pid); return p; } int svr4_sys_pgrpsys(td, uap) struct thread *td; struct svr4_sys_pgrpsys_args *uap; { int *retval = td->td_retval; struct proc *p = td->td_proc; switch (uap->cmd) { case 1: /* setpgrp() */ /* * SVR4 setpgrp() (which takes no arguments) has the * semantics that the session ID is also created anew, so * in almost every sense, setpgrp() is identical to * setsid() for SVR4. (Under BSD, the difference is that * a setpgid(0,0) will not create a new session.) */ setsid(td, NULL); /*FALLTHROUGH*/ case 0: /* getpgrp() */ PROC_LOCK(p); *retval = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 2: /* getsid(pid) */ if (uap->pid == 0) PROC_LOCK(p); else if ((p = svr4_pfind(uap->pid)) == NULL) return ESRCH; /* * This has already been initialized to the pid of * the session leader. */ *retval = (register_t) p->p_session->s_sid; PROC_UNLOCK(p); return 0; case 3: /* setsid() */ return setsid(td, NULL); case 4: /* getpgid(pid) */ if (uap->pid == 0) PROC_LOCK(p); else if ((p = svr4_pfind(uap->pid)) == NULL) return ESRCH; *retval = (int) p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 5: /* setpgid(pid, pgid); */ { struct setpgid_args sa; sa.pid = uap->pid; sa.pgid = uap->pgid; return setpgid(td, &sa); } default: return EINVAL; } } struct svr4_hrtcntl_args { int cmd; int fun; int clk; svr4_hrt_interval_t * iv; svr4_hrt_time_t * ti; }; static int svr4_hrtcntl(td, uap, retval) struct thread *td; struct svr4_hrtcntl_args *uap; register_t *retval; { switch (uap->fun) { case SVR4_HRT_CNTL_RES: DPRINTF(("htrcntl(RES)\n")); *retval = SVR4_HRT_USEC; return 0; case SVR4_HRT_CNTL_TOFD: DPRINTF(("htrcntl(TOFD)\n")); { struct timeval tv; svr4_hrt_time_t t; if (uap->clk != SVR4_HRT_CLK_STD) { DPRINTF(("clk == %d\n", uap->clk)); return EINVAL; } if (uap->ti == NULL) { DPRINTF(("ti NULL\n")); return EINVAL; } microtime(&tv); t.h_sec = tv.tv_sec; t.h_rem = tv.tv_usec; t.h_res = SVR4_HRT_USEC; return copyout(&t, uap->ti, sizeof(t)); } case SVR4_HRT_CNTL_START: DPRINTF(("htrcntl(START)\n")); return ENOSYS; case SVR4_HRT_CNTL_GET: DPRINTF(("htrcntl(GET)\n")); return ENOSYS; default: DPRINTF(("Bad htrcntl command %d\n", uap->fun)); return ENOSYS; } } int svr4_sys_hrtsys(td, uap) struct thread *td; struct svr4_sys_hrtsys_args *uap; { int *retval = td->td_retval; switch (uap->cmd) { case SVR4_HRT_CNTL: return svr4_hrtcntl(td, (struct svr4_hrtcntl_args *) uap, retval); case SVR4_HRT_ALRM: DPRINTF(("hrtalarm\n")); return ENOSYS; case SVR4_HRT_SLP: DPRINTF(("hrtsleep\n")); return ENOSYS; case SVR4_HRT_CAN: DPRINTF(("hrtcancel\n")); return ENOSYS; default: DPRINTF(("Bad hrtsys command %d\n", uap->cmd)); return EINVAL; } } static int svr4_setinfo(pid, ru, st, s) pid_t pid; struct rusage *ru; int st; svr4_siginfo_t *s; { svr4_siginfo_t i; int sig; memset(&i, 0, sizeof(i)); i.svr4_si_signo = SVR4_SIGCHLD; i.svr4_si_errno = 0; /* XXX? 
*/ i.svr4_si_pid = pid; if (ru) { i.svr4_si_stime = ru->ru_stime.tv_sec; i.svr4_si_utime = ru->ru_utime.tv_sec; } if (WIFEXITED(st)) { i.svr4_si_status = WEXITSTATUS(st); i.svr4_si_code = SVR4_CLD_EXITED; } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig); if (i.svr4_si_status == SVR4_SIGCONT) i.svr4_si_code = SVR4_CLD_CONTINUED; else i.svr4_si_code = SVR4_CLD_STOPPED; } else { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig); if (WCOREDUMP(st)) i.svr4_si_code = SVR4_CLD_DUMPED; else i.svr4_si_code = SVR4_CLD_KILLED; } DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n", i.svr4_si_pid, i.svr4_si_signo, i.svr4_si_code, i.svr4_si_errno, i.svr4_si_status)); return copyout(&i, s, sizeof(i)); } int svr4_sys_waitsys(td, uap) struct thread *td; struct svr4_sys_waitsys_args *uap; { struct rusage ru; pid_t pid; int nfound, status; int error, *retval = td->td_retval; struct proc *p, *q; DPRINTF(("waitsys(%d, %d, %p, %x)\n", uap->grp, uap->id, uap->info, uap->options)); q = td->td_proc; switch (uap->grp) { case SVR4_P_PID: pid = uap->id; break; case SVR4_P_PGID: PROC_LOCK(q); pid = -q->p_pgid; PROC_UNLOCK(q); break; case SVR4_P_ALL: pid = WAIT_ANY; break; default: return EINVAL; } /* Hand off the easy cases to kern_wait(). */ if (!(uap->options & (SVR4_WNOWAIT)) && (uap->options & (SVR4_WEXITED | SVR4_WTRAPPED))) { int options; options = 0; if (uap->options & SVR4_WSTOPPED) options |= WUNTRACED; if (uap->options & SVR4_WCONTINUED) options |= WCONTINUED; if (uap->options & SVR4_WNOHANG) options |= WNOHANG; error = kern_wait(td, pid, &status, options, &ru); if (error) return (error); if (uap->options & SVR4_WNOHANG && *retval == 0) error = svr4_setinfo(*retval, NULL, 0, uap->info); else error = svr4_setinfo(*retval, &ru, status, uap->info); *retval = 0; return (error); } /* * Ok, handle the weird cases. Either WNOWAIT is set (meaning we * just want to see if there is a process to harvest, we dont' * want to actually harvest it), or WEXIT and WTRAPPED are clear * meaning we want to ignore zombies. Either way, we don't have * to handle harvesting zombies here. We do have to duplicate the * other portions of kern_wait() though, especially for the * WCONTINUED and WSTOPPED. */ loop: nfound = 0; sx_slock(&proctree_lock); LIST_FOREACH(p, &q->p_children, p_sibling) { PROC_LOCK(p); if (pid != WAIT_ANY && p->p_pid != pid && p->p_pgid != -pid) { PROC_UNLOCK(p); DPRINTF(("pid %d pgid %d != %d\n", p->p_pid, p->p_pgid, pid)); continue; } if (p_canwait(td, p)) { PROC_UNLOCK(p); continue; } nfound++; /* * See if we have a zombie. If so, WNOWAIT should be set, * as otherwise we should have called kern_wait() up above. */ if ((p->p_state == PRS_ZOMBIE) && ((uap->options & (SVR4_WEXITED|SVR4_WTRAPPED)))) { KASSERT(uap->options & SVR4_WNOWAIT, ("WNOWAIT is clear")); /* Found a zombie, so cache info in local variables. */ pid = p->p_pid; status = p->p_xstat; ru = *p->p_ru; calcru(p, &ru.ru_utime, &ru.ru_stime); PROC_UNLOCK(p); sx_sunlock(&proctree_lock); /* Copy the info out to userland. */ *retval = 0; DPRINTF(("found %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } /* * See if we have a stopped or continued process. * XXX: This duplicates the same code in kern_wait(). 
*/ mtx_lock_spin(&sched_lock); if ((p->p_flag & P_STOPPED_SIG) && (p->p_suspcount == p->p_numthreads) && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) { mtx_unlock_spin(&sched_lock); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag |= P_WAITED; sx_sunlock(&proctree_lock); pid = p->p_pid; status = W_STOPCODE(p->p_xstat); ru = *p->p_ru; calcru(p, &ru.ru_utime, &ru.ru_stime); PROC_UNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } *retval = 0; DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } mtx_unlock_spin(&sched_lock); if (uap->options & SVR4_WCONTINUED && (p->p_flag & P_CONTINUED)) { sx_sunlock(&proctree_lock); if (((uap->options & SVR4_WNOWAIT)) == 0) p->p_flag &= ~P_CONTINUED; pid = p->p_pid; ru = *p->p_ru; status = SIGCONT; calcru(p, &ru.ru_utime, &ru.ru_stime); PROC_UNLOCK(p); if (((uap->options & SVR4_WNOWAIT)) == 0) { PROC_LOCK(q); sigqueue_take(p->p_ksi); PROC_UNLOCK(q); } *retval = 0; DPRINTF(("jobcontrol %d\n", pid)); return (svr4_setinfo(pid, &ru, status, uap->info)); } PROC_UNLOCK(p); } if (nfound == 0) { sx_sunlock(&proctree_lock); return (ECHILD); } if (uap->options & SVR4_WNOHANG) { sx_sunlock(&proctree_lock); *retval = 0; return (svr4_setinfo(0, NULL, 0, uap->info)); } PROC_LOCK(q); sx_sunlock(&proctree_lock); if (q->p_flag & P_STATCHILD) { q->p_flag &= ~P_STATCHILD; error = 0; } else error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "svr4_wait", 0); PROC_UNLOCK(q); if (error) return error; goto loop; } static void bsd_statfs_to_svr4_statvfs(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } static void bsd_statfs_to_svr4_statvfs64(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs64 *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } int svr4_sys_statvfs(td, uap) struct thread *td; struct svr4_sys_statvfs_args *uap; { struct svr4_statvfs sfs; struct statfs bfs; char *path; int error; CHECKALTEXIST(td, uap->path, &path); error = kern_statfs(td, path, UIO_SYSSPACE, &bfs); free(path, M_TEMP); if (error) return (error); bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, uap->fs, sizeof(sfs)); } int svr4_sys_fstatvfs(td, uap) struct thread *td; struct svr4_sys_fstatvfs_args *uap; { struct 
svr4_statvfs sfs;
	struct statfs bfs;
	int error;

	error = kern_fstatfs(td, uap->fd, &bfs);
	if (error)
		return (error);
	bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
	return copyout(&sfs, uap->fs, sizeof(sfs));
}

int
svr4_sys_statvfs64(td, uap)
	struct thread *td;
	struct svr4_sys_statvfs64_args *uap;
{
	struct svr4_statvfs64 sfs;
	struct statfs bfs;
	char *path;
	int error;

	CHECKALTEXIST(td, uap->path, &path);
	error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
	free(path, M_TEMP);
	if (error)
		return (error);
	bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
	return copyout(&sfs, uap->fs, sizeof(sfs));
}

int
svr4_sys_fstatvfs64(td, uap)
	struct thread *td;
	struct svr4_sys_fstatvfs64_args *uap;
{
	struct svr4_statvfs64 sfs;
	struct statfs bfs;
	int error;

	error = kern_fstatfs(td, uap->fd, &bfs);
	if (error)
		return (error);
	bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
	return copyout(&sfs, uap->fs, sizeof(sfs));
}

int
svr4_sys_alarm(td, uap)
	struct thread *td;
	struct svr4_sys_alarm_args *uap;
{
	struct itimerval itv, oitv;
	int error;

	timevalclear(&itv.it_interval);
	itv.it_value.tv_sec = uap->sec;
	itv.it_value.tv_usec = 0;
	error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
	if (error)
		return (error);
	if (oitv.it_value.tv_usec != 0)
		oitv.it_value.tv_sec++;
	td->td_retval[0] = oitv.it_value.tv_sec;
	return (0);
}

int
svr4_sys_gettimeofday(td, uap)
	struct thread *td;
	struct svr4_sys_gettimeofday_args *uap;
{
	if (uap->tp) {
		struct timeval atv;

		microtime(&atv);
		return copyout(&atv, uap->tp, sizeof (atv));
	}

	return 0;
}

int
svr4_sys_facl(td, uap)
	struct thread *td;
	struct svr4_sys_facl_args *uap;
{
	int *retval;

	retval = td->td_retval;
	*retval = 0;

	switch (uap->cmd) {
	case SVR4_SYS_SETACL:
		/* We don't support acls on any filesystem */
		return ENOSYS;

	case SVR4_SYS_GETACL:
		return copyout(retval, &uap->num, sizeof(uap->num));

	case SVR4_SYS_GETACLCNT:
		return 0;

	default:
		return EINVAL;
	}
}

int
svr4_sys_acl(td, uap)
	struct thread *td;
	struct svr4_sys_acl_args *uap;
{
	/* XXX: for now the same */
	return svr4_sys_facl(td, (struct svr4_sys_facl_args *)uap);
}

int
svr4_sys_auditsys(td, uap)
	struct thread *td;
	struct svr4_sys_auditsys_args *uap;
{
	/*
	 * XXX: Big brother is *not* watching.
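	 * The audit interface is emulated as a stub that simply reports
	 * success without recording anything.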
*/ return 0; } int svr4_sys_memcntl(td, uap) struct thread *td; struct svr4_sys_memcntl_args *uap; { switch (uap->cmd) { case SVR4_MC_SYNC: { struct msync_args msa; msa.addr = uap->addr; msa.len = uap->len; msa.flags = (int)uap->arg; return msync(td, &msa); } case SVR4_MC_ADVISE: { struct madvise_args maa; maa.addr = uap->addr; maa.len = uap->len; maa.behav = (int)uap->arg; return madvise(td, &maa); } case SVR4_MC_LOCK: case SVR4_MC_UNLOCK: case SVR4_MC_LOCKAS: case SVR4_MC_UNLOCKAS: return EOPNOTSUPP; default: return ENOSYS; } } int svr4_sys_nice(td, uap) struct thread *td; struct svr4_sys_nice_args *uap; { struct setpriority_args ap; int error; ap.which = PRIO_PROCESS; ap.who = 0; ap.prio = uap->prio; if ((error = setpriority(td, &ap)) != 0) return error; /* the cast is stupid, but the structures are the same */ if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0) return error; return 0; } int svr4_sys_resolvepath(td, uap) struct thread *td; struct svr4_sys_resolvepath_args *uap; { struct nameidata nd; int error, *retval = td->td_retval; unsigned int ncopy; int vfslocked; NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE, uap->path, td); if ((error = namei(&nd)) != 0) return error; vfslocked = NDHASGIANT(&nd); ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1); if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0) goto bad; *retval = ncopy; bad: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); VFS_UNLOCK_GIANT(vfslocked); return error; } Index: head/sys/fs/smbfs/smbfs_io.c =================================================================== --- head/sys/fs/smbfs/smbfs_io.c (revision 169666) +++ head/sys/fs/smbfs/smbfs_io.c (revision 169667) @@ -1,711 +1,711 @@ /*- * Copyright (c) 2000-2001, Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Boris Popov. * 4. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $FreeBSD$ * */ #include #include #include /* defines plimit structure in proc struct */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* #include */ #include #include #include #include #include #include /*#define SMBFS_RWGENERIC*/ extern int smbfs_pbuf_freecnt; static int smbfs_fastlookup = 1; SYSCTL_DECL(_vfs_smbfs); SYSCTL_INT(_vfs_smbfs, OID_AUTO, fastlookup, CTLFLAG_RW, &smbfs_fastlookup, 0, ""); #define DE_SIZE (sizeof(struct dirent)) static int smbfs_readvdir(struct vnode *vp, struct uio *uio, struct ucred *cred) { struct dirent de; struct componentname cn; struct smb_cred scred; struct smbfs_fctx *ctx; struct vnode *newvp; struct smbnode *np = VTOSMB(vp); int error/*, *eofflag = ap->a_eofflag*/; long offset, limit; np = VTOSMB(vp); SMBVDEBUG("dirname='%s'\n", np->n_name); smb_makescred(&scred, uio->uio_td, cred); offset = uio->uio_offset / DE_SIZE; /* offset in the directory */ limit = uio->uio_resid / DE_SIZE; if (uio->uio_resid < DE_SIZE || uio->uio_offset < 0) return EINVAL; while (limit && offset < 2) { limit--; bzero((caddr_t)&de, DE_SIZE); de.d_reclen = DE_SIZE; de.d_fileno = (offset == 0) ? np->n_ino : (np->n_parent ? VTOSMB(np->n_parent)->n_ino : 2); if (de.d_fileno == 0) de.d_fileno = 0x7ffffffd + offset; de.d_namlen = offset + 1; de.d_name[0] = '.'; de.d_name[1] = '.'; de.d_name[offset + 1] = '\0'; de.d_type = DT_DIR; error = uiomove(&de, DE_SIZE, uio); if (error) return error; offset++; uio->uio_offset += DE_SIZE; } if (limit == 0) return 0; if (offset != np->n_dirofs || np->n_dirseq == NULL) { SMBVDEBUG("Reopening search %ld:%ld\n", offset, np->n_dirofs); if (np->n_dirseq) { smbfs_findclose(np->n_dirseq, &scred); np->n_dirseq = NULL; } np->n_dirofs = 2; error = smbfs_findopen(np, "*", 1, SMB_FA_SYSTEM | SMB_FA_HIDDEN | SMB_FA_DIR, &scred, &ctx); if (error) { SMBVDEBUG("can not open search, error = %d", error); return error; } np->n_dirseq = ctx; } else ctx = np->n_dirseq; while (np->n_dirofs < offset) { error = smbfs_findnext(ctx, offset - np->n_dirofs++, &scred); if (error) { smbfs_findclose(np->n_dirseq, &scred); np->n_dirseq = NULL; return error == ENOENT ? 0 : error; } } error = 0; for (; limit; limit--, offset++) { error = smbfs_findnext(ctx, limit, &scred); if (error) break; np->n_dirofs++; bzero((caddr_t)&de, DE_SIZE); de.d_reclen = DE_SIZE; de.d_fileno = ctx->f_attr.fa_ino; de.d_type = (ctx->f_attr.fa_attr & SMB_FA_DIR) ? 
DT_DIR : DT_REG; de.d_namlen = ctx->f_nmlen; bcopy(ctx->f_name, de.d_name, de.d_namlen); de.d_name[de.d_namlen] = '\0'; if (smbfs_fastlookup) { error = smbfs_nget(vp->v_mount, vp, ctx->f_name, ctx->f_nmlen, &ctx->f_attr, &newvp); if (!error) { cn.cn_nameptr = de.d_name; cn.cn_namelen = de.d_namlen; cache_enter(vp, newvp, &cn); vput(newvp); } } error = uiomove(&de, DE_SIZE, uio); if (error) break; } if (error == ENOENT) error = 0; uio->uio_offset = offset * DE_SIZE; return error; } int smbfs_readvnode(struct vnode *vp, struct uio *uiop, struct ucred *cred) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); struct thread *td; struct vattr vattr; struct smb_cred scred; int error, lks; /* * Protect against method which is not supported for now */ if (uiop->uio_segflg == UIO_NOCOPY) return EOPNOTSUPP; if (vp->v_type != VREG && vp->v_type != VDIR) { SMBFSERR("vn types other than VREG or VDIR are unsupported !\n"); return EIO; } if (uiop->uio_resid == 0) return 0; if (uiop->uio_offset < 0) return EINVAL; /* if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize) return EFBIG;*/ td = uiop->uio_td; if (vp->v_type == VDIR) { lks = LK_EXCLUSIVE;/*lockstatus(vp->v_vnlock, td);*/ if (lks == LK_SHARED) vn_lock(vp, LK_UPGRADE | LK_RETRY, td); error = smbfs_readvdir(vp, uiop, cred); if (lks == LK_SHARED) vn_lock(vp, LK_DOWNGRADE | LK_RETRY, td); return error; } /* biosize = SSTOCN(smp->sm_share)->sc_txmax;*/ if (np->n_flag & NMODIFIED) { smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } else { error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return error; if (np->n_mtime.tv_sec != vattr.va_mtime.tv_sec) { error = smbfs_vinvalbuf(vp, td); if (error) return error; np->n_mtime.tv_sec = vattr.va_mtime.tv_sec; } } smb_makescred(&scred, td, cred); return smb_read(smp->sm_share, np->n_fid, uiop, &scred); } int smbfs_writevnode(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag) { struct smbmount *smp = VTOSMBFS(vp); struct smbnode *np = VTOSMB(vp); struct smb_cred scred; struct proc *p; struct thread *td; int error = 0; if (vp->v_type != VREG) { SMBERROR("vn types other than VREG unsupported !\n"); return EIO; } SMBVDEBUG("ofs=%d,resid=%d\n",(int)uiop->uio_offset, uiop->uio_resid); if (uiop->uio_offset < 0) return EINVAL; /* if (uiop->uio_offset + uiop->uio_resid > smp->nm_maxfilesize) return (EFBIG);*/ td = uiop->uio_td; p = td->td_proc; if (ioflag & (IO_APPEND | IO_SYNC)) { if (np->n_flag & NMODIFIED) { smbfs_attr_cacheremove(vp); error = smbfs_vinvalbuf(vp, td); if (error) return error; } if (ioflag & IO_APPEND) { #ifdef notyet /* * File size can be changed by another client */ smbfs_attr_cacheremove(vp); error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); #endif uiop->uio_offset = np->n_size; } } if (uiop->uio_resid == 0) return 0; if (p != NULL) { PROC_LOCK(p); if (uiop->uio_offset + uiop->uio_resid > lim_cur(p, RLIMIT_FSIZE)) { psignal(p, SIGXFSZ); PROC_UNLOCK(p); return EFBIG; } PROC_UNLOCK(p); } smb_makescred(&scred, td, cred); error = smb_write(smp->sm_share, np->n_fid, uiop, &scred); SMBVDEBUG("after: ofs=%d,resid=%d\n",(int)uiop->uio_offset, uiop->uio_resid); if (!error) { if (uiop->uio_offset > np->n_size) { np->n_size = uiop->uio_offset; vnode_pager_setsize(vp, np->n_size); } } return error; } /* * Do an I/O operation to/from a cache block. 
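 * The buffer's data area is wrapped in a struct uio and handed to
 * smb_read()/smb_write().  A short read is zero-filled out to the end of
 * the buffer; a write covers only the dirty range, and an interrupted
 * write leaves the buffer dirty so the data can be pushed again later.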
*/ int smbfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) { struct smbmount *smp = VFSTOSMBFS(vp->v_mount); struct smbnode *np = VTOSMB(vp); struct uio uio, *uiop = &uio; struct iovec io; struct smb_cred scred; int error = 0; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = td; smb_makescred(&scred, td, cr); if (bp->b_iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; error = smb_read(smp->sm_share, np->n_fid, uiop, &scred); if (error) break; if (uiop->uio_resid) { int left = uiop->uio_resid; int nread = bp->b_bcount - left; if (left > 0) bzero((char *)bp->b_data + nread, left); } break; default: printf("smbfs_doio: type %x unexpected\n",vp->v_type); break; }; if (error) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; } } else { /* write */ if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size) bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; error = smb_write(smp->sm_share, np->n_fid, uiop, &scred); /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. */ if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; splx(s); } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return 0; } } bp->b_resid = uiop->uio_resid; bufdone(bp); return error; } /* * Vnode op for VM getpages. * Wish wish .... 
get rid from multiple IO routines */ int smbfs_getpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_reqpage; vm_ooffset_t a_offset; } */ *ap; { #ifdef SMBFS_RWGENERIC return vop_stdgetpages(ap); #else int i, error, nextoff, size, toff, npages, count, reqpage; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; struct smbmount *smp; struct smbnode *np; struct smb_cred scred; vm_object_t object; vm_page_t *pages, m; vp = ap->a_vp; if ((object = vp->v_object) == NULL) { printf("smbfs_getpages: called with non-merged cache vnode??\n"); return VM_PAGER_ERROR; } td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ np = VTOSMB(vp); smp = VFSTOSMBFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; npages = btoc(count); reqpage = ap->a_reqpage; /* * If the requested page is partially valid, just return it and * allow the pager to zero-out the blanks. Partially valid pages * can only occur at the file EOF. */ m = pages[reqpage]; VM_OBJECT_LOCK(object); if (m->valid != 0) { /* handled by vm_fault now */ /* vm_page_zero_invalid(m, TRUE); */ vm_page_lock_queues(); for (i = 0; i < npages; ++i) { if (i != reqpage) vm_page_free(pages[i]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return 0; } VM_OBJECT_UNLOCK(object); smb_makescred(&scred, td, cred); bp = getpbuf(&smbfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - cnt.v_vnodein++; - cnt.v_vnodepgsin += npages; + VMCNT_ADD(vnodein, 1); + VMCNT_ADD(vnodepgsin, npages); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = smb_read(smp->sm_share, np->n_fid, &uio, &scred); pmap_qremove(kva, npages); relpbuf(bp, &smbfs_pbuf_freecnt); VM_OBJECT_LOCK(object); if (error && (uio.uio_resid == count)) { printf("smbfs_getpages: error %d\n",error); vm_page_lock_queues(); for (i = 0; i < npages; i++) { if (reqpage != i) vm_page_free(pages[i]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return VM_PAGER_ERROR; } size = count - uio.uio_resid; vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_validclean(m, 0, size - toff); /* handled by vm_fault now */ /* vm_page_zero_invalid(m, TRUE); */ } else { /* * Read operation was short. If no error occured * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } if (i != reqpage) { /* * Whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere (it already is in the object). Result: * It appears that emperical results show that * deactivating pages is best. */ /* * Just in case someone was asking for this page we * now tell them that it is ok to use. */ if (!error) { if (m->oflags & VPO_WANTED) vm_page_activate(m); else vm_page_deactivate(m); vm_page_wakeup(m); } else { vm_page_free(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return 0; #endif /* SMBFS_RWGENERIC */ } /* * Vnode op for VM putpages. 
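 * The pages are mapped into a pbuf and pushed with one synchronous
 * smb_write(); on success the pages that were written are undirtied and
 * their rtvals entries set to VM_PAGER_OK.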
* possible bug: all IO done in sync mode * Note that vop_close always invalidate pages before close, so it's * not necessary to open vnode. */ int smbfs_putpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; vm_ooffset_t a_offset; } */ *ap; { int error; struct vnode *vp = ap->a_vp; struct thread *td; struct ucred *cred; #ifdef SMBFS_RWGENERIC td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ VOP_OPEN(vp, FWRITE, cred, td, -1); error = vop_stdputpages(ap); VOP_CLOSE(vp, FWRITE, cred, td); return error; #else struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int i, npages, count; int *rtvals; struct smbmount *smp; struct smbnode *np; struct smb_cred scred; vm_page_t *pages; td = curthread; /* XXX */ cred = td->td_ucred; /* XXX */ /* VOP_OPEN(vp, FWRITE, cred, td, -1);*/ np = VTOSMB(vp); smp = VFSTOSMBFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); for (i = 0; i < npages; i++) { rtvals[i] = VM_PAGER_AGAIN; } bp = getpbuf(&smbfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - cnt.v_vnodeout++; - cnt.v_vnodepgsout += count; + VMCNT_ADD(vnodeout, 1); + VMCNT_ADD(vnodepgsout, count); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; SMBVDEBUG("ofs=%d,resid=%d\n",(int)uio.uio_offset, uio.uio_resid); smb_makescred(&scred, td, cred); error = smb_write(smp->sm_share, np->n_fid, &uio, &scred); /* VOP_CLOSE(vp, FWRITE, cred, td);*/ SMBVDEBUG("paged write done: %d\n", error); pmap_qremove(kva, npages); relpbuf(bp, &smbfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; vm_page_lock_queues(); for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(pages[i]); } vm_page_unlock_queues(); } return rtvals[0]; #endif /* SMBFS_RWGENERIC */ } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int smbfs_vinvalbuf(struct vnode *vp, struct thread *td) { struct smbnode *np = VTOSMB(vp); int error = 0; if (vp->v_iflag & VI_DOOMED) return 0; while (np->n_flag & NFLUSHINPROG) { np->n_flag |= NFLUSHWANT; error = tsleep(&np->n_flag, PRIBIO + 2, "smfsvinv", 2 * hz); error = smb_td_intr(td); if (error == EINTR) return EINTR; } np->n_flag |= NFLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_LOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, td, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { np->n_flag &= ~NFLUSHINPROG; if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup(&np->n_flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, td, PCATCH, 0); } np->n_flag &= ~(NMODIFIED | NFLUSHINPROG); if (np->n_flag & NFLUSHWANT) { np->n_flag &= ~NFLUSHWANT; wakeup(&np->n_flag); } return (error); } Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 169666) +++ head/sys/i386/i386/machdep.c (revision 169667) @@ -1,3065 +1,3065 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. 
* * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_apic.h" #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! 
#endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #include #endif #ifdef DEV_ISA #include #endif #ifdef XBOX #include int arch_i386_is_xbox = 0; uint32_t arch_i386_xbox_memsize = 0; #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern void init386(int first); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel; u_int basemem; int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #define PHYSMAP_SIZE (2 * 16) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct pcpu __pcpu; #endif struct mtx icu_lock; struct mem_range_softc mem_range_softc; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), ptoa((uintmax_t)Maxmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - ptoa((uintmax_t)cnt.v_free_count), - ptoa((uintmax_t)cnt.v_free_count) / 1048576); + ptoa((uintmax_t)VMCNT_GET(free_count)), + ptoa((uintmax_t)VMCNT_GET(free_count)) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. 
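 * Depending on the compat options compiled in, one of three variants is
 * used: osendsig() builds a 4.3BSD-style osigframe, freebsd4_sendsig() a
 * FreeBSD 4 sigframe4, and sendsig() the current sigframe; sendsig()
 * dispatches to the older ones based on the per-signal sigacts flags.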
*/ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. 
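	 * If the copyout() fails the process has trashed its stack (or the
	 * stack is otherwise unwritable), so it is killed with SIGILL rather
	 * than being allowed to run the handler.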
*/ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. 
*/ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. 
PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; struct proc *p = td->td_proc; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. 
*/ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; PROC_LOCK(p); #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(td->td_sigmask, scp->sc_mask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct proc *p = td->td_proc; struct trapframe *regs; const struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
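	 * CS_SECURE() only checks that the selector's privilege level is
	 * SEL_UPL (user), which is enough to keep a handler from returning
	 * to a kernel code segment.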
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("freebsd4_sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; const ucontext_t *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
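	 * Once the flags and %cs pass these checks, the FPU state is
	 * restored via set_fpcontext() and the remaining registers are
	 * copied back from the mcontext.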
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { register_t reg; uint64_t tsc1, tsc2; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); if (!tsc_present) return (EOPNOTSUPP); /* If we're booting, trust the rate calibrated moments ago. */ if (cold) { *rate = tsc_freq; return (0); } #ifdef SMP /* Schedule ourselves on the indicated cpu. */ mtx_lock_spin(&sched_lock); sched_bind(curthread, cpu_id); mtx_unlock_spin(&sched_lock); #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); #ifdef SMP mtx_lock_spin(&sched_lock); sched_unbind(curthread); mtx_unlock_spin(&sched_lock); #endif /* * Calculate the difference in readings, convert to Mhz, and * subtract 0.5% of the total. Empirical testing has shown that * overhead in DELAY() works out to approximately this value. */ tsc2 -= tsc1; *rate = tsc2 * 1000 - tsc2 * 5; return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. In the SMP case we default to * off because a halted cpu will not currently pick up a new thread in the * run queue until the next timer tick. If turned on this will result in * approximately a 4.2% loss in real time performance in buildworld tests * (but improves user and sys times oddly enough), and saves approximately * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). * * XXX we need to have a cpu mask of idle cpus and generate an IPI or * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. * Then we can have our cake and eat it too. * * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ static int cpu_idle_hlt = 1; TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt); SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) { /* * we must absolutely guarentee that hlt is the * absolute next instruction after sti or we * introduce a timing window. */ __asm __volatile("sti; hlt"); } /* * Note that we have to be careful here to avoid a race between checking * sched_runnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. 
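 * The check is therefore made with interrupts disabled, and
 * cpu_idle_default() issues "sti" immediately before "hlt": an interrupt
 * that becomes pending after the check is only recognized once "hlt" has
 * been entered, so it still wakes the CPU.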
*/ void cpu_idle(void) { #ifdef SMP if (mp_grab_cpu_hlt()) return; #endif if (cpu_idle_hlt) { disable_intr(); if (sched_runnable()) enable_intr(); else (*cpu_idle_hook)(); } } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; /* * Clear registers on exec */ void exec_setregs(td, entry, stack, ps_strings) struct thread *td; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ td->td_pcb->pcb_flags &= ~FP_SOFTFP; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. 
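	 * CR0_WP additionally makes the kernel honour page-level write
	 * protection on its own accesses, and CR0_AM permits user code to
	 * enable alignment checking through EFLAGS.AC.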
*/ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUFS_SEL 2 %fs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUGS_SEL 3 %gs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 4 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 5 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE_SEL 6 Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUDATA_SEL 7 Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment 
descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 10 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GNDIS_SEL 18 NDIS Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor 
priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. 
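 *
 * [Editorial sketch, not part of the committed source: the command below
 * rebuilds each handler address from the gate descriptor's split 16-bit
 * offset halves (gd_hioffset << 16 | gd_looffset), and setidt() stores
 * them the opposite way.  This stand-alone snippet shows just that
 * split/join arithmetic, with a local struct standing in for the real
 * gate descriptor.]
 */
#include <stdio.h>
#include <stdint.h>

struct gate_offset {
	uint16_t lo;			/* plays the role of gd_looffset */
	uint16_t hi;			/* plays the role of gd_hioffset */
};

static struct gate_offset
offset_split(uint32_t func)
{
	struct gate_offset g = { (uint16_t)func, (uint16_t)(func >> 16) };

	return (g);
}

static uint32_t
offset_join(struct gate_offset g)
{
	return (((uint32_t)g.hi << 16) | g.lo);
}

int
main(void)
{
	uint32_t handler = 0xc0123456;	/* made-up handler address */
	struct gate_offset g = offset_split(handler);

	printf("lo=%#06x hi=%#06x joined=%#010x\n",
	    (unsigned)g.lo, (unsigned)g.hi, (unsigned)offset_join(g));
	return (0);
}
/* End of editorial sketch; the committed source resumes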
*/ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int i, off, physmap_idx, pa_indx, da_indx; int hasbrokenint12, has_smap; u_long physmem_tunable; u_int extmem; struct vm86frame vmf; struct vm86context vmc; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t *pte; struct bios_smap *smap; quad_t dcons_addr, dcons_size; has_smap = 0; #ifdef XBOX if (arch_i386_is_xbox) { /* * We queried the memory size before, so chop off 4MB for * the framebuffer and inform the OS of this. */ physmap[0] = 0; physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE; physmap_idx = 0; goto physmap_done; } #endif hasbrokenint12 = 0; TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12); bzero(&vmf, sizeof(vmf)); bzero(physmap, sizeof(physmap)); basemem = 0; /* * Some newer BIOSes has broken INT 12H implementation which cause * kernel panic immediately. In this case, we need to scan SMAP * with INT 15:E820 first, then determine base memory size. */ if (hasbrokenint12) { goto int15e820; } /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * Map pages between basemem and ISA_HOLE_START, if any, r/w into * the vm86 page table so that vm86 can scribble on them using * the vm86 map too. XXX: why 2 ways for this and only 1 way for * page 0, at least as initialized here? 
*/ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; int15e820: /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT); /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%016llx len=%016llx\n", smap->type, smap->base, smap->length); has_smap = 1; if (smap->type != 0x01) continue; if (smap->length == 0) continue; #ifndef PAE if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); continue; } #endif for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-monotonic memory region, ignoring second region\n"); continue; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; continue; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; } while (vmf.vmf_ebx != 0); /* * Perform "base memory" related probes & setup based on SMAP */ if (basemem == 0) { for (i = 0; i <= physmap_idx; i += 2) { if (physmap[i] == 0x00000000) { basemem = physmap[i + 1] / 1024; break; } } /* * XXX this function is horribly organized and has to the same * things that it does above here. */ if (basemem == 0) basemem = 640; if (basemem > 640) { printf( "Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * Let vm86 scribble on pages between basemem and * ISA_HOLE_START, as above. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; } if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. 
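 *
 * [Editorial sketch, not part of the committed source: the loop above
 * folds (base, length) records from INT 15h/E820 into base/end pairs,
 * coalescing a record that starts exactly where the previous entry
 * ends.  The stand-alone program below reproduces only that coalescing
 * step; the BIOS call, overlap checks and PAE handling are omitted and
 * the region values fed to it are invented for illustration.]
 */
#include <stdio.h>
#include <stdint.h>

#define	PHYSMAP_SIZE	(2 * 30)

static int
physmap_add(uint64_t physmap[], int idx, uint64_t base, uint64_t len)
{
	if (base == physmap[idx + 1]) {		/* contiguous: extend */
		physmap[idx + 1] += len;
		return (idx);
	}
	idx += 2;
	if (idx == PHYSMAP_SIZE)		/* table full: give up */
		return (idx - 2);
	physmap[idx] = base;
	physmap[idx + 1] = base + len;
	return (idx);
}

int
main(void)
{
	uint64_t physmap[PHYSMAP_SIZE] = { 0 };
	int i, idx = 0;

	idx = physmap_add(physmap, idx, 0x00000000, 0x0009fc00); /* base mem */
	idx = physmap_add(physmap, idx, 0x00100000, 0x3ff00000); /* extended */
	idx = physmap_add(physmap, idx, 0x40000000, 0x20000000); /* contiguous */

	for (i = 0; i <= idx; i += 2)
		printf("0x%016jx - 0x%016jx\n", (uintmax_t)physmap[i],
		    (uintmax_t)physmap[i + 1] - 1);
	return (0);
}
/* End of editorial sketch; the committed source resumes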
*/ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); /* * If we have an SMAP, don't allow MAXMEM or hw.physmem to extend * the amount of memory in the system. */ if (has_smap && Maxmem > atop(physmap[physmap_idx + 1])) Maxmem = atop(physmap[physmap_idx + 1]); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. 
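 *
 * [Editorial sketch, not part of the committed source: the scan above
 * proves each candidate page with four write/read-back patterns before
 * adding it to phys_avail[].  The user-space program below runs the same
 * four patterns over an ordinary heap buffer, which is all that can be
 * shown without mapping physical pages.]
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int
word_ok(volatile uint32_t *p)
{
	static const uint32_t patterns[] =
	    { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x00000000 };
	uint32_t saved;
	int i, ok;

	saved = *p;				/* remember original value */
	ok = 1;
	for (i = 0; i < 4; i++) {
		*p = patterns[i];
		if (*p != patterns[i])
			ok = 0;			/* "page_bad = TRUE" */
	}
	*p = saved;				/* restore original value */
	return (ok);
}

int
main(void)
{
	uint32_t *buf;
	size_t i, nwords, bad;

	nwords = 4096 / sizeof(uint32_t);
	if ((buf = calloc(nwords, sizeof(uint32_t))) == NULL)
		return (1);
	bad = 0;
	for (i = 0; i < nwords; i++)
		if (!word_ok(&buf[i]))
			bad++;
	printf("%zu of %zu words failed the pattern test\n", bad, nwords);
	free(buf);
	return (0);
}
/* End of editorial sketch; the committed source resumes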
*/ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x; struct pcpu *pc; thread0.td_kstack = proc0kstack; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup(&proc0, &thread0); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (envmode == 1) kern_envp = static_env; else if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); #ifdef SMP pc = &SMP_prvspace[0].pcpu; #else pc = &__pcpu; #endif gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); PCPU_SET(curtid, thread0.td_tid); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. 
*/ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); #ifdef XBOX /* * The following code queries the PCI ID of 0:0:0. For the XBOX, * This should be 0x10de / 0x02a5. * * This is exactly what Linux does. */ outl(0xcf8, 0x80000000); if (inl(0xcfc) == 0x02a510de) { arch_i386_is_xbox = 1; pic16l_setled(XBOX_LED_GREEN); /* * We are an XBOX, but we may have either 64MB or 128MB of * memory. The PCI host bridge should be programmed for this, * so we just query it. */ outl(0xcf8, 0x80000084); arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64; } #endif /* XBOX */ /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. 
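 *
 * [Editorial sketch, not part of the committed source: the XBOX probe
 * above uses PCI configuration mechanism #1 through ports 0xcf8/0xcfc.
 * The snippet below only shows how such a config address is composed
 * and how register 0 decodes into vendor/device IDs; it performs no
 * port I/O.]
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t
pci_cfg_addr(unsigned bus, unsigned dev, unsigned func, unsigned reg)
{
	return (0x80000000u | (bus << 16) | (dev << 11) | (func << 8) |
	    (reg & 0xfc));
}

int
main(void)
{
	uint32_t id = 0x02a510de;	/* dword the XBOX probe looks for */

	printf("config address for 0:0:0 register 0: %#010x\n",
	    (unsigned)pci_cfg_addr(0, 0, 0, 0));
	printf("vendor %#06x device %#06x\n",
	    (unsigned)(id & 0xffff), (unsigned)(id >> 16));
	return (0);
}
/* End of editorial sketch; the committed source resumes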
*/ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA elcr_probe(); atpic_startup(); #endif #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); #ifdef PAE dblfault_tss.tss_cr3 = (int)IdlePDPT; #else dblfault_tss.tss_cr3 = (int)IdlePTD; #endif dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ /* XXX yes! 
*/ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; #ifdef PAE thread0.td_pcb->pcb_cr3 = (int)IdlePDPT; #else thread0.td_pcb->pcb_cr3 = (int)IdlePTD; #endif thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = 0xffffffff; } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_flags = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL) static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? 
tf->tf_esp : (int)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; bzero(sv_87, sizeof(*sv_87)); /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 *sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &td->td_pcb->pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs); return (0); } /* * Get machine context. 
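 *
 * [Editorial sketch, not part of the committed source: set_regs() above
 * rejects a register set whose %eflags change bits outside the
 * user-changeable mask.  The check below has the same shape; the mask
 * used here is a simplified stand-in for illustration only and is not
 * the kernel's actual PSL_USERCHANGE value.]
 */
#include <stdio.h>
#include <stdint.h>

#define	PSL_USERCHANGE	0x00000dd5	/* illustration only */

static int
eflags_secure(uint32_t new_efl, uint32_t old_efl)
{
	return (((new_efl ^ old_efl) & ~PSL_USERCHANGE) == 0);
}

int
main(void)
{
	uint32_t old_efl = 0x00000202;	/* IF set; typical user value */

	printf("toggle carry flag: %s\n",
	    eflags_secure(old_efl | 0x1, old_efl) ? "allowed" : "rejected");
	printf("clear interrupt flag: %s\n",
	    eflags_secure(old_efl & ~0x200u, old_efl) ? "allowed" : "rejected");
	return (0);
}
/* End of editorial sketch; the committed source resumes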
*/ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if ((ret = set_fpcontext(td, mcp)) == 0) { tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; ret = 0; } return (ret); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifndef DEV_NPX mcp->mc_fpformat = _MC_FPFMT_NODEV; mcp->mc_ownedfp = _MC_FPOWNED_NONE; #else union savefpu *addr; /* * XXX mc_fpstate might be misaligned, since its declaration is not * unportabilized using __attribute__((aligned(16))) like the * declaration of struct savemm, and anyway, alignment doesn't work * for auto variables since we don't use gcc's pessimal stack * alignment. Work around this by abusing the spare fields after * mcp->mc_fpstate. * * XXX unpessimize most cases by only aligning when fxsave might be * called, although this requires knowing too much about * npxgetregs()'s internals. */ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); } mcp->mc_ownedfp = npxgetregs(td, addr); if (addr != (union savefpu *)&mcp->mc_fpstate) { bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2)); } mcp->mc_fpformat = npxformat(); #endif } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { union savefpu *addr; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { /* XXX align as above. 
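 *
 * [Editorial sketch, not part of the committed source: the workaround
 * referenced above walks a pointer forward in 4-byte steps until it is
 * 16-byte aligned.  The stand-alone snippet below shows only that
 * stepping, against a local buffer rather than mc_fpstate.]
 */
#include <stdio.h>
#include <stdint.h>

static void *
align16(void *p)
{
	char *cp = p;

	while ((uintptr_t)(void *)cp & 0xF)
		cp += 4;
	return (cp);
}

int
main(void)
{
	uint32_t buf[16];
	int off;

	for (off = 0; off < 16; off += 4) {
		void *p = (char *)buf + off;

		printf("%p -> %p\n", p, align16(p));
	}
	return (0);
}
/* End of editorial sketch; the committed source resumes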
*/ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate)); } #ifdef DEV_NPX #ifdef CPU_ENABLE_SSE if (cpu_fxsr) addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; #endif /* * XXX we violate the dubious requirement that npxsetregs() * be called with interrupts disabled. */ npxsetregs(td, addr); #endif /* * Don't bother putting things back where they were in the * misaligned case, since we know that the caller won't use * them again. */ } else return (EINVAL); return (0); } static void fpstate_drop(struct thread *td) { register_t s; s = intr_disable(); #ifdef DEV_NPX if (PCPU_GET(fpcurthread) == td) npxdrop(); #endif /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; intr_restore(s); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[4] = rdr4(); dbregs->dr[5] = rdr5(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr4(dbregs->dr[4]); load_dr5(dbregs->dr[5]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? 
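 *
 * [Editorial sketch, not part of the committed source: set_dbregs()
 * above rejects the reserved %dr7 access-type and length encodings and
 * then range-checks the enabled breakpoint addresses.  The snippet
 * extracts the same fields using locally defined macros that follow the
 * architectural %dr7 layout; they are not the kernel's DBREG_DR7_*
 * definitions.]
 */
#include <stdio.h>
#include <stdint.h>

#define	DR7_ENABLED(d, i)	(((d) >> ((i) * 2)) & 0x3)
#define	DR7_ACCESS(d, i)	(((d) >> (16 + (i) * 4)) & 0x3)
#define	DR7_LEN(d, i)		(((d) >> (18 + (i) * 4)) & 0x3)

static int
dr7_valid(uint32_t dr7)
{
	int i;

	for (i = 0; i < 4; i++) {
		if (DR7_ACCESS(dr7, i) == 0x02)	/* reserved access type */
			return (0);
		if (DR7_LEN(dr7, i) == 0x02)	/* undefined length encoding */
			return (0);
	}
	return (1);
}

int
main(void)
{
	uint32_t dr7 = 0x000d0001;	/* bp0: local enable, write, 4 bytes */

	printf("bp0 enabled: %d, dr7 valid: %d\n",
	    DR7_ENABLED(dr7, 0) != 0, dr7_valid(dr7));
	return (0);
}
/* End of editorial sketch; the committed source resumes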
*/ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifndef DEV_APIC #include /* * Provide stub functions so that the MADT APIC enumerator in the acpi * kernel module will link against a kernel without 'device apic'. * * XXX - This is a gross hack. */ void apic_register_enumerator(struct apic_enumerator *enumerator) { } void * ioapic_create(vm_paddr_t addr, int32_t apic_id, int intbase) { return (NULL); } int ioapic_disable_pin(void *cookie, u_int pin) { return (ENXIO); } int ioapic_get_vector(void *cookie, u_int pin) { return (-1); } void ioapic_register(void *cookie) { } int ioapic_remap_vector(void *cookie, u_int pin, int vector) { return (ENXIO); } int ioapic_set_extint(void *cookie, u_int pin) { return (ENXIO); } int ioapic_set_nmi(void *cookie, u_int pin) { return (ENXIO); } int ioapic_set_polarity(void *cookie, u_int pin, enum intr_polarity pol) { return (ENXIO); } int ioapic_set_triggermode(void *cookie, u_int pin, enum intr_trigger trigger) { return (ENXIO); } void lapic_create(u_int apic_id, int boot_cpu) { } void lapic_init(vm_paddr_t addr) { } int lapic_set_lvt_mode(u_int apic_id, u_int lvt, u_int32_t mode) { return (ENXIO); } int lapic_set_lvt_polarity(u_int apic_id, u_int lvt, enum intr_polarity pol) { return (ENXIO); } int lapic_set_lvt_triggermode(u_int apic_id, u_int lvt, enum intr_trigger trigger) { return (ENXIO); } #endif #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called from the debugger. * * The actual code is stolen from , and de-inlined. 
*/ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* KDB */ Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 169666) +++ head/sys/i386/i386/pmap.c (revision 169667) @@ -1,3675 +1,3675 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Alan L. Cox * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 */ /*- * Copyright (c) 2003 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Jake Burkholder, * Safeport Network Services, and Network Associates Laboratories, the * Security Research Division of Network Associates, Inc. 
under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA * CHATS research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. 
*/ #include "opt_cpu.h" #include "opt_pmap.h" #include "opt_msgbuf.h" #include "opt_smp.h" #include "opt_xbox.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #ifdef XBOX #include #endif #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if defined(DIAGNOSTIC) #define PMAP_DIAGNOSTIC #endif #if !defined(PMAP_DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define PV_STATS #ifdef PV_STATS #define PV_STAT(x) do { x ; } while (0) #else #define PV_STAT(x) do { } while (0) #endif /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT])) #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT]) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_A) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v) ? atomic_set_int((u_int *)(pte), PG_W) : \ atomic_clear_int((u_int *)(pte), PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) struct pmap kernel_pmap_store; LIST_HEAD(pmaplist, pmap); static struct pmaplist allpmaps; static struct mtx allpmaps_lock; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ int pgeflag = 0; /* PG_G or-in */ int pseflag = 0; /* PG_PS or-in */ static int nkpt; vm_offset_t kernel_vm_end; extern u_int32_t KERNend; #ifdef PAE pt_entry_t pg_nx; static uma_zone_t pdptzone; #endif /* * Data for the pv entry allocation mechanism */ static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; static int shpgperproc = PMAP_SHPGPERPROC; struct pv_chunk *pv_chunkbase; /* KVA block for pv_chunks */ int pv_maxchunks; /* How many chunks we have KVA for */ vm_offset_t pv_vafree; /* freelist stored in the PTE */ /* * All those kernel PT submaps that BSD is so fond of */ struct sysmaps { struct mtx lock; pt_entry_t *CMAP1; pt_entry_t *CMAP2; caddr_t CADDR1; caddr_t CADDR2; }; static struct sysmaps sysmaps_pcpu[MAXCPU]; pt_entry_t *CMAP1 = 0; static pt_entry_t *CMAP3; caddr_t CADDR1 = 0, ptvmmap = 0; static caddr_t CADDR3; struct msgbuf *msgbufp = 0; /* * Crashdump maps. 
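 *
 * [Editorial sketch, not part of the committed source: the pmap_pde()
 * and pdir_pde() macros above index the page directory with the top 10
 * bits of a virtual address.  The stand-alone snippet below shows the
 * full non-PAE i386 split into directory index, table index and page
 * offset, with the shift constants redefined locally.]
 */
#include <stdio.h>
#include <stdint.h>

#define	PAGE_SHIFT	12
#define	PDRSHIFT	22

static void
split_va(uint32_t va)
{
	unsigned pdi = va >> PDRSHIFT;
	unsigned pti = (va >> PAGE_SHIFT) & 0x3ff;
	unsigned off = va & 0xfff;

	printf("va %#010x: pde index %u, pte index %u, offset %#x\n",
	    (unsigned)va, pdi, pti, off);
}

int
main(void)
{
	split_va(0xc0123456);	/* a typical kernel address */
	split_va(0x08048000);	/* a typical user text address */
	return (0);
}
/* End of editorial sketch; the committed source resumes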
*/ static caddr_t crashdumpmap; #ifdef SMP extern pt_entry_t *SMPpt; #endif static pt_entry_t *PMAP1 = 0, *PMAP2; static pt_entry_t *PADDR1 = 0, *PADDR2; #ifdef SMP static int PMAP1cpu; static int PMAP1changedcpu; SYSCTL_INT(_debug, OID_AUTO, PMAP1changedcpu, CTLFLAG_RD, &PMAP1changedcpu, 0, "Number of times pmap_pte_quick changed CPU with same PMAP1"); #endif static int PMAP1changed; SYSCTL_INT(_debug, OID_AUTO, PMAP1changed, CTLFLAG_RD, &PMAP1changed, 0, "Number of times pmap_pte_quick changed PMAP1"); static int PMAP1unchanged; SYSCTL_INT(_debug, OID_AUTO, PMAP1unchanged, CTLFLAG_RD, &PMAP1unchanged, 0, "Number of times pmap_pte_quick didn't change PMAP1"); static struct mtx PMAP2mutex; static void free_pv_entry(pmap_t pmap, pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap, int try); static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte); static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva, vm_page_t *free); static void pmap_remove_page(struct pmap *pmap, vm_offset_t va); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags); static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags); static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free); static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va); static void pmap_pte_release(pt_entry_t *pte); static int pmap_unuse_pt(pmap_t, vm_offset_t, vm_page_t *); static vm_offset_t pmap_kmem_choose(vm_offset_t addr); #ifdef PAE static void *pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait); #endif CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t)); CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t)); /* * Move the kernel virtual free pointer to the next * 4MB. This is used to help improve performance * by using a large (4MB) page for much of the kernel * (.text, .data, .bss) */ static vm_offset_t pmap_kmem_choose(vm_offset_t addr) { vm_offset_t newaddr = addr; #ifndef DISABLE_PSE if (cpu_feature & CPUID_PSE) newaddr = (addr + PDRMASK) & ~PDRMASK; #endif return newaddr; } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(vm_paddr_t firstaddr) { vm_offset_t va; pt_entry_t *pte, *unused; struct sysmaps *sysmaps; int i; /* * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_avail = pmap_kmem_choose(virtual_avail); virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). 
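 *
 * [Editorial sketch, not part of the committed source: pmap_kmem_choose()
 * above rounds the free-KVA pointer up to the next 4 MB superpage
 * boundary with the usual power-of-two round-up.  The snippet below is
 * that arithmetic in isolation, with the constants redefined locally.]
 */
#include <stdio.h>
#include <stdint.h>

#define	NBPDR	(1u << 22)		/* bytes mapped by one PDE, non-PAE */
#define	PDRMASK	(NBPDR - 1)

static uint32_t
kmem_choose(uint32_t addr)
{
	return ((addr + PDRMASK) & ~PDRMASK);
}

int
main(void)
{
	printf("%#010x -> %#010x\n", 0xc0531000u,
	    (unsigned)kmem_choose(0xc0531000));	/* rounds up to 0xc0800000 */
	printf("%#010x -> %#010x\n", 0xc0800000u,
	    (unsigned)kmem_choose(0xc0800000));	/* already aligned: unchanged */
	return (0);
}
/* End of editorial sketch; the committed source resumes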
*/ PMAP_LOCK_INIT(kernel_pmap); kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + (u_int)IdlePTD); #ifdef PAE kernel_pmap->pm_pdpt = (pdpt_entry_t *) (KERNBASE + (u_int)IdlePDPT); #endif kernel_pmap->pm_active = -1; /* don't allow deactivation */ TAILQ_INIT(&kernel_pmap->pm_pvchunk); LIST_INIT(&allpmaps); mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_SPIN); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, kernel_pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. */ #define SYSMAP(c, p, v, n) \ v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = vtopte(va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. * CMAP3 is used for the idle process page zeroing. */ for (i = 0; i < MAXCPU; i++) { sysmaps = &sysmaps_pcpu[i]; mtx_init(&sysmaps->lock, "SYSMAPS", NULL, MTX_DEF); SYSMAP(caddr_t, sysmaps->CMAP1, sysmaps->CADDR1, 1) SYSMAP(caddr_t, sysmaps->CMAP2, sysmaps->CADDR2, 1) } SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP3, CADDR3, 1) *CMAP3 = 0; /* * Crashdump maps. */ SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS) /* * ptvmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, unused, ptvmmap, 1) /* * msgbufp is used to map the system message buffer. */ SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(MSGBUF_SIZE))) /* * ptemap is used for pmap_pte_quick */ SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1); SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1); mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF); virtual_avail = va; *CMAP1 = 0; /* * Leave in place an identity mapping (virt == phys) for the low 1 MB * physical memory region that is used by the ACPI wakeup code. This * mapping must not have PG_G set. */ #ifdef XBOX /* FIXME: This is gross, but needed for the XBOX. Since we are in such * an early stadium, we cannot yet neatly map video memory ... :-( * Better fixes are very welcome! */ if (!arch_i386_is_xbox) #endif for (i = 1; i < NKPT; i++) PTD[i] = 0; /* Initialize the PAT MSR if present. */ pmap_init_pat(); /* Turn on PG_G on kernel page(s) */ pmap_set_pg(); } /* * Setup the PAT MSR. */ void pmap_init_pat(void) { uint64_t pat_msr; /* Bail if this CPU doesn't implement PAT. */ if (!(cpu_feature & CPUID_PAT)) return; #ifdef PAT_WORKS /* * Leave the indices 0-3 at the default of WB, WT, UC, and UC-. * Program 4 and 5 as WP and WC. * Leave 6 and 7 as UC and UC-. */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~(PAT_MASK(4) | PAT_MASK(5)); pat_msr |= PAT_VALUE(4, PAT_WRITE_PROTECTED) | PAT_VALUE(5, PAT_WRITE_COMBINING); #else /* * Due to some Intel errata, we can only safely use the lower 4 * PAT entries. Thus, just replace PAT Index 2 with WC instead * of UC-. * * Intel Pentium III Processor Specification Update * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B * or Mode C Paging) * * Intel Pentium IV Processor Specification Update * Errata N46 (PAT Index MSB May Be Calculated Incorrectly) */ pat_msr = rdmsr(MSR_PAT); pat_msr &= ~PAT_MASK(2); pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING); #endif wrmsr(MSR_PAT, pat_msr); } /* * Set PG_G on kernel pages. Only the BSP calls this when SMP is turned on. 
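 *
 * [Editorial sketch, not part of the committed source: pmap_init_pat()
 * above rewrites one 8-bit entry of the 64-bit PAT MSR.  The snippet
 * below performs the same field replacement on an ordinary variable;
 * the default MSR value and memory-type encoding are taken from the
 * processor manuals and no MSR is actually read or written.]
 */
#include <stdio.h>
#include <stdint.h>

#define	PAT_MASK(i)		(0xffULL << ((i) * 8))
#define	PAT_VALUE(i, v)		((uint64_t)(v) << ((i) * 8))
#define	PAT_WRITE_COMBINING	0x01

int
main(void)
{
	uint64_t pat_msr = 0x0007040600070406ULL;	/* reset default */

	pat_msr &= ~PAT_MASK(2);			/* drop UC- at index 2 */
	pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);	/* install WC */
	printf("PAT MSR after update: %#018jx\n", (uintmax_t)pat_msr);
	return (0);
}
/* End of editorial sketch; the committed source resumes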
*/ void pmap_set_pg(void) { pd_entry_t pdir; pt_entry_t *pte; vm_offset_t va, endva; int i; if (pgeflag == 0) return; i = KERNLOAD/NBPDR; endva = KERNBASE + KERNend; if (pseflag) { va = KERNBASE + KERNLOAD; while (va < endva) { pdir = kernel_pmap->pm_pdir[KPTDI+i]; pdir |= pgeflag; kernel_pmap->pm_pdir[KPTDI+i] = PTD[KPTDI+i] = pdir; invltlb(); /* Play it safe, invltlb() every time */ i++; va += NBPDR; } } else { va = (vm_offset_t)btext; while (va < endva) { pte = vtopte(va); if (*pte) *pte |= pgeflag; invltlb(); /* Play it safe, invltlb() every time */ va += PAGE_SIZE; } } } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } #ifdef PAE static MALLOC_DEFINE(M_PMAPPDPT, "pmap", "pmap pdpt"); static void * pmap_pdpt_allocf(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { *flags = UMA_SLAB_PRIV; return (contigmalloc(PAGE_SIZE, M_PMAPPDPT, 0, 0x0ULL, 0xffffffffULL, 1, 0)); } #endif /* * ABuse the pte nodes for unmapped kva to thread a kva freelist through. * Requirements: * - Must deal with pages in order to ensure that none of the PG_* bits * are ever set, PG_V in particular. * - Assumes we can write to ptes without pte_store() atomic ops, even * on PAE systems. This should be ok. * - Assumes nothing will ever test these addresses for 0 to indicate * no mapping instead of correctly checking PG_V. * - Assumes a vm_offset_t will fit in a pte (true for i386). * Because PG_V is never set, there can be no mappings to invalidate. */ static vm_offset_t pmap_ptelist_alloc(vm_offset_t *head) { pt_entry_t *pte; vm_offset_t va; va = *head; if (va == 0) return (va); /* Out of memory */ pte = vtopte(va); *head = *pte; if (*head & PG_V) panic("pmap_ptelist_alloc: va with PG_V set!"); *pte = 0; return (va); } static void pmap_ptelist_free(vm_offset_t *head, vm_offset_t va) { pt_entry_t *pte; if (va & PG_V) panic("pmap_ptelist_free: freeing va with PG_V set!"); pte = vtopte(va); *pte = *head; /* virtual! PG_V is 0 though */ *head = va; } static void pmap_ptelist_init(vm_offset_t *head, void *base, int npages) { int i; vm_offset_t va; *head = 0; for (i = npages - 1; i >= 0; i--) { va = (vm_offset_t)base + i * PAGE_SIZE; pmap_ptelist_free(head, va); } } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. 
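 *
 * [Editorial sketch, not part of the committed source: pmap_ptelist_*()
 * above keep a free list threaded through the unused slots themselves,
 * with the head stored separately and 0 meaning empty.  The stand-alone
 * snippet below applies the same idea to a small array of ordinary
 * words instead of kernel PTE slots.]
 */
#include <stdio.h>
#include <stdint.h>

#define	NSLOTS	4

static uintptr_t
slot_alloc(uintptr_t *head)
{
	uintptr_t va = *head;

	if (va == 0)
		return (0);			/* out of slots */
	*head = *(uintptr_t *)va;		/* follow the stored link */
	*(uintptr_t *)va = 0;
	return (va);
}

static void
slot_free(uintptr_t *head, uintptr_t va)
{
	*(uintptr_t *)va = *head;		/* link slot back onto list */
	*head = va;
}

int
main(void)
{
	static uintptr_t slots[NSLOTS];
	uintptr_t head = 0, a, b;
	int i;

	for (i = NSLOTS - 1; i >= 0; i--)
		slot_free(&head, (uintptr_t)&slots[i]);
	a = slot_alloc(&head);
	b = slot_alloc(&head);
	printf("allocated %p then %p\n", (void *)a, (void *)b);
	slot_free(&head, a);
	printf("next allocation reuses %p\n", (void *)slot_alloc(&head));
	return (0);
}
/* End of editorial sketch; the committed source resumes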
*/ TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_max = shpgperproc * maxproc + VMCNT_GET(page_count); TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_max = roundup(pv_entry_max, _NPCPV); pv_entry_high_water = 9 * (pv_entry_max / 10); pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc); pv_chunkbase = (struct pv_chunk *)kmem_alloc_nofault(kernel_map, PAGE_SIZE * pv_maxchunks); if (pv_chunkbase == NULL) panic("pmap_init: not enough kvm for pv chunks"); pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks); #ifdef PAE pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL, NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1, UMA_ZONE_VM | UMA_ZONE_NOFREE); uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf); #endif } SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_max, CTLFLAG_RD, &pv_entry_max, 0, "Max number of PV entries"); SYSCTL_INT(_vm_pmap, OID_AUTO, shpgperproc, CTLFLAG_RD, &shpgperproc, 0, "Page share factor per proc"); /*************************************************** * Low level helper routines..... ***************************************************/ /* * Determine the appropriate bits to set in a PTE or PDE for a specified * caching mode. */ static int pmap_cache_bits(int mode, boolean_t is_pde) { int pat_flag, pat_index, cache_bits; /* The PAT bit is different for PTE's and PDE's. */ pat_flag = is_pde ? PG_PDE_PAT : PG_PTE_PAT; /* If we don't support PAT, map extended modes to older ones. */ if (!(cpu_feature & CPUID_PAT)) { switch (mode) { case PAT_UNCACHEABLE: case PAT_WRITE_THROUGH: case PAT_WRITE_BACK: break; case PAT_UNCACHED: case PAT_WRITE_COMBINING: case PAT_WRITE_PROTECTED: mode = PAT_UNCACHEABLE; break; } } /* Map the caching mode to a PAT index. */ switch (mode) { #ifdef PAT_WORKS case PAT_UNCACHEABLE: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_UNCACHED: pat_index = 2; break; case PAT_WRITE_COMBINING: pat_index = 5; break; case PAT_WRITE_PROTECTED: pat_index = 4; break; #else case PAT_UNCACHED: case PAT_UNCACHEABLE: case PAT_WRITE_PROTECTED: pat_index = 3; break; case PAT_WRITE_THROUGH: pat_index = 1; break; case PAT_WRITE_BACK: pat_index = 0; break; case PAT_WRITE_COMBINING: pat_index = 2; break; #endif default: panic("Unknown caching mode %d\n", mode); } /* Map the 3-bit index value into the PAT, PCD, and PWT bits. */ cache_bits = 0; if (pat_index & 0x4) cache_bits |= pat_flag; if (pat_index & 0x2) cache_bits |= PG_NC_PCD; if (pat_index & 0x1) cache_bits |= PG_NC_PWT; return (cache_bits); } #ifdef SMP /* * For SMP, these functions have to use the IPI mechanism for coherence. 
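 *
 * (Aside: pmap_cache_bits() above packs a 3-bit PAT index into the PAT, PCD
 * and PWT pte bits; the guarded helper below mirrors that packing for a
 * single index as a hand check.  The demo_* name is made up for
 * illustration.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
static int
demo_pat_index_to_pte_bits(int idx)
{
	int bits = 0;

	if (idx & 0x4)
		bits |= PG_PTE_PAT;	/* index bit 2 -> PAT */
	if (idx & 0x2)
		bits |= PG_NC_PCD;	/* index bit 1 -> cache disable */
	if (idx & 0x1)
		bits |= PG_NC_PWT;	/* index bit 0 -> write through */
	/* idx 2, the slot pmap_init_pat() reprograms to WC, yields PG_NC_PCD alone. */
	return (bits);
}
#endif
/*
 * The invalidation routines follow.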
*/ void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invlpg(va); smp_invlpg(va); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invlpg(va); if (pmap->pm_active & other_cpus) smp_masked_invlpg(pmap->pm_active & other_cpus, va); } sched_unpin(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { u_int cpumask; u_int other_cpus; vm_offset_t addr; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); smp_invlpg_range(sva, eva); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); if (pmap->pm_active & other_cpus) smp_masked_invlpg_range(pmap->pm_active & other_cpus, sva, eva); } sched_unpin(); } void pmap_invalidate_all(pmap_t pmap) { u_int cpumask; u_int other_cpus; sched_pin(); if (pmap == kernel_pmap || pmap->pm_active == all_cpus) { invltlb(); smp_invltlb(); } else { cpumask = PCPU_GET(cpumask); other_cpus = PCPU_GET(other_cpus); if (pmap->pm_active & cpumask) invltlb(); if (pmap->pm_active & other_cpus) smp_masked_invltlb(pmap->pm_active & other_cpus); } sched_unpin(); } void pmap_invalidate_cache(void) { sched_pin(); wbinvd(); smp_cache_flush(); sched_unpin(); } #else /* !SMP */ /* * Normal, non-SMP, 486+ invalidation functions. * We inline these within pmap.c for speed. */ PMAP_INLINE void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { if (pmap == kernel_pmap || pmap->pm_active) invlpg(va); } PMAP_INLINE void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t addr; if (pmap == kernel_pmap || pmap->pm_active) for (addr = sva; addr < eva; addr += PAGE_SIZE) invlpg(addr); } PMAP_INLINE void pmap_invalidate_all(pmap_t pmap) { if (pmap == kernel_pmap || pmap->pm_active) invltlb(); } PMAP_INLINE void pmap_invalidate_cache(void) { wbinvd(); } #endif /* !SMP */ /* * Are we current address space or kernel? N.B. We return FALSE when * a pmap's page table is in use because a kernel thread is borrowing * it. The borrowed page table can change spontaneously, making any * dependence on its continued use subject to a race condition. */ static __inline int pmap_is_current(pmap_t pmap) { return (pmap == kernel_pmap || (pmap == vmspace_pmap(curthread->td_proc->p_vmspace) && (pmap->pm_pdir[PTDPTDI] & PG_FRAME) == (PTDpde[0] & PG_FRAME))); } /* * If the given pmap is not the current or kernel pmap, the returned pte must * be released by passing it to pmap_pte_release(). */ pt_entry_t * pmap_pte(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_lock(&PMAP2mutex); newpf = *pde & PG_FRAME; if ((*PMAP2 & PG_FRAME) != newpf) { *PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M; pmap_invalidate_page(kernel_pmap, (vm_offset_t)PADDR2); } return (PADDR2 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Releases a pte that was obtained from pmap_pte(). Be prepared for the pte * being NULL. 
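 *
 * (The usual pairing is sketched below, roughly as pmap_extract() uses it;
 * the demo_* name is made up for illustration and the caller is assumed to
 * hold a reference on the pmap.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
static void
demo_pte_lookup(pmap_t pmap, vm_offset_t va)
{
	pt_entry_t *pte;

	PMAP_LOCK(pmap);
	pte = pmap_pte(pmap, va);	/* may borrow the PMAP2 window */
	if (pte != NULL && (*pte & PG_V) != 0) {
		/* ... inspect *pte while the lock is held ... */
	}
	pmap_pte_release(pte);		/* drops PMAP2mutex only if it was taken */
	PMAP_UNLOCK(pmap);
}
#endif
/*
 * The release routine follows.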
*/ static __inline void pmap_pte_release(pt_entry_t *pte) { if ((pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK) == PADDR2) mtx_unlock(&PMAP2mutex); } static __inline void invlcaddr(void *caddr) { invlpg((u_int)caddr); } /* * Super fast pmap_pte routine best used when scanning * the pv lists. This eliminates many coarse-grained * invltlb calls. Note that many of the pv list * scans are across different pmaps. It is very wasteful * to do an entire invltlb for checking a single mapping. * * If the given pmap is not the current pmap, vm_page_queue_mtx * must be held and curthread pinned to a CPU. */ static pt_entry_t * pmap_pte_quick(pmap_t pmap, vm_offset_t va) { pd_entry_t newpf; pd_entry_t *pde; pde = pmap_pde(pmap, va); if (*pde & PG_PS) return (pde); if (*pde != 0) { /* are we current address space or kernel? */ if (pmap_is_current(pmap)) return (vtopte(va)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); newpf = *pde & PG_FRAME; if ((*PMAP1 & PG_FRAME) != newpf) { *PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M; #ifdef SMP PMAP1cpu = PCPU_GET(cpuid); #endif invlcaddr(PADDR1); PMAP1changed++; } else #ifdef SMP if (PMAP1cpu != PCPU_GET(cpuid)) { PMAP1cpu = PCPU_GET(cpuid); invlcaddr(PADDR1); PMAP1changedcpu++; } else #endif PMAP1unchanged++; return (PADDR1 + (i386_btop(va) & (NPTEPG - 1))); } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t rtval; pt_entry_t *pte; pd_entry_t pde; rtval = 0; PMAP_LOCK(pmap); pde = pmap->pm_pdir[va >> PDRSHIFT]; if (pde != 0) { if ((pde & PG_PS) != 0) { rtval = (pde & PG_PS_FRAME) | (va & PDRMASK); PMAP_UNLOCK(pmap); return rtval; } pte = pmap_pte(pmap, va); rtval = (*pte & PG_FRAME) | (va & PAGE_MASK); pmap_pte_release(pte); } PMAP_UNLOCK(pmap); return (rtval); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { pd_entry_t pde; pt_entry_t pte; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); pde = *pmap_pde(pmap, va); if (pde != 0) { if (pde & PG_PS) { if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) { m = PHYS_TO_VM_PAGE((pde & PG_PS_FRAME) | (va & PDRMASK)); vm_page_hold(m); } } else { sched_pin(); pte = *pmap_pte_quick(pmap, va); if (pte != 0 && ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(pte & PG_FRAME); vm_page_hold(m); } sched_unpin(); } } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Add a wired page to the kva. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kenter(vm_offset_t va, vm_paddr_t pa) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag); } PMAP_INLINE void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode) { pt_entry_t *pte; pte = vtopte(va); pte_store(pte, pa | PG_RW | PG_V | pgeflag | pmap_cache_bits(mode, 0)); } /* * Remove a page from the kernel pagetables. * Note: not SMP coherent. */ PMAP_INLINE void pmap_kremove(vm_offset_t va) { pt_entry_t *pte; pte = vtopte(va); pte_clear(pte); } /* * Used to map a range of physical addresses into kernel * virtual address space. 
* * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { vm_offset_t va, sva; va = sva = *virt; while (start < end) { pmap_kenter(va, start); va += PAGE_SIZE; start += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); *virt = va; return (sva); } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count) { pt_entry_t *endpte, oldpte, *pte; oldpte = 0; pte = vtopte(sva); endpte = pte + count; while (pte < endpte) { oldpte |= *pte; pte_store(pte, VM_PAGE_TO_PHYS(*ma) | pgeflag | PG_RW | PG_V); pte++; ma++; } if ((oldpte & PG_V) != 0) pmap_invalidate_range(kernel_pmap, sva, sva + count * PAGE_SIZE); } /* * This routine tears out page mappings from the * kernel -- it is meant only for temporary mappings. * Note: SMP coherent. Uses a ranged shootdown IPI. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; va = sva; while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, sva, va); } /*************************************************** * Page table page management routines..... ***************************************************/ static PMAP_INLINE void pmap_free_zero_pages(vm_page_t free) { vm_page_t m; while (free != NULL) { m = free; free = m->right; vm_page_free_zero(m); } } /* * This routine unholds page table pages, and if the hold count * drops to zero, then it decrements the wire count. */ static PMAP_INLINE int pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) { --m->wire_count; if (m->wire_count == 0) return _pmap_unwire_pte_hold(pmap, m, free); else return 0; } static int _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, vm_page_t *free) { vm_offset_t pteva; /* * unmap the page table page */ pmap->pm_pdir[m->pindex] = 0; --pmap->pm_stats.resident_count; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); /* * Do an invltlb to make the invalidated mapping * take effect immediately. */ pteva = VM_MAXUSER_ADDRESS + i386_ptob(m->pindex); pmap_invalidate_page(pmap, pteva); /* * Put page on a list so that it is released after * *ALL* TLB shootdown is done */ m->right = *free; *free = m; return 1; } /* * After removing a page table entry, this routine is used to * conditionally free the page, and manage the hold/wire counts. 
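 *
 * (Aside: pmap_qenter()/pmap_qremove() above are the usual tools for short
 * lived kernel mappings; a rough usage sketch follows.  The demo_* name is
 * made up for illustration, "kva" stands for KVA reserved up front, e.g.
 * with kmem_alloc_nofault(), and "pages" for wired pages.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
static void
demo_temporary_mapping(vm_offset_t kva, vm_page_t *pages)
{

	pmap_qenter(kva, pages, 2);	/* map two wired pages, with shootdown */
	/* ... access the pages through kva ... */
	pmap_qremove(kva, 2);		/* tear the temporary mappings down */
}
#endif
/*
 * The page table page bookkeeping continues below.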
*/ static int pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t *free) { pd_entry_t ptepde; vm_page_t mpte; if (va >= VM_MAXUSER_ADDRESS) return 0; ptepde = *pmap_pde(pmap, va); mpte = PHYS_TO_VM_PAGE(ptepde & PG_FRAME); return pmap_unwire_pte_hold(pmap, mpte, free); } void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_pdir = (pd_entry_t *)(KERNBASE + (vm_offset_t)IdlePTD); #ifdef PAE pmap->pm_pdpt = (pdpt_entry_t *)(KERNBASE + (vm_offset_t)IdlePDPT); #endif pmap->pm_active = 0; PCPU_SET(curpmap, pmap); TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; vm_paddr_t pa; static int color; int i; PMAP_LOCK_INIT(pmap); /* * No need to allocate page table space yet but we do need a valid * page directory table. */ if (pmap->pm_pdir == NULL) { pmap->pm_pdir = (pd_entry_t *)kmem_alloc_nofault(kernel_map, NBPTD); #ifdef PAE pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO); KASSERT(((vm_offset_t)pmap->pm_pdpt & ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0, ("pmap_pinit: pdpt misaligned")); KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30), ("pmap_pinit: pdpt above 4g")); #endif } /* * allocate the page directory page(s) */ for (i = 0; i < NPGPTD;) { m = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; else { ptdpg[i++] = m; } } pmap_qenter((vm_offset_t)pmap->pm_pdir, ptdpg, NPGPTD); for (i = 0; i < NPGPTD; i++) { if ((ptdpg[i]->flags & PG_ZERO) == 0) bzero(pmap->pm_pdir + (i * NPDEPG), PAGE_SIZE); } mtx_lock_spin(&allpmaps_lock); LIST_INSERT_HEAD(&allpmaps, pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); /* Wire in kernel global address entries. */ /* XXX copies current process, does not fill in MPPTDI */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * sizeof(pd_entry_t)); #ifdef SMP pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; #endif /* install self-referential address mapping entry(s) */ for (i = 0; i < NPGPTD; i++) { pa = VM_PAGE_TO_PHYS(ptdpg[i]); pmap->pm_pdir[PTDPTDI + i] = pa | PG_V | PG_RW | PG_A | PG_M; #ifdef PAE pmap->pm_pdpt[i] = pa | PG_V; #endif } pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvchunk); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * this routine is called if the page table page is not * mapped correctly. */ static vm_page_t _pmap_allocpte(pmap_t pmap, unsigned ptepindex, int flags) { vm_paddr_t ptepa; vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("_pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Allocate a page table page. */ if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { if (flags & M_WAITOK) { PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); } /* * Indicate the need to retry. While waiting, the page table * page may have been allocated. */ return (NULL); } if ((m->flags & PG_ZERO) == 0) pmap_zero_page(m); /* * Map the pagetable page into the process address space, if * it isn't already there. 
*/ pmap->pm_stats.resident_count++; ptepa = VM_PAGE_TO_PHYS(m); pmap->pm_pdir[ptepindex] = (pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M); return m; } static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, int flags) { unsigned ptepindex; pd_entry_t ptepa; vm_page_t m; KASSERT((flags & (M_NOWAIT | M_WAITOK)) == M_NOWAIT || (flags & (M_NOWAIT | M_WAITOK)) == M_WAITOK, ("pmap_allocpte: flags is neither M_NOWAIT nor M_WAITOK")); /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; retry: /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * This supports switching from a 4MB page to a * normal 4K page. */ if (ptepa & PG_PS) { pmap->pm_pdir[ptepindex] = 0; ptepa = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; pmap_invalidate_all(kernel_pmap); } /* * If the page table page is mapped, we just increment the * hold count, and activate it. */ if (ptepa) { m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); m->wire_count++; } else { /* * Here if the pte page isn't mapped, or if it has * been deallocated. */ m = _pmap_allocpte(pmap, ptepindex, flags); if (m == NULL && (flags & M_WAITOK)) goto retry; } return (m); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ #ifdef SMP /* * Deal with a SMP shootdown of other users of the pmap that we are * trying to dispose of. This can be a bit hairy. */ static u_int *lazymask; static u_int lazyptd; static volatile u_int lazywait; void pmap_lazyfix_action(void); void pmap_lazyfix_action(void) { u_int mymask = PCPU_GET(cpumask); #ifdef COUNT_IPIS *ipi_lazypmap_counts[PCPU_GET(cpuid)]++; #endif if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); atomic_store_rel_int(&lazywait, 1); } static void pmap_lazyfix_self(u_int mymask) { if (rcr3() == lazyptd) load_cr3(PCPU_GET(curpcb)->pcb_cr3); atomic_clear_int(lazymask, mymask); } static void pmap_lazyfix(pmap_t pmap) { u_int mymask; u_int mask; u_int spins; while ((mask = pmap->pm_active) != 0) { spins = 50000000; mask = mask & -mask; /* Find least significant set bit */ mtx_lock_spin(&smp_ipi_mtx); #ifdef PAE lazyptd = vtophys(pmap->pm_pdpt); #else lazyptd = vtophys(pmap->pm_pdir); #endif mymask = PCPU_GET(cpumask); if (mask == mymask) { lazymask = &pmap->pm_active; pmap_lazyfix_self(mymask); } else { atomic_store_rel_int((u_int *)&lazymask, (u_int)&pmap->pm_active); atomic_store_rel_int(&lazywait, 0); ipi_selected(mask, IPI_LAZYPMAP); while (lazywait == 0) { ia32_pause(); if (--spins == 0) break; } } mtx_unlock_spin(&smp_ipi_mtx); if (spins == 0) printf("pmap_lazyfix: spun for 50000000\n"); } } #else /* SMP */ /* * Cleaning up on uniprocessor is easy. For various reasons, we're * unlikely to have to even execute this code, including the fact * that the cleanup is deferred until the parent does a wait(2), which * means that another userland process has run. */ static void pmap_lazyfix(pmap_t pmap) { u_int cr3; cr3 = vtophys(pmap->pm_pdir); if (cr3 == rcr3()) { load_cr3(PCPU_GET(curpcb)->pcb_cr3); pmap->pm_active &= ~(PCPU_GET(cpumask)); } } #endif /* SMP */ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
*/ void pmap_release(pmap_t pmap) { vm_page_t m, ptdpg[NPGPTD]; int i; KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); pmap_lazyfix(pmap); mtx_lock_spin(&allpmaps_lock); LIST_REMOVE(pmap, pm_list); mtx_unlock_spin(&allpmaps_lock); for (i = 0; i < NPGPTD; i++) ptdpg[i] = PHYS_TO_VM_PAGE(pmap->pm_pdir[PTDPTDI + i] & PG_FRAME); bzero(pmap->pm_pdir + PTDPTDI, (nkpt + NPGPTD) * sizeof(*pmap->pm_pdir)); #ifdef SMP pmap->pm_pdir[MPPTDI] = 0; #endif pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD); for (i = 0; i < NPGPTD; i++) { m = ptdpg[i]; #ifdef PAE KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME), ("pmap_release: got wrong ptd page")); #endif m->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); vm_page_free_zero(m); } PMAP_LOCK_DESTROY(pmap); } static int kvm_size(SYSCTL_HANDLER_ARGS) { unsigned long ksize = VM_MAX_KERNEL_ADDRESS - KERNBASE; return sysctl_handle_long(oidp, &ksize, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_size, "IU", "Size of KVM"); static int kvm_free(SYSCTL_HANDLER_ARGS) { unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end; return sysctl_handle_long(oidp, &kfree, 0, req); } SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 0, 0, kvm_free, "IU", "Amount of KVM free"); /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct pmap *pmap; vm_paddr_t ptppaddr; vm_page_t nkpg; pd_entry_t newpdir; pt_entry_t *pde; mtx_assert(&kernel_map->system_mtx, MA_OWNED); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); nkpt++; if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } addr = roundup2(addr, PAGE_SIZE * NPTEPG); if (addr - 1 >= kernel_map->max_offset) addr = kernel_map->max_offset; while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } continue; } /* * This index is bogus, but out of the way */ nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); nkpt++; pmap_zero_page(nkpg); ptppaddr = VM_PAGE_TO_PHYS(nkpg); newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M); pdir_pde(PTD, kernel_vm_end) = newpdir; mtx_lock_spin(&allpmaps_lock); LIST_FOREACH(pmap, &allpmaps, pm_list) { pde = pmap_pde(pmap, kernel_vm_end); pde_store(pde, newpdir); } mtx_unlock_spin(&allpmaps_lock); kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); if (kernel_vm_end - 1 >= kernel_map->max_offset) { kernel_vm_end = kernel_map->max_offset; break; } } } /*************************************************** * page management routines. 
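 *
 * (Aside: pmap_growkernel() above extends the kernel page tables one page
 * table page at a time; each such page maps PAGE_SIZE * NPTEPG of KVA, 4MB
 * without PAE and 2MB with it, which is why requests are rounded up as in
 * the guarded fragment below, with "addr" assumed in scope as it is there.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
	addr = roundup2(addr, PAGE_SIZE * NPTEPG);
	/* ... then page table pages are allocated until kernel_vm_end covers addr. */
#endif
/*
 * (page management routines follow)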
***************************************************/ CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE); CTASSERT(_NPCM == 11); static __inline struct pv_chunk * pv_to_chunk(pv_entry_t pv) { return (struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK); } #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap) #define PC_FREE0_9 0xfffffffful /* Free values for index 0 through 9 */ #define PC_FREE10 0x0000fffful /* Free values for index 10 */ static uint32_t pc_freemask[11] = { PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE0_9, PC_FREE10 }; SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0, "Current number of pv entries"); #ifdef PV_STATS static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail; SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0, "Current number of pv entry chunks"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0, "Current number of pv entry chunks allocated"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0, "Current number of pv entry chunks frees"); SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0, "Number of times tried to get a chunk page but failed."); static long pv_entry_frees, pv_entry_allocs; static int pv_entry_spare; SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0, "Current number of pv entry frees"); SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0, "Current number of pv entry allocs"); SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0, "Current number of spare pv entries"); static int pmap_collect_inactive, pmap_collect_active; SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_inactive, CTLFLAG_RD, &pmap_collect_inactive, 0, "Current number times pmap_collect called on inactive queue"); SYSCTL_INT(_vm_pmap, OID_AUTO, pmap_collect_active, CTLFLAG_RD, &pmap_collect_active, 0, "Current number times pmap_collect called on active queue"); #endif /* * We are in a serious low memory condition. Resort to * drastic measures to free some pages so we can allocate * another pv entry chunk. This is normally called to * unmap inactive pages, and if necessary, active pages. */ static void pmap_collect(pmap_t locked_pmap, struct vpgqueues *vpq) { pmap_t pmap; pt_entry_t *pte, tpte; pv_entry_t next_pv, pv; vm_offset_t va; vm_page_t m, free; sched_pin(); TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = PV_PMAP(pv); /* Avoid deadlock and lock recursion. 
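 * The caller already holds locked_pmap, and pmap locks are only ever
 * taken in ascending address order: a pmap that sorts after locked_pmap
 * may be locked unconditionally, one that sorts before it may only be
 * trylocked (and is skipped if that fails), and locked_pmap itself is
 * simply used as is rather than being locked again.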
*/ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pmap, va); tpte = pte_load_clear(pte); KASSERT((tpte & PG_W) == 0, ("pmap_collect: wired pte %#jx", (uintmax_t)tpte)); if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_collect: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, va, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); m->md.pv_list_count--; free_pv_entry(pmap, pv); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); } } sched_unpin(); } /* * free the pv_entry back to the free list */ static void free_pv_entry(pmap_t pmap, pv_entry_t pv) { vm_page_t m; struct pv_chunk *pc; int idx, field, bit; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc = pv_to_chunk(pv); idx = pv - &pc->pc_pventry[0]; field = idx / 32; bit = idx % 32; pc->pc_map[field] |= 1ul << bit; /* move to head of list */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); for (idx = 0; idx < _NPCM; idx++) if (pc->pc_map[idx] != pc_freemask[idx]) return; PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); /* entire chunk is free, return it */ TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t pmap, int try) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; static vm_pindex_t colour; int bit, field, page_req; pv_entry_t pv; struct pv_chunk *pc; vm_page_t m; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PV_STAT(pv_entry_allocs++); pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); pc = TAILQ_FIRST(&pmap->pm_pvchunk); if (pc != NULL) { for (field = 0; field < _NPCM; field++) { if (pc->pc_map[field]) { bit = bsfl(pc->pc_map[field]); break; } } if (field < _NPCM) { pv = &pc->pc_pventry[field * 32 + bit]; pc->pc_map[field] &= ~(1ul << bit); /* If this was the last item, move it to tail */ for (field = 0; field < _NPCM; field++) if (pc->pc_map[field] != 0) { PV_STAT(pv_entry_spare--); return (pv); /* not full, return */ } TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare--); return (pv); } } pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); page_req = try ? VM_ALLOC_NORMAL : VM_ALLOC_SYSTEM; m = vm_page_alloc(NULL, colour, page_req | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (m == NULL || pc == NULL) { if (try) { pv_entry_count--; PV_STAT(pc_chunk_tryfail++); if (m) { vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } if (pc) pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); return (NULL); } /* * Reclaim pv entries: At first, destroy mappings to * inactive pages. After that, if a pv chunk entry * is still needed, destroy mappings to active pages. 
*/ if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, " "consider increasing tunables " "vm.pmap.shpgperproc or " "vm.pmap.pv_entry_max\n"); PV_STAT(pmap_collect_inactive++); pmap_collect(pmap, &vm_page_queues[PQ_INACTIVE]); if (m == NULL) m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pc == NULL) pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree); if (m == NULL || pc == NULL) { PV_STAT(pmap_collect_active++); pmap_collect(pmap, &vm_page_queues[PQ_ACTIVE]); if (m == NULL) m = vm_page_alloc(NULL, colour, VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED); if (pc == NULL) pc = (struct pv_chunk *) pmap_ptelist_alloc(&pv_vafree); if (m == NULL || pc == NULL) panic("get_pv_entry: increase vm.pmap.shpgperproc"); } } PV_STAT(pc_chunk_count++); PV_STAT(pc_chunk_allocs++); colour++; pmap_qenter((vm_offset_t)pc, &m, 1); pc->pc_pmap = pmap; pc->pc_map[0] = pc_freemask[0] & ~1ul; /* preallocated bit 0 */ for (field = 1; field < _NPCM; field++) pc->pc_map[field] = pc_freemask[field]; pv = &pc->pc_pventry[0]; TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list); PV_STAT(pv_entry_spare += _NPCPV - 1); return (pv); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == PV_PMAP(pv) && va == pv->pv_va) break; } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found")); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); free_pv_entry(pmap, pv); } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); pv = get_pv_entry(pmap, FALSE); pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = get_pv_entry(pmap, TRUE)) != NULL) { pv->pv_va = va; TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* * pmap_remove_pte: do the things to unmap a page in a process */ static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va, vm_page_t *free) { pt_entry_t oldpte; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); oldpte = pte_load_clear(ptq); if (oldpte & PG_W) pmap->pm_stats.wired_count -= 1; /* * Machines that don't support invlpg, also don't support * PG_G. 
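 * When PG_G is set the translation is global: it survives a CR3 reload,
 * so it cannot be left to the deferred, per-pmap invalidation done by
 * our callers and is flushed here, against kernel_pmap, so that every
 * CPU drops the stale entry right away.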
*/ if (oldpte & PG_G) pmap_invalidate_page(kernel_pmap, va); pmap->pm_stats.resident_count -= 1; if (oldpte & PG_MANAGED) { m = PHYS_TO_VM_PAGE(oldpte & PG_FRAME); if (oldpte & PG_M) { KASSERT((oldpte & PG_RW), ("pmap_remove_pte: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)oldpte)); vm_page_dirty(m); } if (oldpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } return (pmap_unuse_pt(pmap, va, free)); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { pt_entry_t *pte; vm_page_t free = NULL; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(curthread->td_pinned > 0, ("curthread not pinned")); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_pte_quick(pmap, va)) == NULL || *pte == 0) return; pmap_remove_pte(pmap, pte, va, &free); pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; vm_page_t free = NULL; int anyvalid; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; anyvalid = 0; vm_page_lock_queues(); sched_pin(); PMAP_LOCK(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if ((sva + PAGE_SIZE == eva) && ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) { pmap_remove_page(pmap, sva); goto out; } for (; sva < eva; sva = pdnxt) { unsigned pdirindex; /* * Calculate index for next page table. */ pdnxt = (sva + NBPDR) & ~PDRMASK; if (pmap->pm_stats.resident_count == 0) break; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { pmap->pm_pdir[pdirindex] = 0; pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE; anyvalid = 1; continue; } /* * Limit our scan to either the end of the va represented * by the current page table page, or to the end of the * range being removed. */ if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { if (*pte == 0) continue; /* * The TLB entry for a PG_G mapping is invalidated * by pmap_remove_pte(). */ if ((*pte & PG_G) == 0) anyvalid = 1; if (pmap_remove_pte(pmap, pte, sva, &free)) break; } } out: sched_unpin(); if (anyvalid) { pmap_invalidate_all(pmap); pmap_free_zero_pages(free); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte, tpte; vm_page_t free; #if defined(PMAP_DIAGNOSTIC) /* * XXX This makes pmap_remove_all() illegal for non-managed pages! 
*/ if (m->flags & PG_FICTITIOUS) { panic("pmap_remove_all: illegal for unmanaged page, va: 0x%x", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); sched_pin(); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pmap->pm_stats.resident_count--; pte = pmap_pte_quick(pmap, pv->pv_va); tpte = pte_load_clear(pte); if (tpte & PG_W) pmap->pm_stats.wired_count--; if (tpte & PG_A) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. */ if (tpte & PG_M) { KASSERT((tpte & PG_RW), ("pmap_remove_all: modified page not writable: va: %#x, pte: %#jx", pv->pv_va, (uintmax_t)tpte)); vm_page_dirty(m); } free = NULL; pmap_unuse_pt(pmap, pv->pv_va, &free); pmap_invalidate_page(pmap, pv->pv_va); pmap_free_zero_pages(free); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; free_pv_entry(pmap, pv); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t pdnxt; pd_entry_t ptpaddr; pt_entry_t *pte; int anychanged; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } #ifdef PAE if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; #else if (prot & VM_PROT_WRITE) return; #endif anychanged = 0; vm_page_lock_queues(); sched_pin(); PMAP_LOCK(pmap); for (; sva < eva; sva = pdnxt) { pt_entry_t obits, pbits; unsigned pdirindex; pdnxt = (sva + NBPDR) & ~PDRMASK; pdirindex = sva >> PDRSHIFT; ptpaddr = pmap->pm_pdir[pdirindex]; /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (ptpaddr == 0) continue; /* * Check for large page. */ if ((ptpaddr & PG_PS) != 0) { if ((prot & VM_PROT_WRITE) == 0) pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pmap->pm_pdir[pdirindex] |= pg_nx; #endif anychanged = 1; continue; } if (pdnxt > eva) pdnxt = eva; for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++, sva += PAGE_SIZE) { vm_page_t m; retry: /* * Regardless of whether a pte is 32 or 64 bits in * size, PG_RW, PG_A, and PG_M are among the least * significant 32 bits. */ obits = pbits = *pte; if ((pbits & PG_V) == 0) continue; if (pbits & PG_MANAGED) { m = NULL; if (pbits & PG_A) { m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_flag_set(m, PG_REFERENCED); pbits &= ~PG_A; } if ((pbits & PG_M) != 0) { if (m == NULL) m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); vm_page_dirty(m); } } if ((prot & VM_PROT_WRITE) == 0) pbits &= ~(PG_RW | PG_M); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pbits |= pg_nx; #endif if (pbits != obits) { #ifdef PAE if (!atomic_cmpset_64(pte, obits, pbits)) goto retry; #else if (!atomic_cmpset_int((u_int *)pte, obits, pbits)) goto retry; #endif if (obits & PG_G) pmap_invalidate_page(pmap, sva); else anychanged = 1; } } } sched_unpin(); if (anychanged) pmap_invalidate_all(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. 
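 *
 * (Aside: pmap_protect() above updates live ptes with a compare-and-swap
 * retry loop; stripped to its skeleton for the non-PAE write-protect case it
 * looks roughly like the guarded fragment below, with "pte", "obits" and
 * "pbits" assumed in scope as they are there.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
	/*
	 * If the MMU races in a PG_A/PG_M update after the snapshot, the
	 * cmpset fails and the loop retries, so no bit update is lost.
	 */
	do {
		obits = pbits = *pte;		/* snapshot the pte */
		pbits &= ~(PG_RW | PG_M);	/* desired contents */
	} while (obits != pbits &&
	    !atomic_cmpset_int((u_int *)pte, obits, pbits));
#endif
/*
 * pmap_enter() follows.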
*/ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa; pd_entry_t *pde; pt_entry_t *pte; vm_paddr_t opa; pt_entry_t origpte, newpte; vm_page_t mpte, om; boolean_t invlva; va = trunc_page(va); #ifdef PMAP_DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va); #endif mpte = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { mpte = pmap_allocpte(pmap, va, M_WAITOK); } #if 0 && defined(PMAP_DIAGNOSTIC) else { pd_entry_t *pdeaddr = pmap_pde(pmap, va); origpte = *pdeaddr; if ((origpte & PG_V) == 0) { panic("pmap_enter: invalid kernel page table page, pdir=%p, pde=%p, va=%p\n", pmap->pm_pdir[PTDPTDI], origpte, va); } } #endif pde = pmap_pde(pmap, va); if ((*pde & PG_PS) != 0) panic("pmap_enter: attempted pmap_enter on 4MB page"); pte = pmap_pte_quick(pmap, va); /* * Page Directory table entry not valid, we need a new PT page */ if (pte == NULL) { panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x\n", (uintmax_t)pmap->pm_pdir[PTDPTDI], va); } pa = VM_PAGE_TO_PHYS(m); om = NULL; origpte = *pte; opa = origpte & PG_FRAME; /* * Mapping has not changed, must be protection or wiring change. */ if (origpte && (opa == pa)) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((origpte & PG_W) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (origpte & PG_W)) pmap->pm_stats.wired_count--; /* * Remove extra pte reference */ if (mpte) mpte->wire_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (origpte & PG_MANAGED) { om = m; pa |= PG_MANAGED; } goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { if (origpte & PG_W) pmap->pm_stats.wired_count--; if (origpte & PG_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } if (mpte != NULL) { mpte->wire_count--; KASSERT(mpte->wire_count > 0, ("pmap_enter: missing reference to page table page," " va: 0x%x", va)); } } else pmap->pm_stats.resident_count++; /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); pa |= PG_MANAGED; } /* * Increment counters */ if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ newpte = (pt_entry_t)(pa | PG_V); if ((prot & VM_PROT_WRITE) != 0) { newpte |= PG_RW; vm_page_flag_set(m, PG_WRITEABLE); } #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) newpte |= pg_nx; #endif if (wired) newpte |= PG_W; if (va < VM_MAXUSER_ADDRESS) newpte |= PG_U; if (pmap == kernel_pmap) newpte |= pgeflag; /* * if the mapping or permission bits are different, we need * to update the pte. 
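 * (PG_M and PG_A are masked out of the comparison below because the MMU
 * may have set them behind our back; on their own they do not mean that
 * the mapping requested here differs from the one already installed.)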
*/ if ((origpte & ~(PG_M|PG_A)) != newpte) { if (origpte & PG_V) { invlva = FALSE; origpte = pte_load_store(pte, newpte | PG_A); if (origpte & PG_A) { if (origpte & PG_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if (opa != VM_PAGE_TO_PHYS(m)) invlva = TRUE; #ifdef PAE if ((origpte & PG_NX) == 0 && (newpte & PG_NX) != 0) invlva = TRUE; #endif } if (origpte & PG_M) { KASSERT((origpte & PG_RW), ("pmap_enter: modified page not writable: va: %#x, pte: %#jx", va, (uintmax_t)origpte)); if ((origpte & PG_MANAGED) != 0) vm_page_dirty(om); if ((prot & VM_PROT_WRITE) == 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va); } else pte_store(pte, newpte | PG_A); } sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m, mpte; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); mpte = NULL; m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { mpte = pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot, mpte); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); (void) pmap_enter_quick_locked(pmap, va, m, prot, NULL); PMAP_UNLOCK(pmap); } static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, vm_page_t mpte) { pt_entry_t *pte; vm_paddr_t pa; vm_page_t free; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); /* * In the case that a page table page is not * resident, we are creating it here. */ if (va < VM_MAXUSER_ADDRESS) { unsigned ptepindex; pd_entry_t ptepa; /* * Calculate pagetable page index */ ptepindex = va >> PDRSHIFT; if (mpte && (mpte->pindex == ptepindex)) { mpte->wire_count++; } else { /* * Get the page directory entry */ ptepa = pmap->pm_pdir[ptepindex]; /* * If the page table page is mapped, we just increment * the hold count, and activate it. */ if (ptepa) { if (ptepa & PG_PS) panic("pmap_enter_quick: unexpected mapping into 4MB page"); mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME); mpte->wire_count++; } else { mpte = _pmap_allocpte(pmap, ptepindex, M_NOWAIT); if (mpte == NULL) return (mpte); } } } else { mpte = NULL; } /* * This call to vtopte makes the assumption that we are * entering the page into the current pmap. 
In order to support * quick entry into any pmap, one would likely use pmap_pte_quick. * But that isn't as quick as vtopte. */ pte = vtopte(va); if (*pte) { if (mpte != NULL) { mpte->wire_count--; mpte = NULL; } return (mpte); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0 && !pmap_try_insert_pv_entry(pmap, va, m)) { if (mpte != NULL) { free = NULL; if (pmap_unwire_pte_hold(pmap, mpte, &free)) { pmap_invalidate_page(pmap, va); pmap_free_zero_pages(free); } mpte = NULL; } return (mpte); } /* * Increment counters */ pmap->pm_stats.resident_count++; pa = VM_PAGE_TO_PHYS(m); #ifdef PAE if ((prot & VM_PROT_EXECUTE) == 0) pa |= pg_nx; #endif /* * Now validate mapping with RO protection */ if (m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) pte_store(pte, pa | PG_V | PG_U); else pte_store(pte, pa | PG_V | PG_U | PG_MANAGED); return mpte; } /* * Make a temporary mapping for a physical address. This is only intended * to be used for panic dumps. */ void * pmap_kenter_temporary(vm_paddr_t pa, int i) { vm_offset_t va; va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE); pmap_kenter(va, pa); invlpg(va); return ((void *)crashdumpmap); } /* * This code maps large physical mmap regions into the * processor address space. Note that some shortcuts * are taken, but the code works. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); if (pseflag && ((addr & (NBPDR - 1)) == 0) && ((size & (NBPDR - 1)) == 0)) { int i; vm_page_t m[1]; unsigned int ptepindex; int npdes; pd_entry_t ptepa; PMAP_LOCK(pmap); if (pmap->pm_pdir[ptepindex = (addr >> PDRSHIFT)]) goto out; PMAP_UNLOCK(pmap); retry: p = vm_page_lookup(object, pindex); if (p != NULL) { if (vm_page_sleep_if_busy(p, FALSE, "init4p")) goto retry; } else { p = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL); if (p == NULL) return; m[0] = p; if (vm_pager_get_pages(object, m, 1, 0) != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(p); vm_page_unlock_queues(); return; } p = vm_page_lookup(object, pindex); vm_page_lock_queues(); vm_page_wakeup(p); vm_page_unlock_queues(); } ptepa = VM_PAGE_TO_PHYS(p); if (ptepa & (NBPDR - 1)) return; p->valid = VM_PAGE_BITS_ALL; PMAP_LOCK(pmap); pmap->pm_stats.resident_count += size >> PAGE_SHIFT; npdes = size >> PDRSHIFT; for(i = 0; i < npdes; i++) { pde_store(&pmap->pm_pdir[ptepindex], ptepa | PG_U | PG_RW | PG_V | PG_PS); ptepa += NBPDR; ptepindex += 1; } pmap_invalidate_all(pmap); out: PMAP_UNLOCK(pmap); } } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { pt_entry_t *pte; PMAP_LOCK(pmap); pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); pmap_pte_release(pte); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. 
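 *
 * (Aside: pmap_kenter_temporary() above is the panic-dump helper; a rough
 * usage sketch follows.  The demo_* name is made up for illustration and
 * "pa" stands for the physical page being dumped.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
static void
demo_dump_one_page(vm_paddr_t pa)
{
	void *p;

	p = pmap_kenter_temporary(pa, 0);	/* window 0 of crashdumpmap */
	/*
	 * The routine issues the invlpg itself, so PAGE_SIZE bytes can be
	 * copied out from p to the dump device straight away.
	 */
}
#endif
/*
 * pmap_copy() follows.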
*/ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_page_t free; vm_offset_t addr; vm_offset_t end_addr = src_addr + len; vm_offset_t pdnxt; if (dst_addr != src_addr) return; if (!pmap_is_current(src_pmap)) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } sched_pin(); for (addr = src_addr; addr < end_addr; addr = pdnxt) { pt_entry_t *src_pte, *dst_pte; vm_page_t dstmpte, srcmpte; pd_entry_t srcptepaddr; unsigned ptepindex; if (addr >= UPT_MIN_ADDRESS) panic("pmap_copy: invalid to pmap_copy page tables"); pdnxt = (addr + NBPDR) & ~PDRMASK; ptepindex = addr >> PDRSHIFT; srcptepaddr = src_pmap->pm_pdir[ptepindex]; if (srcptepaddr == 0) continue; if (srcptepaddr & PG_PS) { if (dst_pmap->pm_pdir[ptepindex] == 0) { dst_pmap->pm_pdir[ptepindex] = srcptepaddr & ~PG_W; dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE; } continue; } srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME); if (srcmpte->wire_count == 0) panic("pmap_copy: source page table page is unused"); if (pdnxt > end_addr) pdnxt = end_addr; src_pte = vtopte(addr); while (addr < pdnxt) { pt_entry_t ptetemp; ptetemp = *src_pte; /* * we only virtual copy managed pages */ if ((ptetemp & PG_MANAGED) != 0) { dstmpte = pmap_allocpte(dst_pmap, addr, M_NOWAIT); if (dstmpte == NULL) break; dst_pte = pmap_pte_quick(dst_pmap, addr); if (*dst_pte == 0 && pmap_try_insert_pv_entry(dst_pmap, addr, PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) { /* * Clear the wired, modified, and * accessed (referenced) bits * during the copy. */ *dst_pte = ptetemp & ~(PG_W | PG_M | PG_A); dst_pmap->pm_stats.resident_count++; } else { free = NULL; if (pmap_unwire_pte_hold( dst_pmap, dstmpte, &free)) { pmap_invalidate_page(dst_pmap, addr); pmap_free_zero_pages(free); } } if (dstmpte->wire_count >= srcmpte->wire_count) break; } addr += PAGE_SIZE; src_pte++; } } sched_unpin(); vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } static __inline void pagezero(void *page) { #if defined(I686_CPU) if (cpu_class == CPUCLASS_686) { #if defined(CPU_ENABLE_SSE) if (cpu_feature & CPUID_SSE2) sse2_pagezero(page); else #endif i686_pagezero(page); } else #endif bzero(page, PAGE_SIZE); } /* * pmap_zero_page zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. */ void pmap_zero_page(vm_page_t m) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(sysmaps->CADDR2); pagezero(sysmaps->CADDR2); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_area zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. * * off and size may not cover an area beyond a single hardware page. 
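 *
 * Like pmap_zero_page() above, this goes through the per-CPU CADDR2
 * window: the sysmaps lock serializes users of that window and
 * sched_pin() keeps the thread on the CPU whose TLB entry was refreshed
 * with invlcaddr(), so the private mapping stays coherent while in use.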
*/ void pmap_zero_page_area(vm_page_t m, int off, int size) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP2) panic("pmap_zero_page: CMAP2 busy"); sched_pin(); *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(sysmaps->CADDR2); if (off == 0 && size == PAGE_SIZE) pagezero(sysmaps->CADDR2); else bzero((char *)sysmaps->CADDR2 + off, size); *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * pmap_zero_page_idle zeros the specified hardware page by mapping * the page into KVM and using bzero to clear its contents. This * is intended to be called from the vm_pagezero process only and * outside of Giant. */ void pmap_zero_page_idle(vm_page_t m) { if (*CMAP3) panic("pmap_zero_page: CMAP3 busy"); sched_pin(); *CMAP3 = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) | PG_A | PG_M; invlcaddr(CADDR3); pagezero(CADDR3); *CMAP3 = 0; sched_unpin(); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t src, vm_page_t dst) { struct sysmaps *sysmaps; sysmaps = &sysmaps_pcpu[PCPU_GET(cpuid)]; mtx_lock(&sysmaps->lock); if (*sysmaps->CMAP1) panic("pmap_copy_page: CMAP1 busy"); if (*sysmaps->CMAP2) panic("pmap_copy_page: CMAP2 busy"); sched_pin(); invlpg((u_int)sysmaps->CADDR1); invlpg((u_int)sysmaps->CADDR2); *sysmaps->CMAP1 = PG_V | VM_PAGE_TO_PHYS(src) | PG_A; *sysmaps->CMAP2 = PG_V | PG_RW | VM_PAGE_TO_PHYS(dst) | PG_A | PG_M; bcopy(sysmaps->CADDR1, sysmaps->CADDR2, PAGE_SIZE); *sysmaps->CMAP1 = 0; *sysmaps->CMAP2 = 0; sched_unpin(); mtx_unlock(&sysmaps->lock); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (PV_PMAP(pv) == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. 
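 *
 * (The chunk scan below hinges on one idiom: a clear bit in pc_map means
 * "entry in use", so the in-use set is recovered by inverting and masking,
 * then peeled off lowest bit first, roughly as in the guarded fragment
 * below, with "pc", "field" and the locals assumed in scope as they are
 * there.)
 */
#if 0	/* illustrative sketch only, guarded out of the build */
	inuse = ~pc->pc_map[field] & pc_freemask[field];
	while (inuse != 0) {
		bit = bsfl(inuse);		/* lowest set bit */
		bitmask = 1UL << bit;
		idx = field * 32 + bit;		/* index into pc_pventry[] */
		/* ... operate on &pc->pc_pventry[idx] ... */
		inuse &= ~bitmask;
	}
#endif
/*
 * pmap_remove_pages() follows.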
*/ void pmap_remove_pages(pmap_t pmap) { pt_entry_t *pte, tpte; vm_page_t m, free = NULL; pv_entry_t pv; struct pv_chunk *pc, *npc; int field, idx; int32_t bit; uint32_t inuse, bitmask; int allfree; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); sched_pin(); TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) { allfree = 1; for (field = 0; field < _NPCM; field++) { inuse = (~(pc->pc_map[field])) & pc_freemask[field]; while (inuse != 0) { bit = bsfl(inuse); bitmask = 1UL << bit; idx = field * 32 + bit; pv = &pc->pc_pventry[idx]; inuse &= ~bitmask; pte = vtopte(pv->pv_va); tpte = *pte; if (tpte == 0) { printf( "TPTE at %p IS ZERO @ VA %08x\n", pte, pv->pv_va); panic("bad pte"); } /* * We cannot remove wired pages from a process' mapping at this time */ if (tpte & PG_W) { allfree = 0; continue; } m = PHYS_TO_VM_PAGE(tpte & PG_FRAME); KASSERT(m->phys_addr == (tpte & PG_FRAME), ("vm_page_t %p phys_addr mismatch %016jx %016jx", m, (uintmax_t)m->phys_addr, (uintmax_t)tpte)); KASSERT(m < &vm_page_array[vm_page_array_size], ("pmap_remove_pages: bad tpte %#jx", (uintmax_t)tpte)); pmap->pm_stats.resident_count--; pte_clear(pte); /* * Update the vm_page_t clean/reference bits. */ if (tpte & PG_M) vm_page_dirty(m); /* Mark free */ PV_STAT(pv_entry_frees++); PV_STAT(pv_entry_spare++); pv_entry_count--; pc->pc_map[field] |= bitmask; m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); pmap_unuse_pt(pmap, pv->pv_va, &free); } } if (allfree) { PV_STAT(pv_entry_spare -= _NPCPV); PV_STAT(pc_chunk_count--); PV_STAT(pc_chunk_frees++); TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list); m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc)); pmap_qremove((vm_offset_t)pc, 1); vm_page_unwire(m, 0); vm_page_free(m); pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc); } } sched_unpin(); pmap_invalidate_all(pmap); pmap_free_zero_pages(free); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { pv_entry_t pv; pt_entry_t *pte; pmap_t pmap; boolean_t rv; rv = FALSE; if (m->flags & PG_FICTITIOUS) return (rv); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); rv = (*pte & PG_M) != 0; PMAP_UNLOCK(pmap); if (rv) break; } sched_unpin(); return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { pt_entry_t *pte; boolean_t rv; rv = FALSE; PMAP_LOCK(pmap); if (*pmap_pde(pmap, addr)) { pte = vtopte(addr); rv = *pte == 0; } PMAP_UNLOCK(pmap); return (rv); } /* * Clear the write and modified bits in each of the given page's mappings. 
*/ void pmap_remove_write(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t oldpte, *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); retry: oldpte = *pte; if ((oldpte & PG_RW) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_RW and PG_M are among the least * significant 32 bits. */ if (!atomic_cmpset_int((u_int *)pte, oldpte, oldpte & ~(PG_RW | PG_M))) goto retry; if ((oldpte & PG_M) != 0) vm_page_dirty(m); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); sched_unpin(); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { pv_entry_t pv, pvf, pvn; pmap_t pmap; pt_entry_t *pte; int rtval = 0; if (m->flags & PG_FICTITIOUS) return (rtval); sched_pin(); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); rtval++; if (rtval > 4) pvn = NULL; } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } sched_unpin(); return (rtval); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_M) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_M is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_M); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { pv_entry_t pv; pmap_t pmap; pt_entry_t *pte; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0) return; sched_pin(); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); PMAP_LOCK(pmap); pte = pmap_pte_quick(pmap, pv->pv_va); if ((*pte & PG_A) != 0) { /* * Regardless of whether a pte is 32 or 64 bits * in size, PG_A is among the least significant * 32 bits. */ atomic_clear_int((u_int *)pte, PG_A); pmap_invalidate_page(pmap, pv->pv_va); } PMAP_UNLOCK(pmap); } sched_unpin(); } /* * Miscellaneous support routines follow */ /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
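 *
 * A typical caller would look something like this (the physical
 * address and size are made-up values for illustration; real drivers
 * obtain them from their bus resources):
 *
 *      void *regs = pmap_mapdev(0xfed00000UL, PAGE_SIZE);
 *      ... access the device registers through regs ...
 *      pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
 *
 * pmap_mapdev() maps the range uncacheable; pmap_mapbios() is the
 * write-back variant used for firmware tables.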
*/ void * pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) { vm_offset_t va, tmpva, offset; offset = pa & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); pa = pa & PG_FRAME; if (pa < KERNLOAD && pa + size <= KERNLOAD) va = KERNBASE + pa; else va = kmem_alloc_nofault(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0; ) { pmap_kenter_attr(tmpva, pa, mode); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_invalidate_range(kernel_pmap, va, tmpva); pmap_invalidate_cache(); return ((void *)(va + offset)); } void * pmap_mapdev(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); } void * pmap_mapbios(vm_paddr_t pa, vm_size_t size) { return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); } void pmap_unmapdev(vm_offset_t va, vm_size_t size) { vm_offset_t base, offset, tmpva; if (va >= KERNBASE && va + size <= KERNBASE + KERNLOAD) return; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) pmap_kremove(tmpva); pmap_invalidate_range(kernel_pmap, va, tmpva); kmem_free(kernel_map, base, size); } int pmap_change_attr(va, size, mode) vm_offset_t va; vm_size_t size; int mode; { vm_offset_t base, offset, tmpva; pt_entry_t *pte; u_int opte, npte; pd_entry_t *pde; base = trunc_page(va); offset = va & PAGE_MASK; size = roundup(offset + size, PAGE_SIZE); /* Only supported on kernel virtual addresses. */ if (base <= VM_MAXUSER_ADDRESS) return (EINVAL); /* 4MB pages and pages that aren't mapped aren't supported. */ for (tmpva = base; tmpva < (base + size); tmpva += PAGE_SIZE) { pde = pmap_pde(kernel_pmap, tmpva); if (*pde & PG_PS) return (EINVAL); if (*pde == 0) return (EINVAL); pte = vtopte(va); if (*pte == 0) return (EINVAL); } /* * Ok, all the pages exist and are 4k, so run through them updating * their cache mode. */ for (tmpva = base; size > 0; ) { pte = vtopte(tmpva); /* * The cache mode bits are all in the low 32-bits of the * PTE, so we can just spin on updating the low 32-bits. */ do { opte = *(u_int *)pte; npte = opte & ~(PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT); npte |= pmap_cache_bits(mode, 0); } while (npte != opte && !atomic_cmpset_int((u_int *)pte, opte, npte)); tmpva += PAGE_SIZE; size -= PAGE_SIZE; } /* * Flush CPU caches to make sure any data isn't cached that shouldn't * be, etc. */ pmap_invalidate_range(kernel_pmap, base, tmpva); pmap_invalidate_cache(); return (0); } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pt_entry_t *ptep, pte; vm_page_t m; int val = 0; PMAP_LOCK(pmap); ptep = pmap_pte(pmap, addr); pte = (ptep != NULL) ? 
*ptep : 0; pmap_pte_release(ptep); PMAP_UNLOCK(pmap); if (pte != 0) { vm_paddr_t pa; val = MINCORE_INCORE; if ((pte & PG_MANAGED) == 0) return val; pa = pte & PG_FRAME; m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pte & PG_M) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone else */ vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pte & PG_A) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone else */ vm_page_lock_queues(); if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; u_int32_t cr3; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #if defined(SMP) atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else oldpmap->pm_active &= ~1; pmap->pm_active |= 1; #endif #ifdef PAE cr3 = vtophys(pmap->pm_pdpt); #else cr3 = vtophys(pmap->pm_pdir); #endif /* * pmap_activate is for the current thread on the current cpu */ td->td_pcb->pcb_cr3 = cr3; load_cr3(cr3); PCPU_SET(curpmap, pmap); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { if ((obj == NULL) || (size < NBPDR) || (obj->type != OBJT_DEVICE)) { return addr; } addr = (addr + PDRMASK) & ~PDRMASK; return addr; } #if defined(PMAP_DEBUG) pmap_pid_dump(int pid) { pmap_t pmap; struct proc *p; int npte = 0; int index; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid != pid) continue; if (p->p_vmspace) { int i,j; index = 0; pmap = vmspace_pmap(p->p_vmspace); for (i = 0; i < NPDEPTD; i++) { pd_entry_t *pde; pt_entry_t *pte; vm_offset_t base = i << PDRSHIFT; pde = &pmap->pm_pdir[i]; if (pde && pmap_pde_v(pde)) { for (j = 0; j < NPTEPG; j++) { vm_offset_t va = base + (j << PAGE_SHIFT); if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { if (index) { index = 0; printf("\n"); } sx_sunlock(&allproc_lock); return npte; } pte = pmap_pte(pmap, va); if (pte && pmap_pte_v(pte)) { pt_entry_t pa; vm_page_t m; pa = *pte; m = PHYS_TO_VM_PAGE(pa & PG_FRAME); printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", va, pa, m->hold_count, m->wire_count, m->flags); npte++; index++; if (index >= 2) { index = 0; printf("\n"); } else { printf(" "); } } } } } } } sx_sunlock(&allproc_lock); return npte; } #endif #if defined(DEBUG) static void pads(pmap_t pm); void pmap_pvdump(vm_offset_t pa); /* print address space of pmap*/ static void pads(pmap_t pm) { int i, j; vm_paddr_t va; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < NPDEPTD; i++) if (pm->pm_pdir[i]) for (j = 0; j < NPTEPG; j++) { va = (i << PDRSHIFT) + (j << PAGE_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *ptep); }; } void pmap_pvdump(vm_paddr_t pa) { pv_entry_t pv; pmap_t pmap; vm_page_t m; printf("pa %x", pa); m = PHYS_TO_VM_PAGE(pa); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = PV_PMAP(pv); printf(" -> pmap %p, va %x", (void *)pmap, pv->pv_va); pads(pmap); } printf(" "); } #endif Index: head/sys/ia64/ia64/machdep.c =================================================================== --- head/sys/ia64/ia64/machdep.c (revision 169666) +++ 
head/sys/ia64/ia64/machdep.c (revision 169667) @@ -1,1496 +1,1496 @@ /*- * Copyright (c) 2003,2004 Marcel Moolenaar * Copyright (c) 2000,2001 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SMP #include #endif #include #include #include u_int64_t processor_frequency; u_int64_t bus_frequency; u_int64_t itc_frequency; int cold = 1; u_int64_t pa_bootinfo; struct bootinfo bootinfo; struct pcpu pcpu0; extern char kstack[]; vm_offset_t proc0kstack; extern u_int64_t kernel_text[], _end[]; extern u_int64_t ia64_gateway_page[]; extern u_int64_t break_sigtramp[]; extern u_int64_t epc_sigtramp[]; struct fpswa_iface *fpswa_iface; u_int64_t ia64_pal_base; u_int64_t ia64_port_base; char machine[] = MACHINE; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[64]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, "The CPU model name"); static char cpu_family[64]; SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0, "The CPU family name"); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) struct msgbuf *msgbufp=0; long Maxmem = 0; long realmem = 0; vm_offset_t phys_avail[100]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) void mi_startup(void); /* XXX should be in a MI header */ struct kva_md_info kmi; #define Mhz 1000000L #define Ghz (1000L*Mhz) void setPQL2(int *const size, int *const ways); void setPQL2(int *const size, int *const ways) { return; } static void identifycpu(void) { char vendor[17]; char *family_name, 
*model_name; u_int64_t features, tmp; int number, revision, model, family, archrev; /* * Assumes little-endian. */ *(u_int64_t *) &vendor[0] = ia64_get_cpuid(0); *(u_int64_t *) &vendor[8] = ia64_get_cpuid(1); vendor[16] = '\0'; tmp = ia64_get_cpuid(3); number = (tmp >> 0) & 0xff; revision = (tmp >> 8) & 0xff; model = (tmp >> 16) & 0xff; family = (tmp >> 24) & 0xff; archrev = (tmp >> 32) & 0xff; family_name = model_name = "unknown"; switch (family) { case 0x07: family_name = "Itanium"; model_name = "Merced"; break; case 0x1f: family_name = "Itanium 2"; switch (model) { case 0x00: model_name = "McKinley"; break; case 0x01: /* * Deerfield is a low-voltage variant based on the * Madison core. We need circumstantial evidence * (i.e. the clock frequency) to identify those. * Allow for roughly 1% error margin. */ tmp = processor_frequency >> 7; if ((processor_frequency - tmp) < 1*Ghz && (processor_frequency + tmp) >= 1*Ghz) model_name = "Deerfield"; else model_name = "Madison"; break; case 0x02: model_name = "Madison II"; break; } break; case 0x20: family_name = "Itanium 2"; switch (model) { case 0x00: model_name = "Montecito"; break; } break; } snprintf(cpu_family, sizeof(cpu_family), "%s", family_name); snprintf(cpu_model, sizeof(cpu_model), "%s", model_name); features = ia64_get_cpuid(4); printf("CPU: %s (", model_name); if (processor_frequency) { printf("%ld.%02ld-Mhz ", (processor_frequency + 4999) / Mhz, ((processor_frequency + 4999) / (Mhz/100)) % 100); } printf("%s)\n", family_name); printf(" Origin = \"%s\" Revision = %d\n", vendor, revision); printf(" Features = 0x%b\n", (u_int32_t) features, "\020" "\001LB" /* long branch (brl) instruction. */ "\002SD" /* Spontaneous deferral. */ "\003AO" /* 16-byte atomic operations (ld, st, cmpxchg). */ ); } static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ identifycpu(); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ld MB)\n", ia64_ptob(Maxmem), ia64_ptob(Maxmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); - printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count), - ptoa(cnt.v_free_count) / 1048576); + printf("avail memory = %ld (%ld MB)\n", ptoa(VMCNT_GET(free_count)), + ptoa(VMCNT_GET(free_count)) / 1048576); if (fpswa_iface == NULL) printf("Warning: no FPSWA package supplied\n"); else printf("FPSWA Revision = 0x%lx, Entry = %p\n", (long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); /* * Traverse the MADT to discover IOSAPIC and Local SAPIC * information. */ ia64_probe_sapics(); ia64_mca_init(); } void cpu_boot(int howto) { efi_reset_system(); } /* Get current clock frequency for the given cpu id. 
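 *
 * Hypothetical usage sketch (not taken from this file):
 *
 *      uint64_t rate;
 *      if (cpu_est_clockrate(PCPU_GET(cpuid), &rate) == 0)
 *              printf("CPU clock: %lu Hz\n", (u_long)rate);
 *
 * EINVAL is returned for an unknown cpu id or a NULL rate pointer;
 * otherwise the SAL/PAL-derived processor_frequency is reported.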
*/ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); *rate = processor_frequency; return (0); } void cpu_halt() { efi_reset_system(); } static void cpu_idle_default(void) { struct ia64_pal_result res; res = ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0); } void cpu_idle() { (*cpu_idle_hook)(); } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; void cpu_reset() { cpu_boot(0); } void cpu_switch(struct thread *old, struct thread *new) { struct pcb *oldpcb, *newpcb; oldpcb = old->td_pcb; #ifdef COMPAT_IA32 ia32_savectx(oldpcb); #endif if (PCPU_GET(fpcurthread) == old) old->td_frame->tf_special.psr |= IA64_PSR_DFH; if (!savectx(oldpcb)) { newpcb = new->td_pcb; oldpcb->pcb_current_pmap = pmap_switch(newpcb->pcb_current_pmap); PCPU_SET(curthread, new); PCPU_SET(curtid, new->td_tid); #ifdef COMPAT_IA32 ia32_restorectx(newpcb); #endif if (PCPU_GET(fpcurthread) == new) new->td_frame->tf_special.psr &= ~IA64_PSR_DFH; restorectx(newpcb); /* We should not get here. */ panic("cpu_switch: restorectx() returned"); /* NOTREACHED */ } } void cpu_throw(struct thread *old __unused, struct thread *new) { struct pcb *newpcb; newpcb = new->td_pcb; (void)pmap_switch(newpcb->pcb_current_pmap); PCPU_SET(curthread, new); PCPU_SET(curtid, new->td_tid); #ifdef COMPAT_IA32 ia32_restorectx(newpcb); #endif restorectx(newpcb); /* We should not get here. */ panic("cpu_throw: restorectx() returned"); /* NOTREACHED */ } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { pcpu->pc_acpi_id = cpuid; } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_intr = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_intr); } void map_pal_code(void) { pt_entry_t pte; uint64_t psr; if (ia64_pal_base == 0) return; pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY | PTE_PL_KERN | PTE_AR_RWX; pte |= ia64_pal_base & PTE_PPN_MASK; __asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(IA64_PHYS_TO_RR7(ia64_pal_base)), "r"(IA64_ID_PAGE_SHIFT<<2)); __asm __volatile("mov %0=psr" : "=r"(psr)); __asm __volatile("rsm psr.ic|psr.i"); __asm __volatile("srlz.i"); __asm __volatile("mov cr.ifa=%0" :: "r"(IA64_PHYS_TO_RR7(ia64_pal_base))); __asm __volatile("mov cr.itir=%0" :: "r"(IA64_ID_PAGE_SHIFT << 2)); __asm __volatile("itr.d dtr[%0]=%1" :: "r"(1), "r"(pte)); __asm __volatile("srlz.d"); /* XXX not needed. */ __asm __volatile("itr.i itr[%0]=%1" :: "r"(1), "r"(pte)); __asm __volatile("mov psr.l=%0" :: "r" (psr)); __asm __volatile("srlz.i"); } void map_gateway_page(void) { pt_entry_t pte; uint64_t psr; pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY | PTE_PL_KERN | PTE_AR_X_RX; pte |= (uint64_t)ia64_gateway_page & PTE_PPN_MASK; __asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(VM_MAX_ADDRESS), "r"(PAGE_SHIFT << 2)); __asm __volatile("mov %0=psr" : "=r"(psr)); __asm __volatile("rsm psr.ic|psr.i"); __asm __volatile("srlz.i"); __asm __volatile("mov cr.ifa=%0" :: "r"(VM_MAX_ADDRESS)); __asm __volatile("mov cr.itir=%0" :: "r"(PAGE_SHIFT << 2)); __asm __volatile("itr.d dtr[%0]=%1" :: "r"(3), "r"(pte)); __asm __volatile("srlz.d"); /* XXX not needed. 
*/ __asm __volatile("itr.i itr[%0]=%1" :: "r"(3), "r"(pte)); __asm __volatile("mov psr.l=%0" :: "r" (psr)); __asm __volatile("srlz.i"); /* Expose the mapping to userland in ar.k5 */ ia64_set_k5(VM_MAX_ADDRESS); } static void calculate_frequencies(void) { struct ia64_sal_result sal; struct ia64_pal_result pal; sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0); pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0); if (sal.sal_status == 0 && pal.pal_status == 0) { if (bootverbose) { printf("Platform clock frequency %ld Hz\n", sal.sal_result[0]); printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, " "ITC ratio %ld/%ld\n", pal.pal_result[0] >> 32, pal.pal_result[0] & ((1L << 32) - 1), pal.pal_result[1] >> 32, pal.pal_result[1] & ((1L << 32) - 1), pal.pal_result[2] >> 32, pal.pal_result[2] & ((1L << 32) - 1)); } processor_frequency = sal.sal_result[0] * (pal.pal_result[0] >> 32) / (pal.pal_result[0] & ((1L << 32) - 1)); bus_frequency = sal.sal_result[0] * (pal.pal_result[1] >> 32) / (pal.pal_result[1] & ((1L << 32) - 1)); itc_frequency = sal.sal_result[0] * (pal.pal_result[2] >> 32) / (pal.pal_result[2] & ((1L << 32) - 1)); } } void ia64_init(void) { int phys_avail_cnt; vm_offset_t kernstart, kernend; vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1; char *p; struct efi_md *md; int metadata_missing; /* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */ /* * TODO: Disable interrupts, floating point etc. * Maybe flush cache and tlb */ ia64_set_fpsr(IA64_FPSR_DEFAULT); /* * TODO: Get critical system information (if possible, from the * information provided by the boot program). */ /* * pa_bootinfo is the physical address of the bootinfo block as * passed to us by the loader and set in locore.s. */ bootinfo = *(struct bootinfo *)(IA64_PHYS_TO_RR7(pa_bootinfo)); if (bootinfo.bi_magic != BOOTINFO_MAGIC || bootinfo.bi_version != 1) { bzero(&bootinfo, sizeof(bootinfo)); bootinfo.bi_kernend = (vm_offset_t) round_page(_end); } /* * Look for the I/O ports first - we need them for console * probing. */ for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) { switch (md->md_type) { case EFI_MD_TYPE_IOPORT: ia64_port_base = IA64_PHYS_TO_RR6(md->md_phys); break; case EFI_MD_TYPE_PALCODE: ia64_pal_base = md->md_phys; break; } } metadata_missing = 0; if (bootinfo.bi_modulep) preload_metadata = (caddr_t)bootinfo.bi_modulep; else metadata_missing = 1; if (envmode == 0 && bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp; else kern_envp = static_env; /* * Look at arguments passed to us and compute boothowto. */ boothowto = bootinfo.bi_boothowto; /* * Catch case of boot_verbose set in environment. */ if ((p = getenv("boot_verbose")) != NULL) { if (strcmp(p, "yes") == 0 || strcmp(p, "YES") == 0) { boothowto |= RB_VERBOSE; } freeenv(p); } if (boothowto & RB_VERBOSE) bootverbose = 1; /* * Setup the PCPU data for the bootstrap processor. It is needed * by printf(). Also, since printf() has critical sections, we * need to initialize at least pc_curthread. */ pcpup = &pcpu0; ia64_set_k4((u_int64_t)pcpup); pcpu_init(pcpup, 0, sizeof(pcpu0)); PCPU_SET(curthread, &thread0); PCPU_SET(curtid, thread0.td_tid); /* * Initialize the console before we print anything out. */ cninit(); /* OUTPUT NOW ALLOWED */ if (ia64_pal_base != 0) { ia64_pal_base &= ~IA64_ID_PAGE_MASK; /* * We use a TR to map the first 256M of memory - this might * cover the palcode too. */ if (ia64_pal_base == 0) printf("PAL code mapped by the kernel's TR\n"); } else printf("PAL code not found\n"); /* * Wire things up so we can call the firmware. 
*/ map_pal_code(); efi_boot_minimal(bootinfo.bi_systab); ia64_sal_init(); calculate_frequencies(); /* * Find the beginning and end of the kernel. */ kernstart = trunc_page(kernel_text); #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; kernend = (vm_offset_t)round_page(ksym_end); #else kernend = (vm_offset_t)round_page(_end); #endif /* But if the bootstrap tells us otherwise, believe it! */ if (bootinfo.bi_kernend) kernend = round_page(bootinfo.bi_kernend); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); /* Get FPSWA interface */ fpswa_iface = (bootinfo.bi_fpswa == 0) ? NULL : (struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo.bi_fpswa); /* Init basic tunables, including hz */ init_param1(); p = getenv("kernelname"); if (p) { strncpy(kernelname, p, sizeof(kernelname) - 1); freeenv(p); } kernstartpfn = atop(IA64_RR_MASK(kernstart)); kernendpfn = atop(IA64_RR_MASK(kernend)); /* * Size the memory regions and load phys_avail[] with the results. */ /* * Find out how much memory is available, by looking at * the memory descriptors. */ #ifdef DEBUG_MD printf("Memory descriptor count: %d\n", mdcount); #endif phys_avail_cnt = 0; for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) { #ifdef DEBUG_MD printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md, md->md_type, md->md_phys, md->md_pages); #endif pfn0 = ia64_btop(round_page(md->md_phys)); pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096)); if (pfn1 <= pfn0) continue; if (md->md_type != EFI_MD_TYPE_FREE) continue; /* * We have a memory descriptor that describes conventional * memory that is for general use. We must determine if the * loader has put the kernel in this region. */ physmem += (pfn1 - pfn0); if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) { /* * Must compute the location of the kernel * within the segment. */ #ifdef DEBUG_MD printf("Descriptor %p contains kernel\n", mp); #endif if (pfn0 < kernstartpfn) { /* * There is a chunk before the kernel. */ #ifdef DEBUG_MD printf("Loading chunk before kernel: " "0x%lx / 0x%lx\n", pfn0, kernstartpfn); #endif phys_avail[phys_avail_cnt] = ia64_ptob(pfn0); phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn); phys_avail_cnt += 2; } if (kernendpfn < pfn1) { /* * There is a chunk after the kernel. */ #ifdef DEBUG_MD printf("Loading chunk after kernel: " "0x%lx / 0x%lx\n", kernendpfn, pfn1); #endif phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn); phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1); phys_avail_cnt += 2; } } else { /* * Just load this cluster as one chunk. */ #ifdef DEBUG_MD printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i, pfn0, pfn1); #endif phys_avail[phys_avail_cnt] = ia64_ptob(pfn0); phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1); phys_avail_cnt += 2; } } phys_avail[phys_avail_cnt] = 0; Maxmem = physmem; init_param2(physmem); /* * Initialize error message buffer (at end of core). */ msgbufp = (struct msgbuf *)pmap_steal_memory(MSGBUF_SIZE); msgbufinit(msgbufp, MSGBUF_SIZE); proc_linkup(&proc0, &thread0); /* * Init mapping for kernel stack for proc 0 */ proc0kstack = (vm_offset_t)kstack; thread0.td_kstack = proc0kstack; thread0.td_kstack_pages = KSTACK_PAGES; mutex_init(); /* * Initialize the rest of proc 0's PCB. * * Set the kernel sp, reserving space for an (empty) trapframe, * and make proc0's trapframe pointer point to it for sanity. * Initialise proc0's backing store to start after u area. 
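 *
 * Concretely, the code below sets the memory stack pointer 16 bytes
 * below proc0's trapframe (presumably to preserve the architected
 * 16-byte scratch area below sp) and starts the register backing
 * store at the base of the kernel stack, so the RSE spills grow up
 * toward the memory stack growing down.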
*/ cpu_thread_setup(&thread0); thread0.td_frame->tf_flags = FRAME_SYSCALL; thread0.td_pcb->pcb_special.sp = (u_int64_t)thread0.td_frame - 16; thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack; /* * Initialize the virtual memory system. */ pmap_bootstrap(); /* * Initialize debuggers, and break into them if appropriate. */ kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger\n"); #endif ia64_set_tpr(0); /* * Save our current context so that we have a known (maybe even * sane) context as the initial context for new threads that are * forked from us. If any of those threads (including thread0) * does something wrong, we may be lucky and return here where * we're ready for them with a nice panic. */ if (!savectx(thread0.td_pcb)) mi_startup(); /* We should not get here. */ panic("ia64_init: Whooaa there!"); /* NOTREACHED */ } uint64_t ia64_get_hcdp(void) { return (bootinfo.bi_hcdp); } void bzero(void *buf, size_t len) { caddr_t p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void DELAY(int n) { u_int64_t start, end, now; start = ia64_get_itc(); end = start + (itc_frequency * n) / 1000000; /* printf("DELAY from 0x%lx to 0x%lx\n", start, end); */ do { now = ia64_get_itc(); } while (now < end || (now > start && end < start)); } /* * Send an interrupt (signal) to a process. */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct proc *p; struct thread *td; struct trapframe *tf; struct sigacts *psp; struct sigframe sf, *sfp; u_int64_t sbs, sp; int oonstack; int sig; u_long code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_special.sp; oonstack = sigonstack(sp); sbs = 0; /* save user context */ bzero(&sf, sizeof(struct sigframe)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sbs = (u_int64_t)td->td_sigstk.ss_sp; sbs = (sbs + 15) & ~15; sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)sp; sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15); /* Fill in the siginfo structure for POSIX handlers. */ if (SIGISMEMBER(psp->ps_siginfo, sig)) { sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* * XXX this shouldn't be here after code in trap.c * is fixed */ sf.sf_si.si_addr = (void*)tf->tf_special.ifa; code = (u_int64_t)&sfp->sf_si; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); /* Copy the frame out to userland. 
*/ if (copyout(&sf, sfp, sizeof(sf)) != 0) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); sigexit(td, SIGILL); return; } if ((tf->tf_flags & FRAME_SYSCALL) == 0) { tf->tf_special.psr &= ~IA64_PSR_RI; tf->tf_special.iip = ia64_get_k5() + ((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page); } else tf->tf_special.iip = ia64_get_k5() + ((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page); /* * Setup the trapframe to return to the signal trampoline. We pass * information to the trampoline in the following registers: * * gp new backing store or NULL * r8 signal number * r9 signal code or siginfo pointer * r10 signal handler (function descriptor) */ tf->tf_special.sp = (u_int64_t)sfp - 16; tf->tf_special.gp = sbs; tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore; tf->tf_special.ndirty = 0; tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat; tf->tf_scratch.gr8 = sig; tf->tf_scratch.gr9 = code; tf->tf_scratch.gr10 = (u_int64_t)catcher; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. * * MPSAFE */ int sigreturn(struct thread *td, struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap) { ucontext_t uc; struct trapframe *tf; struct proc *p; struct pcb *pcb; tf = td->td_frame; p = td->td_proc; pcb = td->td_pcb; /* * Fetch the entire context structure at once for speed. * We don't use a normal argument to simplify RSE handling. */ if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc))) return (EFAULT); set_mcontext(td, &uc.uc_mcontext); PROC_LOCK(p); #if defined(COMPAT_43) if (sigonstack(tf->tf_special.sp)) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_special = tf->tf_special; pcb->pcb_special.__spare = ~0UL; /* XXX see unwind.c */ save_callee_saved(&pcb->pcb_preserved); save_callee_saved_fp(&pcb->pcb_preserved_fp); } int ia64_flush_dirty(struct thread *td, struct _special *r) { struct iovec iov; struct uio uio; uint64_t bspst, kstk, rnat; int error; if (r->ndirty == 0) return (0); kstk = td->td_kstack + (r->bspstore & 0x1ffUL); if (td == curthread) { __asm __volatile("mov ar.rsc=0;;"); __asm __volatile("mov %0=ar.bspstore" : "=r"(bspst)); /* Make sure we have all the user registers written out. 
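 *
 * In other words: with the RSE stopped (ar.rsc = 0), if ar.bspstore
 * has not yet advanced past this thread's ndirty bytes of kernel
 * backing store, a flushrs is issued to force the remaining dirty
 * registers to memory before they are copied out to the user backing
 * store, roughly:
 *
 *      if (bspst - kstk < r->ndirty)
 *              flushrs;        (then re-read ar.bspstore and ar.rnat)
 *
 * (Paraphrase of the code below, for orientation only.)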
*/ if (bspst - kstk < r->ndirty) { __asm __volatile("flushrs;;"); __asm __volatile("mov %0=ar.bspstore" : "=r"(bspst)); } __asm __volatile("mov %0=ar.rnat;;" : "=r"(rnat)); __asm __volatile("mov ar.rsc=3"); error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty); kstk += r->ndirty; r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL)) ? *(uint64_t*)(kstk | 0x1f8L) : rnat; } else { PHOLD(td->td_proc); iov.iov_base = (void*)(uintptr_t)kstk; iov.iov_len = r->ndirty; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = r->bspstore; uio.uio_resid = r->ndirty; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; error = proc_rwmem(td->td_proc, &uio); /* * XXX proc_rwmem() doesn't currently return ENOSPC, * so I think it can bogusly return 0. Neither do * we allow short writes. */ if (uio.uio_resid != 0 && error == 0) error = ENOSPC; PRELE(td->td_proc); } r->bspstore += r->ndirty; r->ndirty = 0; return (error); } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; int error; tf = td->td_frame; bzero(mc, sizeof(*mc)); mc->mc_special = tf->tf_special; error = ia64_flush_dirty(td, &mc->mc_special); if (tf->tf_flags & FRAME_SYSCALL) { mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT; mc->mc_scratch = tf->tf_scratch; if (flags & GET_MC_CLEAR_RET) { mc->mc_scratch.gr8 = 0; mc->mc_scratch.gr9 = 0; mc->mc_scratch.gr10 = 0; mc->mc_scratch.gr11 = 0; } } else { mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT; mc->mc_scratch = tf->tf_scratch; mc->mc_scratch_fp = tf->tf_scratch_fp; /* * XXX If the thread never used the high FP registers, we * probably shouldn't waste time saving them. */ ia64_highfp_save(td); mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID; mc->mc_high_fp = td->td_pcb->pcb_high_fp; } save_callee_saved(&mc->mc_preserved); save_callee_saved_fp(&mc->mc_preserved_fp); return (error); } int set_mcontext(struct thread *td, const mcontext_t *mc) { struct _special s; struct trapframe *tf; uint64_t psrmask; tf = td->td_frame; KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0, ("Whoa there! We have more than 8KB of dirty registers!")); s = mc->mc_special; /* * Only copy the user mask and the restart instruction bit from * the new context. */ psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_RI; s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask); /* We don't have any dirty registers of the new context. */ s.ndirty = 0; if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) { /* * We can get an async context passed to us while we * entered the kernel through a syscall: sigreturn(2) * and kse_switchin(2) both take contexts that could * previously be the result of a trap or interrupt. * Hence, we cannot assert that the trapframe is not * a syscall frame, but we can assert that it's at * least an expected syscall. */ if (tf->tf_flags & FRAME_SYSCALL) { KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn || tf->tf_scratch.gr15 == SYS_kse_switchin, ("foo")); tf->tf_flags &= ~FRAME_SYSCALL; } tf->tf_scratch = mc->mc_scratch; tf->tf_scratch_fp = mc->mc_scratch_fp; if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID) td->td_pcb->pcb_high_fp = mc->mc_high_fp; } else { KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo")); if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) { s.cfm = tf->tf_special.cfm; s.iip = tf->tf_special.iip; tf->tf_scratch.gr15 = 0; /* Clear syscall nr. 
*/ } else tf->tf_scratch = mc->mc_scratch; } tf->tf_special = s; restore_callee_saved(&mc->mc_preserved); restore_callee_saved_fp(&mc->mc_preserved_fp); if (mc->mc_flags & _MC_FLAGS_KSE_SET_MBOX) suword((caddr_t)mc->mc_special.ifa, mc->mc_special.isr); return (0); } /* * Clear registers on exec. */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; uint64_t *ksttop, *kst; tf = td->td_frame; ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty + (tf->tf_special.bspstore & 0x1ffUL)); /* * We can ignore up to 8KB of dirty registers by masking off the * lower 13 bits in exception_restore() or epc_syscall(). This * should be enough for a couple of years, but if there are more * than 8KB of dirty registers, we lose track of the bottom of * the kernel stack. The solution is to copy the active part of * the kernel stack down 1 page (or 2, but not more than that) * so that we always have less than 8KB of dirty registers. */ KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0, ("Whoa there! We have more than 8KB of dirty registers!")); bzero(&tf->tf_special, sizeof(tf->tf_special)); if ((tf->tf_flags & FRAME_SYSCALL) == 0) { /* break syscalls. */ bzero(&tf->tf_scratch, sizeof(tf->tf_scratch)); bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp)); tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL; tf->tf_special.bspstore = IA64_BACKINGSTORE; /* * Copy the arguments onto the kernel register stack so that * they get loaded by the loadrs instruction. Skip over the * NaT collection points. */ kst = ksttop - 1; if (((uintptr_t)kst & 0x1ff) == 0x1f8) *kst-- = 0; *kst-- = 0; if (((uintptr_t)kst & 0x1ff) == 0x1f8) *kst-- = 0; *kst-- = ps_strings; if (((uintptr_t)kst & 0x1ff) == 0x1f8) *kst-- = 0; *kst = stack; tf->tf_special.ndirty = (ksttop - kst) << 3; } else { /* epc syscalls (default). */ tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL; tf->tf_special.bspstore = IA64_BACKINGSTORE + 24; /* * Write values for out0, out1 and out2 to the user's backing * store and arrange for them to be restored into the user's * initial register frame. * Assumes that (bspstore & 0x1f8) < 0x1e0. */ suword((caddr_t)tf->tf_special.bspstore - 24, stack); suword((caddr_t)tf->tf_special.bspstore - 16, ps_strings); suword((caddr_t)tf->tf_special.bspstore - 8, 0); } tf->tf_special.iip = entry; tf->tf_special.sp = (stack & ~15) - 16; tf->tf_special.rsc = 0xf; tf->tf_special.fpsr = IA64_FPSR_DEFAULT; tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT | IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN | IA64_PSR_CPL_USER; } int ptrace_set_pc(struct thread *td, unsigned long addr) { uint64_t slot; switch (addr & 0xFUL) { case 0: slot = IA64_PSR_RI_0; break; case 1: /* XXX we need to deal with MLX bundles here */ slot = IA64_PSR_RI_1; break; case 2: slot = IA64_PSR_RI_2; break; default: return (EINVAL); } td->td_frame->tf_special.iip = addr & ~0x0FULL; td->td_frame->tf_special.psr = (td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot; return (0); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; /* * There's no way to set single stepping when we're leaving the * kernel through the EPC syscall path. The way we solve this is * by enabling the lower-privilege trap so that we re-enter the * kernel as soon as the privilege level changes. See trap.c for * how we proceed from there. 
*/ tf = td->td_frame; if (tf->tf_flags & FRAME_SYSCALL) tf->tf_special.psr |= IA64_PSR_LP; else tf->tf_special.psr |= IA64_PSR_SS; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; /* * Clear any and all status bits we may use to implement single * stepping. */ tf = td->td_frame; tf->tf_special.psr &= ~IA64_PSR_SS; tf->tf_special.psr &= ~IA64_PSR_LP; tf->tf_special.psr &= ~IA64_PSR_TB; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; regs->r_special = tf->tf_special; regs->r_scratch = tf->tf_scratch; save_callee_saved(®s->r_preserved); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; int error; tf = td->td_frame; error = ia64_flush_dirty(td, &tf->tf_special); if (!error) { tf->tf_special = regs->r_special; tf->tf_special.bspstore += tf->tf_special.ndirty; tf->tf_special.ndirty = 0; tf->tf_scratch = regs->r_scratch; restore_callee_saved(®s->r_preserved); } return (error); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *frame = td->td_frame; struct pcb *pcb = td->td_pcb; /* Save the high FP registers. */ ia64_highfp_save(td); fpregs->fpr_scratch = frame->tf_scratch_fp; save_callee_saved_fp(&fpregs->fpr_preserved); fpregs->fpr_high = pcb->pcb_high_fp; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *frame = td->td_frame; struct pcb *pcb = td->td_pcb; /* Throw away the high FP registers (should be redundant). */ ia64_highfp_drop(td); frame->tf_scratch_fp = fpregs->fpr_scratch; restore_callee_saved_fp(&fpregs->fpr_preserved); pcb->pcb_high_fp = fpregs->fpr_high; return (0); } /* * High FP register functions. */ int ia64_highfp_drop(struct thread *td) { struct pcb *pcb; struct pcpu *cpu; struct thread *thr; mtx_lock_spin(&td->td_md.md_highfp_mtx); pcb = td->td_pcb; cpu = pcb->pcb_fpcpu; if (cpu == NULL) { mtx_unlock_spin(&td->td_md.md_highfp_mtx); return (0); } pcb->pcb_fpcpu = NULL; thr = cpu->pc_fpcurthread; cpu->pc_fpcurthread = NULL; mtx_unlock_spin(&td->td_md.md_highfp_mtx); /* Post-mortem sanity checking. */ KASSERT(thr == td, ("Inconsistent high FP state")); return (1); } int ia64_highfp_save(struct thread *td) { struct pcb *pcb; struct pcpu *cpu; struct thread *thr; /* Don't save if the high FP registers weren't modified. */ if ((td->td_frame->tf_special.psr & IA64_PSR_MFH) == 0) return (ia64_highfp_drop(td)); mtx_lock_spin(&td->td_md.md_highfp_mtx); pcb = td->td_pcb; cpu = pcb->pcb_fpcpu; if (cpu == NULL) { mtx_unlock_spin(&td->td_md.md_highfp_mtx); return (0); } #ifdef SMP if (td == curthread) sched_pin(); if (cpu != pcpup) { mtx_unlock_spin(&td->td_md.md_highfp_mtx); ipi_send(cpu, IPI_HIGH_FP); if (td == curthread) sched_unpin(); while (pcb->pcb_fpcpu == cpu) DELAY(100); return (1); } else { save_high_fp(&pcb->pcb_high_fp); if (td == curthread) sched_unpin(); } #else save_high_fp(&pcb->pcb_high_fp); #endif pcb->pcb_fpcpu = NULL; thr = cpu->pc_fpcurthread; cpu->pc_fpcurthread = NULL; mtx_unlock_spin(&td->td_md.md_highfp_mtx); /* Post-mortem sanity cxhecking. 
*/ KASSERT(thr == td, ("Inconsistent high FP state")); return (1); } int sysbeep(int pitch, int period) { return (ENODEV); } Index: head/sys/ia64/ia64/pmap.c =================================================================== --- head/sys/ia64/ia64/pmap.c (revision 169666) +++ head/sys/ia64/ia64/pmap.c (revision 169667) @@ -1,2399 +1,2399 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 1998,2000 Doug Rabson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * from: i386 Id: pmap.c,v 1.193 1998/04/19 15:22:48 bde Exp * with some ideas from NetBSD's alpha pmap */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. 
* * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ /* * Following the Linux model, region IDs are allocated in groups of * eight so that a single region ID can be used for as many RRs as we * want by encoding the RR number into the low bits of the ID. * * We reserve region ID 0 for the kernel and allocate the remaining * IDs for user pmaps. * * Region 0..4 * User virtually mapped * * Region 5 * Kernel virtually mapped * * Region 6 * Kernel physically mapped uncacheable * * Region 7 * Kernel physically mapped cacheable */ /* XXX move to a header. */ extern uint64_t ia64_gateway_page[]; MALLOC_DEFINE(M_PMAP, "PMAP", "PMAP Structures"); #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif #if !defined(DIAGNOSTIC) #define PMAP_INLINE __inline #else #define PMAP_INLINE #endif #define pmap_accessed(lpte) ((lpte)->pte & PTE_ACCESSED) #define pmap_dirty(lpte) ((lpte)->pte & PTE_DIRTY) #define pmap_managed(lpte) ((lpte)->pte & PTE_MANAGED) #define pmap_ppn(lpte) ((lpte)->pte & PTE_PPN_MASK) #define pmap_present(lpte) ((lpte)->pte & PTE_PRESENT) #define pmap_prot(lpte) (((lpte)->pte & PTE_PROT_MASK) >> 56) #define pmap_wired(lpte) ((lpte)->pte & PTE_WIRED) #define pmap_clear_accessed(lpte) (lpte)->pte &= ~PTE_ACCESSED #define pmap_clear_dirty(lpte) (lpte)->pte &= ~PTE_DIRTY #define pmap_clear_present(lpte) (lpte)->pte &= ~PTE_PRESENT #define pmap_clear_wired(lpte) (lpte)->pte &= ~PTE_WIRED #define pmap_set_wired(lpte) (lpte)->pte |= PTE_WIRED /* * The VHPT bucket head structure. */ struct ia64_bucket { uint64_t chain; struct mtx mutex; u_int length; }; /* * Statically allocated kernel pmap */ struct pmap kernel_pmap_store; vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ /* * Kernel virtual memory management. */ static int nkpt; struct ia64_lpte **ia64_kptdir; #define KPTE_DIR_INDEX(va) \ ((va >> (2*PAGE_SHIFT-5)) & ((1<<(PAGE_SHIFT-3))-1)) #define KPTE_PTE_INDEX(va) \ ((va >> PAGE_SHIFT) & ((1<<(PAGE_SHIFT-5))-1)) #define NKPTEPG (PAGE_SIZE / sizeof(struct ia64_lpte)) vm_offset_t kernel_vm_end; /* Values for ptc.e. XXX values for SKI. */ static uint64_t pmap_ptc_e_base = 0x100000000; static uint64_t pmap_ptc_e_count1 = 3; static uint64_t pmap_ptc_e_count2 = 2; static uint64_t pmap_ptc_e_stride1 = 0x2000; static uint64_t pmap_ptc_e_stride2 = 0x100000000; struct mtx pmap_ptcmutex; /* * Data for the RID allocator */ static int pmap_ridcount; static int pmap_rididx; static int pmap_ridmapsz; static int pmap_ridmax; static uint64_t *pmap_ridmap; struct mtx pmap_ridmutex; /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; /* * Data for allocating PTEs for user processes. */ static uma_zone_t ptezone; /* * Virtual Hash Page Table (VHPT) data. 
*/ /* SYSCTL_DECL(_machdep); */ SYSCTL_NODE(_machdep, OID_AUTO, vhpt, CTLFLAG_RD, 0, ""); struct ia64_bucket *pmap_vhpt_bucket; int pmap_vhpt_nbuckets; SYSCTL_INT(_machdep_vhpt, OID_AUTO, nbuckets, CTLFLAG_RD, &pmap_vhpt_nbuckets, 0, ""); uint64_t pmap_vhpt_base[MAXCPU]; int pmap_vhpt_log2size = 0; TUNABLE_INT("machdep.vhpt.log2size", &pmap_vhpt_log2size); SYSCTL_INT(_machdep_vhpt, OID_AUTO, log2size, CTLFLAG_RD, &pmap_vhpt_log2size, 0, ""); static int pmap_vhpt_inserts; SYSCTL_INT(_machdep_vhpt, OID_AUTO, inserts, CTLFLAG_RD, &pmap_vhpt_inserts, 0, ""); static int pmap_vhpt_population(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_machdep_vhpt, OID_AUTO, population, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, pmap_vhpt_population, "I", ""); static struct ia64_lpte *pmap_find_vhpt(vm_offset_t va); static PMAP_INLINE void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap); static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static pmap_t pmap_install(pmap_t); static void pmap_invalidate_all(pmap_t pmap); static int pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte, vm_offset_t va, pv_entry_t pv, int freepte); static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); vm_offset_t pmap_steal_memory(vm_size_t size) { vm_size_t bank_size; vm_offset_t pa, va; size = round_page(size); bank_size = phys_avail[1] - phys_avail[0]; while (size > bank_size) { int i; for (i = 0; phys_avail[i+2]; i+= 2) { phys_avail[i] = phys_avail[i+2]; phys_avail[i+1] = phys_avail[i+3]; } phys_avail[i] = 0; phys_avail[i+1] = 0; if (!phys_avail[0]) panic("pmap_steal_memory: out of memory"); bank_size = phys_avail[1] - phys_avail[0]; } pa = phys_avail[0]; phys_avail[0] += size; va = IA64_PHYS_TO_RR7(pa); bzero((caddr_t) va, size); return va; } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap() { struct ia64_pal_result res; struct ia64_lpte *pte; vm_offset_t base, limit; size_t size; int i, j, count, ridbits; /* * Query the PAL Code to find the loop parameters for the * ptc.e instruction. */ res = ia64_call_pal_static(PAL_PTCE_INFO, 0, 0, 0); if (res.pal_status != 0) panic("Can't configure ptc.e parameters"); pmap_ptc_e_base = res.pal_result[0]; pmap_ptc_e_count1 = res.pal_result[1] >> 32; pmap_ptc_e_count2 = res.pal_result[1] & ((1L<<32) - 1); pmap_ptc_e_stride1 = res.pal_result[2] >> 32; pmap_ptc_e_stride2 = res.pal_result[2] & ((1L<<32) - 1); if (bootverbose) printf("ptc.e base=0x%lx, count1=%ld, count2=%ld, " "stride1=0x%lx, stride2=0x%lx\n", pmap_ptc_e_base, pmap_ptc_e_count1, pmap_ptc_e_count2, pmap_ptc_e_stride1, pmap_ptc_e_stride2); mtx_init(&pmap_ptcmutex, "Global PTC lock", NULL, MTX_SPIN); /* * Setup RIDs. RIDs 0..7 are reserved for the kernel. * * We currently need at least 19 bits in the RID because PID_MAX * can only be encoded in 17 bits and we need RIDs for 5 regions * per process. With PID_MAX equalling 99999 this means that we * need to be able to encode 499995 (=5*PID_MAX). * The Itanium processor only has 18 bits and the architected * minimum is exactly that. So, we cannot use a PID based scheme * in those cases. Enter pmap_ridmap... * We should avoid the map when running on a processor that has * implemented enough bits. This means that we should pass the * process/thread ID to pmap. This we currently don't do, so we * use the map anyway. 
However, we don't want to allocate a map * that is large enough to cover the range dictated by the number * of bits in the RID, because that may result in a RID map of * 2MB in size for a 24-bit RID. A 64KB map is enough. * The bottomline: we create a 32KB map when the processor only * implements 18 bits (or when we can't figure it out). Otherwise * we create a 64KB map. */ res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0); if (res.pal_status != 0) { if (bootverbose) printf("Can't read VM Summary - assuming 18 Region ID bits\n"); ridbits = 18; /* guaranteed minimum */ } else { ridbits = (res.pal_result[1] >> 8) & 0xff; if (bootverbose) printf("Processor supports %d Region ID bits\n", ridbits); } if (ridbits > 19) ridbits = 19; pmap_ridmax = (1 << ridbits); pmap_ridmapsz = pmap_ridmax / 64; pmap_ridmap = (uint64_t *)pmap_steal_memory(pmap_ridmax / 8); pmap_ridmap[0] |= 0xff; pmap_rididx = 0; pmap_ridcount = 8; mtx_init(&pmap_ridmutex, "RID allocator lock", NULL, MTX_DEF); /* * Allocate some memory for initial kernel 'page tables'. */ ia64_kptdir = (void *)pmap_steal_memory(PAGE_SIZE); for (i = 0; i < NKPT; i++) { ia64_kptdir[i] = (void*)pmap_steal_memory(PAGE_SIZE); } nkpt = NKPT; kernel_vm_end = NKPT * PAGE_SIZE * NKPTEPG + VM_MIN_KERNEL_ADDRESS - VM_GATEWAY_SIZE; for (i = 0; phys_avail[i+2]; i+= 2) ; count = i+2; /* * Figure out a useful size for the VHPT, based on the size of * physical memory and try to locate a region which is large * enough to contain the VHPT (which must be a power of two in * size and aligned to a natural boundary). * We silently bump up the VHPT size to the minimum size if the * user has set the tunable too small. Likewise, the VHPT size * is silently capped to the maximum allowed. */ TUNABLE_INT_FETCH("machdep.vhpt.log2size", &pmap_vhpt_log2size); if (pmap_vhpt_log2size == 0) { pmap_vhpt_log2size = 15; size = 1UL << pmap_vhpt_log2size; while (size < Maxmem * 32) { pmap_vhpt_log2size++; size <<= 1; } } else if (pmap_vhpt_log2size < 15) pmap_vhpt_log2size = 15; if (pmap_vhpt_log2size > 61) pmap_vhpt_log2size = 61; pmap_vhpt_base[0] = 0; base = limit = 0; size = 1UL << pmap_vhpt_log2size; while (pmap_vhpt_base[0] == 0) { if (bootverbose) printf("Trying VHPT size 0x%lx\n", size); for (i = 0; i < count; i += 2) { base = (phys_avail[i] + size - 1) & ~(size - 1); limit = base + MAXCPU * size; if (limit <= phys_avail[i+1]) /* * VHPT can fit in this region */ break; } if (!phys_avail[i]) { /* Can't fit, try next smaller size. */ pmap_vhpt_log2size--; size >>= 1; } else pmap_vhpt_base[0] = IA64_PHYS_TO_RR7(base); } if (pmap_vhpt_log2size < 15) panic("Can't find space for VHPT"); if (bootverbose) printf("Putting VHPT at 0x%lx\n", base); if (base != phys_avail[i]) { /* Split this region. */ if (bootverbose) printf("Splitting [%p-%p]\n", (void *)phys_avail[i], (void *)phys_avail[i+1]); for (j = count; j > i; j -= 2) { phys_avail[j] = phys_avail[j-2]; phys_avail[j+1] = phys_avail[j-2+1]; } phys_avail[i+1] = base; phys_avail[i+2] = limit; } else phys_avail[i] = limit; pmap_vhpt_nbuckets = size / sizeof(struct ia64_lpte); pmap_vhpt_bucket = (void *)pmap_steal_memory(pmap_vhpt_nbuckets * sizeof(struct ia64_bucket)); pte = (struct ia64_lpte *)pmap_vhpt_base[0]; for (i = 0; i < pmap_vhpt_nbuckets; i++) { pte[i].pte = 0; pte[i].itir = 0; pte[i].tag = 1UL << 63; /* Invalid tag */ pte[i].chain = (uintptr_t)(pmap_vhpt_bucket + i); /* Stolen memory is zeroed! 
*/ mtx_init(&pmap_vhpt_bucket[i].mutex, "VHPT bucket lock", NULL, MTX_SPIN); } for (i = 1; i < MAXCPU; i++) { pmap_vhpt_base[i] = pmap_vhpt_base[i - 1] + size; bcopy((void *)pmap_vhpt_base[i - 1], (void *)pmap_vhpt_base[i], size); } __asm __volatile("mov cr.pta=%0;; srlz.i;;" :: "r" (pmap_vhpt_base[0] + (1<<8) + (pmap_vhpt_log2size<<2) + 1)); virtual_avail = VM_MIN_KERNEL_ADDRESS; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize the kernel pmap (which is statically allocated). */ PMAP_LOCK_INIT(kernel_pmap); for (i = 0; i < 5; i++) kernel_pmap->pm_rid[i] = 0; kernel_pmap->pm_active = 1; TAILQ_INIT(&kernel_pmap->pm_pvlist); PCPU_SET(current_pmap, kernel_pmap); /* * Region 5 is mapped via the vhpt. */ ia64_set_rr(IA64_RR_BASE(5), (5 << 8) | (PAGE_SHIFT << 2) | 1); /* * Region 6 is direct mapped UC and region 7 is direct mapped * WC. The details of this is controlled by the Alt {I,D}TLB * handlers. Here we just make sure that they have the largest * possible page size to minimise TLB usage. */ ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (IA64_ID_PAGE_SHIFT << 2)); ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (IA64_ID_PAGE_SHIFT << 2)); /* * Clear out any random TLB entries left over from booting. */ pmap_invalidate_all(kernel_pmap); map_gateway_page(); } static int pmap_vhpt_population(SYSCTL_HANDLER_ARGS) { int count, error, i; count = 0; for (i = 0; i < pmap_vhpt_nbuckets; i++) count += pmap_vhpt_bucket[i].length; error = SYSCTL_OUT(req, &count, sizeof(count)); return (error); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. */ void pmap_init(void) { int shpgperproc = PMAP_SHPGPERPROC; /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. 
*/ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_max = shpgperproc * maxproc + VMCNT_GET(page_count); TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); ptezone = uma_zcreate("PT ENTRY", sizeof (struct ia64_lpte), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); } /*************************************************** * Manipulate TLBs for a pmap ***************************************************/ #if 0 static __inline void pmap_invalidate_page_locally(void *arg) { vm_offset_t va = (uintptr_t)arg; struct ia64_lpte *pte; pte = (struct ia64_lpte *)ia64_thash(va); if (pte->tag == ia64_ttag(va)) pte->tag = 1UL << 63; ia64_ptc_l(va, PAGE_SHIFT << 2); } #ifdef SMP static void pmap_invalidate_page_1(void *arg) { void **args = arg; pmap_t oldpmap; critical_enter(); oldpmap = pmap_install(args[0]); pmap_invalidate_page_locally(args[1]); pmap_install(oldpmap); critical_exit(); } #endif static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); #ifdef SMP if (mp_ncpus > 1) { void *args[2]; args[0] = pmap; args[1] = (void *)va; smp_rendezvous(NULL, pmap_invalidate_page_1, NULL, args); } else #endif pmap_invalidate_page_locally((void *)va); } #endif /* 0 */ static void pmap_invalidate_page(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; int i, vhpt_ofs; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); vhpt_ofs = ia64_thash(va) - pmap_vhpt_base[PCPU_GET(cpuid)]; critical_enter(); for (i = 0; i < MAXCPU; i++) { pte = (struct ia64_lpte *)(pmap_vhpt_base[i] + vhpt_ofs); if (pte->tag == ia64_ttag(va)) pte->tag = 1UL << 63; } critical_exit(); mtx_lock_spin(&pmap_ptcmutex); ia64_ptc_ga(va, PAGE_SHIFT << 2); mtx_unlock_spin(&pmap_ptcmutex); } static void pmap_invalidate_all_1(void *arg) { uint64_t addr; int i, j; critical_enter(); addr = pmap_ptc_e_base; for (i = 0; i < pmap_ptc_e_count1; i++) { for (j = 0; j < pmap_ptc_e_count2; j++) { ia64_ptc_e(addr); addr += pmap_ptc_e_stride2; } addr += pmap_ptc_e_stride1; } critical_exit(); } static void pmap_invalidate_all(pmap_t pmap) { KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("invalidating TLB for non-current pmap")); #ifdef SMP if (mp_ncpus > 1) smp_rendezvous(NULL, pmap_invalidate_all_1, NULL, NULL); else #endif pmap_invalidate_all_1(NULL); } static uint32_t pmap_allocate_rid(void) { uint64_t bit, bits; int rid; mtx_lock(&pmap_ridmutex); if (pmap_ridcount == pmap_ridmax) panic("pmap_allocate_rid: All Region IDs used"); /* Find an index with a free bit. */ while ((bits = pmap_ridmap[pmap_rididx]) == ~0UL) { pmap_rididx++; if (pmap_rididx == pmap_ridmapsz) pmap_rididx = 0; } rid = pmap_rididx * 64; /* Find a free bit. */ bit = 1UL; while (bits & bit) { rid++; bit <<= 1; } pmap_ridmap[pmap_rididx] |= bit; pmap_ridcount++; mtx_unlock(&pmap_ridmutex); return rid; } static void pmap_free_rid(uint32_t rid) { uint64_t bit; int idx; idx = rid / 64; bit = ~(1UL << (rid & 63)); mtx_lock(&pmap_ridmutex); pmap_ridmap[idx] &= bit; pmap_ridcount--; mtx_unlock(&pmap_ridmutex); } /*************************************************** * Page table page management routines..... 
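/*
 * [Editor's note: illustrative sketch, not part of this change.]  The Region
 * ID allocator above (pmap_allocate_rid/pmap_free_rid) is a bitmap of 64-bit
 * words: find a word with a clear bit, then find the bit.  A user-space model
 * follows; NIDS is an arbitrary illustration value, and the kernel details
 * (reserving RIDs 0-7, the pmap_ridmutex serialization, panic on exhaustion)
 * are omitted.
 */
#include <stdint.h>
#include <stdio.h>

#define NIDS	1024			/* illustrative, not pmap_ridmax */
static uint64_t idmap[NIDS / 64];	/* one bit per ID, 1 = in use */
static int ididx;			/* word to start searching from */
static int idcount;			/* number of IDs handed out */

static int
alloc_id(void)
{
	uint64_t bit, bits;
	int id;

	if (idcount == NIDS)
		return (-1);			/* all IDs used */
	/* Find a word with a free bit, wrapping around. */
	while ((bits = idmap[ididx]) == ~(uint64_t)0)
		ididx = (ididx + 1) % (NIDS / 64);
	id = ididx * 64;
	/* Find the first clear bit in that word. */
	bit = 1;
	while (bits & bit) {
		id++;
		bit <<= 1;
	}
	idmap[ididx] |= bit;
	idcount++;
	return (id);
}

static void
free_id(int id)
{
	idmap[id / 64] &= ~((uint64_t)1 << (id & 63));
	idcount--;
}

int
main(void)
{
	int a = alloc_id(), b = alloc_id();

	printf("allocated %d and %d\n", a, b);	/* 0 and 1 */
	free_id(a);
	printf("reallocated %d\n", alloc_id());	/* 0 again */
	return (0);
}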
***************************************************/ void pmap_pinit0(struct pmap *pmap) { /* kernel_pmap is the same as any other pmap. */ pmap_pinit(pmap); } /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(struct pmap *pmap) { int i; PMAP_LOCK_INIT(pmap); for (i = 0; i < 5; i++) pmap->pm_rid[i] = pmap_allocate_rid(); pmap->pm_active = 0; TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /*************************************************** * Pmap allocation/deallocation routines. ***************************************************/ /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pmap) { int i; for (i = 0; i < 5; i++) if (pmap->pm_rid[i]) pmap_free_rid(pmap->pm_rid[i]); PMAP_LOCK_DESTROY(pmap); } /* * grow the number of kernel page table entries, if needed */ void pmap_growkernel(vm_offset_t addr) { struct ia64_lpte *ptepage; vm_page_t nkpg; while (kernel_vm_end < addr) { /* We could handle more by increasing the size of kptdir. */ if (nkpt == MAXKPT) panic("pmap_growkernel: out of kernel address space"); nkpg = vm_page_alloc(NULL, nkpt, VM_ALLOC_NOOBJ | VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); ptepage = (struct ia64_lpte *) IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(nkpg)); bzero(ptepage, PAGE_SIZE); ia64_kptdir[KPTE_DIR_INDEX(kernel_vm_end)] = ptepage; nkpt++; kernel_vm_end += PAGE_SIZE * NKPTEPG; } } /*************************************************** * page management routines. ***************************************************/ /* * free the pv_entry back to the free list */ static PMAP_INLINE void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. */ static pv_entry_t get_pv_entry(pmap_t locked_pmap) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; struct vpgqueues *vpq; struct ia64_lpte *pte; pmap_t oldpmap, pmap; pv_entry_t allocated_pv, next_pv, pv; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); allocated_pv = uma_zalloc(pvzone, M_NOWAIT); if (allocated_pv != NULL) { pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); else return (allocated_pv); } /* * Reclaim pv entries: At first, destroy mappings to inactive * pages. After that, if a pv entry is still needed, destroy * mappings to active pages. */ if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, " "increase the vm.pmap.shpgperproc tunable.\n"); vpq = &vm_page_queues[PQ_INACTIVE]; retry: TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = pv->pv_pmap; /* Avoid deadlock and lock recursion. 
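/*
 * [Editor's note: illustrative sketch, not part of this change.]  The
 * comparison just below avoids deadlock by imposing a global lock order:
 * a lock that sorts after the one already held may be taken outright,
 * anything else is only try-locked and skipped on failure.  A user-space
 * sketch of that rule with POSIX mutexes; the kernel orders pmap locks by
 * pmap address.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static bool
lock_second(pthread_mutex_t *held, pthread_mutex_t *other)
{
	if (other == held)
		return (true);			/* already ours (recursion) */
	if (other > held) {
		pthread_mutex_lock(other);	/* safe: respects the order */
		return (true);
	}
	/* Would invert the order: only try, let the caller skip on failure. */
	return (pthread_mutex_trylock(other) == 0);
}

int
main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&a);
	if (lock_second(&a, &b)) {
		printf("got both locks\n");
		pthread_mutex_unlock(&b);
	}
	pthread_mutex_unlock(&a);
	return (0);
}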
*/ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); pmap_remove_pte(pmap, pte, va, pv, 1); pmap_install(oldpmap); if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (allocated_pv == NULL) allocated_pv = pv; else free_pv_entry(pv); } } if (allocated_pv == NULL) { if (vpq == &vm_page_queues[PQ_INACTIVE]) { vpq = &vm_page_queues[PQ_ACTIVE]; goto retry; } panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable"); } return (allocated_pv); } /* * Conditionally create a pv entry. */ static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (pv_entry_count < pv_entry_high_water && (pv = uma_zalloc(pvzone, M_NOWAIT)) != NULL) { pv_entry_count++; pv->pv_va = va; pv->pv_pmap = pmap; TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; return (TRUE); } else return (FALSE); } /* * Add an ia64_lpte to the VHPT. */ static void pmap_enter_vhpt(struct ia64_lpte *pte, vm_offset_t va) { struct ia64_bucket *bckt; struct ia64_lpte *vhpte; uint64_t pte_pa; /* Can fault, so get it out of the way. */ pte_pa = ia64_tpa((vm_offset_t)pte); vhpte = (struct ia64_lpte *)ia64_thash(va); bckt = (struct ia64_bucket *)vhpte->chain; mtx_lock_spin(&bckt->mutex); pte->chain = bckt->chain; ia64_mf(); bckt->chain = pte_pa; pmap_vhpt_inserts++; bckt->length++; mtx_unlock_spin(&bckt->mutex); } /* * Remove the ia64_lpte matching va from the VHPT. Return zero if it * worked or an appropriate error code otherwise. */ static int pmap_remove_vhpt(vm_offset_t va) { struct ia64_bucket *bckt; struct ia64_lpte *pte; struct ia64_lpte *lpte; struct ia64_lpte *vhpte; uint64_t chain, tag; tag = ia64_ttag(va); vhpte = (struct ia64_lpte *)ia64_thash(va); bckt = (struct ia64_bucket *)vhpte->chain; lpte = NULL; mtx_lock_spin(&bckt->mutex); chain = bckt->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); while (chain != 0 && pte->tag != tag) { lpte = pte; chain = pte->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); } if (chain == 0) { mtx_unlock_spin(&bckt->mutex); return (ENOENT); } /* Snip this pv_entry out of the collision chain. */ if (lpte == NULL) bckt->chain = pte->chain; else lpte->chain = pte->chain; ia64_mf(); bckt->length--; mtx_unlock_spin(&bckt->mutex); return (0); } /* * Find the ia64_lpte for the given va, if any. */ static struct ia64_lpte * pmap_find_vhpt(vm_offset_t va) { struct ia64_bucket *bckt; struct ia64_lpte *pte; uint64_t chain, tag; tag = ia64_ttag(va); pte = (struct ia64_lpte *)ia64_thash(va); bckt = (struct ia64_bucket *)pte->chain; mtx_lock_spin(&bckt->mutex); chain = bckt->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); while (chain != 0 && pte->tag != tag) { chain = pte->chain; pte = (struct ia64_lpte *)IA64_PHYS_TO_RR7(chain); } mtx_unlock_spin(&bckt->mutex); return ((chain != 0) ? pte : NULL); } /* * Remove an entry from the list of managed mappings. 
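/*
 * [Editor's note: illustrative sketch, not part of this change.]  The VHPT
 * helpers above (pmap_enter_vhpt, pmap_remove_vhpt, pmap_find_vhpt) manage
 * per-bucket singly-linked collision chains.  This user-space model keeps
 * the same push-on-front insert and unlink-with-trailing-pointer remove,
 * but uses ordinary pointers where the kernel chains physical addresses
 * through IA64_PHYS_TO_RR7(), and omits the per-bucket spin locks.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	uint64_t	tag;		/* search key (ia64_ttag() analogue) */
	struct entry	*chain;		/* next entry in the bucket */
};

struct bucket {
	struct entry	*chain;		/* head of the collision chain */
	int		length;
};

static void
bucket_insert(struct bucket *b, struct entry *e)
{
	e->chain = b->chain;		/* push on the front of the chain */
	b->chain = e;
	b->length++;
}

static int
bucket_remove(struct bucket *b, uint64_t tag)
{
	struct entry *e, *prev = NULL;

	for (e = b->chain; e != NULL && e->tag != tag; e = e->chain)
		prev = e;
	if (e == NULL)
		return (ENOENT);
	if (prev == NULL)		/* snip it out of the chain */
		b->chain = e->chain;
	else
		prev->chain = e->chain;
	b->length--;
	free(e);
	return (0);
}

int
main(void)
{
	struct bucket b = { NULL, 0 };
	struct entry *e = calloc(1, sizeof(*e));

	if (e == NULL)
		return (1);
	e->tag = 42;
	bucket_insert(&b, e);
	printf("remove(42) = %d\n", bucket_remove(&b, 42));	/* 0 */
	printf("remove(7) = %d\n", bucket_remove(&b, 7));	/* ENOENT */
	return (0);
}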
*/ static int pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va, pv_entry_t pv) { if (!pv) { if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } } if (pv) { TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_FIRST(&m->md.pv_list) == NULL) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); return 0; } else { return ENOENT; } } /* * Create a pv entry for page at pa for * (pmap, va). */ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; pv = get_pv_entry(pmap); pv->pv_pmap = pmap; pv->pv_va = va; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; pmap_t oldpmap; vm_paddr_t pa; pa = 0; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); if (pte != NULL && pmap_present(pte)) pa = pmap_ppn(pte); pmap_install(oldpmap); PMAP_UNLOCK(pmap); return (pa); } /* * Routine: pmap_extract_and_hold * Function: * Atomically extract and hold the physical page * with the given pmap and virtual address pair * if that mapping permits the given protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { struct ia64_lpte *pte; pmap_t oldpmap; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); if (pte != NULL && pmap_present(pte) && (pmap_prot(pte) & prot) == prot) { m = PHYS_TO_VM_PAGE(pmap_ppn(pte)); vm_page_hold(m); } vm_page_unlock_queues(); pmap_install(oldpmap); PMAP_UNLOCK(pmap); return (m); } /*************************************************** * Low level mapping routines..... ***************************************************/ /* * Find the kernel lpte for mapping the given virtual address, which * must be in the part of region 5 which we can cover with our kernel * 'page tables'. */ static struct ia64_lpte * pmap_find_kpte(vm_offset_t va) { KASSERT((va >> 61) == 5, ("kernel mapping 0x%lx not in region 5", va)); KASSERT(IA64_RR_MASK(va) < (nkpt * PAGE_SIZE * NKPTEPG), ("kernel mapping 0x%lx out of range", va)); return (&ia64_kptdir[KPTE_DIR_INDEX(va)][KPTE_PTE_INDEX(va)]); } /* * Find a pte suitable for mapping a user-space address. If one exists * in the VHPT, that one will be returned, otherwise a new pte is * allocated. */ static struct ia64_lpte * pmap_find_pte(vm_offset_t va) { struct ia64_lpte *pte; if (va >= VM_MAXUSER_ADDRESS) return pmap_find_kpte(va); pte = pmap_find_vhpt(va); if (pte == NULL) { pte = uma_zalloc(ptezone, M_NOWAIT | M_ZERO); pte->tag = 1UL << 63; } return (pte); } /* * Free a pte which is now unused. This simply returns it to the zone * allocator if it is a user mapping. For kernel mappings, clear the * valid bit to make it clear that the mapping is not currently used. 
*/ static void pmap_free_pte(struct ia64_lpte *pte, vm_offset_t va) { if (va < VM_MAXUSER_ADDRESS) uma_zfree(ptezone, pte); else pmap_clear_present(pte); } static PMAP_INLINE void pmap_pte_prot(pmap_t pm, struct ia64_lpte *pte, vm_prot_t prot) { static int prot2ar[4] = { PTE_AR_R, /* VM_PROT_NONE */ PTE_AR_RW, /* VM_PROT_WRITE */ PTE_AR_RX, /* VM_PROT_EXECUTE */ PTE_AR_RWX /* VM_PROT_WRITE|VM_PROT_EXECUTE */ }; pte->pte &= ~(PTE_PROT_MASK | PTE_PL_MASK | PTE_AR_MASK); pte->pte |= (uint64_t)(prot & VM_PROT_ALL) << 56; pte->pte |= (prot == VM_PROT_NONE || pm == kernel_pmap) ? PTE_PL_KERN : PTE_PL_USER; pte->pte |= prot2ar[(prot & VM_PROT_ALL) >> 1]; } /* * Set a pte to contain a valid mapping and enter it in the VHPT. If * the pte was orginally valid, then its assumed to already be in the * VHPT. * This functions does not set the protection bits. It's expected * that those have been set correctly prior to calling this function. */ static void pmap_set_pte(struct ia64_lpte *pte, vm_offset_t va, vm_offset_t pa, boolean_t wired, boolean_t managed) { pte->pte &= PTE_PROT_MASK | PTE_PL_MASK | PTE_AR_MASK; pte->pte |= PTE_PRESENT | PTE_MA_WB; pte->pte |= (managed) ? PTE_MANAGED : (PTE_DIRTY | PTE_ACCESSED); pte->pte |= (wired) ? PTE_WIRED : 0; pte->pte |= pa & PTE_PPN_MASK; pte->itir = PAGE_SHIFT << 2; pte->tag = ia64_ttag(va); } /* * Remove the (possibly managed) mapping represented by pte from the * given pmap. */ static int pmap_remove_pte(pmap_t pmap, struct ia64_lpte *pte, vm_offset_t va, pv_entry_t pv, int freepte) { int error; vm_page_t m; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("removing pte for non-current pmap")); /* * First remove from the VHPT. */ error = pmap_remove_vhpt(va); if (error) return (error); pmap_invalidate_page(pmap, va); if (pmap_wired(pte)) pmap->pm_stats.wired_count -= 1; pmap->pm_stats.resident_count -= 1; if (pmap_managed(pte)) { m = PHYS_TO_VM_PAGE(pmap_ppn(pte)); if (pmap_dirty(pte)) vm_page_dirty(m); if (pmap_accessed(pte)) vm_page_flag_set(m, PG_REFERENCED); error = pmap_remove_entry(pmap, m, va, pv); } if (freepte) pmap_free_pte(pte, va); return (error); } /* * Extract the physical page address associated with a kernel * virtual address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { struct ia64_lpte *pte; vm_offset_t gwpage; KASSERT(va >= IA64_RR_BASE(5), ("Must be kernel VA")); /* Regions 6 and 7 are direct mapped. */ if (va >= IA64_RR_BASE(6)) return (IA64_RR_MASK(va)); /* EPC gateway page? */ gwpage = (vm_offset_t)ia64_get_k5(); if (va >= gwpage && va < gwpage + VM_GATEWAY_SIZE) return (IA64_RR_MASK((vm_offset_t)ia64_gateway_page)); /* Bail out if the virtual address is beyond our limits. */ if (IA64_RR_MASK(va) >= nkpt * PAGE_SIZE * NKPTEPG) return (0); pte = pmap_find_kpte(va); if (!pmap_present(pte)) return (0); return (pmap_ppn(pte) | (va & PAGE_MASK)); } /* * Add a list of wired pages to the kva this routine is only used for * temporary kernel mappings that do not need to have page modification * or references recorded. Note that old mappings are simply written * over. The page is effectively wired, but it's customary to not have * the PTE reflect that, nor update statistics. 
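/*
 * [Editor's note: illustrative sketch, not part of this change.]
 * pmap_kextract() above decides how to translate a kernel address from the
 * top three bits of the VA: regions 6 and 7 are direct-mapped, so the
 * physical address is just the in-region offset, while region 5 goes
 * through the kernel page tables.  The REGION* macros below are stand-ins
 * for the kernel's IA64_RR_BASE/IA64_RR_MASK and show only the bit split.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Top 3 bits of an ia64 virtual address select one of 8 regions. */
#define REGION(va)	((va) >> 61)
/* The remaining 61 bits are the offset within that region. */
#define REGION_OFF(va)	((va) & (((uint64_t)1 << 61) - 1))
#define REGION_BASE(n)	((uint64_t)(n) << 61)

int
main(void)
{
	uint64_t va = REGION_BASE(7) + 0x12345678;	/* a region 7 address */

	printf("region %" PRIu64 ", offset 0x%" PRIx64 "\n",
	    REGION(va), REGION_OFF(va));
	return (0);
}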
*/ void pmap_qenter(vm_offset_t va, vm_page_t *m, int count) { struct ia64_lpte *pte; int i; for (i = 0; i < count; i++) { pte = pmap_find_kpte(va); if (pmap_present(pte)) pmap_invalidate_page(kernel_pmap, va); else pmap_enter_vhpt(pte, va); pmap_pte_prot(kernel_pmap, pte, VM_PROT_ALL); pmap_set_pte(pte, va, VM_PAGE_TO_PHYS(m[i]), FALSE, FALSE); va += PAGE_SIZE; } } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(vm_offset_t va, int count) { struct ia64_lpte *pte; int i; for (i = 0; i < count; i++) { pte = pmap_find_kpte(va); if (pmap_present(pte)) { pmap_remove_vhpt(va); pmap_invalidate_page(kernel_pmap, va); pmap_clear_present(pte); } va += PAGE_SIZE; } } /* * Add a wired page to the kva. As for pmap_qenter(), it's customary * to not have the PTE reflect that, nor update statistics. */ void pmap_kenter(vm_offset_t va, vm_offset_t pa) { struct ia64_lpte *pte; pte = pmap_find_kpte(va); if (pmap_present(pte)) pmap_invalidate_page(kernel_pmap, va); else pmap_enter_vhpt(pte, va); pmap_pte_prot(kernel_pmap, pte, VM_PROT_ALL); pmap_set_pte(pte, va, pa, FALSE, FALSE); } /* * Remove a page from the kva */ void pmap_kremove(vm_offset_t va) { struct ia64_lpte *pte; pte = pmap_find_kpte(va); if (pmap_present(pte)) { pmap_remove_vhpt(va); pmap_invalidate_page(kernel_pmap, va); pmap_clear_present(pte); } } /* * Used to map a range of physical addresses into kernel * virtual address space. * * The value passed in '*virt' is a suggested virtual address for * the mapping. Architectures which can support a direct-mapped * physical to virtual region can return the appropriate address * within that region, leaving '*virt' unchanged. Other * architectures should map the pages starting at '*virt' and * update '*virt' with the first usable address after the mapped * region. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_offset_t start, vm_offset_t end, int prot) { return IA64_PHYS_TO_RR7(start); } /* * Remove a single page from a process address space */ static void pmap_remove_page(pmap_t pmap, vm_offset_t va) { struct ia64_lpte *pte; KASSERT((pmap == kernel_pmap || pmap == PCPU_GET(current_pmap)), ("removing page for non-current pmap")); pte = pmap_find_vhpt(va); if (pte != NULL) pmap_remove_pte(pmap, pte, va, 0, 1); return; } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. */ void pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) { pmap_t oldpmap; vm_offset_t va; pv_entry_t npv, pv; struct ia64_lpte *pte; if (pmap->pm_stats.resident_count == 0) return; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ if (sva + PAGE_SIZE == eva) { pmap_remove_page(pmap, sva); goto out; } if (pmap->pm_stats.resident_count < ((eva - sva) >> PAGE_SHIFT)) { TAILQ_FOREACH_SAFE(pv, &pmap->pm_pvlist, pv_plist, npv) { va = pv->pv_va; if (va >= sva && va < eva) { pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); pmap_remove_pte(pmap, pte, va, pv, 1); } } } else { for (va = sva; va < eva; va = va += PAGE_SIZE) { pte = pmap_find_vhpt(va); if (pte != NULL) pmap_remove_pte(pmap, pte, va, 0, 1); } } out: vm_page_unlock_queues(); pmap_install(oldpmap); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. 
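/*
 * [Editor's note: illustrative sketch, not part of this change.]
 * pmap_remove() above picks its traversal strategy by comparing the number
 * of resident mappings against the number of pages in the range; this
 * stand-alone function repeats that decision.  PAGE_SHIFT is an
 * illustration value.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	13
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static const char *
remove_strategy(uint64_t resident_count, uint64_t sva, uint64_t eva)
{
	if (sva + PAGE_SIZE == eva)
		return ("single page: remove it directly");
	if (resident_count < ((eva - sva) >> PAGE_SHIFT))
		return ("walk the pv list and filter by address");
	return ("probe each page in the range");
}

int
main(void)
{
	/* Sparse pmap, huge range: the pv list is the cheaper walk. */
	printf("%s\n", remove_strategy(10, 0, 1UL << 30));
	/* Dense pmap, tiny range: probing the range wins. */
	printf("%s\n", remove_strategy(100000, 0, 4 * PAGE_SIZE));
	return (0);
}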
* Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pmap_t oldpmap; pv_entry_t pv; #if defined(DIAGNOSTIC) /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! */ if (m->flags & PG_FICTITIOUS) { panic("pmap_page_protect: illegal for unmanaged page, va: 0x%lx", VM_PAGE_TO_PHYS(m)); } #endif mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { struct ia64_lpte *pte; pmap_t pmap = pv->pv_pmap; vm_offset_t va = pv->pv_va; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); if (pmap_ppn(pte) != VM_PAGE_TO_PHYS(m)) panic("pmap_remove_all: pv_table for %lx is inconsistent", VM_PAGE_TO_PHYS(m)); pmap_remove_pte(pmap, pte, va, pv, 1); pmap_install(oldpmap); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { pmap_t oldpmap; struct ia64_lpte *pte; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; if ((sva & PAGE_MASK) || (eva & PAGE_MASK)) panic("pmap_protect: unaligned addresses"); vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); while (sva < eva) { /* * If page is invalid, skip this page */ pte = pmap_find_vhpt(sva); if (pte == NULL) { sva += PAGE_SIZE; continue; } if (pmap_prot(pte) != prot) { if (pmap_managed(pte)) { vm_offset_t pa = pmap_ppn(pte); vm_page_t m = PHYS_TO_VM_PAGE(pa); if (pmap_dirty(pte)) { vm_page_dirty(m); pmap_clear_dirty(pte); } if (pmap_accessed(pte)) { vm_page_flag_set(m, PG_REFERENCED); pmap_clear_accessed(pte); } } pmap_pte_prot(pmap, pte, prot); pmap_invalidate_page(pmap, sva); } sva += PAGE_SIZE; } vm_page_unlock_queues(); pmap_install(oldpmap); PMAP_UNLOCK(pmap); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { pmap_t oldpmap; vm_offset_t pa; vm_offset_t opa; struct ia64_lpte origpte; struct ia64_lpte *pte; boolean_t managed; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); va &= ~PAGE_MASK; #ifdef DIAGNOSTIC if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); #endif /* * Find (or create) a pte for the given mapping. */ while ((pte = pmap_find_pte(va)) == NULL) { pmap_install(oldpmap); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); VM_WAIT; vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); } origpte = *pte; if (!pmap_present(pte)) { opa = ~0UL; pmap_enter_vhpt(pte, va); } else opa = pmap_ppn(pte); managed = FALSE; pa = VM_PAGE_TO_PHYS(m); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. 
Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_wired(&origpte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_wired(&origpte)) pmap->pm_stats.wired_count--; managed = (pmap_managed(&origpte)) ? TRUE : FALSE; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (managed && pmap_dirty(&origpte)) vm_page_dirty(m); pmap_invalidate_page(pmap, va); goto validate; } /* * Mapping has changed, invalidate old range and fall * through to handle validating new mapping. */ if (opa != ~0UL) { pmap_remove_pte(pmap, pte, va, 0, 0); pmap_enter_vhpt(pte, va); } /* * Enter on the PV list if part of our managed memory. */ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva, ("pmap_enter: managed mapping within the clean submap")); pmap_insert_entry(pmap, va, m); managed = TRUE; } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. This * adds the pte to the VHPT if necessary. */ pmap_pte_prot(pmap, pte, prot); pmap_set_pte(pte, va, pa, wired, managed); if ((prot & VM_PROT_WRITE) != 0) vm_page_flag_set(m, PG_WRITEABLE); vm_page_unlock_queues(); pmap_install(oldpmap); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { pmap_t oldpmap; vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); m = m_start; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot); m = TAILQ_NEXT(m, listq); } pmap_install(oldpmap); PMAP_UNLOCK(pmap); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * but is *MUCH* faster than pmap_enter... */ void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { pmap_t oldpmap; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pmap_enter_quick_locked(pmap, va, m, prot); pmap_install(oldpmap); PMAP_UNLOCK(pmap); } static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { struct ia64_lpte *pte; boolean_t managed; KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva || (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0, ("pmap_enter_quick_locked: managed mapping within the clean submap")); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if ((pte = pmap_find_pte(va)) == NULL) return; if (!pmap_present(pte)) { /* Enter on the PV list if the page is managed. 
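/*
 * [Editor's note: illustrative sketch, not part of this change.]  The core
 * decision in pmap_enter() above: re-entering the same physical page is
 * only a wiring/protection update, while a different page replaces the old
 * translation.  The struct below is a toy stand-in for a pte plus the
 * per-pmap wired-page statistic; PV-list and TLB handling are omitted.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct mapping {
	uint64_t	pa;		/* UINT64_MAX means "not present" */
	bool		wired;
	int		wired_count;	/* per-pmap statistic */
};

static void
enter(struct mapping *m, uint64_t new_pa, bool wired)
{
	if (m->pa == new_pa) {
		/* Same page: only the wiring statistic may change. */
		if (wired && !m->wired)
			m->wired_count++;
		else if (!wired && m->wired)
			m->wired_count--;
		printf("same page: update wiring/protection only\n");
	} else {
		if (m->pa != UINT64_MAX)
			printf("replace: remove the old mapping first\n");
		else
			printf("fresh mapping\n");
		m->pa = new_pa;
	}
	m->wired = wired;
}

int
main(void)
{
	struct mapping m = { UINT64_MAX, false, 0 };

	enter(&m, 0x2000, false);	/* fresh mapping */
	enter(&m, 0x2000, true);	/* same page: wiring change */
	enter(&m, 0x4000, true);	/* replace */
	printf("wired_count = %d\n", m.wired_count);	/* 1 */
	return (0);
}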
*/ if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { if (!pmap_try_insert_pv_entry(pmap, va, m)) { pmap_free_pte(pte, va); return; } managed = TRUE; } else managed = FALSE; /* Increment counters. */ pmap->pm_stats.resident_count++; /* Initialise with R/O protection and enter into VHPT. */ pmap_enter_vhpt(pte, va); pmap_pte_prot(pmap, pte, prot & (VM_PROT_READ | VM_PROT_EXECUTE)); pmap_set_pte(pte, va, VM_PAGE_TO_PHYS(m), FALSE, managed); } } /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. */ void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { pmap_t oldpmap; struct ia64_lpte *pte; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(va); KASSERT(pte != NULL, ("pte")); if (wired && !pmap_wired(pte)) { pmap->pm_stats.wired_count++; pmap_set_wired(pte); } else if (!wired && pmap_wired(pte)) { pmap->pm_stats.wired_count--; pmap_clear_wired(pte); } pmap_install(oldpmap); PMAP_UNLOCK(pmap); } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { } /* * pmap_zero_page zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. */ void pmap_zero_page(vm_page_t m) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_zero_page_area zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. * * off and size must reside within a single page. */ void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((char *)(caddr_t)va + off, size); } /* * pmap_zero_page_idle zeros the specified hardware page by * mapping it into virtual memory and using bzero to clear * its contents. This is for the vm_idlezero process. */ void pmap_zero_page_idle(vm_page_t m) { vm_offset_t va = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(m)); bzero((caddr_t) va, PAGE_SIZE); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t src = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(msrc)); vm_offset_t dst = IA64_PHYS_TO_RR7(VM_PAGE_TO_PHYS(mdst)); bcopy((caddr_t) src, (caddr_t) dst, PAGE_SIZE); } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. 
*/ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; /* * Not found, check current mappings returning immediately if found. */ mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Remove all pages from specified address space * this aids process exit speeds. Also, this code * is special cased for current process only, but * can have the more generic (and slightly slower) * mode enabled. This is much faster than pmap_remove * in the case of running down an entire address space. */ void pmap_remove_pages(pmap_t pmap) { pmap_t oldpmap; pv_entry_t pv, npv; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) { printf("warning: pmap_remove_pages called with non-current pmap\n"); return; } vm_page_lock_queues(); PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { struct ia64_lpte *pte; npv = TAILQ_NEXT(pv, pv_plist); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (!pmap_wired(pte)) pmap_remove_pte(pmap, pte, pv->pv_va, pv, 1); } pmap_install(oldpmap); PMAP_UNLOCK(pmap); vm_page_unlock_queues(); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; int count = 0; if (m->flags & PG_FICTITIOUS) return 0; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_install(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pmap_accessed(pte)) { count++; pmap_clear_accessed(pte); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_install(oldpmap); PMAP_UNLOCK(pv->pv_pmap); } return count; } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; boolean_t rv; rv = FALSE; if (m->flags & PG_FICTITIOUS) return (rv); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_install(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); pmap_install(oldpmap); KASSERT(pte != NULL, ("pte")); rv = pmap_dirty(pte) ? TRUE : FALSE; PMAP_UNLOCK(pv->pv_pmap); if (rv) break; } return (rv); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is elgible * for prefault. */ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { struct ia64_lpte *pte; pte = pmap_find_vhpt(addr); if (pte != NULL && pmap_present(pte)) return (FALSE); return (TRUE); } /* * Clear the modify bits on the specified physical page. 
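/*
 * [Editor's note: illustrative sketch, not part of this change.]  The
 * reference-bit harvesting done by pmap_ts_referenced() above is a
 * test-and-clear walk over every mapping of the page; this user-space
 * model keeps only that walk (per-pmap locking, pmap_install() and the
 * TLB invalidations are omitted).
 */
#include <stdbool.h>
#include <stdio.h>

#define NMAPPINGS	3

struct pv {			/* one mapping of the page */
	bool accessed;
};

static int
ts_referenced(struct pv *pvs, int n)
{
	int i, count = 0;

	for (i = 0; i < n; i++) {
		if (pvs[i].accessed) {	/* test and clear the reference bit */
			count++;
			pvs[i].accessed = false;
		}
	}
	return (count);
}

int
main(void)
{
	struct pv pvs[NMAPPINGS] = { { true }, { false }, { true } };

	printf("first pass: %d\n", ts_referenced(pvs, NMAPPINGS));	/* 2 */
	printf("second pass: %d\n", ts_referenced(pvs, NMAPPINGS));	/* 0 */
	return (0);
}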
*/ void pmap_clear_modify(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; if (m->flags & PG_FICTITIOUS) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_install(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pmap_dirty(pte)) { pmap_clear_dirty(pte); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_install(oldpmap); PMAP_UNLOCK(pv->pv_pmap); } } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap; pv_entry_t pv; if (m->flags & PG_FICTITIOUS) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { PMAP_LOCK(pv->pv_pmap); oldpmap = pmap_install(pv->pv_pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); if (pmap_accessed(pte)) { pmap_clear_accessed(pte); pmap_invalidate_page(pv->pv_pmap, pv->pv_va); } pmap_install(oldpmap); PMAP_UNLOCK(pv->pv_pmap); } } /* * Clear the write and modified bits in each of the given page's mappings. */ void pmap_remove_write(vm_page_t m) { struct ia64_lpte *pte; pmap_t oldpmap, pmap; pv_entry_t pv; vm_prot_t prot; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & PG_FICTITIOUS) != 0 || (m->flags & PG_WRITEABLE) == 0) return; TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { pmap = pv->pv_pmap; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(pv->pv_va); KASSERT(pte != NULL, ("pte")); prot = pmap_prot(pte); if ((prot & VM_PROT_WRITE) != 0) { if (pmap_dirty(pte)) { vm_page_dirty(m); pmap_clear_dirty(pte); } prot &= ~VM_PROT_WRITE; pmap_pte_prot(pmap, pte, prot); pmap_invalidate_page(pmap, pv->pv_va); } pmap_install(oldpmap); PMAP_UNLOCK(pmap); } vm_page_flag_clear(m, PG_WRITEABLE); } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. */ void * pmap_mapdev(vm_offset_t pa, vm_size_t size) { return (void*) IA64_PHYS_TO_RR6(pa); } /* * 'Unmap' a range mapped by pmap_mapdev(). 
*/ void pmap_unmapdev(vm_offset_t va, vm_size_t size) { return; } /* * perform the pmap work for mincore */ int pmap_mincore(pmap_t pmap, vm_offset_t addr) { pmap_t oldpmap; struct ia64_lpte *pte, tpte; int val = 0; PMAP_LOCK(pmap); oldpmap = pmap_install(pmap); pte = pmap_find_vhpt(addr); if (pte != NULL) { tpte = *pte; pte = &tpte; } pmap_install(oldpmap); PMAP_UNLOCK(pmap); if (pte == NULL) return 0; if (pmap_present(pte)) { vm_page_t m; vm_offset_t pa; val = MINCORE_INCORE; if (!pmap_managed(pte)) return val; pa = pmap_ppn(pte); m = PHYS_TO_VM_PAGE(pa); /* * Modified by us */ if (pmap_dirty(pte)) val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; else { /* * Modified by someone */ vm_page_lock_queues(); if (pmap_is_modified(m)) val |= MINCORE_MODIFIED_OTHER; vm_page_unlock_queues(); } /* * Referenced by us */ if (pmap_accessed(pte)) val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; else { /* * Referenced by someone */ vm_page_lock_queues(); if (pmap_ts_referenced(m)) { val |= MINCORE_REFERENCED_OTHER; vm_page_flag_set(m, PG_REFERENCED); } vm_page_unlock_queues(); } } return val; } void pmap_activate(struct thread *td) { pmap_install(vmspace_pmap(td->td_proc->p_vmspace)); } pmap_t pmap_switch(pmap_t pm) { pmap_t prevpm; int i; mtx_assert(&sched_lock, MA_OWNED); prevpm = PCPU_GET(current_pmap); if (prevpm == pm) return (prevpm); if (prevpm != NULL) atomic_clear_32(&prevpm->pm_active, PCPU_GET(cpumask)); if (pm == NULL) { for (i = 0; i < 5; i++) { ia64_set_rr(IA64_RR_BASE(i), (i << 8)|(PAGE_SHIFT << 2)|1); } } else { for (i = 0; i < 5; i++) { ia64_set_rr(IA64_RR_BASE(i), (pm->pm_rid[i] << 8)|(PAGE_SHIFT << 2)|1); } atomic_set_32(&pm->pm_active, PCPU_GET(cpumask)); } PCPU_SET(current_pmap, pm); __asm __volatile("srlz.d"); return (prevpm); } static pmap_t pmap_install(pmap_t pm) { pmap_t prevpm; mtx_lock_spin(&sched_lock); prevpm = pmap_switch(pm); mtx_unlock_spin(&sched_lock); return (prevpm); } vm_offset_t pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) { return addr; } #include "opt_ddb.h" #ifdef DDB #include static const char* psnames[] = { "1B", "2B", "4B", "8B", "16B", "32B", "64B", "128B", "256B", "512B", "1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K", "1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M", "1G", "2G" }; static void print_trs(int type) { struct ia64_pal_result res; int i, maxtr; struct { pt_entry_t pte; uint64_t itir; uint64_t ifa; struct ia64_rr rr; } buf; static const char *manames[] = { "WB", "bad", "bad", "bad", "UC", "UCE", "WC", "NaT", }; res = ia64_call_pal_static(PAL_VM_SUMMARY, 0, 0, 0); if (res.pal_status != 0) { db_printf("Can't get VM summary\n"); return; } if (type == 0) maxtr = (res.pal_result[0] >> 40) & 0xff; else maxtr = (res.pal_result[0] >> 32) & 0xff; db_printf("V RID Virtual Page Physical Page PgSz ED AR PL D A MA P KEY\n"); for (i = 0; i <= maxtr; i++) { bzero(&buf, sizeof(buf)); res = ia64_call_pal_stacked_physical (PAL_VM_TR_READ, i, type, ia64_tpa((uint64_t) &buf)); if (!(res.pal_result[0] & 1)) buf.pte &= ~PTE_AR_MASK; if (!(res.pal_result[0] & 2)) buf.pte &= ~PTE_PL_MASK; if (!(res.pal_result[0] & 4)) pmap_clear_dirty(&buf); if (!(res.pal_result[0] & 8)) buf.pte &= ~PTE_MA_MASK; db_printf("%d %06x %013lx %013lx %4s %d %d %d %d %d %-3s " "%d %06x\n", (int)buf.ifa & 1, buf.rr.rr_rid, buf.ifa >> 12, (buf.pte & PTE_PPN_MASK) >> 12, psnames[(buf.itir & ITIR_PS_MASK) >> 2], (buf.pte & PTE_ED) ? 1 : 0, (int)(buf.pte & PTE_AR_MASK) >> 9, (int)(buf.pte & PTE_PL_MASK) >> 7, (pmap_dirty(&buf)) ? 
1 : 0, (pmap_accessed(&buf)) ? 1 : 0, manames[(buf.pte & PTE_MA_MASK) >> 2], (pmap_present(&buf)) ? 1 : 0, (int)((buf.itir & ITIR_KEY_MASK) >> 8)); } } DB_COMMAND(itr, db_itr) { print_trs(0); } DB_COMMAND(dtr, db_dtr) { print_trs(1); } DB_COMMAND(rr, db_rr) { int i; uint64_t t; struct ia64_rr rr; printf("RR RID PgSz VE\n"); for (i = 0; i < 8; i++) { __asm __volatile ("mov %0=rr[%1]" : "=r"(t) : "r"(IA64_RR_BASE(i))); *(uint64_t *) &rr = t; printf("%d %06x %4s %d\n", i, rr.rr_rid, psnames[rr.rr_ps], rr.rr_ve); } } DB_COMMAND(thash, db_thash) { if (!have_addr) return; db_printf("%p\n", (void *) ia64_thash(addr)); } DB_COMMAND(ttag, db_ttag) { if (!have_addr) return; db_printf("0x%lx\n", ia64_ttag(addr)); } DB_COMMAND(kpte, db_kpte) { struct ia64_lpte *pte; if (!have_addr) { db_printf("usage: kpte \n"); return; } if (addr < VM_MIN_KERNEL_ADDRESS) { db_printf("kpte: error: invalid \n"); return; } pte = &ia64_kptdir[KPTE_DIR_INDEX(addr)][KPTE_PTE_INDEX(addr)]; db_printf("kpte at %p:\n", pte); db_printf(" pte =%016lx\n", pte->pte); db_printf(" itir =%016lx\n", pte->itir); db_printf(" tag =%016lx\n", pte->tag); db_printf(" chain=%016lx\n", pte->chain); } #endif Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 169666) +++ head/sys/kern/init_main.c (revision 169667) @@ -1,736 +1,736 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_init_path.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread thread0 __aligned(16); struct vmspace vmspace0; struct proc *initproc; int boothowto = 0; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); int bootverbose; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, ""); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. */ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". 
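/*
 * [Editor's note: illustrative sketch, not part of this change.]
 * sysinit_add() above concatenates the currently active sysinit set with a
 * newly registered one into a freshly allocated array and lets mi_startup()
 * re-sort the result.  A user-space model of that concatenation, using
 * strings in place of struct sysinit pointers:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static const char **
merge_sets(const char **cur, int ncur, const char **add, int nadd, int *ntotal)
{
	const char **newset;

	*ntotal = ncur + nadd;
	newset = malloc(*ntotal * sizeof(*newset));
	if (newset == NULL)
		return (NULL);
	memcpy(newset, cur, ncur * sizeof(*newset));		/* old set */
	memcpy(newset + ncur, add, nadd * sizeof(*newset));	/* new set */
	return (newset);
}

int
main(void)
{
	const char *cur[] = { "clock", "vm" };
	const char *add[] = { "mymodule" };
	const char **all;
	int i, n;

	all = merge_sets(cur, 2, add, 1, &n);
	if (all == NULL)
		return (1);
	for (i = 0; i < n; i++)
		printf("%s\n", all[i]);
	free(all);
	return (0);
}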
*/ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ #if defined(VERBOSE_SYSINIT) int last; int verbose; #endif if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } #if defined(VERBOSE_SYSINIT) last = SI_SUB_COPYRIGHT; verbose = 0; #if !defined(DDB) printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n"); #endif #endif /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; #if defined(VERBOSE_SYSINIT) if ((*sipp)->subsystem > last) { verbose = 1; last = (*sipp)->subsystem; printf("subsystem %x\n", last); } if (verbose) { #if defined(DDB) const char *name; c_db_sym_t sym; db_expr_t offset; sym = db_search_symbol((vm_offset_t)(*sipp)->func, DB_STGY_PROC, &offset); db_symbol_values(sym, &name, NULL); if (name != NULL) printf(" %s(%p)... ", name, (*sipp)->udata); else #endif printf(" %p(%p)... ", (*sipp)->func, (*sipp)->udata); } #endif /* Call function */ (*((*sipp)->func))((*sipp)->udata); #if defined(VERBOSE_SYSINIT) if (verbose) printf("done.\n"); #endif /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. 
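/*
 * [Editor's note: illustrative sketch, not part of this change.]  The sort
 * in mi_startup() above (its comment calls it a bubble sort; it is a simple
 * pairwise exchange sort) orders entries by subsystem first and order
 * second.  The same comparison, applied to a toy item type:
 */
#include <stdio.h>

struct item {
	int subsystem;	/* primary key */
	int order;	/* secondary key */
};

static void
sort_items(struct item *items, int n)
{
	struct item save;
	int i, j;

	for (i = 0; i < n; i++) {
		for (j = i + 1; j < n; j++) {
			if (items[i].subsystem < items[j].subsystem ||
			    (items[i].subsystem == items[j].subsystem &&
			    items[i].order <= items[j].order))
				continue;	/* already in order */
			save = items[i];	/* swap out-of-order pair */
			items[i] = items[j];
			items[j] = save;
		}
	}
}

int
main(void)
{
	struct item items[] = { {3, 1}, {1, 2}, {1, 1}, {2, 9} };
	int i;

	sort_items(items, 4);
	for (i = 0; i < 4; i++)
		printf("(%d,%d) ", items[i].subsystem, items[i].order);
	printf("\n");	/* (1,1) (1,2) (2,9) (3,1) */
	return (0);
}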
**** *************************************************************************** */ static void print_caddr_t(void *data __unused) { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark) SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_caddr_t, version) #ifdef WITNESS static char wit_warn[] = "WARNING: WITNESS option enabled, expect reduced performance.\n"; SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1, print_caddr_t, wit_warn) #endif #ifdef DIAGNOSTIC static char diag_warn[] = "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n"; SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2, print_caddr_t, diag_warn) #endif static void set_boot_verbose(void *data __unused) { if (boothowto & RB_VERBOSE) bootverbose++; } SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL) struct sysentvec null_sysvec = { 0, NULL, 0, 0, NULL, 0, NULL, NULL, NULL, NULL, NULL, NULL, NULL, "null", NULL, NULL, 0, PAGE_SIZE, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, USRSTACK, PS_STRINGS, VM_PROT_ALL, NULL, NULL, NULL }; /* *************************************************************************** **** **** The two following SYSINIT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { struct proc *p; unsigned i; struct thread *td; GIANT_REQUIRED; p = &proc0; td = &thread0; /* * Initialize magic number. */ p->p_magic = P_MAGIC; /* * Initialize thread and process structures. */ procinit(); /* set up proc zone */ threadinit(); /* set up UMA zones */ /* * Initialise scheduler resources. * Add scheduler specific parts to proc, thread as needed. */ schedinit(); /* scheduler gets its house in order */ /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF); session0.s_count = 1; session0.s_leader = p; p->p_sysent = &null_sysvec; p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; p->p_state = PRS_NORMAL; knlist_init(&p->p_klist, &p->p_mtx, NULL, NULL, NULL); STAILQ_INIT(&p->p_ktr); p->p_nice = NZERO; td->td_state = TDS_RUNNING; td->td_pri_class = PRI_TIMESHARE; td->td_user_pri = PUSER; td->td_base_user_pri = PUSER; td->td_priority = PVM; td->td_base_pri = PUSER; td->td_oncpu = 0; p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); callout_init(&p->p_itcallout, CALLOUT_MPSAFE); callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); /* Create credentials. */ p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. 
*/ #ifdef AUDIT audit_proc_alloc(p); audit_proc_kproc0(p); #endif #ifdef MAC mac_create_proc0(p->p_ucred); #endif td->td_ucred = crhold(p->p_ucred); /* Create sigacts. */ p->p_sigacts = sigacts_alloc(); /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ p->p_fd = fdinit(NULL); p->p_fdtol = NULL; /* Create the limits structures. */ p->p_limit = lim_alloc(); for (i = 0; i < RLIM_NLIMITS; i++) p->p_limit->pl_rlimit[i].rlim_cur = p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY; p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur = p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; - i = ptoa(cnt.v_free_count); + i = ptoa(VMCNT_GET(free_count)); p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = i; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; p->p_cpulimit = RLIM_INFINITY; p->p_stats = pstats_alloc(); /* Allocate a prototype map so we have something to fork. */ pmap_pinit0(vmspace_pmap(&vmspace0)); p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser); vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; /* * Now we can look at the time, having had a chance to verify the * time from the filesystem. Pretend that proc0 started now. */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { microuptime(&p->p_stats->p_start); p->p_rux.rux_runtime = 0; } sx_sunlock(&allproc_lock); PCPU_SET(switchtime, cpu_ticks()); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init:/stand/sysinstall"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Shutdown timeout of init(8). * Unused within kernel, but used to control init(8), hence do not remove. */ #ifndef INIT_SHUTDOWN_TIMEOUT #define INIT_SHUTDOWN_TIMEOUT 120 #endif static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT; SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout, CTLFLAG_RW, &init_shutdown_timeout, 0, ""); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. 
*/ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; vfs_mountroot(); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = p->p_sysent->sv_usrstack - PAGE_SIZE; if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = getenv("init_path")) != NULL) { strlcpy(init_path, var, sizeof(init_path)); freeenv(var); } for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. */ options = 0; ucp = (char *)p->p_sysent->sv_usrstack; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kthread_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. */ static void create_init(const void *udata __unused) { struct ucred *newcred, *oldcred; int error; error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc); if (error) panic("cannot fork init: %d\n", error); KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1")); /* divorce init's credentials from the kernel's */ newcred = crget(); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM; oldcred = initproc->p_ucred; crcopy(newcred, oldcred); #ifdef MAC mac_create_proc1(newcred); #endif #ifdef AUDIT audit_proc_init(initproc); #endif initproc->p_ucred = newcred; PROC_UNLOCK(initproc); crfree(oldcred); cred_update_thread(FIRST_THREAD_IN_PROC(initproc)); mtx_lock_spin(&sched_lock); initproc->p_sflag |= PS_INMEM; mtx_unlock_spin(&sched_lock); cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) /* * Make it runnable now. 
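/*
 * [Editor's note: illustrative sketch, not part of this change.]
 * start_init() above walks the colon-separated init_path, skipping empty
 * components and taking each candidate in turn.  The same pointer walk in
 * a user-space program that simply prints each candidate:
 */
#include <stdio.h>

static void
try_each(const char *list)
{
	const char *path, *next;

	for (path = list; *path != '\0'; path = next) {
		while (*path == ':')		/* skip empty components */
			path++;
		if (*path == '\0')
			break;
		for (next = path; *next != '\0' && *next != ':'; next++)
			;			/* find the end of this one */
		printf("trying %.*s\n", (int)(next - path), path);
	}
}

int
main(void)
{
	try_each("/sbin/init:/sbin/oinit:/rescue/init");
	return (0);
}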
*/ static void kick_init(const void *udata __unused) { struct thread *td; td = FIRST_THREAD_IN_PROC(initproc); mtx_lock_spin(&sched_lock); TD_SET_CAN_RUN(td); sched_add(td, SRQ_BORING); mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: head/sys/kern/kern_fork.c =================================================================== --- head/sys/kern/kern_fork.c (revision 169666) +++ head/sys/kern/kern_fork.c (revision 169667) @@ -1,848 +1,848 @@ /*- * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct fork_args { int dummy; }; #endif /* ARGSUSED */ int fork(td, uap) struct thread *td; struct fork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } /* ARGSUSED */ int vfork(td, uap) struct thread *td; struct vfork_args *uap; { int error; struct proc *p2; error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2); if (error == 0) { td->td_retval[0] = p2->p_pid; td->td_retval[1] = 0; } return (error); } int rfork(td, uap) struct thread *td; struct rfork_args *uap; { struct proc *p2; int error; /* Don't allow kernel-only flags. 
*/ if ((uap->flags & RFKERNELONLY) != 0) return (EINVAL); AUDIT_ARG(fflags, uap->flags); error = fork1(td, uap->flags, 0, &p2); if (error == 0) { td->td_retval[0] = p2 ? p2->p_pid : 0; td->td_retval[1] = 0; } return (error); } int nprocs = 1; /* process 0 */ int lastpid = 0; SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, "Last used PID"); /* * Random component to lastpid generation. We mix in a random factor to make * it a little harder to predict. We sanity check the modulus value to avoid * doing it in critical paths. Don't let it be too small or we pointlessly * waste randomness entropy, and don't let it be impossibly large. Using a * modulus that is too big causes a LOT more process table scans and slows * down fork processing as the pidchecked caching is defeated. */ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; error = sysctl_wire_old_buffer(req, sizeof(int)); if (error != 0) return(error); sx_xlock(&allproc_lock); pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error == 0 && req->newptr != NULL) { if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; } sx_xunlock(&allproc_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); int fork1(td, flags, pages, procp) struct thread *td; int flags; int pages; struct proc **procp; { struct proc *p1, *p2, *pptr; struct proc *newproc; int ok, trypid; static int curfail, pidchecked = 0; static struct timeval lastfail; struct filedesc *fd; struct filedesc_to_leader *fdtol; struct thread *td2; struct sigacts *newsigacts; int error; /* Can't copy and clear. */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); p1 = td->td_proc; /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { if ((p1->p_flag & P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); if (thread_single(SINGLE_BOUNDARY)) { PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); } vm_forkproc(td, NULL, NULL, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td->td_proc->p_fd); fdfree(td); p1->p_fd = fdtmp; } /* * Unshare file descriptors (from parent). */ if (flags & RFFDG) fdunshare(p1, td); if ((p1->p_flag & P_HADTHREADS) && (flags & (RFCFDG | RFFDG))) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } *procp = NULL; return (0); } /* * Note 1:1 allows for forking with one thread coming out on the * other side with the expectation that the process is about to * exec. */ if (p1->p_flag & P_HADTHREADS) { /* * Idle the other threads for a second. * Since the user space is copied, it must remain stable. * In addition, all threads (from the user perspective) * need to either be suspended or in the kernel, * where they will try restart in the parent and will * be aborted in the child. */ PROC_LOCK(p1); if (thread_single(SINGLE_NO_EXIT)) { /* Abort. Someone else is single threading before us. */ PROC_UNLOCK(p1); return (ERESTART); } PROC_UNLOCK(p1); /* * All other activity in this process * is now suspended at the user boundary, * (or other safe places if we think of any). */ } /* Allocate new proc. 
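/*
 * The sysctl handler above clamps the PID-randomization modulus before
 * fork1() (below) applies it as "trypid += arc4random() % randompid".
 * A small stand-alone sketch of the same clamping policy; clamp_randompid()
 * is a hypothetical helper, and PID_MAX mirrors <sys/proc.h>:
 */
#include <stdio.h>

#define	PID_MAX	99999

static int
clamp_randompid(int pid)
{
	if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
		pid = PID_MAX - 100;
	else if (pid < 2)			/* effectively disables randomization */
		pid = 0;
	else if (pid < 100)			/* keep the modulus useful */
		pid = 100;
	return (pid);
}

int
main(void)
{
	int samples[] = { -5, 0, 1, 50, 5000, 200000 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("randompid %6d is stored as %d\n", samples[i],
		    clamp_randompid(samples[i]));
	return (0);
}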
*/ newproc = uma_zalloc(proc_zone, M_WAITOK); #ifdef MAC mac_init_proc(newproc); #endif #ifdef AUDIT audit_proc_alloc(newproc); #endif knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL); STAILQ_INIT(&newproc->p_ktr); /* We have to lock the process tree while we look for a pid. */ sx_slock(&proctree_lock); /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. Don't allow * a nonprivileged user to use the last ten processes; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ sx_xlock(&allproc_lock); if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred, PRIV_MAXPROC, SUSER_RUID) != 0) || nprocs >= maxproc) { error = EAGAIN; goto fail; } /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. * * XXXRW: Can we avoid privilege here if it's not needed? */ error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, SUSER_RUID | SUSER_ALLOWJAIL); if (error == 0) ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0); else { PROC_LOCK(p1); ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, lim_cur(p1, RLIMIT_NPROC)); PROC_UNLOCK(p1); } if (!ok) { error = EAGAIN; goto fail; } /* * Increment the nprocs resource before blocking can occur. There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) trypid = 10; } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. */ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { while (p2->p_pid == trypid || (p2->p_pgrp != NULL && (p2->p_pgrp->pg_id == trypid || (p2->p_session != NULL && p2->p_session->s_sid == trypid)))) { trypid++; if (trypid >= pidchecked) goto retry; } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp != NULL) { if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session != NULL && p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } sx_sunlock(&proctree_lock); /* * RFHIGHPID does not mess with the lastpid counter during boot. 
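/*
 * A user-space model of the PID search above, to make the "pidchecked"
 * caching concrete: one scan records the lowest in-use ID greater than
 * trypid, so later allocations inside [lastpid+1, pidchecked-1] skip the
 * scan entirely.  The used[] array is a stand-in for the allproc and
 * zombproc lists (which also contribute process group and session IDs):
 */
#include <stdio.h>

#define	PID_MAX	99999

static int
alloc_pid(const int *used, int nused, int *lastpid, int *pidchecked)
{
	int trypid = *lastpid + 1;

retry:
	if (trypid >= PID_MAX) {
		trypid = trypid % PID_MAX;
		if (trypid < 100)
			trypid += 100;
		*pidchecked = 0;
	}
	if (trypid >= *pidchecked) {
		*pidchecked = PID_MAX;
		for (int i = 0; i < nused; i++) {
			while (used[i] == trypid) {
				trypid++;
				if (trypid >= *pidchecked)
					goto retry;
			}
			if (used[i] > trypid && used[i] < *pidchecked)
				*pidchecked = used[i];
		}
	}
	*lastpid = trypid;
	return (trypid);
}

int
main(void)
{
	int used[] = { 1, 2, 5, 6, 7, 40 };
	int lastpid = 4, pidchecked = 0;

	for (int i = 0; i < 4; i++)	/* only the first call has to scan */
		printf("pid %d (pidchecked %d)\n",
		    alloc_pid(used, 6, &lastpid, &pidchecked), pidchecked);
	return (0);
}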
*/ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_state = PRS_NEW; /* protect against others */ p2->p_pid = trypid; AUDIT_ARG(pid, p2->p_pid); LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); PROC_LOCK(p2); PROC_LOCK(p1); sx_xunlock(&allproc_lock); bcopy(&p1->p_startcopy, &p2->p_startcopy, __rangeof(struct proc, p_startcopy, p_endcopy)); PROC_UNLOCK(p1); bzero(&p2->p_startzero, __rangeof(struct proc, p_startzero, p_endzero)); p2->p_ucred = crhold(td->td_ucred); PROC_UNLOCK(p2); /* * Malloc things while we don't hold any locks. */ if (flags & RFSIGSHARE) newsigacts = NULL; else newsigacts = sigacts_alloc(); /* * Copy filedesc. */ if (flags & RFCFDG) { fd = fdinit(p1->p_fd); fdtol = NULL; } else if (flags & RFFDG) { fd = fdcopy(p1->p_fd); fdtol = NULL; } else { fd = fdshare(p1->p_fd); if (p1->p_fdtol == NULL) p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL, p1->p_leader); if ((flags & RFTHREAD) != 0) { /* * Shared file descriptor table and * shared process leaders. */ fdtol = p1->p_fdtol; FILEDESC_XLOCK(p1->p_fd); fdtol->fdl_refcount++; FILEDESC_XUNLOCK(p1->p_fd); } else { /* * Shared file descriptor table, and * different process leaders */ fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p1->p_fd, p2); } } /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. */ td2 = FIRST_THREAD_IN_PROC(p2); /* Allocate and switch to an alternate kstack if specified. */ if (pages != 0) vm_thread_new_altkstack(td2, pages); PROC_LOCK(p2); PROC_LOCK(p1); bzero(&td2->td_startzero, __rangeof(struct thread, td_startzero, td_endzero)); bcopy(&td->td_startcopy, &td2->td_startcopy, __rangeof(struct thread, td_startcopy, td_endcopy)); td2->td_sigstk = td->td_sigstk; td2->td_sigmask = td->td_sigmask; /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. */ p2->p_flag = 0; if (p1->p_flag & P_PROFIL) startprofclock(p2); mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; /* * Allow the scheduler to adjust the priority of the child and * parent while we hold the sched_lock. */ sched_fork(td, td2); mtx_unlock_spin(&sched_lock); td2->td_ucred = crhold(p2->p_ucred); #ifdef AUDIT audit_proc_fork(p1, p2); #endif pargs_hold(p2->p_args); if (flags & RFSIGSHARE) { p2->p_sigacts = sigacts_hold(p1->p_sigacts); } else { sigacts_copy(newsigacts, p1->p_sigacts); p2->p_sigacts = newsigacts; } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; p2->p_textvp = p1->p_textvp; p2->p_fd = fd; p2->p_fdtol = fdtol; /* * p_limit is copy-on-write. Bump its refcount. */ p2->p_limit = lim_hold(p1->p_limit); pstats_fork(p1->p_stats, p2->p_stats); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Bump references to the text vnode (for procfs) */ if (p2->p_textvp) vref(p2->p_textvp); /* * Set up linkage for kernel based threading. */ if ((flags & RFTHREAD) != 0) { mtx_lock(&ppeers_lock); p2->p_peers = p1->p_peers; p1->p_peers = p2; p2->p_leader = p1->p_leader; mtx_unlock(&ppeers_lock); PROC_LOCK(p1->p_leader); if ((p1->p_leader->p_flag & P_WEXIT) != 0) { PROC_UNLOCK(p1->p_leader); /* * The task leader is exiting, so process p1 is * going to be killed shortly. Since p1 obviously * isn't dead yet, we know that the leader is either * sending SIGKILL's to all the processes in this * task or is sleeping waiting for all the peers to * exit. 
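/*
 * The bzero()/bcopy() pairs above rely on marker fields and a byte-span
 * macro rather than copying members one by one.  A toy illustration of the
 * same technique; the struct is invented, and __rangeof is assumed to be
 * the usual offsetof() difference:
 */
#include <stddef.h>
#include <strings.h>

#define	__rangeof(type, start, end) \
	(offsetof(type, end) - offsetof(type, start))

struct toy_proc {
	int	tp_refcnt;	/* outside both ranges: left untouched */
	int	tp_startzero;	/* zeroed range is [tp_startzero, tp_endzero) */
	int	tp_pending;
	int	tp_flags;
	int	tp_endzero;
	int	tp_startcopy;	/* copied range is [tp_startcopy, tp_endcopy) */
	int	tp_nice;
	int	tp_endcopy;
};

static void
toy_fork_copy(struct toy_proc *child, const struct toy_proc *parent)
{
	bzero(&child->tp_startzero,
	    __rangeof(struct toy_proc, tp_startzero, tp_endzero));
	bcopy(&parent->tp_startcopy, &child->tp_startcopy,
	    __rangeof(struct toy_proc, tp_startcopy, tp_endcopy));
}

int
main(void)
{
	struct toy_proc parent = { 1, 0, 9, 9, 0, 0, -5, 0 };
	struct toy_proc child = { 1, 7, 7, 7, 7, 7, 7, 7 };

	toy_fork_copy(&child, &parent);
	/* Exit 0 when the copy range arrived and the zero range was cleared. */
	return (child.tp_nice == parent.tp_nice && child.tp_pending == 0 ? 0 : 1);
}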
We let p1 complete the fork, but we need * to go ahead and kill the new process p2 since * the task leader may not get a chance to send * SIGKILL to it. We leave it on the list so that * the task leader will wait for this new process * to commit suicide. */ PROC_LOCK(p2); psignal(p2, SIGKILL); PROC_UNLOCK(p2); } else PROC_UNLOCK(p1->p_leader); } else { p2->p_peers = NULL; p2->p_leader = p2; } sx_xlock(&proctree_lock); PGRP_LOCK(p1->p_pgrp); PROC_LOCK(p2); PROC_LOCK(p1); /* * Preserve some more flags in subprocess. P_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & P_SUGID; td2->td_pflags |= td->td_pflags & TDP_ALTSTACK; SESS_LOCK(p1->p_session); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; SESS_UNLOCK(p1->p_session); if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; p2->p_pgrp = p1->p_pgrp; LIST_INSERT_AFTER(p1, p2, p_pglist); PGRP_UNLOCK(p1->p_pgrp); LIST_INIT(&p2->p_children); callout_init(&p2->p_itcallout, CALLOUT_MPSAFE); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. */ mtx_lock(&ktrace_mtx); KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode")); if (p1->p_traceflag & KTRFAC_INHERIT) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracevp = p1->p_tracevp) != NULL) { VREF(p2->p_tracevp); KASSERT(p1->p_tracecred != NULL, ("ktrace vnode with no cred")); p2->p_tracecred = crhold(p1->p_tracecred); } } mtx_unlock(&ktrace_mtx); #endif /* * If PF_FORK is set, the child process inherits the * procfs ioctl flags from its parent. */ if (p1->p_pfsflags & PF_FORK) { p2->p_stops = p1->p_stops; p2->p_pfsflags = p1->p_pfsflags; } /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; p2->p_pptr = pptr; LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); /* Inform accounting that we have forked. */ p2->p_acflag = AFORK; PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. (ie: directly into user mode) */ vm_forkproc(td, p2, td2, flags); if (flags == (RFFDG | RFPROC)) { - atomic_add_int(&cnt.v_forks, 1); - atomic_add_int(&cnt.v_forkpages, p2->p_vmspace->vm_dsize + + VMCNT_ADD(forks, 1); + VMCNT_ADD(forkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { - atomic_add_int(&cnt.v_vforks, 1); - atomic_add_int(&cnt.v_vforkpages, p2->p_vmspace->vm_dsize + + VMCNT_ADD(vforks, 1); + VMCNT_ADD(vforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else if (p1 == &proc0) { - atomic_add_int(&cnt.v_kthreads, 1); - atomic_add_int(&cnt.v_kthreadpages, p2->p_vmspace->vm_dsize + + VMCNT_ADD(kthreads, 1); + VMCNT_ADD(kthreadpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } else { - atomic_add_int(&cnt.v_rforks, 1); - atomic_add_int(&cnt.v_rforkpages, p2->p_vmspace->vm_dsize + + VMCNT_ADD(rforks, 1); + VMCNT_ADD(rforkpages, p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize); } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ EVENTHANDLER_INVOKE(process_fork, p1, p2, flags); /* * Set the child start time and mark the process as being complete.
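/*
 * The per-flavor counters bumped above are exported through the vm.stats.vm
 * sysctl tree, so the fork/vfork/rfork/kthread classification can be checked
 * from userland.  The OID names below are the ones this era's vm_meter.c is
 * expected to register; treat them as assumptions if your tree differs:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	const char *oids[] = {
		"vm.stats.vm.v_forks", "vm.stats.vm.v_vforks",
		"vm.stats.vm.v_rforks", "vm.stats.vm.v_kthreads",
	};

	for (unsigned i = 0; i < sizeof(oids) / sizeof(oids[0]); i++) {
		u_int val;
		size_t len = sizeof(val);

		if (sysctlbyname(oids[i], &val, &len, NULL, 0) == 0)
			printf("%s = %u\n", oids[i], val);
	}
	return (0);
}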
*/ microuptime(&p2->p_stats->p_start); mtx_lock_spin(&sched_lock); p2->p_state = PRS_NORMAL; /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ if ((flags & RFSTOPPED) == 0) { TD_SET_CAN_RUN(td2); sched_add(td2, SRQ_BORING); } mtx_unlock_spin(&sched_lock); /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * Tell any interested parties about the new process. */ KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * If other threads are waiting, let them continue now. */ if (p1->p_flag & P_HADTHREADS) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } /* * Return child proc pointer to parent. */ *procp = p2; return (0); fail: sx_sunlock(&proctree_lock); if (ppsratecheck(&lastfail, &curfail, 1)) printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n", td->td_ucred->cr_ruid); sx_xunlock(&allproc_lock); #ifdef MAC mac_destroy_proc(newproc); #endif #ifdef AUDIT audit_proc_free(newproc); #endif uma_zfree(proc_zone, newproc); if (p1->p_flag & P_HADTHREADS) { PROC_LOCK(p1); thread_single_end(); PROC_UNLOCK(p1); } pause("fork", hz / 2); return (error); } /* * Handle the return of a child process from fork1(). This function * is called from the MD fork_trampoline() entry point. */ void fork_exit(callout, arg, frame) void (*callout)(void *, struct trapframe *); void *arg; struct trapframe *frame; { struct proc *p; struct thread *td; /* * Finish setting up thread glue so that it begins execution in a * non-nested critical section with sched_lock held but not recursed. */ td = curthread; p = td->td_proc; td->td_oncpu = PCPU_GET(cpuid); KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new")); sched_lock.mtx_lock = (uintptr_t)td; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)", td, td->td_sched, p->p_pid, p->p_comm); /* * Processes normally resume in mi_switch() after being * cpu_switch()'ed to, but when children start up they arrive here * instead, so we must do much the same things as mi_switch() would. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } td = curthread; mtx_unlock_spin(&sched_lock); /* * cpu_set_fork_handler intercepts this function call to * have this call a non-return function to stay in kernel mode. * initproc has its own fork handler, but it does return. */ KASSERT(callout != NULL, ("NULL callout in fork_exit")); callout(arg, frame); /* * Check if a kernel thread misbehaved and returned from its main * function. */ if (p->p_flag & P_KTHREAD) { printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n", p->p_comm, p->p_pid); kthread_exit(0); } mtx_assert(&Giant, MA_NOTOWNED); EVENTHANDLER_INVOKE(schedtail, p); } /* * Simplified back end of syscall(), used when returning from fork() * directly into user mode. Giant is not held on entry, and must not * be held on return. This function is passed in to fork_exit() as the * first parameter and is called when returning to a new userland process. 
*/ void fork_return(td, frame) struct thread *td; struct trapframe *frame; { userret(td, frame); #ifdef KTRACE if (KTRPOINT(td, KTR_SYSRET)) ktrsysret(SYS_fork, 0, 0); #endif mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/kern_malloc.c =================================================================== --- head/sys/kern/kern_malloc.c (revision 169666) +++ head/sys/kern/kern_malloc.c (revision 169667) @@ -1,894 +1,895 @@ /*- * Copyright (c) 1987, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2005-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 */ /* * Kernel malloc(9) implementation -- general purpose kernel memory allocator * based on memory types. Back end is implemented using the UMA(9) zone * allocator. A set of fixed-size buckets are used for smaller allocations, * and a special UMA allocation interface is used for larger allocations. * Callers declare memory types, and statistics are maintained independently * for each memory type. Statistics are maintained per-CPU for performance * reasons. See malloc(9) and comments in malloc.h for a detailed * description. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_MEMGUARD #include #endif #ifdef DEBUG_REDZONE #include #endif #if defined(INVARIANTS) && defined(__i386__) #include #endif #include /* * When realloc() is called, if the new size is sufficiently smaller than * the old size, realloc() will allocate a new, smaller block to avoid * wasting memory. 'Sufficiently smaller' is defined as: newsize <= * oldsize / 2^n, where REALLOC_FRACTION defines the value of 'n'. */ #ifndef REALLOC_FRACTION #define REALLOC_FRACTION 1 /* new block if <= half the size */ #endif /* * Centrally define some common malloc types. 
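/*
 * REALLOC_FRACTION above controls when realloc() (further below) keeps the
 * existing block: the request must still fit and must be larger than half
 * of the current allocation, or the allocation must already be the smallest
 * bucket.  A stand-alone restatement of that test, with MINALLOCSIZE
 * assumed to be the 16-byte bucket:
 */
#include <stdbool.h>
#include <stdio.h>

#define	REALLOC_FRACTION	1
#define	MINALLOCSIZE		16

static bool
realloc_reuses(unsigned long newsize, unsigned long alloc)
{
	return (newsize <= alloc &&
	    (newsize > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE));
}

int
main(void)
{
	printf("%d\n", realloc_reuses(700, 1024));	/* 1: shrink in place */
	printf("%d\n", realloc_reuses(512, 1024));	/* 0: move to a smaller block */
	printf("%d\n", realloc_reuses(4, 16));		/* 1: already minimal */
	return (0);
}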
*/ MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches"); MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory"); MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers"); MALLOC_DEFINE(M_IP6OPT, "ip6opt", "IPv6 options"); MALLOC_DEFINE(M_IP6NDP, "ip6ndp", "IPv6 Neighbor Discovery"); static void kmeminit(void *); SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) static MALLOC_DEFINE(M_FREE, "free", "should be on free list"); static struct malloc_type *kmemstatistics; static char *kmembase; static char *kmemlimit; static int kmemcount; #define KMEM_ZSHIFT 4 #define KMEM_ZBASE 16 #define KMEM_ZMASK (KMEM_ZBASE - 1) #define KMEM_ZMAX PAGE_SIZE #define KMEM_ZSIZE (KMEM_ZMAX >> KMEM_ZSHIFT) static u_int8_t kmemsize[KMEM_ZSIZE + 1]; /* * Small malloc(9) memory allocations are allocated from a set of UMA buckets * of various sizes. * * XXX: The comment here used to read "These won't be powers of two for * long." It's possible that a significant amount of wasted memory could be * recovered by tuning the sizes of these buckets. */ struct { int kz_size; char *kz_name; uma_zone_t kz_zone; } kmemzones[] = { {16, "16", NULL}, {32, "32", NULL}, {64, "64", NULL}, {128, "128", NULL}, {256, "256", NULL}, {512, "512", NULL}, {1024, "1024", NULL}, {2048, "2048", NULL}, {4096, "4096", NULL}, #if PAGE_SIZE > 4096 {8192, "8192", NULL}, #if PAGE_SIZE > 8192 {16384, "16384", NULL}, #if PAGE_SIZE > 16384 {32768, "32768", NULL}, #if PAGE_SIZE > 32768 {65536, "65536", NULL}, #if PAGE_SIZE > 65536 #error "Unsupported PAGE_SIZE" #endif /* 65536 */ #endif /* 32768 */ #endif /* 16384 */ #endif /* 8192 */ #endif /* 4096 */ {0, NULL}, }; /* * Zone to allocate malloc type descriptions from. For ABI reasons, memory * types are described by a data structure passed by the declaring code, but * the malloc(9) implementation has its own data structure describing the * type and statistics. This permits the malloc(9)-internal data structures * to be modified without breaking binary-compiled kernel modules that * declare malloc types. */ static uma_zone_t mt_zone; u_int vm_kmem_size; SYSCTL_UINT(_vm, OID_AUTO, kmem_size, CTLFLAG_RD, &vm_kmem_size, 0, "Size of kernel memory"); u_int vm_kmem_size_min; SYSCTL_UINT(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RD, &vm_kmem_size_min, 0, "Minimum size of kernel memory"); u_int vm_kmem_size_max; SYSCTL_UINT(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RD, &vm_kmem_size_max, 0, "Maximum size of kernel memory"); u_int vm_kmem_size_scale; SYSCTL_UINT(_vm, OID_AUTO, kmem_size_scale, CTLFLAG_RD, &vm_kmem_size_scale, 0, "Scale factor for kernel memory size"); /* * The malloc_mtx protects the kmemstatistics linked list. */ struct mtx malloc_mtx; #ifdef MALLOC_PROFILE uint64_t krequests[KMEM_ZSIZE + 1]; static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS); #endif static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS); /* * time_uptime of the last malloc(9) failure (induced or real). */ static time_t t_malloc_fail; /* * malloc(9) fault injection -- cause malloc failures every (n) mallocs when * the caller specifies M_NOWAIT. If set to 0, no failures are caused. 
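/*
 * How a request size reaches one of the kmemzones[] buckets above: malloc()
 * (below) rounds the size up to a multiple of KMEM_ZBASE and indexes the
 * kmemsize[] table that kmeminit() fills in.  This sketch gets the same
 * answer with a scan instead of the table, assuming 4 KB pages:
 */
#include <stdio.h>

#define	KMEM_ZBASE	16
#define	KMEM_ZMASK	(KMEM_ZBASE - 1)

static const int bucket_sizes[] =
    { 16, 32, 64, 128, 256, 512, 1024, 2048, 4096 };

static int
bucket_for(unsigned long size)
{
	if (size & KMEM_ZMASK)
		size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
	for (unsigned i = 0; i < sizeof(bucket_sizes) / sizeof(bucket_sizes[0]); i++)
		if (size <= (unsigned long)bucket_sizes[i])
			return (bucket_sizes[i]);
	return (-1);		/* larger than a page: uma_large_malloc() path */
}

int
main(void)
{
	printf("malloc(1)   -> %4d-byte zone\n", bucket_for(1));	/* 16 */
	printf("malloc(100) -> %4d-byte zone\n", bucket_for(100));	/* 128 */
	printf("malloc(513) -> %4d-byte zone\n", bucket_for(513));	/* 1024 */
	return (0);
}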
*/ #ifdef MALLOC_MAKE_FAILURES SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0, "Kernel malloc debugging options"); static int malloc_failure_rate; static int malloc_nowait_count; static int malloc_failure_count; SYSCTL_INT(_debug_malloc, OID_AUTO, failure_rate, CTLFLAG_RW, &malloc_failure_rate, 0, "Every (n) mallocs with M_NOWAIT will fail"); TUNABLE_INT("debug.malloc.failure_rate", &malloc_failure_rate); SYSCTL_INT(_debug_malloc, OID_AUTO, failure_count, CTLFLAG_RD, &malloc_failure_count, 0, "Number of imposed M_NOWAIT malloc failures"); #endif int malloc_last_fail(void) { return (time_uptime - t_malloc_fail); } /* * An allocation has succeeded -- update malloc type statistics for the * amount of bucket size. Occurs within a critical section so that the * thread isn't preempted and doesn't migrate while updating per-CPU * statistics. */ static void malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size, int zindx) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; critical_enter(); mtip = mtp->ks_handle; mtsp = &mtip->mti_stats[curcpu]; if (size > 0) { mtsp->mts_memalloced += size; mtsp->mts_numallocs++; } if (zindx != -1) mtsp->mts_size |= 1 << zindx; critical_exit(); } void malloc_type_allocated(struct malloc_type *mtp, unsigned long size) { if (size > 0) malloc_type_zone_allocated(mtp, size, -1); } /* * A free operation has occurred -- update malloc type statistics for the * amount of the bucket size. Occurs within a critical section so that the * thread isn't preempted and doesn't migrate while updating per-CPU * statistics. */ void malloc_type_freed(struct malloc_type *mtp, unsigned long size) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; critical_enter(); mtip = mtp->ks_handle; mtsp = &mtip->mti_stats[curcpu]; mtsp->mts_memfreed += size; mtsp->mts_numfrees++; critical_exit(); } /* * malloc: * * Allocate a block of memory. * * If M_NOWAIT is set, this routine will not block and return NULL if * the allocation fails. */ void * malloc(unsigned long size, struct malloc_type *mtp, int flags) { int indx; caddr_t va; uma_zone_t zone; uma_keg_t keg; #if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE) unsigned long osize = size; #endif #ifdef INVARIANTS /* * Check that exactly one of M_WAITOK or M_NOWAIT is specified.
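/*
 * The debug.malloc.failure_rate knob above makes every Nth M_NOWAIT
 * allocation fail; the actual check sits in malloc() just below.  A
 * single-threaded model of that arithmetic (the kernel uses
 * atomic_add_int() because many CPUs allocate concurrently):
 */
#include <stdbool.h>
#include <stdio.h>

static int failure_rate = 4;		/* debug.malloc.failure_rate */
static int nowait_count;
static int failure_count;

static bool
inject_failure(void)
{
	if (failure_rate == 0)
		return (false);
	if (++nowait_count % failure_rate == 0) {
		failure_count++;
		return (true);
	}
	return (false);
}

int
main(void)
{
	for (int i = 1; i <= 10; i++)
		if (inject_failure())
			printf("M_NOWAIT allocation %d forced to fail\n", i);
	return (0);			/* reports allocations 4 and 8 */
}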
*/ indx = flags & (M_WAITOK | M_NOWAIT); if (indx != M_NOWAIT && indx != M_WAITOK) { static struct timeval lasterr; static int curerr, once; if (once == 0 && ppsratecheck(&lasterr, &curerr, 1)) { printf("Bad malloc flags: %x\n", indx); kdb_backtrace(); flags |= M_WAITOK; once++; } } #endif #if 0 if (size == 0) kdb_enter("zero size malloc"); #endif #ifdef MALLOC_MAKE_FAILURES if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) { atomic_add_int(&malloc_nowait_count, 1); if ((malloc_nowait_count % malloc_failure_rate) == 0) { atomic_add_int(&malloc_failure_count, 1); t_malloc_fail = time_uptime; return (NULL); } } #endif if (flags & M_WAITOK) KASSERT(curthread->td_intr_nesting_level == 0, ("malloc(M_WAITOK) in interrupt context")); #ifdef DEBUG_MEMGUARD if (memguard_cmp(mtp)) return memguard_alloc(size, flags); #endif #ifdef DEBUG_REDZONE size = redzone_size_ntor(size); #endif if (size <= KMEM_ZMAX) { if (size & KMEM_ZMASK) size = (size & ~KMEM_ZMASK) + KMEM_ZBASE; indx = kmemsize[size >> KMEM_ZSHIFT]; zone = kmemzones[indx].kz_zone; keg = zone->uz_keg; #ifdef MALLOC_PROFILE krequests[size >> KMEM_ZSHIFT]++; #endif va = uma_zalloc(zone, flags); if (va != NULL) size = keg->uk_size; malloc_type_zone_allocated(mtp, va == NULL ? 0 : size, indx); } else { size = roundup(size, PAGE_SIZE); zone = NULL; keg = NULL; va = uma_large_malloc(size, flags); malloc_type_allocated(mtp, va == NULL ? 0 : size); } if (flags & M_WAITOK) KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL")); else if (va == NULL) t_malloc_fail = time_uptime; #ifdef DIAGNOSTIC if (va != NULL && !(flags & M_ZERO)) { memset(va, 0x70, osize); } #endif #ifdef DEBUG_REDZONE if (va != NULL) va = redzone_setup(va, osize); #endif return ((void *) va); } /* * free: * * Free a block of memory allocated by malloc. * * This routine may not block. */ void free(void *addr, struct malloc_type *mtp) { uma_slab_t slab; u_long size; /* free(NULL, ...) does nothing */ if (addr == NULL) return; #ifdef DEBUG_MEMGUARD if (memguard_cmp(mtp)) { memguard_free(addr); return; } #endif #ifdef DEBUG_REDZONE redzone_check(addr); addr = redzone_addr_ntor(addr); #endif size = 0; slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK)); if (slab == NULL) panic("free: address %p(%p) has not been allocated.\n", addr, (void *)((u_long)addr & (~UMA_SLAB_MASK))); if (!(slab->us_flags & UMA_SLAB_MALLOC)) { #ifdef INVARIANTS struct malloc_type **mtpp = addr; #endif size = slab->us_keg->uk_size; #ifdef INVARIANTS /* * Cache a pointer to the malloc_type that most recently freed * this memory here. This way we know who is most likely to * have stepped on it later. * * This code assumes that size is a multiple of 8 bytes for * 64 bit machines */ mtpp = (struct malloc_type **) ((unsigned long)mtpp & ~UMA_ALIGN_PTR); mtpp += (size - sizeof(struct malloc_type *)) / sizeof(struct malloc_type *); *mtpp = mtp; #endif uma_zfree_arg(LIST_FIRST(&slab->us_keg->uk_zones), addr, slab); } else { size = slab->us_size; uma_large_free(slab); } malloc_type_freed(mtp, size); } /* * realloc: change the size of a memory block */ void * realloc(void *addr, unsigned long size, struct malloc_type *mtp, int flags) { uma_slab_t slab; unsigned long alloc; void *newaddr; /* realloc(NULL, ...) is equivalent to malloc(...) */ if (addr == NULL) return (malloc(size, mtp, flags)); /* * XXX: Should report free of old memory and alloc of new memory to * per-CPU stats. 
*/ #ifdef DEBUG_MEMGUARD if (memguard_cmp(mtp)) { slab = NULL; alloc = size; } else { #endif #ifdef DEBUG_REDZONE slab = NULL; alloc = redzone_get_size(addr); #else slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK)); /* Sanity check */ KASSERT(slab != NULL, ("realloc: address %p out of range", (void *)addr)); /* Get the size of the original block */ if (!(slab->us_flags & UMA_SLAB_MALLOC)) alloc = slab->us_keg->uk_size; else alloc = slab->us_size; /* Reuse the original block if appropriate */ if (size <= alloc && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE)) return (addr); #endif /* !DEBUG_REDZONE */ #ifdef DEBUG_MEMGUARD } #endif /* Allocate a new, bigger (or smaller) block */ if ((newaddr = malloc(size, mtp, flags)) == NULL) return (NULL); /* Copy over original contents */ bcopy(addr, newaddr, min(size, alloc)); free(addr, mtp); return (newaddr); } /* * reallocf: same as realloc() but free memory on failure. */ void * reallocf(void *addr, unsigned long size, struct malloc_type *mtp, int flags) { void *mem; if ((mem = realloc(addr, size, mtp, flags)) == NULL) free(addr, mtp); return (mem); } /* * Initialize the kernel memory allocator */ /* ARGSUSED*/ static void kmeminit(void *dummy) { u_int8_t indx; u_long mem_size; int i; mtx_init(&malloc_mtx, "malloc", NULL, MTX_DEF); /* * Try to auto-tune the kernel memory size, so that it is * more applicable for a wider range of machine sizes. * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while * a VM_KMEM_SIZE of 12MB is a fair compromise. The * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space * available, and on an X86 with a total KVA space of 256MB, * try to keep VM_KMEM_SIZE_MAX at 80MB or below. * * Note that the kmem_map is also used by the zone allocator, * so make sure that there is enough space. */ vm_kmem_size = VM_KMEM_SIZE + nmbclusters * PAGE_SIZE; - mem_size = cnt.v_page_count; + mem_size = VMCNT_GET(page_count); #if defined(VM_KMEM_SIZE_SCALE) vm_kmem_size_scale = VM_KMEM_SIZE_SCALE; #endif TUNABLE_INT_FETCH("vm.kmem_size_scale", &vm_kmem_size_scale); if (vm_kmem_size_scale > 0 && (mem_size / vm_kmem_size_scale) > (vm_kmem_size / PAGE_SIZE)) vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE; #if defined(VM_KMEM_SIZE_MIN) vm_kmem_size_min = VM_KMEM_SIZE_MIN; #endif TUNABLE_INT_FETCH("vm.kmem_size_min", &vm_kmem_size_min); if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) { vm_kmem_size = vm_kmem_size_min; } #if defined(VM_KMEM_SIZE_MAX) vm_kmem_size_max = VM_KMEM_SIZE_MAX; #endif TUNABLE_INT_FETCH("vm.kmem_size_max", &vm_kmem_size_max); if (vm_kmem_size_max > 0 && vm_kmem_size >= vm_kmem_size_max) vm_kmem_size = vm_kmem_size_max; /* Allow final override from the kernel environment */ #ifndef BURN_BRIDGES if (TUNABLE_INT_FETCH("kern.vm.kmem.size", &vm_kmem_size) != 0) printf("kern.vm.kmem.size is now called vm.kmem_size!\n"); #endif TUNABLE_INT_FETCH("vm.kmem_size", &vm_kmem_size); /* * Limit kmem virtual size to twice the physical memory. * This allows for kmem map sparseness, but limits the size * to something sane. Be careful to not overflow the 32bit * ints while doing the check. */ - if (((vm_kmem_size / 2) / PAGE_SIZE) > cnt.v_page_count) - vm_kmem_size = 2 * cnt.v_page_count * PAGE_SIZE; + if (((vm_kmem_size / 2) / PAGE_SIZE) > VMCNT_GET(page_count)) + vm_kmem_size = 2 * VMCNT_GET(page_count) * PAGE_SIZE; /* * Tune settings based on the kernel map's size at this time. 
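/*
 * A worked example of the kmem auto-tuning above, with assumed inputs
 * (1 GB of RAM, 4 KB pages, a VM_KMEM_SIZE of 12 MB, a scale factor of 3,
 * and no loader tunables or explicit min/max):
 */
#include <stdio.h>

int
main(void)
{
	unsigned long page_size = 4096;
	unsigned long page_count = (1UL << 30) / page_size;	/* VMCNT_GET(page_count) */
	unsigned long vm_kmem_size = 12UL << 20;		/* VM_KMEM_SIZE */
	unsigned long scale = 3;				/* VM_KMEM_SIZE_SCALE */

	/* Grow the static default when physical memory / scale exceeds it. */
	if (scale > 0 && page_count / scale > vm_kmem_size / page_size)
		vm_kmem_size = page_count / scale * page_size;

	/* Cap at twice physical memory, as in the check above. */
	if (vm_kmem_size / 2 / page_size > page_count)
		vm_kmem_size = 2 * page_count * page_size;

	printf("vm_kmem_size = %lu MB\n", vm_kmem_size >> 20);	/* 341 */
	return (0);
}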
*/ init_param3(vm_kmem_size / PAGE_SIZE); kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, (vm_offset_t *)&kmemlimit, vm_kmem_size); kmem_map->system_map = 1; #ifdef DEBUG_MEMGUARD /* * Initialize MemGuard if support compiled in. MemGuard is a * replacement allocator used for detecting tamper-after-free * scenarios as they occur. It is only used for debugging. */ vm_memguard_divisor = 10; TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor); /* Pick a conservative value if provided value sucks. */ if ((vm_memguard_divisor <= 0) || ((vm_kmem_size / vm_memguard_divisor) == 0)) vm_memguard_divisor = 10; memguard_init(kmem_map, vm_kmem_size / vm_memguard_divisor); #endif uma_startup2(); mt_zone = uma_zcreate("mt_zone", sizeof(struct malloc_type_internal), #ifdef INVARIANTS mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, #else NULL, NULL, NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_MALLOC); for (i = 0, indx = 0; kmemzones[indx].kz_size != 0; indx++) { int size = kmemzones[indx].kz_size; char *name = kmemzones[indx].kz_name; kmemzones[indx].kz_zone = uma_zcreate(name, size, #ifdef INVARIANTS mtrash_ctor, mtrash_dtor, mtrash_init, mtrash_fini, #else NULL, NULL, NULL, NULL, #endif UMA_ALIGN_PTR, UMA_ZONE_MALLOC); for (;i <= size; i+= KMEM_ZBASE) kmemsize[i >> KMEM_ZSHIFT] = indx; } } void malloc_init(void *data) { struct malloc_type_internal *mtip; struct malloc_type *mtp; - KASSERT(cnt.v_page_count != 0, ("malloc_register before vm_init")); + KASSERT(VMCNT_GET(page_count) != 0, + ("malloc_register before vm_init")); mtp = data; mtip = uma_zalloc(mt_zone, M_WAITOK | M_ZERO); mtp->ks_handle = mtip; mtx_lock(&malloc_mtx); mtp->ks_next = kmemstatistics; kmemstatistics = mtp; kmemcount++; mtx_unlock(&malloc_mtx); } void malloc_uninit(void *data) { struct malloc_type_internal *mtip; struct malloc_type_stats *mtsp; struct malloc_type *mtp, *temp; uma_slab_t slab; long temp_allocs, temp_bytes; int i; mtp = data; KASSERT(mtp->ks_handle != NULL, ("malloc_deregister: cookie NULL")); mtx_lock(&malloc_mtx); mtip = mtp->ks_handle; mtp->ks_handle = NULL; if (mtp != kmemstatistics) { for (temp = kmemstatistics; temp != NULL; temp = temp->ks_next) { if (temp->ks_next == mtp) temp->ks_next = mtp->ks_next; } } else kmemstatistics = mtp->ks_next; kmemcount--; mtx_unlock(&malloc_mtx); /* * Look for memory leaks. 
*/ temp_allocs = temp_bytes = 0; for (i = 0; i < MAXCPU; i++) { mtsp = &mtip->mti_stats[i]; temp_allocs += mtsp->mts_numallocs; temp_allocs -= mtsp->mts_numfrees; temp_bytes += mtsp->mts_memalloced; temp_bytes -= mtsp->mts_memfreed; } if (temp_allocs > 0 || temp_bytes > 0) { printf("Warning: memory type %s leaked memory on destroy " "(%ld allocations, %ld bytes leaked).\n", mtp->ks_shortdesc, temp_allocs, temp_bytes); } slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK)); uma_zfree_arg(mt_zone, mtip, slab); } struct malloc_type * malloc_desc2type(const char *desc) { struct malloc_type *mtp; mtx_assert(&malloc_mtx, MA_OWNED); for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { if (strcmp(mtp->ks_shortdesc, desc) == 0) return (mtp); } return (NULL); } static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS) { struct malloc_type_stream_header mtsh; struct malloc_type_internal *mtip; struct malloc_type_header mth; struct malloc_type *mtp; int buflen, count, error, i; struct sbuf sbuf; char *buffer; mtx_lock(&malloc_mtx); restart: mtx_assert(&malloc_mtx, MA_OWNED); count = kmemcount; mtx_unlock(&malloc_mtx); buflen = sizeof(mtsh) + count * (sizeof(mth) + sizeof(struct malloc_type_stats) * MAXCPU) + 1; buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); mtx_lock(&malloc_mtx); if (count < kmemcount) { free(buffer, M_TEMP); goto restart; } sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN); /* * Insert stream header. */ bzero(&mtsh, sizeof(mtsh)); mtsh.mtsh_version = MALLOC_TYPE_STREAM_VERSION; mtsh.mtsh_maxcpus = MAXCPU; mtsh.mtsh_count = kmemcount; if (sbuf_bcat(&sbuf, &mtsh, sizeof(mtsh)) < 0) { mtx_unlock(&malloc_mtx); error = ENOMEM; goto out; } /* * Insert alternating sequence of type headers and type statistics. */ for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { mtip = (struct malloc_type_internal *)mtp->ks_handle; /* * Insert type header. */ bzero(&mth, sizeof(mth)); strlcpy(mth.mth_name, mtp->ks_shortdesc, MALLOC_MAX_NAME); if (sbuf_bcat(&sbuf, &mth, sizeof(mth)) < 0) { mtx_unlock(&malloc_mtx); error = ENOMEM; goto out; } /* * Insert type statistics for each CPU. 
*/ for (i = 0; i < MAXCPU; i++) { if (sbuf_bcat(&sbuf, &mtip->mti_stats[i], sizeof(mtip->mti_stats[i])) < 0) { mtx_unlock(&malloc_mtx); error = ENOMEM; goto out; } } } mtx_unlock(&malloc_mtx); sbuf_finish(&sbuf); error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); out: sbuf_delete(&sbuf); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_kern, OID_AUTO, malloc_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_kern_malloc_stats, "s,malloc_type_ustats", "Return malloc types"); SYSCTL_INT(_kern, OID_AUTO, malloc_count, CTLFLAG_RD, &kmemcount, 0, "Count of kernel malloc types"); #ifdef DDB DB_SHOW_COMMAND(malloc, db_show_malloc) { struct malloc_type_internal *mtip; struct malloc_type *mtp; u_int64_t allocs, frees; u_int64_t alloced, freed; int i; db_printf("%18s %12s %12s %12s\n", "Type", "InUse", "MemUse", "Requests"); for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) { mtip = (struct malloc_type_internal *)mtp->ks_handle; allocs = 0; frees = 0; alloced = 0; freed = 0; for (i = 0; i < MAXCPU; i++) { allocs += mtip->mti_stats[i].mts_numallocs; frees += mtip->mti_stats[i].mts_numfrees; alloced += mtip->mti_stats[i].mts_memalloced; freed += mtip->mti_stats[i].mts_memfreed; } db_printf("%18s %12ju %12juK %12ju\n", mtp->ks_shortdesc, allocs - frees, (alloced - freed + 1023) / 1024, allocs); } } #endif #ifdef MALLOC_PROFILE static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS) { int linesize = 64; struct sbuf sbuf; uint64_t count; uint64_t waste; uint64_t mem; int bufsize; int error; char *buf; int rsize; int size; int i; bufsize = linesize * (KMEM_ZSIZE + 1); bufsize += 128; /* For the stats line */ bufsize += 128; /* For the banner line */ waste = 0; mem = 0; buf = malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); sbuf_new(&sbuf, buf, bufsize, SBUF_FIXEDLEN); sbuf_printf(&sbuf, "\n Size Requests Real Size\n"); for (i = 0; i < KMEM_ZSIZE; i++) { size = i << KMEM_ZSHIFT; rsize = kmemzones[kmemsize[i]].kz_size; count = (long long unsigned)krequests[i]; sbuf_printf(&sbuf, "%6d%28llu%11d\n", size, (unsigned long long)count, rsize); if ((rsize * count) > (size * count)) waste += (rsize * count) - (size * count); mem += (rsize * count); } sbuf_printf(&sbuf, "\nTotal memory used:\t%30llu\nTotal Memory wasted:\t%30llu\n", (unsigned long long)mem, (unsigned long long)waste); sbuf_finish(&sbuf); error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); sbuf_delete(&sbuf); free(buf, M_TEMP); return (error); } SYSCTL_OID(_kern, OID_AUTO, mprof, CTLTYPE_STRING|CTLFLAG_RD, NULL, 0, sysctl_kern_mprof, "A", "Malloc Profiling"); #endif /* MALLOC_PROFILE */ Index: head/sys/kern/kern_mib.c =================================================================== --- head/sys/kern/kern_mib.c (revision 169666) +++ head/sys/kern/kern_mib.c (revision 169667) @@ -1,413 +1,413 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD * project, to make these variables more userfriendly. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include "opt_config.h" #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, "Sysctl internal magic"); SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, "High kernel, proc, limits &c"); SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, "Virtual memory"); SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, "File system"); SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, "Network, (see socket.h)"); SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, "Debugging"); SYSCTL_NODE(_debug, OID_AUTO, sizeof, CTLFLAG_RW, 0, "Sizeof various things"); SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, "hardware"); SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, "machine dependent"); SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, "user-level"); SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, "p1003_1b, (see p1003_1b.h)"); SYSCTL_NODE(, OID_AUTO, compat, CTLFLAG_RW, 0, "Compatibility code"); SYSCTL_NODE(, OID_AUTO, security, CTLFLAG_RW, 0, "Security"); #ifdef REGRESSION SYSCTL_NODE(, OID_AUTO, regression, CTLFLAG_RW, 0, "Regression test MIB"); #endif SYSCTL_STRING(_kern, OID_AUTO, ident, CTLFLAG_RD, kern_ident, 0, "Kernel identifier"); SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, "Operating system release"); SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, "Operating system revision"); SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, "Kernel version"); SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, "Operating system type"); /* * NOTICE: The *userland* release date is available in * /usr/include/osreldate.h */ extern int osreldate; SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, "Kernel release date"); SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RDTUN, &maxproc, 0, "Maximum number of processes"); SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, CTLFLAG_RW, &maxprocperuid, 0, "Maximum processes allowed per userid"); SYSCTL_INT(_kern, OID_AUTO, maxusers, CTLFLAG_RDTUN, &maxusers, 0, "Hint for kernel tuning"); SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, "Maximum bytes of argument to execve(2)"); SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _POSIX_VERSION, "Version of POSIX 
attempting to comply to"); SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, "Maximum number of groups a user can belong to"); SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, "Whether job control is available"); #ifdef _POSIX_SAVED_IDS SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, "Whether saved set-group/user ID is available"); #else SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, "Whether saved set-group/user ID is available"); #endif char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, CTLFLAG_RW, kernelname, sizeof kernelname, "Name of kernel file booted"); SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, "Number of active CPUs"); SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, "System byte order"); SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, "System memory page size"); static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { u_long val; val = ctob(physmem); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG | CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "LU", ""); static int sysctl_hw_realmem(SYSCTL_HANDLER_ARGS) { u_long val; val = ctob(realmem); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_REALMEM, realmem, CTLTYPE_ULONG | CTLFLAG_RD, 0, 0, sysctl_hw_realmem, "LU", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { u_long val; - val = ctob(physmem - cnt.v_wire_count); + val = ctob(physmem - VMCNT_GET(wire_count)); return (sysctl_handle_long(oidp, &val, 0, req)); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG | CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "LU", ""); SYSCTL_ULONG(_hw, OID_AUTO, availpages, CTLFLAG_RD, &physmem, 0, ""); static char machine_arch[] = MACHINE_ARCH; SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, machine_arch, 0, "System architecture"); char hostname[MAXHOSTNAMELEN]; static int sysctl_hostname(SYSCTL_HANDLER_ARGS) { struct prison *pr; char tmphostname[MAXHOSTNAMELEN]; int error; pr = req->td->td_ucred->cr_prison; if (pr != NULL) { if (!jail_set_hostname_allowed && req->newptr) return (EPERM); /* * Process is in jail, so make a local copy of jail * hostname to get/set so we don't have to hold the jail * mutex during the sysctl copyin/copyout activities. */ mtx_lock(&pr->pr_mtx); bcopy(pr->pr_host, tmphostname, MAXHOSTNAMELEN); mtx_unlock(&pr->pr_mtx); error = sysctl_handle_string(oidp, tmphostname, sizeof pr->pr_host, req); if (req->newptr != NULL && error == 0) { /* * Copy the locally set hostname to the jail, if * appropriate. 
*/ mtx_lock(&pr->pr_mtx); bcopy(tmphostname, pr->pr_host, MAXHOSTNAMELEN); mtx_unlock(&pr->pr_mtx); } } else error = sysctl_handle_string(oidp, hostname, sizeof hostname, req); return (error); } SYSCTL_PROC(_kern, KERN_HOSTNAME, hostname, CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_hostname, "A", "Hostname"); static int regression_securelevel_nonmonotonic = 0; #ifdef REGRESSION SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW, ®ression_securelevel_nonmonotonic, 0, "securelevel may be lowered"); #endif int securelevel = -1; static struct mtx securelevel_mtx; MTX_SYSINIT(securelevel_lock, &securelevel_mtx, "securelevel mutex lock", MTX_DEF); static int sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) { struct prison *pr; int error, level; pr = req->td->td_ucred->cr_prison; /* * If the process is in jail, return the maximum of the global and * local levels; otherwise, return the global level. Perform a * lockless read since the securelevel is an integer. */ if (pr != NULL) level = imax(securelevel, pr->pr_securelevel); else level = securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); /* * Permit update only if the new securelevel exceeds the * global level, and local level if any. */ if (pr != NULL) { mtx_lock(&pr->pr_mtx); if (!regression_securelevel_nonmonotonic && (level < imax(securelevel, pr->pr_securelevel))) { mtx_unlock(&pr->pr_mtx); return (EPERM); } pr->pr_securelevel = level; mtx_unlock(&pr->pr_mtx); } else { mtx_lock(&securelevel_mtx); if (!regression_securelevel_nonmonotonic && (level < securelevel)) { mtx_unlock(&securelevel_mtx); return (EPERM); } securelevel = level; mtx_unlock(&securelevel_mtx); } return (error); } SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl, "I", "Current secure level"); #ifdef INCLUDE_CONFIG_FILE /* Actual kernel configuration options. */ extern char kernconfstring[]; static int sysctl_kern_config(SYSCTL_HANDLER_ARGS) { struct sbuf *sb; int error; char *p; sb = sbuf_new(NULL, NULL, 2048, SBUF_AUTOEXTEND); if (sb == NULL) return (ENOMEM); sbuf_clear(sb); p = kernconfstring; if (p == NULL || *p == '\0') { sbuf_printf(sb, "No kernel configuration\n"); } else { sbuf_printf(sb, "%s", p); } sbuf_trim(sb); sbuf_putc(sb, '\n'); sbuf_finish(sb); error = sysctl_handle_string(oidp, sbuf_data(sb), sbuf_len(sb), req); if (error) return (error); sbuf_delete(sb); return (error); } SYSCTL_PROC(_kern, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RW, 0, 0, sysctl_kern_config, "", "Kernel configuration file"); #endif char domainname[MAXHOSTNAMELEN]; SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, &domainname, sizeof(domainname), "Name of the current YP/NIS domain"); u_long hostid; SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID"); char hostuuid[64] = "00000000-0000-0000-0000-000000000000"; SYSCTL_STRING(_kern, KERN_HOSTUUID, hostuuid, CTLFLAG_RW, hostuuid, sizeof(hostuuid), "Host UUID"); /* * This is really cheating. These actually live in the libc, something * which I'm not quite sure is a good idea anyway, but in order for * getnext and friends to actually work, we define dummies here. 
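/*
 * sysctl_kern_securelvl() above only lets the level move upward (unless the
 * REGRESSION knob is enabled).  A small userland probe of that policy;
 * lowering the level is expected to fail with EPERM, and the value read
 * back depends on whether the caller runs inside a jail:
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	int level, lower;
	size_t len = sizeof(level);

	if (sysctlbyname("kern.securelevel", &level, &len, NULL, 0) != 0) {
		perror("kern.securelevel");
		return (1);
	}
	printf("current securelevel: %d\n", level);

	lower = level - 1;
	if (sysctlbyname("kern.securelevel", NULL, NULL, &lower,
	    sizeof(lower)) != 0)
		printf("lowering to %d refused: %s\n", lower, strerror(errno));
	return (0);
}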
*/ SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "PATH that finds all the standard utilities"); SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, "Max ibase/obase values in bc(1)"); SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, "Max array size in bc(1)"); SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, "Max scale value in bc(1)"); SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, "Max string length in bc(1)"); SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, "Maximum number of weights assigned to an LC_COLLATE locale entry"); SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, "Max length (bytes) of a text-processing utility's input line"); SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, "Maximum number of repeats of a regexp permitted"); SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, "The version of POSIX 1003.2 with which the system attempts to comply"); SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, "Whether C development supports the C bindings option"); SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, "Whether system supports the C development utilities option"); SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN development utilities"); SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, "Whether system supports FORTRAN runtime utilities"); SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, "Whether system supports creation of locales"); SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, "Whether system supports software development utilities"); SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, "Whether system supports the user portability utilities"); SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, "Min Maximum number of streams a process may have open at one time"); SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, "Min Maximum number of types supported for timezone names"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, vnode, CTLFLAG_RD, 0, sizeof(struct vnode), "sizeof(struct vnode)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, proc, CTLFLAG_RD, 0, sizeof(struct proc), "sizeof(struct proc)"); #include #include SYSCTL_INT(_debug_sizeof, OID_AUTO, bio, CTLFLAG_RD, 0, sizeof(struct bio), "sizeof(struct bio)"); SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), "sizeof(struct buf)"); #include SYSCTL_INT(_debug_sizeof, OID_AUTO, kinfo_proc, CTLFLAG_RD, 0, sizeof(struct kinfo_proc), "sizeof(struct kinfo_proc)"); /* XXX compatibility, remove for 6.0 */ #include #include SYSCTL_INT(_kern, OID_AUTO, fallback_elf_brand, CTLFLAG_RW, &__elfN(fallback_brand), sizeof(__elfN(fallback_brand)), "compatibility for kern.fallback_elf_brand"); Index: head/sys/kern/kern_synch.c =================================================================== --- head/sys/kern/kern_synch.c (revision 169666) +++ head/sys/kern/kern_synch.c (revision 169667) @@ -1,588 +1,588 @@ /*- * Copyright (c) 1982, 1986, 1990, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include static void synch_setup(void *dummy); SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup, NULL) int hogticks; int lbolt; static int pause_wchan; static struct callout loadav_callout; static struct callout lbolt_callout; struct loadavg averunnable = { {0, 0, 0}, FSCALE }; /* load average, of runnable procs */ /* * Constants for averages over 1, 5, and 15 minutes * when sampling at 5 second intervals. */ static fixpt_t cexp[3] = { 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 0.9944598480048967 * FSCALE, /* exp(-1/180) */ }; /* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ static int fscale __unused = FSCALE; SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); static void loadav(void *arg); static void lboltcb(void *arg); void sleepinit(void) { hogticks = (hz / 10) * 2; /* Default only. */ init_sleepqueues(); } /* * General sleep call. Suspends the current thread until a wakeup is * performed on the specified identifier. The thread will then be made * runnable with the specified priority. Sleeps at most timo/hz seconds * (0 means no timeout). If pri includes PCATCH flag, signals are checked * before and after sleeping, else signals are not checked. Returns 0 if * awakened, EWOULDBLOCK if the timeout expires. 
If PCATCH is set and a * signal needs to be delivered, ERESTART is returned if the current system * call should be restarted if possible, and EINTR is returned if the system * call should be interrupted by the signal (return EINTR). * * The lock argument is unlocked before the caller is suspended, and * re-locked before _sleep() returns. If priority includes the PDROP * flag the lock is not re-locked before returning. */ int _sleep(ident, lock, priority, wmesg, timo) void *ident; struct lock_object *lock; int priority, timo; const char *wmesg; { struct thread *td; struct proc *p; struct lock_class *class; int catch, flags, lock_state, pri, rval; WITNESS_SAVE_DECL(lock_witness); td = curthread; p = td->td_proc; #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 0); #endif WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock, "Sleeping on \"%s\"", wmesg); KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL || ident == &lbolt, ("sleeping without a lock")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (lock != NULL) class = LOCK_CLASS(lock); else class = NULL; if (cold) { /* * During autoconfiguration, just return; * don't run any other threads or panic below, * in case this is the idle thread and already asleep. * XXX: this used to do "s = splhigh(); splx(safepri); * splx(s);" to give interrupts a chance, but there is * no way to give interrupts a chance now. */ if (lock != NULL && priority & PDROP) class->lc_unlock(lock); return (0); } catch = priority & PCATCH; rval = 0; /* * If we are already on a sleep queue, then remove us from that * sleep queue first. We have to do this to handle recursive * sleeps. */ if (TD_ON_SLEEPQ(td)) sleepq_remove(td, td->td_wchan); if (ident == &pause_wchan) flags = SLEEPQ_PAUSE; else flags = SLEEPQ_SLEEP; if (catch) flags |= SLEEPQ_INTERRUPTIBLE; sleepq_lock(ident); CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, p->p_comm, wmesg, ident); DROP_GIANT(); if (lock != NULL && !(class->lc_flags & LC_SLEEPABLE)) { WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); } else /* GCC needs to follow the Yellow Brick Road */ lock_state = -1; /* * We put ourselves on the sleep queue and start our timeout * before calling thread_suspend_check, as we could stop there, * and a wakeup or a SIGCONT (or both) could occur while we were * stopped without resuming us. Thus, we must be ready for sleep * when cursig() is called. If the wakeup happens while we're * stopped, then td will no longer be on a sleep queue upon * return from cursig(). */ sleepq_add(ident, ident == &lbolt ? NULL : lock, wmesg, flags, 0); if (timo) sleepq_set_timeout(ident, timo); if (lock != NULL && class->lc_flags & LC_SLEEPABLE) { sleepq_release(ident); WITNESS_SAVE(lock, lock_witness); lock_state = class->lc_unlock(lock); sleepq_lock(ident); } /* * Adjust this thread's priority, if necessary. 
*/ pri = priority & PRIMASK; if (pri != 0 && pri != td->td_priority) { mtx_lock_spin(&sched_lock); sched_prio(td, pri); mtx_unlock_spin(&sched_lock); } if (timo && catch) rval = sleepq_timedwait_sig(ident); else if (timo) rval = sleepq_timedwait(ident); else if (catch) rval = sleepq_wait_sig(ident); else { sleepq_wait(ident); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); if (lock != NULL && !(priority & PDROP)) { class->lc_lock(lock, lock_state); WITNESS_RESTORE(lock, lock_witness); } return (rval); } int msleep_spin(ident, mtx, wmesg, timo) void *ident; struct mtx *mtx; const char *wmesg; int timo; { struct thread *td; struct proc *p; int rval; WITNESS_SAVE_DECL(mtx); td = curthread; p = td->td_proc; KASSERT(mtx != NULL, ("sleeping without a mutex")); KASSERT(p != NULL, ("msleep1")); KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep")); if (cold) { /* * During autoconfiguration, just return; * don't run any other threads or panic below, * in case this is the idle thread and already asleep. * XXX: this used to do "s = splhigh(); splx(safepri); * splx(s);" to give interrupts a chance, but there is * no way to give interrupts a chance now. */ return (0); } sleepq_lock(ident); CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)", td->td_tid, p->p_pid, p->p_comm, wmesg, ident); DROP_GIANT(); mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED); WITNESS_SAVE(&mtx->lock_object, mtx); mtx_unlock_spin(mtx); /* * We put ourselves on the sleep queue and start our timeout. */ sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0); if (timo) sleepq_set_timeout(ident, timo); /* * Can't call ktrace with any spin locks held so it can lock the * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold * any spin lock. Thus, we have to drop the sleepq spin lock while * we handle those requests. This is safe since we have placed our * thread on the sleep queue already. */ #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) { sleepq_release(ident); ktrcsw(1, 0); sleepq_lock(ident); } #endif #ifdef WITNESS sleepq_release(ident); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"", wmesg); sleepq_lock(ident); #endif if (timo) rval = sleepq_timedwait(ident); else { sleepq_wait(ident); rval = 0; } #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 0); #endif PICKUP_GIANT(); mtx_lock_spin(mtx); WITNESS_RESTORE(&mtx->lock_object, mtx); return (rval); } /* * pause() is like tsleep() except that the intention is to not be * explicitly woken up by another thread. Instead, the current thread * simply wishes to sleep until the timeout expires. It is * implemented using a dummy wait channel. */ int pause(wmesg, timo) const char *wmesg; int timo; { KASSERT(timo != 0, ("pause: timeout required")); return (tsleep(&pause_wchan, 0, wmesg, timo)); } /* * Make all threads sleeping on the specified identifier runnable. */ void wakeup(ident) register void *ident; { sleepq_lock(ident); sleepq_broadcast(ident, SLEEPQ_SLEEP, -1, 0); } /* * Make a thread sleeping on the specified identifier runnable. * May wake more than one thread if a target thread is currently * swapped out. */ void wakeup_one(ident) register void *ident; { sleepq_lock(ident); sleepq_signal(ident, SLEEPQ_SLEEP, -1, 0); } /* * The machine independent parts of context switching. 
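/*
 * A hedged sketch, not part of this diff, of the canonical consumer of the
 * primitives above: msleep() drops the given mutex while sleeping and
 * reacquires it before returning, and wakeup() wakes every thread sleeping
 * on the same wait channel (here, the address of sc_ready).  The "softc"
 * names are illustrative only.
 */
struct softc {
	struct mtx	sc_mtx;
	int		sc_ready;
};

static int
wait_for_ready(struct softc *sc)
{
	int error = 0;

	mtx_lock(&sc->sc_mtx);
	while (sc->sc_ready == 0 && error == 0) {
		/* PCATCH lets signals interrupt the sleep; 5*hz timeout. */
		error = msleep(&sc->sc_ready, &sc->sc_mtx, PWAIT | PCATCH,
		    "scrdy", 5 * hz);
	}
	mtx_unlock(&sc->sc_mtx);
	return (error);		/* 0, EWOULDBLOCK, EINTR or ERESTART */
}

static void
mark_ready(struct softc *sc)
{
	mtx_lock(&sc->sc_mtx);
	sc->sc_ready = 1;
	wakeup(&sc->sc_ready);
	mtx_unlock(&sc->sc_mtx);
}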
*/ void mi_switch(int flags, struct thread *newtd) { uint64_t new_switchtime; struct thread *td; struct proc *p; mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); td = curthread; /* XXX */ p = td->td_proc; /* XXX */ KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code")); #ifdef INVARIANTS if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td)) mtx_assert(&Giant, MA_NOTOWNED); #endif KASSERT(td->td_critnest == 1 || (td->td_critnest == 2 && (td->td_owepreempt) && (flags & SW_INVOL) != 0 && newtd == NULL) || panicstr, ("mi_switch: switch in a critical section")); KASSERT((flags & (SW_INVOL | SW_VOL)) != 0, ("mi_switch: switch must be voluntary or involuntary")); KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself")); /* * Don't perform context switches from the debugger. */ if (kdb_active) { mtx_unlock_spin(&sched_lock); kdb_backtrace(); kdb_reenter(); panic("%s: did not reenter debugger", __func__); } if (flags & SW_VOL) p->p_stats->p_ru.ru_nvcsw++; else p->p_stats->p_ru.ru_nivcsw++; /* * Compute the amount of time during which the current * process was running, and add that to its total so far. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); p->p_rux.rux_uticks += td->td_uticks; td->td_uticks = 0; p->p_rux.rux_iticks += td->td_iticks; td->td_iticks = 0; p->p_rux.rux_sticks += td->td_sticks; td->td_sticks = 0; td->td_generation++; /* bump preempt-detect counter */ /* * Check if the process exceeds its cpu resource allocation. If * it reaches the max, arrange to kill the process in ast(). */ if (p->p_cpulimit != RLIM_INFINITY && p->p_rux.rux_runtime >= p->p_cpulimit * cpu_tickrate()) { p->p_sflag |= PS_XCPU; td->td_flags |= TDF_ASTPENDING; } /* * Finish up stats for outgoing thread. */ - cnt.v_swtch++; + VMCNT_ADD(swtch, 1); PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); CTR4(KTR_PROC, "mi_switch: old thread %ld (kse %p, pid %ld, %s)", td->td_tid, td->td_sched, p->p_pid, p->p_comm); #if (KTR_COMPILE & KTR_SCHED) != 0 if (TD_IS_IDLETHREAD(td)) CTR3(KTR_SCHED, "mi_switch: %p(%s) prio %d idle", td, td->td_proc->p_comm, td->td_priority); else if (newtd != NULL) CTR5(KTR_SCHED, "mi_switch: %p(%s) prio %d preempted by %p(%s)", td, td->td_proc->p_comm, td->td_priority, newtd, newtd->td_proc->p_comm); else CTR6(KTR_SCHED, "mi_switch: %p(%s) prio %d inhibit %d wmesg %s lock %s", td, td->td_proc->p_comm, td->td_priority, td->td_inhibitors, td->td_wmesg, td->td_lockname); #endif /* * We call thread_switchout after the KTR_SCHED prints above so kse * selecting a new thread to run does not show up as a preemption. */ #ifdef KSE if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA)) newtd = thread_switchout(td, flags, newtd); #endif sched_switch(td, newtd, flags); CTR3(KTR_SCHED, "mi_switch: running %p(%s) prio %d", td, td->td_proc->p_comm, td->td_priority); CTR4(KTR_PROC, "mi_switch: new thread %ld (kse %p, pid %ld, %s)", td->td_tid, td->td_sched, p->p_pid, p->p_comm); /* * If the last thread was exiting, finish cleaning it up. */ if ((td = PCPU_GET(deadthread))) { PCPU_SET(deadthread, NULL); thread_stash(td); } } /* * Change process state to be runnable, * placing it on the run queue if it is in memory, * and awakening the swapper if it isn't in memory. 
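/*
 * A hedged note on the hunk above: the point of this revision is to stop
 * poking the global vmmeter fields (cnt.v_swtch and friends) directly and
 * to funnel every update through a VMCNT_*() accessor.  The real macro is
 * defined in the vmmeter header touched by this same revision and may
 * differ in detail; one plausible shape, shown only to make the intent of
 * the call-site change readable, is:
 */
#define	VMCNT_ADD(var, delta)	atomic_add_int(&cnt.v_ ## var, (delta))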
*/ void setrunnable(struct thread *td) { struct proc *p; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); switch (p->p_state) { case PRS_ZOMBIE: panic("setrunnable(1)"); default: break; } switch (td->td_state) { case TDS_RUNNING: case TDS_RUNQ: return; case TDS_INHIBITED: /* * If we are only inhibited because we are swapped out * then arange to swap in this process. Otherwise just return. */ if (td->td_inhibitors != TDI_SWAPPED) return; /* XXX: intentional fall-through ? */ case TDS_CAN_RUN: break; default: printf("state is 0x%x", td->td_state); panic("setrunnable(2)"); } if ((p->p_sflag & PS_INMEM) == 0) { if ((p->p_sflag & PS_SWAPPINGIN) == 0) { p->p_sflag |= PS_SWAPINREQ; /* * due to a LOR between sched_lock and * the sleepqueue chain locks, use * lower level scheduling functions. */ kick_proc0(); } } else sched_wakeup(td); } /* * Compute a tenex style load average of a quantity on * 1, 5 and 15 minute intervals. * XXXKSE Needs complete rewrite when correct info is available. * Completely Bogus.. only works with 1:1 (but compiles ok now :-) */ static void loadav(void *arg) { int i, nrun; struct loadavg *avg; nrun = sched_load(); avg = &averunnable; for (i = 0; i < 3; i++) avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; /* * Schedule the next update to occur after 5 seconds, but add a * random variation to avoid synchronisation with processes that * run at regular intervals. */ callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)), loadav, NULL); } static void lboltcb(void *arg) { wakeup(&lbolt); callout_reset(&lbolt_callout, hz, lboltcb, NULL); } /* ARGSUSED */ static void synch_setup(dummy) void *dummy; { callout_init(&loadav_callout, CALLOUT_MPSAFE); callout_init(&lbolt_callout, CALLOUT_MPSAFE); /* Kick off timeout driven events by calling first time. */ loadav(NULL); lboltcb(NULL); } /* * General purpose yield system call. */ int yield(struct thread *td, struct yield_args *uap) { mtx_assert(&Giant, MA_NOTOWNED); (void)uap; sched_relinquish(td); return (0); } Index: head/sys/kern/kern_thread.c =================================================================== --- head/sys/kern/kern_thread.c (revision 169666) +++ head/sys/kern/kern_thread.c (revision 169667) @@ -1,931 +1,931 @@ /*- * Copyright (C) 2001 Julian Elischer . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice(s), this list of conditions and the following disclaimer as * the first lines of this file unmodified other than the possible * addition of one or more copyright notices. * 2. Redistributions in binary form must reproduce the above copyright * notice(s), this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
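/*
 * The loadav() body above is a fixed-point exponentially weighted moving
 * average: each 5-second sample keeps a fraction cexp[i] of the old value
 * and blends in the current run-queue length.  A hedged userland
 * re-computation, assuming the stock FSHIFT of 11 (FSCALE == 2048), shows
 * a steady run queue of three threads pulling the 1-minute figure toward
 * 3.  Build with -lm.
 */
#include <math.h>
#include <stdio.h>

#define FSHIFT	11
#define FSCALE	(1 << FSHIFT)

int
main(void)
{
	/* exp(-5/60): per-sample decay for the 1-minute average. */
	unsigned int cexp1 = (unsigned int)(exp(-1.0 / 12.0) * FSCALE);
	unsigned int ldavg = 0;		/* starts from an idle system */
	int nrun = 3;			/* pretend three runnable threads */
	int i;

	for (i = 0; i < 24; i++)	/* 24 samples = two minutes */
		ldavg = (cexp1 * ldavg +
		    nrun * FSCALE * (FSCALE - cexp1)) >> FSHIFT;
	printf("1-minute load after two minutes at nrun=3: %.2f\n",
	    (double)ldavg / FSCALE);
	return (0);
}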
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * thread related storage. */ static uma_zone_t thread_zone; SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation"); int max_threads_per_proc = 1500; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW, &max_threads_per_proc, 0, "Limit on threads per proc"); int max_threads_hits; SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD, &max_threads_hits, 0, ""); #ifdef KSE int virtual_cpu; #endif TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads); struct mtx kse_zombie_lock; MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN); #ifdef KSE static int sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS) { int error, new_val; int def_val; def_val = mp_ncpus; if (virtual_cpu == 0) new_val = def_val; else new_val = virtual_cpu; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val < 0) return (EINVAL); virtual_cpu = new_val; return (0); } /* DEBUG ONLY */ SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I", "debug virtual cpus"); #endif struct mtx tid_lock; static struct unrhdr *tid_unrhdr; /* * Prepare a thread for use. */ static int thread_ctor(void *mem, int size, void *arg, int flags) { struct thread *td; td = (struct thread *)mem; td->td_state = TDS_INACTIVE; td->td_oncpu = NOCPU; td->td_tid = alloc_unr(tid_unrhdr); td->td_syscalls = 0; /* * Note that td_critnest begins life as 1 because the thread is not * running and is thereby implicitly waiting to be on the receiving * end of a context switch. A context switch must occur inside a * critical section, and in fact, includes hand-off of the sched_lock. * After a context switch to a newly created thread, it will release * sched_lock for the first time, and its td_critnest will hit 0 for * the first time. This happens on the far end of a context switch, * and when it context switches away from itself, it will in fact go * back into a critical section, and hand off the sched lock to the * next thread. */ td->td_critnest = 1; #ifdef AUDIT audit_thread_alloc(td); #endif umtx_thread_alloc(td); return (0); } /* * Reclaim a thread after use. */ static void thread_dtor(void *mem, int size, void *arg) { struct thread *td; td = (struct thread *)mem; #ifdef INVARIANTS /* Verify that this thread is in a safe state to free. */ switch (td->td_state) { case TDS_INHIBITED: case TDS_RUNNING: case TDS_CAN_RUN: case TDS_RUNQ: /* * We must never unlink a thread that is in one of * these states, because it is currently active. 
*/ panic("bad state for thread unlinking"); /* NOTREACHED */ case TDS_INACTIVE: break; default: panic("bad thread state"); /* NOTREACHED */ } #endif #ifdef AUDIT audit_thread_free(td); #endif free_unr(tid_unrhdr, td->td_tid); sched_newthread(td); } /* * Initialize type-stable parts of a thread (when newly created). */ static int thread_init(void *mem, int size, int flags) { struct thread *td; td = (struct thread *)mem; vm_thread_new(td, 0); cpu_thread_setup(td); td->td_sleepqueue = sleepq_alloc(); td->td_turnstile = turnstile_alloc(); td->td_sched = (struct td_sched *)&td[1]; sched_newthread(td); umtx_thread_init(td); return (0); } /* * Tear down type-stable parts of a thread (just before being discarded). */ static void thread_fini(void *mem, int size) { struct thread *td; td = (struct thread *)mem; turnstile_free(td->td_turnstile); sleepq_free(td->td_sleepqueue); umtx_thread_fini(td); vm_thread_dispose(td); } /* * For a newly created process, * link up all the structures and its initial threads etc. * called from: * {arch}/{arch}/machdep.c ia64_init(), init386() etc. * proc_dtor() (should go away) * proc_init() */ void proc_linkup(struct proc *p, struct thread *td) { TAILQ_INIT(&p->p_threads); /* all threads in proc */ TAILQ_INIT(&p->p_upcalls); /* upcall list */ sigqueue_init(&p->p_sigqueue, p); p->p_ksi = ksiginfo_alloc(1); if (p->p_ksi != NULL) { /* XXX p_ksi may be null if ksiginfo zone is not ready */ p->p_ksi->ksi_flags = KSI_EXT | KSI_INS; } LIST_INIT(&p->p_mqnotifier); p->p_numthreads = 0; thread_link(td, p); } /* * Initialize global thread allocation resources. */ void threadinit(void) { mtx_init(&tid_lock, "TID lock", NULL, MTX_DEF); tid_unrhdr = new_unrhdr(PID_MAX + 1, INT_MAX, &tid_lock); thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(), thread_ctor, thread_dtor, thread_init, thread_fini, 16 - 1, 0); #ifdef KSE kseinit(); /* set up kse specific stuff e.g. upcall zone*/ #endif } /* * Stash an embarasingly extra thread into the zombie thread queue. * Use the slpq as that must be unused by now. */ void thread_stash(struct thread *td) { mtx_lock_spin(&kse_zombie_lock); TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq); mtx_unlock_spin(&kse_zombie_lock); } /* * Reap zombie kse resource. */ void thread_reap(void) { struct thread *td_first, *td_next; /* * Don't even bother to lock if none at this instant, * we really don't care about the next instant.. */ if (!TAILQ_EMPTY(&zombie_threads)) { mtx_lock_spin(&kse_zombie_lock); td_first = TAILQ_FIRST(&zombie_threads); if (td_first) TAILQ_INIT(&zombie_threads); mtx_unlock_spin(&kse_zombie_lock); while (td_first) { td_next = TAILQ_NEXT(td_first, td_slpq); if (td_first->td_ucred) crfree(td_first->td_ucred); thread_free(td_first); td_first = td_next; } } } /* * Allocate a thread. */ struct thread * thread_alloc(void) { thread_reap(); /* check if any zombies to get */ return (uma_zalloc(thread_zone, M_WAITOK)); } /* * Deallocate a thread. */ void thread_free(struct thread *td) { cpu_thread_clean(td); uma_zfree(thread_zone, td); } /* * Discard the current thread and exit from its context. * Always called with scheduler locked. * * Because we can't free a thread while we're operating under its context, * push the current thread into our CPU's deadthread holder. This means * we needn't worry about someone else grabbing our context before we * do a cpu_throw(). This may not be needed now as we are under schedlock. * Maybe we can just do a thread_stash() as thr_exit1 does. 
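/*
 * A hedged aside, not part of this diff: the td_tid values handed out in
 * thread_ctor() above come from the generic unit-number allocator
 * (new_unrhdr/alloc_unr/free_unr).  A minimal sketch of the same pattern
 * outside the thread code, with made-up "foo" names:
 */
static struct mtx foo_id_lock;
static struct unrhdr *foo_ids;

static void
foo_id_init(void)
{
	mtx_init(&foo_id_lock, "foo id lock", NULL, MTX_DEF);
	foo_ids = new_unrhdr(1, INT_MAX, &foo_id_lock);
}

static int
foo_id_alloc(void)
{
	return (alloc_unr(foo_ids));	/* unique id, or -1 when exhausted */
}

static void
foo_id_free(int id)
{
	free_unr(foo_ids, id);
}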
*/ /* XXX * libthr expects its thread exit to return for the last * thread, meaning that the program is back to non-threaded * mode I guess. Because we do this (cpu_throw) unconditionally * here, they have their own version of it. (thr_exit1()) * that doesn't do it all if this was the last thread. * It is also called from thread_suspend_check(). * Of course in the end, they end up coming here through exit1 * anyhow.. After fixing 'thr' to play by the rules we should be able * to merge these two functions together. * * called from: * exit1() * kse_exit() * thr_exit() * ifdef KSE * thread_user_enter() * thread_userret() * endif * thread_suspend_check() */ void thread_exit(void) { uint64_t new_switchtime; struct thread *td; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(p != NULL, ("thread exiting without a process")); CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td, (long)p->p_pid, p->p_comm); KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending")); #ifdef AUDIT AUDIT_SYSCALL_EXIT(0, td); #endif #ifdef KSE if (td->td_standin != NULL) { /* * Note that we don't need to free the cred here as it * is done in thread_reap(). */ thread_stash(td->td_standin); td->td_standin = NULL; } #endif umtx_thread_exit(td); /* * drop FPU & debug register state storage, or any other * architecture specific resources that * would not be on a new untouched process. */ cpu_thread_exit(td); /* XXXSMP */ #ifdef KSE /* * The thread is exiting. scheduler can release its stuff * and collect stats etc. * XXX this is not very right, since PROC_UNLOCK may still * need scheduler stuff. */ sched_thread_exit(td); #endif /* Do the same timestamp bookkeeping that mi_switch() would do. */ new_switchtime = cpu_ticks(); p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime)); p->p_rux.rux_uticks += td->td_uticks; p->p_rux.rux_sticks += td->td_sticks; p->p_rux.rux_iticks += td->td_iticks; PCPU_SET(switchtime, new_switchtime); PCPU_SET(switchticks, ticks); - cnt.v_swtch++; + VMCNT_ADD(swtch, 1); /* Add our usage into the usage of all our children. */ if (p->p_numthreads == 1) ruadd(p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux); /* * The last thread is left attached to the process * So that the whole bundle gets recycled. Skip * all this stuff if we never had threads. * EXIT clears all sign of other threads when * it goes to single threading, so the last thread always * takes the short path. */ if (p->p_flag & P_HADTHREADS) { if (p->p_numthreads > 1) { thread_unlink(td); sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); /* * The test below is NOT true if we are the * sole exiting thread. P_STOPPED_SNGL is unset * in exit1() after it is the only survivor. */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) { thread_unsuspend_one(p->p_singlethread); } } #ifdef KSE /* * Because each upcall structure has an owner thread, * owner thread exits only when process is in exiting * state, so upcall to userland is no longer needed, * deleting upcall structure is safe here. * So when all threads in a group is exited, all upcalls * in the group should be automatically freed. * XXXKSE This is a KSE thing and should be exported * there somehow. */ upcall_remove(td); #endif PROC_UNLOCK(p); PCPU_SET(deadthread, td); } else { /* * The last thread is exiting.. but not through exit() * what should we do? 
* Theoretically this can't happen * exit1() - clears threading flags before coming here * kse_exit() - treats last thread specially * thr_exit() - treats last thread specially * ifdef KSE * thread_user_enter() - only if more exist * thread_userret() - only if more exist * endif * thread_suspend_check() - only if more exist */ panic ("thread_exit: Last thread exiting on its own"); } } else { /* * non threaded process comes here. * This includes an EX threaded process that is coming * here via exit1(). (exit1 dethreads the proc first). */ PROC_UNLOCK(p); } td->td_state = TDS_INACTIVE; CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td); cpu_throw(td, choosethread()); panic("I'm a teapot!"); /* NOTREACHED */ } /* * Do any thread specific cleanups that may be needed in wait() * called with Giant, proc and schedlock not held. */ void thread_wait(struct proc *p) { struct thread *td; mtx_assert(&Giant, MA_NOTOWNED); KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()")); FOREACH_THREAD_IN_PROC(p, td) { #ifdef KSE if (td->td_standin != NULL) { if (td->td_standin->td_ucred != NULL) { crfree(td->td_standin->td_ucred); td->td_standin->td_ucred = NULL; } thread_free(td->td_standin); td->td_standin = NULL; } #endif cpu_thread_clean(td); crfree(td->td_ucred); } thread_reap(); /* check for zombie threads etc. */ } /* * Link a thread to a process. * set up anything that needs to be initialized for it to * be used by the process. * * Note that we do not link to the proc's ucred here. * The thread is linked as if running but no KSE assigned. * Called from: * proc_linkup() * thread_schedule_upcall() * thr_create() */ void thread_link(struct thread *td, struct proc *p) { td->td_state = TDS_INACTIVE; td->td_proc = p; td->td_flags = 0; LIST_INIT(&td->td_contested); sigqueue_init(&td->td_sigqueue, p); callout_init(&td->td_slpcallout, CALLOUT_MPSAFE); TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist); p->p_numthreads++; } /* * Convert a process with one thread to an unthreaded process. * Called from: * thread_single(exit) (called from execve and exit) * kse_exit() XXX may need cleaning up wrt KSE stuff */ void thread_unthread(struct thread *td) { struct proc *p = td->td_proc; KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads")); #ifdef KSE upcall_remove(td); p->p_flag &= ~(P_SA|P_HADTHREADS); td->td_mailbox = NULL; td->td_pflags &= ~(TDP_SA | TDP_CAN_UNBIND); if (td->td_standin != NULL) { thread_stash(td->td_standin); td->td_standin = NULL; } sched_set_concurrency(p, 1); #else p->p_flag &= ~P_HADTHREADS; #endif } /* * Called from: * thread_exit() */ void thread_unlink(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); TAILQ_REMOVE(&p->p_threads, td, td_plist); p->p_numthreads--; /* could clear a few other things here */ /* Must NOT clear links to proc! */ } /* * Enforce single-threading. * * Returns 1 if the caller must abort (another thread is waiting to * exit the process or similar). Process is locked! * Returns 0 when you are successfully the only thread running. * A process has successfully single threaded in the suspend mode when * There are no threads in user mode. Threads in the kernel must be * allowed to continue until they get to the user boundary. They may even * copy out their return values and data before suspending. They may however be * accelerated in reaching the user boundary as we will wake up * any sleeping threads that are interruptable. (PCATCH). 
*/ int thread_single(int mode) { struct thread *td; struct thread *td2; struct proc *p; int remaining; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT((td != NULL), ("curthread is NULL")); if ((p->p_flag & P_HADTHREADS) == 0) return (0); /* Is someone already single threading? */ if (p->p_singlethread != NULL && p->p_singlethread != td) return (1); if (mode == SINGLE_EXIT) { p->p_flag |= P_SINGLE_EXIT; p->p_flag &= ~P_SINGLE_BOUNDARY; } else { p->p_flag &= ~P_SINGLE_EXIT; if (mode == SINGLE_BOUNDARY) p->p_flag |= P_SINGLE_BOUNDARY; else p->p_flag &= ~P_SINGLE_BOUNDARY; } p->p_flag |= P_STOPPED_SINGLE; mtx_lock_spin(&sched_lock); p->p_singlethread = td; if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else remaining = p->p_numthreads - p->p_suspcount; while (remaining != 1) { if (P_SHOULDSTOP(p) != P_STOPPED_SINGLE) goto stopme; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) continue; td2->td_flags |= TDF_ASTPENDING; if (TD_IS_INHIBITED(td2)) { switch (mode) { case SINGLE_EXIT: if (td->td_flags & TDF_DBSUSPEND) td->td_flags &= ~TDF_DBSUSPEND; if (TD_IS_SUSPENDED(td2)) thread_unsuspend_one(td2); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, EINTR); break; case SINGLE_BOUNDARY: if (TD_IS_SUSPENDED(td2) && !(td2->td_flags & TDF_BOUNDARY)) thread_unsuspend_one(td2); if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR)) sleepq_abort(td2, ERESTART); break; default: if (TD_IS_SUSPENDED(td2)) continue; /* * maybe other inhibited states too? */ if ((td2->td_flags & TDF_SINTR) && (td2->td_inhibitors & (TDI_SLEEPING | TDI_SWAPPED))) thread_suspend_one(td2); break; } } #ifdef SMP else if (TD_IS_RUNNING(td2) && td != td2) { forward_signal(td2); } #endif } if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else remaining = p->p_numthreads - p->p_suspcount; /* * Maybe we suspended some threads.. was it enough? */ if (remaining == 1) break; stopme: /* * Wake us up when everyone else has suspended. * In the mean time we suspend as well. */ thread_stopped(p); thread_suspend_one(td); PROC_UNLOCK(p); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (mode == SINGLE_EXIT) remaining = p->p_numthreads; else if (mode == SINGLE_BOUNDARY) remaining = p->p_numthreads - p->p_boundary_count; else remaining = p->p_numthreads - p->p_suspcount; } if (mode == SINGLE_EXIT) { /* * We have gotten rid of all the other threads and we * are about to either exit or exec. In either case, * we try our utmost to revert to being a non-threaded * process. */ p->p_singlethread = NULL; p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT); thread_unthread(td); } mtx_unlock_spin(&sched_lock); return (0); } /* * Called in from locations that can safely check to see * whether we have to suspend or at least throttle for a * single-thread event (e.g. fork). * * Such locations include userret(). * If the "return_instead" argument is non zero, the thread must be able to * accept 0 (caller may continue), or 1 (caller must abort) as a result. * * The 'return_instead' argument tells the function if it may do a * thread_exit() or suspend, or whether the caller must abort and back * out instead. 
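/*
 * A hedged sketch, not part of this diff, of the caller side described in
 * the comment above, in the style of execve()/exit1().  Note that
 * thread_single() acts on the calling thread's own process, so "p" here
 * must be curproc; names and error handling are illustrative only.
 */
static int
foo_go_single(struct proc *p)
{

	PROC_LOCK(p);
	if (p->p_flag & P_HADTHREADS) {
		if (thread_single(SINGLE_BOUNDARY)) {
			/* Another thread is already single-threading us. */
			PROC_UNLOCK(p);
			return (ERESTART);
		}
	}
	/* ... work that must not race with sibling threads ... */
	if (p->p_flag & P_HADTHREADS)
		thread_single_end();
	PROC_UNLOCK(p);
	return (0);
}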
* * If the thread that set the single_threading request has set the * P_SINGLE_EXIT bit in the process flags then this call will never return * if 'return_instead' is false, but will exit. * * P_SINGLE_EXIT | return_instead == 0| return_instead != 0 *---------------+--------------------+--------------------- * 0 | returns 0 | returns 0 or 1 * | when ST ends | immediatly *---------------+--------------------+--------------------- * 1 | thread exits | returns 1 * | | immediatly * 0 = thread_exit() or suspension ok, * other = return error instead of stopping the thread. * * While a full suspension is under effect, even a single threading * thread would be suspended if it made this call (but it shouldn't). * This call should only be made from places where * thread_exit() would be safe as that may be the outcome unless * return_instead is set. */ int thread_suspend_check(int return_instead) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; mtx_assert(&Giant, MA_NOTOWNED); PROC_LOCK_ASSERT(p, MA_OWNED); while (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_flags & TDF_DBSUSPEND))) { if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { KASSERT(p->p_singlethread != NULL, ("singlethread not set")); /* * The only suspension in action is a * single-threading. Single threader need not stop. * XXX Should be safe to access unlocked * as it can only be set to be true by us. */ if (p->p_singlethread == td) return (0); /* Exempt from stopping. */ } if ((p->p_flag & P_SINGLE_EXIT) && return_instead) return (EINTR); /* Should we goto user boundary if we didn't come from there? */ if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE && (p->p_flag & P_SINGLE_BOUNDARY) && return_instead) return (ERESTART); /* If thread will exit, flush its pending signals */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) sigqueue_flush(&td->td_sigqueue); mtx_lock_spin(&sched_lock); thread_stopped(p); /* * If the process is waiting for us to exit, * this thread should just suicide. * Assumes that P_SINGLE_EXIT implies P_STOPPED_SINGLE. */ if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td)) thread_exit(); /* * When a thread suspends, it just * gets taken off all queues. */ thread_suspend_one(td); if (return_instead == 0) { p->p_boundary_count++; td->td_flags |= TDF_BOUNDARY; } if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) { if (p->p_numthreads == p->p_suspcount) thread_unsuspend_one(p->p_singlethread); } PROC_UNLOCK(p); mi_switch(SW_INVOL, NULL); if (return_instead == 0) { p->p_boundary_count--; td->td_flags &= ~TDF_BOUNDARY; } mtx_unlock_spin(&sched_lock); PROC_LOCK(p); } return (0); } void thread_suspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!TD_IS_SUSPENDED(td), ("already suspended")); p->p_suspcount++; TD_SET_SUSPENDED(td); } void thread_unsuspend_one(struct thread *td) { struct proc *p = td->td_proc; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended")); TD_CLR_SUSPENDED(td); p->p_suspcount--; setrunnable(td); } /* * Allow all threads blocked by single threading to continue running. 
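/*
 * A hedged sketch, not part of this diff, of a return_instead != 0 caller
 * as laid out in the table above: a path that cannot park here backs out
 * and returns EINTR or ERESTART so the syscall layer can unwind or
 * restart.  As with thread_single(), the check applies to curthread's own
 * process; the function name is illustrative.
 */
static int
foo_checkpoint(struct proc *p)
{
	int error;

	PROC_LOCK(p);
	error = thread_suspend_check(1);
	PROC_UNLOCK(p);
	if (error != 0)
		return (error);	/* the real caller undoes partial work here */
	/* ... safe to continue ... */
	return (0);
}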
*/ void thread_unsuspend(struct proc *p) { struct thread *td; mtx_assert(&sched_lock, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); if (!P_SHOULDSTOP(p)) { FOREACH_THREAD_IN_PROC(p, td) { if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } } } else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) && (p->p_numthreads == p->p_suspcount)) { /* * Stopping everything also did the job for the single * threading request. Now we've downgraded to single-threaded, * let it continue. */ thread_unsuspend_one(p->p_singlethread); } } /* * End the single threading mode.. */ void thread_single_end(void) { struct thread *td; struct proc *p; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY); mtx_lock_spin(&sched_lock); p->p_singlethread = NULL; /* * If there are other threads they mey now run, * unless of course there is a blanket 'stop order' * on the process. The single threader must be allowed * to continue however as this is a bad place to stop. */ if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) { FOREACH_THREAD_IN_PROC(p, td) { if (TD_IS_SUSPENDED(td)) { thread_unsuspend_one(td); } } } mtx_unlock_spin(&sched_lock); } struct thread * thread_find(struct proc *p, lwpid_t tid) { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) { if (td->td_tid == tid) break; } mtx_unlock_spin(&sched_lock); return (td); } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 169666) +++ head/sys/kern/subr_trap.c (revision 169667) @@ -1,278 +1,278 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_mac.h" #ifdef __i386__ #include "opt_npx.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #include #endif #include #include #include /* * Define the code needed before returning to user mode, for trap and * syscall. */ void userret(struct thread *td, struct trapframe *frame) { struct proc *p = td->td_proc; CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); #ifdef DIAGNOSTIC /* Check that we called signotify() enough. */ PROC_LOCK(p); mtx_lock_spin(&sched_lock); if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 || (td->td_flags & TDF_ASTPENDING) == 0)) printf("failed to set signal flags properly for ast()\n"); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); #endif #ifdef KTRACE KTRUSERRET(td); #endif /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland */ if (td->td_pflags & TDP_GEOM) g_waitidle(); /* * We need to check to see if we have to exit or wait due to a * single threading requirement or some other STOP condition. * Don't bother doing all the work if the stop bits are not set * at this time.. If we miss it, we miss it.. no big deal. */ if (P_SHOULDSTOP(p)) { PROC_LOCK(p); thread_suspend_check(0); /* Can suspend or kill */ PROC_UNLOCK(p); } #ifdef KSE /* * Do special thread processing, e.g. upcall tweaking and such. */ if (p->p_flag & P_SA) thread_userret(td, frame); #endif /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio); } /* * Let the scheduler adjust our priority etc. */ sched_userret(td); KASSERT(td->td_locks == 0, ("userret: Returning with %d locks held.", td->td_locks)); } /* * Process an asynchronous software trap. * This is relatively easy. * This function will return with preemption disabled. */ void ast(struct trapframe *framep) { struct thread *td; struct proc *p; struct rlimit rlim; int sflag; int flags; int sig; #if defined(DEV_NPX) && !defined(SMP) int ucode; ksiginfo_t ksi; #endif td = curthread; p = td->td_proc; CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid, p->p_comm); KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode")); WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode"); mtx_assert(&Giant, MA_NOTOWNED); mtx_assert(&sched_lock, MA_NOTOWNED); td->td_frame = framep; td->td_pticks = 0; #ifdef KSE if ((p->p_flag & P_SA) && (td->td_mailbox == NULL)) thread_user_enter(td); #endif /* * This updates the p_sflag's for the checks below in one * "atomic" operation with turning off the astpending flag. * If another AST is triggered while we are handling the * AST's saved in sflag, the astpending flag will be set and * ast() will be called again. 
*/ mtx_lock_spin(&sched_lock); flags = td->td_flags; sflag = p->p_sflag; if (p->p_sflag & (PS_ALRMPEND | PS_PROFPEND | PS_XCPU)) p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND | PS_XCPU); #ifdef MAC if (p->p_sflag & PS_MACPEND) p->p_sflag &= ~PS_MACPEND; #endif td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDRESCHED | TDF_INTERRUPT); - cnt.v_trap++; mtx_unlock_spin(&sched_lock); + VMCNT_ADD(trap, 1); /* * XXXKSE While the fact that we owe a user profiling * tick is stored per thread in this code, the statistics * themselves are still stored per process. * This should probably change, by which I mean that * possibly the location of both might change. */ if (td->td_ucred != p->p_ucred) cred_update_thread(td); if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) { addupc_task(td, td->td_profil_addr, td->td_profil_ticks); td->td_profil_ticks = 0; td->td_pflags &= ~TDP_OWEUPC; } if (sflag & PS_ALRMPEND) { PROC_LOCK(p); psignal(p, SIGVTALRM); PROC_UNLOCK(p); } #if defined(DEV_NPX) && !defined(SMP) if (PCPU_GET(curpcb)->pcb_flags & PCB_NPXTRAP) { atomic_clear_int(&PCPU_GET(curpcb)->pcb_flags, PCB_NPXTRAP); ucode = npxtrap(); if (ucode != -1) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGFPE; ksi.ksi_code = ucode; trapsignal(td, &ksi); } } #endif if (sflag & PS_PROFPEND) { PROC_LOCK(p); psignal(p, SIGPROF); PROC_UNLOCK(p); } if (sflag & PS_XCPU) { PROC_LOCK(p); lim_rlimit(p, RLIMIT_CPU, &rlim); mtx_lock_spin(&sched_lock); if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) { mtx_unlock_spin(&sched_lock); killproc(p, "exceeded maximum CPU limit"); } else { if (p->p_cpulimit < rlim.rlim_max) p->p_cpulimit += 5; mtx_unlock_spin(&sched_lock); psignal(p, SIGXCPU); } PROC_UNLOCK(p); } #ifdef MAC if (sflag & PS_MACPEND) mac_thread_userret(td); #endif if (flags & TDF_NEEDRESCHED) { #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(1, 1); #endif mtx_lock_spin(&sched_lock); sched_prio(td, td->td_user_pri); mi_switch(SW_INVOL, NULL); mtx_unlock_spin(&sched_lock); #ifdef KTRACE if (KTRPOINT(td, KTR_CSW)) ktrcsw(0, 1); #endif } if (flags & TDF_NEEDSIGCHK) { PROC_LOCK(p); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) postsig(sig); mtx_unlock(&p->p_sigacts->ps_mtx); PROC_UNLOCK(p); } userret(td, framep); mtx_assert(&Giant, MA_NOTOWNED); } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 169666) +++ head/sys/kern/vfs_bio.c (revision 169667) @@ -1,3954 +1,3956 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "opt_directio.h" #include "opt_swap.h" static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf_ops buf_ops_bio = { .bop_name = "buf_ops_bio", .bop_write = bufwrite, .bop_strategy = bufstrategy, .bop_sync = bufsync, .bop_bdflush = bufbdflush, }; /* * XXX buf is global because kern_shutdown.c and ffs_checkoverlap has * carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c. */ struct buf *buf; /* buffer header pool */ static struct proc *bufdaemonproc; static int inmem(struct vnode *vp, daddr_t blkno); static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf *bp); static void vfs_setdirty(struct buf *bp); static void vfs_setdirty_locked_object(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno); static int flushbufqueues(int, int); static void buf_daemon(void); static void bremfreel(struct buf *bp); int vmiodirenable = TRUE; SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, "Use the VM system for directory writes"); int runningbufspace; SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, "Amount of presently outstanding async buffer io"); static int bufspace; SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, "KVA memory used for bufs"); static int maxbufspace; SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, "Maximum allowed value of bufspace (including buf_daemon)"); static int bufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, "Amount of malloced memory for buffers"); static int maxbufmallocspace; SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, "Maximum amount of malloced memory for buffers"); static int lobufspace; SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, "Minimum amount of buffers we want to have"); int hibufspace; SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, 
"Maximum allowed value of bufspace (excluding buf_daemon)"); static int bufreusecnt; SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, "Number of times we have reused a buffer"); static int buffreekvacnt; SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, "Number of times we have freed the KVA space from some buffer"); static int bufdefragcnt; SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, "Number of times we have had to repeat buffer allocation to defragment"); static int lorunningspace; SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, "Minimum preferred space used for in-progress I/O"); static int hirunningspace; SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, "Maximum amount of space to use for in-progress I/O"); int dirtybufferflushes; SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes, 0, "Number of bdwrite to bawrite conversions to limit dirty buffers"); int bdwriteskip; SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip, 0, "Number of buffers supplied to bdwrite with snapshot deadlock risk"); int altbufferflushes; SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes, 0, "Number of fsync flushes to limit dirty buffers"); static int recursiveflushes; SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes, 0, "Number of flushes skipped due to being recursive"); static int numdirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, "Number of buffers that are dirty (has unwritten changes) at the moment"); static int lodirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, "How many buffers we want to have free before bufdaemon can sleep"); static int hidirtybuffers; SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, "When the number of dirty buffers is considered severe"); int dirtybufthresh; SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh, 0, "Number of bdwrite to bawrite conversions to clear dirty buffers"); static int numfreebuffers; SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, "Number of free buffers"); static int lofreebuffers; SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, "XXX Unused"); static int hifreebuffers; SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, "XXX Complicatedly unused"); static int getnewbufcalls; SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, "Number of calls to getnewbuf"); static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, "Number of times getnewbuf has had to restart a buffer aquisition"); /* * Wakeup point for bufdaemon, as well as indicator of whether it is already * active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it * is idling. */ static int bd_request; /* * This lock synchronizes access to bd_request. */ static struct mtx bdlock; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; /* * Synchronization (sleep/wakeup) variable for active buffer space requests. 
* Set when wait starts, cleared prior to wakeup(). * Used in runningbufwakeup() and waitrunningbufspace(). */ static int runningbufreq; /* * This lock protects the runningbufreq and synchronizes runningbufwakeup and * waitrunningbufspace(). */ static struct mtx rbreqlock; /* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(), * getnewbuf(), and getblk(). */ static int needsbuffer; /* * Lock that protects needsbuffer and the sleeps/wakeups surrounding it. */ static struct mtx nblock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bdonelock; /* * Lock that protects against bwait()/bdone()/B_DONE races. */ static struct mtx bpinlock; /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */ #define QUEUE_DIRTY 2 /* B_DELWRI buffers */ #define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */ #define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ #define QUEUE_EMPTY 5 /* empty buffer headers */ /* Queues for free buffers with various properties */ static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } }; /* Lock for the bufqueues */ static struct mtx bqlock; /* * Single global constant for BUF_WMESG, to avoid getting multiple references. * buf_wmesg is referred from macros. */ const char *buf_wmesg = BUF_WMESG; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #ifdef DIRECTIO extern void ffs_rawread_setup(void); #endif /* DIRECTIO */ /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } mtx_unlock(&nblock); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ mtx_lock(&nblock); if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * runningbufwakeup() - in-progress I/O accounting. * */ void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { atomic_subtract_int(&runningbufspace, bp->b_runningbufspace); bp->b_runningbufspace = 0; mtx_lock(&rbreqlock); if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } mtx_unlock(&rbreqlock); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. 
* This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { atomic_add_int(&numfreebuffers, 1); mtx_lock(&nblock); if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } mtx_unlock(&nblock); } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ void waitrunningbufspace(void) { mtx_lock(&rbreqlock); while (runningbufspace > hirunningspace) { ++runningbufreq; msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0); } mtx_unlock(&rbreqlock); } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } /* Wake up the buffer daemon if necessary */ static __inline void bd_wakeup(int dirtybuflevel) { mtx_lock(&bdlock); if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } mtx_unlock(&bdlock); } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline void bd_speedup(void) { bd_wakeup(1); } /* * Calculating buffer cache scaling values and reserve space for buffer * headers. This is called during low level kernel initialization and * may be called more then once. We CANNOT write to the memory area * being reserved at this time. */ caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est) { /* * physmem_est is in pages. Convert it to kilobytes (assumes * PAGE_SIZE is >= 1K) */ physmem_est = physmem_est * (PAGE_SIZE / 1024); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. When auto-sizing * the buffer cache we limit the eventual kva reservation to * maxbcache bytes. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / 1024; nbuf = 50; if (physmem_est > 4096) nbuf += min((physmem_est - 4096) / factor, 65536 / factor); if (physmem_est > 65536) nbuf += (physmem_est - 65536) * 2 / (factor * 5); if (maxbcache && nbuf > maxbcache / BKVASIZE) nbuf = maxbcache / BKVASIZE; } #if 0 /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. 
*/ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } #endif /* * swbufs are used as temporary holders for I/O, such as paging I/O. * We have no less then 16 and no more then 256. */ nswbuf = max(min(nbuf/4, 256), 16); #ifdef NSWBUF_MIN if (nswbuf < NSWBUF_MIN) nswbuf = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif /* * Reserve space for the buffer cache buffers */ swbuf = (void *)v; v = (caddr_t)(swbuf + nswbuf); buf = (void *)v; v = (caddr_t)(buf + nbuf); return(v); } /* Initialize the buffer subsystem. Called before use of any buffers. */ void bufinit(void) { struct buf *bp; int i; mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF); mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF); mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vflags = 0; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; dirtybufthresh = hidirtybuffers * 9 / 10; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. 
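 *
 * Continuing the illustrative 512 MB example from
 * kern_vfs_bio_buffer_alloc() above (nbuf = 3941, BKVASIZE assumed to
 * be 16 kB, MAXBSIZE at its usual 64 kB), the tuning above and the
 * assignments just below come out to approximately:
 *
 *	maxbufspace	= 3941 * 16384			(~ 61.6 MB)
 *	hibufspace	= maxbufspace - 10 * 65536	(~ 61.0 MB)
 *	lobufspace	= hibufspace - 65536
 *	hidirtybuffers	= 3941 / 4 + 20			= 1005
 *	lodirtybuffers	= 1005 / 2			= 502
 *	lofreebuffers	= 3941 / 18 + 5			= 223
 *	hifreebuffers	= 2 * 223			= 446
 *
 * The exact numbers only matter as an indication of how wide the
 * various hysteresis bands are relative to each other.
 *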
*/ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL | VM_ALLOC_WIRED); } /* * bfreekva() - free the kva allocation for a buffer. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf *bp) { if (bp->b_kvasize) { atomic_add_int(&buffreekvacnt, 1); atomic_subtract_int(&bufspace, bp->b_kvasize); vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Mark the buffer for removal from the appropriate free list in brelse. * */ void bremfree(struct buf *bp) { CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp), ("bremfree: buf must be locked.")); KASSERT((bp->b_flags & B_REMFREE) == 0, ("bremfree: buffer %p already marked for delayed removal.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfree: buffer %p not on a queue.", bp)); bp->b_flags |= B_REMFREE; /* Fixup numfreebuffers count. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) atomic_subtract_int(&numfreebuffers, 1); } /* * bremfreef: * * Force an immediate removal from a free list. Used only in nfs when * it abuses the b_freelist pointer. */ void bremfreef(struct buf *bp) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } /* * bremfreel: * * Removes a buffer from the free list, must be called with the * bqlock held. */ static void bremfreel(struct buf *bp) { CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp), ("bremfreel: buffer %p not locked.", bp)); KASSERT(bp->b_qindex != QUEUE_NONE, ("bremfreel: buffer %p not on a queue.", bp)); mtx_assert(&bqlock, MA_OWNED); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; /* * If this was a delayed bremfree() we only need to remove the buffer * from the queue and return the stats are already done. */ if (bp->b_flags & B_REMFREE) { bp->b_flags &= ~B_REMFREE; return; } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, the buffer was free and we must decrement * numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) atomic_subtract_int(&numfreebuffers, 1); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). This is really just a special case of breadn(). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf **bpp) { return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp)); } /* * Attempt to initiate asynchronous I/O on read-ahead blocks. We must * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, * the buffer is valid and we do not have to do anything. 
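 *
 * A typical consumer of bread() above and breadn() below, sketched
 * without reference to any particular filesystem (vp, lbn and bsize
 * are placeholders), queues one read-ahead block behind the block it
 * actually needs:
 *
 *	struct buf *bp;
 *	daddr_t rablk = lbn + 1;
 *	int rasize = bsize, error;
 *
 *	error = breadn(vp, lbn, bsize, &rablk, &rasize, 1, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(use bp->b_data)
 *	bqrelse(bp);
 *
 * The read-ahead buffer is never returned to the caller; it is
 * released when its asynchronous I/O completes.
 *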
*/ void breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, int cnt, struct ucred * cred) { struct buf *rabp; int i; for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_proc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED && cred != NOCRED) rabp->b_rcred = crhold(cred); vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); rabp->b_iooffset = dbtob(rabp->b_blkno); bstrategy(rabp); } else { brelse(rabp); } } } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf **bpp) { struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); *bpp = bp = getblk(vp, blkno, size, 0, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (!TD_IS_IDLETHREAD(curthread)) curthread->td_proc->p_stats->p_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); ++readwait; } breada(vp, rablkno, rabsize, cnt, cred); if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bufwrite(struct buf *bp) { int oldflags; struct vnode *vp; int vp_md; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); if (bp->b_pin_count > 0) bunpin_wait(bp); KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); vp = bp->b_vp; if (vp) vp_md = vp->v_vflag & VV_MD; else vp_md = 0; /* Mark the buffer clean */ bundirty(bp); bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_CACHE; bp->b_iocmd = BIO_WRITE; bufobj_wref(bp->b_bufobj); vfs_busy_pages(bp, 1); /* * Normal bwrites pipeline writes */ bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); if (!TD_IS_IDLETHREAD(curthread)) curthread->td_proc->p_stats->p_ru.ru_oublock++; if (oldflags & B_ASYNC) BUF_KERNPROC(bp); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else { /* * don't allow the async write to saturate the I/O * system. We will not deadlock here because * we are blocking waiting for I/O that is already in-progress * to complete. We do not block here if it is the update * or syncer daemon trying to clean up as that can lead * to deadlock. 
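 *
 * The throttle being invoked here is plain hysteresis on the global
 * runningbufspace counter; condensed from the code above and from
 * runningbufwakeup()/waitrunningbufspace() earlier in this file:
 *
 *	issue (above):	bp->b_runningbufspace = bp->b_bufsize;
 *			atomic_add_int(&runningbufspace, ...);
 *	throttle:	while (runningbufspace > hirunningspace)
 *				msleep(&runningbufreq, &rbreqlock, ...);
 *	completion:	atomic_subtract_int(&runningbufspace, ...);
 *			wakeup(&runningbufreq) once it drops to
 *			lorunningspace;
 *
 * so asynchronous writes self-limit to roughly hirunningspace bytes
 * (1 MB with the defaults set in bufinit()) in flight system-wide.
 *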
*/ if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md) waitrunningbufspace(); } return (0); } void bufbdflush(struct bufobj *bo, struct buf *bp) { struct buf *nbp; if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) { (void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread); altbufferflushes++; } else if (bo->bo_dirty.bv_cnt > dirtybufthresh) { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* Don't countdeps with the bo lock held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf *bp) { struct thread *td = curthread; struct vnode *vp; struct bufobj *bo; CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(BUF_REFCNT(bp) != 0, ("bdwrite: buffer is not busy")); if (bp->b_flags & B_INVAL) { brelse(bp); return; } /* * If we have too many dirty buffers, don't create any more. * If we are wildly over our limit, then force a complete * cleanup. Otherwise, just keep the situation from getting * out of control. Note that we have to avoid a recursive * disaster and not try to clean up after our own cleanup! */ vp = bp->b_vp; bo = bp->b_bufobj; if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) { td->td_pflags |= TDP_INBDFLUSH; BO_BDFLUSH(bo, bp); td->td_pflags &= ~TDP_INBDFLUSH; } else recursiveflushes++; bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) { VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have a lot of dirty * buffers (midpoint between our recovery point and our stall * point). */ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. 
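 *
 * For contrast, the three ways a caller normally disposes of a dirty
 * buffer obtained from getblk() (a sketch, not tied to any particular
 * filesystem):
 *
 *	bwrite(bp);	synchronous: I/O is started and the caller
 *			sleeps in bufwait();
 *	bawrite(bp);	asynchronous: I/O is started and the buffer is
 *			released on completion (see bawrite() below);
 *	bdwrite(bp);	delayed: the buffer is marked B_DELWRI and
 *			requeued; the actual I/O happens later, driven
 *			by the syncer or by buf_daemon.
 *
 * Only the first two reach bufwrite() immediately; bdwrite(), per the
 * note above, never starts I/O itself.
 *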
*/ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bdirty(struct buf *bp) { CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(BUF_REFCNT(bp) == 1, ("bdirty: bp %p not locked",bp)); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= /* XXX B_DONE | */ B_DELWRI; reassignbuf(bp); atomic_add_int(&numdirtybuffers, 1); bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * The buffer must be on QUEUE_NONE. */ void bundirty(struct buf *bp) { CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); KASSERT(BUF_REFCNT(bp) == 1, ("bundirty: bp %p not locked",bp)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp); atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf *bp) { bp->b_flags |= B_ASYNC; (void) bwrite(bp); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. */ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { mtx_lock(&nblock); while (numdirtybuffers >= hidirtybuffers) { bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; msleep(&needsbuffer, &nblock, (PRIBIO + 4), "flswai", 0); } mtx_unlock(&nblock); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. 
*/ void brelse(struct buf *bp) { CTR3(KTR_BUF, "brelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (bp->b_flags & B_MANAGED) { bqrelse(bp); return; } if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If B_INVAL is set then * this case is not run and the next case is run to * destroy the buffer. B_INVAL can occur if the buffer * is outside the range supported by the underlying device. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { atomic_subtract_int(&numdirtybuffers, 1); numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (vm_page_count_severe()) { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (!(bp->b_vflags & BV_BKGRDINPROG)) bp->b_flags |= B_RELBUF; BO_UNLOCK(bp->b_bufobj); } else bp->b_flags |= B_RELBUF; } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; obj = bp->b_bufobj->bo_object; /* * Get the base offset and length of the buffer. Note that * in the VMIO case if the buffer block size is not * page-aligned then b_data pointer may not be page-aligned. * But our b_pages[] array *IS* page aligned. * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). 
* * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter( trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_lock_queues(); vm_page_set_invalid(m, poffset, presid); vm_page_unlock_queues(); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } VM_OBJECT_UNLOCK(obj); if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) { vfs_vmio_release(bp); } } else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { if (bp->b_bufsize != 0) allocbuf(bp, 0); if (bp->b_vp != NULL) brelvp(bp); } if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); return; } /* enqueue */ mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { bp->b_qindex = QUEUE_EMPTYKVA; } else { bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); /* remaining buffers */ } else { if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) == (B_DELWRI|B_NEEDSGIANT)) bp->b_qindex = QUEUE_DIRTY_GIANT; if (bp->b_flags & B_DELWRI) bp->b_qindex = QUEUE_DIRTY; else bp->b_qindex = QUEUE_CLEAN; if (bp->b_flags & B_AGE) TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); else TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } mtx_unlock(&bqlock); /* * If B_INVAL and B_DELWRI is set, clear B_DELWRI. We have already * placed the buffer on the correct queue. We must also disassociate * the device and vnode for a B_INVAL buffer so gbincore() doesn't * find it. */ if (bp->b_flags & B_INVAL) { if (bp->b_flags & B_DELWRI) bundirty(bp); if (bp->b_vp) brelvp(bp); } /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). 
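 *
 * For reference, the queue selection performed just above reduces to
 * the following ladder (a restatement, not additional logic):
 *
 *	if (bp->b_bufsize == 0)
 *		queue = bp->b_kvasize ? QUEUE_EMPTYKVA : QUEUE_EMPTY;
 *	else if ((bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) ||
 *	    (bp->b_ioflags & BIO_ERROR))
 *		queue = QUEUE_CLEAN;
 *	else if (bp->b_flags & B_DELWRI)
 *		queue = (bp->b_flags & B_NEEDSGIANT) ?
 *		    QUEUE_DIRTY_GIANT : QUEUE_DIRTY;
 *	else
 *		queue = QUEUE_CLEAN;
 *
 * with the first two cases (and B_AGE in the last two) inserting at
 * the head of the queue rather than the tail.
 *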
*/ if (!(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse */ if (bp->b_bufsize || bp->b_kvasize) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. * * XXX we should be able to leave the B_RELBUF hint set on completion. */ void bqrelse(struct buf *bp) { CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); return; } if (bp->b_flags & B_MANAGED) { if (bp->b_flags & B_REMFREE) { mtx_lock(&bqlock); bremfreel(bp); mtx_unlock(&bqlock); } bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); BUF_UNLOCK(bp); return; } mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) bremfreel(bp); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); /* buffers with stale but valid contents */ if (bp->b_flags & B_DELWRI) { if (bp->b_flags & B_NEEDSGIANT) bp->b_qindex = QUEUE_DIRTY_GIANT; else bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); } else { /* * XXX This lock may not be necessary since BKGRDINPROG * cannot be set while we hold the buf lock, it can only be * cleared if it is already pending. */ BO_LOCK(bp->b_bufobj); if (!vm_page_count_severe() || bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } else { /* * We are too low on memory, we have to try to free * the buffer (most importantly: the wired pages * making up its backing store) *now*. */ BO_UNLOCK(bp->b_bufobj); mtx_unlock(&bqlock); brelse(bp); return; } } mtx_unlock(&bqlock); if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); /* unlock */ BUF_UNLOCK(bp); } /* Give pages used by the bp back to the VM system (where possible) */ static void vfs_vmio_release(struct buf *bp) { int i; vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->oflags & VPO_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { /* * Might as well free the page if we can and it has * no valid data. 
We also free the page if the * buffer was used for direct I/O */ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_free(m); } else if (bp->b_flags & B_DIRECT) { vm_page_try_to_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block at a particular lbn is available for a clustered * write. */ static int vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno) { struct buf *bpa; int match; match = 0; /* If the buf isn't in core skip it */ if ((bpa = gbincore(&vp->v_bufobj, lblkno)) == NULL) return (0); /* If the buf is busy we don't want to wait for it */ if (BUF_LOCK(bpa, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) return (0); /* Only cluster with valid clusterable delayed write buffers */ if ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) != (B_DELWRI | B_CLUSTEROK)) goto done; if (bpa->b_bufsize != size) goto done; /* * Check to see if it is in the expected place on disk and that the * block has been mapped. */ if ((bpa->b_blkno != bpa->b_lblkno) && (bpa->b_blkno == blkno)) match = 1; done: BUF_UNLOCK(bpa); return (match); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf *bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int ncl; int nwritten; int size; int maxcl; /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; VI_LOCK(vp); for (i = 1; i < maxcl; i++) if (vfs_bio_clcheck(vp, size, lblkno + i, bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0) break; for (j = 1; i + j <= maxcl && j <= lblkno; j++) if (vfs_bio_clcheck(vp, size, lblkno - j, bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0) break; VI_UNLOCK(vp); --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { BUF_UNLOCK(bp); nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); return nwritten; } } bremfree(bp); bp->b_flags |= B_ASYNC; /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) bwrite(bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 
* Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ atomic_add_int(&getnewbufcalls, 1); atomic_subtract_int(&getnewbufrestarts, 1); restart: atomic_add_int(&getnewbufrestarts, 1); /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ mtx_lock(&bqlock); nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { /* * If no EMPTYKVA buffers and we are either * defragging or reusing, locate a CLEAN buffer * to free or reuse. If bufspace useage is low * skip this step so we can allocate a new buffer. */ if (defrag || bufspace >= lobufspace) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* * If we could not find or were not allowed to reuse a * CLEAN buffer, check to see if it is ok to use an EMPTY * buffer. We can only use an EMPTY buffer if allocating * its KVA would not otherwise run us out of buffer space. */ if (nbp == NULL && defrag == 0 && bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* FALLTHROUGH */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* FALLTHROUGH */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_vp) { BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); } CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d " "queue %d (recycling)", bp, bp->b_vp, bp->b_flags, bp->b_kvasize, bp->b_bufsize, qindex); /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. 
*/ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); bremfreel(bp); mtx_unlock(&bqlock); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (!LIST_EMPTY(&bp->b_dep)) buf_deallocate(bp); if (bp->b_vflags & BV_BKGRDINPROG) panic("losing buffer 3"); KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d", bp, bp->b_vp, qindex)); KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0, ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags)); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_vflags = 0; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; bp->b_pin_count = 0; bp->b_fsprivate1 = NULL; bp->b_fsprivate2 = NULL; bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } /* * Notify any waiters for the buffer lock about * identity change by freeing the buffer. */ if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } /* * If we are overcomitted then recover the buffer and its * KVM space. This occurs in rare situations when multiple * processes are blocked in getnewbuf() or allocbuf(). */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; char *waitmsg; if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } mtx_lock(&nblock); needsbuffer |= flags; mtx_unlock(&nblock); mtx_unlock(&bqlock); bd_speedup(); /* heeeelp */ mtx_lock(&nblock); while (needsbuffer & flags) { if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) { mtx_unlock(&nblock); return (NULL); } } mtx_unlock(&nblock); } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; bfreekva(bp); vm_map_lock(buffer_map); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. 
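 *
 * "Defragmenting" here does not move or compact anything; across the
 * restarts that follow it amounts to
 *
 *	while (buffer_map has no contiguous hole of maxsize bytes)
 *		pick another clean buffer, bfreekva() it, brelse() it;
 *
 * i.e. buffer KVA is released one buffer at a time until
 * vm_map_findspace() can satisfy the reservation.
 *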
*/ atomic_add_int(&bufdefragcnt, 1); vm_map_unlock(buffer_map); defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; atomic_add_int(&bufspace, bp->b_kvasize); atomic_add_int(&bufreusecnt, 1); } vm_map_unlock(buffer_map); } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; } return(bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) static void buf_daemon() { /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curthread->td_pflags |= TDP_NORUNNINGBUF; mtx_lock(&bdlock); for (;;) { bd_request = 0; mtx_unlock(&bdlock); kthread_suspend_check(bufdaemonproc); /* * Do the flush. Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. Wakeup any waiting processes before we * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { int flushed; flushed = flushbufqueues(QUEUE_DIRTY, 0); /* The list empty check here is slightly racy */ if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) { mtx_lock(&Giant); flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0); mtx_unlock(&Giant); } if (flushed == 0) { /* * Could not find any buffers without rollback * dependencies, so just write the first one * in the hopes of eventually making progress. */ flushbufqueues(QUEUE_DIRTY, 1); if (!TAILQ_EMPTY( &bufqueues[QUEUE_DIRTY_GIANT])) { mtx_lock(&Giant); flushbufqueues(QUEUE_DIRTY_GIANT, 1); mtx_unlock(&Giant); } break; } uio_yield(); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 1 second and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep half a second. * Otherwise we loop immediately. */ mtx_lock(&bdlock); if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; msleep(&bd_request, &bdlock, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. 
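 *
 * The scan below bounds a single pass over the queue with a sentinel
 * entry, so that buffers it requeues are not revisited in the same
 * pass.  In outline (a condensed restatement of the loop that
 * follows):
 *
 *	TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
 *	while (flushed != target) {
 *		bp = TAILQ_FIRST(&bufqueues[queue]);
 *		if (bp == &sentinel)
 *			break;
 *		rotate bp to the tail of the queue;
 *		try to lock and write bp, skipping it on any conflict;
 *	}
 *	TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
 *
 * Buffers that are skipped simply end up behind the sentinel and are
 * seen again on a later call.
 *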
*/ static int flushwithdeps = 0; SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps, 0, "Number of buffers flushed with dependecies that require rollbacks"); static int flushbufqueues(int queue, int flushdeps) { struct thread *td = curthread; struct buf sentinel; struct vnode *vp; struct mount *mp; struct buf *bp; int hasdeps; int flushed; int target; target = numdirtybuffers - lodirtybuffers; if (flushdeps && target > 2) target /= 2; flushed = 0; bp = NULL; mtx_lock(&bqlock); TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist); while (flushed != target) { bp = TAILQ_FIRST(&bufqueues[queue]); if (bp == &sentinel) break; TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist); TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist); if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; if (bp->b_pin_count > 0) { BUF_UNLOCK(bp); continue; } BO_LOCK(bp->b_bufobj); if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { BO_UNLOCK(bp->b_bufobj); BUF_UNLOCK(bp); continue; } BO_UNLOCK(bp->b_bufobj); if (bp->b_flags & B_INVAL) { bremfreel(bp); mtx_unlock(&bqlock); brelse(bp); flushed++; numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; } if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) { if (flushdeps == 0) { BUF_UNLOCK(bp); continue; } hasdeps = 1; } else hasdeps = 0; /* * We must hold the lock on a vnode before writing * one of its buffers. Otherwise we may confuse, or * in the case of a snapshot vnode, deadlock the * system. * * The lock order here is the reverse of the normal * of vnode followed by buf lock. This is ok because * the NOWAIT will prevent deadlock. */ vp = bp->b_vp; if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { BUF_UNLOCK(bp); continue; } if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT, td) == 0) { mtx_unlock(&bqlock); CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); vfs_bio_awrite(bp); vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); flushwithdeps += hasdeps; flushed++; waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); mtx_lock(&bqlock); continue; } vn_finished_write(mp); BUF_UNLOCK(bp); } TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist); mtx_unlock(&bqlock); return (flushed); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct bufobj *bo, daddr_t blkno) { struct buf *bp; BO_LOCK(bo); bp = gbincore(bo, blkno); BO_UNLOCK(bo); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. 
*/ static int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; ASSERT_VOP_LOCKED(vp, "inmem"); if (incore(&vp->v_bufobj, blkno)) return 1; if (vp->v_mount == NULL) return 0; obj = vp->v_object; if (obj == NULL) return (0); size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; VM_OBJECT_LOCK(obj); for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) goto notinmem; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) goto notinmem; } VM_OBJECT_UNLOCK(obj); return 1; notinmem: VM_OBJECT_UNLOCK(obj); return (0); } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. */ if ((bp->b_flags & B_VMIO) == 0) return; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vfs_setdirty_locked_object(bp); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } static void vfs_setdirty_locked_object(struct buf *bp) { vm_object_t object; int i; object = bp->b_bufobj->bo_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; vm_page_lock_queues(); /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); vm_page_unlock_queues(); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. 
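 *
 * Concretely, the contract described here reduces to the consumer
 * pattern already visible in breadn() above (abridged; credential
 * handling omitted):
 *
 *	bp = getblk(vp, blkno, size, 0, 0, 0);
 *	if ((bp->b_flags & B_CACHE) == 0) {
 *		bp->b_iocmd = BIO_READ;
 *		bp->b_flags &= ~B_INVAL;
 *		bp->b_ioflags &= ~BIO_ERROR;
 *		vfs_busy_pages(bp, 0);
 *		bp->b_iooffset = dbtob(bp->b_blkno);
 *		bstrategy(bp);
 *		error = bufwait(bp);
 *	}
 *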
* * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a bwrite() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo, int flags) { struct buf *bp; struct bufobj *bo; int error; CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size); ASSERT_VOP_LOCKED(vp, "getblk"); if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); bo = &vp->v_bufobj; loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (TD_IS_IDLETHREAD(curthread)) return NULL; mtx_lock(&nblock); needsbuffer |= VFS_BIO_NEED_ANY; mtx_unlock(&nblock); } VI_LOCK(vp); bp = gbincore(bo, blkno); if (bp != NULL) { int lockflags; /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK; if (flags & GB_LOCK_NOWAIT) lockflags |= LK_NOWAIT; error = BUF_TIMELOCK(bp, lockflags, VI_MTX(vp), "getblk", slpflag, slptimeo); /* * If we slept and got the lock we have to restart in case * the buffer changed identities. */ if (error == ENOLCK) goto loop; /* We timed out or were interrupted. */ else if (error) return (NULL); /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Otherwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { /* * If buffer is pinned and caller does * not want sleep waiting for it to be * unpinned, bail out * */ if (bp->b_pin_count > 0) { if (flags & GB_LOCK_NOWAIT) { bqrelse(bp); return (NULL); } else { bunpin_wait(bp); } } bp->b_flags |= B_NOCACHE; bwrite(bp); } else { if (LIST_EMPTY(&bp->b_dep)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; bwrite(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. 
*/ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE * above while extending the buffer, we cannot allow the * buffer to remain with B_CACHE set after the write * completes or it will represent a corrupt state. To * deal with this we set B_NOCACHE to scrap the buffer * after the write. * * We might be able to do something fancy, like setting * B_CACHE in bwrite() except if B_DELWRI is already set, * so the below call doesn't set B_CACHE, but that gets real * confusing. This is much easier. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { bp->b_flags |= B_NOCACHE; bwrite(bp); goto loop; } bp->b_flags &= ~B_DONE; } else { int bsize, maxsize, vmio; off_t offset; /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ VI_UNLOCK(vp); /* * If the user does not want us to create the buffer, bail out * here. */ if (flags & GB_NOCREAT) return NULL; bsize = bo->bo_bsize; offset = blkno * bsize; vmio = vp->v_object != NULL; maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); bp = getnewbuf(slpflag, slptimeo, size, maxsize); if (bp == NULL) { if (slpflag || slptimeo) return NULL; goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. * * Note: this must occur before we associate the buffer * with the vp especially considering limitations in * the splay tree implementation when dealing with duplicate * lblkno's. */ BO_LOCK(bo); if (gbincore(bo, blkno)) { BO_UNLOCK(bo); bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); BO_UNLOCK(bo); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vn_canvmio(vp) != TRUE) printf("getblk: VMIO on vnode type %d\n", vp->v_type); #endif KASSERT(vp->v_object == bp->b_bufobj->bo_object, ("ARGH! different b_bufobj->bo_object %p %p %p\n", bp, vp->v_object, bp->b_bufobj->bo_object)); } else { bp->b_flags &= ~B_VMIO; KASSERT(bp->b_bufobj->bo_object == NULL, ("ARGH! 
has b_bufobj->bo_object %p %p\n", bp, bp->b_bufobj->bo_object)); } allocbuf(bp, size); bp->b_flags &= ~B_DONE; } CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp); KASSERT(BUF_REFCNT(bp) == 1, ("getblk: bp %p not locked",bp)); KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size) { struct buf *bp; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) continue; allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ KASSERT(BUF_REFCNT(bp) == 1, ("geteblk: bp %p not locked",bp)); return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); if (bp->b_flags & B_MALLOC) newbsize = mbsize; else newbsize = round_page(size); if (newbsize < bp->b_bufsize) { /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { atomic_subtract_int( &bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_saveaddr = bp->b_kvabase; bp->b_data = bp->b_saveaddr; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer * grows. */ /* * There is a potential smp race here that could lead * to bufmallocspace slightly passing the max. It * is probably extremely rare and not worth worrying * over. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; atomic_add_int(&bufmallocspace, mbsize); return 1; } origbuf = NULL; origbufsize = 0; /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. 
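 *
 * The backing-store choice made just above can be summarized as
 * (a restatement, not additional logic):
 *
 *	use malloc(9)-backed storage only when
 *		bufmallocspace < maxbufmallocspace	(global cap,
 *							 hibufspace / 20
 *							 per bufinit())
 *		&& bp->b_bufsize == 0			(first allocation)
 *		&& mbsize <= PAGE_SIZE / 2		(small request)
 *
 * and otherwise, or whenever such a buffer later grows, fall back to
 * page-backed KVA via vm_hold_load_pages().
 *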
*/ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { atomic_subtract_int(&bufmallocspace, bp->b_bufsize); bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } } } else { int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { vm_page_t m; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_if_busy(m, TRUE, "biodep")) vm_page_lock_queues(); bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = bp->b_bufobj->bo_object; VM_OBJECT_LOCK(obj); while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { /* * note: must allocate system pages * since blocking here could intefere * with paging I/O, no matter which * process we are. */ m = vm_page_alloc(obj, pi, VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (m == NULL) { atomic_add_int(&vm_pageout_deficit, desiredpages - bp->b_npages); VM_OBJECT_UNLOCK(obj); VM_WAIT; VM_OBJECT_LOCK(obj); } else { bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test VPO_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? */ if ((curproc != pageproc) && (VM_PAGE_INQUEUE1(m, PQ_CACHE)) && - ((cnt.v_free_count + cnt.v_cache_count) < - (cnt.v_free_min + cnt.v_cache_min))) { + ((VMCNT_GET(free_count) + + VMCNT_GET(cache_count)) < + (VMCNT_GET(free_min) + + VMCNT_GET(cache_min)))) { pagedaemon_wakeup(); } vm_page_wire(m); vm_page_unlock_queues(); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. 
We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } VM_OBJECT_UNLOCK(obj); /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } void biodone(struct bio *bp) { void (*done)(struct bio *); mtx_lock(&bdonelock); bp->bio_flags |= BIO_DONE; done = bp->bio_done; if (done == NULL) wakeup(bp); mtx_unlock(&bdonelock); if (done != NULL) done(bp); } /* * Wait for a BIO to finish. * * XXX: resort to a timeout for now. The optimal locking (if any) for this * case is not yet clear. */ int biowait(struct bio *bp, const char *wchan) { mtx_lock(&bdonelock); while ((bp->bio_flags & BIO_DONE) == 0) msleep(bp, &bdonelock, PRIBIO, wchan, hz / 10); mtx_unlock(&bdonelock); if (bp->bio_error != 0) return (bp->bio_error); if (!(bp->bio_flags & BIO_ERROR)) return (0); return (EIO); } void biofinish(struct bio *bp, struct devstat *stat, int error) { if (error) { bp->bio_error = error; bp->bio_flags |= BIO_ERROR; } if (stat != NULL) devstat_end_transaction_bio(stat, bp); biodone(bp); } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into an EINTR * error and cleared. */ int bufwait(struct buf *bp) { if (bp->b_iocmd == BIO_READ) bwait(bp, PRIBIO, "biord"); else bwait(bp, PRIBIO, "biowr"); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. 
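 *
 * dev_strategy() below sets up this callback, roughly:
 *
 *	bip->bio_done = bufdonebio;	completion comes back here
 *	bip->bio_caller2 = bp;		so the struct buf can be found
 *
 * so that a finished bio has its bio_error/bio_flags copied back into
 * the buffer and the buffer is completed with bufdone().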
*/ static void bufdonebio(struct bio *bip) { struct buf *bp; bp = bip->bio_caller2; bp->b_resid = bp->b_bcount - bip->bio_completed; bp->b_resid = bip->bio_resid; /* XXX: remove */ bp->b_ioflags = bip->bio_flags; bp->b_error = bip->bio_error; if (bp->b_error) bp->b_ioflags |= BIO_ERROR; bufdone(bp); g_destroy_bio(bip); } void dev_strategy(struct cdev *dev, struct buf *bp) { struct cdevsw *csw; struct bio *bip; if ((!bp->b_iocmd) || (bp->b_iocmd & (bp->b_iocmd - 1))) panic("b_iocmd botch"); for (;;) { bip = g_new_bio(); if (bip != NULL) break; /* Try again later */ tsleep(&bp, PRIBIO, "dev_strat", hz/10); } bip->bio_cmd = bp->b_iocmd; bip->bio_offset = bp->b_iooffset; bip->bio_length = bp->b_bcount; bip->bio_bcount = bp->b_bcount; /* XXX: remove */ bip->bio_data = bp->b_data; bip->bio_done = bufdonebio; bip->bio_caller2 = bp; bip->bio_dev = dev; KASSERT(dev->si_refcount > 0, ("dev_strategy on un-referenced struct cdev *(%s)", devtoname(dev))); csw = dev_refthread(dev); if (csw == NULL) { g_destroy_bio(bip); bp->b_error = ENXIO; bp->b_ioflags = BIO_ERROR; bufdone(bp); return; } (*csw->d_strategy)(bip); dev_relthread(dev); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void bufdone(struct buf *bp) { struct bufobj *dropobj; void (*biodone)(struct buf *); CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); if (bp->b_iocmd == BIO_WRITE) dropobj = bp->b_bufobj; /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); if (dropobj) bufobj_wdrop(dropobj); return; } bufdone_finish(bp); if (dropobj) bufobj_wdrop(dropobj); } void bufdone_finish(struct buf *bp) { KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); if (!LIST_EMPTY(&bp->b_dep)) buf_complete(bp); if (bp->b_flags & B_VMIO) { int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; boolean_t are_queues_locked; obj = bp->b_bufobj->bo_object; #if defined(VFS_BIO_DEBUG) mp_fixme("usecount and vflag accessed without locks."); if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } KASSERT(vp->v_object != NULL, ("biodone: vnode %p has no vm_object", vp)); #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); VM_OBJECT_LOCK(obj); #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. 
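 *
 * The loop below walks the transfer page by page. For example, with a
 * 4096 byte page and foff = 0x1234, the first pass covers
 * resid = 0x2000 - 0x1234 = 0xdcc bytes (up to the next page boundary,
 * clipped to iosize), and each later pass covers a full page until
 * iosize is exhausted.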
*/ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } if (bp->b_iocmd == BIO_READ) { vm_page_lock_queues(); are_queues_locked = TRUE; } else are_queues_locked = FALSE; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%jd)/m->pindex(%ju) mismatch\n", (intmax_t)foff, (uintmax_t)m->pindex); } #endif /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (!vn_isdisk(vp, NULL)) printf(" iosize: %jd, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t)bp->b_vp->v_mount->mnt_stat.f_iosize, (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %jd, flags: 0x%x, npages: %d\n", (intmax_t) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%lx, dirty: 0x%lx, wired: %d\n", (u_long)m->valid, (u_long)m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } if (are_queues_locked) vm_page_unlock_queues(); vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else bdone(bp); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf *bp) { int i; vm_object_t obj; vm_page_t m; runningbufwakeup(bp); if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; VM_OBJECT_LOCK(obj); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) panic("vfs_unbusy_pages: page missing\n"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_io_finish(m); } vm_object_pip_wakeupn(obj, 0); VM_OBJECT_UNLOCK(obj); } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. 
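 *
 * For example, with a 4096 byte page and off = 0x1234, eoff is rounded
 * up to 0x2000 and then clipped to bp->b_offset + bp->b_bcount (the
 * file EOF as seen through this buffer), so only bytes that were
 * actually read get marked valid and clean.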
*/ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being VPO_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf *bp, int clear_modify) { int i, bogus; vm_object_t obj; vm_ooffset_t foff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; obj = bp->b_bufobj->bo_object; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); VM_OBJECT_LOCK(obj); if (bp->b_bufsize != 0) vfs_setdirty_locked_object(bp); retry: for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (vm_page_sleep_if_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ pmap_remove_all(m); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(obj); if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. 
*/ static void vfs_clean_pages(struct buf *bp) { int i; vm_ooffset_t foff, noff, eoff; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. * */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { int i, n; vm_page_t m; if (!(bp->b_flags & B_VMIO)) return; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); VM_OBJECT_LOCK(bp->b_bufobj->bo_object); vm_page_lock_queues(); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. */ void vfs_bio_clrbuf(struct buf *bp) { int i, j, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) { clrbuf(bp); return; } bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; VM_OBJECT_LOCK(bp->b_bufobj->bo_object); if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { if (bp->b_pages[0] == bogus_page) goto unlock; mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED); if ((bp->b_pages[0]->valid & mask) == mask) goto unlock; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) == 0)) { bzero(bp->b_data, bp->b_bufsize); bp->b_pages[0]->valid |= mask; goto unlock; } } ea = sa = bp->b_data; for(i = 0; i < bp->b_npages; i++, sa = ea) { ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); if (bp->b_pages[i] == bogus_page) continue; j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED); if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) bzero(sa, ea - sa); } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1 << j)) == 0) bzero(sa, DEV_BSIZE); } } bp->b_pages[i]->valid |= mask; } unlock: VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object); bp->b_resid = 0; } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffers address space. 
The pages are anonymous and are * not associated with a file object. */ static void vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could intefere with paging I/O, no matter which * process we are. */ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM | VM_ALLOC_WIRED); if (!p) { atomic_add_int(&vm_pageout_deficit, (to - pg) >> PAGE_SHIFT); VM_OBJECT_UNLOCK(kernel_object); VM_WAIT; VM_OBJECT_LOCK(kernel_object); goto tryagain; } p->valid = VM_PAGE_BITS_ALL; pmap_qenter(pg, &p, 1); bp->b_pages[index] = p; } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = index; } /* Return pages associated with this buf to the vm system */ static void vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; VM_OBJECT_LOCK(kernel_object); for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf( "vm_hold_free_pages: blkno: %jd, lblkno: %jd\n", (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_qremove(pg, 1); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } } VM_OBJECT_UNLOCK(kernel_object); bp->b_npages = newnpages; } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. * * Note that even if the caller determines that the address space should * be valid, a race or a smaller-file mapped into a larger space may * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST * check the return value. */ int vmapbuf(struct buf *bp) { caddr_t addr, kva; vm_prot_t prot; int pidx, i; struct vm_page *m; struct pmap *pmap = &curproc->p_vmspace->vm_pmap; if (bp->b_bufsize < 0) return (-1); prot = VM_PROT_READ; if (bp->b_iocmd == BIO_READ) prot |= VM_PROT_WRITE; /* Less backwards than it looks */ for (addr = (caddr_t)trunc_page((vm_offset_t)bp->b_data), pidx = 0; addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE, pidx++) { /* * Do the vm_fault if needed; do the copy-on-write thing * when reading stuff off device into memory. * * NOTE! Must use pmap_extract() because addr may be in * the userland address space, and kextract is only guarenteed * to work for the kernland address space (see: sparc64 port). */ retry: if (vm_fault_quick(addr >= bp->b_data ? 
addr : bp->b_data, prot) < 0) { vm_page_lock_queues(); for (i = 0; i < pidx; ++i) { vm_page_unhold(bp->b_pages[i]); bp->b_pages[i] = NULL; } vm_page_unlock_queues(); return(-1); } m = pmap_extract_and_hold(pmap, (vm_offset_t)addr, prot); if (m == NULL) goto retry; bp->b_pages[pidx] = m; } if (pidx > btoc(MAXPHYS)) panic("vmapbuf: mapped more than MAXPHYS"); pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx); kva = bp->b_saveaddr; bp->b_npages = pidx; bp->b_saveaddr = bp->b_data; bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK); return(0); } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(struct buf *bp) { int pidx; int npages; npages = bp->b_npages; pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); vm_page_lock_queues(); for (pidx = 0; pidx < npages; pidx++) vm_page_unhold(bp->b_pages[pidx]); vm_page_unlock_queues(); bp->b_data = bp->b_saveaddr; } void bdone(struct buf *bp) { mtx_lock(&bdonelock); bp->b_flags |= B_DONE; wakeup(bp); mtx_unlock(&bdonelock); } void bwait(struct buf *bp, u_char pri, const char *wchan) { mtx_lock(&bdonelock); while ((bp->b_flags & B_DONE) == 0) msleep(bp, &bdonelock, pri, wchan, 0); mtx_unlock(&bdonelock); } int bufsync(struct bufobj *bo, int waitfor, struct thread *td) { return (VOP_FSYNC(bo->__bo_vnode, waitfor, td)); } void bufstrategy(struct bufobj *bo, struct buf *bp) { int i = 0; struct vnode *vp; vp = bp->b_vp; KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy")); KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp)); i = VOP_STRATEGY(vp, bp); KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp)); } void bufobj_wrefl(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); ASSERT_BO_LOCKED(bo); bo->bo_numoutput++; } void bufobj_wref(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wref")); BO_LOCK(bo); bo->bo_numoutput++; BO_UNLOCK(bo); } void bufobj_wdrop(struct bufobj *bo) { KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop")); BO_LOCK(bo); KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count")); if ((--bo->bo_numoutput == 0) && (bo->bo_flag & BO_WWAIT)) { bo->bo_flag &= ~BO_WWAIT; wakeup(&bo->bo_numoutput); } BO_UNLOCK(bo); } int bufobj_wwait(struct bufobj *bo, int slpflag, int timeo) { int error; KASSERT(bo != NULL, ("NULL bo in bufobj_wwait")); ASSERT_BO_LOCKED(bo); error = 0; while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; error = msleep(&bo->bo_numoutput, BO_MTX(bo), slpflag | (PRIBIO + 1), "bo_wwait", timeo); if (error) break; } return (error); } void bpin(struct buf *bp) { mtx_lock(&bpinlock); bp->b_pin_count++; mtx_unlock(&bpinlock); } void bunpin(struct buf *bp) { mtx_lock(&bpinlock); if (--bp->b_pin_count == 0) wakeup(bp); mtx_unlock(&bpinlock); } void bunpin_wait(struct buf *bp) { mtx_lock(&bpinlock); while (bp->b_pin_count > 0) msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0); mtx_unlock(&bpinlock); } #include "opt_ddb.h" #ifdef DDB #include /* DDB command to show buffer data */ DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("buf at %p\n", bp); db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf( "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n" "b_bufobj = (%p), b_data = %p, b_blkno = %jd\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, 
bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } lockmgr_printinfo(&bp->b_lock); } DB_SHOW_COMMAND(lockedbufs, lockedbufs) { struct buf *bp; int i; for (i = 0; i < nbuf; i++) { bp = &buf[i]; if (lockcount(&bp->b_lock)) { db_show_buffer((uintptr_t)bp, 1, 0, NULL); db_printf("\n"); } } } #endif /* DDB */ Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 169666) +++ head/sys/kern/vfs_subr.c (revision 169667) @@ -1,3966 +1,3967 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure"); static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void vbusy(struct vnode *vp); static void vinactive(struct vnode *, struct thread *); static void v_incr_usecount(struct vnode *); static void v_decr_usecount(struct vnode *); static void v_decr_useonly(struct vnode *); static void v_upgrade_usecount(struct vnode *); static void vfree(struct vnode *); static void vnlru_free(int); static void vdestroy(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static int vfs_knllocked(void *arg); /* * Enable Giant pushdown based on whether or not the vm is mpsafe in this * build. Without mpsafevm the buffer cache can not run Giant free. */ int mpsafe_vfs = 1; TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs); SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0, "MPSAFE VFS"); /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed * vnode. */ static unsigned long numvnodes; SYSCTL_LONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of vnodes that are ready for recycling. */ static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* * Free vnode target. Free vnodes may simply be files which have been stat'd * but not read. This is somewhat common, and a small cache of such files * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; SYSCTL_LONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); /* Number of vnodes in the free list. */ static u_long freevnodes; SYSCTL_LONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); /* * Various variables used for debugging the new implementation of * reassignbuf(). * XXX these are probably of (very) limited utility now. */ static int reassignbufcalls; SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); /* * Cache for the mount type id assigned to NFS. This is used for * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c. 
*/ int nfs_mount_type = -1; /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_free_list * numvnodes * freevnodes */ static struct mtx vnode_free_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; /* Set to 1 to print out reclaim of active vnodes */ int prtactive; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* * Number of vnodes we want to exist at any one time. This is mostly used * to size hash tables in vnode-related code. It is normally not used in * getnewvnode(), as wantfreevnodes is normally nonzero.) * * XXX desiredvnodes is historical cruft and should not exist. 
*/ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); SYSCTL_INT(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); /* * Macros to control when a vnode is freed and recycled. All require * the vnode interlock. */ #define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt) #define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt) /* * Initialize the vnode management data structures. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX 100000 #endif static void vntblinit(void *dummy __unused) { /* * Desiredvnodes is a function of the physical memory size and * the kernel's heap size. Specifically, desiredvnodes scales * in proportion to the physical memory size until two fifths * of the kernel's heap size is consumed by vnodes and vm * objects. */ - desiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 * vm_kmem_size / - (5 * (sizeof(struct vm_object) + sizeof(struct vnode)))); + desiredvnodes = min(maxproc + VMCNT_GET(page_count) / 4, 2 * + vm_kmem_size / (5 * (sizeof(struct vm_object) + + sizeof(struct vnode)))); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %d -> %d\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL) /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. */ int vfs_busy(struct mount *mp, int flags, struct mtx *interlkp, struct thread *td) { int lkflags; MNT_ILOCK(mp); MNT_REF(mp); if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) { MNT_REL(mp); MNT_IUNLOCK(mp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); mp->mnt_kern_flag |= MNTK_MWAIT; /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ msleep(mp, MNT_MTX(mp), PVFS, "vfs_busy", 0); MNT_REL(mp); MNT_IUNLOCK(mp); if (interlkp) mtx_lock(interlkp); return (ENOENT); } if (interlkp) mtx_unlock(interlkp); lkflags = LK_SHARED | LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp, struct thread *td) { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td); vfs_rel(mp); } /* * Lookup a mount point by filesystem identifier. 
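 *
 * The mount point is returned referenced (via vfs_ref()), so a caller
 * is expected to drop the reference when done, roughly:
 *
 *	if ((mp = vfs_getvfs(&fsid)) != NULL) {
 *		...
 *		vfs_rel(mp);
 *	}
 *
 * vfs_getnewfsid() below does exactly this with vfs_rel().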
*/ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; /* * If the thread is jailed, but this is not a jail-friendly file * system, deny immediately. */ if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL)) return (EPERM); /* * If the file system was mounted outside a jail and a jailed thread * tries to access it, deny immediately. */ if (!jailed(mp->mnt_cred) && jailed(td->td_ucred)) return (EPERM); /* * If the file system was mounted inside different jail that the jail of * the calling thread, deny immediately. */ if (jailed(mp->mnt_cred) && jailed(td->td_ucred) && mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) { return (EPERM); } if ((mp->mnt_flag & MNT_USER) == 0 || mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static u_int16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_SEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, ""); /* * Get a current timestamp. 
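 *
 * A typical caller just fills in a timespec:
 *
 *	struct timespec ts;
 *
 *	vfs_timestamp(&ts);
 *
 * with the precision selected by the vfs.timestamp_precision sysctl
 * described above.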
*/ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desireable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int vlrureclaim(struct mount *mp) { struct thread *td; struct vnode *vp; int done; int trigger; int usevnodes; int count; /* * Calculate the trigger point, don't allow user * screwups to blow us up. This prevents us from * recycling vnodes with lots of resident pages. We * aren't trying to free memory, we are trying to * free vnodes. */ usevnodes = desiredvnodes; if (usevnodes <= 0) usevnodes = 1; - trigger = cnt.v_page_count * 2 / usevnodes; + trigger = VMCNT_GET(page_count) * 2 / usevnodes; done = 0; td = curthread; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); count = mp->mnt_nvnodelistsize / 10 + 1; while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; if (!VI_TRYLOCK(vp)) goto next_iter; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); goto next_iter; } MNT_IUNLOCK(mp); vholdl(vp); if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT, td)) { vdrop(vp); goto next_iter_mntunlocked; } VI_LOCK(vp); /* * v_usecount may have been bumped after VOP_LOCK() dropped * the vnode interlock and before it was locked again. * * It is not necessary to recheck VI_DOOMED because it can * only be set by another thread that holds both the vnode * lock and vnode interlock. 
If another thread has the * vnode lock before we get to VOP_LOCK() and obtains the * vnode interlock after VOP_LOCK() drops the vnode * interlock, the other thread will be unable to drop the * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK, td); goto next_iter_mntunlocked; } KASSERT((vp->v_iflag & VI_DOOMED) == 0, ("VI_DOOMED unexpectedly detected in vlrureclaim()")); vgonel(vp); VOP_UNLOCK(vp, 0, td); vdropl(vp); done++; next_iter_mntunlocked: if ((count % 256) != 0) goto relock_mnt; goto yield; next_iter: if ((count % 256) != 0) continue; MNT_IUNLOCK(mp); yield: uio_yield(); relock_mnt: MNT_ILOCK(mp); } MNT_IUNLOCK(mp); vn_finished_write(mp); return done; } /* * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) { struct vnode *vp; int vfslocked; mtx_assert(&vnode_free_list_mtx, MA_OWNED); for (; count > 0; count--) { vp = TAILQ_FIRST(&vnode_free_list); /* * The list can be modified while the free_list_mtx * has been dropped and vp could be NULL here. */ if (!vp) break; VNASSERT(vp->v_op != NULL, vp, ("vnlru_free: vnode already reclaimed.")); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); /* * Don't recycle if we can't get the interlock. */ if (!VI_TRYLOCK(vp)) { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); continue; } VNASSERT(VCANRECYCLE(vp), vp, ("vp inconsistent on freelist")); freevnodes--; vp->v_iflag &= ~VI_FREE; vholdl(vp); mtx_unlock(&vnode_free_list_mtx); VI_UNLOCK(vp); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vtryrecycle(vp); VFS_UNLOCK_GIANT(vfslocked); /* * If the recycled succeeded this vdrop will actually free * the vnode. If not it will simply place it back on * the free list. */ vdrop(vp); mtx_lock(&vnode_free_list_mtx); } } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. 
*/ static struct proc *vnlruproc; static int vnlruproc_sig; static void vnlru_proc(void) { struct mount *mp, *nmp; int done; struct proc *p = vnlruproc; struct thread *td = FIRST_THREAD_IN_PROC(p); mtx_lock(&Giant); EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); for (;;) { kthread_suspend_check(p); mtx_lock(&vnode_free_list_mtx); if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } mtx_unlock(&vnode_free_list_mtx); done = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { int vfsunlocked; if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (!VFS_NEEDSGIANT(mp)) { mtx_unlock(&Giant); vfsunlocked = 1; } else vfsunlocked = 0; done += vlrureclaim(mp); if (vfsunlocked) mtx_lock(&Giant); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (done == 0) { EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10); #if 0 /* These messages are temporary debugging aids */ if (vnlru_nowhere < 5) printf("vnlru process getting nowhere..\n"); else if (vnlru_nowhere == 5) printf("vnlru process messages stopped.\n"); #endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else uio_yield(); } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp) /* * Routines having to do with the management of the vnode table. */ static void vdestroy(struct vnode *vp) { struct bufobj *bo; CTR1(KTR_VFS, "vdestroy vp %p", vp); mtx_lock(&vnode_free_list_mtx); numvnodes--; mtx_unlock(&vnode_free_list_mtx); bo = &vp->v_bufobj; VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("cleaned vnode still on the free list.")); VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count")); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VI_UNLOCK(vp); #ifdef MAC mac_destroy_vnode(vp); #endif if (vp->v_pollinfo != NULL) { knlist_destroy(&vp->v_pollinfo->vpi_selinfo.si_note); mtx_destroy(&vp->v_pollinfo->vpi_lock); uma_zfree(vnodepoll_zone, vp->v_pollinfo); } #ifdef INVARIANTS /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); uma_zfree(vnode_zone, vp); } /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). 
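 *
 * vnlru_free() above shows the expected calling sequence, roughly:
 *
 *	vholdl(vp);			keep the vnode from going away
 *	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 *	vtryrecycle(vp);
 *	VFS_UNLOCK_GIANT(vfslocked);
 *	vdrop(vp);			frees it if the recycle succeeded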
*/ static int vtryrecycle(struct vnode *vp) { struct thread *td = curthread; struct mount *vnmp; CTR1(KTR_VFS, "vtryrecycle: trying vp %p", vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT, td) != 0) return (EWOULDBLOCK); /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp, 0, td); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); return (EBUSY); } if ((vp->v_iflag & VI_DOOMED) == 0) vgonel(vp); VOP_UNLOCK(vp, LK_INTERLOCK, td); vn_finished_write(vnmp); CTR1(KTR_VFS, "vtryrecycle: recycled vp %p", vp); return (0); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp = NULL; struct bufobj *bo; mtx_lock(&vnode_free_list_mtx); /* * Lend our context to reclaim vnodes if they've exceeded the max. */ if (freevnodes > wantfreevnodes) vnlru_free(1); /* * Wait for available vnodes. */ if (numvnodes > desiredvnodes) { if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) { /* * File system is beeing suspended, we cannot risk a * deadlock here, so allocate new vnode anyway. */ if (freevnodes > wantfreevnodes) vnlru_free(freevnodes - wantfreevnodes); goto alloc; } if (vnlruproc_sig == 0) { vnlruproc_sig = 1; /* avoid unnecessary wakeups */ wakeup(vnlruproc); } msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ if (numvnodes > desiredvnodes) { mtx_unlock(&vnode_free_list_mtx); return (ENFILE); } #endif } alloc: numvnodes++; mtx_unlock(&vnode_free_list_mtx); vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems * opt-in. */ lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE); /* * Initialize bufobj. */ bo = &vp->v_bufobj; bo->__bo_vnode = vp; bo->bo_mtx = &vp->v_interlock; bo->bo_ops = &buf_ops_bio; bo->bo_private = vp; TAILQ_INIT(&bo->bo_clean.bv_hd); TAILQ_INIT(&bo->bo_dirty.bv_hd); /* * Initialize namecache. */ LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); /* * Finalize various vnode identity bits. */ vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); vp->v_data = 0; #ifdef MAC mac_init_vnode(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_associate_vnode_singlelabel(mp, vp); else if (mp == NULL) printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { bo->bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } CTR2(KTR_VFS, "getnewvnode: mp %p vp %p", mp, vp); *vpp = vp; return (0); } /* * Delete from old mount point vnode list, if on one. 
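 *
 * This undoes what insmntque1() below does: the MNT_REF() reference is
 * released with MNT_REL() and mnt_nvnodelistsize is decremented.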
*/ static void delmntque(struct vnode *vp) { struct mount *mp; mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); vp->v_mount = NULL; VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { struct thread *td; td = curthread; /* XXX ? */ vp->v_data = NULL; vp->v_op = &dead_vnodeops; /* XXX non mp-safe fs may still call insmntque with vnode unlocked */ if (!VOP_ISLOCKED(vp, td)) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && mp->mnt_nvnodelistsize == 0) { MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT, td)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_pip_wait(bo->bo_object, "bovlbx"); VM_OBJECT_UNLOCK(bo->bo_object); } BO_LOCK(bo); } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL) { VM_OBJECT_LOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? TRUE : FALSE); VM_OBJECT_UNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. 
*/ int vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo) { CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); return (bufobj_invalbuf(&vp->v_bufobj, flags, td, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_LOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) { continue; } lblkno = 0; xflags = 0; if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); if (bp->b_bufobj != bo) { /* XXX: necessary ? */ BUF_UNLOCK(bp); BO_LOCK(bo); return (EAGAIN); } /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp != NULL && (nbp->b_bufobj != bo || nbp->b_lblkno != lblkno || (nbp->b_xflags & (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags)) break; /* nbp invalid */ } return (retval); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize) { struct buf *bp, *nbp; int anyfreed; int trunclbn; struct bufobj *bo; CTR2(KTR_VFS, "vtruncbuf vp %p length %jd", vp, length); /* * Round up to the *next* lbn. 
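 *
 * For example, with blksize = 16384 and length = 20000 this gives
 * trunclbn = (20000 + 16383) / 16384 = 2, so logical block 1, which
 * still holds bytes below the new length, is kept and blocks 2 and up
 * are flushed by the loops below.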
*/ trunclbn = (length + blksize - 1) / blksize; ASSERT_VOP_LOCKED(vp, "vtruncbuf"); restart: VI_LOCK(vp); bo = &vp->v_bufobj; anyfreed = 1; for (;anyfreed;) { anyfreed = 0; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI))) { goto restart; } VI_LOCK(vp); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < trunclbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) goto restart; bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = 1; if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) { goto restart; } VI_LOCK(vp); } } if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, VI_MTX(vp)) == ENOLCK) { goto restart; } VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); VI_LOCK(vp); goto restartsync; } } bufobj_wwait(bo, 0, 0); VI_UNLOCK(vp); vnode_pager_setsize(vp, length); return (0); } /* * buf_splay() - splay tree core for the clean/dirty list of buffers in * a vnode. * * NOTE: We have to deal with the special case of a background bitmap * buffer, a situation where two buffers will have the same logical * block offset. We want (1) only the foreground buffer to be accessed * in a lookup and (2) must differentiate between the foreground and * background buffer in the splay tree algorithm because the splay * tree cannot normally handle multiple entities with the same 'index'. * We accomplish this by adding differentiating flags to the splay tree's * numerical domain. */ static struct buf * buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root) { struct buf dummy; struct buf *lefttreemax, *righttreemin, *y; if (root == NULL) return (NULL); lefttreemax = righttreemin = &dummy; for (;;) { if (lblkno < root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_left) == NULL) break; if (lblkno < y->b_lblkno) { /* Rotate right. */ root->b_left = y->b_right; y->b_right = root; root = y; if ((y = root->b_left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->b_left = root; righttreemin = root; } else if (lblkno > root->b_lblkno || (lblkno == root->b_lblkno && (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) { if ((y = root->b_right) == NULL) break; if (lblkno > y->b_lblkno) { /* Rotate left. */ root->b_right = y->b_left; y->b_left = root; root = y; if ((y = root->b_right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->b_right = root; lefttreemax = root; } else { break; } root = y; } /* Assemble the new root. 
*/ lefttreemax->b_right = root->b_left; righttreemin->b_left = root->b_right; root->b_left = dummy.b_right; root->b_right = dummy.b_left; return (root); } static void buf_vlist_remove(struct buf *bp) { struct buf *root; struct bufv *bv; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_LOCKED(bp->b_bufobj); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) != (BX_VNDIRTY|BX_VNCLEAN), ("buf_vlist_remove: Buf %p is on two lists", bp)); if (bp->b_xflags & BX_VNDIRTY) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; if (bp != bv->bv_root) { root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); KASSERT(root == bp, ("splay lookup failed in remove")); } if (bp->b_left == NULL) { root = bp->b_right; } else { root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left); root->b_right = bp->b_right; } bv->bv_root = root; TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list using a * splay tree algorithm. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct buf *root; struct bufv *bv; ASSERT_BO_LOCKED(bo); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root); if (root == NULL) { bp->b_left = NULL; bp->b_right = NULL; TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); } else if (bp->b_lblkno < root->b_lblkno || (bp->b_lblkno == root->b_lblkno && (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) { bp->b_left = root->b_left; bp->b_right = root; root->b_left = NULL; TAILQ_INSERT_BEFORE(root, bp, b_bobufs); } else { bp->b_right = root->b_right; bp->b_left = root; root->b_right = NULL; TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs); } bv->bv_cnt++; bv->bv_root = bp; } /* * Lookup a buffer using the splay tree. Note that we specifically avoid * shadow buffers used in background bitmap writes. * * This code isn't quite efficient as it could be because we are maintaining * two sorted lists and do not know which list the block resides in. * * During a "make buildworld" the desired buffer is found at one of * the roots more than 60% of the time. Thus, checking both roots * before performing either splay eliminates unnecessary splays on the * first tree splayed. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); if ((bp = bo->bo_clean.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_dirty.bv_root) != NULL && bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); if ((bp = bo->bo_clean.bv_root) != NULL) { bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } if ((bp = bo->bo_dirty.bv_root) != NULL) { bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp); if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER)) return (bp); } return (NULL); } /* * Associate a buffer with a vnode. 
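/*
 * buf_splay() is a classic top-down splay: descend with single rotations,
 * hang the pieces off a dummy node's hooks, then reassemble around the new
 * root.  A self-contained userland sketch on plain integer keys (no marker
 * bit, invented struct names), not the kernel routine itself.
 */
#include <stddef.h>

struct tnode {
	long		 key;
	struct tnode	*left;
	struct tnode	*right;
};

/*
 * Bring the node whose key is closest to 'key' to the root.  'dummy'
 * collects the left and right partial trees, just like the on-stack
 * 'dummy' buf above.
 */
static struct tnode *
splay(long key, struct tnode *root)
{
	struct tnode dummy, *ltreemax, *rtreemin, *y;

	if (root == NULL)
		return (NULL);
	dummy.left = dummy.right = NULL;
	ltreemax = rtreemin = &dummy;
	for (;;) {
		if (key < root->key) {
			if ((y = root->left) == NULL)
				break;
			if (key < y->key) {	/* rotate right */
				root->left = y->right;
				y->right = root;
				root = y;
				if (root->left == NULL)
					break;
			}
			rtreemin->left = root;	/* link into right tree */
			rtreemin = root;
			root = root->left;
		} else if (key > root->key) {
			if ((y = root->right) == NULL)
				break;
			if (key > y->key) {	/* rotate left */
				root->right = y->left;
				y->left = root;
				root = y;
				if (root->right == NULL)
					break;
			}
			ltreemax->right = root;	/* link into left tree */
			ltreemax = root;
			root = root->right;
		} else
			break;
	}
	/* Reassemble: hang the collected subtrees off the new root. */
	ltreemax->right = root->left;
	rtreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}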
*/ void bgetvp(struct vnode *vp, struct buf *bp) { VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); ASSERT_VI_LOCKED(vp, "bgetvp"); vholdl(vp); if (VFS_NEEDSGIANT(vp->v_mount) || vp->v_bufobj.bo_flag & BO_NEEDSGIANT) bp->b_flags |= B_NEEDSGIANT; bp->b_vp = vp; bp->b_bufobj = &vp->v_bufobj; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, &vp->v_bufobj, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("brelvp: Buffer %p not on queue.", bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_flags &= ~B_NEEDSGIANT; bp->b_vp = NULL; bp->b_bufobj = NULL; vdropl(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_LOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) static int sync_vnode(struct bufobj *bo, struct thread *td) { struct vnode *vp; struct mount *mp; vp = bo->__bo_vnode; /* XXX */ if (VOP_ISLOCKED(vp, NULL) != 0) return (1); if (VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (1); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); VI_LOCK(vp); if ((bo->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(bo, syncdelay); } vdropl(vp); mtx_lock(&sync_mtx); return (0); } /* * System filesystem synchronizer daemon. 
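/*
 * vn_syncer_add_to_worklist() above queues the bufobj 'delay' seconds ahead
 * of the current hand of a circular array of buckets, clamping the delay and
 * masking with syncer_mask (the array length is a power of two).  A small
 * runnable sketch of just that slot arithmetic; the wheel size and names are
 * made up for the example.
 */
#include <stdio.h>

#define WHEEL_SIZE	16			/* must be a power of two */
#define WHEEL_MASK	(WHEEL_SIZE - 1)	/* plays the role of syncer_mask */

/*
 * Given the wheel's current position and a requested delay in ticks, pick
 * the bucket to queue on.  The delay is clamped below the wheel size so an
 * item cannot wrap all the way around onto the bucket being drained.
 */
static int
wheel_slot(int current, int delay)
{
	if (delay > WHEEL_SIZE - 2)
		delay = WHEEL_SIZE - 2;
	return ((current + delay) & WHEEL_MASK);
}

int
main(void)
{
	printf("slot = %d\n", wheel_slot(14, 5));	/* (14 + 5) & 15 == 3 */
	return (0);
}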
*/ static void sched_sync(void) { struct synclist *next; struct synclist *slp; struct bufobj *bo; long starttime; struct thread *td = FIRST_THREAD_IN_PROC(updateproc); static int dummychan; int last_work_seen; int net_worklist_len; int syncer_final_iter; int first_printf; int error; mtx_lock(&Giant); last_work_seen = 0; syncer_final_iter = 0; first_printf = 1; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kthread_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining..."); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while ((bo = LIST_FIRST(slp)) != NULL) { error = sync_vnode(bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING) msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl", hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. 
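/*
 * The loop above drains the bucket under the wheel's hand once per pass and
 * then either sleeps until the next second or, while rushjob is positive,
 * immediately processes another bucket to catch up.  A hedged userland sketch
 * of only that pacing decision; the bucket work is stubbed out and all names
 * are invented.
 */
#include <stdio.h>

#define WHEEL_SIZE	16

static int rushjob;		/* extra buckets to process without sleeping */

static void
drain_bucket(int slot)
{
	printf("draining bucket %d\n", slot);
}

/*
 * One pass of a syncer-style loop: drain the current bucket, advance the
 * hand, then either consume one unit of rushjob and keep going, or tell the
 * caller to sleep until the next tick.  Returns 1 when the caller should
 * sleep.
 */
static int
syncer_pass(int *slot)
{
	drain_bucket(*slot);
	*slot = (*slot + 1) % WHEEL_SIZE;
	if (rushjob > 0) {
		rushjob--;
		return (0);	/* behind schedule: run another pass now */
	}
	return (1);		/* on schedule: pace to one pass per tick */
}

int
main(void)
{
	int slot = 0, i;

	rushjob = 2;		/* e.g. requested via speedup_syncer() */
	for (i = 0; i < 4; i++)
		if (syncer_pass(&slot))
			printf("sleep until next tick\n");
	return (0);
}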
* We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { struct thread *td; int ret = 0; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { struct thread *td; if (howto & RB_NOSYNC) return; td = FIRST_THREAD_IN_PROC(updateproc); sleepq_remove(td, &lbolt); mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); kproc_shutdown(arg, howto); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; ++reassignbufcalls; CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); /* * Delete from old vnode list, if on one. */ VI_LOCK(vp); if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) buf_vlist_remove(bp); else panic("reassignbuf: Buffer %p not on queue.", bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif VI_UNLOCK(vp); } /* * Increment the use and hold counts on the vnode, taking care to reference * the driver's usecount if this is a chardev. The vholdl() will remove * the vnode from the free list if it is presently free. Requires the * vnode interlock and returns with it held. */ static void v_incr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_incr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } vholdl(vp); } /* * Turn a holdcnt into a use+holdcnt such that only one call to * v_decr_usecount is needed. 
*/ static void v_upgrade_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_upgrade_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); vp->v_usecount++; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount++; dev_unlock(); } } /* * Decrement the vnode use and hold count along with the driver's usecount * if this is a chardev. The vdropl() below releases the vnode interlock * as it may free the vnode. */ static void v_decr_usecount(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_usecount: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_usecount: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } vdropl(vp); } /* * Decrement only the use count and driver use count. This is intended to * be paired with a follow on vdropl() to release the remaining hold count. * In this way we may vgone() a vnode with a 0 usecount without risk of * having it end up on a free list because the hold count is kept above 0. */ static void v_decr_useonly(struct vnode *vp) { CTR3(KTR_VFS, "v_decr_useonly: vp %p holdcnt %d usecount %d\n", vp, vp->v_holdcnt, vp->v_usecount); ASSERT_VI_LOCKED(vp, __FUNCTION__); VNASSERT(vp->v_usecount > 0, vp, ("v_decr_useonly: negative usecount")); vp->v_usecount--; if (vp->v_type == VCHR && vp->v_rdev != NULL) { dev_lock(); vp->v_rdev->si_usecount--; dev_unlock(); } } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set if the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new filesystem type). */ int vget(struct vnode *vp, int flags, struct thread *td) { int oweinact; int oldflags; int error; error = 0; oldflags = flags; oweinact = 0; VFS_ASSERT_GIANT(vp->v_mount); if ((flags & LK_INTERLOCK) == 0) VI_LOCK(vp); /* * If the inactive call was deferred because vput() was called * with a shared lock, we have to do it here before another thread * gets a reference to data that should be dead. */ if (vp->v_iflag & VI_OWEINACT) { if (flags & LK_NOWAIT) { VI_UNLOCK(vp); return (EBUSY); } flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; oweinact = 1; } vholdl(vp); if ((error = vn_lock(vp, flags | LK_INTERLOCK, td)) != 0) { vdrop(vp); return (error); } VI_LOCK(vp); /* Upgrade our holdcnt to a usecount. */ v_upgrade_usecount(vp); if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0) panic("vget: vn_lock failed to return ENOENT\n"); if (oweinact) { if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VI_UNLOCK(vp); if ((oldflags & LK_TYPE_MASK) == 0) VOP_UNLOCK(vp, 0, td); } else VI_UNLOCK(vp); return (0); } /* * Increase the reference count of a vnode. */ void vref(struct vnode *vp) { VI_LOCK(vp); v_incr_usecount(vp); VI_UNLOCK(vp); } /* * Return reference count of a vnode. * * The results of this call are only guaranteed when some mechanism other * than the VI lock is used to stop other processes from gaining references * to the vnode. This may be the case if the caller holds the only reference. * This is also useful when stale data is acceptable as race conditions may * be accounted for by some other means. 
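/*
 * The helpers above maintain two counters with the invariant that every use
 * reference implies a hold reference: v_incr_usecount() bumps both,
 * v_upgrade_usecount() converts an existing hold into a use, and
 * v_decr_useonly() drops only the use side so the object cannot reach the
 * free list while deferred cleanup is still pending.  A standalone sketch of
 * that invariant with invented names; the chardev side-count is omitted.
 */
#include <assert.h>

struct obj {
	int	usecount;	/* active users; each one also holds */
	int	holdcnt;	/* keeps the object off the free list */
};

/* Take a brand-new use reference: bump both counters. */
static void
obj_use(struct obj *o)
{
	o->usecount++;
	o->holdcnt++;
}

/* Convert an already-held reference into a use reference. */
static void
obj_upgrade_hold(struct obj *o)
{
	assert(o->holdcnt > 0);
	o->usecount++;
}

/*
 * Drop only the use side; the caller still owns a hold and releases it once
 * any deferred cleanup has run, so the object stays off the free list.
 */
static void
obj_unuse_keep_hold(struct obj *o)
{
	assert(o->usecount > 0);
	o->usecount--;
}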
*/ int vrefcnt(struct vnode *vp) { int usecnt; VI_LOCK(vp); usecnt = vp->v_usecount; VI_UNLOCK(vp); return (usecnt); } /* * Vnode put/release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(struct vnode *vp) { struct thread *td = curthread; /* XXX */ KASSERT(vp != NULL, ("vrele: null vp")); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vrele: missed vn_close")); if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif VI_UNLOCK(vp); panic("vrele: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); /* * We must call VOP_INACTIVE with the node locked. Mark * as VI_DOINGINACT to avoid recursion. */ vp->v_iflag |= VI_OWEINACT; if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, td) == 0) { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); } else { VI_LOCK(vp); if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; } vdropl(vp); } /* * Release an already locked vnode. This give the same effects as * unlock+vrele(), but takes less time and avoids releasing and * re-aquiring the lock (as vrele() aquires the lock internally.) */ void vput(struct vnode *vp) { struct thread *td = curthread; /* XXX */ int error; KASSERT(vp != NULL, ("vput: null vp")); ASSERT_VOP_LOCKED(vp, "vput"); VFS_ASSERT_GIANT(vp->v_mount); VI_LOCK(vp); /* Skip this v_writecount check if we're going to panic below. */ VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp, ("vput: missed vn_close")); error = 0; if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) && vp->v_usecount == 1)) { VOP_UNLOCK(vp, 0, td); v_decr_usecount(vp); return; } if (vp->v_usecount != 1) { #ifdef DIAGNOSTIC vprint("vput: negative ref count", vp); #endif panic("vput: negative ref cnt"); } /* * We want to hold the vnode until the inactive finishes to * prevent vgone() races. We drop the use count here and the * hold count below when we're done. */ v_decr_useonly(vp); vp->v_iflag |= VI_OWEINACT; if (VOP_ISLOCKED(vp, NULL) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_EXCLUPGRADE|LK_INTERLOCK|LK_NOWAIT, td); VI_LOCK(vp); if (error) { if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; goto done; } } if (vp->v_usecount > 0) vp->v_iflag &= ~VI_OWEINACT; if (vp->v_iflag & VI_OWEINACT) vinactive(vp, td); VOP_UNLOCK(vp, 0, td); done: vdropl(vp); } /* * Somebody doesn't want the vnode recycled. */ void vhold(struct vnode *vp) { VI_LOCK(vp); vholdl(vp); VI_UNLOCK(vp); } void vholdl(struct vnode *vp) { vp->v_holdcnt++; if (VSHOULDBUSY(vp)) vbusy(vp); } /* * Note that there is one less who cares about this vnode. vdrop() is the * opposite of vhold(). */ void vdrop(struct vnode *vp) { VI_LOCK(vp); vdropl(vp); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we will free it if it has been vgone'd otherwise it is * placed on the free list. 
*/ void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, "vdropl"); if (vp->v_holdcnt <= 0) panic("vdrop: holdcnt %d", vp->v_holdcnt); vp->v_holdcnt--; if (vp->v_holdcnt == 0) { if (vp->v_iflag & VI_DOOMED) { vdestroy(vp); return; } else vfree(vp); } VI_UNLOCK(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. * OWEINACT tracks whether a vnode missed a call to inactive due to a * failed lock upgrade. */ static void vinactive(struct vnode *vp, struct thread *td) { ASSERT_VOP_LOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); VOP_INACTIVE(vp, td); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush( struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR1(KTR_VFS, "vflush: mp %p", mp); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp, td)) != 0) return (error); vput(rootvp); } MNT_ILOCK(mp); loop: MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); vholdl(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE, td); if (error) { vdrop(vp); MNT_ILOCK(mp); MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp, 0, td); vdrop(vp); MNT_ILOCK(mp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { error = VOP_GETATTR(vp, &vattr, td->td_ucred, td); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount == 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. 
*/ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { VNASSERT(vp->v_usecount == 0 || (vp->v_type != VCHR && vp->v_type != VBLK), vp, ("device VNODE %p is FORCECLOSED", vp)); vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif } VOP_UNLOCK(vp, 0, td); vdropl(vp); MNT_ILOCK(mp); } MNT_IUNLOCK(mp); if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK, td); vgone(rootvp); VOP_UNLOCK(rootvp, 0, td); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) return (EBUSY); for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp, struct thread *td) { int recycled; ASSERT_VOP_LOCKED(vp, "vrecycle"); recycled = 0; VI_LOCK(vp); if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } VI_UNLOCK(vp); return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } /* * vgone, with the vp interlock held. */ void vgonel(struct vnode *vp) { struct thread *td; int oweinact; int active; struct mount *mp; CTR1(KTR_VFS, "vgonel: vp %p", vp); ASSERT_VOP_LOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_iflag & VI_DOOMED) return; vp->v_iflag |= VI_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount; oweinact = (vp->v_iflag & VI_OWEINACT); VI_UNLOCK(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, td, 0, 0) != 0) vinvalbuf(vp, 0, td, 0, 0); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); if ((vp->v_iflag & VI_DOINGINACT) == 0) vinactive(vp, td); VI_UNLOCK(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag)); /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_tag = "none"; vp->v_type = VBAD; } /* * Calculate the total number of references to a special device. */ int vcount(struct vnode *vp) { int count; dev_lock(); count = vp->v_rdev->si_usecount; dev_unlock(); return (count); } /* * Same as above, but using the struct cdev *as argument */ int count_dev(struct cdev *dev) { int count; dev_lock(); count = dev->si_usecount; dev_unlock(); return(count); } /* * Print out a description of a vnode. 
*/ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[96]; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]); printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n", vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_vflag & VV_ROOT) strcat(buf, "|VV_ROOT"); if (vp->v_vflag & VV_TEXT) strcat(buf, "|VV_TEXT"); if (vp->v_vflag & VV_SYSTEM) strcat(buf, "|VV_SYSTEM"); if (vp->v_vflag & VV_DELETED) strcat(buf, "|VV_DELETED"); if (vp->v_iflag & VI_DOOMED) strcat(buf, "|VI_DOOMED"); if (vp->v_iflag & VI_FREE) strcat(buf, "|VI_FREE"); printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count); printf(" "); lockmgr_printinfo(vp->v_vnlock); printf("\n"); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp, *nmp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ printf("Locked vnodes\n"); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp, NULL)) vprint("", vp); } nmp = TAILQ_NEXT(mp, mnt_list); } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static void vfsconf2x(struct vfsconf *vfsp, struct xvfsconf *xvfsp) { strcpy(xvfsp->vfc_name, vfsp->vfc_name); xvfsp->vfc_typenum = vfsp->vfc_typenum; xvfsp->vfc_refcount = vfsp->vfc_refcount; xvfsp->vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp->vfc_vfsops = NULL; xvfsp->vfc_next = NULL; } /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; struct xvfsconf xvfsp; int error; error = 0; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); error = SYSCTL_OUT(req, &xvfsp, sizeof xvfsp); if (error) break; } return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLFLAG_RD, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; struct xvfsconf xvfsp; printf("WARNING: userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
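/*
 * vn_printf() above builds its flag list by appending "|NAME" for every set
 * bit and then printing from buf + 1 so the leading '|' is skipped.  A tiny
 * standalone version of the same trick; the flag names are invented.
 */
#include <stdio.h>
#include <string.h>

#define F_ROOT	0x01
#define F_TEXT	0x02
#define F_FREE	0x04

static void
print_flags(unsigned flags)
{
	char buf[64];

	buf[0] = '\0';
	buf[1] = '\0';			/* keep buf + 1 valid even if no flag is set */
	if (flags & F_ROOT)
		strcat(buf, "|ROOT");
	if (flags & F_TEXT)
		strcat(buf, "|TEXT");
	if (flags & F_FREE)
		strcat(buf, "|FREE");
	printf("flags (%s)\n", buf + 1);	/* skip the leading '|' */
}

int
main(void)
{
	print_flags(F_ROOT | F_FREE);		/* prints "flags (ROOT|FREE)" */
	return (0);
}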
*/ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); bzero(&xvfsp, sizeof(xvfsp)); vfsconf2x(vfsp, &xvfsp); return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct thread *td = req->td; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp, td); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp; struct thread *td; int error; KASSERT(curthread != NULL, ("vfs_unmountall: NULL curthread")); td = curthread; /* * Since this only runs when rebooting, it is not interlocked. 
*/ while(!TAILQ_EMPTY(&mountlist)) { mp = TAILQ_LAST(&mountlist, mntlist); error = dounmount(mp, MNT_FORCE, td); if (error) { TAILQ_REMOVE(&mountlist, mp, mnt_list); /* * XXX: Due to the way in which we mount the root * file system off of devfs, devfs will generate a * "busy" warning when we try to unmount it before * the root. Don't print a warning as a result in * order to avoid false positive errors that may * cause needless upset. */ if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } else { /* The unmount has removed mp from the mountlist */ } } } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; MNT_ILOCK(mp); MNT_VNODE_FOREACH(vp, mp, mvp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OBJDIRTY) && (flags == MNT_WAIT || VOP_ISLOCKED(vp, NULL) == 0)) { MNT_IUNLOCK(mp); if (!vget(vp, LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK, curthread)) { if (vp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(vp); MNT_ILOCK(mp); continue; } obj = vp->v_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC); VM_OBJECT_UNLOCK(obj); } vput(vp); } MNT_ILOCK(mp); } else VI_UNLOCK(vp); } MNT_IUNLOCK(mp); } /* * Mark a vnode as free, putting it up for recycling. */ static void vfree(struct vnode *vp) { CTR1(KTR_VFS, "vfree vp %p", vp); ASSERT_VI_LOCKED(vp, "vfree"); mtx_lock(&vnode_free_list_mtx); VNASSERT(vp->v_op != NULL, vp, ("vfree: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_FREE) == 0, vp, ("vnode already free")); VNASSERT(VSHOULDFREE(vp), vp, ("vfree: freeing when we shouldn't")); VNASSERT((vp->v_iflag & VI_DOOMED) == 0, vp, ("vfree: Freeing doomed vnode")); if (vp->v_iflag & VI_AGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } /* * Opposite of vfree() - mark a vnode as in use. */ static void vbusy(struct vnode *vp) { CTR1(KTR_VFS, "vbusy vp %p", vp); ASSERT_VI_LOCKED(vp, "vbusy"); VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free")); VNASSERT(vp->v_op != NULL, vp, ("vbusy: vnode already reclaimed.")); mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; vp->v_iflag &= ~(VI_FREE|VI_AGE); mtx_unlock(&vnode_free_list_mtx); } /* * Initalize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; vi = uma_zalloc(vnodepoll_zone, M_WAITOK); if (vp->v_pollinfo != NULL) { uma_zfree(vnodepoll_zone, vi); return; } vp->v_pollinfo = vi; mtx_init(&vp->v_pollinfo->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vp->v_pollinfo->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knllocked); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) 
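/*
 * v_addpollinfo() above allocates the pollinfo before the point where it may
 * no longer sleep, and if another thread installed one in the meantime it
 * simply frees its own copy and uses the winner's.  A hedged pthread-based
 * userland sketch of that "allocate first, discard on lost race" pattern;
 * the structures are invented and the sketch assumes o->lock was initialized
 * by the caller.
 */
#include <pthread.h>
#include <stdlib.h>

struct pollinfo {
	int	events;
};

struct object {
	pthread_mutex_t	 lock;
	struct pollinfo	*pollinfo;
};

/*
 * Allocate outside the lock (the allocation may sleep), then install the
 * result only if nobody beat us to it; on a lost race, free our copy.
 * Returns the installed pollinfo, or NULL on allocation failure.
 */
static struct pollinfo *
object_addpollinfo(struct object *o)
{
	struct pollinfo *pi;

	pi = calloc(1, sizeof(*pi));
	if (pi == NULL)
		return (NULL);
	pthread_mutex_lock(&o->lock);
	if (o->pollinfo == NULL)
		o->pollinfo = pi;
	else
		free(pi);		/* lost the race; keep the existing one */
	pi = o->pollinfo;
	pthread_mutex_unlock(&o->lock);
	return (pi);
}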
*/ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { if (vp->v_pollinfo == NULL) v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return events; } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return 0; } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ ._vop_lock = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; /* * Create a new filesystem syncer vnode for the specified mount point. */ int vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; static long start, incr, next; int error; /* Allocate a new vnode */ if ((error = getnewvnode("syncer", mp, &sync_vnodeops, &vp)) != 0) { mp->mnt_syncer = NULL; return (error); } vp->v_type = VNON; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque failed"); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } VI_LOCK(vp); vn_syncer_add_to_worklist(&vp->v_bufobj, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; mtx_unlock(&sync_mtx); VI_UNLOCK(vp); mp->mnt_syncer = vp; return (0); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; struct thread *td = ap->a_td; int error; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. 
*/ mtx_lock(&mountlist_mtx); if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, td) != 0) { mtx_unlock(&mountlist_mtx); return (0); } if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp, td); return (0); } MNT_ILOCK(mp); mp->mnt_noasync++; mp->mnt_kern_flag &= ~MNTK_ASYNC; MNT_IUNLOCK(mp); vfs_msync(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY, td); MNT_ILOCK(mp); mp->mnt_noasync--; if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0) mp->mnt_kern_flag |= MNTK_ASYNC; MNT_IUNLOCK(mp); vn_finished_write(mp); vfs_unbusy(mp, td); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; VI_LOCK(vp); bo = &vp->v_bufobj; vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } VI_UNLOCK(vp); return (0); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; error = 0; dev_lock(); if (vp->v_type != VCHR) error = ENOTBLK; else if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); if (errp != NULL) *errp = error; return (error == 0); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, credentials, * and optional call-by-reference privused argument allowing vaccess() * to indicate to the caller whether privilege was used to satisfy the * request (obsoleted). Returns 0 on success, or an errno on failure. * * The ifdef'd CAPABILITIES version is here for reference, but is not * actually used. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, mode_t acc_mode, struct ucred *cred, int *privused) { mode_t dac_granted; mode_t priv_granted; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ if (privused != NULL) *privused = 0; dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((acc_mode & dac_granted) == acc_mode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. 
*/ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP, SUSER_ALLOWJAIL)) priv_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_EXEC, SUSER_ALLOWJAIL)) priv_granted |= VEXEC; } if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ, SUSER_ALLOWJAIL)) priv_granted |= VREAD; if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE, SUSER_ALLOWJAIL)) priv_granted |= (VWRITE | VAPPEND); if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN, SUSER_ALLOWJAIL)) priv_granted |= VADMIN; if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) { /* XXX audit: privilege used */ if (privused != NULL) *privused = 1; return (0); } return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, int access) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, access, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to supress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) ((vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, ""); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, ""); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, ""); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. 
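/*
 * vaccess() above first computes the rights granted by the owner, group, or
 * other mode bits for the class the caller falls into and succeeds when the
 * request is a subset, only then falling back to privilege checks.  A compact
 * standalone sketch of just that discretionary step: simplified rights, a
 * single gid instead of the full group set, and no privilege fallback.
 */
#include <stdbool.h>
#include <sys/types.h>

#define R_READ	0x1
#define R_WRITE	0x2
#define R_EXEC	0x4

/*
 * Return true when every right in 'wanted' is covered by the rwx bits of
 * the class (owner, group, other) that the caller belongs to.
 */
static bool
dac_allows(mode_t mode, uid_t fuid, gid_t fgid, uid_t uid, gid_t gid,
    int wanted)
{
	unsigned bits;
	int granted;

	if (uid == fuid)
		bits = (mode >> 6) & 07;	/* owner class */
	else if (gid == fgid)
		bits = (mode >> 3) & 07;	/* group class */
	else
		bits = mode & 07;		/* other class */

	/* The rwx bits map straight onto R_READ/R_WRITE/R_EXEC. */
	granted = ((bits & 04) ? R_READ : 0) |
	    ((bits & 02) ? R_WRITE : 0) |
	    ((bits & 01) ? R_EXEC : 0);

	return ((wanted & granted) == wanted);
}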
*/ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, ""); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter("lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, NULL) == 0) vfs_badlock("is not locked but should be", str, vp); } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #if 0 void assert_vop_elocked_other(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_EXCLOTHER) vfs_badlock("is not exclusive locked by another thread", str, vp); } void assert_vop_slocked(struct vnode *vp, const char *str) { if (vp && !IGNORE_LOCK(vp) && VOP_ISLOCKED(vp, curthread) != LK_SHARED) vfs_badlock("is not locked shared but should be", str, vp); } #endif /* 0 */ #endif /* DEBUG_VFS_LOCKS */ void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp != a->a_fvp) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } void vop_strategy_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. 
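/*
 * vfs_badlock() above funnels every assertion failure through one reporting
 * routine, with each action (print, backtrace, drop into the debugger) gated
 * by its own tunable.  A trivial userland analogue of that tunable-gated
 * reporting hook; the globals and their defaults are invented.
 */
#include <stdio.h>
#include <stdlib.h>

static int check_print = 1;	/* report violations on stderr */
static int check_abort = 0;	/* stop the program on violation */

/* Central reporting hook: every check in the program calls this. */
static void
report_violation(const char *what, const char *where, const void *obj)
{
	if (check_print)
		fprintf(stderr, "%s: %p %s\n", where, obj, what);
	if (check_abort)
		abort();
}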
*/ if ((bp->b_flags & B_CLUSTER) != 0) return; if (BUF_REFCNT(bp) < 1) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter("lock violation"); } #endif } void vop_lookup_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); #endif } void vop_lookup_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_lookup_args *a; struct vnode *dvp; struct vnode *vp; a = ap; dvp = a->a_dvp; vp = *(a->a_vpp); ASSERT_VI_UNLOCKED(dvp, "VOP_LOOKUP"); ASSERT_VOP_LOCKED(dvp, "VOP_LOOKUP"); if (!rc) ASSERT_VOP_LOCKED(vp, "VOP_LOOKUP (child)"); #endif } void vop_lock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct _vop_lock_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_lock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct _vop_lock_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); #endif } void vop_unlock_pre(void *ap) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK"); ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_unlock_post(void *ap, int rc) { #ifdef DEBUG_VFS_LOCKS struct vop_unlock_args *a = ap; if (a->a_flags & LK_INTERLOCK) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK"); #endif } void vop_create_post(void *ap, int rc) { struct vop_create_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE); } } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; if (!rc) { VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE); VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a = ap; if (!rc) { VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE); } } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init(&fs_knlist, NULL, NULL, NULL, NULL); } /* XXX - correct order? 
*/ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, u_int32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { 0, filt_fsattach, filt_fsdetach, filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { 1, NULL, filt_vfsdetach, filt_vfsread }; static struct filterops vfswrite_filtops = { 1, NULL, filt_vfsdetach, filt_vfswrite }; static struct filterops vfsvnode_filtops = { 1, NULL, filt_vfsdetach, filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp, 0, curthread); } static int vfs_knllocked(void *arg) { struct vnode *vp = arg; return (VOP_ISLOCKED(vp, curthread) == LK_EXCLUSIVE); } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; if (vp->v_pollinfo == NULL) v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. 
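/*
 * init_va_filerev() above seeds the file revision from the boot-relative
 * bintime, packing the seconds into the high 32 bits and the top 32 bits of
 * the fixed-point fraction into the low 32 bits, so successive calls yield
 * increasing values.  A standalone sketch of the same packing that starts
 * from an ordinary timespec and converts nanoseconds to a 32-bit fraction,
 * an approximation of bintime's representation.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/*
 * Pack a time into a 64-bit, monotonically increasing revision: seconds in
 * the high 32 bits, a 32-bit binary fraction of the second in the low bits.
 */
static uint64_t
make_filerev(const struct timespec *ts)
{
	uint64_t frac32;

	frac32 = ((uint64_t)ts->tv_nsec << 32) / 1000000000ULL;
	return (((uint64_t)ts->tv_sec << 32) | frac32);
}

int
main(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	printf("filerev = %llu\n", (unsigned long long)make_filerev(&ts));
	return (0);
}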
*/ if (hint == NOTE_REVOKE) { kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) return (0); kn->kn_data = va.va_size - kn->kn_fp->f_offset; return (kn->kn_data != 0); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE) { kn->kn_flags |= EV_EOF; return (1); } return (kn->kn_fflags != 0); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; return (0); } /* * Mark for update the access time of the file if the filesystem * supports VA_MARK_ATIME. This functionality is used by execve * and mmap, so we want to avoid the synchronous I/O implied by * directly setting va_atime for the sake of efficiency. */ void vfs_mark_atime(struct vnode *vp, struct thread *td) { struct vattr atimeattr; if ((vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) { VATTR_NULL(&atimeattr); atimeattr.va_vaflags |= VA_MARK_ATIME; (void)VOP_SETATTR(vp, &atimeattr, td->td_ucred, td); } } Index: head/sys/nfsclient/nfs_bio.c =================================================================== --- head/sys/nfsclient/nfs_bio.c (revision 169666) +++ head/sys/nfsclient/nfs_bio.c (revision 169667) @@ -1,1802 +1,1802 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct buf *nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td); static int nfs_directio_write(struct vnode *vp, struct uio *uiop, struct ucred *cred, int ioflag); extern int nfs_directio_enable; extern int nfs_directio_allow_mmap; /* * Vnode op for VM getpages. */ int nfs_getpages(struct vop_getpages_args *ap) { int i, error, nextoff, size, toff, count, npages; struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; struct vnode *vp; struct thread *td; struct ucred *cred; struct nfsmount *nmp; vm_object_t object; vm_page_t *pages; struct nfsnode *np; vp = ap->a_vp; np = VTONFS(vp); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; if ((object = vp->v_object) == NULL) { nfs_printf("nfs_getpages: called with non-merged cache vnode??\n"); return VM_PAGER_ERROR; } if (nfs_directio_enable && !nfs_directio_allow_mmap) { mtx_lock(&np->n_mtx); if ((np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { mtx_unlock(&np->n_mtx); nfs_printf("nfs_getpages: called on non-cacheable vnode??\n"); return VM_PAGER_ERROR; } else mtx_unlock(&np->n_mtx); } mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); /* We'll never get here for v4, because we always have fsinfo */ (void)nfs_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); npages = btoc(count); /* * If the requested page is partially valid, just return it and * allow the pager to zero-out the blanks. Partially valid pages * can only occur at the file EOF. */ { vm_page_t m = pages[ap->a_reqpage]; VM_OBJECT_LOCK(object); vm_page_lock_queues(); if (m->valid != 0) { /* handled by vm_fault now */ /* vm_page_zero_invalid(m, TRUE); */ for (i = 0; i < npages; ++i) { if (i != ap->a_reqpage) vm_page_free(pages[i]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return(0); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); } /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. 
*/ bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - cnt.v_vnodein++; - cnt.v_vnodepgsin += npages; + VMCNT_ADD(vnodein, 1); + VMCNT_ADD(vnodepgsin, npages); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = IDX_TO_OFF(pages[0]->pindex); uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = (nmp->nm_rpcops->nr_readrpc)(vp, &uio, cred); pmap_qremove(kva, npages); relpbuf(bp, &nfs_pbuf_freecnt); if (error && (uio.uio_resid == count)) { nfs_printf("nfs_getpages: error %d\n", error); VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < npages; ++i) { if (i != ap->a_reqpage) vm_page_free(pages[i]); } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return VM_PAGER_ERROR; } /* * Calculate the number of bytes read and validate only that number * of bytes. Note that due to pending writes, size may be 0. This * does not mean that the remaining data is invalid! */ size = count - uio.uio_resid; VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0, toff = 0; i < npages; i++, toff = nextoff) { vm_page_t m; nextoff = toff + PAGE_SIZE; m = pages[i]; if (nextoff <= size) { /* * Read operation filled an entire page */ m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); } else if (size > toff) { /* * Read operation filled a partial page. */ m->valid = 0; vm_page_set_validclean(m, 0, size - toff); /* handled by vm_fault now */ /* vm_page_zero_invalid(m, TRUE); */ } else { /* * Read operation was short. If no error occured * we may have hit a zero-fill section. We simply * leave valid set to 0. */ ; } if (i != ap->a_reqpage) { /* * Whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere (it already is in the object). Result: * It appears that emperical results show that * deactivating pages is best. */ /* * Just in case someone was asking for this page we * now tell them that it is ok to use. */ if (!error) { if (m->oflags & VPO_WANTED) vm_page_activate(m); else vm_page_deactivate(m); vm_page_wakeup(m); } else { vm_page_free(m); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return 0; } /* * Vnode op for VM putpages. */ int nfs_putpages(struct vop_putpages_args *ap) { struct uio uio; struct iovec iov; vm_offset_t kva; struct buf *bp; int iomode, must_commit, i, error, npages, count; off_t offset; int *rtvals; struct vnode *vp; struct thread *td; struct ucred *cred; struct nfsmount *nmp; struct nfsnode *np; vm_page_t *pages; vp = ap->a_vp; np = VTONFS(vp); td = curthread; /* XXX */ cred = curthread->td_ucred; /* XXX */ nmp = VFSTONFS(vp->v_mount); pages = ap->a_m; count = ap->a_count; rtvals = ap->a_rtvals; npages = btoc(count); offset = IDX_TO_OFF(pages[0]->pindex); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)nfs_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); mtx_lock(&np->n_mtx); if (nfs_directio_enable && !nfs_directio_allow_mmap && (np->n_flag & NNONCACHE) && (vp->v_type == VREG)) { mtx_unlock(&np->n_mtx); nfs_printf("nfs_putpages: called on noncache-able vnode??\n"); mtx_lock(&np->n_mtx); } for (i = 0; i < npages; i++) rtvals[i] = VM_PAGER_AGAIN; /* * When putting pages, do not extend file past EOF. 
*/ if (offset + count > np->n_size) { count = np->n_size - offset; if (count < 0) count = 0; } mtx_unlock(&np->n_mtx); /* * We use only the kva address for the buffer, but this is extremely * convienient and fast. */ bp = getpbuf(&nfs_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; pmap_qenter(kva, pages, npages); - cnt.v_vnodeout++; - cnt.v_vnodepgsout += count; + VMCNT_ADD(vnodeout, 1); + VMCNT_ADD(vnodepgsout, count); iov.iov_base = (caddr_t) kva; iov.iov_len = count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = offset; uio.uio_resid = count; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); pmap_qremove(kva, npages); relpbuf(bp, &nfs_pbuf_freecnt); if (!error) { int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE; for (i = 0; i < nwritten; i++) { rtvals[i] = VM_PAGER_OK; vm_page_undirty(pages[i]); } if (must_commit) { nfs_clearcommit(vp->v_mount); } } return rtvals[0]; } /* * For nfs, cache consistency can only be maintained approximately. * Although RFC1094 does not specify the criteria, the following is * believed to be compatible with the reference port. * For nfs: * If the file's modify time on the server has changed since the * last read rpc or you have written to the file, * you may have lost data cache consistency with the * server, so flush all of the file's data out of the cache. * Then force a getattr rpc to ensure that you have up to date * attributes. * NB: This implies that cache data can be read when up to * NFS_ATTRTIMEO seconds out of date. If you find that you need current * attributes this could be forced by setting n_attrstamp to 0 before * the VOP_GETATTR() call. */ static inline int nfs_bioread_check_cons(struct vnode *vp, struct thread *td, struct ucred *cred) { int error = 0; struct vattr vattr; struct nfsnode *np = VTONFS(vp); int old_lock; struct nfsmount *nmp = VFSTONFS(vp->v_mount); /* * Grab the exclusive lock before checking whether the cache is * consistent. * XXX - We can make this cheaper later (by acquiring cheaper locks). * But for now, this suffices. 
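The two nfs_bio.c hunks above (and the pc98 machdep.c hunk later in this diff) all apply the same mechanical substitution: open-coded accesses to the global vmmeter `cnt' become VMCNT_ADD()/VMCNT_GET() calls. A minimal before/after sketch using the counters touched above; the macro definitions themselves are outside this diff, so nothing is assumed here about how they expand:

	/* Old idiom, as removed above: direct updates of the global vmmeter. */
	cnt.v_vnodein++;
	cnt.v_vnodepgsin += npages;

	/* New idiom, as added above: the same updates via the accessor macros. */
	VMCNT_ADD(vnodein, 1);
	VMCNT_ADD(vnodepgsin, npages);

	/* Reads follow the same pattern, e.g. cnt.v_free_count becomes: */
	npages_free = VMCNT_GET(free_count);	/* npages_free is illustrative only */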
*/ old_lock = nfs_upgrade_vnlock(vp, td); mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); if (vp->v_type != VREG) { if (vp->v_type != VDIR) panic("nfs: bioread, not dir"); (nmp->nm_rpcops->nr_invaldir)(vp); error = nfs_vinvalbuf(vp, V_SAVE, td, 1); if (error) goto out; } np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, td); if (error) goto out; mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; mtx_unlock(&np->n_mtx); } else { mtx_unlock(&np->n_mtx); error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); mtx_lock(&np->n_mtx); if ((np->n_flag & NSIZECHANGED) || (NFS_TIMESPEC_COMPARE(&np->n_mtime, &vattr.va_mtime))) { mtx_unlock(&np->n_mtx); if (vp->v_type == VDIR) (nmp->nm_rpcops->nr_invaldir)(vp); error = nfs_vinvalbuf(vp, V_SAVE, td, 1); if (error) goto out; mtx_lock(&np->n_mtx); np->n_mtime = vattr.va_mtime; np->n_flag &= ~NSIZECHANGED; } mtx_unlock(&np->n_mtx); } out: nfs_downgrade_vnlock(vp, td, old_lock); return error; } /* * Vnode op for read using bio */ int nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred) { struct nfsnode *np = VTONFS(vp); int biosize, i; struct buf *bp, *rabp; struct thread *td; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn, rabn; int bcount; int seqcount; int nra, error = 0, n = 0, on = 0; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("nfs_read mode"); #endif if (uio->uio_resid == 0) return (0); if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ return (EINVAL); td = uio->uio_td; mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)nfs_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); if (vp->v_type != VDIR && (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); if (nfs_directio_enable && (ioflag & IO_DIRECT) && (vp->v_type == VREG)) /* No caching/ no readaheads. Just read data into the user buffer */ return nfs_readrpc(vp, uio, cred); biosize = vp->v_mount->mnt_stat.f_iosize; seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); error = nfs_bioread_check_cons(vp, td, cred); if (error) return error; do { u_quad_t nsize; mtx_lock(&np->n_mtx); nsize = np->n_size; mtx_unlock(&np->n_mtx); switch (vp->v_type) { case VREG: nfsstats.biocache_reads++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); /* * Start the read ahead(s), as required. */ if (nmp->nm_readahead > 0) { for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < nsize; nra++) { rabn = lbn + 1 + nra; if (incore(&vp->v_bufobj, rabn) == NULL) { rabp = nfs_getcacheblk(vp, rabn, biosize, td); if (!rabp) { error = nfs_sigintr(nmp, NULL, td); return (error ? error : EINTR); } if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (nfs_asyncio(nmp, rabp, cred, td)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); break; } } else { brelse(rabp); } } } } /* Note that bcount is *not* DEV_BSIZE aligned. */ bcount = biosize; if ((off_t)lbn * biosize >= nsize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > nsize) { bcount = nsize - (off_t)lbn * biosize; } bp = nfs_getcacheblk(vp, lbn, bcount, td); if (!bp) { error = nfs_sigintr(nmp, NULL, td); return (error ? error : EINTR); } /* * If B_CACHE is not set, we must issue the read. If this * fails, we return an error. 
*/ if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(vp, bp, cred, td); if (error) { brelse(bp); return (error); } } /* * on is the offset into the current bp. Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount) n = min((unsigned)(bcount - on), uio->uio_resid); break; case VLNK: nfsstats.biocache_readlinks++; bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td); if (!bp) { error = nfs_sigintr(nmp, NULL, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(vp, bp, cred, td); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); return (error); } } n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid); on = 0; break; case VDIR: nfsstats.biocache_readdirs++; if (np->n_direofoffset && uio->uio_offset >= np->n_direofoffset) { return (0); } lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; on = uio->uio_offset & (NFS_DIRBLKSIZ - 1); bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td); if (!bp) { error = nfs_sigintr(nmp, NULL, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(vp, bp, cred, td); if (error) { brelse(bp); } while (error == NFSERR_BAD_COOKIE) { (nmp->nm_rpcops->nr_invaldir)(vp); error = nfs_vinvalbuf(vp, 0, td, 1); /* * Yuck! The directory has been modified on the * server. The only way to get the block is by * reading from the beginning to get all the * offset cookies. * * Leave the last bp intact unless there is an error. * Loop back up to the while if the error is another * NFSERR_BAD_COOKIE (double yuch!). */ for (i = 0; i <= lbn && !error; i++) { if (np->n_direofoffset && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) return (0); bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td); if (!bp) { error = nfs_sigintr(nmp, NULL, td); return (error ? error : EINTR); } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(vp, bp, cred, td); /* * no error + B_INVAL == directory EOF, * use the block. */ if (error == 0 && (bp->b_flags & B_INVAL)) break; } /* * An error will throw away the block and the * for loop will break out. If no error and this * is not the block we want, we throw away the * block and go for the next one via the for loop. */ if (error || i < lbn) brelse(bp); } } /* * The above while is repeated if we hit another cookie * error. If we hit an error and it wasn't a cookie error, * we give up. */ if (error) return (error); } /* * If not eof and read aheads are enabled, start one. * (You need the current block first, so that you have the * directory offset cookie of the next block.) */ if (nmp->nm_readahead > 0 && (bp->b_flags & B_INVAL) == 0 && (np->n_direofoffset == 0 || (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) && incore(&vp->v_bufobj, lbn + 1) == NULL) { rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td); if (rabp) { if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { rabp->b_flags |= B_ASYNC; rabp->b_iocmd = BIO_READ; vfs_busy_pages(rabp, 0); if (nfs_asyncio(nmp, rabp, cred, td)) { rabp->b_flags |= B_INVAL; rabp->b_ioflags |= BIO_ERROR; vfs_unbusy_pages(rabp); brelse(rabp); } } else { brelse(rabp); } } } /* * Unlike VREG files, whos buffer size ( bp->b_bcount ) is * chopped for the EOF condition, we cannot tell how large * NFS directories are going to be until we hit EOF. 
So * an NFS directory buffer is *not* chopped to its EOF. Now, * it just so happens that b_resid will effectively chop it * to EOF. *BUT* this information is lost if the buffer goes * away and is reconstituted into a B_CACHE state ( due to * being VMIO ) later. So we keep track of the directory eof * in np->n_direofoffset and chop it off as an extra step * right here. */ n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on); if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset) n = np->n_direofoffset - uio->uio_offset; break; default: nfs_printf(" nfs_bioread: type %x unexpected\n", vp->v_type); bp = NULL; break; }; if (n > 0) { error = uiomove(bp->b_data + on, (int)n, uio); } if (vp->v_type == VLNK) n = 0; if (bp != NULL) brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n > 0); return (error); } /* * The NFS write path cannot handle iovecs with len > 1. So we need to * break up iovecs accordingly (restricting them to wsize). * For the SYNC case, we can do this with 1 copy (user buffer -> mbuf). * For the ASYNC case, 2 copies are needed. The first a copy from the * user buffer to a staging buffer and then a second copy from the staging * buffer to mbufs. This can be optimized by copying from the user buffer * directly into mbufs and passing the chain down, but that requires a * fair amount of re-working of the relevant codepaths (and can be done * later). */ static int nfs_directio_write(vp, uiop, cred, ioflag) struct vnode *vp; struct uio *uiop; struct ucred *cred; int ioflag; { int error; struct nfsmount *nmp = VFSTONFS(vp->v_mount); struct thread *td = uiop->uio_td; int size; int wsize; mtx_lock(&nmp->nm_mtx); wsize = nmp->nm_wsize; mtx_unlock(&nmp->nm_mtx); if (ioflag & IO_SYNC) { int iomode, must_commit; struct uio uio; struct iovec iov; do_sync: while (uiop->uio_resid > 0) { size = min(uiop->uio_resid, wsize); size = min(uiop->uio_iov->iov_len, size); iov.iov_base = uiop->uio_iov->iov_base; iov.iov_len = size; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = uiop->uio_offset; uio.uio_resid = size; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_WRITE; uio.uio_td = td; iomode = NFSV3WRITE_FILESYNC; error = (nmp->nm_rpcops->nr_writerpc)(vp, &uio, cred, &iomode, &must_commit); KASSERT((must_commit == 0), ("nfs_directio_write: Did not commit write")); if (error) return (error); uiop->uio_offset += size; uiop->uio_resid -= size; if (uiop->uio_iov->iov_len <= size) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + size; uiop->uio_iov->iov_len -= size; } } } else { struct uio *t_uio; struct iovec *t_iov; struct buf *bp; /* * Break up the write into blocksize chunks and hand these * over to nfsiod's for write back. * Unfortunately, this incurs a copy of the data. Since * the user could modify the buffer before the write is * initiated. * * The obvious optimization here is that one of the 2 copies * in the async write path can be eliminated by copying the * data here directly into mbufs and passing the mbuf chain * down. But that will require a fair amount of re-working * of the code and can be done if there's enough interest * in NFS directio access. 
*/ while (uiop->uio_resid > 0) { size = min(uiop->uio_resid, wsize); size = min(uiop->uio_iov->iov_len, size); bp = getpbuf(&nfs_pbuf_freecnt); t_uio = malloc(sizeof(struct uio), M_NFSDIRECTIO, M_WAITOK); t_iov = malloc(sizeof(struct iovec), M_NFSDIRECTIO, M_WAITOK); t_iov->iov_base = malloc(size, M_NFSDIRECTIO, M_WAITOK); t_iov->iov_len = size; t_uio->uio_iov = t_iov; t_uio->uio_iovcnt = 1; t_uio->uio_offset = uiop->uio_offset; t_uio->uio_resid = size; t_uio->uio_segflg = UIO_SYSSPACE; t_uio->uio_rw = UIO_WRITE; t_uio->uio_td = td; bcopy(uiop->uio_iov->iov_base, t_iov->iov_base, size); bp->b_flags |= B_DIRECT; bp->b_iocmd = BIO_WRITE; if (cred != NOCRED) { crhold(cred); bp->b_wcred = cred; } else bp->b_wcred = NOCRED; bp->b_caller1 = (void *)t_uio; bp->b_vp = vp; error = nfs_asyncio(nmp, bp, NOCRED, td); if (error) { free(t_iov->iov_base, M_NFSDIRECTIO); free(t_iov, M_NFSDIRECTIO); free(t_uio, M_NFSDIRECTIO); bp->b_vp = NULL; relpbuf(bp, &nfs_pbuf_freecnt); if (error == EINTR) return (error); goto do_sync; } uiop->uio_offset += size; uiop->uio_resid -= size; if (uiop->uio_iov->iov_len <= size) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + size; uiop->uio_iov->iov_len -= size; } } } return (0); } /* * Vnode op for write using bio */ int nfs_write(struct vop_write_args *ap) { int biosize; struct uio *uio = ap->a_uio; struct thread *td = uio->uio_td; struct vnode *vp = ap->a_vp; struct nfsnode *np = VTONFS(vp); struct ucred *cred = ap->a_cred; int ioflag = ap->a_ioflag; struct buf *bp; struct vattr vattr; struct nfsmount *nmp = VFSTONFS(vp->v_mount); daddr_t lbn; int bcount; int n, on, error = 0; struct proc *p = td?td->td_proc:NULL; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("nfs_write mode"); if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread) panic("nfs_write proc"); #endif if (vp->v_type != VREG) return (EIO); mtx_lock(&np->n_mtx); if (np->n_flag & NWRITEERR) { np->n_flag &= ~NWRITEERR; mtx_unlock(&np->n_mtx); return (np->n_error); } else mtx_unlock(&np->n_mtx); mtx_lock(&nmp->nm_mtx); if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { mtx_unlock(&nmp->nm_mtx); (void)nfs_fsinfo(nmp, vp, cred, td); } else mtx_unlock(&nmp->nm_mtx); /* * Synchronously flush pending buffers if we are in synchronous * mode or if we are appending. */ if (ioflag & (IO_APPEND | IO_SYNC)) { mtx_lock(&np->n_mtx); if (np->n_flag & NMODIFIED) { mtx_unlock(&np->n_mtx); #ifdef notyet /* Needs matching nonblock semantics elsewhere, too. */ /* * Require non-blocking, synchronous writes to * dirty files to inform the program it needs * to fsync(2) explicitly. */ if (ioflag & IO_NDELAY) return (EAGAIN); #endif flush_and_restart: np->n_attrstamp = 0; error = nfs_vinvalbuf(vp, V_SAVE, td, 1); if (error) return (error); } else mtx_unlock(&np->n_mtx); } /* * If IO_APPEND then load uio_offset. We restart here if we cannot * get the append lock. 
*/ if (ioflag & IO_APPEND) { np->n_attrstamp = 0; error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); mtx_lock(&np->n_mtx); uio->uio_offset = np->n_size; mtx_unlock(&np->n_mtx); } if (uio->uio_offset < 0) return (EINVAL); if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) return (EFBIG); if (uio->uio_resid == 0) return (0); if (nfs_directio_enable && (ioflag & IO_DIRECT) && vp->v_type == VREG) return nfs_directio_write(vp, uio, cred, ioflag); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, i don't think it matters */ if (p != NULL) { PROC_LOCK(p); if (uio->uio_offset + uio->uio_resid > lim_cur(p, RLIMIT_FSIZE)) { psignal(p, SIGXFSZ); PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); } biosize = vp->v_mount->mnt_stat.f_iosize; /* * Find all of this file's B_NEEDCOMMIT buffers. If our writes * would exceed the local maximum per-file write commit size when * combined with those, we must decide whether to flush, * go synchronous, or return error. We don't bother checking * IO_UNIT -- we just make all writes atomic anyway, as there's * no point optimizing for something that really won't ever happen. */ if (!(ioflag & IO_SYNC)) { int nflag; mtx_lock(&np->n_mtx); nflag = np->n_flag; mtx_unlock(&np->n_mtx); int needrestart = 0; if (nmp->nm_wcommitsize < uio->uio_resid) { /* * If this request could not possibly be completed * without exceeding the maximum outstanding write * commit size, see if we can convert it into a * synchronous write operation. */ if (ioflag & IO_NDELAY) return (EAGAIN); ioflag |= IO_SYNC; if (nflag & NMODIFIED) needrestart = 1; } else if (nflag & NMODIFIED) { int wouldcommit = 0; BO_LOCK(&vp->v_bufobj); if (vp->v_bufobj.bo_dirty.bv_cnt != 0) { TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) { if (bp->b_flags & B_NEEDCOMMIT) wouldcommit += bp->b_bcount; } } BO_UNLOCK(&vp->v_bufobj); /* * Since we're not operating synchronously and * bypassing the buffer cache, we are in a commit * and holding all of these buffers whether * transmitted or not. If not limited, this * will lead to the buffer cache deadlocking, * as no one else can flush our uncommitted buffers. */ wouldcommit += uio->uio_resid; /* * If we would initially exceed the maximum * outstanding write commit size, flush and restart. */ if (wouldcommit > nmp->nm_wcommitsize) needrestart = 1; } if (needrestart) goto flush_and_restart; } do { nfsstats.biocache_writes++; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize-1); n = min((unsigned)(biosize - on), uio->uio_resid); again: /* * Handle direct append and file extension cases, calculate * unaligned buffer size. */ mtx_lock(&np->n_mtx); if (uio->uio_offset == np->n_size && n) { mtx_unlock(&np->n_mtx); /* * Get the buffer (in its pre-append state to maintain * B_CACHE if it was previously set). Resize the * nfsnode after we have locked the buffer to prevent * readers from reading garbage. */ bcount = on; bp = nfs_getcacheblk(vp, lbn, bcount, td); if (bp != NULL) { long save; mtx_lock(&np->n_mtx); np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); mtx_unlock(&np->n_mtx); save = bp->b_flags & B_CACHE; bcount += n; allocbuf(bp, bcount); bp->b_flags |= save; } } else { /* * Obtain the locked cache block first, and then * adjust the file's size as appropriate. 
*/ bcount = on + n; if ((off_t)lbn * biosize + bcount < np->n_size) { if ((off_t)(lbn + 1) * biosize < np->n_size) bcount = biosize; else bcount = np->n_size - (off_t)lbn * biosize; } mtx_unlock(&np->n_mtx); bp = nfs_getcacheblk(vp, lbn, bcount, td); mtx_lock(&np->n_mtx); if (uio->uio_offset + n > np->n_size) { np->n_size = uio->uio_offset + n; np->n_flag |= NMODIFIED; vnode_pager_setsize(vp, np->n_size); } mtx_unlock(&np->n_mtx); } if (!bp) { error = nfs_sigintr(nmp, NULL, td); if (!error) error = EINTR; break; } /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); error = nfs_doio(vp, bp, cred, td); if (error) { brelse(bp); break; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); mtx_lock(&np->n_mtx); np->n_flag |= NMODIFIED; mtx_unlock(&np->n_mtx); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { nfs_printf("NFS append race @%lx:%d\n", (long)bp->b_blkno * DEV_BSIZE, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. * * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { if (bwrite(bp) == EINTR) { error = EINTR; break; } goto again; } error = uiomove((char *)bp->b_data + on, n, uio); /* * Since this block is being modified, it must be written * again and not just committed. Since write clustering does * not work for the stage 1 data write, only the stage 2 * commit rpc, we have to clear B_CLUSTEROK as well. */ bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); if (error) { bp->b_ioflags |= BIO_ERROR; brelse(bp); break; } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. 
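The contiguity test just above and the min()/max() merge that follows are easier to see with concrete numbers; the values below are invented purely for illustration.

	/*
	 * Worked example (invented numbers) for the dirty-region logic:
	 * existing dirty range [b_dirtyoff, b_dirtyend) = [1024, 4096).
	 *
	 *   A write at on = 3000, n = 2000:  3000 <= 4096 and 5000 >= 1024,
	 *   so the test above does not fire; the merge below widens the
	 *   range to [min(1024, 3000), max(4096, 5000)) = [1024, 5000).
	 *
	 *   A write at on = 8192, n = 512:  8192 > b_dirtyend, so merging
	 *   would claim the hole [4096, 8192) is dirty without valid data;
	 *   instead the old range is bwrite()n and the write is retried
	 *   via the "goto again" above.
	 */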
*/ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = min(on, bp->b_dirtyoff); bp->b_dirtyend = max((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_validclean(bp, on, n); } /* * If IO_SYNC do bwrite(). * * IO_INVAL appears to be unused. The idea appears to be * to turn off caching in this case. Very odd. XXX */ if ((ioflag & IO_SYNC)) { if (ioflag & IO_INVAL) bp->b_flags |= B_NOCACHE; error = bwrite(bp); if (error) break; } else if ((n + on) == biosize) { bp->b_flags |= B_ASYNC; (void) (nmp->nm_rpcops->nr_writebp)(bp, 0, NULL); } else { bdwrite(bp); } } while (uio->uio_resid > 0 && n > 0); return (error); } /* * Get an nfs cache block. * * Allocate a new one if the block isn't currently in the cache * and return the block marked busy. If the calling process is * interrupted by a signal for an interruptible mount point, return * NULL. * * The caller must carefully deal with the possible B_INVAL state of * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it * indirectly), so synchronous reads can be issued without worrying about * the B_INVAL state. We have to be a little more careful when dealing * with writes (see comments in nfs_write()) when extending a file past * its EOF. */ static struct buf * nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td) { struct buf *bp; struct mount *mp; struct nfsmount *nmp; mp = vp->v_mount; nmp = VFSTONFS(mp); if (nmp->nm_flag & NFSMNT_INT) { sigset_t oldset; nfs_set_sigmask(td, &oldset); bp = getblk(vp, bn, size, PCATCH, 0, 0); nfs_restore_sigmask(td, &oldset); while (bp == NULL) { if (nfs_sigintr(nmp, NULL, td)) return (NULL); bp = getblk(vp, bn, size, 0, 2 * hz, 0); } } else { bp = getblk(vp, bn, size, 0, 0, 0); } if (vp->v_type == VREG) { int biosize; biosize = mp->mnt_stat.f_iosize; bp->b_blkno = bn * (biosize / DEV_BSIZE); } return (bp); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int nfs_vinvalbuf(struct vnode *vp, int flags, struct thread *td, int intrflg) { struct nfsnode *np = VTONFS(vp); struct nfsmount *nmp = VFSTONFS(vp->v_mount); int error = 0, slpflag, slptimeo; int old_lock = 0; ASSERT_VOP_LOCKED(vp, "nfs_vinvalbuf"); /* * XXX This check stops us from needlessly doing a vinvalbuf when * being called through vclean(). It is not clear that this is * unsafe. */ if (vp->v_iflag & VI_DOOMED) return (0); if ((nmp->nm_flag & NFSMNT_INT) == 0) intrflg = 0; if (intrflg) { slpflag = PCATCH; slptimeo = 2 * hz; } else { slpflag = 0; slptimeo = 0; } old_lock = nfs_upgrade_vnlock(vp, td); /* * Now, flush as required. */ if ((flags & V_SAVE) && (vp->v_bufobj.bo_object != NULL)) { VM_OBJECT_LOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(vp->v_bufobj.bo_object); /* * If the page clean was interrupted, fail the invalidation. * Not doing so, we run the risk of losing dirty pages in the * vinvalbuf() call below. */ if (intrflg && (error = nfs_sigintr(nmp, NULL, td))) goto out; } error = vinvalbuf(vp, flags, td, slpflag, 0); while (error) { if (intrflg && (error = nfs_sigintr(nmp, NULL, td))) goto out; error = vinvalbuf(vp, flags, td, 0, slptimeo); } mtx_lock(&np->n_mtx); if (np->n_directio_asyncwr == 0) np->n_flag &= ~NMODIFIED; mtx_unlock(&np->n_mtx); out: nfs_downgrade_vnlock(vp, td, old_lock); return error; } /* * Initiate asynchronous I/O. Return an error if no nfsiods are available. 
* This is mainly to avoid queueing async I/O requests when the nfsiods * are all hung on a dead server. * * Note: nfs_asyncio() does not clear (BIO_ERROR|B_INVAL) but when the bp * is eventually dequeued by the async daemon, nfs_doio() *will*. */ int nfs_asyncio(struct nfsmount *nmp, struct buf *bp, struct ucred *cred, struct thread *td) { int iod; int gotiod; int slpflag = 0; int slptimeo = 0; int error, error2; /* * Commits are usually short and sweet so lets save some cpu and * leave the async daemons for more important rpc's (such as reads * and writes). */ mtx_lock(&nfs_iod_mtx); if (bp->b_iocmd == BIO_WRITE && (bp->b_flags & B_NEEDCOMMIT) && (nmp->nm_bufqiods > nfs_numasync / 2)) { mtx_unlock(&nfs_iod_mtx); return(EIO); } again: if (nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; gotiod = FALSE; /* * Find a free iod to process this request. */ for (iod = 0; iod < nfs_numasync; iod++) if (nfs_iodwant[iod]) { gotiod = TRUE; break; } /* * Try to create one if none are free. */ if (!gotiod) { iod = nfs_nfsiodnew(); if (iod != -1) gotiod = TRUE; } if (gotiod) { /* * Found one, so wake it up and tell it which * mount to process. */ NFS_DPF(ASYNCIO, ("nfs_asyncio: waking iod %d for mount %p\n", iod, nmp)); nfs_iodwant[iod] = NULL; nfs_iodmount[iod] = nmp; nmp->nm_bufqiods++; wakeup(&nfs_iodwant[iod]); } /* * If none are free, we may already have an iod working on this mount * point. If so, it will process our request. */ if (!gotiod) { if (nmp->nm_bufqiods > 0) { NFS_DPF(ASYNCIO, ("nfs_asyncio: %d iods are already processing mount %p\n", nmp->nm_bufqiods, nmp)); gotiod = TRUE; } } /* * If we have an iod which can process the request, then queue * the buffer. */ if (gotiod) { /* * Ensure that the queue never grows too large. We still want * to asynchronize so we block rather then return EIO. */ while (nmp->nm_bufqlen >= 2*nfs_numasync) { NFS_DPF(ASYNCIO, ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); nmp->nm_bufqwant = TRUE; error = nfs_msleep(td, &nmp->nm_bufq, &nfs_iod_mtx, slpflag | PRIBIO, "nfsaio", slptimeo); if (error) { error2 = nfs_sigintr(nmp, NULL, td); if (error2) { mtx_unlock(&nfs_iod_mtx); return (error2); } if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* * We might have lost our iod while sleeping, * so check and loop if nescessary. */ if (nmp->nm_bufqiods == 0) { NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); goto again; } } if (bp->b_iocmd == BIO_READ) { if (bp->b_rcred == NOCRED && cred != NOCRED) bp->b_rcred = crhold(cred); } else { if (bp->b_wcred == NOCRED && cred != NOCRED) bp->b_wcred = crhold(cred); } if (bp->b_flags & B_REMFREE) bremfreef(bp); BUF_KERNPROC(bp); TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist); nmp->nm_bufqlen++; if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { mtx_lock(&(VTONFS(bp->b_vp))->n_mtx); VTONFS(bp->b_vp)->n_flag |= NMODIFIED; VTONFS(bp->b_vp)->n_directio_asyncwr++; mtx_unlock(&(VTONFS(bp->b_vp))->n_mtx); } mtx_unlock(&nfs_iod_mtx); return (0); } mtx_unlock(&nfs_iod_mtx); /* * All the iods are busy on other mounts, so return EIO to * force the caller to process the i/o synchronously. 
*/ NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); return (EIO); } void nfs_doio_directwrite(struct buf *bp) { int iomode, must_commit; struct uio *uiop = (struct uio *)bp->b_caller1; char *iov_base = uiop->uio_iov->iov_base; struct nfsmount *nmp = VFSTONFS(bp->b_vp->v_mount); iomode = NFSV3WRITE_FILESYNC; uiop->uio_td = NULL; /* NULL since we're in nfsiod */ (nmp->nm_rpcops->nr_writerpc)(bp->b_vp, uiop, bp->b_wcred, &iomode, &must_commit); KASSERT((must_commit == 0), ("nfs_doio_directwrite: Did not commit write")); free(iov_base, M_NFSDIRECTIO); free(uiop->uio_iov, M_NFSDIRECTIO); free(uiop, M_NFSDIRECTIO); if ((bp->b_flags & B_DIRECT) && bp->b_iocmd == BIO_WRITE) { struct nfsnode *np = VTONFS(bp->b_vp); mtx_lock(&np->n_mtx); np->n_directio_asyncwr--; if (np->n_directio_asyncwr == 0) { VTONFS(bp->b_vp)->n_flag &= ~NMODIFIED; if ((np->n_flag & NFSYNCWAIT)) { np->n_flag &= ~NFSYNCWAIT; wakeup((caddr_t)&np->n_directio_asyncwr); } } mtx_unlock(&np->n_mtx); } bp->b_vp = NULL; relpbuf(bp, &nfs_pbuf_freecnt); } /* * Do an I/O operation to/from a cache block. This may be called * synchronously or from an nfsiod. */ int nfs_doio(struct vnode *vp, struct buf *bp, struct ucred *cr, struct thread *td) { struct uio *uiop; struct nfsnode *np; struct nfsmount *nmp; int error = 0, iomode, must_commit = 0; struct uio uio; struct iovec io; struct proc *p = td ? td->td_proc : NULL; uint8_t iocmd; np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = td; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp)); iocmd = bp->b_iocmd; if (iocmd == BIO_READ) { io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; switch (vp->v_type) { case VREG: uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE; nfsstats.read_bios++; error = (nmp->nm_rpcops->nr_readrpc)(vp, uiop, cr); if (!error) { if (uiop->uio_resid) { /* * If we had a short read with no error, we must have * hit a file hole. We should zero-fill the remainder. * This can also occur if the server hits the file EOF. * * Holes used to be able to occur due to pending * writes, but that is not possible any longer. */ int nread = bp->b_bcount - uiop->uio_resid; int left = uiop->uio_resid; if (left > 0) bzero((char *)bp->b_data + nread, left); uiop->uio_resid = 0; } } /* ASSERT_VOP_LOCKED(vp, "nfs_doio"); */ if (p && (vp->v_vflag & VV_TEXT)) { mtx_lock(&np->n_mtx); if (NFS_TIMESPEC_COMPARE(&np->n_mtime, &np->n_vattr.va_mtime)) { mtx_unlock(&np->n_mtx); PROC_LOCK(p); killproc(p, "text file modification"); PROC_UNLOCK(p); } else mtx_unlock(&np->n_mtx); } break; case VLNK: uiop->uio_offset = (off_t)0; nfsstats.readlink_bios++; error = (nmp->nm_rpcops->nr_readlinkrpc)(vp, uiop, cr); break; case VDIR: nfsstats.readdir_bios++; uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ; if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) error = nfs4_readdirrpc(vp, uiop, cr); else { if ((nmp->nm_flag & NFSMNT_RDIRPLUS) != 0) { error = nfs_readdirplusrpc(vp, uiop, cr); if (error == NFSERR_NOTSUPP) nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) error = nfs_readdirrpc(vp, uiop, cr); } /* * end-of-directory sets B_INVAL but does not generate an * error. 
*/ if (error == 0 && uiop->uio_resid == bp->b_bcount) bp->b_flags |= B_INVAL; break; default: nfs_printf("nfs_doio: type %x unexpected\n", vp->v_type); break; }; if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * If we only need to commit, try to commit */ if (bp->b_flags & B_NEEDCOMMIT) { int retv; off_t off; off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff; retv = (nmp->nm_rpcops->nr_commit)( vp, off, bp->b_dirtyend-bp->b_dirtyoff, bp->b_wcred, td); if (retv == 0) { bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); bp->b_resid = 0; bufdone(bp); return (0); } if (retv == NFSERR_STALEWRITEVERF) { nfs_clearcommit(vp->v_mount); } } /* * Setup for actual write */ mtx_lock(&np->n_mtx); if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size) bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE; mtx_unlock(&np->n_mtx); if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; nfsstats.write_bios++; if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC) iomode = NFSV3WRITE_UNSTABLE; else iomode = NFSV3WRITE_FILESYNC; error = (nmp->nm_rpcops->nr_writerpc)(vp, uiop, cr, &iomode, &must_commit); /* * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try * to cluster the buffers needing commit. This will allow * the system to submit a single commit rpc for the whole * cluster. We can do this even if the buffer is not 100% * dirty (relative to the NFS blocksize), so we optimize the * append-to-file-case. * * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be * cleared because write clustering only works for commit * rpc's, not for the data portion of the write). */ if (!error && iomode == NFSV3WRITE_UNSTABLE) { bp->b_flags |= B_NEEDCOMMIT; if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) bp->b_flags |= B_CLUSTEROK; } else { bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); } /* * For an interrupted write, the buffer is still valid * and the write hasn't been pushed to the server yet, * so we can't set BIO_ERROR and report the interruption * by setting B_EINTR. For the B_ASYNC case, B_EINTR * is not relevant, so the rpc attempt is essentially * a noop. For the case of a V3 write rpc not being * committed to stable storage, the block is still * dirty and requires either a commit rpc or another * write rpc with iomode == NFSV3WRITE_FILESYNC before * the block is reused. This is indicated by setting * the B_DELWRI and B_NEEDCOMMIT flags. * * If the buffer is marked B_PAGING, it does not reside on * the vp's paging queues so we cannot call bdirty(). The * bp in this case is not an NFS cache block so we should * be safe. 
XXX */ if (error == EINTR || error == EIO || error == ETIMEDOUT || (!error && (bp->b_flags & B_NEEDCOMMIT))) { int s; s = splbio(); bp->b_flags &= ~(B_INVAL|B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if (error && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; splx(s); } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = np->n_error = error; mtx_lock(&np->n_mtx); np->n_flag |= NWRITEERR; mtx_unlock(&np->n_mtx); } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; if (must_commit) nfs_clearcommit(vp->v_mount); bufdone(bp); return (error); } /* * Used to aid in handling ftruncate() operations on the NFS client side. * Truncation creates a number of special problems for NFS. We have to * throw away VM pages and buffer cache buffers that are beyond EOF, and * we have to properly handle VM pages or (potentially dirty) buffers * that straddle the truncation point. */ int nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize) { struct nfsnode *np = VTONFS(vp); u_quad_t tsize; int biosize = vp->v_mount->mnt_stat.f_iosize; int error = 0; mtx_lock(&np->n_mtx); tsize = np->n_size; np->n_size = nsize; mtx_unlock(&np->n_mtx); if (nsize < tsize) { struct buf *bp; daddr_t lbn; int bufsize; /* * vtruncbuf() doesn't get the buffer overlapping the * truncation point. We may have a B_DELWRI and/or B_CACHE * buffer that now needs to be truncated. */ error = vtruncbuf(vp, cred, td, nsize, biosize); lbn = nsize / biosize; bufsize = nsize & (biosize - 1); bp = nfs_getcacheblk(vp, lbn, bufsize, td); if (!bp) return EINTR; if (bp->b_dirtyoff > bp->b_bcount) bp->b_dirtyoff = bp->b_bcount; if (bp->b_dirtyend > bp->b_bcount) bp->b_dirtyend = bp->b_bcount; bp->b_flags |= B_RELBUF; /* don't leave garbage around */ brelse(bp); } else { vnode_pager_setsize(vp, nsize); } return(error); } Index: head/sys/pc98/pc98/machdep.c =================================================================== --- head/sys/pc98/pc98/machdep.c (revision 169666) +++ head/sys/pc98/pc98/machdep.c (revision 169667) @@ -1,2790 +1,2790 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 */ #include __FBSDID("$FreeBSD$"); #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_kstack_pages.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #ifndef KDB #error KDB must be enabled in order for DDB to work! #endif #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef PERFMON #include #endif #ifdef SMP #include #include #endif #ifdef DEV_ISA #include #endif /* Sanity check for __curthread() */ CTASSERT(offsetof(struct pcpu, pc_curthread) == 0); extern void init386(int first); extern void dblfault_handler(void); extern void printcpuinfo(void); /* XXX header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) #if !defined(CPU_DISABLE_SSE) && defined(I686_CPU) #define CPU_ENABLE_SSE #endif static void cpu_startup(void *); static void fpstate_drop(struct thread *td); static void get_fpcontext(struct thread *td, mcontext_t *mcp); static int set_fpcontext(struct thread *td, const mcontext_t *mcp); #ifdef CPU_ENABLE_SSE static void set_fpregs_xmm(struct save87 *, struct savexmm *); static void fill_fpregs_xmm(struct savexmm *, struct save87 *); #endif /* CPU_ENABLE_SSE */ SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int need_pre_dma_flush; /* If 1, use wbinvd befor DMA transfer. */ int need_post_dma_flush; /* If 1, use invd after DMA transfer. 
*/ #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int _udatasel, _ucodesel; u_int basemem; static int ispc98 = 1; SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int cold = 1; #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask); #endif long Maxmem = 0; long realmem = 0; #define PHYSMAP_SIZE (2 * 16) vm_paddr_t phys_avail[PHYSMAP_SIZE + 2]; vm_paddr_t dump_avail[PHYSMAP_SIZE + 2]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2) #define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2) struct kva_md_info kmi; static struct trapframe proc0_tf; #ifndef SMP static struct pcpu __pcpu; #endif struct mtx icu_lock; struct mem_range_softc mem_range_softc; static void cpu_startup(dummy) void *dummy; { /* * Good {morning,afternoon,evening,night}. */ startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ju (%ju MB)\n", ptoa((uintmax_t)Maxmem), ptoa((uintmax_t)Maxmem) / 1048576); realmem = Maxmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { vm_paddr_t size; size = phys_avail[indx + 1] - phys_avail[indx]; printf( "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n", (uintmax_t)phys_avail[indx], (uintmax_t)phys_avail[indx + 1] - 1, (uintmax_t)size, (uintmax_t)size / PAGE_SIZE); } } vm_ksubmap_init(&kmi); printf("avail memory = %ju (%ju MB)\n", - ptoa((uintmax_t)cnt.v_free_count), - ptoa((uintmax_t)cnt.v_free_count) / 1048576); + ptoa((uintmax_t)VMCNT_GET(free_count)), + ptoa((uintmax_t)VMCNT_GET(free_count)) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ #ifdef COMPAT_43 static void osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct osigframe sf, *fp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. 
*/ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = ksi->ksi_code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. */ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe4 sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate space for the signal handler context. 
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe4)); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe4 *)regs->tf_esp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = ksi->ksi_code; sf.sf_si.si_addr = ksi->ksi_addr; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode; regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #endif /* COMPAT_FREEBSD4 */ void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct sigframe sf, *sfp; struct proc *p; struct thread *td; struct sigacts *psp; char *sp; struct trapframe *regs; int sig; int oonstack; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); #ifdef COMPAT_FREEBSD4 if (SIGISMEMBER(psp->ps_freebsd4, sig)) { freebsd4_sendsig(catcher, ksi, mask); return; } #endif #ifdef COMPAT_43 if (SIGISMEMBER(psp->ps_osigset, sig)) { osendsig(catcher, ksi, mask); return; } #endif regs = td->td_frame; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 
1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */ get_fpcontext(td, &sf.sf_uc.uc_mcontext); fpstate_drop(td); /* Allocate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sp = td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe); #if defined(COMPAT_43) td->td_sigstk.ss_flags |= SS_ONSTACK; #endif } else sp = (char *)regs->tf_esp - sizeof(struct sigframe); /* Align to 16 bytes. */ sfp = (struct sigframe *)((unsigned int)sp & ~0xF); /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill in POSIX parts */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = ksi->ksi_code; sf.sf_addr = (register_t)ksi->ksi_addr; sf.sf_ahu.sf_handler = catcher; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP); } /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { #ifdef DEBUG printf("process %ld has trashed its stack\n", (long)p->p_pid); #endif PROC_LOCK(p); sigexit(td, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_eflags &= ~PSL_T; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_ss = _udatasel; PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. 
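 */

/*
 * Userland counterpart of the SA_SIGINFO path above: a three-argument
 * handler receives pointers into the sigframe that sendsig() copied to
 * the user stack.  Minimal, hypothetical sketch; the mc_eip field name
 * assumes the i386 mcontext_t used in this file, and printf(3) is used
 * for illustration only (it is not async-signal-safe).
 */
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <ucontext.h>

static void
info_handler(int sig, siginfo_t *si, void *ucp)
{
	ucontext_t *uc = ucp;

	printf("sig %d code %d addr %p eip %#x\n", si->si_signo,
	    si->si_code, si->si_addr, (unsigned)uc->uc_mcontext.mc_eip);
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = info_handler;
	sa.sa_flags = SA_SIGINFO;	/* request siginfo_t and ucontext_t */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, NULL) != 0)
		err(1, "sigaction");
	raise(SIGUSR1);
	return (0);
}

/*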
* * MPSAFE */ #ifdef COMPAT_43 int osigreturn(td, uap) struct thread *td; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct osigcontext sc; struct trapframe *regs; struct osigcontext *scp; struct proc *p = td->td_proc; int eflags, error; ksiginfo_t ksi; regs = td->td_frame; error = copyin(uap->sigcntxp, &sc, sizeof(sc)); if (error != 0) return (error); scp = ≻ eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. 
*/ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; PROC_LOCK(p); #if defined(COMPAT_43) if (scp->sc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(td->td_sigmask, scp->sc_mask); SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_43 */ #ifdef COMPAT_FREEBSD4 /* * MPSAFE */ int freebsd4_sigreturn(td, uap) struct thread *td; struct freebsd4_sigreturn_args /* { const ucontext4 *sigcntxp; } */ *uap; { struct ucontext4 uc; struct proc *p = td->td_proc; struct trapframe *regs; const struct ucontext4 *ucp; int cs, eflags, error; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("freebsd4_sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("freebsd4_sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } #endif /* COMPAT_FREEBSD4 */ /* * MPSAFE */ int sigreturn(td, uap) struct thread *td; struct sigreturn_args /* { const struct __ucontext *sigcntxp; } */ *uap; { ucontext_t uc; struct proc *p = td->td_proc; struct trapframe *regs; const ucontext_t *ucp; int cs, eflags, error, ret; ksiginfo_t ksi; error = copyin(uap->sigcntxp, &uc, sizeof(uc)); if (error != 0) return (error); ucp = &uc; regs = td->td_frame; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (td->td_pcb->pcb_ext == 0) return (EINVAL); vm86 = &td->td_pcb->pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) { ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); } if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. * Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
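 */

/*
 * The two gates a user-supplied context must pass before sigreturn()
 * installs it, restated as a hypothetical standalone helper.  The real
 * checks are the EFL_SECURE() use above and the CS_SECURE() use below;
 * here the mask of user-changeable eflags bits is a parameter rather
 * than hard-coding PSL_USERCHANGE.
 */
static int
sigcontext_ok(unsigned int new_eflags, unsigned int cur_eflags,
    unsigned int userchange_mask, unsigned int new_cs)
{
	/* Only bits in userchange_mask may differ from the live eflags. */
	if (((new_eflags ^ cur_eflags) & ~userchange_mask) != 0)
		return (0);
	/* The requested %cs selector must carry user privilege (RPL 3). */
	if ((new_cs & 3) != 3)
		return (0);
	return (1);
}

/*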
*/ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); ksiginfo_init_trap(&ksi); ksi.ksi_signo = SIGBUS; ksi.ksi_code = BUS_OBJERR; ksi.ksi_trapno = T_PROTFLT; ksi.ksi_addr = (void *)regs->tf_eip; trapsignal(td, &ksi); return (EINVAL); } ret = set_fpcontext(td, &ucp->uc_mcontext); if (ret != 0) return (ret); bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) if (ucp->uc_mcontext.mc_onstack & 1) td->td_sigstk.ss_flags |= SS_ONSTACK; else td->td_sigstk.ss_flags &= ~SS_ONSTACK; #endif td->td_sigmask = ucp->uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { register_t reg; uint64_t tsc1, tsc2; if (pcpu_find(cpu_id) == NULL || rate == NULL) return (EINVAL); if (!tsc_present) return (EOPNOTSUPP); /* If we're booting, trust the rate calibrated moments ago. */ if (cold) { *rate = tsc_freq; return (0); } #ifdef SMP /* Schedule ourselves on the indicated cpu. */ mtx_lock_spin(&sched_lock); sched_bind(curthread, cpu_id); mtx_unlock_spin(&sched_lock); #endif /* Calibrate by measuring a short delay. */ reg = intr_disable(); tsc1 = rdtsc(); DELAY(1000); tsc2 = rdtsc(); intr_restore(reg); #ifdef SMP mtx_lock_spin(&sched_lock); sched_unbind(curthread); mtx_unlock_spin(&sched_lock); #endif /* * Calculate the difference in readings, convert to Mhz, and * subtract 0.5% of the total. Empirical testing has shown that * overhead in DELAY() works out to approximately this value. */ tsc2 -= tsc1; *rate = tsc2 * 1000 - tsc2 * 5; return (0); } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. In the SMP case we default to * off because a halted cpu will not currently pick up a new thread in the * run queue until the next timer tick. If turned on this will result in * approximately a 4.2% loss in real time performance in buildworld tests * (but improves user and sys times oddly enough), and saves approximately * 5% in power consumption on an idle machine (tests w/2xCPU 1.1GHz P3). * * XXX we need to have a cpu mask of idle cpus and generate an IPI or * otherwise generate some sort of interrupt to wake up cpus sitting in HLT. * Then we can have our cake and eat it too. * * XXX I'm turning it on for SMP as well by default for now. It seems to * help lock contention somewhat, and this is critical for HTT. -Peter */ static int cpu_idle_hlt = 1; TUNABLE_INT("machdep.cpu_idle_hlt", &cpu_idle_hlt); SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); static void cpu_idle_default(void) { /* * we must absolutely guarentee that hlt is the * absolute next instruction after sti or we * introduce a timing window. */ __asm __volatile("sti; hlt"); } /* * Note that we have to be careful here to avoid a race between checking * sched_runnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. 
*/ void cpu_idle(void) { #ifdef SMP if (mp_grab_cpu_hlt()) return; #endif if (cpu_idle_hlt) { disable_intr(); if (sched_runnable()) enable_intr(); else (*cpu_idle_hook)(); } } /* Other subsystems (e.g., ACPI) can hook this later. */ void (*cpu_idle_hook)(void) = cpu_idle_default; /* * Clear registers on exec */ void exec_setregs(td, entry, stack, ps_strings) struct thread *td; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = td->td_frame; struct pcb *pcb = td->td_pcb; /* Reset pc->pcb_gs and %gs before possibly invalidating it. */ pcb->pcb_gs = _udatasel; load_gs(_udatasel); if (td->td_proc->p_md.md_ldt) user_ldt_free(td); bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ td->td_pcb->pcb_flags &= ~FP_SOFTFP; /* * Drop the FP state if we hold it, so that the process gets a * clean FP state if it uses the FPU again. */ fpstate_drop(td); /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ td->td_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); /* * CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support: * * Prepare to trap all ESC (i.e., NPX) instructions and all WAIT * instructions. We must set the CR0_MP bit and use the CR0_TS * bit to control the trap, because setting the CR0_EM bit does * not cause WAIT instructions to trap. It's important to trap * WAIT instructions - otherwise the "wait" variants of no-wait * control instructions would degenerate to the "no-wait" variants * after FP context switches but work correctly otherwise. It's * particularly important to trap WAITs when there is no NPX - * otherwise the "wait" variants would always degenerate. * * Try setting CR0_NE to get correct error reporting on 486DX's. * Setting it should fail or do nothing on lesser processors. 
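 */

/*
 * cpu_idle() above indirects through cpu_idle_hook, which defaults to
 * cpu_idle_default(); other code (e.g. ACPI, per the comment above) may
 * substitute its own routine.  Hypothetical sketch of such a hook; it
 * inherits the requirement that hlt be the instruction immediately
 * after sti, to avoid the timing window described above.
 */
extern void (*cpu_idle_hook)(void);

static void
example_idle_hook(void)
{
	__asm __volatile("sti; hlt");
}

static void
example_idle_attach(void)
{
	/* Installed at attach time by the interested driver. */
	cpu_idle_hook = example_idle_hook;
}

/*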
*/ cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM; load_cr0(cr0); load_gs(_udatasel); } u_long bootdev; /* not a struct cdev *- encoding is different */ SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev, CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)"); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ struct region_descriptor r_gdt, r_idt; /* table descriptors */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern vm_offset_t proc0kstack; /* * software prototypes -- in more palatable form. * * GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret * GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it) */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUFS_SEL 2 %fs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUGS_SEL 3 %gs Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 4 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 5 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUCODE_SEL 6 Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GUDATA_SEL 7 Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment 
descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 9 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 10 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 11 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 12 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GNDIS_SEL 18 NDIS Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor 
priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall); #ifdef DDB /* * Display the index and function name of any IDT entries that don't use * the default 'rsvd' entry point. 
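 */

/*
 * setidt() above stores a handler address split across gd_looffset
 * (low 16 bits) and gd_hioffset (high 16 bits); db_show_idt below
 * reassembles it with (hi << 16 | lo).  A tiny round-trip of that
 * packing, with hypothetical names:
 */
#include <stdint.h>

static uint32_t
gate_offset_roundtrip(uint32_t func)
{
	uint16_t lo = func & 0xffff;
	uint16_t hi = func >> 16;

	return ((uint32_t)hi << 16 | lo);	/* equals func */
}

/*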
*/ DB_SHOW_COMMAND(idt, db_show_idt) { struct gate_descriptor *ip; int idx; uintptr_t func; ip = idt; for (idx = 0; idx < NIDT && !db_pager_quit; idx++) { func = (ip->gd_hioffset << 16 | ip->gd_looffset); if (func != (uintptr_t)&IDTVEC(rsvd)) { db_printf("%3d\t", idx); db_printsym(func, DB_STGY_PROC); db_printf("\n"); } ip++; } } #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * XXX first should be vm_paddr_t. */ static void getmemsize(int first) { int i, off, physmap_idx, pa_indx, da_indx; int pg_n; u_long physmem_tunable; u_int extmem, under16; vm_paddr_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t *pte; quad_t dcons_addr, dcons_size; bzero(physmap, sizeof(physmap)); /* XXX - some of EPSON machines can't use PG_N */ pg_n = PG_N; if (pc98_machine_type & M_EPSON_PC98) { switch (epson_machine_id) { #ifdef WB_CACHE default: #endif case EPSON_PC486_HX: case EPSON_PC486_HG: case EPSON_PC486_HA: pg_n = 0; break; } } /* * Perform "base memory" related probes & setup */ under16 = pc98_getmemsize(&basemem, &extmem); if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) pmap_kenter(KERNBASE + pa, pa); /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t *)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1]); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) Maxmem = atop(physmem_tunable); if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %ldK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem); /* * We need to divide chunk if Maxmem is larger than 16MB and * under 16MB area is not full of memory. * (1) system area (15-16MB region) is cut off * (2) extended memory is only over 16MB area (ex. Melco "HYPERMEMORY") */ if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) { /* 15M - 16M region is cut off, so need to divide chunk */ physmap[physmap_idx + 1] = under16 * 1024; physmap_idx += 2; physmap[physmap_idx] = 0x1000000; physmap[physmap_idx + 1] = physmap[2] + extmem * 1024; } /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; da_indx = 1; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; dump_avail[da_indx] = physmap[0]; pte = CMAP1; /* * Get dcons buffer address */ if (getenv_quad("dcons.addr", &dcons_addr) == 0 || getenv_quad("dcons.size", &dcons_size) == 0) dcons_addr = 0; /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_paddr_t end; end = ptoa((vm_paddr_t)Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad, full; int *ptr = (int *)CADDR1; full = FALSE; /* * block out kernel memory as not available. */ if (pa >= KERNLOAD && pa < first) goto do_dump_avail; /* * block out dcons buffer */ if (dcons_addr > 0 && pa >= trunc_page(dcons_addr) && pa < dcons_addr + dcons_size) goto do_dump_avail; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | pg_n; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) page_bad = TRUE; /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) page_bad = TRUE; /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) page_bad = TRUE; /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) page_bad = TRUE; /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) continue; /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. 
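 */

/*
 * The probe loop above scribbles four patterns on each candidate page
 * and reads them back before counting the page as good.  The same test
 * on a single word, as a hypothetical standalone helper:
 */
#include <stdbool.h>
#include <stdint.h>

static bool
word_tests_good(volatile uint32_t *p)
{
	static const uint32_t patterns[] =
	    { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x0 };
	uint32_t saved = *p;
	bool good = true;
	unsigned int i;

	for (i = 0; i < sizeof(patterns) / sizeof(patterns[0]); i++) {
		*p = patterns[i];
		if (*p != patterns[i])
			good = false;
	}
	*p = saved;		/* restore the original contents */
	return (good);
}

/*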
*/ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf( "Too many holes in the physical address space, giving up\n"); pa_indx--; full = TRUE; goto do_dump_avail; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; do_dump_avail: if (dump_avail[da_indx] == pa) { dump_avail[da_indx] += PAGE_SIZE; } else { da_indx++; if (da_indx == DUMP_AVAIL_ARRAY_END) { da_indx--; goto do_next; } dump_avail[da_indx++] = pa; /* start */ dump_avail[da_indx] = pa + PAGE_SIZE; /* end */ } do_next: if (full) break; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] + off); } void init386(first) int first; { struct gate_descriptor *gdp; int gsel_tss, metadata_missing, x; struct pcpu *pc; thread0.td_kstack = proc0kstack; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * This may be done better later if it gets more high level * components in it. If so just link td->td_proc here. */ proc_linkup(&proc0, &thread0); /* * Initialize DMAC */ pc98_init_dmac(); metadata_missing = 0; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { metadata_missing = 1; } if (envmode == 1) kern_envp = static_env; else if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* Init basic tunables, hz etc */ init_param1(); /* * Make gdt memory segments. All segments cover the full 4GB * of address space and permissions are enforced at page level. */ gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1); gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1); #ifdef SMP pc = &SMP_prvspace[0].pcpu; #else pc = &__pcpu; #endif gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1); gdt_segs[GPRIV_SEL].ssd_base = (int) pc; gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); PCPU_SET(curthread, &thread0); PCPU_SET(curpcb, thread0.td_pcb); PCPU_SET(curtid, thread0.td_tid); /* * Initialize mutexes. * * icu_lock: in order to allow an interrupt to occur in a critical * section, to set pcpu->ipending (etc...) properly, we * must be able to get the icu lock, so it can't be * under witness. 
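 */

/*
 * phys_avail[] above is maintained as { start, end } pairs of usable
 * physical memory terminated by a zero end entry, which is how
 * cpu_startup() walks it when printing the chunk list.  A hypothetical
 * helper summing the bytes described, with a stand-in for vm_paddr_t:
 */
#include <stdint.h>

typedef uint64_t paddr_sketch_t;

static uint64_t
phys_avail_bytes(const paddr_sketch_t *avail)
{
	uint64_t total = 0;
	int i;

	for (i = 0; avail[i + 1] != 0; i += 2)
		total += avail[i + 1] - avail[i];
	return (total);
}

/*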
*/ mutex_init(); mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE); /* make ldt memory segments */ ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1); ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1); for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); PCPU_SET(currentldt, _default_ldt); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL , GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the i8254 before the console so that console * initialization can use DELAY(). */ i8254_init(); /* * Initialize the console before we print anything out. */ cninit(); if (metadata_missing) printf("WARNING: loader(8) metadata is missing!\n"); #ifdef DEV_ISA atpic_startup(); #endif #ifdef DDB ksym_start = bootinfo.bi_symtab; ksym_end = bootinfo.bi_esymtab; #endif kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ /* Note: -16 is so we can grow the trapframe if we came from vm86 */ PCPU_SET(common_tss.tss_esp0, thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb) - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); /* pointer to selector slot for %fs/%gs */ PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int)dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); init_param2(physmem); /* now running on new page tables, configured,and u/iom is accessible */ msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(lcall_syscall); gdp->gd_looffset = x; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = x >> 16; /* XXX does this work? */ /* XXX yes! */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = GSEL(GUCODE_SEL, SEL_UPL); _udatasel = GSEL(GUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ thread0.td_pcb->pcb_flags = 0; /* XXXKSE */ thread0.td_pcb->pcb_cr3 = (int)IdlePTD; thread0.td_pcb->pcb_ext = 0; thread0.td_frame = &proc0_tf; } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_flags = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_flags); } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL) static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; vm_offset_t tmp; if (!has_f00f_bug) return; GIANT_REQUIRED; printf("Intel Pentium detected, installing workaround for F00F bug\n"); tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); /* Put the problematic entry (#6) at the end of the lower page. */ new_idt = (struct gate_descriptor*) (tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (u_int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. 
We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_edi = tf->tf_edi; pcb->pcb_esi = tf->tf_esi; pcb->pcb_ebp = tf->tf_ebp; pcb->pcb_ebx = tf->tf_ebx; pcb->pcb_eip = tf->tf_eip; pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8; } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_eip = addr; return (0); } int ptrace_single_step(struct thread *td) { td->td_frame->tf_eflags |= PSL_T; return (0); } int ptrace_clear_single_step(struct thread *td) { td->td_frame->tf_eflags &= ~PSL_T; return (0); } int fill_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; pcb = td->td_pcb; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(struct thread *td, struct reg *regs) { struct pcb *pcb; struct trapframe *tp; tp = td->td_frame; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); pcb = td->td_pcb; tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb->pcb_gs = regs->r_gs; return (0); } #ifdef CPU_ENABLE_SSE static void fill_fpregs_xmm(sv_xmm, sv_87) struct savexmm *sv_xmm; struct save87 *sv_87; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; bzero(sv_87, sizeof(*sv_87)); /* FPU control/status */ penv_87->en_cw = penv_xmm->en_cw; penv_87->en_sw = penv_xmm->en_sw; penv_87->en_tw = penv_xmm->en_tw; penv_87->en_fip = penv_xmm->en_fip; penv_87->en_fcs = penv_xmm->en_fcs; penv_87->en_opcode = penv_xmm->en_opcode; penv_87->en_foo = penv_xmm->en_foo; penv_87->en_fos = penv_xmm->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc; } static void set_fpregs_xmm(sv_87, sv_xmm) struct save87 *sv_87; struct savexmm *sv_xmm; { register struct env87 *penv_87 = &sv_87->sv_env; register struct envxmm *penv_xmm = &sv_xmm->sv_env; int i; /* FPU control/status */ penv_xmm->en_cw = penv_87->en_cw; penv_xmm->en_sw = penv_87->en_sw; penv_xmm->en_tw = penv_87->en_tw; penv_xmm->en_fip = penv_87->en_fip; penv_xmm->en_fcs = penv_87->en_fcs; penv_xmm->en_opcode = penv_87->en_opcode; penv_xmm->en_foo = penv_87->en_foo; penv_xmm->en_fos = penv_87->en_fos; /* FPU registers */ for (i = 0; i < 8; ++i) sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i]; } #endif /* CPU_ENABLE_SSE */ int fill_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if (cpu_fxsr) { fill_fpregs_xmm(&td->td_pcb->pcb_save.sv_xmm, (struct save87 *)fpregs); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(&td->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs); return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { #ifdef CPU_ENABLE_SSE if 
(cpu_fxsr) { set_fpregs_xmm((struct save87 *)fpregs, &td->td_pcb->pcb_save.sv_xmm); return (0); } #endif /* CPU_ENABLE_SSE */ bcopy(fpregs, &td->td_pcb->pcb_save.sv_87, sizeof *fpregs); return (0); } /* * Get machine context. */ int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct trapframe *tp; tp = td->td_frame; PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(tp->tf_esp); PROC_UNLOCK(curthread->td_proc); mcp->mc_gs = td->td_pcb->pcb_gs; mcp->mc_fs = tp->tf_fs; mcp->mc_es = tp->tf_es; mcp->mc_ds = tp->tf_ds; mcp->mc_edi = tp->tf_edi; mcp->mc_esi = tp->tf_esi; mcp->mc_ebp = tp->tf_ebp; mcp->mc_isp = tp->tf_isp; mcp->mc_eflags = tp->tf_eflags; if (flags & GET_MC_CLEAR_RET) { mcp->mc_eax = 0; mcp->mc_edx = 0; mcp->mc_eflags &= ~PSL_C; } else { mcp->mc_eax = tp->tf_eax; mcp->mc_edx = tp->tf_edx; } mcp->mc_ebx = tp->tf_ebx; mcp->mc_ecx = tp->tf_ecx; mcp->mc_eip = tp->tf_eip; mcp->mc_cs = tp->tf_cs; mcp->mc_esp = tp->tf_esp; mcp->mc_ss = tp->tf_ss; mcp->mc_len = sizeof(*mcp); get_fpcontext(td, mcp); return (0); } /* * Set machine context. * * However, we don't set any but the user modifiable flags, and we won't * touch the cs selector. */ int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct trapframe *tp; int eflags, ret; tp = td->td_frame; if (mcp->mc_len != sizeof(*mcp)) return (EINVAL); eflags = (mcp->mc_eflags & PSL_USERCHANGE) | (tp->tf_eflags & ~PSL_USERCHANGE); if ((ret = set_fpcontext(td, mcp)) == 0) { tp->tf_fs = mcp->mc_fs; tp->tf_es = mcp->mc_es; tp->tf_ds = mcp->mc_ds; tp->tf_edi = mcp->mc_edi; tp->tf_esi = mcp->mc_esi; tp->tf_ebp = mcp->mc_ebp; tp->tf_ebx = mcp->mc_ebx; tp->tf_edx = mcp->mc_edx; tp->tf_ecx = mcp->mc_ecx; tp->tf_eax = mcp->mc_eax; tp->tf_eip = mcp->mc_eip; tp->tf_eflags = eflags; tp->tf_esp = mcp->mc_esp; tp->tf_ss = mcp->mc_ss; td->td_pcb->pcb_gs = mcp->mc_gs; ret = 0; } return (ret); } static void get_fpcontext(struct thread *td, mcontext_t *mcp) { #ifndef DEV_NPX mcp->mc_fpformat = _MC_FPFMT_NODEV; mcp->mc_ownedfp = _MC_FPOWNED_NONE; #else union savefpu *addr; /* * XXX mc_fpstate might be misaligned, since its declaration is not * unportabilized using __attribute__((aligned(16))) like the * declaration of struct savemm, and anyway, alignment doesn't work * for auto variables since we don't use gcc's pessimal stack * alignment. Work around this by abusing the spare fields after * mcp->mc_fpstate. * * XXX unpessimize most cases by only aligning when fxsave might be * called, although this requires knowing too much about * npxgetregs()'s internals. */ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); } mcp->mc_ownedfp = npxgetregs(td, addr); if (addr != (union savefpu *)&mcp->mc_fpstate) { bcopy(addr, &mcp->mc_fpstate, sizeof(mcp->mc_fpstate)); bzero(&mcp->mc_spare2, sizeof(mcp->mc_spare2)); } mcp->mc_fpformat = npxformat(); #endif } static int set_fpcontext(struct thread *td, const mcontext_t *mcp) { union savefpu *addr; if (mcp->mc_fpformat == _MC_FPFMT_NODEV) return (0); else if (mcp->mc_fpformat != _MC_FPFMT_387 && mcp->mc_fpformat != _MC_FPFMT_XMM) return (EINVAL); else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) /* We don't care what state is left in the FPU or PCB. */ fpstate_drop(td); else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU || mcp->mc_ownedfp == _MC_FPOWNED_PCB) { /* XXX align as above. 
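 */

/*
 * The alignment bump used in get_fpcontext() above and repeated below:
 * advance in 4-byte steps until the address is 16-byte aligned, relying
 * on the spare space after the target field to absorb the shift.
 * Hypothetical helper; assumes the input is at least 4-byte aligned,
 * as mc_fpstate is.
 */
#include <stdint.h>

static void *
align16_by_4(void *p)
{
	char *cp = p;

	while (((uintptr_t)cp & 0xF) != 0)
		cp += 4;
	return (cp);
}

/*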
*/ addr = (union savefpu *)&mcp->mc_fpstate; if (td == PCPU_GET(fpcurthread) && #ifdef CPU_ENABLE_SSE cpu_fxsr && #endif ((uintptr_t)(void *)addr & 0xF)) { do addr = (void *)((char *)addr + 4); while ((uintptr_t)(void *)addr & 0xF); bcopy(&mcp->mc_fpstate, addr, sizeof(mcp->mc_fpstate)); } #ifdef DEV_NPX #ifdef CPU_ENABLE_SSE if (cpu_fxsr) addr->sv_xmm.sv_env.en_mxcsr &= cpu_mxcsr_mask; #endif /* * XXX we violate the dubious requirement that npxsetregs() * be called with interrupts disabled. */ npxsetregs(td, addr); #endif /* * Don't bother putting things back where they were in the * misaligned case, since we know that the caller won't use * them again. */ } else return (EINVAL); return (0); } static void fpstate_drop(struct thread *td) { register_t s; s = intr_disable(); #ifdef DEV_NPX if (PCPU_GET(fpcurthread) == td) npxdrop(); #endif /* * XXX force a full drop of the npx. The above only drops it if we * owned it. npxgetregs() has the same bug in the !cpu_fxsr case. * * XXX I don't much like npxgetregs()'s semantics of doing a full * drop. Dropping only to the pcb matches fnsave's behaviour. * We only need to drop to !PCB_INITDONE in sendsig(). But * sendsig() is the only caller of npxgetregs()... perhaps we just * have too many layers. */ curthread->td_pcb->pcb_flags &= ~PCB_NPXINITDONE; intr_restore(s); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; if (td == NULL) { dbregs->dr[0] = rdr0(); dbregs->dr[1] = rdr1(); dbregs->dr[2] = rdr2(); dbregs->dr[3] = rdr3(); dbregs->dr[4] = rdr4(); dbregs->dr[5] = rdr5(); dbregs->dr[6] = rdr6(); dbregs->dr[7] = rdr7(); } else { pcb = td->td_pcb; dbregs->dr[0] = pcb->pcb_dr0; dbregs->dr[1] = pcb->pcb_dr1; dbregs->dr[2] = pcb->pcb_dr2; dbregs->dr[3] = pcb->pcb_dr3; dbregs->dr[4] = 0; dbregs->dr[5] = 0; dbregs->dr[6] = pcb->pcb_dr6; dbregs->dr[7] = pcb->pcb_dr7; } return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { struct pcb *pcb; int i; if (td == NULL) { load_dr0(dbregs->dr[0]); load_dr1(dbregs->dr[1]); load_dr2(dbregs->dr[2]); load_dr3(dbregs->dr[3]); load_dr4(dbregs->dr[4]); load_dr5(dbregs->dr[5]); load_dr6(dbregs->dr[6]); load_dr7(dbregs->dr[7]); } else { /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0; i < 4; i++) { if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02) return (EINVAL); if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02) return (EINVAL); } pcb = td->td_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? 
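 */

/*
 * Loop form of the unrolled per-register checks below: a debug
 * breakpoint may only be enabled if its address lies in user space.
 * Hypothetical helper; the enabled bits and the user-space limit are
 * parameters rather than DBREG_DR7_ENABLED() and VM_MAXUSER_ADDRESS.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
dbregs_in_user_space(const uintptr_t addr[4], unsigned int enabled_mask,
    uintptr_t max_user_addr)
{
	int i;

	for (i = 0; i < 4; i++)
		if ((enabled_mask & (1u << i)) != 0 &&
		    addr[i] >= max_user_addr)
			return (false);
	return (true);
}

/*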
*/ if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) { /* dr0 is enabled */ if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) { /* dr1 is enabled */ if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) { /* dr2 is enabled */ if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) { /* dr3 is enabled */ if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS) return (EINVAL); } pcb->pcb_dr0 = dbregs->dr[0]; pcb->pcb_dr1 = dbregs->dr[1]; pcb->pcb_dr2 = dbregs->dr[2]; pcb->pcb_dr3 = dbregs->dr[3]; pcb->pcb_dr6 = dbregs->dr[6]; pcb->pcb_dr7 = dbregs->dr[7]; pcb->pcb_flags |= PCB_DBREGS; } return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i = 0; i < nbp; i++) { if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) { /* * addr[i] is in user space */ return nbp; } } /* * None of the breakpoints are in user space. */ return 0; } #ifdef KDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called from the debugger. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* KDB */ Index: head/sys/powerpc/aim/machdep.c =================================================================== --- head/sys/powerpc/aim/machdep.c (revision 169666) +++ head/sys/powerpc/aim/machdep.c (revision 169667) @@ -1,988 +1,988 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int cold = 1; struct pcpu __pcpu[MAXCPU]; struct trapframe frame0; vm_offset_t kstack0; vm_offset_t kstack0_phys; char machine[] = "powerpc"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static int cacheline_size = CACHELINESIZE; SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, CTLFLAG_RD, &cacheline_size, 0, ""); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) void powerpc_init(u_int, u_int, u_int, void *); int save_ofw_mapping(void); int restore_ofw_mapping(void); void install_extint(void (*)(void)); int setfault(faultbuf); /* defined in locore.S */ static int grab_mcontext(struct thread *, mcontext_t *, int); void asm_panic(char *); long Maxmem = 0; long realmem = 0; struct pmap ofw_pmap; extern int ofmsr; struct bat battable[16]; struct kva_md_info kmi; void setPQL2(int *const size, int *const ways); void setPQL2(int *const size, int *const ways) { return; } static void powerpc_ofw_shutdown(void *junk, int howto) { if (howto & RB_HALT) { OF_halt(); } OF_reboot(); } static void cpu_startup(void *dummy) { /* * Initialise the decrementer-based clock. */ decr_init(); /* * Good {morning,afternoon,evening,night}. */ cpu_setup(PCPU_GET(cpuid)); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ld MB)\n", ptoa(physmem), ptoa(physmem) / 1048576); realmem = physmem; /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); - printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count), - ptoa(cnt.v_free_count) / 1048576); + printf("avail memory = %ld (%ld MB)\n", ptoa(VMCNT_GET(free_count)), + ptoa(VMCNT_GET(free_count)) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, powerpc_ofw_shutdown, 0, SHUTDOWN_PRI_LAST); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! 
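	 * mp_start() below releases the secondary processors and
	 * mp_announce() then reports the CPUs that came up.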
*/ mp_start(); /* fire up the secondaries */ mp_announce(); #endif /* SMP */ } extern char kernel_text[], _end[]; extern void *trapcode, *trapsize; extern void *alitrap, *alisize; extern void *dsitrap, *dsisize; extern void *decrint, *decrsize; extern void *extint, *extsize; extern void *dblow, *dbsize; extern void *vectrap, *vectrapsize; void powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp) { struct pcpu *pc; vm_offset_t end, off; void *kmdp; char *env; end = 0; kmdp = NULL; /* * Parse metadata if present and fetch parameters. Must be done * before console is inited so cninit gets the right value of * boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif } } /* * Init params/tunables that can be overridden by the loader */ init_param1(); /* * Start initializing proc0 and thread0. */ proc_linkup(&proc0, &thread0); thread0.td_frame = &frame0; /* * Set up per-cpu data. */ pc = &__pcpu[0]; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; pc->pc_cpuid = 0; __asm __volatile("mtsprg 0, %0" :: "r"(pc)); mutex_init(); /* * Initialize the console before printing anything. */ cninit(); /* * Complain if there is no metadata. */ if (mdp == NULL || kmdp == NULL) { printf("powerpc_init: no loader metadata.\n"); } kdb_init(); kobj_machdep_init(); /* * XXX: Initialize the interrupt tables. * Disable translation in case the vector area * hasn't been mapped (G5) */ mtmsr(mfmsr() & ~(PSL_IR | PSL_DR)); isync(); bcopy(&trapcode, (void *)EXC_RST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_MCHK, (size_t)&trapsize); bcopy(&dsitrap, (void *)EXC_DSI, (size_t)&dsisize); bcopy(&trapcode, (void *)EXC_ISI, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_EXI, (size_t)&trapsize); bcopy(&alitrap, (void *)EXC_ALI, (size_t)&alisize); bcopy(&trapcode, (void *)EXC_PGM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPU, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_DECR, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_SC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_TRC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPA, (size_t)&trapsize); bcopy(&vectrap, (void *)EXC_VEC, (size_t)&vectrapsize); bcopy(&trapcode, (void *)EXC_VECAST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_THRM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_BPT, (size_t)&trapsize); #ifdef KDB bcopy(&dblow, (void *)EXC_RST, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_MCHK, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_PGM, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_TRC, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_BPT, (size_t)&dbsize); #endif __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); /* * Make sure translation has been enabled */ mtmsr(mfmsr() | PSL_IR|PSL_DR|PSL_ME|PSL_RI); isync(); /* * Initialise virtual memory. */ pmap_mmu_install(MMU_TYPE_OEA, 0); /* XXX temporary */ pmap_bootstrap(startkernel, endkernel); /* * Initialize params/tunables that are derived from memsize */ init_param2(physmem); /* * Grab booted kernel's name */ env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Finish setting up thread0. 
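	 * The pcb is placed at the very top of thread0's kernel stack;
	 * with 4 KB pages and KSTACK_PAGES == 4 (typical values, not
	 * fixed by this file) that is kstack0 + 16384 - sizeof(struct pcb).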
*/ thread0.td_kstack = kstack0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * Map and initialise the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, msgbuf_phys + off); msgbufinit(msgbufp, MSGBUF_SIZE); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void bzero(void *buf, size_t len) { caddr_t p; p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct proc *p; int oonstack, rndfsize; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; oonstack = sigonstack(tf->fixreg[1]); rndfsize = ((sizeof(sf) + 15) / 16) * 16; CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* * Save user context */ memset(&sf, 0, sizeof(sf)); grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; /* * Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - rndfsize); } else { sfp = (struct sigframe *)(tf->fixreg[1] - rndfsize); } /* * Translate the signal if appropriate (Linux emu ?) */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* * Save the floating-point state, if necessary, then copy it. */ /* XXX */ /* * Set up the registers to return to sigcode. * * r1/sp - sigframe ptr * lr - sig function, dispatched to by blrl in trampoline * r3 - sig number * r4 - SIGINFO ? &siginfo : exception code * r5 - user context * srr0 - trampoline function addr */ tf->lr = (register_t)catcher; tf->fixreg[1] = (register_t)sfp; tf->fixreg[FIRSTARG] = sig; tf->fixreg[FIRSTARG+2] = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* * Signal handler installed with SA_SIGINFO. */ tf->fixreg[FIRSTARG+1] = (register_t)&sfp->sf_si; /* * Fill siginfo structure. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; sf.sf_si.si_addr = (void *) ((tf->exc == EXC_DSI) ? tf->dar : tf->srr0); } else { /* Old FreeBSD-style arguments. */ tf->fixreg[FIRSTARG+1] = code; tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? tf->dar : tf->srr0; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); tf->srr0 = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); /* * copy the frame out to userland. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Process has trashed its stack. Kill it. 
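		 * The signal frame could not be written out, so the thread
		 * has no usable stack to take the signal on; sigexit() with
		 * SIGILL terminates the process.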
*/ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); } CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->srr0, tf->fixreg[1]); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; ucontext_t uc; int error; CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); p = td->td_proc; PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_lr = tf->srr0; pcb->pcb_sp = tf->fixreg[1]; } /* * get_mcontext/sendsig helper routine that doesn't touch the * proc lock */ static int grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; pcb = td->td_pcb; memset(mcp, 0, sizeof(mcontext_t)); mcp->mc_vers = _MC_VERSION; mcp->mc_flags = 0; memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe)); if (flags & GET_MC_CLEAR_RET) { mcp->mc_gpr[3] = 0; mcp->mc_gpr[4] = 0; } /* * This assumes that floating-point context is *not* lazy, * so if the thread has used FP there would have been a * FP-unavailable exception that would have set things up * correctly. */ if (pcb->pcb_flags & PCB_FPU) { KASSERT(td == curthread, ("get_mcontext: fp save not curthread")); critical_enter(); save_fpu(td); critical_exit(); mcp->mc_flags |= _MC_FP_VALID; memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double)); memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double)); } /* XXX Altivec context ? */ mcp->mc_len = sizeof(*mcp); return (0); } int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { int error; error = grab_mcontext(td, mcp, flags); if (error == 0) { PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]); PROC_UNLOCK(curthread->td_proc); } return (error); } int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tf; pcb = td->td_pcb; tf = td->td_frame; if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp)) return (EINVAL); /* * Don't let the user set privileged MSR bits */ if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) { return (EINVAL); } memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame)); if (mcp->mc_flags & _MC_FP_VALID) { if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) { critical_enter(); enable_fpu(td); critical_exit(); } memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double)); memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double)); } /* XXX Altivec context? */ return (0); } void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. 
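 * Not implemented here; ENXIO tells callers that no per-CPU clock-rate
 * estimate is available.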
*/ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Shutdown the CPU as much as possible. */ void cpu_halt(void) { OF_exit(); } void cpu_idle(void) { /* TODO: Insert code to halt (until next interrupt) */ #ifdef INVARIANTS if ((mfmsr() & PSL_EE) != PSL_EE) { struct thread *td = curthread; printf("td msr %x\n", td->td_md.md_saved_msr); panic("ints disabled in idleproc!"); } #endif } /* * Set set up registers on exec. */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct ps_strings arginfo; tf = trapframe(td); bzero(tf, sizeof *tf); tf->fixreg[1] = -roundup(-stack + 8, 16); /* * XXX Machine-independent code has already copied arguments and * XXX environment to userland. Get them back here. */ (void)copyin((char *)PS_STRINGS, &arginfo, sizeof(arginfo)); /* * Set up arguments for _start(): * _start(argc, argv, envp, obj, cleanup, ps_strings); * * Notes: * - obj and cleanup are the auxilliary and termination * vectors. They are fixed up by ld.elf_so. * - ps_strings is a NetBSD extention, and will be * ignored by executables which are strictly * compliant with the SVR4 ABI. * * XXX We have to set both regs and retval here due to different * XXX calling convention in trap.c and init_main.c. */ /* * XXX PG: these get overwritten in the syscall return code. * execve() should return EJUSTRETURN, like it does on NetBSD. * Emulate by setting the syscall return value cells. The * registers still have to be set for init's fork trampoline. */ td->td_retval[0] = arginfo.ps_nargvstr; td->td_retval[1] = (register_t)arginfo.ps_argvstr; tf->fixreg[3] = arginfo.ps_nargvstr; tf->fixreg[4] = (register_t)arginfo.ps_argvstr; tf->fixreg[5] = (register_t)arginfo.ps_envstr; tf->fixreg[6] = 0; /* auxillary vector */ tf->fixreg[7] = 0; /* termination vector */ tf->fixreg[8] = (register_t)PS_STRINGS; /* NetBSD extension */ tf->srr0 = entry; tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT; td->td_pcb->pcb_flags = 0; } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(regs, tf, sizeof(struct reg)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) memset(fpregs, 0, sizeof(struct fpreg)); else memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(tf, regs, sizeof(struct reg)); return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) enable_fpu(td); memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg)); return (0); } int ptrace_set_pc(struct thread *td, unsigned long addr) { struct trapframe *tf; tf = td->td_frame; tf->srr0 = (register_t)addr; return (0); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 |= PSL_SE; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 &= ~PSL_SE; return (0); } /* * Initialise a struct pcpu. 
*/ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_msr = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_msr); } /* * kcopy(const void *src, void *dst, size_t len); * * Copy len bytes from src to dst, aborting if we encounter a fatal * page fault. * * kcopy() _must_ save and restore the old fault handler since it is * called by uiomove(), which may be in the path of servicing a non-fatal * page fault. */ int kcopy(const void *src, void *dst, size_t len) { struct thread *td; faultbuf env, *oldfault; int rv; td = PCPU_GET(curthread); oldfault = td->td_pcb->pcb_onfault; if ((rv = setfault(env)) != 0) { td->td_pcb->pcb_onfault = oldfault; return rv; } memcpy(dst, src, len); td->td_pcb->pcb_onfault = oldfault; return (0); } void asm_panic(char *pstr) { panic(pstr); } int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC || (frame->exc == EXC_PGM && (frame->srr1 & 0x20000)) || frame->exc == EXC_BPT || frame->exc == EXC_DSI)) { int type = frame->exc; if (type == EXC_PGM && (frame->srr1 & 0x20000)) { type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } Index: head/sys/powerpc/powerpc/machdep.c =================================================================== --- head/sys/powerpc/powerpc/machdep.c (revision 169666) +++ head/sys/powerpc/powerpc/machdep.c (revision 169667) @@ -1,988 +1,988 @@ /*- * Copyright (C) 1995, 1996 Wolfgang Solfrank. * Copyright (C) 1995, 1996 TooLs GmbH. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (C) 2001 Benno Rice * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $ */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif int cold = 1; struct pcpu __pcpu[MAXCPU]; struct trapframe frame0; vm_offset_t kstack0; vm_offset_t kstack0_phys; char machine[] = "powerpc"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static int cacheline_size = CACHELINESIZE; SYSCTL_INT(_machdep, CPU_CACHELINE, cacheline_size, CTLFLAG_RD, &cacheline_size, 0, ""); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) void powerpc_init(u_int, u_int, u_int, void *); int save_ofw_mapping(void); int restore_ofw_mapping(void); void install_extint(void (*)(void)); int setfault(faultbuf); /* defined in locore.S */ static int grab_mcontext(struct thread *, mcontext_t *, int); void asm_panic(char *); long Maxmem = 0; long realmem = 0; struct pmap ofw_pmap; extern int ofmsr; struct bat battable[16]; struct kva_md_info kmi; void setPQL2(int *const size, int *const ways); void setPQL2(int *const size, int *const ways) { return; } static void powerpc_ofw_shutdown(void *junk, int howto) { if (howto & RB_HALT) { OF_halt(); } OF_reboot(); } static void cpu_startup(void *dummy) { /* * Initialise the decrementer-based clock. */ decr_init(); /* * Good {morning,afternoon,evening,night}. */ cpu_setup(PCPU_GET(cpuid)); /* startrtclock(); */ #ifdef PERFMON perfmon_init(); #endif printf("real memory = %ld (%ld MB)\n", ptoa(physmem), ptoa(physmem) / 1048576); realmem = physmem; /* * Display any holes after the first chunk of extended memory. 
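	 * phys_avail[] holds (start, end) pairs, so each chunk covers
	 * phys_avail[indx] .. phys_avail[indx + 1] - 1.  For example, a
	 * pair (0x00000000, 0x08000000) prints as 134217728 bytes, i.e.
	 * 32768 pages with 4 KB pages.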
*/ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } vm_ksubmap_init(&kmi); - printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count), - ptoa(cnt.v_free_count) / 1048576); + printf("avail memory = %ld (%ld MB)\n", ptoa(VMCNT_GET(free_count)), + ptoa(VMCNT_GET(free_count)) / 1048576); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, powerpc_ofw_shutdown, 0, SHUTDOWN_PRI_LAST); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the secondaries */ mp_announce(); #endif /* SMP */ } extern char kernel_text[], _end[]; extern void *trapcode, *trapsize; extern void *alitrap, *alisize; extern void *dsitrap, *dsisize; extern void *decrint, *decrsize; extern void *extint, *extsize; extern void *dblow, *dbsize; extern void *vectrap, *vectrapsize; void powerpc_init(u_int startkernel, u_int endkernel, u_int basekernel, void *mdp) { struct pcpu *pc; vm_offset_t end, off; void *kmdp; char *env; end = 0; kmdp = NULL; /* * Parse metadata if present and fetch parameters. Must be done * before console is inited so cninit gets the right value of * boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); #ifdef DDB ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t); ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t); #endif } } /* * Init params/tunables that can be overridden by the loader */ init_param1(); /* * Start initializing proc0 and thread0. */ proc_linkup(&proc0, &thread0); thread0.td_frame = &frame0; /* * Set up per-cpu data. */ pc = &__pcpu[0]; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; pc->pc_cpuid = 0; __asm __volatile("mtsprg 0, %0" :: "r"(pc)); mutex_init(); /* * Initialize the console before printing anything. */ cninit(); /* * Complain if there is no metadata. */ if (mdp == NULL || kmdp == NULL) { printf("powerpc_init: no loader metadata.\n"); } kdb_init(); kobj_machdep_init(); /* * XXX: Initialize the interrupt tables. 
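	 * The handlers below are bcopy()ed onto the fixed low-memory
	 * exception vectors (EXC_RST, EXC_DSI, ...) and the instruction
	 * cache is resynchronized over that range afterwards.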
* Disable translation in case the vector area * hasn't been mapped (G5) */ mtmsr(mfmsr() & ~(PSL_IR | PSL_DR)); isync(); bcopy(&trapcode, (void *)EXC_RST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_MCHK, (size_t)&trapsize); bcopy(&dsitrap, (void *)EXC_DSI, (size_t)&dsisize); bcopy(&trapcode, (void *)EXC_ISI, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_EXI, (size_t)&trapsize); bcopy(&alitrap, (void *)EXC_ALI, (size_t)&alisize); bcopy(&trapcode, (void *)EXC_PGM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPU, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_DECR, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_SC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_TRC, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_FPA, (size_t)&trapsize); bcopy(&vectrap, (void *)EXC_VEC, (size_t)&vectrapsize); bcopy(&trapcode, (void *)EXC_VECAST, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_THRM, (size_t)&trapsize); bcopy(&trapcode, (void *)EXC_BPT, (size_t)&trapsize); #ifdef KDB bcopy(&dblow, (void *)EXC_RST, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_MCHK, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_PGM, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_TRC, (size_t)&dbsize); bcopy(&dblow, (void *)EXC_BPT, (size_t)&dbsize); #endif __syncicache(EXC_RSVD, EXC_LAST - EXC_RSVD); /* * Make sure translation has been enabled */ mtmsr(mfmsr() | PSL_IR|PSL_DR|PSL_ME|PSL_RI); isync(); /* * Initialise virtual memory. */ pmap_mmu_install(MMU_TYPE_OEA, 0); /* XXX temporary */ pmap_bootstrap(startkernel, endkernel); /* * Initialize params/tunables that are derived from memsize */ init_param2(physmem); /* * Grab booted kernel's name */ env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Finish setting up thread0. */ thread0.td_kstack = kstack0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; /* * Map and initialise the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, msgbuf_phys + off); msgbufinit(msgbufp, MSGBUF_SIZE); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void bzero(void *buf, size_t len) { caddr_t p; p = buf; while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) { *p++ = 0; len--; } while (len >= sizeof(u_long) * 8) { *(u_long*) p = 0; *((u_long*) p + 1) = 0; *((u_long*) p + 2) = 0; *((u_long*) p + 3) = 0; len -= sizeof(u_long) * 8; *((u_long*) p + 4) = 0; *((u_long*) p + 5) = 0; *((u_long*) p + 6) = 0; *((u_long*) p + 7) = 0; p += sizeof(u_long) * 8; } while (len >= sizeof(u_long)) { *(u_long*) p = 0; len -= sizeof(u_long); p += sizeof(u_long); } while (len) { *p++ = 0; len--; } } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct proc *p; int oonstack, rndfsize; int sig; int code; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; oonstack = sigonstack(tf->fixreg[1]); rndfsize = ((sizeof(sf) + 15) / 16) * 16; CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* * Save user context */ memset(&sf, 0, sizeof(sf)); grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? 
SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; /* * Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - rndfsize); } else { sfp = (struct sigframe *)(tf->fixreg[1] - rndfsize); } /* * Translate the signal if appropriate (Linux emu ?) */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* * Save the floating-point state, if necessary, then copy it. */ /* XXX */ /* * Set up the registers to return to sigcode. * * r1/sp - sigframe ptr * lr - sig function, dispatched to by blrl in trampoline * r3 - sig number * r4 - SIGINFO ? &siginfo : exception code * r5 - user context * srr0 - trampoline function addr */ tf->lr = (register_t)catcher; tf->fixreg[1] = (register_t)sfp; tf->fixreg[FIRSTARG] = sig; tf->fixreg[FIRSTARG+2] = (register_t)&sfp->sf_uc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* * Signal handler installed with SA_SIGINFO. */ tf->fixreg[FIRSTARG+1] = (register_t)&sfp->sf_si; /* * Fill siginfo structure. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; sf.sf_si.si_addr = (void *) ((tf->exc == EXC_DSI) ? tf->dar : tf->srr0); } else { /* Old FreeBSD-style arguments. */ tf->fixreg[FIRSTARG+1] = code; tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ? tf->dar : tf->srr0; } mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); tf->srr0 = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode)); /* * copy the frame out to userland. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Process has trashed its stack. Kill it. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); } CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->srr0, tf->fixreg[1]); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; ucontext_t uc; int error; CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } error = set_mcontext(td, &uc.uc_mcontext); if (error != 0) return (error); p = td->td_proc; PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x", td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. 
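 * On PowerPC that amounts to the interrupted PC (srr0), saved as the
 * pcb's link register, plus the stack pointer in r1.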
*/ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_lr = tf->srr0; pcb->pcb_sp = tf->fixreg[1]; } /* * get_mcontext/sendsig helper routine that doesn't touch the * proc lock */ static int grab_mcontext(struct thread *td, mcontext_t *mcp, int flags) { struct pcb *pcb; pcb = td->td_pcb; memset(mcp, 0, sizeof(mcontext_t)); mcp->mc_vers = _MC_VERSION; mcp->mc_flags = 0; memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe)); if (flags & GET_MC_CLEAR_RET) { mcp->mc_gpr[3] = 0; mcp->mc_gpr[4] = 0; } /* * This assumes that floating-point context is *not* lazy, * so if the thread has used FP there would have been a * FP-unavailable exception that would have set things up * correctly. */ if (pcb->pcb_flags & PCB_FPU) { KASSERT(td == curthread, ("get_mcontext: fp save not curthread")); critical_enter(); save_fpu(td); critical_exit(); mcp->mc_flags |= _MC_FP_VALID; memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double)); memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double)); } /* XXX Altivec context ? */ mcp->mc_len = sizeof(*mcp); return (0); } int get_mcontext(struct thread *td, mcontext_t *mcp, int flags) { int error; error = grab_mcontext(td, mcp, flags); if (error == 0) { PROC_LOCK(curthread->td_proc); mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]); PROC_UNLOCK(curthread->td_proc); } return (error); } int set_mcontext(struct thread *td, const mcontext_t *mcp) { struct pcb *pcb; struct trapframe *tf; pcb = td->td_pcb; tf = td->td_frame; if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp)) return (EINVAL); /* * Don't let the user set privileged MSR bits */ if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) { return (EINVAL); } memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame)); if (mcp->mc_flags & _MC_FP_VALID) { if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) { critical_enter(); enable_fpu(td); critical_exit(); } memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double)); memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double)); } /* XXX Altivec context? */ return (0); } void cpu_boot(int howto) { } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Shutdown the CPU as much as possible. */ void cpu_halt(void) { OF_exit(); } void cpu_idle(void) { /* TODO: Insert code to halt (until next interrupt) */ #ifdef INVARIANTS if ((mfmsr() & PSL_EE) != PSL_EE) { struct thread *td = curthread; printf("td msr %x\n", td->td_md.md_saved_msr); panic("ints disabled in idleproc!"); } #endif } /* * Set set up registers on exec. */ void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct ps_strings arginfo; tf = trapframe(td); bzero(tf, sizeof *tf); tf->fixreg[1] = -roundup(-stack + 8, 16); /* * XXX Machine-independent code has already copied arguments and * XXX environment to userland. Get them back here. */ (void)copyin((char *)PS_STRINGS, &arginfo, sizeof(arginfo)); /* * Set up arguments for _start(): * _start(argc, argv, envp, obj, cleanup, ps_strings); * * Notes: * - obj and cleanup are the auxilliary and termination * vectors. They are fixed up by ld.elf_so. * - ps_strings is a NetBSD extention, and will be * ignored by executables which are strictly * compliant with the SVR4 ABI. * * XXX We have to set both regs and retval here due to different * XXX calling convention in trap.c and init_main.c. */ /* * XXX PG: these get overwritten in the syscall return code. 
* execve() should return EJUSTRETURN, like it does on NetBSD. * Emulate by setting the syscall return value cells. The * registers still have to be set for init's fork trampoline. */ td->td_retval[0] = arginfo.ps_nargvstr; td->td_retval[1] = (register_t)arginfo.ps_argvstr; tf->fixreg[3] = arginfo.ps_nargvstr; tf->fixreg[4] = (register_t)arginfo.ps_argvstr; tf->fixreg[5] = (register_t)arginfo.ps_envstr; tf->fixreg[6] = 0; /* auxillary vector */ tf->fixreg[7] = 0; /* termination vector */ tf->fixreg[8] = (register_t)PS_STRINGS; /* NetBSD extension */ tf->srr0 = entry; tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT; td->td_pcb->pcb_flags = 0; } int fill_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(regs, tf, sizeof(struct reg)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) memset(fpregs, 0, sizeof(struct fpreg)); else memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; tf = td->td_frame; memcpy(tf, regs, sizeof(struct reg)); return (0); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { /* No debug registers on PowerPC */ return (ENOSYS); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct pcb *pcb; pcb = td->td_pcb; if ((pcb->pcb_flags & PCB_FPU) == 0) enable_fpu(td); memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg)); return (0); } int ptrace_set_pc(struct thread *td, unsigned long addr) { struct trapframe *tf; tf = td->td_frame; tf->srr0 = (register_t)addr; return (0); } int ptrace_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 |= PSL_SE; return (0); } int ptrace_clear_single_step(struct thread *td) { struct trapframe *tf; tf = td->td_frame; tf->srr1 &= ~PSL_SE; return (0); } /* * Initialise a struct pcpu. */ void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t sz) { } void spinlock_enter(void) { struct thread *td; td = curthread; if (td->td_md.md_spinlock_count == 0) td->td_md.md_saved_msr = intr_disable(); td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) intr_restore(td->td_md.md_saved_msr); } /* * kcopy(const void *src, void *dst, size_t len); * * Copy len bytes from src to dst, aborting if we encounter a fatal * page fault. * * kcopy() _must_ save and restore the old fault handler since it is * called by uiomove(), which may be in the path of servicing a non-fatal * page fault. 
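 *
 * A minimal usage sketch (hypothetical caller and variable names, shown
 * for illustration only):
 */
#if 0
	error = kcopy(src, dst, len);
	if (error != 0)
		return (error);		/* the copy faulted */
#endif
/*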
*/ int kcopy(const void *src, void *dst, size_t len) { struct thread *td; faultbuf env, *oldfault; int rv; td = PCPU_GET(curthread); oldfault = td->td_pcb->pcb_onfault; if ((rv = setfault(env)) != 0) { td->td_pcb->pcb_onfault = oldfault; return rv; } memcpy(dst, src, len); td->td_pcb->pcb_onfault = oldfault; return (0); } void asm_panic(char *pstr) { panic(pstr); } int db_trap_glue(struct trapframe *); /* Called from trap_subr.S */ int db_trap_glue(struct trapframe *frame) { if (!(frame->srr1 & PSL_PR) && (frame->exc == EXC_TRC || frame->exc == EXC_RUNMODETRC || (frame->exc == EXC_PGM && (frame->srr1 & 0x20000)) || frame->exc == EXC_BPT || frame->exc == EXC_DSI)) { int type = frame->exc; if (type == EXC_PGM && (frame->srr1 & 0x20000)) { type = T_BREAKPOINT; } return (kdb_trap(type, 0, frame)); } return (0); } Index: head/sys/sparc64/sparc64/machdep.c =================================================================== --- head/sys/sparc64/sparc64/machdep.c (revision 169666) +++ head/sys/sparc64/sparc64/machdep.c (revision 169667) @@ -1,912 +1,912 @@ /*- * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef int ofw_vec_t(void *); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; char pcpu0[PCPU_PAGES * PAGE_SIZE]; struct trapframe frame0; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; /* * Note: timer quality for CPU's is set low to try and prevent them from * being chosen as the primary timecounter. The CPU counters are not * synchronized among the CPU's so in MP machines this causes problems * when calculating the time. With this value the CPU's should only be * chosen as the primary timecounter as a last resort. */ #define UP_TICK_QUALITY 1000 #define MP_TICK_QUALITY -100 static struct timecounter tick_tc; char sparc64_model[32]; static int cpu_use_vis = 1; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static timecounter_get_t tick_get_timecount; void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; tick_tc.tc_get_timecount = tick_get_timecount; tick_tc.tc_poll_pps = NULL; tick_tc.tc_counter_mask = ~0u; tick_tc.tc_frequency = tick_freq; tick_tc.tc_name = "tick"; tick_tc.tc_quality = UP_TICK_QUALITY; #ifdef SMP /* * We do not know if each CPU's tick counter is synchronized. 
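	 * If this is an MP system, advertise the negative MP_TICK_QUALITY
	 * so the timecounter code prefers any other available counter.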
*/ if (cpu_mp_probe()) tick_tc.tc_quality = MP_TICK_QUALITY; #endif tc_init(&tick_tc); physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz / PAGE_SIZE; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); - printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE, - cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); + printf("avail memory = %lu (%lu MB)\n", VMCNT_GET(free_count) * + PAGE_SIZE, VMCNT_GET(free_count) / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); cpu_identify(rdpr(ver), tick_freq, PCPU_GET(cpuid)); } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = rdpr(pil); wrpr(pil, 0, PIL_TICK); td->td_md.md_saved_pil = pil; } td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) wrpr(pil, td->td_md.md_saved_pil, 0); } unsigned tick_get_timecount(struct timecounter *tc) { return ((unsigned)rd(tick)); } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { phandle_t child; phandle_t root; struct pcpu *pc; vm_offset_t end; caddr_t kmdp; u_int clock; char *env; char type[8]; end = 0; kmdp = NULL; /* * Find out what kind of cpu we have first, for anything that changes * behaviour. */ cpu_impl = VER_IMPL(rdpr(ver)); /* * Initialize Open Firmware (needed for console). */ OF_init(vec); /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit gets the right value of boothowto. */ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } init_param1(); root = OF_peer(0); for (child = OF_child(root); child != 0; child = OF_peer(child)) { OF_getprop(child, "device_type", type, sizeof(type)); if (strcmp(type, "cpu") == 0) break; } /* * Initialize the tick counter. Must be before the console is inited * in order to provide the low-level console drivers with a working * DELAY(). */ OF_getprop(child, "clock-frequency", &clock, sizeof(clock)); tick_init(clock); /* * Initialize the console before printing anything. */ cninit(); /* * Panic if there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL) { printf("sparc64_init: no loader metadata.\n" "This probably means you are not using loader(8).\n"); panic("sparc64_init"); } /* * Sanity check the kernel end, which is important. 
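	 * If the loader did not supply MODINFOMD_KERNEND, fall back to the
	 * linker-provided _end symbol and keep going.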
*/ if (end == 0) { printf("sparc64_init: warning, kernel end not specified.\n" "Attempting to continue anyway.\n"); end = (vm_offset_t)_end; } cache_init(child); uma_set_align(cache.dc_linesize - 1); cpu_block_copy = bcopy; cpu_block_zero = bzero; getenv_int("machdep.use_vis", &cpu_use_vis); if (cpu_use_vis) { switch (cpu_impl) { case CPU_IMPL_SPARC64: case CPU_IMPL_ULTRASPARCI: case CPU_IMPL_ULTRASPARCII: case CPU_IMPL_ULTRASPARCIIi: case CPU_IMPL_ULTRASPARCIIe: cpu_block_copy = spitfire_block_copy; cpu_block_zero = spitfire_block_zero; break; } } #ifdef SMP mp_tramp = mp_tramp_alloc(); #endif /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(end); /* * Initialize tunables. */ init_param2(physmem); env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize the interrupt tables. */ intr_init1(); /* * Initialize proc0 stuff (p_contested needs to be done early). */ proc_linkup(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; thread0.td_kstack = kstack0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; /* * Prime our per-cpu data page for use. Note, we are using it for our * stack, so don't pass the real size (PAGE_SIZE) to pcpu_init or * it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_curpcb = thread0.td_pcb; pc->pc_mid = UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG)); pc->pc_addr = (vm_offset_t)pcpu0; pc->pc_node = child; pc->pc_tlb_ctx = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN; pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX; /* * Initialize global registers. */ cpu_setregs(pc); /* * Initialize the message buffer (after setting trap table). */ msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); intr_init2(); /* * Finish pmap initialization now that we're ready for mutexes. */ PMAP_LOCK_INIT(kernel_pmap); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void set_openfirm_callback(ofw_vec_t *vec) { ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; int oonstack; u_long sp; int sig; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal trampoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. 
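	 * The frame goes on the alternate signal stack when one is
	 * configured, requested for this signal and not already in use;
	 * otherwise it is carved out of the interrupted stack just below
	 * the current stack pointer.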
*/ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. 
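 * Here that is just the trap PC (%tpc) and the stack pointer.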
*/ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; bcopy(tf, mc, sizeof(*tf)); if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } mc->mc_flags = _MC_VERSION; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); tf->tf_fprs &= ~FPRS_FEF; pcb->pcb_flags |= PCB_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, const mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; uint64_t wstate; if (!TSTATE_SECURE(mc->mc_tstate) || (mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); wstate = tf->tf_wstate; bcopy(mc, tf, sizeof(*tf)); tf->tf_wstate = wstate; if ((mc->mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif openfirmware_exit(args); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. */ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? 
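 * RB_POWEROFF goes through the firmware's SUNW,power-off service; a plain RB_HALT falls back to cpu_halt(), which re-enters the firmware through its "exit" service instead.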
*/ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(void) { /* Insert code to halt (until next interrupt) for the idle loop */ } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct pcb *pcb; struct proc *p; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = entry + 4; tf->tf_tpc = entry; tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } Index: head/sys/sparc64/sparc64/pmap.c =================================================================== --- head/sys/sparc64/sparc64/pmap.c (revision 169666) +++ head/sys/sparc64/sparc64/pmap.c (revision 169667) @@ -1,1948 +1,1948 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD$ */ /* * Manages physical address maps. * * In addition to hardware address maps, this module is called upon to * provide software-use-only maps which may or may not be stored in the * same form as hardware maps. These pseudo-maps are used to store * intermediate results from copy operations to and from address spaces. * * Since the information managed by this module is also stored by the * logical address mapping module, this module may throw away valid virtual * to physical mappings at almost any time. However, invalidations of * mappings must be done as requested. * * In order to cope with hardware architectures which make virtual to * physical map invalidates expensive, this module may delay invalidate * reduced protection operations until such time as they are actually * necessary. This module is given full information as to which processors * are currently using which maps, and to when physical maps must be made * correct. */ #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define PMAP_DEBUG #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif /* * Virtual and physical address of message buffer. */ struct msgbuf *msgbufp; vm_paddr_t msgbuf_phys; /* * Map of physical memory reagions. 
*/ vm_paddr_t phys_avail[128]; static struct ofw_mem_region mra[128]; struct ofw_mem_region sparc64_memreg[128]; int sparc64_nmemreg; static struct ofw_map translations[128]; static int translations_size; static vm_offset_t pmap_idle_map; static vm_offset_t pmap_temp_map_1; static vm_offset_t pmap_temp_map_2; /* * First and last available kernel virtual addresses. */ vm_offset_t virtual_avail; vm_offset_t virtual_end; vm_offset_t kernel_vm_end; vm_offset_t vm_max_kernel_address; /* * Kernel pmap. */ struct pmap kernel_pmap_store; /* * Allocate physical memory for use in pmap_bootstrap. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size); /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The page queues and pmap must be locked. */ static void pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired); extern int tl1_immu_miss_patch_1[]; extern int tl1_immu_miss_patch_2[]; extern int tl1_dmmu_miss_patch_1[]; extern int tl1_dmmu_miss_patch_2[]; extern int tl1_dmmu_prot_patch_1[]; extern int tl1_dmmu_prot_patch_2[]; /* * If a user pmap is processed with pmap_remove and the * resident count drops to 0, there are no more pages to remove, so we * need not continue. */ #define PMAP_REMOVE_DONE(pm) \ ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0) /* * The threshold (in bytes) above which tsb_foreach() is used in pmap_remove() * and pmap_protect() instead of trying each virtual address. */ #define PMAP_TSB_THRESH ((TSB_SIZE / 2) * PAGE_SIZE) SYSCTL_NODE(_debug, OID_AUTO, pmap_stats, CTLFLAG_RD, 0, ""); PMAP_STATS_VAR(pmap_nenter); PMAP_STATS_VAR(pmap_nenter_update); PMAP_STATS_VAR(pmap_nenter_replace); PMAP_STATS_VAR(pmap_nenter_new); PMAP_STATS_VAR(pmap_nkenter); PMAP_STATS_VAR(pmap_nkenter_oc); PMAP_STATS_VAR(pmap_nkenter_stupid); PMAP_STATS_VAR(pmap_nkremove); PMAP_STATS_VAR(pmap_nqenter); PMAP_STATS_VAR(pmap_nqremove); PMAP_STATS_VAR(pmap_ncache_enter); PMAP_STATS_VAR(pmap_ncache_enter_c); PMAP_STATS_VAR(pmap_ncache_enter_oc); PMAP_STATS_VAR(pmap_ncache_enter_cc); PMAP_STATS_VAR(pmap_ncache_enter_coc); PMAP_STATS_VAR(pmap_ncache_enter_nc); PMAP_STATS_VAR(pmap_ncache_enter_cnc); PMAP_STATS_VAR(pmap_ncache_remove); PMAP_STATS_VAR(pmap_ncache_remove_c); PMAP_STATS_VAR(pmap_ncache_remove_oc); PMAP_STATS_VAR(pmap_ncache_remove_cc); PMAP_STATS_VAR(pmap_ncache_remove_coc); PMAP_STATS_VAR(pmap_ncache_remove_nc); PMAP_STATS_VAR(pmap_nzero_page); PMAP_STATS_VAR(pmap_nzero_page_c); PMAP_STATS_VAR(pmap_nzero_page_oc); PMAP_STATS_VAR(pmap_nzero_page_nc); PMAP_STATS_VAR(pmap_nzero_page_area); PMAP_STATS_VAR(pmap_nzero_page_area_c); PMAP_STATS_VAR(pmap_nzero_page_area_oc); PMAP_STATS_VAR(pmap_nzero_page_area_nc); PMAP_STATS_VAR(pmap_nzero_page_idle); PMAP_STATS_VAR(pmap_nzero_page_idle_c); PMAP_STATS_VAR(pmap_nzero_page_idle_oc); PMAP_STATS_VAR(pmap_nzero_page_idle_nc); PMAP_STATS_VAR(pmap_ncopy_page); PMAP_STATS_VAR(pmap_ncopy_page_c); PMAP_STATS_VAR(pmap_ncopy_page_oc); PMAP_STATS_VAR(pmap_ncopy_page_nc); PMAP_STATS_VAR(pmap_ncopy_page_dc); PMAP_STATS_VAR(pmap_ncopy_page_doc); PMAP_STATS_VAR(pmap_ncopy_page_sc); PMAP_STATS_VAR(pmap_ncopy_page_soc); PMAP_STATS_VAR(pmap_nnew_thread); PMAP_STATS_VAR(pmap_nnew_thread_oc); /* * Quick sort callout for comparing memory regions.
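 * Both mr_cmp() and om_cmp() order entries by ascending start address, so the phys_avail[] chunks and the firmware translations are walked lowest address first.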
*/ static int mr_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b); static int mr_cmp(const void *a, const void *b) { const struct ofw_mem_region *mra; const struct ofw_mem_region *mrb; mra = a; mrb = b; if (mra->mr_start < mrb->mr_start) return (-1); else if (mra->mr_start > mrb->mr_start) return (1); else return (0); } static int om_cmp(const void *a, const void *b) { const struct ofw_map *oma; const struct ofw_map *omb; oma = a; omb = b; if (oma->om_start < omb->om_start) return (-1); else if (oma->om_start > omb->om_start) return (1); else return (0); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ekva) { struct pmap *pm; struct tte *tp; vm_offset_t off; vm_offset_t va; vm_paddr_t pa; vm_size_t physsz; vm_size_t virtsz; ihandle_t pmem; ihandle_t vmem; int sz; int i; int j; /* * Find out what physical memory is available from the prom and * initialize the phys_avail array. This must be done before * pmap_bootstrap_alloc is called. */ if ((pmem = OF_finddevice("/memory")) == -1) panic("pmap_bootstrap: finddevice /memory"); if ((sz = OF_getproplen(pmem, "available")) == -1) panic("pmap_bootstrap: getproplen /memory/available"); if (sizeof(phys_avail) < sz) panic("pmap_bootstrap: phys_avail too small"); if (sizeof(mra) < sz) panic("pmap_bootstrap: mra too small"); bzero(mra, sz); if (OF_getprop(pmem, "available", mra, sz) == -1) panic("pmap_bootstrap: getprop /memory/available"); sz /= sizeof(*mra); CTR0(KTR_PMAP, "pmap_bootstrap: physical memory"); qsort(mra, sz, sizeof (*mra), mr_cmp); physsz = 0; getenv_quad("hw.physmem", &physmem); physmem = btoc(physmem); for (i = 0, j = 0; i < sz; i++, j += 2) { CTR2(KTR_PMAP, "start=%#lx size=%#lx", mra[i].mr_start, mra[i].mr_size); if (physmem != 0 && btoc(physsz + mra[i].mr_size) >= physmem) { if (btoc(physsz) < physmem) { phys_avail[j] = mra[i].mr_start; phys_avail[j + 1] = mra[i].mr_start + (ctob(physmem) - physsz); physsz = ctob(physmem); } break; } phys_avail[j] = mra[i].mr_start; phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; physsz += mra[i].mr_size; } physmem = btoc(physsz); /* * Calculate the size of kernel virtual memory, and the size and mask * for the kernel tsb. */ virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT)); vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz; tsb_kernel_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT); tsb_kernel_mask = (tsb_kernel_size >> TTE_SHIFT) - 1; /* * Allocate the kernel tsb and lock it in the tlb. */ pa = pmap_bootstrap_alloc(tsb_kernel_size); if (pa & PAGE_MASK_4M) panic("pmap_bootstrap: tsb unaligned\n"); tsb_kernel_phys = pa; tsb_kernel = (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size); pmap_map_tsb(); bzero(tsb_kernel, tsb_kernel_size); /* * Allocate and map the message buffer. */ msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE); msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys); /* * Patch the virtual address and the tsb mask into the trap table. 
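 * SETHI() and OR_R_I_R() rebuild the expected instruction encodings; PATCH() first checks that each patch site still carries zero immediates, then ORs in the TSB mask and the TSB base address and flushes the modified instruction words.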
*/ #define SETHI(rd, imm22) \ (EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) | \ EIF_IMM((imm22) >> 10, 22)) #define OR_R_I_R(rd, imm13, rs1) \ (EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) | \ EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13)) #define PATCH(addr) do { \ if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) || \ addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0, IF_F3_RS1(addr[1])) || \ addr[2] != SETHI(IF_F2_RD(addr[2]), 0x0)) \ panic("pmap_boostrap: patched instructions have changed"); \ addr[0] |= EIF_IMM((tsb_kernel_mask) >> 10, 22); \ addr[1] |= EIF_IMM(tsb_kernel_mask, 10); \ addr[2] |= EIF_IMM(((vm_offset_t)tsb_kernel) >> 10, 22); \ flush(addr); \ flush(addr + 1); \ flush(addr + 2); \ } while (0) PATCH(tl1_immu_miss_patch_1); PATCH(tl1_immu_miss_patch_2); PATCH(tl1_dmmu_miss_patch_1); PATCH(tl1_dmmu_miss_patch_2); PATCH(tl1_dmmu_prot_patch_1); PATCH(tl1_dmmu_prot_patch_2); /* * Enter fake 8k pages for the 4MB kernel pages, so that * pmap_kextract() will work for them. */ for (i = 0; i < kernel_tlb_slots; i++) { pa = kernel_tlbs[i].te_pa; va = kernel_tlbs[i].te_va; for (off = 0; off < PAGE_SIZE_4M; off += PAGE_SIZE) { tp = tsb_kvtotte(va + off); tp->tte_vpn = TV_VPN(va + off, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa + off) | TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W; } } /* * Set the start and end of kva. The kernel is loaded at the first * available 4 meg super page, so round up to the end of the page. */ virtual_avail = roundup2(ekva, PAGE_SIZE_4M); virtual_end = vm_max_kernel_address; kernel_vm_end = vm_max_kernel_address; /* * Allocate kva space for temporary mappings. */ pmap_idle_map = virtual_avail; virtual_avail += PAGE_SIZE * DCACHE_COLORS; pmap_temp_map_1 = virtual_avail; virtual_avail += PAGE_SIZE * DCACHE_COLORS; pmap_temp_map_2 = virtual_avail; virtual_avail += PAGE_SIZE * DCACHE_COLORS; /* * Allocate a kernel stack with guard page for thread0 and map it into * the kernel tsb. We must ensure that the virtual address is coloured * properly, since we're allocating from phys_avail so the memory won't * have an associated vm_page_t. */ pa = pmap_bootstrap_alloc(roundup(KSTACK_PAGES, DCACHE_COLORS) * PAGE_SIZE); kstack0_phys = pa; virtual_avail += roundup(KSTACK_GUARD_PAGES, DCACHE_COLORS) * PAGE_SIZE; kstack0 = virtual_avail; virtual_avail += roundup(KSTACK_PAGES, DCACHE_COLORS) * PAGE_SIZE; KASSERT(DCACHE_COLOR(kstack0) == DCACHE_COLOR(kstack0_phys), ("pmap_bootstrap: kstack0 miscoloured")); for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tp = tsb_kvtotte(va); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW | TD_CP | TD_CV | TD_P | TD_W; } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) ; Maxmem = sparc64_btop(phys_avail[i + 1]); /* * Add the prom mappings to the kernel tsb. 
*/ if ((vmem = OF_finddevice("/virtual-memory")) == -1) panic("pmap_bootstrap: finddevice /virtual-memory"); if ((sz = OF_getproplen(vmem, "translations")) == -1) panic("pmap_bootstrap: getproplen translations"); if (sizeof(translations) < sz) panic("pmap_bootstrap: translations too small"); bzero(translations, sz); if (OF_getprop(vmem, "translations", translations, sz) == -1) panic("pmap_bootstrap: getprop /virtual-memory/translations"); sz /= sizeof(*translations); translations_size = sz; CTR0(KTR_PMAP, "pmap_bootstrap: translations"); qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { CTR3(KTR_PMAP, "translation: start=%#lx size=%#lx tte=%#lx", translations[i].om_start, translations[i].om_size, translations[i].om_tte); if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) continue; for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; tp = tsb_kvtotte(va); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = ((translations[i].om_tte & ~(TD_SOFT_MASK << TD_SOFT_SHIFT)) | TD_EXEC) + off; } } /* * Get the available physical memory ranges from /memory/reg. These * are only used for kernel dumps, but it may not be wise to do prom * calls in that situation. */ if ((sz = OF_getproplen(pmem, "reg")) == -1) panic("pmap_bootstrap: getproplen /memory/reg"); if (sizeof(sparc64_memreg) < sz) panic("pmap_bootstrap: sparc64_memreg too small"); if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1) panic("pmap_bootstrap: getprop /memory/reg"); sparc64_nmemreg = sz / sizeof(*sparc64_memreg); /* * Initialize the kernel pmap (which is statically allocated). * NOTE: PMAP_LOCK_INIT() is needed as part of the initialization * but sparc64 start up is not ready to initialize mutexes yet. * It is called in machdep.c. */ pm = kernel_pmap; for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = TLB_CTX_KERNEL; pm->pm_active = ~0; /* XXX flush all non-locked tlb entries */ } void pmap_map_tsb(void) { vm_offset_t va; vm_paddr_t pa; u_long data; u_long s; int i; s = intr_disable(); /* * Map the 4mb tsb pages. */ for (i = 0; i < tsb_kernel_size; i += PAGE_SIZE_4M) { va = (vm_offset_t)tsb_kernel + i; pa = tsb_kernel_phys + i; data = TD_V | TD_4M | TD_PA(pa) | TD_L | TD_CP | TD_CV | TD_P | TD_W; /* XXX - cheetah */ stxa(AA_DMMU_TAR, ASI_DMMU, TLB_TAR_VA(va) | TLB_TAR_CTX(TLB_CTX_KERNEL)); stxa_sync(0, ASI_DTLB_DATA_IN_REG, data); } /* * Set the secondary context to be the kernel context (needed for * fp block operations in the kernel and the cache code). */ stxa(AA_DMMU_SCXR, ASI_DMMU, TLB_CTX_KERNEL); membar(Sync); intr_restore(s); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from pmap_bootstrap before avail start and end are * calculated. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size) { vm_paddr_t pa; int i; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] - phys_avail[i] < size) continue; pa = phys_avail[i]; phys_avail[i] += size; return (pa); } panic("pmap_bootstrap_alloc"); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.tte_list); m->md.color = DCACHE_COLOR(VM_PAGE_TO_PHYS(m)); m->md.flags = 0; m->md.pmap = NULL; } /* * Initialize the pmap module. 
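 * The firmware translation ranges recorded above are re-reserved in kernel_map here, keeping later kva allocations from landing on top of the live PROM mappings.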
*/ void pmap_init(void) { vm_offset_t addr; vm_size_t size; int result; int i; for (i = 0; i < translations_size; i++) { addr = translations[i].om_start; size = translations[i].om_size; if (addr < VM_MIN_PROM_ADDRESS || addr > VM_MAX_PROM_ADDRESS) continue; result = vm_map_find(kernel_map, NULL, 0, &addr, size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (result != KERN_SUCCESS || addr != translations[i].om_start) panic("pmap_init: vm_map_find"); } } /* * Extract the physical page address associated with the given * map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pm, vm_offset_t va) { struct tte *tp; vm_paddr_t pa; if (pm == kernel_pmap) return (pmap_kextract(va)); PMAP_LOCK(pm); tp = tsb_tte_lookup(pm, va); if (tp == NULL) pa = 0; else pa = TTE_GET_PA(tp) | (va & TTE_GET_PAGE_MASK(tp)); PMAP_UNLOCK(pm); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t pmap_extract_and_hold(pmap_t pm, vm_offset_t va, vm_prot_t prot) { struct tte *tp; vm_page_t m; m = NULL; vm_page_lock_queues(); if (pm == kernel_pmap) { if (va >= VM_MIN_DIRECT_ADDRESS) { tp = NULL; m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS(va)); vm_page_hold(m); } else { tp = tsb_kvtotte(va); if ((tp->tte_data & TD_V) == 0) tp = NULL; } } else { PMAP_LOCK(pm); tp = tsb_tte_lookup(pm, va); } if (tp != NULL && ((tp->tte_data & TD_SW) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); vm_page_hold(m); } vm_page_unlock_queues(); if (pm != kernel_pmap) PMAP_UNLOCK(pm); return (m); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { struct tte *tp; if (va >= VM_MIN_DIRECT_ADDRESS) return (TLB_DIRECT_TO_PHYS(va)); tp = tsb_kvtotte(va); if ((tp->tte_data & TD_V) == 0) return (0); return (TTE_GET_PA(tp) | (va & TTE_GET_PAGE_MASK(tp))); } int pmap_cache_enter(vm_page_t m, vm_offset_t va) { struct tte *tp; int color; mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_cache_enter: fake page")); PMAP_STATS_INC(pmap_ncache_enter); /* * Find the color for this virtual address and note the added mapping. */ color = DCACHE_COLOR(va); m->md.colors[color]++; /* * If all existing mappings have the same color, the mapping is * cacheable. */ if (m->md.color == color) { KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] == 0, ("pmap_cache_enter: cacheable, mappings of other color")); if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_enter_c); else PMAP_STATS_INC(pmap_ncache_enter_oc); return (1); } /* * If there are no mappings of the other color, and the page still has * the wrong color, this must be a new mapping. Change the color to * match the new mapping, which is cacheable. We must flush the page * from the cache now. */ if (m->md.colors[DCACHE_OTHER_COLOR(color)] == 0) { KASSERT(m->md.colors[color] == 1, ("pmap_cache_enter: changing color, not new mapping")); dcache_page_inval(VM_PAGE_TO_PHYS(m)); m->md.color = color; if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_enter_cc); else PMAP_STATS_INC(pmap_ncache_enter_coc); return (1); } /* * If the mapping is already non-cacheable, just return. 
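 * A color of -1 marks a page whose mappings have already been made uncacheable.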
*/ if (m->md.color == -1) { PMAP_STATS_INC(pmap_ncache_enter_nc); return (0); } PMAP_STATS_INC(pmap_ncache_enter_cnc); /* * Mark all mappings as uncacheable, flush any lines with the other * color out of the dcache, and set the color to none (-1). */ TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { atomic_clear_long(&tp->tte_data, TD_CV); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } dcache_page_inval(VM_PAGE_TO_PHYS(m)); m->md.color = -1; return (0); } void pmap_cache_remove(vm_page_t m, vm_offset_t va) { struct tte *tp; int color; mtx_assert(&vm_page_queue_mtx, MA_OWNED); CTR3(KTR_PMAP, "pmap_cache_remove: m=%p va=%#lx c=%d", m, va, m->md.colors[DCACHE_COLOR(va)]); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_cache_remove: fake page")); KASSERT(m->md.colors[DCACHE_COLOR(va)] > 0, ("pmap_cache_remove: no mappings %d <= 0", m->md.colors[DCACHE_COLOR(va)])); PMAP_STATS_INC(pmap_ncache_remove); /* * Find the color for this virtual address and note the removal of * the mapping. */ color = DCACHE_COLOR(va); m->md.colors[color]--; /* * If the page is cacheable, just return and keep the same color, even * if there are no longer any mappings. */ if (m->md.color != -1) { if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_remove_c); else PMAP_STATS_INC(pmap_ncache_remove_oc); return; } KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] != 0, ("pmap_cache_remove: uncacheable, no mappings of other color")); /* * If the page is not cacheable (color is -1), and the number of * mappings for this color is not zero, just return. There are * mappings of the other color still, so remain non-cacheable. */ if (m->md.colors[color] != 0) { PMAP_STATS_INC(pmap_ncache_remove_nc); return; } /* * The number of mappings for this color is now zero. Recache the * other colored mappings, and change the page color to the other * color. There should be no lines in the data cache for this page, * so flushing should not be needed. */ TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { atomic_set_long(&tp->tte_data, TD_CV); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } m->md.color = DCACHE_OTHER_COLOR(color); if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m))) PMAP_STATS_INC(pmap_ncache_remove_cc); else PMAP_STATS_INC(pmap_ncache_remove_coc); } /* * Map a wired page into kernel virtual address space. */ void pmap_kenter(vm_offset_t va, vm_page_t m) { vm_offset_t ova; struct tte *tp; vm_page_t om; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_STATS_INC(pmap_nkenter); tp = tsb_kvtotte(va); CTR4(KTR_PMAP, "pmap_kenter: va=%#lx pa=%#lx tp=%p data=%#lx", va, VM_PAGE_TO_PHYS(m), tp, tp->tte_data); if (m->pc != DCACHE_COLOR(va)) { CTR6(KTR_CT2, "pmap_kenter: off colour va=%#lx pa=%#lx o=%p oc=%#lx ot=%d pi=%#lx", va, VM_PAGE_TO_PHYS(m), m->object, m->object ? m->object->pg_color : -1, m->object ? m->object->type : -1, m->pindex); PMAP_STATS_INC(pmap_nkenter_oc); } if ((tp->tte_data & TD_V) != 0) { om = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); ova = TTE_GET_VA(tp); if (m == om && va == ova) { PMAP_STATS_INC(pmap_nkenter_stupid); return; } TAILQ_REMOVE(&om->md.tte_list, tp, tte_link); pmap_cache_remove(om, ova); if (va != ova) tlb_page_demap(kernel_pmap, ova); } data = TD_V | TD_8K | VM_PAGE_TO_PHYS(m) | TD_REF | TD_SW | TD_CP | TD_P | TD_W; if (pmap_cache_enter(m, va) != 0) data |= TD_CV; tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = data; TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link); } /* * Map a wired page into kernel virtual address space. 
This additionally * takes a flag argument wich is or'ed to the TTE data. This is used by * bus_space_map(). * NOTE: if the mapping is non-cacheable, it's the caller's responsibility * to flush entries that might still be in the cache, if applicable. */ void pmap_kenter_flags(vm_offset_t va, vm_paddr_t pa, u_long flags) { struct tte *tp; tp = tsb_kvtotte(va); CTR4(KTR_PMAP, "pmap_kenter_flags: va=%#lx pa=%#lx tp=%p data=%#lx", va, pa, tp, tp->tte_data); tp->tte_vpn = TV_VPN(va, TS_8K); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_P | flags; } /* * Remove a wired page from kernel virtual address space. */ void pmap_kremove(vm_offset_t va) { struct tte *tp; vm_page_t m; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_STATS_INC(pmap_nkremove); tp = tsb_kvtotte(va); CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tp, tp->tte_data); if ((tp->tte_data & TD_V) == 0) return; m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); pmap_cache_remove(m, va); TTE_ZERO(tp); } /* * Inverse of pmap_kenter_flags, used by bus_space_unmap(). */ void pmap_kremove_flags(vm_offset_t va) { struct tte *tp; tp = tsb_kvtotte(va); CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tp, tp->tte_data); TTE_ZERO(tp); } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return (TLB_PHYS_TO_DIRECT(start)); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; int locked; PMAP_STATS_INC(pmap_nqenter); va = sva; if (!(locked = mtx_owned(&vm_page_queue_mtx))) vm_page_lock_queues(); while (count-- > 0) { pmap_kenter(va, *m); va += PAGE_SIZE; m++; } if (!locked) vm_page_unlock_queues(); tlb_range_demap(kernel_pmap, sva, va); } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by pmap_qenter. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; int locked; PMAP_STATS_INC(pmap_nqremove); va = sva; if (!(locked = mtx_owned(&vm_page_queue_mtx))) vm_page_lock_queues(); while (count-- > 0) { pmap_kremove(va); va += PAGE_SIZE; } if (!locked) vm_page_unlock_queues(); tlb_range_demap(kernel_pmap, sva, va); } /* * Initialize the pmap associated with process 0. */ void pmap_pinit0(pmap_t pm) { int i; PMAP_LOCK_INIT(pm); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = 0; pm->pm_active = 0; pm->pm_tsb = NULL; pm->pm_tsb_obj = NULL; bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Initialize a preallocated and zeroed pmap structure, such as one in a * vmspace structure. */ void pmap_pinit(pmap_t pm) { vm_page_t ma[TSB_PAGES]; vm_page_t m; int i; PMAP_LOCK_INIT(pm); /* * Allocate kva space for the tsb. */ if (pm->pm_tsb == NULL) { pm->pm_tsb = (struct tte *)kmem_alloc_nofault(kernel_map, TSB_BSIZE); } /* * Allocate an object for it. 
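 * The TSB pages are grabbed from this object wired and pre-zeroed, then mapped into the kva reserved above with pmap_qenter().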
*/ if (pm->pm_tsb_obj == NULL) pm->pm_tsb_obj = vm_object_allocate(OBJT_DEFAULT, TSB_PAGES); VM_OBJECT_LOCK(pm->pm_tsb_obj); for (i = 0; i < TSB_PAGES; i++) { m = vm_page_grab(pm->pm_tsb_obj, i, VM_ALLOC_NOBUSY | VM_ALLOC_RETRY | VM_ALLOC_WIRED | VM_ALLOC_ZERO); m->valid = VM_PAGE_BITS_ALL; m->md.pmap = pm; ma[i] = m; } VM_OBJECT_UNLOCK(pm->pm_tsb_obj); pmap_qenter((vm_offset_t)pm->pm_tsb, ma, TSB_PAGES); for (i = 0; i < MAXCPU; i++) pm->pm_context[i] = -1; pm->pm_active = 0; bzero(&pm->pm_stats, sizeof(pm->pm_stats)); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap_t pm) { vm_object_t obj; vm_page_t m; struct pcpu *pc; CTR2(KTR_PMAP, "pmap_release: ctx=%#x tsb=%p", pm->pm_context[PCPU_GET(cpuid)], pm->pm_tsb); KASSERT(pmap_resident_count(pm) == 0, ("pmap_release: resident pages %ld != 0", pmap_resident_count(pm))); /* * After the pmap was freed, it might be reallocated to a new process. * When switching, this might lead us to wrongly assume that we need * not switch contexts because old and new pmap pointer are equal. * Therefore, make sure that this pmap is not referenced by any PCPU * pointer any more. This could happen in two cases: * - A process that referenced the pmap is currently exiting on a CPU. * However, it is guaranteed to not switch in any more after setting * its state to PRS_ZOMBIE. * - A process that referenced this pmap ran on a CPU, but we switched * to a kernel thread, leaving the pmap pointer unchanged. */ mtx_lock_spin(&sched_lock); SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { if (pc->pc_pmap == pm) pc->pc_pmap = NULL; } mtx_unlock_spin(&sched_lock); obj = pm->pm_tsb_obj; VM_OBJECT_LOCK(obj); KASSERT(obj->ref_count == 1, ("pmap_release: tsbobj ref count != 1")); while (!TAILQ_EMPTY(&obj->memq)) { m = TAILQ_FIRST(&obj->memq); vm_page_lock_queues(); if (vm_page_sleep_if_busy(m, FALSE, "pmaprl")) continue; KASSERT(m->hold_count == 0, ("pmap_release: freeing held tsb page")); m->md.pmap = NULL; m->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); vm_page_free_zero(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(obj); pmap_qremove((vm_offset_t)pm->pm_tsb, TSB_PAGES); PMAP_LOCK_DESTROY(pm); } /* * Grow the number of kernel page table entries. Unneeded. */ void pmap_growkernel(vm_offset_t addr) { panic("pmap_growkernel: can't grow kernel"); } int pmap_remove_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp, vm_offset_t va) { vm_page_t m; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); data = atomic_readandclear_long(&tp->tte_data); if ((data & TD_FAKE) == 0) { m = PHYS_TO_VM_PAGE(TD_PA(data)); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); if ((data & TD_WIRED) != 0) pm->pm_stats.wired_count--; if ((data & TD_PV) != 0) { if ((data & TD_W) != 0) vm_page_dirty(m); if ((data & TD_REF) != 0) vm_page_flag_set(m, PG_REFERENCED); if (TAILQ_EMPTY(&m->md.tte_list)) vm_page_flag_clear(m, PG_WRITEABLE); pm->pm_stats.resident_count--; } pmap_cache_remove(m, va); } TTE_ZERO(tp); if (PMAP_REMOVE_DONE(pm)) return (0); return (1); } /* * Remove the given range of addresses from the specified map. 
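 * Ranges larger than PMAP_TSB_THRESH are handled by walking the TSB with tsb_foreach() and demapping the whole context; smaller ranges look up each page individually and finish with a ranged demap.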
*/ void pmap_remove(pmap_t pm, vm_offset_t start, vm_offset_t end) { struct tte *tp; vm_offset_t va; CTR3(KTR_PMAP, "pmap_remove: ctx=%#lx start=%#lx end=%#lx", pm->pm_context[PCPU_GET(cpuid)], start, end); if (PMAP_REMOVE_DONE(pm)) return; vm_page_lock_queues(); PMAP_LOCK(pm); if (end - start > PMAP_TSB_THRESH) { tsb_foreach(pm, NULL, start, end, pmap_remove_tte); tlb_context_demap(pm); } else { for (va = start; va < end; va += PAGE_SIZE) { if ((tp = tsb_tte_lookup(pm, va)) != NULL) { if (!pmap_remove_tte(pm, NULL, tp, va)) break; } } tlb_range_demap(pm, start, end - 1); } PMAP_UNLOCK(pm); vm_page_unlock_queues(); } void pmap_remove_all(vm_page_t m) { struct pmap *pm; struct tte *tpn; struct tte *tp; vm_offset_t va; mtx_assert(&vm_page_queue_mtx, MA_OWNED); for (tp = TAILQ_FIRST(&m->md.tte_list); tp != NULL; tp = tpn) { tpn = TAILQ_NEXT(tp, tte_link); if ((tp->tte_data & TD_PV) == 0) continue; pm = TTE_GET_PMAP(tp); va = TTE_GET_VA(tp); PMAP_LOCK(pm); if ((tp->tte_data & TD_WIRED) != 0) pm->pm_stats.wired_count--; if ((tp->tte_data & TD_REF) != 0) vm_page_flag_set(m, PG_REFERENCED); if ((tp->tte_data & TD_W) != 0) vm_page_dirty(m); tp->tte_data &= ~TD_V; tlb_page_demap(pm, va); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); pm->pm_stats.resident_count--; pmap_cache_remove(m, va); TTE_ZERO(tp); PMAP_UNLOCK(pm); } vm_page_flag_clear(m, PG_WRITEABLE); } int pmap_protect_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp, vm_offset_t va) { u_long data; vm_page_t m; data = atomic_clear_long(&tp->tte_data, TD_REF | TD_SW | TD_W); if ((data & TD_PV) != 0) { m = PHYS_TO_VM_PAGE(TD_PA(data)); if ((data & TD_REF) != 0) vm_page_flag_set(m, PG_REFERENCED); if ((data & TD_W) != 0) vm_page_dirty(m); } return (1); } /* * Set the physical protection on the specified range of this map as requested. */ void pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { vm_offset_t va; struct tte *tp; CTR4(KTR_PMAP, "pmap_protect: ctx=%#lx sva=%#lx eva=%#lx prot=%#lx", pm->pm_context[PCPU_GET(cpuid)], sva, eva, prot); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pm, sva, eva); return; } if (prot & VM_PROT_WRITE) return; vm_page_lock_queues(); PMAP_LOCK(pm); if (eva - sva > PMAP_TSB_THRESH) { tsb_foreach(pm, NULL, sva, eva, pmap_protect_tte); tlb_context_demap(pm); } else { for (va = sva; va < eva; va += PAGE_SIZE) { if ((tp = tsb_tte_lookup(pm, va)) != NULL) pmap_protect_tte(pm, NULL, tp, va); } tlb_range_demap(pm, sva, eva - 1); } PMAP_UNLOCK(pm); vm_page_unlock_queues(); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ void pmap_enter(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_page_lock_queues(); PMAP_LOCK(pm); pmap_enter_locked(pm, va, m, prot, wired); vm_page_unlock_queues(); PMAP_UNLOCK(pm); } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. * * The page queues and pmap must be locked. */ static void pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { struct tte *tp; vm_paddr_t pa; u_long data; int i; mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pm, MA_OWNED); PMAP_STATS_INC(pmap_nenter); pa = VM_PAGE_TO_PHYS(m); /* * If this is a fake page from the device_pager, but it covers actual * physical memory, convert to the real backing page. 
*/ if ((m->flags & PG_FICTITIOUS) != 0) { for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (pa >= phys_avail[i] && pa <= phys_avail[i + 1]) { m = PHYS_TO_VM_PAGE(pa); break; } } } CTR6(KTR_PMAP, "pmap_enter: ctx=%p m=%p va=%#lx pa=%#lx prot=%#x wired=%d", pm->pm_context[PCPU_GET(cpuid)], m, va, pa, prot, wired); /* * If there is an existing mapping, and the physical address has not * changed, must be protection or wiring change. */ if ((tp = tsb_tte_lookup(pm, va)) != NULL && TTE_GET_PA(tp) == pa) { CTR0(KTR_PMAP, "pmap_enter: update"); PMAP_STATS_INC(pmap_nenter_update); /* * Wiring change, just update stats. */ if (wired) { if ((tp->tte_data & TD_WIRED) == 0) { tp->tte_data |= TD_WIRED; pm->pm_stats.wired_count++; } } else { if ((tp->tte_data & TD_WIRED) != 0) { tp->tte_data &= ~TD_WIRED; pm->pm_stats.wired_count--; } } /* * Save the old bits and clear the ones we're interested in. */ data = tp->tte_data; tp->tte_data &= ~(TD_EXEC | TD_SW | TD_W); /* * If we're turning off write permissions, sense modify status. */ if ((prot & VM_PROT_WRITE) != 0) { tp->tte_data |= TD_SW; if (wired) { tp->tte_data |= TD_W; } vm_page_flag_set(m, PG_WRITEABLE); } else if ((data & TD_W) != 0) { vm_page_dirty(m); } /* * If we're turning on execute permissions, flush the icache. */ if ((prot & VM_PROT_EXECUTE) != 0) { if ((data & TD_EXEC) == 0) { icache_page_inval(pa); } tp->tte_data |= TD_EXEC; } /* * Delete the old mapping. */ tlb_page_demap(pm, TTE_GET_VA(tp)); } else { /* * If there is an existing mapping, but its for a different * phsyical address, delete the old mapping. */ if (tp != NULL) { CTR0(KTR_PMAP, "pmap_enter: replace"); PMAP_STATS_INC(pmap_nenter_replace); pmap_remove_tte(pm, NULL, tp, va); tlb_page_demap(pm, va); } else { CTR0(KTR_PMAP, "pmap_enter: new"); PMAP_STATS_INC(pmap_nenter_new); } /* * Now set up the data and install the new mapping. */ data = TD_V | TD_8K | TD_PA(pa); if (pm == kernel_pmap) data |= TD_P; if ((prot & VM_PROT_WRITE) != 0) { data |= TD_SW; vm_page_flag_set(m, PG_WRITEABLE); } if (prot & VM_PROT_EXECUTE) { data |= TD_EXEC; icache_page_inval(pa); } /* * If its wired update stats. We also don't need reference or * modify tracking for wired mappings, so set the bits now. */ if (wired) { pm->pm_stats.wired_count++; data |= TD_REF | TD_WIRED; if ((prot & VM_PROT_WRITE) != 0) data |= TD_W; } tsb_tte_enter(pm, m, va, TS_8K, data); } } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. 
*/ void pmap_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; psize = atop(end - start); m = m_start; PMAP_LOCK(pm); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_locked(pm, start + ptoa(diff), m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pm); } void pmap_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pm); pmap_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE), FALSE); PMAP_UNLOCK(pm); } void pmap_object_init_pt(pmap_t pm, vm_offset_t addr, vm_object_t object, vm_pindex_t pindex, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_DEVICE, ("pmap_object_init_pt: non-device object")); } /* * Change the wiring attribute for a map/virtual-address pair. * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pm, vm_offset_t va, boolean_t wired) { struct tte *tp; u_long data; PMAP_LOCK(pm); if ((tp = tsb_tte_lookup(pm, va)) != NULL) { if (wired) { data = atomic_set_long(&tp->tte_data, TD_WIRED); if ((data & TD_WIRED) == 0) pm->pm_stats.wired_count++; } else { data = atomic_clear_long(&tp->tte_data, TD_WIRED); if ((data & TD_WIRED) != 0) pm->pm_stats.wired_count--; } } PMAP_UNLOCK(pm); } static int pmap_copy_tte(pmap_t src_pmap, pmap_t dst_pmap, struct tte *tp, vm_offset_t va) { vm_page_t m; u_long data; if ((tp->tte_data & TD_FAKE) != 0) return (1); if (tsb_tte_lookup(dst_pmap, va) == NULL) { data = tp->tte_data & ~(TD_PV | TD_REF | TD_SW | TD_CV | TD_W); m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp)); tsb_tte_enter(dst_pmap, m, va, TS_8K, data); } return (1); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { struct tte *tp; vm_offset_t va; if (dst_addr != src_addr) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } if (len > PMAP_TSB_THRESH) { tsb_foreach(src_pmap, dst_pmap, src_addr, src_addr + len, pmap_copy_tte); tlb_context_demap(dst_pmap); } else { for (va = src_addr; va < src_addr + len; va += PAGE_SIZE) { if ((tp = tsb_tte_lookup(src_pmap, va)) != NULL) pmap_copy_tte(src_pmap, dst_pmap, tp, va); } tlb_range_demap(dst_pmap, src_addr, src_addr + len - 1); } vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } void pmap_zero_page(vm_page_t m) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page: fake page")); PMAP_STATS_INC(pmap_nzero_page); pa = VM_PAGE_TO_PHYS(m); if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_nc); aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE); } else if (m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_c); va = TLB_PHYS_TO_DIRECT(pa); cpu_block_zero((void *)va, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_nzero_page_oc); PMAP_LOCK(kernel_pmap); va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); cpu_block_zero((void *)va, PAGE_SIZE); tlb_page_demap(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } void pmap_zero_page_area(vm_page_t m, int off, int size) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page_area: fake page")); KASSERT(off + size <= PAGE_SIZE, ("pmap_zero_page_area: bad off/size")); 
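/* Same three cases as pmap_zero_page(): uncacheable pages are zeroed through the physical ASI, pages whose color matches the direct map are zeroed via the direct mapping, and off-color pages go through a temporary mapping of the right color. */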
PMAP_STATS_INC(pmap_nzero_page_area); pa = VM_PAGE_TO_PHYS(m); if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_area_nc); aszero(ASI_PHYS_USE_EC, pa + off, size); } else if (m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_area_c); va = TLB_PHYS_TO_DIRECT(pa); bzero((void *)(va + off), size); } else { PMAP_STATS_INC(pmap_nzero_page_area_oc); PMAP_LOCK(kernel_pmap); va = pmap_temp_map_1 + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); bzero((void *)(va + off), size); tlb_page_demap(kernel_pmap, va); PMAP_UNLOCK(kernel_pmap); } } void pmap_zero_page_idle(vm_page_t m) { struct tte *tp; vm_offset_t va; vm_paddr_t pa; KASSERT((m->flags & PG_FICTITIOUS) == 0, ("pmap_zero_page_idle: fake page")); PMAP_STATS_INC(pmap_nzero_page_idle); pa = VM_PAGE_TO_PHYS(m); if (m->md.color == -1) { PMAP_STATS_INC(pmap_nzero_page_idle_nc); aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE); } else if (m->md.color == DCACHE_COLOR(pa)) { PMAP_STATS_INC(pmap_nzero_page_idle_c); va = TLB_PHYS_TO_DIRECT(pa); cpu_block_zero((void *)va, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_nzero_page_idle_oc); va = pmap_idle_map + (m->md.color * PAGE_SIZE); tp = tsb_kvtotte(va); tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(va, TS_8K); cpu_block_zero((void *)va, PAGE_SIZE); tlb_page_demap(kernel_pmap, va); } } void pmap_copy_page(vm_page_t msrc, vm_page_t mdst) { vm_offset_t vdst; vm_offset_t vsrc; vm_paddr_t pdst; vm_paddr_t psrc; struct tte *tp; KASSERT((mdst->flags & PG_FICTITIOUS) == 0, ("pmap_copy_page: fake dst page")); KASSERT((msrc->flags & PG_FICTITIOUS) == 0, ("pmap_copy_page: fake src page")); PMAP_STATS_INC(pmap_ncopy_page); pdst = VM_PAGE_TO_PHYS(mdst); psrc = VM_PAGE_TO_PHYS(msrc); if (msrc->md.color == -1 && mdst->md.color == -1) { PMAP_STATS_INC(pmap_ncopy_page_nc); ascopy(ASI_PHYS_USE_EC, psrc, pdst, PAGE_SIZE); } else if (msrc->md.color == DCACHE_COLOR(psrc) && mdst->md.color == DCACHE_COLOR(pdst)) { PMAP_STATS_INC(pmap_ncopy_page_c); vdst = TLB_PHYS_TO_DIRECT(pdst); vsrc = TLB_PHYS_TO_DIRECT(psrc); cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); } else if (msrc->md.color == -1) { if (mdst->md.color == DCACHE_COLOR(pdst)) { PMAP_STATS_INC(pmap_ncopy_page_dc); vdst = TLB_PHYS_TO_DIRECT(pdst); ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_ncopy_page_doc); PMAP_LOCK(kernel_pmap); vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE); tp = tsb_kvtotte(vdst); tp->tte_data = TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vdst); PMAP_UNLOCK(kernel_pmap); } } else if (mdst->md.color == -1) { if (msrc->md.color == DCACHE_COLOR(psrc)) { PMAP_STATS_INC(pmap_ncopy_page_sc); vsrc = TLB_PHYS_TO_DIRECT(psrc); ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, PAGE_SIZE); } else { PMAP_STATS_INC(pmap_ncopy_page_soc); PMAP_LOCK(kernel_pmap); vsrc = pmap_temp_map_1 + (msrc->md.color * PAGE_SIZE); tp = tsb_kvtotte(vsrc); tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } else { PMAP_STATS_INC(pmap_ncopy_page_oc); PMAP_LOCK(kernel_pmap); vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE); tp = tsb_kvtotte(vdst); tp->tte_data = TD_V | TD_8K | 
TD_PA(pdst) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vdst, TS_8K); vsrc = pmap_temp_map_2 + (msrc->md.color * PAGE_SIZE); tp = tsb_kvtotte(vsrc); tp->tte_data = TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W; tp->tte_vpn = TV_VPN(vsrc, TS_8K); cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE); tlb_page_demap(kernel_pmap, vdst); tlb_page_demap(kernel_pmap, vsrc); PMAP_UNLOCK(kernel_pmap); } } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pm, vm_page_t m) { struct tte *tp; int loops; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); loops = 0; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if (TTE_GET_PMAP(tp) == pm) return (TRUE); if (++loops >= 16) break; } return (FALSE); } /* * Remove all pages from the specified address space; this aids process exit * speed. This is much faster than pmap_remove in the case of running down * an entire address space. Only works for the current pmap. */ void pmap_remove_pages(pmap_t pm) { } /* * Returns TRUE if the given page has a managed mapping. */ boolean_t pmap_page_is_mapped(vm_page_t m) { struct tte *tp; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) != 0) return (TRUE); } return (FALSE); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. */ int pmap_ts_referenced(vm_page_t m) { struct tte *tpf; struct tte *tpn; struct tte *tp; u_long data; int count; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (0); count = 0; if ((tp = TAILQ_FIRST(&m->md.tte_list)) != NULL) { tpf = tp; do { tpn = TAILQ_NEXT(tp, tte_link); TAILQ_REMOVE(&m->md.tte_list, tp, tte_link); TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link); if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_REF); if ((data & TD_REF) != 0 && ++count > 4) break; } while ((tp = tpn) != NULL && tp != tpf); } return (count); } boolean_t pmap_is_modified(vm_page_t m) { struct tte *tp; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return (FALSE); TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; if ((tp->tte_data & TD_W) != 0) return (TRUE); } return (FALSE); } /* * pmap_is_prefaultable: * * Return whether or not the specified virtual address is eligible * for prefault.
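 * sparc64 does not implement prefaulting, so this always returns FALSE.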
*/ boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr) { return (FALSE); } void pmap_clear_modify(vm_page_t m) { struct tte *tp; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_W); if ((data & TD_W) != 0) tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } void pmap_clear_reference(vm_page_t m) { struct tte *tp; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0) return; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_REF); if ((data & TD_REF) != 0) tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } void pmap_remove_write(vm_page_t m) { struct tte *tp; u_long data; mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) != 0 || (m->flags & PG_WRITEABLE) == 0) return; TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) { if ((tp->tte_data & TD_PV) == 0) continue; data = atomic_clear_long(&tp->tte_data, TD_SW | TD_W); if ((data & TD_W) != 0) { vm_page_dirty(m); tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp)); } } vm_page_flag_clear(m, PG_WRITEABLE); } int pmap_mincore(pmap_t pm, vm_offset_t addr) { /* TODO; */ return (0); } /* * Activate a user pmap. The pmap must be activated before its address space * can be accessed in any way. */ void pmap_activate(struct thread *td) { struct vmspace *vm; struct pmap *pm; int context; vm = td->td_proc->p_vmspace; pm = vmspace_pmap(vm); mtx_lock_spin(&sched_lock); context = PCPU_GET(tlb_ctx); if (context == PCPU_GET(tlb_ctx_max)) { tlb_flush_user(); context = PCPU_GET(tlb_ctx_min); } PCPU_SET(tlb_ctx, context + 1); pm->pm_context[PCPU_GET(cpuid)] = context; pm->pm_active |= PCPU_GET(cpumask); PCPU_SET(pmap, pm); stxa(AA_DMMU_TSB, ASI_DMMU, pm->pm_tsb); stxa(AA_IMMU_TSB, ASI_IMMU, pm->pm_tsb); stxa(AA_DMMU_PCXR, ASI_DMMU, context); membar(Sync); mtx_unlock_spin(&sched_lock); } vm_offset_t pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size) { return (va); } Index: head/sys/sun4v/sun4v/machdep.c =================================================================== --- head/sys/sun4v/sun4v/machdep.c (revision 169666) +++ head/sys/sun4v/sun4v/machdep.c (revision 169667) @@ -1,949 +1,949 @@ /*- * Copyright (c) 2001 Jake Burkholder. * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_ddb.h" #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX move this to a header */ extern void mdesc_init(void); typedef int ofw_vec_t(void *); #ifdef DDB extern vm_offset_t ksym_start, ksym_end; #endif struct tlb_entry *kernel_tlbs; int kernel_tlb_slots; int cold = 1; long Maxmem; long realmem; char pcpu0[PCPU_PAGES * PAGE_SIZE]; struct trapframe frame0; vm_offset_t kstack0; vm_paddr_t kstack0_phys; struct kva_md_info kmi; u_long ofw_vec; u_long ofw_tba; /* * Note: timer quality for CPU's is set low to try and prevent them from * being chosen as the primary timecounter. The CPU counters are not * synchronized among the CPU's so in MP machines this causes problems * when calculating the time. With this value the CPU's should only be * chosen as the primary timecounter as a last resort. 
*/ #define UP_TICK_QUALITY 1000 #ifdef SUN4V #define MP_TICK_QUALITY 1000 #else #define MP_TICK_QUALITY -100 #endif static struct timecounter tick_tc; char sparc64_model[32]; cpu_block_copy_t *cpu_block_copy; cpu_block_zero_t *cpu_block_zero; static timecounter_get_t tick_get_timecount; void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec); void sparc64_shutdown_final(void *dummy, int howto); static void cpu_startup(void *); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL); CTASSERT((1 << INT_SHIFT) == sizeof(int)); CTASSERT((1 << PTR_SHIFT) == sizeof(char *)); CTASSERT(sizeof(struct reg) == 256); CTASSERT(sizeof(struct fpreg) == 272); CTASSERT(sizeof(struct __mcontext) == 512); CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0); CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0); CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8)); CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2)); CTASSERT((sizeof(struct pcpu) & ((1<<6)-1)) == 0); static void cpu_startup(void *arg) { vm_paddr_t physsz; int i; tick_tc.tc_get_timecount = tick_get_timecount; tick_tc.tc_poll_pps = NULL; tick_tc.tc_counter_mask = ~0u; tick_tc.tc_frequency = tick_freq; tick_tc.tc_name = "tick"; tick_tc.tc_quality = UP_TICK_QUALITY; #ifdef SMP /* * We do not know if each CPU's tick counter is synchronized. */ if (cpu_mp_probe()) tick_tc.tc_quality = MP_TICK_QUALITY; #endif tc_init(&tick_tc); physsz = 0; for (i = 0; i < sparc64_nmemreg; i++) physsz += sparc64_memreg[i].mr_size; printf("real memory = %lu (%lu MB)\n", physsz, physsz / (1024 * 1024)); realmem = (long)physsz; vm_ksubmap_init(&kmi); bufinit(); vm_pager_bufferinit(); EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL, SHUTDOWN_PRI_LAST); - printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE, - cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE)); + printf("avail memory = %lu (%lu MB)\n", VMCNT_GET(free_count) * + PAGE_SIZE, VMCNT_GET(free_count) / ((1024 * 1024) / PAGE_SIZE)); if (bootverbose) printf("machine: %s\n", sparc64_model); #ifdef notyet cpu_identify(rdpr(ver), tick_freq, PCPU_GET(cpuid)); #endif } void cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size) { struct intr_request *ir; int i; pcpu->pc_irtail = &pcpu->pc_irhead; for (i = 0; i < IR_FREE; i++) { ir = &pcpu->pc_irpool[i]; ir->ir_next = pcpu->pc_irfree; pcpu->pc_irfree = ir; } } void spinlock_enter(void) { struct thread *td; register_t pil; td = curthread; if (td->td_md.md_spinlock_count == 0) { pil = intr_disable(); td->td_md.md_saved_pil = pil; } td->td_md.md_spinlock_count++; critical_enter(); } void spinlock_exit(void) { struct thread *td; td = curthread; critical_exit(); td->td_md.md_spinlock_count--; if (td->td_md.md_spinlock_count == 0) { intr_restore(td->td_md.md_saved_pil); } } unsigned tick_get_timecount(struct timecounter *tc) { return ((unsigned)rd(tick)); } void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec) { phandle_t child; phandle_t root; struct pcpu *pc; vm_offset_t end; caddr_t kmdp; u_int clock; char *env; char type[8]; end = 0; kmdp = NULL; /* * Initialize Open Firmware (needed for console). */ OF_init(vec); /* * Parse metadata if present and fetch parameters. Must be before the * console is inited so cninit gets the right value of boothowto. 
*/ if (mdp != NULL) { preload_metadata = mdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp != NULL) { boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t); kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS, int); kernel_tlbs = (void *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_DTLB); } } init_param1(); root = OF_peer(0); for (child = OF_child(root); child != 0; child = OF_peer(child)) { OF_getprop(child, "device_type", type, sizeof(type)); if (strcmp(type, "cpu") == 0) break; } OF_getprop(child, "clock-frequency", &clock, sizeof(clock)); /* * Initialize the console before printing anything. * console uses the pcpu area for serialization */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; cpu_setregs(pc); /* * Initialize proc0 stuff (p_contested needs to be done early). */ proc_linkup(&proc0, &thread0); proc0.p_md.md_sigtramp = NULL; proc0.p_md.md_utrap = NULL; frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV; thread0.td_frame = &frame0; if ((u_long)thread0.td_frame & 0x3f) { panic("unaligned frame0"); } /* * Prime our per-cpu data page for use. Note, we are using it for our * stack, so don't pass the real size (PAGE_SIZE) to pcpu_init or * it'll zero it out from under us. */ pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1; pcpu_init(pc, 0, sizeof(struct pcpu)); pc->pc_curthread = &thread0; pc->pc_addr = (vm_offset_t)pcpu0; cninit(); tick_init(clock); printf("cpu0: UltraSparc T1 Processor (%d.%02d MHz CPU)\n", (clock + 4999) / 1000000, ((clock + 4999) / 10000) % 100); /* * Panic is there is no metadata. Most likely the kernel was booted * directly, instead of through loader(8). */ if (mdp == NULL || kmdp == NULL) { printf("sparc64_init: no loader metadata.\n" "This probably means you are not using loader(8).\n"); panic("sparc64_init"); } /* * Sanity check the kernel end, which is important. */ if (end == 0) { printf("sparc64_init: warning, kernel end not specified.\n" "Attempting to continue anyway.\n"); end = (vm_offset_t)_end; } cpu_block_copy = bcopy; cpu_block_zero = bzero; #ifdef SMP mp_tramp = mp_tramp_alloc(); #endif env = getenv("kernelname"); if (env != NULL) { strlcpy(kernelname, env, sizeof(kernelname)); freeenv(env); } /* * Initialize global registers. * needed for curthread to work */ cpu_setregs(pc); /* * Initialize virtual memory and calculate physmem. */ pmap_bootstrap(end); thread0.td_kstack = kstack0; thread0.td_md.md_saved_pil = 0; thread0.td_pcb = (struct pcb *) (thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1; thread0.td_pcb->pcb_kstack = (uint64_t)(((char *)thread0.td_pcb) - (CCFSZ + SPOFF)); thread0.td_pcb = (struct pcb *)TLB_PHYS_TO_DIRECT(vtophys((vm_offset_t)thread0.td_pcb)); pc->pc_curpcb = thread0.td_pcb; if (((thread0.td_pcb->pcb_kstack + SPOFF) & 0x3f) != 0) { printf("unaligned stack pcb_kstack & 0x3f == 0x%lx\n", ((thread0.td_pcb->pcb_kstack + SPOFF) & 0x3f)); } /* * Update PCPU_REG to point to direct address * to support easy phys <-> virt translation in trap handler */ pc = (struct pcpu *)TLB_PHYS_TO_DIRECT(vtophys(pc)); cpu_setregs(pc); /* * Initialize tunables. */ init_param2(physmem); /* * setup trap table and fault status area */ trap_init(); /* * Initialize the message buffer (after setting trap table). 
*/ msgbufinit(msgbufp, MSGBUF_SIZE); mutex_init(); mdesc_init(); OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1); kdb_init(); #ifdef KDB if (boothowto & RB_KDB) kdb_enter("Boot flags requested debugger"); #endif } void set_openfirm_callback(ofw_vec_t *vec) { ofw_tba = rdpr(tba); ofw_vec = (u_long)vec; } void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) { struct trapframe *tf; struct sigframe *sfp; struct sigacts *psp; struct sigframe sf; struct thread *td; struct frame *fp; struct proc *p; int oonstack; u_long sp; int sig; int code; oonstack = 0; td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sig = ksi->ksi_signo; code = ksi->ksi_code; psp = p->p_sigacts; mtx_assert(&psp->ps_mtx, MA_OWNED); tf = td->td_frame; sp = tf->tf_sp + SPOFF; oonstack = sigonstack(sp); CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm, catcher, sig); /* Make sure we have a signal trampoline to return to. */ if (p->p_md.md_sigtramp == NULL) { /* * No signal tramoline... kill the process. */ CTR0(KTR_SIG, "sendsig: no sigtramp"); printf("sendsig: %s is too old, rebuild it\n", p->p_comm); sigexit(td, sig); /* NOTREACHED */ } /* Save user context. */ bzero(&sf, sizeof(sf)); get_mcontext(td, &sf.sf_uc.uc_mcontext, 0); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = td->td_sigstk; sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; /* Allocate and validate space for the signal handler context. */ if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(td->td_sigstk.ss_sp + td->td_sigstk.ss_size - sizeof(struct sigframe)); } else sfp = (struct sigframe *)sp - 1; mtx_unlock(&psp->ps_mtx); PROC_UNLOCK(p); fp = (struct frame *)sfp - 1; /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ tf->tf_out[0] = sig; tf->tf_out[2] = (register_t)&sfp->sf_uc; tf->tf_out[4] = (register_t)catcher; /* Fill siginfo structure. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_addr = (void *)tf->tf_tpc; if (SIGISMEMBER(psp->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ tf->tf_out[1] = (register_t)&sfp->sf_si; /* Fill in POSIX parts. */ sf.sf_si = ksi->ksi_info; sf.sf_si.si_signo = sig; /* maybe a translated signal */ } else { /* Old FreeBSD-style arguments. */ tf->tf_out[1] = ksi->ksi_code; tf->tf_out[3] = (register_t)ksi->ksi_addr; } /* Copy the sigframe out to the user's stack. */ if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 || suword(&fp->fr_in[6], tf->tf_out[6]) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. 
*/ CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp); PROC_LOCK(p); sigexit(td, SIGILL); /* NOTREACHED */ } tf->tf_tpc = (u_long)p->p_md.md_sigtramp; tf->tf_tnpc = tf->tf_tpc + 4; tf->tf_sp = (u_long)fp - SPOFF; CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc, tf->tf_sp); PROC_LOCK(p); mtx_lock(&psp->ps_mtx); } #ifndef _SYS_SYSPROTO_H_ struct sigreturn_args { ucontext_t *ucp; }; #endif /* * MPSAFE */ int sigreturn(struct thread *td, struct sigreturn_args *uap) { struct proc *p; mcontext_t *mc; ucontext_t uc; int error; p = td->td_proc; if (rwindow_save(td)) { PROC_LOCK(p); sigexit(td, SIGILL); } CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp); if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) { CTR1(KTR_SIG, "sigreturn: efault td=%p", td); return (EFAULT); } mc = &uc.uc_mcontext; error = set_mcontext(td, mc); if (error != 0) return (error); PROC_LOCK(p); td->td_sigmask = uc.uc_sigmask; SIG_CANTMASK(td->td_sigmask); signotify(td); PROC_UNLOCK(p); CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx", td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate); return (EJUSTRETURN); } #ifdef COMPAT_FREEBSD4 int freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap) { return sigreturn(td, (struct sigreturn_args *)uap); } #endif /* * Construct a PCB from a trapframe. This is called from kdb_trap() where * we want to start a backtrace from the function that caused us to enter * the debugger. We have the context in the trapframe, but base the trace * on the PCB. The PCB doesn't have to be perfect, as long as it contains * enough for a backtrace. */ void makectx(struct trapframe *tf, struct pcb *pcb) { pcb->pcb_pc = tf->tf_tpc; pcb->pcb_sp = tf->tf_sp; } int get_mcontext(struct thread *td, mcontext_t *mc, int flags) { struct trapframe *tf; struct pcb *pcb; tf = td->td_frame; pcb = td->td_pcb; bcopy(tf, mc, sizeof(*tf)); if (flags & GET_MC_CLEAR_RET) { mc->mc_out[0] = 0; mc->mc_out[1] = 0; } mc->mc_flags = _MC_VERSION; critical_enter(); if ((tf->tf_fprs & FPRS_FEF) != 0) { savefpctx(pcb->pcb_ufp); pcb->pcb_flags |= PCB_FEF; tf->tf_fprs &= ~FPRS_FEF; } if ((pcb->pcb_flags & PCB_FEF) != 0) { bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp)); mc->mc_fprs |= FPRS_FEF; } critical_exit(); return (0); } int set_mcontext(struct thread *td, const mcontext_t *mc) { struct trapframe *tf; struct pcb *pcb; uint64_t wstate; if (!TSTATE_SECURE(mc->mc_tstate) || (mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION) return (EINVAL); tf = td->td_frame; pcb = td->td_pcb; /* Make sure the windows are spilled first. */ flushw(); wstate = tf->tf_wstate; bcopy(mc, tf, sizeof(*tf)); tf->tf_wstate = wstate; if ((mc->mc_fprs & FPRS_FEF) != 0) { tf->tf_fprs = 0; bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); pcb->pcb_flags |= PCB_FEF; } return (0); } /* * Exit the kernel and execute a firmware call that will not return, as * specified by the arguments. */ void cpu_shutdown(void *args) { #ifdef SMP cpu_mp_shutdown(); #endif hv_mach_exit(0); } /* Get current clock frequency for the given cpu id. */ int cpu_est_clockrate(int cpu_id, uint64_t *rate) { return (ENXIO); } /* * Duplicate OF_exit() with a different firmware call function that restores * the trap table, otherwise a RED state exception is triggered in at least * some firmware versions. 
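 *
 * For reference, the argument blocks used by cpu_halt() and
 * sparc64_shutdown_final() below follow the usual Open Firmware
 * client-interface layout: a service-name cell, the number of argument
 * cells, the number of return cells, and then the cells themselves.
 * A sketch with one argument cell (the service name here is made up,
 * only the layout matters):
 *
 *	static struct {
 *		cell_t	name;
 *		cell_t	nargs;
 *		cell_t	nreturns;
 *		cell_t	arg;
 *	} args = { (cell_t)"example-service", 1, 0, (cell_t)0 };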
*/ void cpu_halt(void) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"exit", 0, 0 }; cpu_shutdown(&args); } void sparc64_shutdown_final(void *dummy, int howto) { static struct { cell_t name; cell_t nargs; cell_t nreturns; } args = { (cell_t)"SUNW,power-off", 0, 0 }; /* Turn the power off? */ if ((howto & RB_POWEROFF) != 0) cpu_shutdown(&args); /* In case of halt, return to the firmware */ if ((howto & RB_HALT) != 0) cpu_halt(); } void cpu_idle(void) { if (rdpr(pil) != 0) panic("pil in cpu_idle not 0 - %ld", rdpr(pil)); if (rdpr(pstate) != 0x16) panic("interrupts disabled in cpu_idle 0x%lx", rdpr(pstate)); /* XXX heinous hack begin*/ cpu_yield(); } int ptrace_set_pc(struct thread *td, u_long addr) { td->td_frame->tf_tpc = addr; td->td_frame->tf_tnpc = addr + 4; return (0); } int ptrace_single_step(struct thread *td) { /* TODO; */ return (0); } int ptrace_clear_single_step(struct thread *td) { /* TODO; */ return (0); } void exec_setregs(struct thread *td, u_long entry, u_long stack, u_long ps_strings) { struct trapframe *tf; struct pcb *pcb; struct proc *p; uint64_t kstack; u_long sp; /* XXX no cpu_exec */ p = td->td_proc; p->p_md.md_sigtramp = NULL; if (p->p_md.md_utrap != NULL) { utrap_free(p->p_md.md_utrap); p->p_md.md_utrap = NULL; } pcb = td->td_pcb; kstack = pcb->pcb_kstack; tf = td->td_frame; sp = rounddown(stack, 16); bzero(pcb, sizeof(*pcb)); bzero(tf, sizeof(*tf)); pcb->pcb_kstack = kstack; tf->tf_out[0] = stack; tf->tf_out[3] = p->p_sysent->sv_psstrings; tf->tf_out[6] = sp - SPOFF - sizeof(struct frame); tf->tf_tnpc = entry + 4; tf->tf_tpc = entry; tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO; td->td_retval[0] = tf->tf_out[0]; td->td_retval[1] = tf->tf_out[1]; } int fill_regs(struct thread *td, struct reg *regs) { bcopy(td->td_frame, regs, sizeof(*regs)); return (0); } int set_regs(struct thread *td, struct reg *regs) { struct trapframe *tf; if (!TSTATE_SECURE(regs->r_tstate)) return (EINVAL); tf = td->td_frame; regs->r_wstate = tf->tf_wstate; bcopy(regs, tf, sizeof(*regs)); return (0); } int fill_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int set_dbregs(struct thread *td, struct dbreg *dbregs) { return (ENOSYS); } int fill_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs = ~FPRS_FEF; bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs)); fpregs->fr_fsr = tf->tf_fsr; fpregs->fr_gsr = tf->tf_gsr; return (0); } int set_fpregs(struct thread *td, struct fpreg *fpregs) { struct trapframe *tf; struct pcb *pcb; pcb = td->td_pcb; tf = td->td_frame; tf->tf_fprs &= ~FPRS_FEF; bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp)); tf->tf_fsr = fpregs->fr_fsr; tf->tf_gsr = fpregs->fr_gsr; return (0); } struct md_utrap * utrap_alloc(void) { struct md_utrap *ut; ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO); ut->ut_refcnt = 1; return (ut); } void utrap_free(struct md_utrap *ut) { int refcnt; if (ut == NULL) return; mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt--; refcnt = ut->ut_refcnt; mtx_pool_unlock(mtxpool_sleep, ut); if (refcnt == 0) free(ut, M_SUBPROC); } struct md_utrap * utrap_hold(struct md_utrap *ut) { if (ut == NULL) return (NULL); mtx_pool_lock(mtxpool_sleep, ut); ut->ut_refcnt++; mtx_pool_unlock(mtxpool_sleep, ut); return (ut); } void cpu_yield(void) { if (rdpr(pil) < PIL_TICK) hv_cpu_yield(); } Index: head/sys/sun4v/sun4v/pmap.c 
=================================================================== --- head/sys/sun4v/sun4v/pmap.c (revision 169666) +++ head/sys/sun4v/sun4v/pmap.c (revision 169667) @@ -1,2234 +1,2234 @@ /*- * Copyright (c) 2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_kstack_pages.h" #include "opt_msgbuf.h" #include "opt_pmap.h" #include "opt_trap_trace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TRAP_TRACING void trap_trace_report(int); #endif #if 1 #define PMAP_DEBUG #endif #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif cache_enable_t *cache_enable; cache_flush_t *cache_flush; dcache_page_inval_t *dcache_page_inval; icache_page_inval_t *icache_page_inval; /* * Virtual and physical address of message buffer. */ struct msgbuf *msgbufp; vm_paddr_t msgbuf_phys; /* * Map of physical memory reagions. */ vm_paddr_t phys_avail[128]; vm_paddr_t phys_avail_tmp[128]; static struct ofw_mem_region mra[128]; static struct ofw_map translations[128]; static int translations_size; struct ofw_mem_region sparc64_memreg[128]; int sparc64_nmemreg; extern vm_paddr_t mmu_fault_status_area; /* * First and last available kernel virtual addresses. */ vm_offset_t virtual_avail; vm_offset_t virtual_end; vm_offset_t kernel_vm_end; vm_offset_t vm_max_kernel_address; #ifndef PMAP_SHPGPERPROC #define PMAP_SHPGPERPROC 200 #endif /* * Data for the pv entry allocation mechanism */ static uma_zone_t pvzone; static struct vm_object pvzone_obj; static int pv_entry_count = 0, pv_entry_max = 0, pv_entry_high_water = 0; int pmap_debug = 0; static int pmap_debug_range = 1; static int use_256M_pages = 1; static struct mtx pmap_ctx_lock; static uint16_t ctx_stack[PMAP_CONTEXT_MAX]; static int ctx_stack_top; static int permanent_mappings = 0; static uint64_t nucleus_memory; static uint64_t nucleus_mappings[4]; /* * Kernel pmap. 
*/ struct pmap kernel_pmap_store; hv_tsb_info_t kernel_td[MAX_TSB_INFO]; /* * This should be determined at boot time * with tiny TLBS it doesn't make sense to try and selectively * invalidate more than this */ #define MAX_INVALIDATES 32 #define MAX_TSB_CLEARS 128 /* * Allocate physical memory for use in pmap_bootstrap. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size); /* * If user pmap is processed with pmap_remove and with pmap_remove and the * resident count drops to 0, there are no more pages to remove, so we * need not continue. */ #define PMAP_REMOVE_DONE(pm) \ ((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0) /* * Kernel MMU interface */ #define curthread_pmap vmspace_pmap(curthread->td_proc->p_vmspace) #ifdef PMAP_DEBUG #define KDPRINTF if (pmap_debug) printf #define DPRINTF \ if (curthread_pmap && (curthread_pmap->pm_context != 0) && ((PCPU_GET(cpumask) & curthread_pmap->pm_active) == 0)) \ panic("cpumask(0x%x) & active (0x%x) == 0 pid == %d\n", \ PCPU_GET(cpumask), curthread_pmap->pm_active, curthread->td_proc->p_pid); \ if (pmap_debug) printf #else #define DPRINTF(...) #define KDPRINTF(...) #endif static void free_pv_entry(pv_entry_t pv); static pv_entry_t get_pv_entry(pmap_t locked_pmap); static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m); static void pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va); static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va); static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot); static void pmap_tsb_reset(pmap_t pmap); static void pmap_tsb_resize(pmap_t pmap); static void pmap_tte_hash_resize(pmap_t pmap); void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap); struct tsb_resize_info { uint64_t tri_tsbscratch; uint64_t tri_tsb_ra; }; /* * Quick sort callout for comparing memory regions. */ static int mr_cmp(const void *a, const void *b); static int om_cmp(const void *a, const void *b); static int mr_cmp(const void *a, const void *b) { const struct ofw_mem_region *mra; const struct ofw_mem_region *mrb; mra = a; mrb = b; if (mra->mr_start < mrb->mr_start) return (-1); else if (mra->mr_start > mrb->mr_start) return (1); else return (0); } static int om_cmp(const void *a, const void *b) { const struct ofw_map *oma; const struct ofw_map *omb; oma = a; omb = b; if (oma->om_start < omb->om_start) return (-1); else if (oma->om_start > omb->om_start) return (1); else return (0); } static __inline void free_context(uint16_t ctx) { mtx_lock_spin(&pmap_ctx_lock); ctx_stack[ctx_stack_top++] = ctx; mtx_unlock_spin(&pmap_ctx_lock); KASSERT(ctx_stack_top < PMAP_CONTEXT_MAX, ("context stack overrun - system error")); } static __inline uint16_t get_context(void) { uint16_t ctx; mtx_lock_spin(&pmap_ctx_lock); ctx = ctx_stack[--ctx_stack_top]; mtx_unlock_spin(&pmap_ctx_lock); KASSERT(ctx_stack_top > 0, ("context stack underrun - need to implement context stealing")); return ctx; } static __inline void free_pv_entry(pv_entry_t pv) { pv_entry_count--; uma_zfree(pvzone, pv); } /* * get a new pv_entry, allocating a block from the system * when needed. 
*/ static pv_entry_t get_pv_entry(pmap_t locked_pmap) { static const struct timeval printinterval = { 60, 0 }; static struct timeval lastprint; struct vpgqueues *vpq; uint64_t tte_data; pmap_t pmap; pv_entry_t allocated_pv, next_pv, pv; vm_offset_t va; vm_page_t m; PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); allocated_pv = uma_zalloc(pvzone, M_NOWAIT); if (allocated_pv != NULL) { pv_entry_count++; if (pv_entry_count > pv_entry_high_water) pagedaemon_wakeup(); else return (allocated_pv); } /* * Reclaim pv entries: At first, destroy mappings to inactive * pages. After that, if a pv entry is still needed, destroy * mappings to active pages. */ if (ratecheck(&lastprint, &printinterval)) printf("Approaching the limit on PV entries, " "increase the vm.pmap.shpgperproc tunable.\n"); vpq = &vm_page_queues[PQ_INACTIVE]; retry: TAILQ_FOREACH(m, &vpq->pl, pageq) { if (m->hold_count || m->busy) continue; TAILQ_FOREACH_SAFE(pv, &m->md.pv_list, pv_list, next_pv) { va = pv->pv_va; pmap = pv->pv_pmap; /* Avoid deadlock and lock recursion. */ if (pmap > locked_pmap) PMAP_LOCK(pmap); else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) continue; pmap->pm_stats.resident_count--; tte_data = tte_hash_delete(pmap->pm_hash, va); KASSERT((tte_data & VTD_WIRED) == 0, ("get_pv_entry: wired pte %#jx", (uintmax_t)tte_data)); if (tte_data & VTD_REF) vm_page_flag_set(m, PG_REFERENCED); if (tte_data & VTD_W) { KASSERT((tte_data & VTD_SW_W), ("get_pv_entry: modified page not writable: va: %lx, tte: %lx", va, tte_data)); vm_page_dirty(m); } pmap_invalidate_page(pmap, va, TRUE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); m->md.pv_list_count--; if (pmap != locked_pmap) PMAP_UNLOCK(pmap); if (allocated_pv == NULL) allocated_pv = pv; else free_pv_entry(pv); } } if (allocated_pv == NULL) { if (vpq == &vm_page_queues[PQ_INACTIVE]) { vpq = &vm_page_queues[PQ_ACTIVE]; goto retry; } panic("get_pv_entry: increase the vm.pmap.shpgperproc tunable"); } return (allocated_pv); } /* * Allocate a physical page of memory directly from the phys_avail map. * Can only be called from pmap_bootstrap before avail start and end are * calculated. */ static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size) { vm_paddr_t pa; int i; size = round_page(size); for (i = 0; phys_avail[i + 1] != 0; i += 2) { if (phys_avail[i + 1] - phys_avail[i] < size) continue; pa = phys_avail[i]; phys_avail[i] += size; pmap_scrub_pages(pa, size); return (pa); } panic("pmap_bootstrap_alloc"); } /* * Activate a user pmap. The pmap must be activated before its address space * can be accessed in any way. 
*/ void pmap_activate(struct thread *td) { pmap_t pmap, oldpmap; int err; critical_enter(); pmap = vmspace_pmap(td->td_proc->p_vmspace); oldpmap = PCPU_GET(curpmap); #if defined(SMP) atomic_clear_int(&oldpmap->pm_active, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_tlbactive, PCPU_GET(cpumask)); atomic_set_int(&pmap->pm_active, PCPU_GET(cpumask)); #else oldpmap->pm_active &= ~1; pmap->pm_active |= 1; pmap->pm_tlbactive |= 1; #endif pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context); pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb); pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0; PCPU_SET(curpmap, pmap); if (pmap->pm_context != 0) if ((err = hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra)) != H_EOK) panic("failed to set TSB 0x%lx - context == %ld\n", pmap->pm_tsb_ra, pmap->pm_context); stxa(MMU_CID_S, ASI_MMU_CONTEXTID, pmap->pm_context); membar(Sync); critical_exit(); } vm_offset_t pmap_addr_hint(vm_object_t object, vm_offset_t va, vm_size_t size) { return (va); } /* * Bootstrap the system enough to run with virtual memory. */ void pmap_bootstrap(vm_offset_t ekva) { struct pmap *pm; vm_offset_t off, va; vm_paddr_t pa, tsb_8k_pa, tsb_4m_pa, kernel_hash_pa, nucleus_memory_start; vm_size_t physsz, virtsz, kernel_hash_shift; ihandle_t pmem, vmem; int i, j, k, sz; uint64_t tsb_8k_size, tsb_4m_size, error, physmem_tunable, physmemstart_tunable; vm_paddr_t real_phys_avail[128], tmp_phys_avail[128], bounds; if ((vmem = OF_finddevice("/virtual-memory")) == -1) panic("pmap_bootstrap: finddevice /virtual-memory"); if ((sz = OF_getproplen(vmem, "translations")) == -1) panic("pmap_bootstrap: getproplen translations"); if (sizeof(translations) < sz) panic("pmap_bootstrap: translations too small"); bzero(translations, sz); if (OF_getprop(vmem, "translations", translations, sz) == -1) panic("pmap_bootstrap: getprop /virtual-memory/translations"); sz /= sizeof(*translations); translations_size = sz; nucleus_memory_start = 0; CTR0(KTR_PMAP, "pmap_bootstrap: translations"); qsort(translations, sz, sizeof (*translations), om_cmp); for (i = 0; i < sz; i++) { KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); if ((translations[i].om_start >= KERNBASE) && (translations[i].om_start <= KERNBASE + 3*PAGE_SIZE_4M)) { for (j = 0; j < translations[i].om_size; j += PAGE_SIZE_4M) { KDPRINTF("mapping permanent translation\n"); pa = TTE_GET_PA(translations[i].om_tte) + j; va = translations[i].om_start + j; error = hv_mmu_map_perm_addr(va, KCONTEXT, pa | TTE_KERNEL | VTD_4M, MAP_ITLB | MAP_DTLB); if (error != H_EOK) panic("map_perm_addr returned error=%ld", error); if ((nucleus_memory_start == 0) || (pa < nucleus_memory_start)) nucleus_memory_start = pa; printf("nucleus_mappings[%d] = 0x%lx\n", permanent_mappings, pa); nucleus_mappings[permanent_mappings++] = pa; nucleus_memory += PAGE_SIZE_4M; #ifdef SMP mp_add_nucleus_mapping(va, pa|TTE_KERNEL|VTD_4M); #endif } } } /* * Find out what physical memory is available from the prom and * initialize the phys_avail array. This must be done before * pmap_bootstrap_alloc is called. 
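 *
 * As a worked example of the 4MB trimming applied below (assuming
 * PAGE_SIZE_4M is 4MB, so PAGE_MASK_4M == 0x3fffff), an "available"
 * region with mr_start 0x00c02000 and mr_size 0x01800000 becomes:
 *
 *	newstart = (0x00c02000 + 0x3fffff) & ~0x3fffff	 = 0x01000000
 *	roundup	 = 0x01000000 - 0x00c02000		 = 0x003fe000
 *	size	 = (0x01800000 - 0x003fe000) & ~0x3fffff = 0x01400000
 *
 * i.e. the region is reduced to the 4MB-aligned range
 * [0x01000000, 0x02400000).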
*/ if ((pmem = OF_finddevice("/memory")) == -1) panic("pmap_bootstrap: finddevice /memory"); if ((sz = OF_getproplen(pmem, "available")) == -1) panic("pmap_bootstrap: getproplen /memory/available"); if (sizeof(vm_paddr_t)*128 < sz) /* FIXME */ panic("pmap_bootstrap: phys_avail too small"); if (sizeof(mra) < sz) panic("pmap_bootstrap: mra too small"); bzero(mra, sz); if (OF_getprop(pmem, "available", mra, sz) == -1) panic("pmap_bootstrap: getprop /memory/available"); sz /= sizeof(*mra); CTR0(KTR_PMAP, "pmap_bootstrap: physical memory"); qsort(mra, sz, sizeof (*mra), mr_cmp); physmemstart_tunable = physmem_tunable = physmem = physsz = 0; if (TUNABLE_ULONG_FETCH("hw.physmemstart", &physmemstart_tunable)) { KDPRINTF("desired physmemstart=0x%lx\n", physmemstart_tunable); } if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable)) { physmem = atop(physmem_tunable); KDPRINTF("desired physmem=0x%lx\n", physmem_tunable); } if ((physmem_tunable != 0) && (physmemstart_tunable != 0)) physmem_tunable += physmemstart_tunable; bzero(real_phys_avail, sizeof(real_phys_avail)); bzero(tmp_phys_avail, sizeof(tmp_phys_avail)); for (i = 0, j = 0; i < sz; i++) { uint64_t size; KDPRINTF("start=%#lx size=%#lx\n", mra[i].mr_start, mra[i].mr_size); if (mra[i].mr_size < PAGE_SIZE_4M) continue; if ((mra[i].mr_start & PAGE_MASK_4M) || (mra[i].mr_size & PAGE_MASK_4M)) { uint64_t newstart, roundup; newstart = ((mra[i].mr_start + (PAGE_MASK_4M)) & ~PAGE_MASK_4M); roundup = newstart - mra[i].mr_start; size = (mra[i].mr_size - roundup) & ~PAGE_MASK_4M; mra[i].mr_start = newstart; if (size < PAGE_SIZE_4M) continue; mra[i].mr_size = size; } real_phys_avail[j] = mra[i].mr_start; if (physmem_tunable != 0 && ((physsz + mra[i].mr_size) >= physmem_tunable)) { mra[i].mr_size = physmem_tunable - physsz; physsz = physmem_tunable; real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; break; } physsz += mra[i].mr_size; real_phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size; j += 2; } physmem = btoc(physsz - physmemstart_tunable); /* * This is needed for versions of OFW that would allocate us memory * and then forget to remove it from the available ranges ... * as well as for compensating for the above move of nucleus pages */ for (i = 0, j = 0, bounds = (1UL<<32); real_phys_avail[i] != 0; i += 2) { vm_paddr_t start = real_phys_avail[i]; uint64_t end = real_phys_avail[i + 1]; CTR2(KTR_PMAP, "start=%#lx size=%#lx\n", start, end); KDPRINTF("real_phys start=%#lx end=%#lx\n", start, end); /* * Is kernel memory at the beginning of range? */ if (nucleus_memory_start == start) { start += nucleus_memory; } /* * Is kernel memory at the end of range? */ if (nucleus_memory_start == (end - nucleus_memory)) end -= nucleus_memory; if (physmemstart_tunable != 0 && (end < physmemstart_tunable)) continue; if (physmemstart_tunable != 0 && ((start < physmemstart_tunable))) { start = physmemstart_tunable; } /* * Is kernel memory in the middle somewhere? 
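 *
 * For example (made-up addresses): with the nucleus occupying
 * [0x40000000, 0x40800000) in the middle of an available range
 * [0x3c000000, 0x48000000), the split below produces the phys_avail
 * pairs [0x3c000000, 0x40000000) and [0x40800000, 0x48000000); the
 * 4GB splitting that follows may subdivide those pieces further.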
*/ if ((nucleus_memory_start > start) && (nucleus_memory_start < end)) { phys_avail[j] = start; phys_avail[j+1] = nucleus_memory_start; start = nucleus_memory_start + nucleus_memory; j += 2; } /* * Break phys_avail up on 4GB boundaries to try * to work around PCI-e allocation bug * we rely on the fact that kernel memory is allocated * from the first 4GB of physical memory */ while (bounds < start) bounds += (1UL<<32); while (bounds < end) { phys_avail[j] = start; phys_avail[j + 1] = bounds; start = bounds; bounds += (1UL<<32); j += 2; } phys_avail[j] = start; phys_avail[j + 1] = end; j += 2; } /* * Merge nucleus memory in to real_phys_avail * */ for (i = 0; real_phys_avail[i] != 0; i += 2) { if (real_phys_avail[i] == nucleus_memory_start + nucleus_memory) real_phys_avail[i] -= nucleus_memory; if (real_phys_avail[i + 1] == nucleus_memory_start) real_phys_avail[i + 1] += nucleus_memory; if (real_phys_avail[i + 1] == real_phys_avail[i + 2]) { real_phys_avail[i + 1] = real_phys_avail[i + 3]; for (k = i + 2; real_phys_avail[k] != 0; k += 2) { real_phys_avail[k] = real_phys_avail[k + 2]; real_phys_avail[k + 1] = real_phys_avail[k + 3]; } } } for (i = 0; phys_avail[i] != 0; i += 2) if (pmap_debug_range || pmap_debug) printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); /* * Shuffle the memory range containing the 256MB page with * nucleus_memory to the beginning of the phys_avail array * so that physical memory from that page is preferentially * allocated first */ for (j = 0; phys_avail[j] != 0; j += 2) if (nucleus_memory_start < phys_avail[j]) break; /* * Don't shuffle unless we have a full 256M page in the range * our kernel malloc appears to be horribly brittle */ if ((phys_avail[j + 1] - phys_avail[j]) < (PAGE_SIZE_256M - nucleus_memory)) goto skipshuffle; for (i = j, k = 0; phys_avail[i] != 0; k++, i++) tmp_phys_avail[k] = phys_avail[i]; for (i = 0; i < j; i++) tmp_phys_avail[k + i] = phys_avail[i]; for (i = 0; i < 128; i++) phys_avail[i] = tmp_phys_avail[i]; skipshuffle: for (i = 0; real_phys_avail[i] != 0; i += 2) if (pmap_debug_range || pmap_debug) printf("real_phys_avail[%d]=0x%lx real_phys_avail[%d]=0x%lx\n", i, real_phys_avail[i], i+1, real_phys_avail[i+1]); for (i = 0; phys_avail[i] != 0; i += 2) if (pmap_debug_range || pmap_debug) printf("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); /* * Calculate the size of kernel virtual memory, and the size and mask * for the kernel tsb. */ virtsz = roundup(physsz, PAGE_SIZE_4M << (PAGE_SHIFT - TTE_SHIFT)); vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz; /* * Set the start and end of kva. The kernel is loaded at the first * available 4 meg super page, so round up to the end of the page. 
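 *
 * For example (made-up value, assuming PAGE_SIZE_4M is 4MB):
 * roundup2(0xc0a12000, PAGE_SIZE_4M) == 0xc0c00000, so kva begins at
 * the first 4MB boundary at or above the end of the loaded kernel.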
*/ virtual_avail = roundup2(ekva, PAGE_SIZE_4M); virtual_end = vm_max_kernel_address; kernel_vm_end = vm_max_kernel_address; /* * Allocate and map a 4MB page for the kernel hashtable * */ #ifndef SIMULATOR kernel_hash_shift = 10; /* PAGE_SIZE_4M*2 */ #else kernel_hash_shift = 6; /* PAGE_SIZE_8K*64 */ #endif kernel_hash_pa = pmap_bootstrap_alloc((1<<(kernel_hash_shift + PAGE_SHIFT))); if (kernel_hash_pa & PAGE_MASK_4M) panic("pmap_bootstrap: hashtable pa unaligned\n"); /* * Set up TSB descriptors for the hypervisor * */ #ifdef notyet tsb_8k_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT); #else /* avoid alignment complaints from the hypervisor */ tsb_8k_size = PAGE_SIZE_4M; #endif tsb_8k_pa = pmap_bootstrap_alloc(tsb_8k_size); if (tsb_8k_pa & PAGE_MASK_4M) panic("pmap_bootstrap: tsb unaligned\n"); KDPRINTF("tsb_8k_size is 0x%lx, tsb_8k_pa is 0x%lx\n", tsb_8k_size, tsb_8k_pa); tsb_4m_size = (virtsz >> (PAGE_SHIFT_4M - TTE_SHIFT)) << 3; tsb_4m_pa = pmap_bootstrap_alloc(tsb_4m_size); kernel_td[TSB8K_INDEX].hti_idxpgsz = TTE8K; kernel_td[TSB8K_INDEX].hti_assoc = 1; kernel_td[TSB8K_INDEX].hti_ntte = (tsb_8k_size >> TTE_SHIFT); kernel_td[TSB8K_INDEX].hti_ctx_index = 0; kernel_td[TSB8K_INDEX].hti_pgszs = TSB8K; kernel_td[TSB8K_INDEX].hti_rsvd = 0; kernel_td[TSB8K_INDEX].hti_ra = tsb_8k_pa; /* * Initialize kernel's private TSB from 8K page TSB * */ kernel_pmap->pm_tsb.hti_idxpgsz = TTE8K; kernel_pmap->pm_tsb.hti_assoc = 1; kernel_pmap->pm_tsb.hti_ntte = (tsb_8k_size >> TTE_SHIFT); kernel_pmap->pm_tsb.hti_ctx_index = 0; kernel_pmap->pm_tsb.hti_pgszs = TSB8K; kernel_pmap->pm_tsb.hti_rsvd = 0; kernel_pmap->pm_tsb.hti_ra = tsb_8k_pa; kernel_pmap->pm_tsb_ra = vtophys((vm_offset_t)&kernel_pmap->pm_tsb); tsb_set_scratchpad_kernel(&kernel_pmap->pm_tsb); /* * Initialize kernel TSB for 4M pages * currently (not by design) used for permanent mappings */ KDPRINTF("tsb_4m_pa is 0x%lx tsb_4m_size is 0x%lx\n", tsb_4m_pa, tsb_4m_size); kernel_td[TSB4M_INDEX].hti_idxpgsz = TTE4M; kernel_td[TSB4M_INDEX].hti_assoc = 1; kernel_td[TSB4M_INDEX].hti_ntte = (tsb_4m_size >> TTE_SHIFT); kernel_td[TSB4M_INDEX].hti_ctx_index = 0; kernel_td[TSB4M_INDEX].hti_pgszs = TSB4M|TSB256M; kernel_td[TSB4M_INDEX].hti_rsvd = 0; kernel_td[TSB4M_INDEX].hti_ra = tsb_4m_pa; /* * allocate MMU fault status areas for all CPUS */ mmu_fault_status_area = pmap_bootstrap_alloc(MMFSA_SIZE*MAXCPU); /* * Allocate and map the message buffer. */ msgbuf_phys = pmap_bootstrap_alloc(MSGBUF_SIZE); msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(msgbuf_phys); /* * Allocate a kernel stack with guard page for thread0 and map it into * the kernel tsb. */ pa = pmap_bootstrap_alloc(KSTACK_PAGES*PAGE_SIZE); kstack0_phys = pa; virtual_avail += KSTACK_GUARD_PAGES * PAGE_SIZE; kstack0 = virtual_avail; virtual_avail += KSTACK_PAGES * PAGE_SIZE; for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | TTE_KERNEL | VTD_8K, 0); } /* * Calculate the last available physical address. */ for (i = 0; phys_avail[i + 2] != 0; i += 2) KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); KDPRINTF("phys_avail[%d]=0x%lx phys_avail[%d]=0x%lx\n", i, phys_avail[i], i+1, phys_avail[i+1]); Maxmem = sparc64_btop(phys_avail[i + 1]); /* * Add the prom mappings to the kernel tsb. 
*/ for (i = 0; i < sz; i++) { CTR3(KTR_PMAP, "translation: start=%#lx size=%#lx tte=%#lx", translations[i].om_start, translations[i].om_size, translations[i].om_tte); KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) continue; for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; pa = TTE_GET_PA(translations[i].om_tte) + off; tsb_assert_invalid(&kernel_td[TSB8K_INDEX], va); tsb_set_tte_real(&kernel_td[TSB8K_INDEX], va, va, pa | TTE_KERNEL | VTD_8K, 0); } } if ((error = hv_mmu_tsb_ctx0(MAX_TSB_INFO, vtophys((vm_offset_t)kernel_td))) != H_EOK) panic("failed to set ctx0 TSBs error: %ld", error); #ifdef SMP mp_set_tsb_desc_ra(vtophys((vm_offset_t)&kernel_td)); #endif /* * setup direct mappings * */ for (i = 0, pa = real_phys_avail[i]; pa != 0; i += 2, pa = real_phys_avail[i]) { vm_paddr_t tag_pa = 0, next_pa = 0; uint64_t size_bits = VTD_4M; while (pa < real_phys_avail[i + 1]) { if (use_256M_pages && (pa & PAGE_MASK_256M) == 0 && ((pa + PAGE_SIZE_256M) <= real_phys_avail[i + 1])) { tag_pa = pa; size_bits = VTD_256M; next_pa = pa + PAGE_SIZE_256M; } else if (next_pa <= pa) { tag_pa = pa; size_bits = VTD_4M; } tsb_assert_invalid(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa)); tsb_set_tte_real(&kernel_td[TSB4M_INDEX], TLB_PHYS_TO_DIRECT(pa), TLB_PHYS_TO_DIRECT(pa), tag_pa | TTE_KERNEL | size_bits, 0); pa += PAGE_SIZE_4M; } } /* * Get the available physical memory ranges from /memory/reg. These * are only used for kernel dumps, but it may not be wise to do prom * calls in that situation. */ if ((sz = OF_getproplen(pmem, "reg")) == -1) panic("pmap_bootstrap: getproplen /memory/reg"); if (sizeof(sparc64_memreg) < sz) panic("pmap_bootstrap: sparc64_memreg too small"); if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1) panic("pmap_bootstrap: getprop /memory/reg"); sparc64_nmemreg = sz / sizeof(*sparc64_memreg); pm = kernel_pmap; pm->pm_active = ~0; pm->pm_tlbactive = ~0; PMAP_LOCK_INIT(kernel_pmap); TAILQ_INIT(&kernel_pmap->pm_pvlist); /* * This could happen earlier - but I put it here to avoid * attempts to do updates until they're legal */ pm->pm_hash = tte_hash_kernel_create(TLB_PHYS_TO_DIRECT(kernel_hash_pa), kernel_hash_shift, pmap_bootstrap_alloc(PAGE_SIZE)); pm->pm_hashscratch = tte_hash_set_scratchpad_kernel(pm->pm_hash); for (i = 0; i < translations_size; i++) { KDPRINTF("om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); if (translations[i].om_start < VM_MIN_PROM_ADDRESS || translations[i].om_start > VM_MAX_PROM_ADDRESS) { KDPRINTF("skipping\n"); continue; } for (off = 0; off < translations[i].om_size; off += PAGE_SIZE) { va = translations[i].om_start + off; pa = TTE_GET_PA(translations[i].om_tte) + off; tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K); } KDPRINTF("set om_size=%ld om_start=%lx om_tte=%lx\n", translations[i].om_size, translations[i].om_start, translations[i].om_tte); } for (i = 0; i < KSTACK_PAGES; i++) { pa = kstack0_phys + i * PAGE_SIZE; va = kstack0 + i * PAGE_SIZE; tte_hash_insert(pm->pm_hash, va, pa | TTE_KERNEL | VTD_8K); } /* * Add direct mappings to hash * */ #ifdef notyet /* hash only supports 8k pages */ for (pa = PAGE_SIZE_4M; pa < phys_avail[2]; pa += PAGE_SIZE_4M) tte_hash_insert(pm->pm_hash, TLB_PHYS_TO_DIRECT(pa), pa | TTE_KERNEL | VTD_4M); #endif } /* * 
Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) { boolean_t iswired; PMAP_LOCK(pmap); iswired = tte_get_virt_bit(pmap, va, VTD_WIRED); if (wired && !iswired) { pmap->pm_stats.wired_count++; tte_set_virt_bit(pmap, va, VTD_WIRED); } else if (!wired && iswired) { pmap->pm_stats.wired_count--; tte_clear_virt_bit(pmap, va, VTD_WIRED); } PMAP_UNLOCK(pmap); } void pmap_clear_modify(vm_page_t m) { KDPRINTF("pmap_clear_modify(0x%lx)\n", VM_PAGE_TO_PHYS(m)); tte_clear_phys_bit(m, VTD_W); } void pmap_clear_reference(vm_page_t m) { KDPRINTF("pmap_clear_reference(0x%lx)\n", VM_PAGE_TO_PHYS(m)); tte_clear_phys_bit(m, VTD_REF); } void pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len, vm_offset_t src_addr) { vm_offset_t addr, end_addr; end_addr = src_addr + len; /* * Don't let optional prefaulting of pages make us go * way below the low water mark of free pages or way * above high water mark of used pv entries. */ - if (cnt.v_free_count < cnt.v_free_reserved || + if (VMCNT_GET(free_count) < VMCNT_GET(free_reserved) || pv_entry_count > pv_entry_high_water) return; vm_page_lock_queues(); if (dst_pmap < src_pmap) { PMAP_LOCK(dst_pmap); PMAP_LOCK(src_pmap); } else { PMAP_LOCK(src_pmap); PMAP_LOCK(dst_pmap); } for (addr = src_addr; addr < end_addr; addr += PAGE_SIZE) { tte_t tte_data; vm_page_t m; tte_data = tte_hash_lookup(src_pmap->pm_hash, addr); if ((tte_data & VTD_MANAGED) != 0) { if (tte_hash_lookup(dst_pmap->pm_hash, addr) == 0) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); tte_hash_insert(dst_pmap->pm_hash, addr, tte_data & ~(VTD_W|VTD_REF|VTD_WIRED)); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, m); } } } vm_page_unlock_queues(); PMAP_UNLOCK(src_pmap); PMAP_UNLOCK(dst_pmap); } void pmap_copy_page(vm_page_t src, vm_page_t dst) { vm_paddr_t srcpa, dstpa; srcpa = VM_PAGE_TO_PHYS(src); dstpa = VM_PAGE_TO_PHYS(dst); novbcopy((char *)TLB_PHYS_TO_DIRECT(srcpa), (char *)TLB_PHYS_TO_DIRECT(dstpa), PAGE_SIZE); } static __inline void pmap_add_tte(pmap_t pmap, vm_offset_t va, vm_page_t m, tte_t *tte_data, int wired) { if (wired) pmap->pm_stats.wired_count++; if ((m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); *tte_data |= VTD_MANAGED; } } /* * Map the given physical page at the specified virtual address in the * target pmap with the protection requested. If specified the page * will be wired down. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, boolean_t wired) { vm_paddr_t pa, opa; uint64_t tte_data, otte_data; vm_page_t om; int invlva; if (pmap->pm_context) DPRINTF("pmap_enter(va=%lx, pa=0x%lx, prot=%x)\n", va, VM_PAGE_TO_PHYS(m), prot); om = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); tte_data = pa = VM_PAGE_TO_PHYS(m); otte_data = tte_hash_delete(pmap->pm_hash, va); opa = TTE_GET_PA(otte_data); if (opa == 0) { /* * This is a new mapping */ pmap->pm_stats.resident_count++; pmap_add_tte(pmap, va, m, &tte_data, wired); } else if (pa != opa) { /* * Mapping has changed, handle validating new mapping. * */ if (otte_data & VTD_WIRED) pmap->pm_stats.wired_count--; if (otte_data & VTD_MANAGED) { om = PHYS_TO_VM_PAGE(opa); pmap_remove_entry(pmap, om, va); } pmap_add_tte(pmap, va, m, &tte_data, wired); } else /* (pa == opa) */ { /* * Mapping has not changed, must be protection or wiring change. 
*/ /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && ((otte_data & VTD_WIRED) == 0)) pmap->pm_stats.wired_count++; else if (!wired && (otte_data & VTD_WIRED)) pmap->pm_stats.wired_count--; /* * We might be turning off write access to the page, * so we go ahead and sense modify status. */ if (otte_data & VTD_MANAGED) { om = m; tte_data |= VTD_MANAGED; } } /* * Now validate mapping with desired protection/wiring. */ if ((prot & VM_PROT_WRITE) != 0) { tte_data |= VTD_SW_W; vm_page_flag_set(m, PG_WRITEABLE); } if ((prot & VM_PROT_EXECUTE) != 0) tte_data |= VTD_X; if (wired) tte_data |= VTD_WIRED; if (pmap == kernel_pmap) tte_data |= VTD_P; invlva = FALSE; if ((otte_data & ~(VTD_W|VTD_REF)) != tte_data) { if (otte_data & VTD_V) { if (otte_data & VTD_REF) { if (otte_data & VTD_MANAGED) vm_page_flag_set(om, PG_REFERENCED); if ((opa != pa) || ((opa & VTD_X) != (pa & VTD_X))) invlva = TRUE; } if (otte_data & VTD_W) { if (otte_data & VTD_MANAGED) vm_page_dirty(om); if ((pa & VTD_SW_W) != 0) invlva = TRUE; } if (invlva) pmap_invalidate_page(pmap, va, TRUE); } } tte_hash_insert(pmap->pm_hash, va, tte_data|TTE_MINFLAGS|VTD_REF); /* * XXX this needs to be locked for the threaded / kernel case */ tsb_set_tte(&pmap->pm_tsb, va, tte_data|TTE_MINFLAGS|VTD_REF, pmap->pm_context); if (tte_hash_needs_resize(pmap->pm_hash)) pmap_tte_hash_resize(pmap); /* * 512 is an arbitrary number of tsb misses */ if (0 && pmap->pm_context != 0 && pmap->pm_tsb_miss_count > 512) pmap_tsb_resize(pmap); vm_page_unlock_queues(); PMAP_UNLOCK(pmap); } /* * Maps a sequence of resident pages belonging to the same object. * The sequence begins with the given page m_start. This page is * mapped at the given virtual address start. Each subsequent page is * mapped at a virtual address that is offset from start by the same * amount as the page is offset from m_start within the object. The * last page in the sequence is the page with the largest offset from * m_start that can be mapped at a virtual address less than the given * virtual address end. Not every virtual page between start and end * is mapped; only those for which a resident page exists with the * corresponding offset from m_start are mapped. */ void pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end, vm_page_t m_start, vm_prot_t prot) { vm_page_t m; vm_pindex_t diff, psize; VM_OBJECT_LOCK_ASSERT(m_start->object, MA_OWNED); psize = atop(end - start); m = m_start; PMAP_LOCK(pmap); while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) { pmap_enter_quick_locked(pmap, start + ptoa(diff), m, prot); m = TAILQ_NEXT(m, listq); } PMAP_UNLOCK(pmap); } void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { PMAP_LOCK(pmap); pmap_enter_quick_locked(pmap, va, m, prot); PMAP_UNLOCK(pmap); } static void pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot) { tte_t tte_data; if (pmap->pm_context) KDPRINTF("pmap_enter_quick(ctx=0x%lx va=%lx, pa=0x%lx prot=%x)\n", pmap->pm_context, va, VM_PAGE_TO_PHYS(m), prot); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (tte_hash_lookup(pmap->pm_hash, va)) return; tte_data = VM_PAGE_TO_PHYS(m); /* * Enter on the PV list if part of our managed memory. Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. 
*/ if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) { pmap_insert_entry(pmap, va, m); tte_data |= VTD_MANAGED; } pmap->pm_stats.resident_count++; if ((prot & VM_PROT_EXECUTE) != 0) tte_data |= VTD_X; tte_hash_insert(pmap->pm_hash, va, tte_data | TTE_MINFLAGS); } /* * Extract the physical page address associated with the given * map/virtual_address pair. */ vm_paddr_t pmap_extract(pmap_t pmap, vm_offset_t va) { vm_paddr_t pa; tte_t tte_data; tte_data = tte_hash_lookup(pmap->pm_hash, va); pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data)); return (pa); } /* * Atomically extract and hold the physical page with the given * pmap and virtual address pair if that mapping permits the given * protection. */ vm_page_t pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot) { tte_t tte_data; vm_page_t m; m = NULL; vm_page_lock_queues(); PMAP_LOCK(pmap); tte_data = tte_hash_lookup(pmap->pm_hash, va); if (tte_data != 0 && ((tte_data & VTD_SW_W) || (prot & VM_PROT_WRITE) == 0)) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); vm_page_hold(m); } vm_page_unlock_queues(); PMAP_UNLOCK(pmap); return (m); } void * pmap_alloc_zeroed_contig_pages(int npages, uint64_t alignment) { vm_page_t m, tm; int i; void *ptr; m = NULL; while (m == NULL) { for (i = 0; phys_avail[i + 1] != 0; i += 2) { m = vm_page_alloc_contig(npages, phys_avail[i], phys_avail[i + 1], alignment, (1UL<<34)); if (m) goto found; } if (m == NULL) { printf("vm_page_alloc_contig failed - waiting to retry\n"); VM_WAIT; } } found: for (i = 0, tm = m; i < npages; i++, tm++) { tm->wire_count++; if ((tm->flags & PG_ZERO) == 0) pmap_zero_page(tm); } ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); return (ptr); } void pmap_free_contig_pages(void *ptr, int npages) { int i; vm_page_t m; m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)ptr)); for (i = 0; i < npages; i++, m++) { m->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); vm_page_free(m); } } void pmap_growkernel(vm_offset_t addr) { return; } void pmap_init(void) { /* allocate pv_entry zones */ int shpgperproc = PMAP_SHPGPERPROC; for (ctx_stack_top = 1; ctx_stack_top < PMAP_CONTEXT_MAX; ctx_stack_top++) ctx_stack[ctx_stack_top] = ctx_stack_top; mtx_init(&pmap_ctx_lock, "ctx lock", NULL, MTX_SPIN); /* * Initialize the address space (zone) for the pv entries. Set a * high water mark so that the system can recover from excessive * numbers of pv entries. */ pvzone = uma_zcreate("PV ENTRY", sizeof(struct pv_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc); - pv_entry_max = shpgperproc * maxproc + cnt.v_page_count; + pv_entry_max = shpgperproc * maxproc + VMCNT_GET(page_count); TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); pv_entry_high_water = 9 * (pv_entry_max / 10); uma_zone_set_obj(pvzone, &pvzone_obj, pv_entry_max); tte_hash_init(); } /* * Create a pv entry for page at pa for * (pmap, va). 
*/ static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m) { pv_entry_t pv; KDPRINTF("pmap_insert_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m)); pv = get_pv_entry(pmap); pv->pv_va = va; pv->pv_pmap = pmap; PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); m->md.pv_list_count++; } #ifdef TRAP_TRACING static int trap_trace_report_done; #endif #ifdef SMP static cpumask_t pmap_ipi(pmap_t pmap, char *func, uint64_t arg1, uint64_t arg2) { int i, cpu_count, retried; u_int cpus; cpumask_t cpumask, active, curactive; cpumask_t active_total, ackmask; uint16_t *cpulist; retried = 0; if (!smp_started) return (0); cpumask = PCPU_GET(cpumask); cpulist = PCPU_GET(cpulist); curactive = 0; if (rdpr(pil) != 14) panic("pil %ld != 14", rdpr(pil)); #ifndef CPUMASK_NOT_BEING_ERRONEOUSLY_CHANGED /* by definition cpumask should have curcpu's bit set */ if (cpumask != (1 << curcpu)) panic("cpumask(0x%x) != (1 << curcpu) (0x%x)\n", cpumask, (1 << curcpu)); #endif #ifdef notyet if ((active_total = (pmap->pm_tlbactive & ~cpumask)) == 0) goto done; if (pmap->pm_context != 0) active_total = active = (pmap->pm_tlbactive & ~cpumask); else #endif active_total = active = PCPU_GET(other_cpus); if (active == 0) goto done; retry: for (i = curactive = cpu_count = 0, cpus = active; i < mp_ncpus && cpus; i++, cpus = (cpus>>1)) { if ((cpus & 0x1) == 0) continue; curactive |= (1 << i); cpulist[cpu_count] = (uint16_t)i; cpu_count++; } ackmask = 0; cpu_ipi_selected(cpu_count, cpulist, (uint64_t)func, (uint64_t)arg1, (uint64_t)arg2, (uint64_t *)&ackmask); while (ackmask != curactive) { membar(Sync); i++; if (i > 10000000) { #ifdef TRAP_TRACING int j; #endif uint64_t cpu_state; printf("cpu with cpumask=0x%x appears to not be responding to ipis\n", curactive & ~ackmask); #ifdef TRAP_TRACING if (!trap_trace_report_done) { trap_trace_report_done = 1; for (j = 0; j < MAXCPU; j++) if (((1 << j) & curactive & ~ackmask) != 0) { struct pcpu *pc = pcpu_find(j); printf("pcpu pad 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx 0x%jx\n", pc->pad[0], pc->pad[1], pc->pad[2], pc->pad[3], pc->pad[4], pc->pad[5], pc->pad[6]); trap_trace_report(j); } } #endif hv_cpu_state((uint64_t)ffs64(curactive & ~ackmask), &cpu_state); printf("cpu_state of %ld is %ld\n", ffs64(curactive & ~ackmask), cpu_state); if (!retried) { printf("I'm going to send off another ipi just to confirm that it isn't a memory barrier bug\n" "and then I'm going to panic\n"); retried = 1; goto retry; } panic(" ackmask=0x%x active=0x%x\n", ackmask, curactive); } } active_total |= curactive; if ((active = ((pmap->pm_tlbactive & all_cpus) & ~(active_total|cpumask))) != 0) { printf("pmap_ipi: retrying"); goto retry; } done: return (active_total); } #endif void pmap_invalidate_page(pmap_t pmap, vm_offset_t va, int cleartsb) { if (cleartsb == TRUE) tsb_clear_tte(&pmap->pm_tsb, va); DPRINTF("pmap_invalidate_page(va=0x%lx)\n", va); spinlock_enter(); invlpg(va, pmap->pm_context); #ifdef SMP pmap_ipi(pmap, (void *)tl_invlpg, (uint64_t)va, (uint64_t)pmap->pm_context); #endif spinlock_exit(); } void pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int cleartsb) { vm_offset_t tva, invlrngva; char *func; #ifdef SMP cpumask_t active; #endif if ((eva - sva) == PAGE_SIZE) { pmap_invalidate_page(pmap, sva, cleartsb); return; } KASSERT(sva < eva, ("invalidating negative or zero range sva=0x%lx eva=0x%lx", sva, eva)); if (cleartsb == TRUE) 
tsb_clear_range(&pmap->pm_tsb, sva, eva); spinlock_enter(); if ((sva - eva) < PAGE_SIZE*64) { for (tva = sva; tva < eva; tva += PAGE_SIZE_8K) invlpg(tva, pmap->pm_context); func = tl_invlrng; } else if (pmap->pm_context) { func = tl_invlctx; invlctx(pmap->pm_context); } else { func = tl_invltlb; invltlb(); } #ifdef SMP invlrngva = sva | ((eva - sva) >> PAGE_SHIFT); active = pmap_ipi(pmap, (void *)func, pmap->pm_context, invlrngva); active &= ~pmap->pm_active; atomic_clear_int(&pmap->pm_tlbactive, active); #endif spinlock_exit(); } void pmap_invalidate_all(pmap_t pmap) { KASSERT(pmap != kernel_pmap, ("invalidate_all called on kernel_pmap")); tsb_clear(&pmap->pm_tsb); spinlock_enter(); invlctx(pmap->pm_context); #ifdef SMP pmap_ipi(pmap, tl_invlctx, pmap->pm_context, 0); pmap->pm_tlbactive = pmap->pm_active; #endif spinlock_exit(); } boolean_t pmap_is_modified(vm_page_t m) { return (tte_get_phys_bit(m, VTD_W)); } boolean_t pmap_is_prefaultable(pmap_t pmap, vm_offset_t va) { return (tte_hash_lookup(pmap->pm_hash, va) == 0); } /* * Extract the physical page address associated with the given kernel virtual * address. */ vm_paddr_t pmap_kextract(vm_offset_t va) { tte_t tte_data; vm_paddr_t pa; pa = 0; if (va > KERNBASE && va < KERNBASE + nucleus_memory) { uint64_t offset; offset = va - KERNBASE; pa = nucleus_mappings[offset >> 22] | (va & PAGE_MASK_4M); } if ((pa == 0) && (tte_data = tsb_lookup_tte(va, 0)) != 0) pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data)); if ((pa == 0) && (tte_data = tte_hash_lookup(kernel_pmap->pm_hash, va)) != 0) pa = TTE_GET_PA(tte_data) | (va & TTE_GET_PAGE_MASK(tte_data)); return pa; } /* * Map a range of physical addresses into kernel virtual address space. * * The value passed in *virt is a suggested virtual address for the mapping. * Architectures which can support a direct-mapped physical to virtual region * can return the appropriate address within that region, leaving '*virt' * unchanged. */ vm_offset_t pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot) { return TLB_PHYS_TO_DIRECT(start); } int pmap_mincore(pmap_t pmap, vm_offset_t addr) { return (0); } void pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object, vm_pindex_t index, vm_size_t size) { printf("pmap_object_init_pt\n"); return; } /* * Returns true if the pmap's pv is one of the first * 16 pvs linked to from this page. This count may * be changed upwards or downwards in the future; it * is only necessary that true be returned for a small * subset of pmaps for proper page aging. */ boolean_t pmap_page_exists_quick(pmap_t pmap, vm_page_t m) { pv_entry_t pv; int loops = 0; if (m->flags & PG_FICTITIOUS) return FALSE; mtx_assert(&vm_page_queue_mtx, MA_OWNED); TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pv->pv_pmap == pmap) { return TRUE; } loops++; if (loops >= 16) break; } return (FALSE); } /* * Initialize a vm_page's machine-dependent fields. */ void pmap_page_init(vm_page_t m) { TAILQ_INIT(&m->md.pv_list); m->md.pv_list_count = 0; } /* * Lower the permission for all mappings to a given page. */ void pmap_remove_write(vm_page_t m) { if ((m->flags & PG_WRITEABLE) == 0) return; mtx_assert(&vm_page_queue_mtx, MA_OWNED); tte_clear_phys_bit(m, VTD_SW_W|VTD_W); vm_page_flag_clear(m, PG_WRITEABLE); } /* * Initialize the pmap associated with process 0. 
*/ void pmap_pinit0(pmap_t pmap) { PMAP_LOCK_INIT(pmap); pmap->pm_active = pmap->pm_tlbactive = ~0; pmap->pm_context = 0; pmap->pm_tsb_ra = kernel_pmap->pm_tsb_ra; pmap->pm_hash = kernel_pmap->pm_hash; critical_enter(); PCPU_SET(curpmap, pmap); critical_exit(); TAILQ_INIT(&pmap->pm_pvlist); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Initialize a preallocated and zeroed pmap structure, such as one in a * vmspace structure. */ void pmap_pinit(pmap_t pmap) { int i; pmap->pm_context = get_context(); pmap->pm_tsb_ra = vtophys(&pmap->pm_tsb); vm_page_lock_queues(); pmap->pm_hash = tte_hash_create(pmap->pm_context, &pmap->pm_hashscratch); tsb_init(&pmap->pm_tsb, &pmap->pm_tsbscratch, TSB_INIT_SHIFT); vm_page_unlock_queues(); pmap->pm_tsb_miss_count = pmap->pm_tsb_cap_miss_count = 0; pmap->pm_active = pmap->pm_tlbactive = 0; for (i = 0; i < TSB_MAX_RESIZE; i++) pmap->pm_old_tsb_ra[i] = 0; TAILQ_INIT(&pmap->pm_pvlist); PMAP_LOCK_INIT(pmap); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); } /* * Set the physical protection on the specified range of this map as requested. */ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) { int anychanged; vm_offset_t tva; uint64_t clearbits; DPRINTF("pmap_protect(0x%lx, 0x%lx, %d)\n", sva, eva, prot); if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) == (VM_PROT_WRITE|VM_PROT_EXECUTE)) return; clearbits = anychanged = 0; if ((prot & VM_PROT_WRITE) == 0) clearbits |= (VTD_W|VTD_SW_W); if ((prot & VM_PROT_EXECUTE) == 0) clearbits |= VTD_X; vm_page_lock_queues(); PMAP_LOCK(pmap); for (tva = sva; tva < eva; tva += PAGE_SIZE) { uint64_t otte_data; vm_page_t m; if ((otte_data = tte_hash_clear_bits(pmap->pm_hash, tva, clearbits)) == 0) continue; /* * XXX technically we should do a shootdown if it * was referenced and was executable - but is not now */ if (!anychanged && (otte_data & VTD_W)) anychanged = 1; if (otte_data & VTD_MANAGED) { m = NULL; if (otte_data & VTD_REF) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data)); vm_page_flag_set(m, PG_REFERENCED); } if (otte_data & VTD_W) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(otte_data)); vm_page_dirty(m); } } } vm_page_unlock_queues(); if (anychanged) pmap_invalidate_range(pmap, sva, eva, TRUE); PMAP_UNLOCK(pmap); } /* * Map a list of wired pages into kernel virtual address space. This is * intended for temporary mappings which do not need page modification or * references recorded. Existing mappings in the region are overwritten. */ void pmap_qenter(vm_offset_t sva, vm_page_t *m, int count) { vm_offset_t va; tte_t otte; otte = 0; va = sva; while (count-- > 0) { otte |= tte_hash_update(kernel_pmap->pm_hash, va, VM_PAGE_TO_PHYS(*m) | TTE_KERNEL | VTD_8K); va += PAGE_SIZE; m++; } if ((otte & VTD_REF) != 0) pmap_invalidate_range(kernel_pmap, sva, va, FALSE); } /* * Remove page mappings from kernel virtual address space. Intended for * temporary mappings entered by pmap_qenter. */ void pmap_qremove(vm_offset_t sva, int count) { vm_offset_t va; tte_t otte; va = sva; otte = 0; while (count-- > 0) { otte |= tte_hash_delete(kernel_pmap->pm_hash, va); va += PAGE_SIZE; } if ((otte & VTD_REF) != 0) pmap_invalidate_range(kernel_pmap, sva, va, TRUE); } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. 
*/ void pmap_release(pmap_t pmap) { KASSERT(pmap->pm_stats.resident_count == 0, ("pmap_release: pmap resident count %ld != 0", pmap->pm_stats.resident_count)); tsb_deinit(&pmap->pm_tsb); tte_hash_destroy(pmap->pm_hash); free_context(pmap->pm_context); PMAP_LOCK_DESTROY(pmap); } /* * Remove the given range of addresses from the specified map. */ void pmap_remove(pmap_t pmap, vm_offset_t start, vm_offset_t end) { int invlva; vm_offset_t tva; uint64_t tte_data; /* * Perform an unsynchronized read. This is, however, safe. */ if (pmap->pm_stats.resident_count == 0) return; DPRINTF("pmap_remove(start=0x%lx, end=0x%lx)\n", start, end); invlva = 0; vm_page_lock_queues(); PMAP_LOCK(pmap); for (tva = start; tva < end; tva += PAGE_SIZE) { if ((tte_data = tte_hash_delete(pmap->pm_hash, tva)) == 0) continue; pmap_remove_tte(pmap, tte_data, tva); if (tte_data & (VTD_REF|VTD_W)) invlva = 1; } vm_page_unlock_queues(); if (invlva) pmap_invalidate_range(pmap, start, end, TRUE); PMAP_UNLOCK(pmap); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ void pmap_remove_all(vm_page_t m) { pv_entry_t pv; uint64_t tte_data; DPRINTF("pmap_remove_all 0x%lx\n", VM_PAGE_TO_PHYS(m)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { PMAP_LOCK(pv->pv_pmap); pv->pv_pmap->pm_stats.resident_count--; tte_data = tte_hash_delete(pv->pv_pmap->pm_hash, pv->pv_va); if (tte_data & VTD_WIRED) pv->pv_pmap->pm_stats.wired_count--; if (tte_data & VTD_REF) vm_page_flag_set(m, PG_REFERENCED); /* * Update the vm_page_t clean and reference bits. 
*/ if (tte_data & VTD_W) { KASSERT((tte_data & VTD_SW_W), ("pmap_remove_all: modified page not writable: va: %lx, tte: %lx", pv->pv_va, tte_data)); vm_page_dirty(m); } pmap_invalidate_page(pv->pv_pmap, pv->pv_va, TRUE); TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; PMAP_UNLOCK(pv->pv_pmap); free_pv_entry(pv); } vm_page_flag_clear(m, PG_WRITEABLE); } static void pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va) { pv_entry_t pv; if (pmap != kernel_pmap) DPRINTF("pmap_remove_entry(va=0x%lx, pa=0x%lx)\n", va, VM_PAGE_TO_PHYS(m)); PMAP_LOCK_ASSERT(pmap, MA_OWNED); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->md.pv_list_count < pmap->pm_stats.resident_count) { TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) { if (pmap == pv->pv_pmap && va == pv->pv_va) break; } } else { TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) { if (va == pv->pv_va) break; } } KASSERT(pv != NULL, ("pmap_remove_entry: pv not found va=0x%lx pa=0x%lx", va, VM_PAGE_TO_PHYS(m))); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); m->md.pv_list_count--; if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); free_pv_entry(pv); } void pmap_remove_pages(pmap_t pmap) { vm_page_t m; pv_entry_t pv, npv; tte_t tte_data; DPRINTF("pmap_remove_pages(ctx=0x%lx)\n", pmap->pm_context); vm_page_lock_queues(); PMAP_LOCK(pmap); for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) { tte_data = tte_hash_delete(pmap->pm_hash, pv->pv_va); if (tte_data == 0) { printf("TTE IS ZERO @ VA %016lx\n", pv->pv_va); panic("bad tte"); } if (tte_data & VTD_WIRED) { panic("wired page in process not handled correctly"); pmap->pm_stats.wired_count--; } m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); pmap->pm_stats.resident_count--; if (tte_data & VTD_W) { vm_page_dirty(m); } npv = TAILQ_NEXT(pv, pv_plist); TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist); m->md.pv_list_count--; TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); if (TAILQ_EMPTY(&m->md.pv_list)) vm_page_flag_clear(m, PG_WRITEABLE); free_pv_entry(pv); } pmap->pm_hash = tte_hash_reset(pmap->pm_hash, &pmap->pm_hashscratch); if (0) pmap_tsb_reset(pmap); vm_page_unlock_queues(); pmap_invalidate_all(pmap); PMAP_UNLOCK(pmap); } static void pmap_tsb_reset(pmap_t pmap) { int i; for (i = 1; i < TSB_MAX_RESIZE && pmap->pm_old_tsb_ra[i]; i++) { pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(pmap->pm_old_tsb_ra[i]), (1 << (TSB_INIT_SHIFT + i))); pmap->pm_old_tsb_ra[i] = 0; } if (pmap->pm_old_tsb_ra[0] != 0) { vm_paddr_t tsb_pa = pmap->pm_tsb.hti_ra; int size = tsb_size(&pmap->pm_tsb); pmap->pm_tsb.hti_ntte = (1 << (TSB_INIT_SHIFT + PAGE_SHIFT - TTE_SHIFT)); pmap->pm_tsb.hti_ra = pmap->pm_old_tsb_ra[0]; pmap_free_contig_pages((void *)TLB_PHYS_TO_DIRECT(tsb_pa), size); pmap->pm_tsbscratch = pmap->pm_tsb.hti_ra | (uint64_t)TSB_INIT_SHIFT; pmap->pm_old_tsb_ra[0] = 0; } } void pmap_scrub_pages(vm_paddr_t pa, int64_t size) { uint64_t bytes_zeroed; while (size > 0) { hv_mem_scrub(pa, size, &bytes_zeroed); pa += bytes_zeroed; size -= bytes_zeroed; } } static void pmap_remove_tte(pmap_t pmap, tte_t tte_data, vm_offset_t va) { vm_page_t m; if (pmap != kernel_pmap) DPRINTF("pmap_remove_tte(va=0x%lx, pa=0x%lx)\n", va, TTE_GET_PA(tte_data)); mtx_assert(&vm_page_queue_mtx, MA_OWNED); PMAP_LOCK_ASSERT(pmap, MA_OWNED); if (tte_data & VTD_WIRED) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; if (tte_data & VTD_MANAGED) { m = PHYS_TO_VM_PAGE(TTE_GET_PA(tte_data)); if (tte_data & VTD_W) { 
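/* the hardware write bit was set, so the page was modified; record that in the vm_page before the mapping is torn down */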
vm_page_dirty(m); } if (tte_data & VTD_REF) vm_page_flag_set(m, PG_REFERENCED); pmap_remove_entry(pmap, m, va); } } /* resize the tsb if the number of capacity misses is greater than 1/4 of * the total */ static void pmap_tsb_resize(pmap_t pmap) { uint32_t miss_count; uint32_t cap_miss_count; struct tsb_resize_info info; hv_tsb_info_t hvtsb; uint64_t tsbscratch; KASSERT(pmap == curthread_pmap, ("operating on non-current pmap")); miss_count = pmap->pm_tsb_miss_count; cap_miss_count = pmap->pm_tsb_cap_miss_count; int npages_shift = tsb_page_shift(pmap); if (npages_shift < (TSB_INIT_SHIFT + TSB_MAX_RESIZE) && cap_miss_count > (miss_count >> 1)) { DPRINTF("resizing tsb for proc=%s pid=%d\n", curthread->td_proc->p_comm, curthread->td_proc->p_pid); pmap->pm_old_tsb_ra[npages_shift - TSB_INIT_SHIFT] = pmap->pm_tsb.hti_ra; /* double TSB size */ tsb_init(&hvtsb, &tsbscratch, npages_shift + 1); #ifdef SMP spinlock_enter(); /* reset tsb */ bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hv_tsb_info_t)); pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb); if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK) panic("failed to set TSB 0x%lx - context == %ld\n", pmap->pm_tsb_ra, pmap->pm_context); info.tri_tsbscratch = pmap->pm_tsbscratch; info.tri_tsb_ra = pmap->pm_tsb_ra; pmap_ipi(pmap, tl_tsbupdate, pmap->pm_context, vtophys(&info)); pmap->pm_tlbactive = pmap->pm_active; spinlock_exit(); #else bcopy(&hvtsb, &pmap->pm_tsb, sizeof(hvtsb)); if (hv_mmu_tsb_ctxnon0(1, pmap->pm_tsb_ra) != H_EOK) panic("failed to set TSB 0x%lx - context == %ld\n", pmap->pm_tsb_ra, pmap->pm_context); pmap->pm_tsbscratch = tsb_set_scratchpad_user(&pmap->pm_tsb); #endif } pmap->pm_tsb_miss_count = 0; pmap->pm_tsb_cap_miss_count = 0; } static void pmap_tte_hash_resize(pmap_t pmap) { tte_hash_t old_th = pmap->pm_hash; pmap->pm_hash = tte_hash_resize(pmap->pm_hash); spinlock_enter(); if (curthread->td_proc->p_numthreads != 1) pmap_ipi(pmap, tl_ttehashupdate, pmap->pm_context, pmap->pm_hashscratch); pmap->pm_hashscratch = tte_hash_set_scratchpad_user(pmap->pm_hash, pmap->pm_context); spinlock_exit(); tte_hash_destroy(old_th); } /* * pmap_ts_referenced: * * Return a count of reference bits for a page, clearing those bits. * It is not necessary for every reference bit to be cleared, but it * is necessary that 0 only be returned when there are truly no * reference bits set. * * XXX: The exact number of bits to check and clear is a matter that * should be tested and standardized at some point in the future for * optimal aging of shared pages. 
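 * (The implementation below stops once it has found and cleared the
 * reference bit on five mappings.)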
*/ int pmap_ts_referenced(vm_page_t m) { int rv; pv_entry_t pv, pvf, pvn; pmap_t pmap; tte_t otte_data; rv = 0; if (m->flags & PG_FICTITIOUS) return (rv); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) { pvf = pv; do { pvn = TAILQ_NEXT(pv, pv_list); TAILQ_REMOVE(&m->md.pv_list, pv, pv_list); TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list); pmap = pv->pv_pmap; PMAP_LOCK(pmap); otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, VTD_REF); if ((otte_data & VTD_REF) != 0) { pmap_invalidate_page(pmap, pv->pv_va, TRUE); rv++; if (rv > 4) { PMAP_UNLOCK(pmap); break; } } PMAP_UNLOCK(pmap); } while ((pv = pvn) != NULL && pv != pvf); } return (rv); } void pmap_zero_page(vm_page_t m) { hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE); } void pmap_zero_page_area(vm_page_t m, int off, int size) { vm_paddr_t pa; vm_offset_t va; pa = VM_PAGE_TO_PHYS(m); va = TLB_PHYS_TO_DIRECT(pa); if (off == 0 && size == PAGE_SIZE) hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE); else bzero((char *)(va + off), size); } void pmap_zero_page_idle(vm_page_t m) { hwblkclr((void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)), PAGE_SIZE); } void pmap_set_ctx_panic(uint64_t error, vm_paddr_t tsb_ra, pmap_t pmap) { panic("setting ctxnon0 failed ctx=0x%lx hvtsb_ra=0x%lx tsbscratch=0x%lx error=0x%lx", pmap->pm_context, tsb_ra, pmap->pm_tsbscratch, error); } Index: head/sys/sun4v/sun4v/tsb.c =================================================================== --- head/sys/sun4v/sun4v/tsb.c (revision 169666) +++ head/sys/sun4v/sun4v/tsb.c (revision 169667) @@ -1,284 +1,284 @@ /*- * Copyright (c) 2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_pmap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(tte_t) == sizeof(uint64_t)); #define TSB_MASK(tsb) ((tsb->hti_ntte) - 1) /* make TSB start off at the same size as the hash */ #define TSB_SIZE 8 #ifdef DEBUG_TSB #define DPRINTF printf #else #define DPRINTF(...) 
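/* with DEBUG_TSB undefined, DPRINTF() expands to nothing */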
#endif void tsb_sysinit(void); void tsb_init(hv_tsb_info_t *hvtsb, uint64_t *scratchval, uint64_t page_shift) { void *ptr; int npages = (1 << page_shift); ptr = pmap_alloc_zeroed_contig_pages(npages, npages*PAGE_SIZE); if ((((uint64_t)ptr) & (npages*PAGE_SIZE - 1)) != 0) panic("vm_page_alloc_contig allocated unaligned pages: %p", ptr); hvtsb->hti_idxpgsz = TTE8K; hvtsb->hti_assoc = 1; hvtsb->hti_ntte = (npages*PAGE_SIZE >> TTE_SHIFT); hvtsb->hti_ctx_index = -1; /* TSBs aren't shared so if we don't * set the context in the TTEs we can * simplify miss handling slightly */ hvtsb->hti_pgszs = TSB8K; hvtsb->hti_rsvd = 0; hvtsb->hti_ra = TLB_DIRECT_TO_PHYS((vm_offset_t)ptr); *scratchval = ((uint64_t) ptr) | page_shift; } void tsb_deinit(hv_tsb_info_t *hvtsb) { vm_page_t m, tm; int i; m = PHYS_TO_VM_PAGE((vm_paddr_t)hvtsb->hti_ra); for (i = 0, tm = m; i < TSB_SIZE; i++, m++) { tm->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); vm_page_free(tm); } } void tsb_assert_invalid(hv_tsb_info_t *tsb, vm_offset_t va) { vm_paddr_t tsb_load_pa; uint64_t tsb_index, tsb_shift, tte_tag, tte_data; tsb_shift = TTE_PAGE_SHIFT(tsb->hti_idxpgsz); tsb_index = (va >> tsb_shift) & TSB_MASK(tsb); tsb_load_pa = tsb->hti_ra + 2*tsb_index*sizeof(uint64_t); load_real_dw(tsb_load_pa, &tte_tag, &tte_data); if (tte_tag == 0 && tte_data == 0) return; printf("tsb_shift=0x%lx tsb_index=0x%lx\n", tsb_shift, tsb_index); printf("tte_tag=0x%lx tte_data=0x%lx TSB_MASK=%lx\n", tte_tag, tte_data, (uint64_t)TSB_MASK(tsb)); panic("non-zero entry found where not expected"); } void tsb_set_tte_real(hv_tsb_info_t *tsb, vm_offset_t index_va, vm_offset_t tag_va, uint64_t tte_data, uint64_t ctx) { vm_paddr_t tsb_store_pa; uint64_t tsb_index, tsb_shift, tte_tag; DPRINTF("tsb_set_tte index_va: 0x%lx tag_va: 0x%lx idxpgsz: %x ", index_va, tag_va, tsb->hti_idxpgsz); tsb_shift = TTE_PAGE_SHIFT(tsb->hti_idxpgsz); tsb_index = (index_va >> tsb_shift) & TSB_MASK(tsb); DPRINTF("tsb_index_absolute: 0x%lx tsb_index: 0x%lx\n", (index_va >> tsb_shift), tsb_index); tsb_store_pa = tsb->hti_ra + 2*tsb_index*sizeof(uint64_t); /* store new value with valid bit cleared * to avoid invalid intermediate value; */ store_real(tsb_store_pa + sizeof(uint64_t), tte_data); tte_tag = (ctx << TTARGET_CTX_SHIFT) | (tag_va >> TTARGET_VA_SHIFT); store_real(tsb_store_pa, tte_tag); } void tsb_set_tte(hv_tsb_info_t *tsb, vm_offset_t va, uint64_t tte_data, uint64_t ctx) { uint64_t tsb_index, tsb_shift, tte_tag; tte_t *entry; tsb_shift = TTE_PAGE_SHIFT(tsb->hti_idxpgsz); tsb_index = (va >> tsb_shift) & TSB_MASK(tsb); entry = (tte_t *)TLB_PHYS_TO_DIRECT(tsb->hti_ra + 2*tsb_index*sizeof(uint64_t)); tte_tag = (ctx << TTARGET_CTX_SHIFT) | (va >> TTARGET_VA_SHIFT); /* store new value with valid bit cleared * to avoid invalid intermediate value; */ *(entry + 1) = 0; membar(StoreLoad); *(entry) = tte_tag; *(entry + 1) = tte_data; membar(Sync); } void tsb_clear(hv_tsb_info_t *tsb) { hwblkclr((void *)TLB_PHYS_TO_DIRECT(tsb->hti_ra), tsb->hti_ntte << TTE_SHIFT); } void tsb_clear_tte(hv_tsb_info_t *tsb, vm_offset_t va) { tte_t *entry; uint64_t tsb_index, tsb_shift; tsb_shift = TTE_PAGE_SHIFT(tsb->hti_idxpgsz); tsb_index = (va >> tsb_shift) & TSB_MASK(tsb); entry = (tte_t *)TLB_PHYS_TO_DIRECT(tsb->hti_ra + 2*tsb_index*sizeof(uint64_t)); *(entry + 1) = 0; membar(Sync); } void tsb_clear_range(hv_tsb_info_t *tsb, vm_offset_t sva, vm_offset_t eva) { vm_offset_t tva; uint64_t tsb_index, tsb_shift, tsb_mask; tte_t *entry; tsb_mask = TSB_MASK(tsb); tsb_shift = 
TTE_PAGE_SHIFT(tsb->hti_idxpgsz); for (tva = sva; tva < eva; tva += PAGE_SIZE) { tsb_index = (tva >> tsb_shift) & tsb_mask; entry = (tte_t *)TLB_PHYS_TO_DIRECT(tsb->hti_ra + 2*tsb_index*sizeof(uint64_t)); *(entry + 1) = 0; } membar(Sync); } tte_t tsb_get_tte(hv_tsb_info_t *tsb, vm_offset_t va) { tte_t *entry; uint64_t tsb_index, tsb_shift, tte_tag, tte_data; tsb_shift = TTE_PAGE_SHIFT(tsb->hti_idxpgsz); tsb_index = (va >> tsb_shift) & TSB_MASK(tsb); entry = (tte_t *)TLB_PHYS_TO_DIRECT(tsb->hti_ra + 2*tsb_index*sizeof(uint64_t)); tte_tag = *(entry); tte_data = *(entry + 1); if ((tte_tag << TTARGET_VA_SHIFT) == (va & ~PAGE_MASK_4M)) return tte_data; return (0UL); } tte_t tsb_lookup_tte(vm_offset_t va, uint64_t ctx) { tte_t tte_data; tte_data = 0; if ((tte_data = tsb_get_tte(&kernel_td[TSB4M_INDEX], va)) != 0) goto done; /* * handle user data */ done: return tte_data; } uint64_t tsb_set_scratchpad_kernel(hv_tsb_info_t *tsb) { uint64_t tsb_shift, tsb_scratch; tsb_shift = ffs(tsb->hti_ntte >> (PAGE_SHIFT - TTE_SHIFT)) - 1; tsb_scratch = TLB_PHYS_TO_DIRECT(tsb->hti_ra) | tsb_shift; set_tsb_kernel_scratchpad(tsb_scratch); membar(Sync); return tsb_scratch; } uint64_t tsb_set_scratchpad_user(hv_tsb_info_t *tsb) { uint64_t tsb_shift, tsb_scratch; tsb_shift = ffs(tsb->hti_ntte >> (PAGE_SHIFT - TTE_SHIFT)) - 1; tsb_scratch = TLB_PHYS_TO_DIRECT(tsb->hti_ra) | tsb_shift; set_tsb_user_scratchpad(tsb_scratch); membar(Sync); return tsb_scratch; } int tsb_size(hv_tsb_info_t *hvtsb) { return (hvtsb->hti_ntte >> (PAGE_SHIFT - TTE_SHIFT)); } int tsb_page_shift(pmap_t pmap) { return (pmap->pm_tsbscratch & PAGE_MASK); } Index: head/sys/sun4v/sun4v/tte_hash.c =================================================================== --- head/sys/sun4v/sun4v/tte_hash.c (revision 169666) +++ head/sys/sun4v/sun4v/tte_hash.c (revision 169667) @@ -1,656 +1,656 @@ /*- * Copyright (c) 2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #ifdef DEBUG #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HASH_SIZE (1 << HASH_ENTRY_SHIFT) #define HASH_MASK(th) ((1<<(th->th_shift+PAGE_SHIFT-THE_SHIFT))-1) #define NULL_TAG 0 #define MAGIC_VALUE 0xcafebabe struct tte_hash_entry; struct of_field; #define MAX_FRAGMENT_ENTRIES ((PAGE_SIZE / sizeof(struct tte_hash_entry)) - 1) typedef struct tte_hash_field_ { uint64_t tag; uint64_t data; } tte_hash_field, *tte_hash_field_t; struct of_field { int16_t count; uint8_t lock; uint8_t pad; uint32_t flags; struct tte_hash_entry *next; }; typedef struct tte_hash_entry { tte_hash_field the_fields[HASH_ENTRIES]; struct of_field of; } *tte_hash_entry_t; struct fragment_header { struct tte_hash_fragment *fh_next; uint8_t fh_count; uint8_t fh_free_head; uint8_t pad[sizeof(struct tte_hash_entry) - 10]; }; CTASSERT(sizeof(struct fragment_header) == sizeof(struct tte_hash_entry)); SLIST_HEAD(tte_hash_list, tte_hash); struct tte_hash_list hash_free_list[PAGE_SHIFT]; struct tte_hash { uint16_t th_shift; /* effective size in pages */ uint16_t th_context; /* TLB context */ uint32_t th_entries; /* # pages held */ tte_hash_entry_t th_hashtable; /* hash of TTEs */ struct tte_hash_fragment *th_fhhead; struct tte_hash_fragment *th_fhtail; SLIST_ENTRY(tte_hash) th_next; }; struct tte_hash_fragment { struct fragment_header thf_head; struct tte_hash_entry thf_entries[MAX_FRAGMENT_ENTRIES]; }; CTASSERT(sizeof(struct tte_hash_fragment) == PAGE_SIZE); static struct tte_hash kernel_tte_hash; /* * Data for the tte_hash allocation mechanism */ static uma_zone_t thzone; static struct vm_object thzone_obj; static int tte_hash_count = 0, tte_hash_max = 0; extern uint64_t hash_bucket_lock(tte_hash_field_t fields); extern void hash_bucket_unlock(tte_hash_field_t fields, uint64_t s); static tte_hash_t get_tte_hash(void) { tte_hash_t th; th = uma_zalloc(thzone, M_NOWAIT); KASSERT(th != NULL, ("tte_hash allocation failed")); tte_hash_count++; return th; } static __inline void free_tte_hash(tte_hash_t th) { tte_hash_count--; uma_zfree(thzone, th); } static tte_hash_t tte_hash_cached_get(int shift) { tte_hash_t th; struct tte_hash_list *head; th = NULL; head = &hash_free_list[shift]; if (!SLIST_EMPTY(head)) { th = SLIST_FIRST(head); SLIST_REMOVE_HEAD(head, th_next); } return (th); } static void tte_hash_cached_free(tte_hash_t th) { th->th_context = 0xffff; SLIST_INSERT_HEAD(&hash_free_list[th->th_shift - HASH_ENTRY_SHIFT], th, th_next); } void tte_hash_init(void) { int i; thzone = uma_zcreate("TTE_HASH", sizeof(struct tte_hash), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM | UMA_ZONE_NOFREE); tte_hash_max = maxproc; uma_zone_set_obj(thzone, &thzone_obj, tte_hash_max); for (i = 0; i < PAGE_SHIFT; i++) SLIST_INIT(&hash_free_list[i]); } tte_hash_t tte_hash_kernel_create(vm_offset_t va, uint16_t shift, vm_paddr_t fragment_page) { tte_hash_t th; th = &kernel_tte_hash; th->th_shift = shift; th->th_entries = 0; th->th_context = 0; th->th_hashtable = (tte_hash_entry_t)va; th->th_fhtail = th->th_fhhead = (void *)TLB_PHYS_TO_DIRECT(fragment_page); return (th); } static inline void * alloc_zeroed_page(void) { vm_page_t m; static int color; void *ptr; m = NULL; while (m == NULL) { m = vm_page_alloc(NULL, color++, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO); if (m == NULL) VM_WAIT; } if 
((m->flags & PG_ZERO) == 0) pmap_zero_page(m); ptr = (void *)TLB_PHYS_TO_DIRECT(VM_PAGE_TO_PHYS(m)); return (ptr); } static inline void free_fragment_pages(void *ptr) { struct tte_hash_fragment *fh; vm_page_t m; for (fh = ptr; fh != NULL; fh = fh->thf_head.fh_next) { m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS((vm_offset_t)fh)); m->wire_count--; - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); vm_page_free(m); } } static inline tte_hash_t _tte_hash_create(uint64_t context, uint64_t *scratchval, uint16_t shift) { tte_hash_t th; th = get_tte_hash(); th->th_shift = shift; th->th_entries = 0; th->th_context = (uint16_t)context; th->th_hashtable = pmap_alloc_zeroed_contig_pages((1 << shift), PAGE_SIZE); th->th_fhtail = th->th_fhhead = alloc_zeroed_page(); KASSERT(th->th_fhtail != NULL, ("th->th_fhtail == NULL")); if (scratchval) *scratchval = (uint64_t)((vm_offset_t)th->th_hashtable) | ((vm_offset_t)(1 << shift)); return (th); } tte_hash_t tte_hash_create(uint64_t context, uint64_t *scratchval) { return (_tte_hash_create(context, scratchval, HASH_ENTRY_SHIFT)); } void tte_hash_destroy(tte_hash_t th) { tte_hash_cached_free(th); } static void _tte_hash_reset(tte_hash_t th) { free_fragment_pages(th->th_fhhead->thf_head.fh_next); th->th_fhtail = th->th_fhhead; hwblkclr(th->th_fhhead, PAGE_SIZE); #if 0 if (th->th_entries != 0) #endif hwblkclr(th->th_hashtable, (1 << (th->th_shift + PAGE_SHIFT))); th->th_entries = 0; } tte_hash_t tte_hash_reset(tte_hash_t th, uint64_t *scratchval) { tte_hash_t newth; if (th->th_shift != HASH_ENTRY_SHIFT && (newth = tte_hash_cached_get(0)) != NULL) { newth->th_context = th->th_context; tte_hash_cached_free(th); *scratchval = (uint64_t)((vm_offset_t)newth->th_hashtable) | ((vm_offset_t)HASH_SIZE); } else { newth = th; } _tte_hash_reset(newth); return (newth); } static __inline void tte_hash_set_field(tte_hash_field_t field, uint64_t tag, tte_t tte) { field->tag = tag; field->data = tte | (field->data & VTD_LOCK); } static __inline tte_hash_entry_t find_entry(tte_hash_t th, vm_offset_t va, int page_shift) { uint64_t hash_index; hash_index = (va >> page_shift) & HASH_MASK(th); return (&th->th_hashtable[hash_index]); } static __inline tte_hash_entry_t tte_hash_lookup_last_entry(tte_hash_entry_t entry) { while (entry->of.next) entry = entry->of.next; return (entry); } static tte_hash_entry_t tte_hash_allocate_fragment_entry(tte_hash_t th) { struct tte_hash_fragment *fh; tte_hash_entry_t newentry; fh = th->th_fhtail; if (fh->thf_head.fh_free_head == MAX_FRAGMENT_ENTRIES) { fh = th->th_fhtail = fh->thf_head.fh_next = alloc_zeroed_page(); fh->thf_head.fh_free_head = 1; #ifdef NOISY_DEBUG printf("new fh=%p \n", fh); #endif } newentry = &fh->thf_entries[fh->thf_head.fh_free_head]; fh->thf_head.fh_free_head++; fh->thf_head.fh_count++; return (newentry); } /* * if a match for va is found the tte value is returned * and if field is non-null field will point to that entry * * */ static __inline tte_t _tte_hash_lookup(tte_hash_entry_t entry, tte_t tte_tag, tte_hash_field_t *field) { int i; tte_t tte_data; tte_hash_field_t fields; tte_data = 0; do { fields = entry->the_fields; for (i = 0; i < entry->of.count; i++) { if (fields[i].tag == tte_tag) { tte_data = (fields[i].data & ~VTD_LOCK); *field = &fields[i]; goto done; } } #ifdef DEBUG if (entry->of.next && entry->of.flags != MAGIC_VALUE) panic("overflow pointer not null without flags set entry= %p next=%p flags=0x%x count=%d", entry, entry->of.next, entry->of.flags, entry->of.count); #endif entry = entry->of.next;
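/* no match in this entry; follow the bucket's overflow chain, if any */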
} while (entry); done: return (tte_data); } static __inline void _tte_hash_lookup_last(tte_hash_entry_t entry, tte_hash_field_t *field) { tte_hash_field_t fields; fields = entry->the_fields; while (entry->of.next && (entry->of.next->of.count > 1)) entry = entry->of.next; if (entry->of.next && entry->of.next->of.count == 1) { *field = &entry->of.next->the_fields[0]; entry->of.next = NULL; entry->of.flags = 0; } else { #ifdef DEBUG if (entry->of.count == 0) panic("count zero"); #endif *field = &entry->the_fields[--entry->of.count]; } } tte_t tte_hash_clear_bits(tte_hash_t th, vm_offset_t va, uint64_t flags) { uint64_t s; tte_hash_entry_t entry; tte_t otte_data, tte_tag; tte_hash_field_t field = NULL; /* XXX - only handle 8K pages for now */ entry = find_entry(th, va, PAGE_SHIFT); tte_tag = (((uint64_t)th->th_context << TTARGET_CTX_SHIFT)|(va >> TTARGET_VA_SHIFT)); s = hash_bucket_lock(entry->the_fields); if((otte_data = _tte_hash_lookup(entry, tte_tag, &field)) != 0) tte_hash_set_field(field, field->tag, field->data & ~flags); hash_bucket_unlock(entry->the_fields, s); return (otte_data); } tte_t tte_hash_delete(tte_hash_t th, vm_offset_t va) { uint64_t s; tte_hash_entry_t entry; tte_t tte_data, tte_tag; tte_hash_field_t lookup_field = NULL; tte_hash_field_t last_field = NULL; /* XXX - only handle 8K pages for now */ entry = find_entry(th, va, PAGE_SHIFT); tte_tag = (((uint64_t)th->th_context << TTARGET_CTX_SHIFT)|(va >> TTARGET_VA_SHIFT)); s = hash_bucket_lock(entry->the_fields); if ((tte_data = _tte_hash_lookup(entry, tte_tag, &lookup_field)) == 0) goto done; _tte_hash_lookup_last(entry, &last_field); #ifdef DEBUG if (last_field->tag == 0) { hash_bucket_unlock(entry->the_fields, s); panic("lookup_last failed for va=0x%lx\n", va); } #endif /* move last field's values in to the field we are deleting */ if (lookup_field != last_field) tte_hash_set_field(lookup_field, last_field->tag, last_field->data); tte_hash_set_field(last_field, 0, 0); done: hash_bucket_unlock(entry->the_fields, s); if (tte_data) th->th_entries--; return (tte_data); } static __inline int tte_hash_insert_locked(tte_hash_t th, tte_hash_entry_t entry, uint64_t tte_tag, tte_t tte_data) { tte_hash_entry_t lentry; lentry = tte_hash_lookup_last_entry(entry); if (lentry->of.count == HASH_ENTRIES) return -1; tte_hash_set_field(&lentry->the_fields[lentry->of.count++], tte_tag, tte_data); th->th_entries++; return (0); } static __inline void tte_hash_extend_locked(tte_hash_t th, tte_hash_entry_t entry, tte_hash_entry_t newentry, uint64_t tte_tag, tte_t tte_data) { tte_hash_entry_t lentry; lentry = tte_hash_lookup_last_entry(entry); lentry->of.flags = MAGIC_VALUE; lentry->of.next = newentry; tte_hash_set_field(&newentry->the_fields[newentry->of.count++], tte_tag, tte_data); th->th_entries++; } void tte_hash_insert(tte_hash_t th, vm_offset_t va, tte_t tte_data) { tte_hash_entry_t entry, newentry; tte_t tte_tag; uint64_t s; int retval; #ifdef DEBUG if (tte_hash_lookup(th, va) != 0) panic("mapping for va=0x%lx already exists", va); #endif entry = find_entry(th, va, PAGE_SHIFT); /* should actually be a function of tte_data */ tte_tag = (((uint64_t)th->th_context << TTARGET_CTX_SHIFT)|(va >> TTARGET_VA_SHIFT)); s = hash_bucket_lock(entry->the_fields); retval = tte_hash_insert_locked(th, entry, tte_tag, tte_data); hash_bucket_unlock(entry->the_fields, s); if (retval == -1) { newentry = tte_hash_allocate_fragment_entry(th); s = hash_bucket_lock(entry->the_fields); tte_hash_extend_locked(th, entry, newentry, tte_tag, tte_data); 
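/* the primary bucket was full; the mapping now lives in a freshly allocated fragment entry chained off that bucket */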
hash_bucket_unlock(entry->the_fields, s); } #ifdef DEBUG if (tte_hash_lookup(th, va) == 0) panic("insert for va=0x%lx failed", va); #endif } /* * If leave_locked is true the tte's data field will be returned to * the caller with the hash bucket left locked */ tte_t tte_hash_lookup(tte_hash_t th, vm_offset_t va) { uint64_t s; tte_hash_entry_t entry; tte_t tte_data, tte_tag; tte_hash_field_t field = NULL; /* XXX - only handle 8K pages for now */ entry = find_entry(th, va, PAGE_SHIFT); tte_tag = (((uint64_t)th->th_context << TTARGET_CTX_SHIFT)|(va >> TTARGET_VA_SHIFT)); s = hash_bucket_lock(entry->the_fields); tte_data = _tte_hash_lookup(entry, tte_tag, &field); hash_bucket_unlock(entry->the_fields, s); return (tte_data); } uint64_t tte_hash_set_scratchpad_kernel(tte_hash_t th) { uint64_t hash_scratch; /* This breaks if a hash table grows above 32MB */ hash_scratch = ((vm_offset_t)th->th_hashtable) | ((vm_offset_t)(1<th_shift)); set_hash_kernel_scratchpad(hash_scratch); return (hash_scratch); } uint64_t tte_hash_set_scratchpad_user(tte_hash_t th, uint64_t context) { uint64_t hash_scratch; /* This breaks if a hash table grows above 32MB */ th->th_context = (uint16_t)context; hash_scratch = ((vm_offset_t)th->th_hashtable) | ((vm_offset_t)(1<th_shift)); set_hash_user_scratchpad(hash_scratch); return (hash_scratch); } tte_t tte_hash_update(tte_hash_t th, vm_offset_t va, tte_t tte_data) { uint64_t s; tte_hash_entry_t entry; tte_t otte_data, tte_tag; tte_hash_field_t field = NULL; entry = find_entry(th, va, PAGE_SHIFT); /* should actualy be a function of tte_data */ tte_tag = (((uint64_t)th->th_context << TTARGET_CTX_SHIFT)|(va >> TTARGET_VA_SHIFT)); s = hash_bucket_lock(entry->the_fields); otte_data = _tte_hash_lookup(entry, tte_tag, &field); if (otte_data == 0) { hash_bucket_unlock(entry->the_fields, s); tte_hash_insert(th, va, tte_data); } else { tte_hash_set_field(field, tte_tag, tte_data); hash_bucket_unlock(entry->the_fields, s); } return (otte_data); } /* * resize when the average entry has a full fragment entry */ int tte_hash_needs_resize(tte_hash_t th) { return ((th->th_entries > (1 << (th->th_shift + PAGE_SHIFT - TTE_SHIFT + 1))) && (th != &kernel_tte_hash)); } tte_hash_t tte_hash_resize(tte_hash_t th) { int i, j, nentries; tte_hash_t newth; tte_hash_entry_t src_entry, dst_entry, newentry; KASSERT(th != &kernel_tte_hash,("tte_hash_resize not supported for this pmap")); if ((newth = tte_hash_cached_get((th->th_shift - HASH_ENTRY_SHIFT) + 1)) != NULL) { newth->th_context = th->th_context; _tte_hash_reset(newth); } else { newth = _tte_hash_create(th->th_context, NULL, (th->th_shift + 1)); } nentries = (1 << (th->th_shift + PAGE_SHIFT - THE_SHIFT)); for (i = 0; i < nentries; i++) { tte_hash_field_t fields; src_entry = (&th->th_hashtable[i]); do { fields = src_entry->the_fields; for (j = 0; j < src_entry->of.count; j++) { int shift = TTARGET_VA_SHIFT - PAGE_SHIFT; uint64_t index = ((fields[j].tag<th_hashtable[index]); if (tte_hash_insert_locked(newth, dst_entry, fields[j].tag, fields[j].data) == -1) { newentry = tte_hash_allocate_fragment_entry(newth); tte_hash_extend_locked(newth, dst_entry, newentry, fields[j].tag, fields[j].data); } } src_entry = src_entry->of.next; } while (src_entry); } KASSERT(th->th_entries == newth->th_entries, ("not all entries copied old=%d new=%d", th->th_entries, newth->th_entries)); return (newth); } Index: head/sys/sys/vmmeter.h =================================================================== --- head/sys/sys/vmmeter.h (revision 169666) +++ 
head/sys/sys/vmmeter.h (revision 169667) @@ -1,210 +1,225 @@ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vmmeter.h 8.2 (Berkeley) 7/10/94 * $FreeBSD$ */ #ifndef _SYS_VMMETER_H_ #define _SYS_VMMETER_H_ /* * System wide statistics counters. */ struct vmmeter { /* * General system activity. */ u_int v_swtch; /* context switches */ u_int v_trap; /* calls to trap */ u_int v_syscall; /* calls to syscall() */ u_int v_intr; /* device interrupts */ u_int v_soft; /* software interrupts */ /* * Virtual memory activity. */ u_int v_vm_faults; /* number of address memory faults */ u_int v_cow_faults; /* number of copy-on-writes */ u_int v_cow_optim; /* number of optimized copy-on-writes */ u_int v_zfod; /* pages zero filled on demand */ u_int v_ozfod; /* optimized zero fill pages */ u_int v_swapin; /* swap pager pageins */ u_int v_swapout; /* swap pager pageouts */ u_int v_swappgsin; /* swap pager pages paged in */ u_int v_swappgsout; /* swap pager pages paged out */ u_int v_vnodein; /* vnode pager pageins */ u_int v_vnodeout; /* vnode pager pageouts */ u_int v_vnodepgsin; /* vnode_pager pages paged in */ u_int v_vnodepgsout; /* vnode pager pages paged out */ u_int v_intrans; /* intransit blocking page faults */ u_int v_reactivated; /* number of pages reactivated from free list */ u_int v_pdwakeups; /* number of times daemon has awaken from sleep */ u_int v_pdpages; /* number of pages analyzed by daemon */ u_int v_dfree; /* pages freed by daemon */ u_int v_pfree; /* pages freed by exiting processes */ u_int v_tfree; /* total pages freed */ /* * Distribution of page usages. 
*/ u_int v_page_size; /* page size in bytes */ u_int v_page_count; /* total number of pages in system */ u_int v_free_reserved; /* number of pages reserved for deadlock */ u_int v_free_target; /* number of pages desired free */ u_int v_free_min; /* minimum number of pages desired free */ u_int v_free_count; /* number of pages free */ u_int v_wire_count; /* number of pages wired down */ u_int v_active_count; /* number of pages active */ u_int v_inactive_target; /* number of pages desired inactive */ u_int v_inactive_count; /* number of pages inactive */ u_int v_cache_count; /* number of pages on buffer cache queue */ u_int v_cache_min; /* min number of pages desired on cache queue */ u_int v_cache_max; /* max number of pages in cached obj */ u_int v_pageout_free_min; /* min number pages reserved for kernel */ u_int v_interrupt_free_min; /* reserved number of pages for int code */ u_int v_free_severe; /* severe depletion of pages below this pt */ /* * Fork/vfork/rfork activity. */ u_int v_forks; /* number of fork() calls */ u_int v_vforks; /* number of vfork() calls */ u_int v_rforks; /* number of rfork() calls */ u_int v_kthreads; /* number of fork() calls by kernel */ u_int v_forkpages; /* number of VM pages affected by fork() */ u_int v_vforkpages; /* number of VM pages affected by vfork() */ u_int v_rforkpages; /* number of VM pages affected by rfork() */ u_int v_kthreadpages; /* number of VM pages affected by fork() by kernel */ }; #ifdef _KERNEL -extern struct vmmeter cnt; +extern volatile struct vmmeter cnt; +#define VMCNT __DEVOLATILE(struct vmmeter *, &cnt) +#define VMCNT_SET(member, val) \ + atomic_store_rel_int(__CONCAT(&cnt.v_, member), val) +#define VMCNT_ADD(member, val) \ + atomic_add_int(__CONCAT(&cnt.v_, member), val) +#define VMCNT_DEC(member, val) \ + atomic_subtract_int(__CONCAT(&cnt.v_, member), val) +#define VMCNT_GET(member) (__CONCAT(cnt.v_, member)) +#define VMCNT_PTR(member) \ + __DEVOLATILE(u_int *, __CONCAT(&cnt.v_, member)) + /* * Return TRUE if we are under our reserved low-free-pages threshold */ static __inline int vm_page_count_reserved(void) { - return (cnt.v_free_reserved > (cnt.v_free_count + cnt.v_cache_count)); + return (VMCNT_GET(free_reserved) > (VMCNT_GET(free_count) + + VMCNT_GET(cache_count))); } /* * Return TRUE if we are under our severe low-free-pages threshold * * This routine is typically used at the user<->system interface to determine * whether we need to block in order to avoid a low memory deadlock. */ static __inline int vm_page_count_severe(void) { - return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count)); + return (VMCNT_GET(free_severe) > (VMCNT_GET(free_count) + + VMCNT_GET(cache_count))); } /* * Return TRUE if we are under our minimum low-free-pages threshold. * * This routine is typically used within the system to determine whether * we can execute potentially very expensive code in terms of memory. It * is also used by the pageout daemon to calculate when to sleep, when * to wake waiters up, and when (after making a pass) to become more * desparate. */ static __inline int vm_page_count_min(void) { - return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count)); + return (VMCNT_GET(free_min) > (VMCNT_GET(free_count) + + VMCNT_GET(cache_count))); } /* * Return TRUE if we have not reached our free page target during * free page recovery operations. 
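 * (That is, the sum of free and cached pages is still below v_free_target.)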
*/ static __inline int vm_page_count_target(void) { - return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count)); + return (VMCNT_GET(free_target) > (VMCNT_GET(free_count) + + VMCNT_GET(cache_count))); } /* * Return the number of pages we need to free-up or cache * A positive number indicates that we do not have enough free pages. */ static __inline int vm_paging_target(void) { return ( - (cnt.v_free_target + cnt.v_cache_min) - - (cnt.v_free_count + cnt.v_cache_count) + (VMCNT_GET(free_target) + VMCNT_GET(cache_min)) - + (VMCNT_GET(free_count) + VMCNT_GET(cache_count)) ); } /* * Returns TRUE if the pagedaemon needs to be woken up. */ static __inline int vm_paging_needed(void) { return ( - (cnt.v_free_reserved + cnt.v_cache_min) > - (cnt.v_free_count + cnt.v_cache_count) + (VMCNT_GET(free_reserved) + VMCNT_GET(cache_min)) > + (VMCNT_GET(free_count) + VMCNT_GET(cache_count)) ); } #endif /* systemwide totals computed every five seconds */ struct vmtotal { int16_t t_rq; /* length of the run queue */ int16_t t_dw; /* jobs in ``disk wait'' (neg priority) */ int16_t t_pw; /* jobs in page wait */ int16_t t_sl; /* jobs sleeping in core */ int16_t t_sw; /* swapped out runnable/short block jobs */ int32_t t_vm; /* total virtual memory */ int32_t t_avm; /* active virtual memory */ int32_t t_rm; /* total real memory in use */ int32_t t_arm; /* active real memory */ int32_t t_vmshr; /* shared virtual memory */ int32_t t_avmshr; /* active shared virtual memory */ int32_t t_rmshr; /* shared real memory */ int32_t t_armshr; /* active shared real memory */ int32_t t_free; /* free memory pages */ }; #endif Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 169666) +++ head/sys/vm/swap_pager.c (revision 169667) @@ -1,2546 +1,2546 @@ /*- * Copyright (c) 1998 Matthew Dillon, * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * New Swap System * Matthew Dillon * * Radix Bitmap 'blists'. * * - The new swapper uses the new radix bitmap code. This should scale * to arbitrarily small or arbitrarily large swap spaces and an almost * arbitrary degree of fragmentation. * * Features: * * - on the fly reallocation of swap during putpages. The new system * does not try to keep previously allocated swap blocks for dirty * pages. * * - on the fly deallocation of swap * * - No more garbage collection required. Unnecessarily allocated swap * blocks only exist for dirty vm_page_t's now and these are already * cycled (in a high-load system) by the pager. We also do on-the-fly * removal of invalidated swap blocks when a page is destroyed * or renamed. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include "opt_swap.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, or 16 * pages per allocation. We recommend you stick with the default of 8. * The 16-page limit is due to the radix code (kern/subr_blist.c). */ #ifndef MAX_PAGEOUT_CLUSTER #define MAX_PAGEOUT_CLUSTER 16 #endif #if !defined(SWB_NPAGES) #define SWB_NPAGES MAX_PAGEOUT_CLUSTER #endif /* * Piecemeal swap metadata structure. Swap is stored in a radix tree. * * If SWB_NPAGES is 8 and sizeof(char *) == sizeof(daddr_t), our radix * is basically 8. Assuming PAGE_SIZE == 4096, one tree level represents * 32K worth of data, two levels represent 256K, three levels represent * 2 MBytes. This is acceptable. * * Overall memory utilization is about the same as the old swap structure. 
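 * (Worked out under those assumptions: 8 pages * 4096 bytes = 32K per
 * level, 8 * 32K = 256K, 8 * 256K = 2 MBytes.)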
*/ #define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t)) #define SWAP_META_PAGES (SWB_NPAGES * 2) #define SWAP_META_MASK (SWAP_META_PAGES - 1) struct swblock { struct swblock *swb_hnext; vm_object_t swb_object; vm_pindex_t swb_index; int swb_count; daddr_t swb_pages[SWAP_META_PAGES]; }; static struct mtx sw_dev_mtx; static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq); static struct swdevt *swdevhd; /* Allocate from here next */ static int nswapdev; /* Number of swap devices */ int swap_pager_avail; static int swdev_syscall_active = 0; /* serialize swap(on|off) */ static void swapdev_strategy(struct buf *, struct swdevt *sw); #define SWM_FREE 0x02 /* free, period */ #define SWM_POP 0x04 /* pop out */ int swap_pager_full = 2; /* swap space exhaustion (task killing) */ static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/ static int nsw_rcount; /* free read buffers */ static int nsw_wcount_sync; /* limit write buffers / synchronous */ static int nsw_wcount_async; /* limit write buffers / asynchronous */ static int nsw_wcount_async_max;/* assigned maximum */ static int nsw_cluster_max; /* maximum VOP I/O allowed */ static struct swblock **swhash; static int swhash_mask; static struct mtx swhash_mtx; static int swap_async_max = 4; /* maximum in-progress async I/O's */ static struct sx sw_alloc_sx; SYSCTL_INT(_vm, OID_AUTO, swap_async_max, CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops"); /* * "named" and "unnamed" anon region objects. Try to reduce the overhead * of searching a named list by hashing it just a little. */ #define NOBJLISTS 8 #define NOBJLIST(handle) \ (&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)]) static struct mtx sw_alloc_mtx; /* protect list manipulation */ static struct pagerlst swap_pager_object_list[NOBJLISTS]; static uma_zone_t swap_zone; static struct vm_object swap_zone_obj; /* * pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure * calls hooked from other parts of the VM system and do not appear here. * (see vm/swap_pager.h). */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset); static void swap_pager_dealloc(vm_object_t object); static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int); static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after); static void swap_pager_init(void); static void swap_pager_unswapped(vm_page_t); static void swap_pager_swapoff(struct swdevt *sp); struct pagerops swappagerops = { .pgo_init = swap_pager_init, /* early system initialization of pager */ .pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */ .pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */ .pgo_getpages = swap_pager_getpages, /* pagein */ .pgo_putpages = swap_pager_putpages, /* pageout */ .pgo_haspage = swap_pager_haspage, /* get backing store status for page */ .pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */ }; /* * dmmax is in page-sized chunks with the new swap system. It was * dev-bsized chunks in the old. dmmax is always a power of 2. * * swap_*() routines are externally accessible. swp_*() routines are * internal. 
*/ static int dmmax; static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */ static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */ SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block"); static void swp_sizecheck(void); static void swp_pager_async_iodone(struct buf *bp); static int swapongeom(struct thread *, struct vnode *); static int swaponvp(struct thread *, struct vnode *, u_long); static int swapoff_one(struct swdevt *sp, struct thread *td); /* * Swap bitmap functions */ static void swp_pager_freeswapspace(daddr_t blk, int npages); static daddr_t swp_pager_getswapspace(int npages); /* * Metadata functions */ static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index); static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t); static void swp_pager_meta_free_all(vm_object_t); static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int); /* * SWP_SIZECHECK() - update swap_pager_full indication * * update the swap_pager_almost_full indication and warn when we are * about to run out of swap space, using lowat/hiwat hysteresis. * * Clear swap_pager_full ( task killing ) indication when lowat is met. * * No restrictions on call * This routine may not block. * This routine must be called at splvm() */ static void swp_sizecheck(void) { if (swap_pager_avail < nswap_lowat) { if (swap_pager_almost_full == 0) { printf("swap_pager: out of swap space\n"); swap_pager_almost_full = 1; } } else { swap_pager_full = 0; if (swap_pager_avail > nswap_hiwat) swap_pager_almost_full = 0; } } /* * SWP_PAGER_HASH() - hash swap meta data * * This is an helper function which hashes the swapblk given * the object and page index. It returns a pointer to a pointer * to the object, or a pointer to a NULL pointer if it could not * find a swapblk. * * This routine must be called at splvm(). */ static struct swblock ** swp_pager_hash(vm_object_t object, vm_pindex_t index) { struct swblock **pswap; struct swblock *swap; index &= ~(vm_pindex_t)SWAP_META_MASK; pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask]; while ((swap = *pswap) != NULL) { if (swap->swb_object == object && swap->swb_index == index ) { break; } pswap = &swap->swb_hnext; } return (pswap); } /* * SWAP_PAGER_INIT() - initialize the swap pager! * * Expected to be started from system init. NOTE: This code is run * before much else so be careful what you depend on. Most of the VM * system has yet to be initialized at this point. */ static void swap_pager_init(void) { /* * Initialize object lists */ int i; for (i = 0; i < NOBJLISTS; ++i) TAILQ_INIT(&swap_pager_object_list[i]); mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF); mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF); /* * Device Stripe, in PAGE_SIZE'd blocks */ dmmax = SWB_NPAGES * 2; } /* * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process * * Expected to be started from pageout process once, prior to entering * its main loop. */ void swap_pager_swap_init(void) { int n, n2; /* * Number of in-transit swap bp operations. Don't * exhaust the pbufs completely. Make sure we * initialize workable values (0 will work for hysteresis * but it isn't very efficient). * * The nsw_cluster_max is constrained by the bp->b_pages[] * array (MAXPHYS/PAGE_SIZE) and our locally defined * MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are * constrained by the swap device interleave stripe size. 
* * Currently we hardwire nsw_wcount_async to 4. This limit is * designed to prevent other I/O from having high latencies due to * our pageout I/O. The value 4 works well for one or two active swap * devices but is probably a little low if you have more. Even so, * a higher value would probably generate only a limited improvement * with three or four active swap devices since the system does not * typically have to pageout at extreme bandwidths. We will want * at least 2 per swap devices, and 4 is a pretty good value if you * have one NFS swap device due to the command/ack latency over NFS. * So it all works out pretty well. */ nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); mtx_lock(&pbuf_mtx); nsw_rcount = (nswbuf + 1) / 2; nsw_wcount_sync = (nswbuf + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; mtx_unlock(&pbuf_mtx); /* * Initialize our zone. Right now I'm just guessing on the number * we need based on the number of pages in the system. Each swblock * can hold 16 pages, so this is probably overkill. This reservation * is typically limited to around 32MB by default. */ - n = cnt.v_page_count / 2; + n = VMCNT_GET(page_count) / 2; if (maxswzone && n > maxswzone / sizeof(struct swblock)) n = maxswzone / sizeof(struct swblock); n2 = n; swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); if (swap_zone == NULL) panic("failed to create swap_zone."); do { if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n)) break; /* * if the allocation failed, try a zone two thirds the * size of the previous attempt. */ n -= ((n + 2) / 3); } while (n > 0); if (n2 != n) printf("Swap zone entries reduced from %d to %d.\n", n2, n); n2 = n; /* * Initialize our meta-data hash table. The swapper does not need to * be quite as efficient as the VM system, so we do not use an * oversized hash table. * * n: size of hash table, must be power of 2 * swhash_mask: hash table index mask */ for (n = 1; n < n2 / 8; n *= 2) ; swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO); swhash_mask = n - 1; mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF); } /* * SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate * its metadata structures. * * This routine is called from the mmap and fork code to create a new * OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object * and then converting it with swp_pager_meta_build(). * * This routine may block in vm_object_allocate() and create a named * object lookup race, so we must interlock. We must also run at * splvm() for the object lookup to handle races with interrupts, but * we do not have to maintain splvm() in between the lookup and the * add because (I believe) it is not possible to attempt to create * a new swap object w/handle when a default object with that handle * already exists. * * MPSAFE */ static vm_object_t swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; vm_pindex_t pindex; pindex = OFF_TO_IDX(offset + PAGE_MASK + size); if (handle) { mtx_lock(&Giant); /* * Reference existing named region or allocate new one. There * should not be a race here against swp_pager_meta_build() * as called from vm_page_remove() in regards to the lookup * of the handle. 
*/ sx_xlock(&sw_alloc_sx); object = vm_pager_object_lookup(NOBJLIST(handle), handle); if (object != NULL) { vm_object_reference(object); } else { object = vm_object_allocate(OBJT_DEFAULT, pindex); object->handle = handle; VM_OBJECT_LOCK(object); swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); } sx_xunlock(&sw_alloc_sx); mtx_unlock(&Giant); } else { object = vm_object_allocate(OBJT_DEFAULT, pindex); VM_OBJECT_LOCK(object); swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); } return (object); } /* * SWAP_PAGER_DEALLOC() - remove swap metadata from object * * The swap backing for the object is destroyed. The code is * designed such that we can reinstantiate it later, but this * routine is typically called only when the entire object is * about to be destroyed. * * This routine may block, but no longer does. * * The object must be locked or unreferenceable. */ static void swap_pager_dealloc(vm_object_t object) { /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list); mtx_unlock(&sw_alloc_mtx); } VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); vm_object_pip_wait(object, "swpdea"); /* * Free all remaining metadata. We only bother to free it from * the swap meta data. We do not attempt to free swapblk's still * associated with vm_page_t's for this object. We do not care * if paging is still in progress on some objects. */ swp_pager_meta_free_all(object); } /************************************************************************ * SWAP PAGER BITMAP ROUTINES * ************************************************************************/ /* * SWP_PAGER_GETSWAPSPACE() - allocate raw swap space * * Allocate swap for the requested number of pages. The starting * swap block number (a page index) is returned or SWAPBLK_NONE * if the allocation failed. * * Also has the side effect of advising that somebody made a mistake * when they configured swap and didn't configure enough. * * Must be called at splvm() to avoid races with bitmap frees from * vm_page_remove() aka swap_pager_page_removed(). * * This routine may not block * This routine must be called at splvm(). * * We allocate in round-robin fashion from the configured devices. 
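 *
 * A minimal caller sketch (illustrative only, mirroring the fallback
 * pattern used by swap_pager_reserve() and swap_pager_putpages()
 * below): ask for a cluster and halve the request until it fits or
 * nothing is left:
 *
 *	n = nsw_cluster_max;
 *	while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
 *		n >>= 1;
 *		if (n == 0)
 *			break;		(give up on this pageout pass)
 *	}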
*/ static daddr_t swp_pager_getswapspace(int npages) { daddr_t blk; struct swdevt *sp; int i; blk = SWAPBLK_NONE; mtx_lock(&sw_dev_mtx); sp = swdevhd; for (i = 0; i < nswapdev; i++) { if (sp == NULL) sp = TAILQ_FIRST(&swtailq); if (!(sp->sw_flags & SW_CLOSING)) { blk = blist_alloc(sp->sw_blist, npages); if (blk != SWAPBLK_NONE) { blk += sp->sw_first; sp->sw_used += npages; swap_pager_avail -= npages; swp_sizecheck(); swdevhd = TAILQ_NEXT(sp, sw_list); goto done; } } sp = TAILQ_NEXT(sp, sw_list); } if (swap_pager_full != 2) { printf("swap_pager_getswapspace(%d): failed\n", npages); swap_pager_full = 2; swap_pager_almost_full = 1; } swdevhd = NULL; done: mtx_unlock(&sw_dev_mtx); return (blk); } static int swp_pager_isondev(daddr_t blk, struct swdevt *sp) { return (blk >= sp->sw_first && blk < sp->sw_end); } static void swp_pager_strategy(struct buf *bp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) { mtx_unlock(&sw_dev_mtx); sp->sw_strategy(bp, sp); return; } } panic("Swapdev not found"); } /* * SWP_PAGER_FREESWAPSPACE() - free raw swap space * * This routine returns the specified swap blocks back to the bitmap. * * Note: This routine may not block (it could in the old swap code), * and through the use of the new blist routines it does not block. * * We must be called at splvm() to avoid races with bitmap frees from * vm_page_remove() aka swap_pager_page_removed(). * * This routine may not block * This routine must be called at splvm(). */ static void swp_pager_freeswapspace(daddr_t blk, int npages) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (blk >= sp->sw_first && blk < sp->sw_end) { sp->sw_used -= npages; /* * If we are attempting to stop swapping on * this device, we don't want to mark any * blocks free lest they be reused. */ if ((sp->sw_flags & SW_CLOSING) == 0) { blist_free(sp->sw_blist, blk - sp->sw_first, npages); swap_pager_avail += npages; swp_sizecheck(); } mtx_unlock(&sw_dev_mtx); return; } } panic("Swapdev not found"); } /* * SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page * range within an object. * * This is a globally accessible routine. * * This routine removes swapblk assignments from swap metadata. * * The external callers of this routine typically have already destroyed * or renamed vm_page_t's associated with this range in the object so * we should be ok. * * This routine may be called at any spl. We up our spl to splvm temporarily * in order to perform the metadata removal. */ void swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); swp_pager_meta_free(object, start, size); } /* * SWAP_PAGER_RESERVE() - reserve swap blocks in object * * Assigns swap blocks to the specified range within the object. The * swap blocks are not zerod. Any previous swap assignment is destroyed. * * Returns 0 on success, -1 on failure. 
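 *
 * Illustrative usage sketch (not taken from the original source;
 * "npages" is a hypothetical caller-supplied count): pre-assigning
 * backing store for the first npages pages of an object might look like
 *
 *	if (swap_pager_reserve(object, 0, npages) != 0)
 *		(handle the out-of-swap failure)
 *
 * Note that the routine takes the object lock itself, so the caller is
 * expected not to hold it already.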
*/ int swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size) { int n = 0; daddr_t blk = SWAPBLK_NONE; vm_pindex_t beg = start; /* save start index */ VM_OBJECT_LOCK(object); while (size) { if (n == 0) { n = BLIST_MAX_ALLOC; while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) { n >>= 1; if (n == 0) { swp_pager_meta_free(object, beg, start - beg); VM_OBJECT_UNLOCK(object); return (-1); } } } swp_pager_meta_build(object, start, blk); --size; ++start; ++blk; --n; } swp_pager_meta_free(object, start, n); VM_OBJECT_UNLOCK(object); return (0); } /* * SWAP_PAGER_COPY() - copy blocks from source pager to destination pager * and destroy the source. * * Copy any valid swapblks from the source to the destination. In * cases where both the source and destination have a valid swapblk, * we keep the destination's. * * This routine is allowed to block. It may block allocating metadata * indirectly through swp_pager_meta_build() or if paging is still in * progress on the source. * * This routine can be called at any spl * * XXX vm_page_collapse() kinda expects us not to block because we * supposedly do not need to allocate memory, but for the moment we * *may* have to get a little memory from the zone allocator, but * it is taken from the interrupt memory. We should be ok. * * The source object contains no vm_page_t's (which is just as well) * * The source object is of type OBJT_SWAP. * * The source and destination objects must be locked or * inaccessible (XXX are they ?) */ void swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject, vm_pindex_t offset, int destroysource) { vm_pindex_t i; VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED); VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED); /* * If destroysource is set, we remove the source object from the * swap_pager internal queue now. */ if (destroysource) { if (srcobject->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_REMOVE( NOBJLIST(srcobject->handle), srcobject, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * transfer source to destination. */ for (i = 0; i < dstobject->size; ++i) { daddr_t dstaddr; /* * Locate (without changing) the swapblk on the destination, * unless it is invalid in which case free it silently, or * if the destination is a resident page, in which case the * source is thrown away. */ dstaddr = swp_pager_meta_ctl(dstobject, i, 0); if (dstaddr == SWAPBLK_NONE) { /* * Destination has no swapblk and is not resident, * copy source. */ daddr_t srcaddr; srcaddr = swp_pager_meta_ctl( srcobject, i + offset, SWM_POP ); if (srcaddr != SWAPBLK_NONE) { /* * swp_pager_meta_build() can sleep. */ vm_object_pip_add(srcobject, 1); VM_OBJECT_UNLOCK(srcobject); vm_object_pip_add(dstobject, 1); swp_pager_meta_build(dstobject, i, srcaddr); vm_object_pip_wakeup(dstobject); VM_OBJECT_LOCK(srcobject); vm_object_pip_wakeup(srcobject); } } else { /* * Destination has valid swapblk or it is represented * by a resident page. We destroy the sourceblock. */ swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE); } } /* * Free left over swap blocks in source. * * We have to revert the type to OBJT_DEFAULT so we do not accidently * double-remove the object from the swap queues. */ if (destroysource) { swp_pager_meta_free_all(srcobject); /* * Reverting the type is not necessary, the caller is going * to destroy srcobject directly, but I'm doing it here * for consistency since we've removed the object from its * queues. 
*/ srcobject->type = OBJT_DEFAULT; } } /* * SWAP_PAGER_HASPAGE() - determine if we have good backing store for * the requested page. * * We determine whether good backing store exists for the requested * page and return TRUE if it does, FALSE if it doesn't. * * If TRUE, we also try to determine how much valid, contiguous backing * store exists before and after the requested page within a reasonable * distance. We do not try to restrict it to the swap device stripe * (that is handled in getpages/putpages). It probably isn't worth * doing here. */ static boolean_t swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after) { daddr_t blk0; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * do we have good backing store at the requested index ? */ blk0 = swp_pager_meta_ctl(object, pindex, 0); if (blk0 == SWAPBLK_NONE) { if (before) *before = 0; if (after) *after = 0; return (FALSE); } /* * find backwards-looking contiguous good backing store */ if (before != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; if (i > pindex) break; blk = swp_pager_meta_ctl(object, pindex - i, 0); if (blk != blk0 - i) break; } *before = (i - 1); } /* * find forward-looking contiguous good backing store */ if (after != NULL) { int i; for (i = 1; i < (SWB_NPAGES/2); ++i) { daddr_t blk; blk = swp_pager_meta_ctl(object, pindex + i, 0); if (blk != blk0 + i) break; } *after = (i - 1); } return (TRUE); } /* * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page * * This removes any associated swap backing store, whether valid or * not, from the page. * * This routine is typically called when a page is made dirty, at * which point any associated swap can be freed. MADV_FREE also * calls us in a special-case situation * * NOTE!!! If the page is clean and the swap was valid, the caller * should make the page dirty before calling this routine. This routine * does NOT change the m->dirty status of the page. Also: MADV_FREE * depends on it. * * This routine may not block * This routine must be called at splvm() */ static void swap_pager_unswapped(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE); } /* * SWAP_PAGER_GETPAGES() - bring pages in from swap * * Attempt to retrieve (m, count) pages from backing store, but make * sure we retrieve at least m[reqpage]. We try to load in as large * a chunk surrounding m[reqpage] as is contiguous in swap and which * belongs to the same object. * * The code is designed for asynchronous operation and * immediate-notification of 'reqpage' but tends not to be * used that way. Please do not optimize-out this algorithmic * feature, I intend to improve on it in the future. * * The parent has a single vm_object_pip_add() reference prior to * calling us and we should return with the same. * * The parent has BUSY'd the pages. We should return with 'm' * left busy, but the others adjusted. */ static int swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage) { struct buf *bp; vm_page_t mreq; int i; int j; daddr_t blk; mreq = m[reqpage]; KASSERT(mreq->object == object, ("swap_pager_getpages: object mismatch %p/%p", object, mreq->object)); /* * Calculate range to retrieve. The pages have already been assigned * their swapblks. We require a *contiguous* range but we know it to * not span devices. If we do not supply it, bad things * happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the * loops are set up such that the case(s) are handled implicitly. 
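 *
 * A small worked example (illustrative): if m[reqpage] maps to swap
 * block B, then m[reqpage - 1] only stays in the cluster when it maps
 * to B - 1, m[reqpage + 1] when it maps to B + 1, and so on; the
 * backward and forward scans below stop at the first page that breaks
 * this arithmetic progression.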
* * The swp_*() calls must be made at splvm(). vm_page_free() does * not need to be, but it will go a little faster if it is. */ blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0); for (i = reqpage - 1; i >= 0; --i) { daddr_t iblk; iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0); if (blk != iblk + (reqpage - i)) break; } ++i; for (j = reqpage + 1; j < count; ++j) { daddr_t jblk; jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0); if (blk != jblk - (j - reqpage)) break; } /* * free pages outside our collection range. Note: we never free * mreq, it must remain busy throughout. */ if (0 < i || j < count) { int k; vm_page_lock_queues(); for (k = 0; k < i; ++k) vm_page_free(m[k]); for (k = j; k < count; ++k) vm_page_free(m[k]); vm_page_unlock_queues(); } /* * Return VM_PAGER_FAIL if we have nothing to do. Return mreq * still busy, but the others unbusied. */ if (blk == SWAPBLK_NONE) return (VM_PAGER_FAIL); /* * Getpbuf() can sleep. */ VM_OBJECT_UNLOCK(object); /* * Get a swap buffer header to perform the IO */ bp = getpbuf(&nsw_rcount); bp->b_flags |= B_PAGING; /* * map our page(s) into kva for input */ pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i); bp->b_iocmd = BIO_READ; bp->b_iodone = swp_pager_async_iodone; bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_blkno = blk - (reqpage - i); bp->b_bcount = PAGE_SIZE * (j - i); bp->b_bufsize = PAGE_SIZE * (j - i); bp->b_pager.pg_reqpage = reqpage - i; VM_OBJECT_LOCK(object); { int k; for (k = i; k < j; ++k) { bp->b_pages[k - i] = m[k]; m[k]->oflags |= VPO_SWAPINPROG; } } bp->b_npages = j - i; - cnt.v_swapin++; - cnt.v_swappgsin += bp->b_npages; + VMCNT_ADD(swapin, 1); + VMCNT_ADD(swappgsin, bp->b_npages); /* * We still hold the lock on mreq, and our automatic completion routine * does not remove it. */ vm_object_pip_add(object, bp->b_npages); VM_OBJECT_UNLOCK(object); /* * perform the I/O. NOTE!!! bp cannot be considered valid after * this point because we automatically release it on completion. * Instead, we look at the one page we are interested in which we * still hold a lock on even through the I/O completion. * * The other pages in our m[] array are also released on completion, * so we cannot assume they are valid anymore either. * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ BUF_KERNPROC(bp); swp_pager_strategy(bp); /* * wait for the page we want to complete. VPO_SWAPINPROG is always * cleared on completion. If an I/O error occurs, SWAPBLK_NONE * is set in the meta-data. */ VM_OBJECT_LOCK(object); while ((mreq->oflags & VPO_SWAPINPROG) != 0) { mreq->oflags |= VPO_WANTED; vm_page_lock_queues(); vm_page_flag_set(mreq, PG_REFERENCED); vm_page_unlock_queues(); - cnt.v_intrans++; + VMCNT_ADD(intrans, 1); if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) { printf( "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n", bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount); } } /* * mreq is left busied after completion, but all the other pages * are freed. If we had an unrecoverable read error the page will * not be valid. */ if (mreq->valid != VM_PAGE_BITS_ALL) { return (VM_PAGER_ERROR); } else { return (VM_PAGER_OK); } /* * A final note: in a low swap situation, we cannot deallocate swap * and mark a page dirty here because the caller is likely to mark * the page clean when we return, causing the page to possibly revert * to all-zero's later. 
*/ } /* * swap_pager_putpages: * * Assign swap (if necessary) and initiate I/O on the specified pages. * * We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects * are automatically converted to SWAP objects. * * In a low memory situation we may block in VOP_STRATEGY(), but the new * vm_page reservation system coupled with properly written VFS devices * should ensure that no low-memory deadlock occurs. This is an area * which needs work. * * The parent has N vm_object_pip_add() references prior to * calling us and will remove references for rtvals[] that are * not set to VM_PAGER_PEND. We need to remove the rest on I/O * completion. * * The parent has soft-busy'd the pages it passes us and will unbusy * those whos rtvals[] entry is not set to VM_PAGER_PEND on return. * We need to unbusy the rest on I/O completion. */ void swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals) { int i; int n = 0; GIANT_REQUIRED; if (count && m[0]->object != object) { panic("swap_pager_getpages: object mismatch %p/%p", object, m[0]->object ); } /* * Step 1 * * Turn object into OBJT_SWAP * check for bogus sysops * force sync if not pageout process */ if (object->type != OBJT_SWAP) swp_pager_meta_build(object, 0, SWAPBLK_NONE); VM_OBJECT_UNLOCK(object); if (curproc != pageproc) sync = TRUE; /* * Step 2 * * Update nsw parameters from swap_async_max sysctl values. * Do not let the sysop crash the machine with bogus numbers. */ mtx_lock(&pbuf_mtx); if (swap_async_max != nsw_wcount_async_max) { int n; /* * limit range */ if ((n = swap_async_max) > nswbuf / 2) n = nswbuf / 2; if (n < 1) n = 1; swap_async_max = n; /* * Adjust difference ( if possible ). If the current async * count is too low, we may not be able to make the adjustment * at this time. */ n -= nsw_wcount_async_max; if (nsw_wcount_async + n >= 0) { nsw_wcount_async += n; nsw_wcount_async_max += n; wakeup(&nsw_wcount_async); } } mtx_unlock(&pbuf_mtx); /* * Step 3 * * Assign swap blocks and issue I/O. We reallocate swap on the fly. * The page is left dirty until the pageout operation completes * successfully. */ for (i = 0; i < count; i += n) { int j; struct buf *bp; daddr_t blk; /* * Maximum I/O size is limited by a number of factors. */ n = min(BLIST_MAX_ALLOC, count - i); n = min(n, nsw_cluster_max); /* * Get biggest block of swap we can. If we fail, fall * back and try to allocate a smaller block. Don't go * overboard trying to allocate space if it would overly * fragment swap. */ while ( (blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE && n > 4 ) { n >>= 1; } if (blk == SWAPBLK_NONE) { for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_FAIL; continue; } /* * All I/O parameters have been satisfied, build the I/O * request and assign the swap space. */ if (sync == TRUE) { bp = getpbuf(&nsw_wcount_sync); } else { bp = getpbuf(&nsw_wcount_async); bp->b_flags = B_ASYNC; } bp->b_flags |= B_PAGING; bp->b_iocmd = BIO_WRITE; pmap_qenter((vm_offset_t)bp->b_data, &m[i], n); bp->b_rcred = crhold(thread0.td_ucred); bp->b_wcred = crhold(thread0.td_ucred); bp->b_bcount = PAGE_SIZE * n; bp->b_bufsize = PAGE_SIZE * n; bp->b_blkno = blk; VM_OBJECT_LOCK(object); for (j = 0; j < n; ++j) { vm_page_t mreq = m[i+j]; swp_pager_meta_build( mreq->object, mreq->pindex, blk + j ); vm_page_dirty(mreq); rtvals[i+j] = VM_PAGER_OK; mreq->oflags |= VPO_SWAPINPROG; bp->b_pages[j] = mreq; } VM_OBJECT_UNLOCK(object); bp->b_npages = n; /* * Must set dirty range for NFS to work. 
*/ bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; - cnt.v_swapout++; - cnt.v_swappgsout += bp->b_npages; + VMCNT_ADD(swapout, 1); + VMCNT_ADD(swappgsout, bp->b_npages); /* * asynchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ if (sync == FALSE) { bp->b_iodone = swp_pager_async_iodone; BUF_KERNPROC(bp); swp_pager_strategy(bp); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* restart outter loop */ continue; } /* * synchronous * * NOTE: b_blkno is destroyed by the call to swapdev_strategy */ bp->b_iodone = bdone; swp_pager_strategy(bp); /* * Wait for the sync I/O to complete, then update rtvals. * We just set the rtvals[] to VM_PAGER_PEND so we can call * our async completion routine at the end, thus avoiding a * double-free. */ bwait(bp, PVM, "swwrt"); for (j = 0; j < n; ++j) rtvals[i+j] = VM_PAGER_PEND; /* * Now that we are through with the bp, we can call the * normal async completion, which frees everything up. */ swp_pager_async_iodone(bp); } VM_OBJECT_LOCK(object); } /* * swp_pager_async_iodone: * * Completion routine for asynchronous reads and writes from/to swap. * Also called manually by synchronous code to finish up a bp. * * For READ operations, the pages are PG_BUSY'd. For WRITE operations, * the pages are vm_page_t->busy'd. For READ operations, we PG_BUSY * unbusy all pages except the 'main' request page. For WRITE * operations, we vm_page_t->busy'd unbusy all pages ( we can do this * because we marked them all VM_PAGER_PEND on return from putpages ). * * This routine may not block. * This routine is called at splbio() or better * * We up ourselves to splvm() as required for various vm_page related * calls. */ static void swp_pager_async_iodone(struct buf *bp) { int i; vm_object_t object = NULL; /* * report error */ if (bp->b_ioflags & BIO_ERROR) { printf( "swap_pager: I/O error - %s failed; blkno %ld," "size %ld, error %d\n", ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"), (long)bp->b_blkno, (long)bp->b_bcount, bp->b_error ); } /* * remove the mapping for kernel virtual */ pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages); if (bp->b_npages) { object = bp->b_pages[0]->object; VM_OBJECT_LOCK(object); } vm_page_lock_queues(); /* * cleanup pages. If an error occurs writing to swap, we are in * very serious trouble. If it happens to be a disk error, though, * we may be able to recover by reassigning the swap later on. So * in this case we remove the m->swapblk assignment for the page * but do not free it in the rlist. The errornous block(s) are thus * never reallocated as swap. Redirty the page and continue. */ for (i = 0; i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; m->oflags &= ~VPO_SWAPINPROG; if (bp->b_ioflags & BIO_ERROR) { /* * If an error occurs I'd love to throw the swapblk * away without freeing it back to swapspace, so it * can never be used again. But I can't from an * interrupt. */ if (bp->b_iocmd == BIO_READ) { /* * When reading, reqpage needs to stay * locked for the parent, but all other * pages can be freed. We still want to * wakeup the parent waiting on the page, * though. ( also: pg_reqpage can be -1 and * not match anything ). * * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * someone may be waiting for that. * * NOTE: for reads, m->dirty will probably * be overridden by the original caller of * getpages so don't play cute tricks here. 
*/ m->valid = 0; if (i != bp->b_pager.pg_reqpage) vm_page_free(m); else vm_page_flash(m); /* * If i == bp->b_pager.pg_reqpage, do not wake * the page up. The caller needs to. */ } else { /* * If a write error occurs, reactivate page * so it doesn't clog the inactive list, * then finish the I/O. */ vm_page_dirty(m); vm_page_activate(m); vm_page_io_finish(m); } } else if (bp->b_iocmd == BIO_READ) { /* * For read success, clear dirty bits. Nobody should * have this page mapped but don't take any chances, * make sure the pmap modify bits are also cleared. * * NOTE: for reads, m->dirty will probably be * overridden by the original caller of getpages so * we cannot set them in order to free the underlying * swap in a low-swap situation. I don't think we'd * want to do that anyway, but it was an optimization * that existed in the old swapper for a time before * it got ripped out due to precisely this problem. * * If not the requested page then deactivate it. * * Note that the requested page, reqpage, is left * busied, but we still have to wake it up. The * other pages are released (unbusied) by * vm_page_wakeup(). We do not set reqpage's * valid bits here, it is up to the caller. */ pmap_clear_modify(m); m->valid = VM_PAGE_BITS_ALL; vm_page_undirty(m); /* * We have to wake specifically requested pages * up too because we cleared VPO_SWAPINPROG and * could be waiting for it in getpages. However, * be sure to not unbusy getpages specifically * requested page - getpages expects it to be * left busy. */ if (i != bp->b_pager.pg_reqpage) { vm_page_deactivate(m); vm_page_wakeup(m); } else { vm_page_flash(m); } } else { /* * For write success, clear the modify and dirty * status, then finish the I/O ( which decrements the * busy count and possibly wakes waiter's up ). */ pmap_clear_modify(m); vm_page_undirty(m); vm_page_io_finish(m); if (vm_page_count_severe()) vm_page_try_to_cache(m); } } vm_page_unlock_queues(); /* * adjust pip. NOTE: the original parent may still have its own * pip refs on the object. */ if (object != NULL) { vm_object_pip_wakeupn(object, bp->b_npages); VM_OBJECT_UNLOCK(object); } /* * swapdev_strategy() manually sets b_vp and b_bufobj before calling * bstrategy(). Set them back to NULL now we're done with it, or we'll * trigger a KASSERT in relpbuf(). */ if (bp->b_vp) { bp->b_vp = NULL; bp->b_bufobj = NULL; } /* * release the physical I/O buffer */ relpbuf( bp, ((bp->b_iocmd == BIO_READ) ? &nsw_rcount : ((bp->b_flags & B_ASYNC) ? &nsw_wcount_async : &nsw_wcount_sync ) ) ); } /* * swap_pager_isswapped: * * Return 1 if at least one page in the given object is paged * out to the given swap device. * * This routine may not block. */ int swap_pager_isswapped(vm_object_t object, struct swdevt *sp) { daddr_t index = 0; int bcount; int i; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return (0); mtx_lock(&swhash_mtx); for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) { struct swblock *swap; if ((swap = *swp_pager_hash(object, index)) != NULL) { for (i = 0; i < SWAP_META_PAGES; ++i) { if (swp_pager_isondev(swap->swb_pages[i], sp)) { mtx_unlock(&swhash_mtx); return (1); } } } index += SWAP_META_PAGES; if (index > 0x20000000) panic("swap_pager_isswapped: failed to locate all swap meta blocks"); } mtx_unlock(&swhash_mtx); return (0); } /* * SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in * * This routine dissociates the page at the given index within a * swap block from its backing store, paging it in if necessary. 
* If the page is paged in, it is placed in the inactive queue, * since it had its backing store ripped out from under it. * We also attempt to swap in all other pages in the swap block, * we only guarantee that the one at the specified index is * paged in. * * XXX - The code to page the whole block in doesn't work, so we * revert to the one-by-one behavior for now. Sigh. */ static inline void swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; vm_object_pip_add(object, 1); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY); if (m->valid == VM_PAGE_BITS_ALL) { vm_object_pip_subtract(object, 1); vm_page_lock_queues(); vm_page_activate(m); vm_page_dirty(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_pager_page_unswapped(m); return; } if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK) panic("swap_pager_force_pagein: read from swap failed");/*XXX*/ vm_object_pip_subtract(object, 1); vm_page_lock_queues(); vm_page_dirty(m); vm_page_dontneed(m); vm_page_unlock_queues(); vm_page_wakeup(m); vm_pager_page_unswapped(m); } /* * swap_pager_swapoff: * * Page in all of the pages that have been paged out to the * given device. The corresponding blocks in the bitmap must be * marked as allocated and the device must be flagged SW_CLOSING. * There may be no processes swapped out to the device. * * This routine may block. */ static void swap_pager_swapoff(struct swdevt *sp) { struct swblock *swap; int i, j, retries; GIANT_REQUIRED; retries = 0; full_rescan: mtx_lock(&swhash_mtx); for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */ restart: for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) { vm_object_t object = swap->swb_object; vm_pindex_t pindex = swap->swb_index; for (j = 0; j < SWAP_META_PAGES; ++j) { if (swp_pager_isondev(swap->swb_pages[j], sp)) { /* avoid deadlock */ if (!VM_OBJECT_TRYLOCK(object)) { break; } else { mtx_unlock(&swhash_mtx); swp_pager_force_pagein(object, pindex + j); VM_OBJECT_UNLOCK(object); mtx_lock(&swhash_mtx); goto restart; } } } } } mtx_unlock(&swhash_mtx); if (sp->sw_used) { /* * Objects may be locked or paging to the device being * removed, so we will miss their pages and need to * make another pass. We have marked this device as * SW_CLOSING, so the activity should finish soon. */ retries++; if (retries > 100) { panic("swapoff: failed to locate %d swap blocks", sp->sw_used); } pause("swpoff", hz / 20); goto full_rescan; } } /************************************************************************ * SWAP META DATA * ************************************************************************ * * These routines manipulate the swap metadata stored in the * OBJT_SWAP object. All swp_*() routines must be called at * splvm() because swap can be freed up by the low level vm_page * code which might be called from interrupts beyond what splbio() covers. * * Swap metadata is implemented with a global hash and not directly * linked into the object. Instead the object simply contains * appropriate tracking counters. */ /* * SWP_PAGER_META_BUILD() - add swap block to swap meta data for object * * We first convert the object to a swap object if it is a default * object. * * The specified swapblk is added to the object's swap metadata. If * the swapblk is not valid, it is freed instead. Any previously * assigned swapblk is freed. * * This routine must be called at splvm(), except when used to convert * an OBJT_DEFAULT object into an OBJT_SWAP object. 
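 *
 * Usage sketch (illustrative, mirroring the callers elsewhere in this
 * file):
 *
 *	swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 *		only converts a default object to OBJT_SWAP, as done by
 *		swap_pager_alloc() and swap_pager_putpages().
 *
 *	swp_pager_meta_build(object, pindex, blk);
 *		records that page "pindex" is backed by swap block "blk",
 *		freeing any block previously recorded for that page.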
*/ static void swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk) { struct swblock *swap; struct swblock **pswap; int idx; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Convert default object to swap object if necessary */ if (object->type != OBJT_SWAP) { object->type = OBJT_SWAP; object->un_pager.swp.swp_bcount = 0; if (object->handle != NULL) { mtx_lock(&sw_alloc_mtx); TAILQ_INSERT_TAIL( NOBJLIST(object->handle), object, pager_object_list ); mtx_unlock(&sw_alloc_mtx); } } /* * Locate hash entry. If not found create, but if we aren't adding * anything just return. If we run out of space in the map we wait * and, since the hash table may have changed, retry. */ retry: mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) == NULL) { int i; if (swapblk == SWAPBLK_NONE) goto done; swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT); if (swap == NULL) { mtx_unlock(&swhash_mtx); VM_OBJECT_UNLOCK(object); if (uma_zone_exhausted(swap_zone)) printf("swap zone exhausted, increase kern.maxswzone\n"); VM_WAIT; VM_OBJECT_LOCK(object); goto retry; } swap->swb_hnext = NULL; swap->swb_object = object; swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK; swap->swb_count = 0; ++object->un_pager.swp.swp_bcount; for (i = 0; i < SWAP_META_PAGES; ++i) swap->swb_pages[i] = SWAPBLK_NONE; } /* * Delete prior contents of metadata */ idx = pindex & SWAP_META_MASK; if (swap->swb_pages[idx] != SWAPBLK_NONE) { swp_pager_freeswapspace(swap->swb_pages[idx], 1); --swap->swb_count; } /* * Enter block into metadata */ swap->swb_pages[idx] = swapblk; if (swapblk != SWAPBLK_NONE) ++swap->swb_count; done: mtx_unlock(&swhash_mtx); } /* * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata * * The requested range of blocks is freed, with any associated swap * returned to the swap bitmap. * * This routine will free swap metadata structures as they are cleaned * out. This routine does *NOT* operate on swap metadata associated * with resident pages. * * This routine must be called at splvm() */ static void swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return; while (count > 0) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { daddr_t v = swap->swb_pages[index & SWAP_META_MASK]; if (v != SWAPBLK_NONE) { swp_pager_freeswapspace(v, 1); swap->swb_pages[index & SWAP_META_MASK] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } --count; ++index; } else { int n = SWAP_META_PAGES - (index & SWAP_META_MASK); count -= n; index += n; } mtx_unlock(&swhash_mtx); } } /* * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object * * This routine locates and destroys all swap metadata associated with * an object. 
* * This routine must be called at splvm() */ static void swp_pager_meta_free_all(vm_object_t object) { daddr_t index = 0; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_SWAP) return; while (object->un_pager.swp.swp_bcount) { struct swblock **pswap; struct swblock *swap; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, index); if ((swap = *pswap) != NULL) { int i; for (i = 0; i < SWAP_META_PAGES; ++i) { daddr_t v = swap->swb_pages[i]; if (v != SWAPBLK_NONE) { --swap->swb_count; swp_pager_freeswapspace(v, 1); } } if (swap->swb_count != 0) panic("swap_pager_meta_free_all: swb_count != 0"); *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } mtx_unlock(&swhash_mtx); index += SWAP_META_PAGES; if (index > 0x20000000) panic("swp_pager_meta_free_all: failed to locate all swap meta blocks"); } } /* * SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data. * * This routine is capable of looking up, popping, or freeing * swapblk assignments in the swap meta data or in the vm_page_t. * The routine typically returns the swapblk being looked-up, or popped, * or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block * was invalid. This routine will automatically free any invalid * meta-data swapblks. * * It is not possible to store invalid swapblks in the swap meta data * (other then a literal 'SWAPBLK_NONE'), so we don't bother checking. * * When acting on a busy resident page and paging is in progress, we * have to wait until paging is complete but otherwise can act on the * busy page. * * This routine must be called at splvm(). * * SWM_FREE remove and free swap block from metadata * SWM_POP remove from meta data but do not free.. pop it out */ static daddr_t swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags) { struct swblock **pswap; struct swblock *swap; daddr_t r1; int idx; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * The meta data only exists of the object is OBJT_SWAP * and even then might not be allocated yet. */ if (object->type != OBJT_SWAP) return (SWAPBLK_NONE); r1 = SWAPBLK_NONE; mtx_lock(&swhash_mtx); pswap = swp_pager_hash(object, pindex); if ((swap = *pswap) != NULL) { idx = pindex & SWAP_META_MASK; r1 = swap->swb_pages[idx]; if (r1 != SWAPBLK_NONE) { if (flags & SWM_FREE) { swp_pager_freeswapspace(r1, 1); r1 = SWAPBLK_NONE; } if (flags & (SWM_FREE|SWM_POP)) { swap->swb_pages[idx] = SWAPBLK_NONE; if (--swap->swb_count == 0) { *pswap = swap->swb_hnext; uma_zfree(swap_zone, swap); --object->un_pager.swp.swp_bcount; } } } } mtx_unlock(&swhash_mtx); return (r1); } /* * System call swapon(name) enables swapping on device name, * which must be in the swdevsw. Return EBUSY * if already swapping on this device. */ #ifndef _SYS_SYSPROTO_H_ struct swapon_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int swapon(struct thread *td, struct swapon_args *uap) { struct vattr attr; struct vnode *vp; struct nameidata nd; int error; error = priv_check(td, PRIV_SWAPON); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0); swdev_syscall_active = 1; /* * Swap metadata may not fit in the KVM if we have physical * memory of >1GB. 
*/ if (swap_zone == NULL) { error = ENOMEM; goto done; } NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vn_isdisk(vp, &error)) { error = swapongeom(td, vp); } else if (vp->v_type == VREG && (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 && (error = VOP_GETATTR(vp, &attr, td->td_ucred, td)) == 0) { /* * Allow direct swapping to NFS regular files in the same * way that nfs_mountroot() sets up diskless swapping. */ error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); } if (error) vrele(vp); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static void swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev) { struct swdevt *sp, *tsp; swblk_t dvbase; u_long mblocks; /* * If we go beyond this, we get overflows in the radix * tree bitmap code. */ mblocks = 0x40000000 / BLIST_META_RADIX; if (nblks > mblocks) { printf("WARNING: reducing size to maximum of %lu blocks per swap unit\n", mblocks); nblks = mblocks; } /* * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. * First chop nblks off to page-align it, then convert. * * sw->sw_nblks is in page-sized chunks now too. */ nblks &= ~(ctodb(1) - 1); nblks = dbtoc(nblks); sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO); sp->sw_vp = vp; sp->sw_id = id; sp->sw_dev = dev; sp->sw_flags = 0; sp->sw_nblks = nblks; sp->sw_used = 0; sp->sw_strategy = strategy; sp->sw_close = close; sp->sw_blist = blist_create(nblks); /* * Do not free the first two block in order to avoid overwriting * any bsd label at the front of the partition */ blist_free(sp->sw_blist, 2, nblks - 2); dvbase = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(tsp, &swtailq, sw_list) { if (tsp->sw_end >= dvbase) { /* * We put one uncovered page between the devices * in order to definitively prevent any cross-device * I/O requests */ dvbase = tsp->sw_end + 1; } } sp->sw_first = dvbase; sp->sw_end = dvbase + nblks; TAILQ_INSERT_TAIL(&swtailq, sp, sw_list); nswapdev++; swap_pager_avail += nblks; swp_sizecheck(); mtx_unlock(&sw_dev_mtx); } /* * SYSCALL: swapoff(devname) * * Disable swapping on the given device. * * XXX: Badly designed system call: it should use a device index * rather than filename as specification. We keep sw_vp around * only to make this work. 
*/ #ifndef _SYS_SYSPROTO_H_ struct swapoff_args { char *name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int swapoff(struct thread *td, struct swapoff_args *uap) { struct vnode *vp; struct nameidata nd; struct swdevt *sp; int error; error = priv_check(td, PRIV_SWAPOFF); if (error) return (error); mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name, td); error = namei(&nd); if (error) goto done; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_vp == vp) break; } mtx_unlock(&sw_dev_mtx); if (sp == NULL) { error = EINVAL; goto done; } error = swapoff_one(sp, td); done: swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); return (error); } static int swapoff_one(struct swdevt *sp, struct thread *td) { u_long nblks, dvbase; #ifdef MAC int error; #endif mtx_assert(&Giant, MA_OWNED); #ifdef MAC (void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY, td); error = mac_check_system_swapoff(td->td_ucred, sp->sw_vp); (void) VOP_UNLOCK(sp->sw_vp, 0, td); if (error != 0) return (error); #endif nblks = sp->sw_nblks; /* * We can turn off this swap device safely only if the * available virtual memory in the system will fit the amount * of data we will have to page back in, plus an epsilon so * the system doesn't become critically low on swap space. */ - if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail < - nblks + nswap_lowat) { + if (VMCNT_GET(free_count) + VMCNT_GET(cache_count) + + swap_pager_avail < nblks + nswap_lowat) { return (ENOMEM); } /* * Prevent further allocations on this device. */ mtx_lock(&sw_dev_mtx); sp->sw_flags |= SW_CLOSING; for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) { swap_pager_avail -= blist_fill(sp->sw_blist, dvbase, dmmax); } mtx_unlock(&sw_dev_mtx); /* * Page in the contents of the device and close it. 
*/ swap_pager_swapoff(sp); sp->sw_close(td, sp); sp->sw_id = NULL; mtx_lock(&sw_dev_mtx); TAILQ_REMOVE(&swtailq, sp, sw_list); nswapdev--; if (nswapdev == 0) { swap_pager_full = 2; swap_pager_almost_full = 1; } if (swdevhd == sp) swdevhd = NULL; mtx_unlock(&sw_dev_mtx); blist_destroy(sp->sw_blist); free(sp, M_VMPGDATA); return (0); } void swapoff_all(void) { struct swdevt *sp, *spt; const char *devname; int error; mtx_lock(&Giant); while (swdev_syscall_active) tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0); swdev_syscall_active = 1; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) { mtx_unlock(&sw_dev_mtx); if (vn_isdisk(sp->sw_vp, NULL)) devname = sp->sw_vp->v_rdev->si_name; else devname = "[file]"; error = swapoff_one(sp, &thread0); if (error != 0) { printf("Cannot remove swap device %s (error=%d), " "skipping.\n", devname, error); } else if (bootverbose) { printf("Swap device %s removed.\n", devname); } mtx_lock(&sw_dev_mtx); } mtx_unlock(&sw_dev_mtx); swdev_syscall_active = 0; wakeup_one(&swdev_syscall_active); mtx_unlock(&Giant); } void swap_pager_status(int *total, int *used) { struct swdevt *sp; *total = 0; *used = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { *total += sp->sw_nblks; *used += sp->sw_used; } mtx_unlock(&sw_dev_mtx); } static int sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; int error, n; struct xswdev xs; struct swdevt *sp; if (arg2 != 1) /* name length */ return (EINVAL); n = 0; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (n == *name) { mtx_unlock(&sw_dev_mtx); xs.xsw_version = XSWDEV_VERSION; xs.xsw_dev = sp->sw_dev; xs.xsw_flags = sp->sw_flags; xs.xsw_nblks = sp->sw_nblks; xs.xsw_used = sp->sw_used; error = SYSCTL_OUT(req, &xs, sizeof(xs)); return (error); } n++; } mtx_unlock(&sw_dev_mtx); return (ENOENT); } SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0, "Number of swap devices"); SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info, "Swap statistics by device"); /* * vmspace_swap_count() - count the approximate swap useage in pages for a * vmspace. * * The map must be locked. * * Swap useage is determined by taking the proportional swap used by * VM objects backing the VM map. To make up for fractional losses, * if the VM object has any swap use at all the associated map entries * count for at least 1 swap page. */ int vmspace_swap_count(struct vmspace *vmspace) { vm_map_t map = &vmspace->vm_map; vm_map_entry_t cur; int count = 0; for (cur = map->header.next; cur != &map->header; cur = cur->next) { vm_object_t object; if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = cur->object.vm_object) != NULL) { VM_OBJECT_LOCK(object); if (object->type == OBJT_SWAP && object->un_pager.swp.swp_bcount != 0) { int n = (cur->end - cur->start) / PAGE_SIZE; count += object->un_pager.swp.swp_bcount * SWAP_META_PAGES * n / object->size + 1; } VM_OBJECT_UNLOCK(object); } } return (count); } /* * GEOM backend * * Swapping onto disk devices. 
* */ static g_orphan_t swapgeom_orphan; static struct g_class g_swap_class = { .name = "SWAP", .version = G_VERSION, .orphan = swapgeom_orphan, }; DECLARE_GEOM_CLASS(g_swap_class, g_class); static void swapgeom_done(struct bio *bp2) { struct buf *bp; bp = bp2->bio_caller2; bp->b_ioflags = bp2->bio_flags; if (bp2->bio_error) bp->b_ioflags |= BIO_ERROR; bp->b_resid = bp->b_bcount - bp2->bio_completed; bp->b_error = bp2->bio_error; bufdone(bp); g_destroy_bio(bp2); } static void swapgeom_strategy(struct buf *bp, struct swdevt *sp) { struct bio *bio; struct g_consumer *cp; cp = sp->sw_id; if (cp == NULL) { bp->b_error = ENXIO; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } bio = g_alloc_bio(); #if 0 /* * XXX: We shouldn't really sleep here when we run out of buffers * XXX: but the alternative is worse right now. */ if (bio == NULL) { bp->b_error = ENOMEM; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } #endif bio->bio_caller2 = bp; bio->bio_cmd = bp->b_iocmd; bio->bio_data = bp->b_data; bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE; bio->bio_length = bp->b_bcount; bio->bio_done = swapgeom_done; g_io_request(bio, cp); return; } static void swapgeom_orphan(struct g_consumer *cp) { struct swdevt *sp; mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) if (sp->sw_id == cp) sp->sw_id = NULL; mtx_unlock(&sw_dev_mtx); } static void swapgeom_close_ev(void *arg, int flags) { struct g_consumer *cp; cp = arg; g_access(cp, -1, -1, 0); g_detach(cp); g_destroy_consumer(cp); } static void swapgeom_close(struct thread *td, struct swdevt *sw) { /* XXX: direct call when Giant untangled */ g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL); } struct swh0h0 { struct cdev *dev; struct vnode *vp; int error; }; static void swapongeom_ev(void *arg, int flags) { struct swh0h0 *swh; struct g_provider *pp; struct g_consumer *cp; static struct g_geom *gp; struct swdevt *sp; u_long nblks; int error; swh = arg; swh->error = 0; pp = g_dev_getprovider(swh->dev); if (pp == NULL) { swh->error = ENODEV; return; } mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { cp = sp->sw_id; if (cp != NULL && cp->provider == pp) { mtx_unlock(&sw_dev_mtx); swh->error = EBUSY; return; } } mtx_unlock(&sw_dev_mtx); if (gp == NULL) gp = g_new_geomf(&g_swap_class, "swap", NULL); cp = g_new_consumer(gp); g_attach(cp, pp); /* * XXX: Everytime you think you can improve the margin for * footshooting, somebody depends on the ability to do so: * savecore(8) wants to write to our swapdev so we cannot * set an exclusive count :-( */ error = g_access(cp, 1, 1, 0); if (error) { g_detach(cp); g_destroy_consumer(cp); swh->error = error; return; } nblks = pp->mediasize / DEV_BSIZE; swaponsomething(swh->vp, cp, nblks, swapgeom_strategy, swapgeom_close, dev2udev(swh->dev)); swh->error = 0; return; } static int swapongeom(struct thread *td, struct vnode *vp) { int error; struct swh0h0 swh; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); swh.dev = vp->v_rdev; swh.vp = vp; swh.error = 0; /* XXX: direct call when Giant untangled */ error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL); if (!error) error = swh.error; VOP_UNLOCK(vp, 0, td); return (error); } /* * VNODE backend * * This is used mainly for network filesystem (read: probably only tested * with NFS) swapfiles. 
* */ static void swapdev_strategy(struct buf *bp, struct swdevt *sp) { struct vnode *vp2; bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first); vp2 = sp->sw_id; vhold(vp2); if (bp->b_iocmd == BIO_WRITE) { if (bp->b_bufobj) bufobj_wdrop(bp->b_bufobj); bufobj_wref(&vp2->v_bufobj); } if (bp->b_bufobj != &vp2->v_bufobj) bp->b_bufobj = &vp2->v_bufobj; bp->b_vp = vp2; bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); return; } static void swapdev_close(struct thread *td, struct swdevt *sp) { VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td); vrele(sp->sw_vp); } static int swaponvp(struct thread *td, struct vnode *vp, u_long nblks) { struct swdevt *sp; int error; if (nblks == 0) return (ENXIO); mtx_lock(&sw_dev_mtx); TAILQ_FOREACH(sp, &swtailq, sw_list) { if (sp->sw_id == vp) { mtx_unlock(&sw_dev_mtx); return (EBUSY); } } mtx_unlock(&sw_dev_mtx); (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); #ifdef MAC error = mac_check_system_swapon(td->td_ucred, vp); if (error == 0) #endif error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, -1); (void) VOP_UNLOCK(vp, 0, td); if (error) return (error); swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close, NODEV); return (0); } Index: head/sys/vm/uma_core.c =================================================================== --- head/sys/vm/uma_core.c (revision 169666) +++ head/sys/vm/uma_core.c (revision 169667) @@ -1,3013 +1,3013 @@ /*- * Copyright (c) 2002, 2003, 2004, 2005 Jeffrey Roberson * Copyright (c) 2004, 2005 Bosko Milekic * Copyright (c) 2004-2006 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * uma_core.c Implementation of the Universal Memory allocator * * This allocator is intended to replace the multitude of similar object caches * in the standard FreeBSD kernel. The intent is to be flexible as well as * effecient. A primary design goal is to return unused memory to the rest of * the system. This will make the system as a whole more flexible due to the * ability to move memory to subsystems which most need it instead of leaving * pools of reserved memory unused. * * The basic ideas stem from similar slab/zone based allocators whose algorithms * are well known. 
* */ /* * TODO: * - Improve memory usage for large allocations * - Investigate cache size adjustments */ #include __FBSDID("$FreeBSD$"); /* I should really use ktr.. */ /* #define UMA_DEBUG 1 #define UMA_DEBUG_ALLOC 1 #define UMA_DEBUG_ALLOC_1 1 */ #include "opt_ddb.h" #include "opt_param.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This is the zone and keg from which all zones are spawned. The idea is that * even the zone & keg heads are allocated from the allocator, so we use the * bss section to bootstrap us. */ static struct uma_keg masterkeg; static struct uma_zone masterzone_k; static struct uma_zone masterzone_z; static uma_zone_t kegs = &masterzone_k; static uma_zone_t zones = &masterzone_z; /* This is the zone from which all of uma_slab_t's are allocated. */ static uma_zone_t slabzone; static uma_zone_t slabrefzone; /* With refcounters (for UMA_ZONE_REFCNT) */ /* * The initial hash tables come out of this zone so they can be allocated * prior to malloc coming up. */ static uma_zone_t hashzone; /* The boot-time adjusted value for cache line alignment. */ static int uma_align_cache = 16 - 1; static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets"); /* * Are we allowed to allocate buckets? */ static int bucketdisable = 1; /* Linked list of all kegs in the system */ static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(&uma_kegs); /* This mutex protects the keg list */ static struct mtx uma_mtx; /* Linked list of boot time pages */ static LIST_HEAD(,uma_slab) uma_boot_pages = LIST_HEAD_INITIALIZER(&uma_boot_pages); /* This mutex protects the boot time pages list */ static struct mtx uma_boot_pages_mtx; /* Is the VM done starting up? */ static int booted = 0; /* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */ static u_int uma_max_ipers; static u_int uma_max_ipers_ref; /* * This is the handle used to schedule events that need to happen * outside of the allocation fast path. */ static struct callout uma_callout; #define UMA_TIMEOUT 20 /* Seconds for callout interval. */ /* * This structure is passed as the zone ctor arg so that I don't have to create * a special allocation function just for zones. */ struct uma_zctor_args { char *name; size_t size; uma_ctor ctor; uma_dtor dtor; uma_init uminit; uma_fini fini; uma_keg_t keg; int align; u_int32_t flags; }; struct uma_kctor_args { uma_zone_t zone; size_t size; uma_init uminit; uma_fini fini; int align; u_int32_t flags; }; struct uma_bucket_zone { uma_zone_t ubz_zone; char *ubz_name; int ubz_entries; }; #define BUCKET_MAX 128 struct uma_bucket_zone bucket_zones[] = { { NULL, "16 Bucket", 16 }, { NULL, "32 Bucket", 32 }, { NULL, "64 Bucket", 64 }, { NULL, "128 Bucket", 128 }, { NULL, NULL, 0} }; #define BUCKET_SHIFT 4 #define BUCKET_ZONES ((BUCKET_MAX >> BUCKET_SHIFT) + 1) /* * bucket_size[] maps requested bucket sizes to zones that allocate a bucket * of approximately the right size. */ static uint8_t bucket_size[BUCKET_ZONES]; /* * Flags and enumerations to be passed to internal functions. */ enum zfreeskip { SKIP_NONE, SKIP_DTOR, SKIP_FINI }; #define ZFREE_STATFAIL 0x00000001 /* Update zone failure statistic. */ #define ZFREE_STATFREE 0x00000002 /* Update zone free statistic. */ /* Prototypes.. 
*/ static void *obj_alloc(uma_zone_t, int, u_int8_t *, int); static void *page_alloc(uma_zone_t, int, u_int8_t *, int); static void *startup_alloc(uma_zone_t, int, u_int8_t *, int); static void page_free(void *, int, u_int8_t); static uma_slab_t slab_zalloc(uma_zone_t, int); static void cache_drain(uma_zone_t); static void bucket_drain(uma_zone_t, uma_bucket_t); static void bucket_cache_drain(uma_zone_t zone); static int keg_ctor(void *, int, void *, int); static void keg_dtor(void *, int, void *); static int zone_ctor(void *, int, void *, int); static void zone_dtor(void *, int, void *); static int zero_init(void *, int, int); static void zone_small_init(uma_zone_t zone); static void zone_large_init(uma_zone_t zone); static void zone_foreach(void (*zfunc)(uma_zone_t)); static void zone_timeout(uma_zone_t zone); static int hash_alloc(struct uma_hash *); static int hash_expand(struct uma_hash *, struct uma_hash *); static void hash_free(struct uma_hash *hash); static void uma_timeout(void *); static void uma_startup3(void); static void *uma_zalloc_internal(uma_zone_t, void *, int); static void uma_zfree_internal(uma_zone_t, void *, void *, enum zfreeskip, int); static void bucket_enable(void); static void bucket_init(void); static uma_bucket_t bucket_alloc(int, int); static void bucket_free(uma_bucket_t); static void bucket_zone_drain(void); static int uma_zalloc_bucket(uma_zone_t zone, int flags); static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags); static void *uma_slab_alloc(uma_zone_t zone, uma_slab_t slab); static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, u_int32_t flags); void uma_print_zone(uma_zone_t); void uma_print_stats(void); static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS); static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS); #ifdef WITNESS static int nosleepwithlocks = 1; #else static int nosleepwithlocks = 0; #endif SYSCTL_INT(_debug, OID_AUTO, nosleepwithlocks, CTLFLAG_RW, &nosleepwithlocks, 0, "Convert M_WAITOK to M_NOWAIT to avoid lock-held-across-sleep paths"); SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL); SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT, 0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones"); SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT, 0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats"); /* * This routine checks to see whether or not it's safe to enable buckets. */ static void bucket_enable(void) { - if (cnt.v_free_count < cnt.v_free_min) + if (VMCNT_GET(free_count) < VMCNT_GET(free_min)) bucketdisable = 1; else bucketdisable = 0; } /* * Initialize bucket_zones, the array of zones of buckets of various sizes. * * For each zone, calculate the memory required for each bucket, consisting * of the header and an array of pointers. Initialize bucket_size[] to point * the range of appropriate bucket sizes at the zone. */ static void bucket_init(void) { struct uma_bucket_zone *ubz; int i; int j; for (i = 0, j = 0; bucket_zones[j].ubz_entries != 0; j++) { int size; ubz = &bucket_zones[j]; size = roundup(sizeof(struct uma_bucket), sizeof(void *)); size += sizeof(void *) * ubz->ubz_entries; ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); for (; i <= ubz->ubz_entries; i += (1 << BUCKET_SHIFT)) bucket_size[i >> BUCKET_SHIFT] = j; } } /* * Given a desired number of entries for a bucket, return the zone from which * to allocate the bucket. 
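 *
 * Worked example (illustrative, using the bucket_zones[] table and
 * BUCKET_SHIFT of 4 defined above): a request for 20 entries gives
 *
 *	idx = howmany(20, 1 << BUCKET_SHIFT) = 2
 *
 * and bucket_init() has pointed bucket_size[2] at the "32 Bucket"
 * zone, so the caller is handed a bucket with room for 32 items.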
*/ static struct uma_bucket_zone * bucket_zone_lookup(int entries) { int idx; idx = howmany(entries, 1 << BUCKET_SHIFT); return (&bucket_zones[bucket_size[idx]]); } static uma_bucket_t bucket_alloc(int entries, int bflags) { struct uma_bucket_zone *ubz; uma_bucket_t bucket; /* * This is to stop us from allocating per cpu buckets while we're * running out of vm.boot_pages. Otherwise, we would exhaust the * boot pages. This also prevents us from allocating buckets in * low memory situations. */ if (bucketdisable) return (NULL); ubz = bucket_zone_lookup(entries); bucket = uma_zalloc_internal(ubz->ubz_zone, NULL, bflags); if (bucket) { #ifdef INVARIANTS bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries); #endif bucket->ub_cnt = 0; bucket->ub_entries = ubz->ubz_entries; } return (bucket); } static void bucket_free(uma_bucket_t bucket) { struct uma_bucket_zone *ubz; ubz = bucket_zone_lookup(bucket->ub_entries); uma_zfree_internal(ubz->ubz_zone, bucket, NULL, SKIP_NONE, ZFREE_STATFREE); } static void bucket_zone_drain(void) { struct uma_bucket_zone *ubz; for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) zone_drain(ubz->ubz_zone); } /* * Routine called by timeout which is used to fire off some time interval * based calculations. (stats, hash size, etc.) * * Arguments: * arg Unused * * Returns: * Nothing */ static void uma_timeout(void *unused) { bucket_enable(); zone_foreach(zone_timeout); /* Reschedule this event */ callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); } /* * Routine to perform timeout driven calculations. This expands the * hashes and does per cpu statistics aggregation. * * Arguments: * zone The zone to operate on * * Returns: * Nothing */ static void zone_timeout(uma_zone_t zone) { uma_keg_t keg; u_int64_t alloc; keg = zone->uz_keg; alloc = 0; /* * Expand the zone hash table. * * This is done if the number of slabs is larger than the hash size. * What I'm trying to do here is completely reduce collisions. This * may be a little aggressive. Should I allow for two collisions max? */ ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH && keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) { struct uma_hash newhash; struct uma_hash oldhash; int ret; /* * This is so involved because allocating and freeing * while the zone lock is held will lead to deadlock. * I have to do everything in stages and check for * races. */ newhash = keg->uk_hash; ZONE_UNLOCK(zone); ret = hash_alloc(&newhash); ZONE_LOCK(zone); if (ret) { if (hash_expand(&keg->uk_hash, &newhash)) { oldhash = keg->uk_hash; keg->uk_hash = newhash; } else oldhash = newhash; ZONE_UNLOCK(zone); hash_free(&oldhash); ZONE_LOCK(zone); } } ZONE_UNLOCK(zone); } /* * Allocate and zero fill the next sized hash table from the appropriate * backing store. * * Arguments: * hash A new hash structure with the old hash size in uh_hashsize * * Returns: * 1 on sucess and 0 on failure. 
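 *
 *	(Illustration of the growth step implemented below: a table whose
 *	 uh_hashsize is currently 64 is reallocated with 128 slots, i.e.
 *	 128 * sizeof(struct slabhead) bytes via malloc(M_UMAHASH,
 *	 M_NOWAIT); only the very first table, sized UMA_HASH_SIZE_INIT,
 *	 comes from hashzone.)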
*/ static int hash_alloc(struct uma_hash *hash) { int oldsize; int alloc; oldsize = hash->uh_hashsize; /* We're just going to go to a power of two greater */ if (oldsize) { hash->uh_hashsize = oldsize * 2; alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize; hash->uh_slab_hash = (struct slabhead *)malloc(alloc, M_UMAHASH, M_NOWAIT); } else { alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT; hash->uh_slab_hash = uma_zalloc_internal(hashzone, NULL, M_WAITOK); hash->uh_hashsize = UMA_HASH_SIZE_INIT; } if (hash->uh_slab_hash) { bzero(hash->uh_slab_hash, alloc); hash->uh_hashmask = hash->uh_hashsize - 1; return (1); } return (0); } /* * Expands the hash table for HASH zones. This is done from zone_timeout * to reduce collisions. This must not be done in the regular allocation * path, otherwise, we can recurse on the vm while allocating pages. * * Arguments: * oldhash The hash you want to expand * newhash The hash structure for the new table * * Returns: * Nothing * * Discussion: */ static int hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash) { uma_slab_t slab; int hval; int i; if (!newhash->uh_slab_hash) return (0); if (oldhash->uh_hashsize >= newhash->uh_hashsize) return (0); /* * I need to investigate hash algorithms for resizing without a * full rehash. */ for (i = 0; i < oldhash->uh_hashsize; i++) while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) { slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]); SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink); hval = UMA_HASH(newhash, slab->us_data); SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval], slab, us_hlink); } return (1); } /* * Free the hash bucket to the appropriate backing store. * * Arguments: * slab_hash The hash bucket we're freeing * hashsize The number of entries in that hash bucket * * Returns: * Nothing */ static void hash_free(struct uma_hash *hash) { if (hash->uh_slab_hash == NULL) return; if (hash->uh_hashsize == UMA_HASH_SIZE_INIT) uma_zfree_internal(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE, ZFREE_STATFREE); else free(hash->uh_slab_hash, M_UMAHASH); } /* * Frees all outstanding items in a bucket * * Arguments: * zone The zone to free to, must be unlocked. * bucket The free/alloc bucket with items, cpu queue must be locked. * * Returns: * Nothing */ static void bucket_drain(uma_zone_t zone, uma_bucket_t bucket) { uma_slab_t slab; int mzone; void *item; if (bucket == NULL) return; slab = NULL; mzone = 0; /* We have to lookup the slab again for malloc.. */ if (zone->uz_keg->uk_flags & UMA_ZONE_MALLOC) mzone = 1; while (bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; KASSERT(item != NULL, ("bucket_drain: botched ptr, item is NULL")); #endif /* * This is extremely inefficient. The slab pointer was passed * to uma_zfree_arg, but we lost it because the buckets don't * hold them. This will go away when free() gets a size passed * to it. */ if (mzone) slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); uma_zfree_internal(zone, item, slab, SKIP_DTOR, 0); } } /* * Drains the per cpu caches for a zone. * * NOTE: This may only be called while the zone is being turn down, and not * during normal operation. This is necessary in order that we do not have * to migrate CPUs to drain the per-CPU caches. * * Arguments: * zone The zone to drain, must be unlocked. 
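 *
 *	(The only caller in this file is zone_dtor(), reached through
 *	uma_zdestroy(); the regular allocation and free paths never
 *	drain the per-CPU caches.)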
* * Returns: * Nothing */ static void cache_drain(uma_zone_t zone) { uma_cache_t cache; int cpu; /* * XXX: It is safe to not lock the per-CPU caches, because we're * tearing down the zone anyway. I.e., there will be no further use * of the caches at this point. * * XXX: It would good to be able to assert that the zone is being * torn down to prevent improper use of cache_drain(). * * XXX: We lock the zone before passing into bucket_cache_drain() as * it is used elsewhere. Should the tear-down path be made special * there in some form? */ for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &zone->uz_cpu[cpu]; bucket_drain(zone, cache->uc_allocbucket); bucket_drain(zone, cache->uc_freebucket); if (cache->uc_allocbucket != NULL) bucket_free(cache->uc_allocbucket); if (cache->uc_freebucket != NULL) bucket_free(cache->uc_freebucket); cache->uc_allocbucket = cache->uc_freebucket = NULL; } ZONE_LOCK(zone); bucket_cache_drain(zone); ZONE_UNLOCK(zone); } /* * Drain the cached buckets from a zone. Expects a locked zone on entry. */ static void bucket_cache_drain(uma_zone_t zone) { uma_bucket_t bucket; /* * Drain the bucket queues and free the buckets, we just keep two per * cpu (alloc/free). */ while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); bucket_drain(zone, bucket); bucket_free(bucket); ZONE_LOCK(zone); } /* Now we do the free queue.. */ while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); bucket_free(bucket); } } /* * Frees pages from a zone back to the system. This is done on demand from * the pageout daemon. * * Arguments: * zone The zone to free pages from * all Should we drain all items? * * Returns: * Nothing. */ void zone_drain(uma_zone_t zone) { struct slabhead freeslabs = { 0 }; uma_keg_t keg; uma_slab_t slab; uma_slab_t n; u_int8_t flags; u_int8_t *mem; int i; keg = zone->uz_keg; /* * We don't want to take pages from statically allocated zones at this * time */ if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL) return; ZONE_LOCK(zone); #ifdef UMA_DEBUG printf("%s free items: %u\n", zone->uz_name, keg->uk_free); #endif bucket_cache_drain(zone); if (keg->uk_free == 0) goto finished; slab = LIST_FIRST(&keg->uk_free_slab); while (slab) { n = LIST_NEXT(slab, us_link); /* We have no where to free these to */ if (slab->us_flags & UMA_SLAB_BOOT) { slab = n; continue; } LIST_REMOVE(slab, us_link); keg->uk_pages -= keg->uk_ppera; keg->uk_free -= keg->uk_ipers; if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data); SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink); slab = n; } finished: ZONE_UNLOCK(zone); while ((slab = SLIST_FIRST(&freeslabs)) != NULL) { SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink); if (keg->uk_fini) for (i = 0; i < keg->uk_ipers; i++) keg->uk_fini( slab->us_data + (keg->uk_rsize * i), keg->uk_size); flags = slab->us_flags; mem = slab->us_data; if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; else obj = NULL; for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); #ifdef UMA_DEBUG printf("%s: Returning %d bytes.\n", zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera); #endif keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); } } /* * Allocate a new slab for a zone. 
This does not insert the slab onto a list. * * Arguments: * zone The zone to allocate slabs for * wait Shall we wait? * * Returns: * The slab that was allocated or NULL if there is no memory and the * caller specified M_NOWAIT. */ static uma_slab_t slab_zalloc(uma_zone_t zone, int wait) { uma_slabrefcnt_t slabref; uma_slab_t slab; uma_keg_t keg; u_int8_t *mem; u_int8_t flags; int i; slab = NULL; keg = zone->uz_keg; #ifdef UMA_DEBUG printf("slab_zalloc: Allocating a new slab for %s\n", zone->uz_name); #endif ZONE_UNLOCK(zone); if (keg->uk_flags & UMA_ZONE_OFFPAGE) { slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait); if (slab == NULL) { ZONE_LOCK(zone); return NULL; } } /* * This reproduces the old vm_zone behavior of zero filling pages the * first time they are added to a zone. * * Malloced items are zeroed in uma_zalloc. */ if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) wait |= M_ZERO; else wait &= ~M_ZERO; mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE, &flags, wait); if (mem == NULL) { if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); ZONE_LOCK(zone); return (NULL); } /* Point the slab into the allocated memory */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) slab = (uma_slab_t )(mem + keg->uk_pgoff); if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) for (i = 0; i < keg->uk_ppera; i++) vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab); slab->us_keg = keg; slab->us_data = mem; slab->us_freecount = keg->uk_ipers; slab->us_firstfree = 0; slab->us_flags = flags; if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; for (i = 0; i < keg->uk_ipers; i++) { slabref->us_freelist[i].us_refcnt = 0; slabref->us_freelist[i].us_item = i+1; } } else { for (i = 0; i < keg->uk_ipers; i++) slab->us_freelist[i].us_item = i+1; } if (keg->uk_init != NULL) { for (i = 0; i < keg->uk_ipers; i++) if (keg->uk_init(slab->us_data + (keg->uk_rsize * i), keg->uk_size, wait) != 0) break; if (i != keg->uk_ipers) { if (keg->uk_fini != NULL) { for (i--; i > -1; i--) keg->uk_fini(slab->us_data + (keg->uk_rsize * i), keg->uk_size); } if ((keg->uk_flags & UMA_ZONE_MALLOC) || (keg->uk_flags & UMA_ZONE_REFCNT)) { vm_object_t obj; if (flags & UMA_SLAB_KMEM) obj = kmem_object; else obj = NULL; for (i = 0; i < keg->uk_ppera; i++) vsetobj((vm_offset_t)mem + (i * PAGE_SIZE), obj); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) uma_zfree_internal(keg->uk_slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags); ZONE_LOCK(zone); return (NULL); } } ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_HASH) UMA_HASH_INSERT(&keg->uk_hash, slab, mem); keg->uk_pages += keg->uk_ppera; keg->uk_free += keg->uk_ipers; return (slab); } /* * This function is intended to be used early on in place of page_alloc() so * that we may use the boot time page cache to satisfy allocations before * the VM is ready. */ static void * startup_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { uma_keg_t keg; uma_slab_t tmps; keg = zone->uz_keg; /* * Check our small startup cache to see if it has pages remaining. */ mtx_lock(&uma_boot_pages_mtx); if ((tmps = LIST_FIRST(&uma_boot_pages)) != NULL) { LIST_REMOVE(tmps, us_link); mtx_unlock(&uma_boot_pages_mtx); *pflag = tmps->us_flags; return (tmps->us_data); } mtx_unlock(&uma_boot_pages_mtx); if (booted == 0) panic("UMA: Increase vm.boot_pages"); /* * Now that we've booted reset these users to their real allocator. 
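 *
 * (If the boot cache above runs dry before uma_startup2() has set
 *  "booted", the panic message points at the loader tunable that sizes
 *  the cache; a hypothetical loader.conf line, value for illustration
 *  only: vm.boot_pages=64.)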
*/ #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; #else keg->uk_allocf = page_alloc; #endif return keg->uk_allocf(zone, bytes, pflag, wait); } /* * Allocates a number of pages from the system * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * page_alloc(uma_zone_t zone, int bytes, u_int8_t *pflag, int wait) { void *p; /* Returned page */ *pflag = UMA_SLAB_KMEM; p = (void *) kmem_malloc(kmem_map, bytes, wait); return (p); } /* * Allocates a number of pages from within an object * * Arguments: * zone Unused * bytes The number of bytes requested * wait Shall we wait? * * Returns: * A pointer to the alloced memory or possibly * NULL if M_NOWAIT is set. */ static void * obj_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait) { vm_object_t object; vm_offset_t retkva, zkva; vm_page_t p; int pages, startpages; object = zone->uz_keg->uk_obj; retkva = 0; /* * This looks a little weird since we're getting one page at a time. */ VM_OBJECT_LOCK(object); p = TAILQ_LAST(&object->memq, pglist); pages = p != NULL ? p->pindex + 1 : 0; startpages = pages; zkva = zone->uz_keg->uk_kva + pages * PAGE_SIZE; for (; bytes > 0; bytes -= PAGE_SIZE) { p = vm_page_alloc(object, pages, VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED); if (p == NULL) { if (pages != startpages) pmap_qremove(retkva, pages - startpages); while (pages != startpages) { pages--; p = TAILQ_LAST(&object->memq, pglist); vm_page_lock_queues(); vm_page_unwire(p, 0); vm_page_free(p); vm_page_unlock_queues(); } retkva = 0; goto done; } pmap_qenter(zkva, &p, 1); if (retkva == 0) retkva = zkva; zkva += PAGE_SIZE; pages += 1; } done: VM_OBJECT_UNLOCK(object); *flags = UMA_SLAB_PRIV; return ((void *)retkva); } /* * Frees a number of pages to the system * * Arguments: * mem A pointer to the memory to be freed * size The size of the memory being freed * flags The original p->us_flags field * * Returns: * Nothing */ static void page_free(void *mem, int size, u_int8_t flags) { vm_map_t map; if (flags & UMA_SLAB_KMEM) map = kmem_map; else panic("UMA: page_free used with invalid flags %d\n", flags); kmem_free(map, (vm_offset_t)mem, size); } /* * Zero fill initializer * * Arguments/Returns follow uma_init specifications */ static int zero_init(void *mem, int size, int flags) { bzero(mem, size); return (0); } /* * Finish creating a small uma zone. This calculates ipers, and the zone size. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_small_init(uma_zone_t zone) { uma_keg_t keg; u_int rsize; u_int memused; u_int wastedspace; u_int shsize; keg = zone->uz_keg; KASSERT(keg != NULL, ("Keg is null in zone_small_init")); rsize = keg->uk_size; if (rsize < UMA_SMALLEST_UNIT) rsize = UMA_SMALLEST_UNIT; if (rsize & keg->uk_align) rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1); keg->uk_rsize = rsize; keg->uk_ppera = 1; if (keg->uk_flags & UMA_ZONE_REFCNT) { rsize += UMA_FRITMREF_SZ; /* linkage & refcnt */ shsize = sizeof(struct uma_slab_refcnt); } else { rsize += UMA_FRITM_SZ; /* Account for linkage */ shsize = sizeof(struct uma_slab); } keg->uk_ipers = (UMA_SLAB_SIZE - shsize) / rsize; KASSERT(keg->uk_ipers != 0, ("zone_small_init: ipers is 0")); memused = keg->uk_ipers * rsize + shsize; wastedspace = UMA_SLAB_SIZE - memused; /* * We can't do OFFPAGE if we're internal or if we've been * asked to not go to the VM for buckets. 
If we do this we * may end up going to the VM (kmem_map) for slabs which we * do not want to do if we're UMA_ZFLAG_CACHEONLY as a * result of UMA_ZONE_VM, which clearly forbids it. */ if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) || (keg->uk_flags & UMA_ZFLAG_CACHEONLY)) return; if ((wastedspace >= UMA_MAX_WASTE) && (keg->uk_ipers < (UMA_SLAB_SIZE / keg->uk_rsize))) { keg->uk_ipers = UMA_SLAB_SIZE / keg->uk_rsize; KASSERT(keg->uk_ipers <= 255, ("zone_small_init: keg->uk_ipers too high!")); #ifdef UMA_DEBUG printf("UMA decided we need offpage slab headers for " "zone: %s, calculated wastedspace = %d, " "maximum wasted space allowed = %d, " "calculated ipers = %d, " "new wasted space = %d\n", zone->uz_name, wastedspace, UMA_MAX_WASTE, keg->uk_ipers, UMA_SLAB_SIZE - keg->uk_ipers * keg->uk_rsize); #endif keg->uk_flags |= UMA_ZONE_OFFPAGE; if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) keg->uk_flags |= UMA_ZONE_HASH; } } /* * Finish creating a large (> UMA_SLAB_SIZE) uma zone. Just give in and do * OFFPAGE for now. When I can allow for more dynamic slab sizes this will be * more complicated. * * Arguments * zone The zone we should initialize * * Returns * Nothing */ static void zone_large_init(uma_zone_t zone) { uma_keg_t keg; int pages; keg = zone->uz_keg; KASSERT(keg != NULL, ("Keg is null in zone_large_init")); KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0, ("zone_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY zone")); pages = keg->uk_size / UMA_SLAB_SIZE; /* Account for remainder */ if ((pages * UMA_SLAB_SIZE) < keg->uk_size) pages++; keg->uk_ppera = pages; keg->uk_ipers = 1; keg->uk_flags |= UMA_ZONE_OFFPAGE; if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0) keg->uk_flags |= UMA_ZONE_HASH; keg->uk_rsize = keg->uk_size; } /* * Keg header ctor. This initializes all fields, locks, etc. And inserts * the keg onto the global keg list. * * Arguments/Returns follow uma_ctor specifications * udata Actually uma_kctor_args */ static int keg_ctor(void *mem, int size, void *udata, int flags) { struct uma_kctor_args *arg = udata; uma_keg_t keg = mem; uma_zone_t zone; bzero(keg, size); keg->uk_size = arg->size; keg->uk_init = arg->uminit; keg->uk_fini = arg->fini; keg->uk_align = arg->align; keg->uk_free = 0; keg->uk_pages = 0; keg->uk_flags = arg->flags; keg->uk_allocf = page_alloc; keg->uk_freef = page_free; keg->uk_recurse = 0; keg->uk_slabzone = NULL; /* * The master zone is passed to us at keg-creation time. */ zone = arg->zone; zone->uz_keg = keg; if (arg->flags & UMA_ZONE_VM) keg->uk_flags |= UMA_ZFLAG_CACHEONLY; if (arg->flags & UMA_ZONE_ZINIT) keg->uk_init = zero_init; /* * The +UMA_FRITM_SZ added to uk_size is to account for the * linkage that is added to the size in zone_small_init(). If * we don't account for this here then we may end up in * zone_small_init() with a calculated 'ipers' of 0. */ if (keg->uk_flags & UMA_ZONE_REFCNT) { if ((keg->uk_size+UMA_FRITMREF_SZ) > (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt))) zone_large_init(zone); else zone_small_init(zone); } else { if ((keg->uk_size+UMA_FRITM_SZ) > (UMA_SLAB_SIZE - sizeof(struct uma_slab))) zone_large_init(zone); else zone_small_init(zone); } if (keg->uk_flags & UMA_ZONE_OFFPAGE) { if (keg->uk_flags & UMA_ZONE_REFCNT) keg->uk_slabzone = slabrefzone; else keg->uk_slabzone = slabzone; } /* * If we haven't booted yet we need allocations to go through the * startup cache until the vm is ready. 
*/ if (keg->uk_ppera == 1) { #ifdef UMA_MD_SMALL_ALLOC keg->uk_allocf = uma_small_alloc; keg->uk_freef = uma_small_free; #endif if (booted == 0) keg->uk_allocf = startup_alloc; } /* * Initialize keg's lock (shared among zones) through * Master zone */ zone->uz_lock = &keg->uk_lock; if (arg->flags & UMA_ZONE_MTXCLASS) ZONE_LOCK_INIT(zone, 1); else ZONE_LOCK_INIT(zone, 0); /* * If we're putting the slab header in the actual page we need to * figure out where in each page it goes. This calculates a right * justified offset into the memory on an ALIGN_PTR boundary. */ if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) { u_int totsize; /* Size of the slab struct and free list */ if (keg->uk_flags & UMA_ZONE_REFCNT) totsize = sizeof(struct uma_slab_refcnt) + keg->uk_ipers * UMA_FRITMREF_SZ; else totsize = sizeof(struct uma_slab) + keg->uk_ipers * UMA_FRITM_SZ; if (totsize & UMA_ALIGN_PTR) totsize = (totsize & ~UMA_ALIGN_PTR) + (UMA_ALIGN_PTR + 1); keg->uk_pgoff = UMA_SLAB_SIZE - totsize; if (keg->uk_flags & UMA_ZONE_REFCNT) totsize = keg->uk_pgoff + sizeof(struct uma_slab_refcnt) + keg->uk_ipers * UMA_FRITMREF_SZ; else totsize = keg->uk_pgoff + sizeof(struct uma_slab) + keg->uk_ipers * UMA_FRITM_SZ; /* * The only way the following is possible is if with our * UMA_ALIGN_PTR adjustments we are now bigger than * UMA_SLAB_SIZE. I haven't checked whether this is * mathematically possible for all cases, so we make * sure here anyway. */ if (totsize > UMA_SLAB_SIZE) { printf("zone %s ipers %d rsize %d size %d\n", zone->uz_name, keg->uk_ipers, keg->uk_rsize, keg->uk_size); panic("UMA slab won't fit.\n"); } } if (keg->uk_flags & UMA_ZONE_HASH) hash_alloc(&keg->uk_hash); #ifdef UMA_DEBUG printf("%s(%p) size = %d ipers = %d ppera = %d pgoff = %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_ipers, keg->uk_ppera, keg->uk_pgoff); #endif LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link); mtx_lock(&uma_mtx); LIST_INSERT_HEAD(&uma_kegs, keg, uk_link); mtx_unlock(&uma_mtx); return (0); } /* * Zone header ctor. This initializes all fields, locks, etc. 
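 *
 * A minimal sketch of how a consumer reaches this ctor (hypothetical
 * zone name and structure; uma_zcreate() below packs these arguments
 * into uma_zctor_args and allocates from the "UMA Zones" zone):
 *
 *	static uma_zone_t foo_zone;
 *
 *	foo_zone = uma_zcreate("foo", sizeof(struct foo),
 *	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);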
* * Arguments/Returns follow uma_ctor specifications * udata Actually uma_zctor_args */ static int zone_ctor(void *mem, int size, void *udata, int flags) { struct uma_zctor_args *arg = udata; uma_zone_t zone = mem; uma_zone_t z; uma_keg_t keg; bzero(zone, size); zone->uz_name = arg->name; zone->uz_ctor = arg->ctor; zone->uz_dtor = arg->dtor; zone->uz_init = NULL; zone->uz_fini = NULL; zone->uz_allocs = 0; zone->uz_frees = 0; zone->uz_fails = 0; zone->uz_fills = zone->uz_count = 0; if (arg->flags & UMA_ZONE_SECONDARY) { KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg")); keg = arg->keg; zone->uz_keg = keg; zone->uz_init = arg->uminit; zone->uz_fini = arg->fini; zone->uz_lock = &keg->uk_lock; mtx_lock(&uma_mtx); ZONE_LOCK(zone); keg->uk_flags |= UMA_ZONE_SECONDARY; LIST_FOREACH(z, &keg->uk_zones, uz_link) { if (LIST_NEXT(z, uz_link) == NULL) { LIST_INSERT_AFTER(z, zone, uz_link); break; } } ZONE_UNLOCK(zone); mtx_unlock(&uma_mtx); } else if (arg->keg == NULL) { if (uma_kcreate(zone, arg->size, arg->uminit, arg->fini, arg->align, arg->flags) == NULL) return (ENOMEM); } else { struct uma_kctor_args karg; int error; /* We should only be here from uma_startup() */ karg.size = arg->size; karg.uminit = arg->uminit; karg.fini = arg->fini; karg.align = arg->align; karg.flags = arg->flags; karg.zone = zone; error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg, flags); if (error) return (error); } keg = zone->uz_keg; zone->uz_lock = &keg->uk_lock; /* * Some internal zones don't have room allocated for the per cpu * caches. If we're internal, bail out here. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL) { KASSERT((keg->uk_flags & UMA_ZONE_SECONDARY) == 0, ("Secondary zone requested UMA_ZFLAG_INTERNAL")); return (0); } if (keg->uk_flags & UMA_ZONE_MAXBUCKET) zone->uz_count = BUCKET_MAX; else if (keg->uk_ipers <= BUCKET_MAX) zone->uz_count = keg->uk_ipers; else zone->uz_count = BUCKET_MAX; return (0); } /* * Keg header dtor. This frees all data, destroys locks, frees the hash * table and removes the keg from the global list. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void keg_dtor(void *arg, int size, void *udata) { uma_keg_t keg; keg = (uma_keg_t)arg; mtx_lock(&keg->uk_lock); if (keg->uk_free != 0) { printf("Freed UMA keg was not empty (%d items). " " Lost %d pages of memory.\n", keg->uk_free, keg->uk_pages); } mtx_unlock(&keg->uk_lock); if (keg->uk_flags & UMA_ZONE_HASH) hash_free(&keg->uk_hash); mtx_destroy(&keg->uk_lock); } /* * Zone header dtor. * * Arguments/Returns follow uma_dtor specifications * udata unused */ static void zone_dtor(void *arg, int size, void *udata) { uma_zone_t zone; uma_keg_t keg; zone = (uma_zone_t)arg; keg = zone->uz_keg; if (!(keg->uk_flags & UMA_ZFLAG_INTERNAL)) cache_drain(zone); mtx_lock(&uma_mtx); zone_drain(zone); if (keg->uk_flags & UMA_ZONE_SECONDARY) { LIST_REMOVE(zone, uz_link); /* * XXX there are some races here where * the zone can be drained but zone lock * released and then refilled before we * remove it... we dont care for now */ ZONE_LOCK(zone); if (LIST_EMPTY(&keg->uk_zones)) keg->uk_flags &= ~UMA_ZONE_SECONDARY; ZONE_UNLOCK(zone); mtx_unlock(&uma_mtx); } else { LIST_REMOVE(keg, uk_link); LIST_REMOVE(zone, uz_link); mtx_unlock(&uma_mtx); uma_zfree_internal(kegs, keg, NULL, SKIP_NONE, ZFREE_STATFREE); } zone->uz_keg = NULL; } /* * Traverses every zone in the system and calls a callback * * Arguments: * zfunc A pointer to a function which accepts a zone * as an argument. 
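 *
 *	(Callers in this file: zone_foreach(zone_timeout) from
 *	uma_timeout(), zone_foreach(zone_drain) from uma_reclaim(),
 *	and zone_foreach(uma_print_zone) from uma_print_stats().)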
* * Returns: * Nothing */ static void zone_foreach(void (*zfunc)(uma_zone_t)) { uma_keg_t keg; uma_zone_t zone; mtx_lock(&uma_mtx); LIST_FOREACH(keg, &uma_kegs, uk_link) { LIST_FOREACH(zone, &keg->uk_zones, uz_link) zfunc(zone); } mtx_unlock(&uma_mtx); } /* Public functions */ /* See uma.h */ void uma_startup(void *bootmem, int boot_pages) { struct uma_zctor_args args; uma_slab_t slab; u_int slabsize; u_int objsize, totsize, wsize; int i; #ifdef UMA_DEBUG printf("Creating uma keg headers zone and keg.\n"); #endif mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF); /* * Figure out the maximum number of items-per-slab we'll have if * we're using the OFFPAGE slab header to track free items, given * all possible object sizes and the maximum desired wastage * (UMA_MAX_WASTE). * * We iterate until we find an object size for * which the calculated wastage in zone_small_init() will be * enough to warrant OFFPAGE. Since wastedspace versus objsize * is an overall increasing see-saw function, we find the smallest * objsize such that the wastage is always acceptable for objects * with that objsize or smaller. Since a smaller objsize always * generates a larger possible uma_max_ipers, we use this computed * objsize to calculate the largest ipers possible. Since the * ipers calculated for OFFPAGE slab headers is always larger than * the ipers initially calculated in zone_small_init(), we use * the former's equation (UMA_SLAB_SIZE / keg->uk_rsize) to * obtain the maximum ipers possible for offpage slab headers. * * It should be noted that ipers versus objsize is an inversly * proportional function which drops off rather quickly so as * long as our UMA_MAX_WASTE is such that the objsize we calculate * falls into the portion of the inverse relation AFTER the steep * falloff, then uma_max_ipers shouldn't be too high (~10 on i386). * * Note that we have 8-bits (1 byte) to use as a freelist index * inside the actual slab header itself and this is enough to * accomodate us. In the worst case, a UMA_SMALLEST_UNIT sized * object with offpage slab header would have ipers = * UMA_SLAB_SIZE / UMA_SMALLEST_UNIT (currently = 256), which is * 1 greater than what our byte-integer freelist index can * accomodate, but we know that this situation never occurs as * for UMA_SMALLEST_UNIT-sized objects, we will never calculate * that we need to go to offpage slab headers. Or, if we do, * then we trap that condition below and panic in the INVARIANTS case. 
*/ wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab) - UMA_MAX_WASTE; totsize = wsize; objsize = UMA_SMALLEST_UNIT; while (totsize >= wsize) { totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab)) / (objsize + UMA_FRITM_SZ); totsize *= (UMA_FRITM_SZ + objsize); objsize++; } if (objsize > UMA_SMALLEST_UNIT) objsize--; uma_max_ipers = UMA_SLAB_SIZE / objsize; wsize = UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) - UMA_MAX_WASTE; totsize = wsize; objsize = UMA_SMALLEST_UNIT; while (totsize >= wsize) { totsize = (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt)) / (objsize + UMA_FRITMREF_SZ); totsize *= (UMA_FRITMREF_SZ + objsize); objsize++; } if (objsize > UMA_SMALLEST_UNIT) objsize--; uma_max_ipers_ref = UMA_SLAB_SIZE / objsize; KASSERT((uma_max_ipers_ref <= 255) && (uma_max_ipers <= 255), ("uma_startup: calculated uma_max_ipers values too large!")); #ifdef UMA_DEBUG printf("Calculated uma_max_ipers (for OFFPAGE) is %d\n", uma_max_ipers); printf("Calculated uma_max_ipers_slab (for OFFPAGE) is %d\n", uma_max_ipers_ref); #endif /* "manually" create the initial zone */ args.name = "UMA Kegs"; args.size = sizeof(struct uma_keg); args.ctor = keg_ctor; args.dtor = keg_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = &masterkeg; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Filling boot free list.\n"); #endif for (i = 0; i < boot_pages; i++) { slab = (uma_slab_t)((u_int8_t *)bootmem + (i * UMA_SLAB_SIZE)); slab->us_data = (u_int8_t *)slab; slab->us_flags = UMA_SLAB_BOOT; LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link); } mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF); #ifdef UMA_DEBUG printf("Creating uma zone headers zone and keg.\n"); #endif args.name = "UMA Zones"; args.size = sizeof(struct uma_zone) + (sizeof(struct uma_cache) * (mp_maxid + 1)); args.ctor = zone_ctor; args.dtor = zone_dtor; args.uminit = zero_init; args.fini = NULL; args.keg = NULL; args.align = 32 - 1; args.flags = UMA_ZFLAG_INTERNAL; /* The initial zone has no Per cpu queues so it's smaller */ zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK); #ifdef UMA_DEBUG printf("Initializing pcpu cache locks.\n"); #endif #ifdef UMA_DEBUG printf("Creating slab and hash zones.\n"); #endif /* * This is the max number of free list items we'll have with * offpage slabs. */ slabsize = uma_max_ipers * UMA_FRITM_SZ; slabsize += sizeof(struct uma_slab); /* Now make a zone for slab headers */ slabzone = uma_zcreate("UMA Slabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); /* * We also create a zone for the bigger slabs with reference * counts in them, to accomodate UMA_ZONE_REFCNT zones. 
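 *
 * (Consumers of UMA_ZONE_REFCNT look the per-item counter up with
 *  uma_find_refcnt(), defined later in this file; the mbuf cluster
 *  zone is a typical example of such a consumer.)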
*/ slabsize = uma_max_ipers_ref * UMA_FRITMREF_SZ; slabsize += sizeof(struct uma_slab_refcnt); slabrefzone = uma_zcreate("UMA RCntSlabs", slabsize, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); hashzone = uma_zcreate("UMA Hash", sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL); bucket_init(); #ifdef UMA_MD_SMALL_ALLOC booted = 1; #endif #ifdef UMA_DEBUG printf("UMA startup complete.\n"); #endif } /* see uma.h */ void uma_startup2(void) { booted = 1; bucket_enable(); #ifdef UMA_DEBUG printf("UMA startup2 complete.\n"); #endif } /* * Initialize our callout handle * */ static void uma_startup3(void) { #ifdef UMA_DEBUG printf("Starting callout.\n"); #endif callout_init(&uma_callout, CALLOUT_MPSAFE); callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL); #ifdef UMA_DEBUG printf("UMA startup3 complete.\n"); #endif } static uma_zone_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini, int align, u_int32_t flags) { struct uma_kctor_args args; args.size = size; args.uminit = uminit; args.fini = fini; args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align; args.flags = flags; args.zone = zone; return (uma_zalloc_internal(kegs, &args, M_WAITOK)); } /* See uma.h */ void uma_set_align(int align) { if (align != UMA_ALIGN_CACHE) uma_align_cache = align; } /* See uma.h */ uma_zone_t uma_zcreate(char *name, size_t size, uma_ctor ctor, uma_dtor dtor, uma_init uminit, uma_fini fini, int align, u_int32_t flags) { struct uma_zctor_args args; /* This stuff is essential for the zone ctor */ args.name = name; args.size = size; args.ctor = ctor; args.dtor = dtor; args.uminit = uminit; args.fini = fini; args.align = align; args.flags = flags; args.keg = NULL; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ uma_zone_t uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor, uma_init zinit, uma_fini zfini, uma_zone_t master) { struct uma_zctor_args args; args.name = name; args.size = master->uz_keg->uk_size; args.ctor = ctor; args.dtor = dtor; args.uminit = zinit; args.fini = zfini; args.align = master->uz_keg->uk_align; args.flags = master->uz_keg->uk_flags | UMA_ZONE_SECONDARY; args.keg = master->uz_keg; return (uma_zalloc_internal(zones, &args, M_WAITOK)); } /* See uma.h */ void uma_zdestroy(uma_zone_t zone) { uma_zfree_internal(zones, zone, NULL, SKIP_NONE, ZFREE_STATFREE); } /* See uma.h */ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { void *item; uma_cache_t cache; uma_bucket_t bucket; int cpu; /* This is the fast path allocation */ #ifdef UMA_DEBUG_ALLOC_1 printf("Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread, zone->uz_name, flags); if (flags & M_WAITOK) { WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "uma_zalloc_arg: zone \"%s\"", zone->uz_name); } /* * If possible, allocate from the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to allocate from * the current cache; when we re-acquire the critical section, we * must detect and handle migration if it has occurred. 
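 *
 * A sketch of the usual entry point (hypothetical zone; the wrapper
 * lives in uma.h):
 *
 *	item = uma_zalloc(foo_zone, M_NOWAIT);
 *
 * which simply expands to uma_zalloc_arg(foo_zone, NULL, M_NOWAIT).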
*/ zalloc_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zalloc_start: bucket = cache->uc_allocbucket; if (bucket) { if (bucket->ub_cnt > 0) { bucket->ub_cnt--; item = bucket->ub_bucket[bucket->ub_cnt]; #ifdef INVARIANTS bucket->ub_bucket[bucket->ub_cnt] = NULL; #endif KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled.")); cache->uc_allocs++; critical_exit(); #ifdef INVARIANTS ZONE_LOCK(zone); uma_dbg_alloc(zone, NULL, item); ZONE_UNLOCK(zone); #endif if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, zone->uz_keg->uk_size, udata, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (flags & M_ZERO) bzero(item, zone->uz_keg->uk_size); return (item); } else if (cache->uc_freebucket) { /* * We have run out of items in our allocbucket. * See if we can switch with our free bucket. */ if (cache->uc_freebucket->ub_cnt > 0) { #ifdef UMA_DEBUG_ALLOC printf("uma_zalloc: Swapping empty with" " alloc.\n"); #endif bucket = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zalloc_start; } } } /* * Attempt to retrieve the item from the per-CPU cache has failed, so * we must go back to the zone. This requires the zone lock, so we * must drop the critical section, then re-acquire it when we go back * to the cache. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. */ critical_exit(); ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; bucket = cache->uc_allocbucket; if (bucket != NULL) { if (bucket->ub_cnt > 0) { ZONE_UNLOCK(zone); goto zalloc_start; } bucket = cache->uc_freebucket; if (bucket != NULL && bucket->ub_cnt > 0) { ZONE_UNLOCK(zone); goto zalloc_start; } } /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; zone->uz_frees += cache->uc_frees; cache->uc_frees = 0; /* Our old one is now a free bucket */ if (cache->uc_allocbucket) { KASSERT(cache->uc_allocbucket->ub_cnt == 0, ("uma_zalloc_arg: Freeing a non free bucket.")); LIST_INSERT_HEAD(&zone->uz_free_bucket, cache->uc_allocbucket, ub_link); cache->uc_allocbucket = NULL; } /* Check the free list for a new alloc bucket */ if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) { KASSERT(bucket->ub_cnt != 0, ("uma_zalloc_arg: Returning an empty bucket.")); LIST_REMOVE(bucket, ub_link); cache->uc_allocbucket = bucket; ZONE_UNLOCK(zone); goto zalloc_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* Bump up our uz_count so we get here less */ if (zone->uz_count < BUCKET_MAX) zone->uz_count++; /* * Now lets just fill a bucket and put it on the free list. If that * works we'll restart the allocation from the begining. */ if (uma_zalloc_bucket(zone, flags)) { ZONE_UNLOCK(zone); goto zalloc_restart; } ZONE_UNLOCK(zone); /* * We may not be able to get a bucket so return an actual item. */ #ifdef UMA_DEBUG printf("uma_zalloc_arg: Bucketzone returned NULL\n"); #endif return (uma_zalloc_internal(zone, udata, flags)); } static uma_slab_t uma_zone_slab(uma_zone_t zone, int flags) { uma_slab_t slab; uma_keg_t keg; keg = zone->uz_keg; /* * This is to prevent us from recursively trying to allocate * buckets. 
The problem is that if an allocation forces us to * grab a new bucket we will call page_alloc, which will go off * and cause the vm to allocate vm_map_entries. If we need new * buckets there too we will recurse in kmem_alloc and bad * things happen. So instead we return a NULL bucket, and make * the code that allocates buckets smart enough to deal with it * * XXX: While we want this protection for the bucket zones so that * recursion from the VM is handled (and the calling code that * allocates buckets knows how to deal with it), we do not want * to prevent allocation from the slab header zones (slabzone * and slabrefzone) if uk_recurse is not zero for them. The * reason is that it could lead to NULL being returned for * slab header allocations even in the M_WAITOK case, and the * caller can't handle that. */ if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0) if ((zone != slabzone) && (zone != slabrefzone)) return (NULL); slab = NULL; for (;;) { /* * Find a slab with some space. Prefer slabs that are partially * used over those that are totally full. This helps to reduce * fragmentation. */ if (keg->uk_free != 0) { if (!LIST_EMPTY(&keg->uk_part_slab)) { slab = LIST_FIRST(&keg->uk_part_slab); } else { slab = LIST_FIRST(&keg->uk_free_slab); LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } return (slab); } /* * M_NOVM means don't ask at all! */ if (flags & M_NOVM) break; if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) { keg->uk_flags |= UMA_ZFLAG_FULL; if (flags & M_NOWAIT) break; else msleep(keg, &keg->uk_lock, PVM, "zonelimit", 0); continue; } keg->uk_recurse++; slab = slab_zalloc(zone, flags); keg->uk_recurse--; /* * If we got a slab here it's safe to mark it partially used * and return. We assume that the caller is going to remove * at least one item. */ if (slab) { LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); return (slab); } /* * We might not have been able to get a slab but another cpu * could have while we were unlocked. Check again before we * fail. */ if (flags & M_NOWAIT) flags |= M_NOVM; } return (slab); } static void * uma_slab_alloc(uma_zone_t zone, uma_slab_t slab) { uma_keg_t keg; uma_slabrefcnt_t slabref; void *item; u_int8_t freei; keg = zone->uz_keg; freei = slab->us_firstfree; if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; slab->us_firstfree = slabref->us_freelist[freei].us_item; } else { slab->us_firstfree = slab->us_freelist[freei].us_item; } item = slab->us_data + (keg->uk_rsize * freei); slab->us_freecount--; keg->uk_free--; #ifdef INVARIANTS uma_dbg_alloc(zone, slab, item); #endif /* Move this slab to the full list */ if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link); } return (item); } static int uma_zalloc_bucket(uma_zone_t zone, int flags) { uma_bucket_t bucket; uma_slab_t slab; int16_t saved; int max, origflags = flags; /* * Try this zone's free list first so we don't allocate extra buckets. 
*/ if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { KASSERT(bucket->ub_cnt == 0, ("uma_zalloc_bucket: Bucket on free list is not empty.")); LIST_REMOVE(bucket, ub_link); } else { int bflags; bflags = (flags & ~M_ZERO); if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; ZONE_UNLOCK(zone); bucket = bucket_alloc(zone->uz_count, bflags); ZONE_LOCK(zone); } if (bucket == NULL) return (0); #ifdef SMP /* * This code is here to limit the number of simultaneous bucket fills * for any given zone to the number of per cpu caches in this zone. This * is done so that we don't allocate more memory than we really need. */ if (zone->uz_fills >= mp_ncpus) goto done; #endif zone->uz_fills++; max = MIN(bucket->ub_entries, zone->uz_count); /* Try to keep the buckets totally full */ saved = bucket->ub_cnt; while (bucket->ub_cnt < max && (slab = uma_zone_slab(zone, flags)) != NULL) { while (slab->us_freecount && bucket->ub_cnt < max) { bucket->ub_bucket[bucket->ub_cnt++] = uma_slab_alloc(zone, slab); } /* Don't block on the next fill */ flags |= M_NOWAIT; } /* * We unlock here because we need to call the zone's init. * It should be safe to unlock because the slab dealt with * above is already on the appropriate list within the keg * and the bucket we filled is not yet on any list, so we * own it. */ if (zone->uz_init != NULL) { int i; ZONE_UNLOCK(zone); for (i = saved; i < bucket->ub_cnt; i++) if (zone->uz_init(bucket->ub_bucket[i], zone->uz_keg->uk_size, origflags) != 0) break; /* * If we couldn't initialize the whole bucket, put the * rest back onto the freelist. */ if (i != bucket->ub_cnt) { int j; for (j = i; j < bucket->ub_cnt; j++) { uma_zfree_internal(zone, bucket->ub_bucket[j], NULL, SKIP_FINI, 0); #ifdef INVARIANTS bucket->ub_bucket[j] = NULL; #endif } bucket->ub_cnt = i; } ZONE_LOCK(zone); } zone->uz_fills--; if (bucket->ub_cnt != 0) { LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); return (1); } #ifdef SMP done: #endif bucket_free(bucket); return (0); } /* * Allocates an item for an internal zone * * Arguments * zone The zone to alloc for. * udata The data to be passed to the constructor. * flags M_WAITOK, M_NOWAIT, M_ZERO. * * Returns * NULL if there is no memory and M_NOWAIT is set * An item if successful */ static void * uma_zalloc_internal(uma_zone_t zone, void *udata, int flags) { uma_keg_t keg; uma_slab_t slab; void *item; item = NULL; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone); #endif ZONE_LOCK(zone); slab = uma_zone_slab(zone, flags); if (slab == NULL) { zone->uz_fails++; ZONE_UNLOCK(zone); return (NULL); } item = uma_slab_alloc(zone, slab); zone->uz_allocs++; ZONE_UNLOCK(zone); /* * We have to call both the zone's init (not the keg's init) * and the zone's ctor. This is because the item is going from * a keg slab directly to the user, and the user is expecting it * to be both zone-init'd as well as zone-ctor'd. 
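 *
 * (For a secondary zone created with uma_zsecond_create(), uz_init and
 *  uz_fini are the zinit/zfini arguments, distinct from the keg's
 *  uk_init/uk_fini, which run when slab_zalloc() populates a slab.)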
*/ if (zone->uz_init != NULL) { if (zone->uz_init(item, keg->uk_size, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_FINI, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (zone->uz_ctor != NULL) { if (zone->uz_ctor(item, keg->uk_size, udata, flags) != 0) { uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFAIL | ZFREE_STATFREE); return (NULL); } } if (flags & M_ZERO) bzero(item, keg->uk_size); return (item); } /* See uma.h */ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) { uma_keg_t keg; uma_cache_t cache; uma_bucket_t bucket; int bflags; int cpu; keg = zone->uz_keg; #ifdef UMA_DEBUG_ALLOC_1 printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone); #endif CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread, zone->uz_name); if (zone->uz_dtor) zone->uz_dtor(item, keg->uk_size, udata); #ifdef INVARIANTS ZONE_LOCK(zone); if (keg->uk_flags & UMA_ZONE_MALLOC) uma_dbg_free(zone, udata, item); else uma_dbg_free(zone, NULL, item); ZONE_UNLOCK(zone); #endif /* * The race here is acceptable. If we miss it we'll just have to wait * a little longer for the limits to be reset. */ if (keg->uk_flags & UMA_ZFLAG_FULL) goto zfree_internal; /* * If possible, free to the per-CPU cache. There are two * requirements for safe access to the per-CPU cache: (1) the thread * accessing the cache must not be preempted or yield during access, * and (2) the thread must not migrate CPUs without switching which * cache it accesses. We rely on a critical section to prevent * preemption and migration. We release the critical section in * order to acquire the zone mutex if we are unable to free to the * current cache; when we re-acquire the critical section, we must * detect and handle migration if it has occurred. */ zfree_restart: critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; zfree_start: bucket = cache->uc_freebucket; if (bucket) { /* * Do we have room in our bucket? It is OK for this uz count * check to be slightly out of sync. */ if (bucket->ub_cnt < bucket->ub_entries) { KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL, ("uma_zfree: Freeing to non free bucket index.")); bucket->ub_bucket[bucket->ub_cnt] = item; bucket->ub_cnt++; cache->uc_frees++; critical_exit(); return; } else if (cache->uc_allocbucket) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Swapping buckets.\n"); #endif /* * We have run out of space in our freebucket. * See if we can switch with our alloc bucket. */ if (cache->uc_allocbucket->ub_cnt < cache->uc_freebucket->ub_cnt) { bucket = cache->uc_freebucket; cache->uc_freebucket = cache->uc_allocbucket; cache->uc_allocbucket = bucket; goto zfree_start; } } } /* * We can get here for two reasons: * * 1) The buckets are NULL * 2) The alloc and free buckets are both somewhat full. * * We must go back the zone, which requires acquiring the zone lock, * which in turn means we must release and re-acquire the critical * section. Since the critical section is released, we may be * preempted or migrate. As such, make sure not to maintain any * thread-local state specific to the cache from prior to releasing * the critical section. 
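 *
 * As on the allocation side, callers normally arrive through the
 * uma.h wrapper (hypothetical zone):
 *
 *	uma_zfree(foo_zone, item);
 *
 * which expands to uma_zfree_arg(foo_zone, item, NULL).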
*/ critical_exit(); ZONE_LOCK(zone); critical_enter(); cpu = curcpu; cache = &zone->uz_cpu[cpu]; if (cache->uc_freebucket != NULL) { if (cache->uc_freebucket->ub_cnt < cache->uc_freebucket->ub_entries) { ZONE_UNLOCK(zone); goto zfree_start; } if (cache->uc_allocbucket != NULL && (cache->uc_allocbucket->ub_cnt < cache->uc_freebucket->ub_cnt)) { ZONE_UNLOCK(zone); goto zfree_start; } } /* Since we have locked the zone we may as well send back our stats */ zone->uz_allocs += cache->uc_allocs; cache->uc_allocs = 0; zone->uz_frees += cache->uc_frees; cache->uc_frees = 0; bucket = cache->uc_freebucket; cache->uc_freebucket = NULL; /* Can we throw this on the zone full list? */ if (bucket != NULL) { #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Putting old bucket on the free list.\n"); #endif /* ub_cnt is pointing to the last free item */ KASSERT(bucket->ub_cnt != 0, ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); LIST_INSERT_HEAD(&zone->uz_full_bucket, bucket, ub_link); } if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) { LIST_REMOVE(bucket, ub_link); ZONE_UNLOCK(zone); cache->uc_freebucket = bucket; goto zfree_start; } /* We are no longer associated with this CPU. */ critical_exit(); /* And the zone.. */ ZONE_UNLOCK(zone); #ifdef UMA_DEBUG_ALLOC printf("uma_zfree: Allocating new free bucket.\n"); #endif bflags = M_NOWAIT; if (keg->uk_flags & UMA_ZFLAG_CACHEONLY) bflags |= M_NOVM; bucket = bucket_alloc(zone->uz_count, bflags); if (bucket) { ZONE_LOCK(zone); LIST_INSERT_HEAD(&zone->uz_free_bucket, bucket, ub_link); ZONE_UNLOCK(zone); goto zfree_restart; } /* * If nothing else caught this, we'll just do an internal free. */ zfree_internal: uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE); return; } /* * Frees an item to an INTERNAL zone or allocates a free bucket * * Arguments: * zone The zone to free to * item The item we're freeing * udata User supplied data for the dtor * skip Skip dtors and finis */ static void uma_zfree_internal(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip, int flags) { uma_slab_t slab; uma_slabrefcnt_t slabref; uma_keg_t keg; u_int8_t *mem; u_int8_t freei; keg = zone->uz_keg; if (skip < SKIP_DTOR && zone->uz_dtor) zone->uz_dtor(item, keg->uk_size, udata); if (skip < SKIP_FINI && zone->uz_fini) zone->uz_fini(item, keg->uk_size); ZONE_LOCK(zone); if (flags & ZFREE_STATFAIL) zone->uz_fails++; if (flags & ZFREE_STATFREE) zone->uz_frees++; if (!(keg->uk_flags & UMA_ZONE_MALLOC)) { mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK)); if (keg->uk_flags & UMA_ZONE_HASH) slab = hash_sfind(&keg->uk_hash, mem); else { mem += keg->uk_pgoff; slab = (uma_slab_t)mem; } } else { slab = (uma_slab_t)udata; } /* Do we need to remove from any lists? 
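 *
 *	(us_freecount + 1 == uk_ipers: this free returns the last
 *	 outstanding item, so the slab moves to uk_free_slab below;
 *	 us_freecount == 0: the slab was on uk_full_slab and moves
 *	 back to uk_part_slab.)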
*/ if (slab->us_freecount+1 == keg->uk_ipers) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); } else if (slab->us_freecount == 0) { LIST_REMOVE(slab, us_link); LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link); } /* Slab management stuff */ freei = ((unsigned long)item - (unsigned long)slab->us_data) / keg->uk_rsize; #ifdef INVARIANTS if (!skip) uma_dbg_free(zone, slab, item); #endif if (keg->uk_flags & UMA_ZONE_REFCNT) { slabref = (uma_slabrefcnt_t)slab; slabref->us_freelist[freei].us_item = slab->us_firstfree; } else { slab->us_freelist[freei].us_item = slab->us_firstfree; } slab->us_firstfree = freei; slab->us_freecount++; /* Zone statistics */ keg->uk_free++; if (keg->uk_flags & UMA_ZFLAG_FULL) { if (keg->uk_pages < keg->uk_maxpages) keg->uk_flags &= ~UMA_ZFLAG_FULL; /* * We can handle one more allocation. Since we're clearing ZFLAG_FULL, * wake up all procs blocked on pages. This should be uncommon, so * keeping this simple for now (rather than adding count of blocked * threads etc). */ wakeup(keg); } ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_max(uma_zone_t zone, int nitems) { uma_keg_t keg; keg = zone->uz_keg; ZONE_LOCK(zone); if (keg->uk_ppera > 1) keg->uk_maxpages = nitems * keg->uk_ppera; else keg->uk_maxpages = nitems / keg->uk_ipers; if (keg->uk_maxpages * keg->uk_ipers < nitems) keg->uk_maxpages++; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_init(uma_zone_t zone, uma_init uminit) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_init on non-empty keg")); zone->uz_keg->uk_init = uminit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_fini(uma_zone_t zone, uma_fini fini) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_fini on non-empty keg")); zone->uz_keg->uk_fini = fini; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zinit(uma_zone_t zone, uma_init zinit) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zinit on non-empty keg")); zone->uz_init = zinit; ZONE_UNLOCK(zone); } /* See uma.h */ void uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini) { ZONE_LOCK(zone); KASSERT(zone->uz_keg->uk_pages == 0, ("uma_zone_set_zfini on non-empty keg")); zone->uz_fini = zfini; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_freef is not actually used with the zone locked */ void uma_zone_set_freef(uma_zone_t zone, uma_free freef) { ZONE_LOCK(zone); zone->uz_keg->uk_freef = freef; ZONE_UNLOCK(zone); } /* See uma.h */ /* XXX uk_allocf is not actually used with the zone locked */ void uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf) { ZONE_LOCK(zone); zone->uz_keg->uk_flags |= UMA_ZFLAG_PRIVALLOC; zone->uz_keg->uk_allocf = allocf; ZONE_UNLOCK(zone); } /* See uma.h */ int uma_zone_set_obj(uma_zone_t zone, struct vm_object *obj, int count) { uma_keg_t keg; vm_offset_t kva; int pages; keg = zone->uz_keg; pages = count / keg->uk_ipers; if (pages * keg->uk_ipers < count) pages++; kva = kmem_alloc_nofault(kernel_map, pages * UMA_SLAB_SIZE); if (kva == 0) return (0); if (obj == NULL) { obj = vm_object_allocate(OBJT_DEFAULT, pages); } else { VM_OBJECT_LOCK_INIT(obj, "uma object"); _vm_object_allocate(OBJT_DEFAULT, pages, obj); } ZONE_LOCK(zone); keg->uk_kva = kva; keg->uk_obj = obj; keg->uk_maxpages = pages; keg->uk_allocf = obj_alloc; keg->uk_flags |= UMA_ZONE_NOFREE | UMA_ZFLAG_PRIVALLOC; ZONE_UNLOCK(zone); return (1); } /* See uma.h */ void uma_prealloc(uma_zone_t zone, int items) { int slabs; uma_slab_t slab; uma_keg_t keg; keg = zone->uz_keg; 
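	/*
	 * Round the request up to whole slabs below; each slab obtained
	 * from slab_zalloc() provides keg->uk_ipers items.
	 */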
ZONE_LOCK(zone); slabs = items / keg->uk_ipers; if (slabs * keg->uk_ipers < items) slabs++; while (slabs > 0) { slab = slab_zalloc(zone, M_WAITOK); LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link); slabs--; } ZONE_UNLOCK(zone); } /* See uma.h */ u_int32_t * uma_find_refcnt(uma_zone_t zone, void *item) { uma_slabrefcnt_t slabref; uma_keg_t keg; u_int32_t *refcnt; int idx; keg = zone->uz_keg; slabref = (uma_slabrefcnt_t)vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK)); KASSERT(slabref != NULL && slabref->us_keg->uk_flags & UMA_ZONE_REFCNT, ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT")); idx = ((unsigned long)item - (unsigned long)slabref->us_data) / keg->uk_rsize; refcnt = &slabref->us_freelist[idx].us_refcnt; return refcnt; } /* See uma.h */ void uma_reclaim(void) { #ifdef UMA_DEBUG printf("UMA: vm asked us to release pages!\n"); #endif bucket_enable(); zone_foreach(zone_drain); /* * Some slabs may have been freed but this zone will be visited early * we visit again so that we can free pages that are empty once other * zones are drained. We have to do the same for buckets. */ zone_drain(slabzone); zone_drain(slabrefzone); bucket_zone_drain(); } /* See uma.h */ int uma_zone_exhausted(uma_zone_t zone) { int full; ZONE_LOCK(zone); full = (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL); ZONE_UNLOCK(zone); return (full); } int uma_zone_exhausted_nolock(uma_zone_t zone) { return (zone->uz_keg->uk_flags & UMA_ZFLAG_FULL); } void * uma_large_malloc(int size, int wait) { void *mem; uma_slab_t slab; u_int8_t flags; slab = uma_zalloc_internal(slabzone, NULL, wait); if (slab == NULL) return (NULL); mem = page_alloc(NULL, size, &flags, wait); if (mem) { vsetslab((vm_offset_t)mem, slab); slab->us_data = mem; slab->us_flags = flags | UMA_SLAB_MALLOC; slab->us_size = size; } else { uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFAIL | ZFREE_STATFREE); } return (mem); } void uma_large_free(uma_slab_t slab) { vsetobj((vm_offset_t)slab->us_data, kmem_object); page_free(slab->us_data, slab->us_size, slab->us_flags); uma_zfree_internal(slabzone, slab, NULL, SKIP_NONE, ZFREE_STATFREE); } void uma_print_stats(void) { zone_foreach(uma_print_zone); } static void slab_print(uma_slab_t slab) { printf("slab: keg %p, data %p, freecount %d, firstfree %d\n", slab->us_keg, slab->us_data, slab->us_freecount, slab->us_firstfree); } static void cache_print(uma_cache_t cache) { printf("alloc: %p(%d), free: %p(%d)\n", cache->uc_allocbucket, cache->uc_allocbucket?cache->uc_allocbucket->ub_cnt:0, cache->uc_freebucket, cache->uc_freebucket?cache->uc_freebucket->ub_cnt:0); } void uma_print_zone(uma_zone_t zone) { uma_cache_t cache; uma_keg_t keg; uma_slab_t slab; int i; keg = zone->uz_keg; printf("%s(%p) size %d(%d) flags %d ipers %d ppera %d out %d free %d\n", zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags, keg->uk_ipers, keg->uk_ppera, (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free); printf("Part slabs:\n"); LIST_FOREACH(slab, &keg->uk_part_slab, us_link) slab_print(slab); printf("Free slabs:\n"); LIST_FOREACH(slab, &keg->uk_free_slab, us_link) slab_print(slab); printf("Full slabs:\n"); LIST_FOREACH(slab, &keg->uk_full_slab, us_link) slab_print(slab); for (i = 0; i <= mp_maxid; i++) { if (CPU_ABSENT(i)) continue; cache = &zone->uz_cpu[i]; printf("CPU %d Cache:\n", i); cache_print(cache); } } #ifdef DDB /* * Generate statistics across both the zone and its per-cpu cache's. Return * desired statistics if the pointer is non-NULL for that statistic. 
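 *
 *	(Used by the DB_SHOW_COMMAND(uma) handler at the end of this
 *	 file, i.e. the ddb "show uma" command.)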
* * Note: does not update the zone statistics, as it can't safely clear the * per-CPU cache statistic. * * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't * safe from off-CPU; we should modify the caches to track this information * directly so that we don't have to. */ static void uma_zone_sumstat(uma_zone_t z, int *cachefreep, u_int64_t *allocsp, u_int64_t *freesp) { uma_cache_t cache; u_int64_t allocs, frees; int cachefree, cpu; allocs = frees = 0; cachefree = 0; for (cpu = 0; cpu <= mp_maxid; cpu++) { if (CPU_ABSENT(cpu)) continue; cache = &z->uz_cpu[cpu]; if (cache->uc_allocbucket != NULL) cachefree += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) cachefree += cache->uc_freebucket->ub_cnt; allocs += cache->uc_allocs; frees += cache->uc_frees; } allocs += z->uz_allocs; frees += z->uz_frees; if (cachefreep != NULL) *cachefreep = cachefree; if (allocsp != NULL) *allocsp = allocs; if (freesp != NULL) *freesp = frees; } #endif /* DDB */ static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS) { uma_keg_t kz; uma_zone_t z; int count; count = 0; mtx_lock(&uma_mtx); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } mtx_unlock(&uma_mtx); return (sysctl_handle_int(oidp, &count, 0, req)); } static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS) { struct uma_stream_header ush; struct uma_type_header uth; struct uma_percpu_stat ups; uma_bucket_t bucket; struct sbuf sbuf; uma_cache_t cache; uma_keg_t kz; uma_zone_t z; char *buffer; int buflen, count, error, i; mtx_lock(&uma_mtx); restart: mtx_assert(&uma_mtx, MA_OWNED); count = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) count++; } mtx_unlock(&uma_mtx); buflen = sizeof(ush) + count * (sizeof(uth) + sizeof(ups) * (mp_maxid + 1)) + 1; buffer = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO); mtx_lock(&uma_mtx); i = 0; LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) i++; } if (i > count) { free(buffer, M_TEMP); goto restart; } count = i; sbuf_new(&sbuf, buffer, buflen, SBUF_FIXEDLEN); /* * Insert stream header. */ bzero(&ush, sizeof(ush)); ush.ush_version = UMA_STREAM_VERSION; ush.ush_maxcpus = (mp_maxid + 1); ush.ush_count = count; if (sbuf_bcat(&sbuf, &ush, sizeof(ush)) < 0) { mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { bzero(&uth, sizeof(uth)); ZONE_LOCK(z); strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME); uth.uth_align = kz->uk_align; uth.uth_pages = kz->uk_pages; uth.uth_keg_free = kz->uk_free; uth.uth_size = kz->uk_size; uth.uth_rsize = kz->uk_rsize; uth.uth_maxpages = kz->uk_maxpages; if (kz->uk_ppera > 1) uth.uth_limit = kz->uk_maxpages / kz->uk_ppera; else uth.uth_limit = kz->uk_maxpages * kz->uk_ipers; /* * A zone is secondary is it is not the first entry * on the keg's zone list. 
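/*
 * [Editor's sketch -- not part of this revision.]  sysctl_vm_zone_stats()
 * above sizes its export buffer as one stream header plus, for every zone,
 * one type header followed by (mp_maxid + 1) per-CPU records, with one spare
 * byte.  A standalone model of that layout arithmetic:
 */
static unsigned long
zone_stats_buflen(unsigned long hdr, unsigned long type_hdr,
    unsigned long pcpu_rec, int nzones, int maxcpus)
{
        return (hdr + (unsigned long)nzones *
            (type_hdr + pcpu_rec * (unsigned long)maxcpus) + 1);
}
/*
 * A consumer walks the stream in the same order it is emitted: the header,
 * then for each of ush_count zones a uma_type_header followed by
 * ush_maxcpus uma_percpu_stat records.
 */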
*/ if ((kz->uk_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z)) uth.uth_zone_flags = UTH_ZONE_SECONDARY; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) uth.uth_zone_free += bucket->ub_cnt; uth.uth_allocs = z->uz_allocs; uth.uth_frees = z->uz_frees; uth.uth_fails = z->uz_fails; if (sbuf_bcat(&sbuf, &uth, sizeof(uth)) < 0) { ZONE_UNLOCK(z); mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } /* * While it is not normally safe to access the cache * bucket pointers while not on the CPU that owns the * cache, we only allow the pointers to be exchanged * without the zone lock held, not invalidated, so * accept the possible race associated with bucket * exchange during monitoring. */ for (i = 0; i < (mp_maxid + 1); i++) { bzero(&ups, sizeof(ups)); if (kz->uk_flags & UMA_ZFLAG_INTERNAL) goto skip; if (CPU_ABSENT(i)) goto skip; cache = &z->uz_cpu[i]; if (cache->uc_allocbucket != NULL) ups.ups_cache_free += cache->uc_allocbucket->ub_cnt; if (cache->uc_freebucket != NULL) ups.ups_cache_free += cache->uc_freebucket->ub_cnt; ups.ups_allocs = cache->uc_allocs; ups.ups_frees = cache->uc_frees; skip: if (sbuf_bcat(&sbuf, &ups, sizeof(ups)) < 0) { ZONE_UNLOCK(z); mtx_unlock(&uma_mtx); error = ENOMEM; goto out; } } ZONE_UNLOCK(z); } } mtx_unlock(&uma_mtx); sbuf_finish(&sbuf); error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf)); out: free(buffer, M_TEMP); return (error); } #ifdef DDB DB_SHOW_COMMAND(uma, db_show_uma) { u_int64_t allocs, frees; uma_bucket_t bucket; uma_keg_t kz; uma_zone_t z; int cachefree; db_printf("%18s %8s %8s %8s %12s\n", "Zone", "Size", "Used", "Free", "Requests"); LIST_FOREACH(kz, &uma_kegs, uk_link) { LIST_FOREACH(z, &kz->uk_zones, uz_link) { if (kz->uk_flags & UMA_ZFLAG_INTERNAL) { allocs = z->uz_allocs; frees = z->uz_frees; cachefree = 0; } else uma_zone_sumstat(z, &cachefree, &allocs, &frees); if (!((kz->uk_flags & UMA_ZONE_SECONDARY) && (LIST_FIRST(&kz->uk_zones) != z))) cachefree += kz->uk_free; LIST_FOREACH(bucket, &z->uz_full_bucket, ub_link) cachefree += bucket->ub_cnt; db_printf("%18s %8ju %8jd %8d %12ju\n", z->uz_name, (uintmax_t)kz->uk_size, (intmax_t)(allocs - frees), cachefree, (uintmax_t)allocs); } } } #endif Index: head/sys/vm/vm_contig.c =================================================================== --- head/sys/vm/vm_contig.c (revision 169666) +++ head/sys/vm/vm_contig.c (revision 169667) @@ -1,619 +1,619 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vm_contig_launder_page(vm_page_t m) { vm_object_t object; vm_page_t m_tmp; struct vnode *vp; struct mount *mp; object = m->object; if (!VM_OBJECT_TRYLOCK(object)) return (EAGAIN); if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) { VM_OBJECT_UNLOCK(object); vm_page_lock_queues(); return (EBUSY); } vm_page_test_dirty(m); if (m->dirty == 0 && m->hold_count == 0) pmap_remove_all(m); if (m->dirty) { if ((object->flags & OBJ_DEAD) != 0) { VM_OBJECT_UNLOCK(object); return (EAGAIN); } if (object->type == OBJT_VNODE) { vm_page_unlock_queues(); vp = object->handle; vm_object_reference_locked(object); VM_OBJECT_UNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); VM_OBJECT_LOCK(object); vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, curthread); vm_object_deallocate(object); vn_finished_write(mp); vm_page_lock_queues(); return (0); } else if (object->type == OBJT_SWAP || object->type == OBJT_DEFAULT) { m_tmp = m; vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC); VM_OBJECT_UNLOCK(object); return (0); } } else if (m->hold_count == 0) vm_page_cache(m); VM_OBJECT_UNLOCK(object); return (0); } static int vm_contig_launder(int queue) { vm_page_t m, next; int error; for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) { next = TAILQ_NEXT(m, pageq); /* Skip marker pages */ if ((m->flags & PG_MARKER) != 0) continue; KASSERT(VM_PAGE_INQUEUE2(m, queue), ("vm_contig_launder: page %p's queue is not %d", m, queue)); error = vm_contig_launder_page(m); if (error == 0) return (TRUE); if (error == EBUSY) return (FALSE); } return (FALSE); } /* * This interface is for merging with malloc() someday. 
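/*
 * [Editor's sketch -- not part of this revision.]  contigmalloc1() below
 * requires alignment and boundary to be powers of two, and it rejects any
 * candidate run whose physical span would cross a boundary multiple.
 * Standalone forms of those two bit tests:
 */
static int
is_power_of_two_or_zero(unsigned long x)
{
        /* The kernel check also accepts 0, which means "no constraint". */
        return ((x & (x - 1)) == 0);
}

static int
crosses_boundary(unsigned long phys, unsigned long size,
    unsigned long boundary)
{
        /*
         * The first and last byte of the run lie in the same boundary-sized
         * window iff their addresses agree in every bit above the boundary
         * mask; boundary == 0 makes the mask empty, so nothing ever crosses.
         */
        return (((phys ^ (phys + size - 1)) & ~(boundary - 1)) != 0);
}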
* Even if we never implement compaction so that contiguous allocation * works after initialization time, malloc()'s data structures are good * for statistics and for allocations of less than a page. */ static void * contigmalloc1( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary, vm_map_t map) { int i, start; vm_paddr_t phys; vm_object_t object; vm_offset_t addr, tmp_addr; int pass, pqtype; int inactl, actl, inactmax, actmax; vm_page_t pga = vm_page_array; size = round_page(size); if (size == 0) panic("contigmalloc1: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("contigmalloc1: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("contigmalloc1: boundary must be a power of 2"); start = 0; for (pass = 2; pass >= 0; pass--) { vm_page_lock_queues(); again0: mtx_lock(&vm_page_queue_free_mtx); again: /* * Find first page in array that is free, within range, * aligned, and such that the boundary won't be crossed. */ - for (i = start; i < cnt.v_page_count; i++) { + for (i = start; i < VMCNT_GET(page_count); i++) { phys = VM_PAGE_TO_PHYS(&pga[i]); pqtype = pga[i].queue - pga[i].pc; if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ - if ((i == cnt.v_page_count) || + if ((i == VMCNT_GET(page_count)) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { mtx_unlock(&vm_page_queue_free_mtx); /* * Instead of racing to empty the inactive/active * queues, give up, even with more left to free, * if we try more than the initial amount of pages. * * There's no point attempting this on the last pass. */ if (pass > 0) { inactl = actl = 0; inactmax = vm_page_queues[PQ_INACTIVE].lcnt; actmax = vm_page_queues[PQ_ACTIVE].lcnt; again1: if (inactl < inactmax && vm_contig_launder(PQ_INACTIVE)) { inactl++; goto again1; } if (actl < actmax && vm_contig_launder(PQ_ACTIVE)) { actl++; goto again1; } } vm_page_unlock_queues(); continue; } start = i; /* * Check successive pages for contiguous and free. */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { pqtype = pga[i].queue - pga[i].pc; if ((VM_PAGE_TO_PHYS(&pga[i]) != (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) { start++; goto again; } } mtx_unlock(&vm_page_queue_free_mtx); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) { if (m->hold_count != 0) { start++; goto again0; } object = m->object; if (!VM_OBJECT_TRYLOCK(object)) { start++; goto again0; } if ((m->oflags & VPO_BUSY) || m->busy != 0) { VM_OBJECT_UNLOCK(object); start++; goto again0; } vm_page_free(m); VM_OBJECT_UNLOCK(object); } } mtx_lock(&vm_page_queue_free_mtx); for (i = start; i < (start + size / PAGE_SIZE); i++) { pqtype = pga[i].queue - pga[i].pc; if (pqtype != PQ_FREE) { start++; goto again; } } for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_pageq_remove_nowakeup(m); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; /* Don't clear the PG_ZERO flag, we'll need it later. 
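/*
 * [Editor's sketch -- not part of this revision.]  The hunks just above, and
 * others later in this revision, replace direct reads of the global vmmeter
 * (cnt.v_page_count, cnt.v_free_count, cnt.v_cache_count, cnt.v_wire_count,
 * cnt.v_free_reserved) with a VMCNT_GET() accessor.  The real macro lives in
 * <sys/vmmeter.h> and is not shown in this diff.  The stand-in below is a
 * hypothetical illustration of the indirection only: call sites name the
 * counter, and the macro decides how it is read (here a plain field access;
 * the real macro may expand to an atomic or per-CPU read instead).
 */
struct vmmeter_model {
        unsigned int v_page_count;
        unsigned int v_free_count;
};
static struct vmmeter_model cnt_model;
#define VMCNT_GET_MODEL(member) (cnt_model.v_ ## member)
/* usage (model only): if (i == VMCNT_GET_MODEL(page_count)) ... */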
*/ m->flags = PG_UNMANAGED | (m->flags & PG_ZERO); KASSERT(m->dirty == 0, ("contigmalloc1: page %p was dirty", m)); m->wire_count = 0; m->busy = 0; } mtx_unlock(&vm_page_queue_free_mtx); vm_page_unlock_queues(); /* * We've found a contiguous chunk that meets are requirements. * Allocate kernel VM, unfree and assign the physical pages to * it and return kernel VM pointer. */ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), size, &addr) != KERN_SUCCESS) { /* * XXX We almost never run out of kernel virtual * space, so we don't make the allocated memory * above available. */ vm_map_unlock(map); return (NULL); } vm_object_reference(kernel_object); vm_map_insert(map, kernel_object, addr - VM_MIN_KERNEL_ADDRESS, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); tmp_addr = addr; VM_OBJECT_LOCK(kernel_object); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); if ((flags & M_ZERO) && !(m->flags & PG_ZERO)) pmap_zero_page(m); tmp_addr += PAGE_SIZE; } VM_OBJECT_UNLOCK(kernel_object); vm_map_wire(map, addr, addr + size, VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES); return ((void *)addr); } return (NULL); } static void vm_page_release_contigl(vm_page_t m, vm_pindex_t count) { while (count--) { vm_page_free_toq(m); m++; } } static void vm_page_release_contig(vm_page_t m, vm_pindex_t count) { vm_page_lock_queues(); vm_page_release_contigl(m, count); vm_page_unlock_queues(); } static int vm_contig_unqueue_free(vm_page_t m) { int error = 0; mtx_lock(&vm_page_queue_free_mtx); if ((m->queue - m->pc) == PQ_FREE) vm_pageq_remove_nowakeup(m); else error = EAGAIN; mtx_unlock(&vm_page_queue_free_mtx); if (error) return (error); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) vm_page_zero_count--; /* Don't clear the PG_ZERO flag; we'll need it later. */ m->flags = PG_UNMANAGED | (m->flags & PG_ZERO); m->oflags = 0; KASSERT(m->dirty == 0, ("contigmalloc2: page %p was dirty", m)); m->wire_count = 0; m->busy = 0; return (error); } vm_page_t vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high, vm_offset_t alignment, vm_offset_t boundary) { vm_object_t object; vm_offset_t size; vm_paddr_t phys; vm_page_t pga = vm_page_array; static vm_pindex_t np = 0; static vm_pindex_t start = 0; vm_pindex_t startl = 0; int i, pass, pqtype; size = npages << PAGE_SHIFT; if (size == 0) panic("vm_page_alloc_contig: size must not be 0"); if ((alignment & (alignment - 1)) != 0) panic("vm_page_alloc_contig: alignment must be a power of 2"); if ((boundary & (boundary - 1)) != 0) panic("vm_page_alloc_contig: boundary must be a power of 2"); /* * Two simple optimizations. First, don't scan high ordered pages * if they are outside of the requested address range. Second, cache * the starting page index across calls and reuse it instead of * restarting the scan from the top. This is conditional on the * requested number of pages being the same or greater than the * cached amount. */ for (pass = 0; pass < 2; pass++) { vm_page_lock_queues(); if ((np == 0) || (np > npages)) { if (atop(high) < vm_page_array_size) start = atop(high) - npages + 1; else start = vm_page_array_size - npages + 1; } np = 0; retry: start--; /* * Find last page in array that is free, within range, * aligned, and such that the boundary won't be crossed. 
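/*
 * [Editor's sketch -- not part of this revision.]  contigmalloc1() above and
 * vm_page_alloc_contig() below both confirm a candidate run by checking that
 * each page's physical address is exactly one page past the previous one.
 * A standalone model over an array of physical addresses (PAGE_SIZE_M stands
 * in for PAGE_SIZE):
 */
#define PAGE_SIZE_M     4096UL

static int
run_is_contiguous(const unsigned long *phys, int start, int npages)
{
        int i;

        for (i = start + 1; i < start + npages; i++)
                if (phys[i] != phys[i - 1] + PAGE_SIZE_M)
                        return (0);
        return (1);
}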
*/ for (i = start; i >= 0; i--) { phys = VM_PAGE_TO_PHYS(&pga[i]); pqtype = pga[i].queue - pga[i].pc; if (pass == 0) { if (pqtype != PQ_FREE && pqtype != PQ_CACHE) continue; } else if (pqtype != PQ_FREE && pqtype != PQ_CACHE && pga[i].queue != PQ_ACTIVE && pga[i].queue != PQ_INACTIVE) continue; if (phys >= low && phys + size <= high && ((phys & (alignment - 1)) == 0) && ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) break; } /* There are no candidates at all. */ if (i < 0) { vm_page_unlock_queues(); continue; } start = i; /* * Check successive pages for contiguous and free. */ for (i = start + npages - 1; i > start; i--) { pqtype = pga[i].queue - pga[i].pc; if (VM_PAGE_TO_PHYS(&pga[i]) != VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE) { start = i - npages + 1; goto retry; } if (pass == 0) { if (pqtype != PQ_FREE && pqtype != PQ_CACHE) { start = i - npages + 1; goto retry; } } else if (pqtype != PQ_FREE && pqtype != PQ_CACHE && pga[i].queue != PQ_ACTIVE && pga[i].queue != PQ_INACTIVE) { start = i - npages + 1; goto retry; } } for (i = start + npages - 1; i >= start; i--) { vm_page_t m = &pga[i]; retry_page: pqtype = m->queue - m->pc; if (pass != 0 && pqtype != PQ_FREE && pqtype != PQ_CACHE) { if (m->queue == PQ_ACTIVE || m->queue == PQ_INACTIVE) { if (vm_contig_launder_page(m) != 0) goto cleanup_freed; pqtype = m->queue - m->pc; if (pqtype != PQ_FREE && pqtype != PQ_CACHE) goto cleanup_freed; } else { cleanup_freed: vm_page_release_contigl(&pga[i + 1], start + npages - 1 - i); start = i - npages + 1; goto retry; } } if (pqtype == PQ_CACHE) { if (m->hold_count != 0) goto cleanup_freed; object = m->object; if (!VM_OBJECT_TRYLOCK(object)) goto cleanup_freed; if ((m->oflags & VPO_BUSY) || m->busy != 0) { VM_OBJECT_UNLOCK(object); goto cleanup_freed; } vm_page_free(m); VM_OBJECT_UNLOCK(object); } /* * There is no good API for freeing a page * directly to PQ_NONE on our behalf, so spin. */ if (vm_contig_unqueue_free(m) != 0) goto retry_page; } /* * We've found a contiguous chunk that meets are requirements. */ np = npages; startl = start; vm_page_unlock_queues(); return (&pga[startl]); } return (NULL); } static void * contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags) { vm_object_t object = kernel_object; vm_map_t map = kernel_map; vm_offset_t addr, tmp_addr; vm_pindex_t i; /* * Allocate kernel VM, unfree and assign the physical pages to * it and return kernel VM pointer. 
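/*
 * [Editor's sketch -- not part of this revision.]  vm_page_alloc_contig()
 * above scans the page array from high indices toward low ones; when the
 * page at index i fails a check, the window restarts at start = i - npages
 * + 1 so that the offending page falls just outside the next candidate
 * window.  A standalone model of that bookkeeping over an array of
 * availability flags:
 */
static int
find_free_run_backward(const char *avail, int arraysize, int npages)
{
        int i, start;

        for (start = arraysize - npages; start >= 0; start--) {
                for (i = start + npages - 1; i >= start; i--)
                        if (!avail[i])
                                break;
                if (i < start)
                        return (start);         /* whole window is free */
                start = i - npages + 1;         /* skip past page i... */
                /* ...the loop's start-- then resumes at i - npages */
        }
        return (-1);
}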
*/ vm_map_lock(map); if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr) != KERN_SUCCESS) { vm_map_unlock(map); return (NULL); } vm_object_reference(object); vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS, addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); tmp_addr = addr; VM_OBJECT_LOCK(object); for (i = 0; i < npages; i++) { vm_page_insert(&m[i], object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); if ((flags & M_ZERO) && !(m[i].flags & PG_ZERO)) pmap_zero_page(&m[i]); tmp_addr += PAGE_SIZE; } VM_OBJECT_UNLOCK(object); vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); return ((void *)addr); } static int vm_old_contigmalloc = 0; SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc, CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm"); TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc); void * contigmalloc( unsigned long size, /* should be size_t here and for malloc() */ struct malloc_type *type, int flags, vm_paddr_t low, vm_paddr_t high, unsigned long alignment, unsigned long boundary) { void * ret; vm_page_t pages; vm_pindex_t npgs; npgs = round_page(size) >> PAGE_SHIFT; mtx_lock(&Giant); if (vm_old_contigmalloc) { ret = contigmalloc1(size, type, flags, low, high, alignment, boundary, kernel_map); } else { pages = vm_page_alloc_contig(npgs, low, high, alignment, boundary); if (pages == NULL) { ret = NULL; } else { ret = contigmalloc2(pages, npgs, flags); if (ret == NULL) vm_page_release_contig(pages, npgs); } } mtx_unlock(&Giant); malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT); return (ret); } void contigfree(void *addr, unsigned long size, struct malloc_type *type) { vm_pindex_t npgs; npgs = round_page(size) >> PAGE_SHIFT; kmem_free(kernel_map, (vm_offset_t)addr, size); malloc_type_freed(type, npgs << PAGE_SHIFT); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 169666) +++ head/sys/vm/vm_fault.c (revision 169667) @@ -1,1345 +1,1346 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX Temporary for VFS_LOCK_GIANT() */ #define PFBAK 4 #define PFFOR 4 #define PAGEORDER_SIZE (PFBAK+PFFOR) static int prefault_pageorder[] = { -1 * PAGE_SIZE, 1 * PAGE_SIZE, -2 * PAGE_SIZE, 2 * PAGE_SIZE, -3 * PAGE_SIZE, 3 * PAGE_SIZE, -4 * PAGE_SIZE, 4 * PAGE_SIZE }; static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *); static void vm_fault_prefault(pmap_t, vm_offset_t, vm_map_entry_t); #define VM_FAULT_READ_AHEAD 8 #define VM_FAULT_READ_BEHIND 7 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) struct faultstate { vm_page_t m; vm_object_t object; vm_pindex_t pindex; vm_page_t first_m; vm_object_t first_object; vm_pindex_t first_pindex; vm_map_t map; vm_map_entry_t entry; int lookup_still_valid; struct vnode *vp; }; static inline void release_page(struct faultstate *fs) { vm_page_wakeup(fs->m); vm_page_lock_queues(); vm_page_deactivate(fs->m); vm_page_unlock_queues(); fs->m = NULL; } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = FALSE; } } static void unlock_and_deallocate(struct faultstate *fs) { vm_object_pip_wakeup(fs->object); VM_OBJECT_UNLOCK(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_LOCK(fs->first_object); vm_page_lock_queues(); vm_page_free(fs->first_m); vm_page_unlock_queues(); vm_object_pip_wakeup(fs->first_object); VM_OBJECT_UNLOCK(fs->first_object); fs->first_m = NULL; } vm_object_deallocate(fs->first_object); unlock_map(fs); if (fs->vp != NULL) { int vfslocked; vfslocked = VFS_LOCK_GIANT(fs->vp->v_mount); vput(fs->vp); fs->vp = NULL; VFS_UNLOCK_GIANT(vfslocked); } } /* * TRYPAGER - used by vm_fault to calculate whether the pager for the * current object *might* contain the page. * * default objects are zero-fill, there is no real pager. */ #define TRYPAGER (fs.object->type != OBJT_DEFAULT && \ (((fault_flags & VM_FAULT_WIRE_MASK) == 0) || wired)) /* * vm_fault: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags) { vm_prot_t prot; int is_first_object_locked, result; boolean_t growstack, wired; int map_generation; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ]; int hardfault; int faultcount; struct faultstate fs; hardfault = 0; growstack = TRUE; PCPU_LAZY_INC(cnt.v_vm_faults); RetryFault:; /* * Find the backing store object and offset into it to begin the * search. 
*/ fs.map = map; result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) { if (result != KERN_PROTECTION_FAILURE || (fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE) { if (growstack && result == KERN_INVALID_ADDRESS && map != kernel_map && curproc != NULL) { result = vm_map_growstack(curproc, vaddr); if (result != KERN_SUCCESS) return (KERN_FAILURE); growstack = FALSE; goto RetryFault; } return (result); } /* * If we are user-wiring a r/w segment, and it is COW, then * we need to do the COW operation. Note that we don't COW * currently RO sections now, because it is NOT desirable * to COW .text. We simply keep .text from ever being COW'ed * and take the heat that one cannot debug wired .text sections. */ result = vm_map_lookup(&fs.map, vaddr, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE, &fs.entry, &fs.first_object, &fs.first_pindex, &prot, &wired); if (result != KERN_SUCCESS) return (result); /* * If we don't COW now, on a user wire, the user will never * be able to write to the mapping. If we don't make this * restriction, the bookkeeping would be nearly impossible. * * XXX The following assignment modifies the map without * holding a write lock on it. */ if ((fs.entry->protection & VM_PROT_WRITE) == 0) fs.entry->max_protection &= ~VM_PROT_WRITE; } map_generation = fs.map->timestamp; if (fs.entry->eflags & MAP_ENTRY_NOFAULT) { panic("vm_fault: fault on nofault entry, addr: %lx", (u_long)vaddr); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. This must be done after * obtaining the vnode lock in order to avoid possible deadlocks. * * XXX vnode_pager_lock() can block without releasing the map lock. */ if (fs.first_object->flags & OBJ_NEEDGIANT) mtx_lock(&Giant); VM_OBJECT_LOCK(fs.first_object); vm_object_reference_locked(fs.first_object); fs.vp = vnode_pager_lock(fs.first_object); KASSERT(fs.vp == NULL || !fs.map->system_map, ("vm_fault: vnode-backed object mapped by system map")); KASSERT((fs.first_object->flags & OBJ_NEEDGIANT) == 0 || !fs.map->system_map, ("vm_fault: Object requiring giant mapped by system map")); if (fs.first_object->flags & OBJ_NEEDGIANT) mtx_unlock(&Giant); vm_object_pip_add(fs.first_object, 1); fs.lookup_still_valid = TRUE; if (wired) fault_type = prot; fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { /* * If the object is dead, we stop here */ if (fs.object->flags & OBJ_DEAD) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { int queue; /* * check for page-based copy on write. * We check fs.object == fs.first_object so * as to ensure the legacy COW mechanism is * used when the page in question is part of * a shadow object. Otherwise, vm_page_cowfault() * removes the page from the backing object, * which is not what we want. */ vm_page_lock_queues(); if ((fs.m->cow) && (fault_type & VM_PROT_WRITE) && (fs.object == fs.first_object)) { vm_page_cowfault(fs.m); vm_page_unlock_queues(); unlock_and_deallocate(&fs); goto RetryFault; } /* * Wait/Retry if the page is busy. 
We have to do this * if the page is busy via either VPO_BUSY or * vm_page_t->busy because the vm_pager may be using * vm_page_t->busy for pageouts ( and even pageins if * it is the vnode pager ), and we could end up trying * to pagein and pageout the same page simultaneously. * * We can theoretically allow the busy case on a read * fault if the page is marked valid, but since such * pages are typically already pmap'd, putting that * special case in might be more effort then it is * worth. We cannot under any circumstances mess * around with a vm_page_t->busy page except, perhaps, * to pmap it. */ if ((fs.m->oflags & VPO_BUSY) || fs.m->busy) { vm_page_unlock_queues(); VM_OBJECT_UNLOCK(fs.object); if (fs.object != fs.first_object) { VM_OBJECT_LOCK(fs.first_object); vm_page_lock_queues(); vm_page_free(fs.first_m); vm_page_unlock_queues(); vm_object_pip_wakeup(fs.first_object); VM_OBJECT_UNLOCK(fs.first_object); fs.first_m = NULL; } unlock_map(&fs); if (fs.vp != NULL) { int vfslck; vfslck = VFS_LOCK_GIANT(fs.vp->v_mount); vput(fs.vp); fs.vp = NULL; VFS_UNLOCK_GIANT(vfslck); } VM_OBJECT_LOCK(fs.object); if (fs.m == vm_page_lookup(fs.object, fs.pindex)) { vm_page_sleep_if_busy(fs.m, TRUE, "vmpfw"); } vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); PCPU_LAZY_INC(cnt.v_intrans); vm_object_deallocate(fs.first_object); goto RetryFault; } queue = fs.m->queue; vm_pageq_remove_nowakeup(fs.m); if (VM_PAGE_RESOLVEQUEUE(fs.m, queue) == PQ_CACHE && vm_page_count_severe()) { vm_page_activate(fs.m); vm_page_unlock_queues(); unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } vm_page_unlock_queues(); /* * Mark page busy for other processes, and the * pagedaemon. If it still isn't completely valid * (readable), jump to readrest, else break-out ( we * found the page ). */ vm_page_busy(fs.m); if (((fs.m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && fs.m->object != kernel_object && fs.m->object != kmem_object) { goto readrest; } break; } /* * Page is not resident, If this is the search termination * or the pager might contain the page, allocate a new page. */ if (TRYPAGER || fs.object == fs.first_object) { if (fs.pindex >= fs.object->size) { unlock_and_deallocate(&fs); return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. */ fs.m = NULL; if (!vm_page_count_severe()) { fs.m = vm_page_alloc(fs.object, fs.pindex, (fs.vp || fs.object->backing_object)? VM_ALLOC_NORMAL: VM_ALLOC_ZERO); } if (fs.m == NULL) { unlock_and_deallocate(&fs); VM_WAITPFAULT; goto RetryFault; } } readrest: /* * We have found a valid page or we have allocated a new page. * The page thus may not be valid or may not be entirely * valid. * * Attempt to fault-in the page if there is a chance that the * pager has it, and potentially fault in additional pages * at the same time. 
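/*
 * [Editor's sketch -- not part of this revision.]  The cluster sizing just
 * below decides how many pages to read behind and ahead of the faulting
 * address: the distance (in pages) back to the start of the map entry,
 * clamped to VM_FAULT_READ_BEHIND, and the distance to its end minus one,
 * clamped to VM_FAULT_READ_AHEAD; MAP_ENTRY_BEHAV_RANDOM disables both.  A
 * standalone model (4096 stands in for PAGE_SIZE; the clamp limits are the
 * constants defined above):
 */
static void
fault_read_window(unsigned long vaddr, unsigned long entry_start,
    unsigned long entry_end, int random_behavior, int *behindp, int *aheadp)
{
        int ahead, behind;

        if (random_behavior) {
                *behindp = *aheadp = 0;
                return;
        }
        behind = (int)((vaddr - entry_start) / 4096);
        if (behind > 7)                 /* VM_FAULT_READ_BEHIND */
                behind = 7;
        ahead = (int)((entry_end - vaddr) / 4096) - 1;
        if (ahead > 8)                  /* VM_FAULT_READ_AHEAD */
                ahead = 8;
        *behindp = behind;
        *aheadp = ahead;
}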
*/ if (TRYPAGER) { int rv; int reqpage; int ahead, behind; u_char behavior = vm_map_entry_behavior(fs.entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { ahead = 0; behind = 0; } else { behind = (vaddr - fs.entry->start) >> PAGE_SHIFT; if (behind > VM_FAULT_READ_BEHIND) behind = VM_FAULT_READ_BEHIND; ahead = ((fs.entry->end - vaddr) >> PAGE_SHIFT) - 1; if (ahead > VM_FAULT_READ_AHEAD) ahead = VM_FAULT_READ_AHEAD; } is_first_object_locked = FALSE; if ((behavior == MAP_ENTRY_BEHAV_SEQUENTIAL || (behavior != MAP_ENTRY_BEHAV_RANDOM && fs.pindex >= fs.entry->lastr && fs.pindex < fs.entry->lastr + VM_FAULT_READ)) && (fs.first_object == fs.object || (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object))) && fs.first_object->type != OBJT_DEVICE) { vm_pindex_t firstpindex, tmppindex; if (fs.first_pindex < 2 * VM_FAULT_READ) firstpindex = 0; else firstpindex = fs.first_pindex - 2 * VM_FAULT_READ; vm_page_lock_queues(); /* * note: partially valid pages cannot be * included in the lookahead - NFS piecemeal * writes will barf on it badly. */ for (tmppindex = fs.first_pindex - 1; tmppindex >= firstpindex; --tmppindex) { vm_page_t mt; mt = vm_page_lookup(fs.first_object, tmppindex); if (mt == NULL || (mt->valid != VM_PAGE_BITS_ALL)) break; if (mt->busy || (mt->oflags & VPO_BUSY) || (mt->flags & (PG_FICTITIOUS | PG_UNMANAGED)) || mt->hold_count || mt->wire_count) continue; pmap_remove_all(mt); if (mt->dirty) { vm_page_deactivate(mt); } else { vm_page_cache(mt); } } vm_page_unlock_queues(); ahead += behind; behind = 0; } if (is_first_object_locked) VM_OBJECT_UNLOCK(fs.first_object); /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. * * fs.m plus the additional pages are VPO_BUSY'd. * * XXX vm_fault_additional_pages() can block * without releasing the map lock. */ faultcount = vm_fault_additional_pages( fs.m, behind, ahead, marray, &reqpage); /* * update lastr imperfectly (we do not know how much * getpages will actually read), but good enough. * * XXX The following assignment modifies the map * without holding a write lock on it. */ fs.entry->lastr = fs.pindex + faultcount - behind; /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. We hold a ref on * fs.object and the pages are VPO_BUSY'd. */ unlock_map(&fs); rv = faultcount ? vm_pager_get_pages(fs.object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (!fs.m) { unlock_and_deallocate(&fs); goto RetryFault; } hardfault++; break; /* break to PAGE HAS BEEN FOUND */ } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. 
* * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { vm_page_lock_queues(); vm_page_free(fs.m); vm_page_unlock_queues(); fs.m = NULL; unlock_and_deallocate(&fs); return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (fs.object != fs.first_object) { vm_page_lock_queues(); vm_page_free(fs.m); vm_page_unlock_queues(); fs.m = NULL; /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! */ } } /* * We get here if the object has default pager (or unwiring) * or the pager doesn't have the page. */ if (fs.object == fs.first_object) fs.first_m = fs.m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset); next_object = fs.object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs.object != fs.first_object) { vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; VM_OBJECT_LOCK(fs.object); } fs.first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs.m->flags & PG_ZERO) == 0) { pmap_zero_page(fs.m); } else { PCPU_LAZY_INC(cnt.v_ozfod); } PCPU_LAZY_INC(cnt.v_zfod); fs.m->valid = VM_PAGE_BITS_ALL; break; /* break to PAGE HAS BEEN FOUND */ } else { KASSERT(fs.object != next_object, ("object loop %p", next_object)); VM_OBJECT_LOCK(next_object); vm_object_pip_add(next_object, 1); if (fs.object != fs.first_object) vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); fs.object = next_object; } } KASSERT((fs.m->oflags & VPO_BUSY) != 0, ("vm_fault: not busy after main loop")); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if (fault_type & VM_PROT_WRITE) { /* * This allows pages to be virtually copied from a * backing_object into the first_object, where the * backing object has no other refs to it, and cannot * gain any more refs. Instead of a bcopy, we just * move the page from the backing object to the * first object. Note that we must mark the page * dirty in the first object so that it will go out * to swap when needed. 
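/*
 * [Editor's sketch -- not part of this revision.]  The block just below
 * avoids a physical copy on a write fault: when the backing object can never
 * be reached by anyone else, the page is simply renamed into the top-level
 * object.  The conditions tested there are mirrored by this simplified
 * stand-in (the struct is a model, not the kernel's vm_object; the kernel
 * additionally requires a successful trylock of the first object):
 */
struct object_model {
        int shadow_count;
        int ref_count;
        void *handle;
        int is_default_or_swap;         /* OBJT_DEFAULT || OBJT_SWAP */
        struct object_model *backing_object;
};

static int
can_move_instead_of_copy(const struct object_model *obj,
    const struct object_model *first)
{
        return (obj->shadow_count == 1 &&       /* only one shadow object */
            obj->ref_count == 1 &&              /* no other COW refs */
            obj->handle == NULL &&              /* no one else can look it up */
            obj->is_default_or_swap &&          /* anonymous memory only */
            first->backing_object == obj);      /* immediate backing object */
}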
*/ is_first_object_locked = FALSE; if ( /* * Only one shadow object */ (fs.object->shadow_count == 1) && /* * No COW refs, except us */ (fs.object->ref_count == 1) && /* * No one else can look this object up */ (fs.object->handle == NULL) && /* * No other ways to look the object up */ ((fs.object->type == OBJT_DEFAULT) || (fs.object->type == OBJT_SWAP)) && (is_first_object_locked = VM_OBJECT_TRYLOCK(fs.first_object)) && /* * We don't chase down the shadow chain */ fs.object == fs.first_object->backing_object) { vm_page_lock_queues(); /* * get rid of the unnecessary page */ vm_page_free(fs.first_m); /* * grab the page and put it into the * process'es object. The page is * automatically made dirty. */ vm_page_rename(fs.m, fs.first_object, fs.first_pindex); vm_page_unlock_queues(); vm_page_busy(fs.m); fs.first_m = fs.m; fs.m = NULL; PCPU_LAZY_INC(cnt.v_cow_optim); } else { /* * Oh, well, lets copy it. */ pmap_copy_page(fs.m, fs.first_m); fs.first_m->valid = VM_PAGE_BITS_ALL; } if (fs.m) { /* * We no longer need the old page or object. */ release_page(&fs); } /* * fs.object != fs.first_object due to above * conditional */ vm_object_pip_wakeup(fs.object); VM_OBJECT_UNLOCK(fs.object); /* * Only use the new page below... */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; fs.m = fs.first_m; if (!is_first_object_locked) VM_OBJECT_LOCK(fs.object); PCPU_LAZY_INC(cnt.v_cow_faults); } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; if (!vm_map_trylock_read(fs.map)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } fs.lookup_still_valid = TRUE; if (fs.map->timestamp != map_generation) { result = vm_map_lookup_locked(&fs.map, vaddr, fault_type, &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired); /* * If we don't need the page any longer, put it on the inactive * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { release_page(&fs); unlock_and_deallocate(&fs); /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) goto RetryFault; return (result); } if ((retry_object != fs.first_object) || (retry_pindex != fs.first_pindex)) { release_page(&fs); unlock_and_deallocate(&fs); goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } } if (prot & VM_PROT_WRITE) { vm_object_set_writeable_dirty(fs.object); /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. * * Also tell the backing pager, if any, that it should remove * any swap backing since the page is now dirty. 
*/ if (fs.entry->eflags & MAP_ENTRY_NOSYNC) { if (fs.m->dirty == 0) fs.m->oflags |= VPO_NOSYNC; } else { fs.m->oflags &= ~VPO_NOSYNC; } if (fault_flags & VM_FAULT_DIRTY) { vm_page_dirty(fs.m); vm_pager_page_unswapped(fs.m); } } /* * Page had better still be busy */ KASSERT(fs.m->oflags & VPO_BUSY, ("vm_fault: page %p not busy!", fs.m)); /* * Sanity check: page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ if (fs.m->valid != VM_PAGE_BITS_ALL) { vm_page_zero_invalid(fs.m, TRUE); printf("Warning: page %p partially invalid on fault\n", fs.m); } VM_OBJECT_UNLOCK(fs.object); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, prot, wired); if (((fault_flags & VM_FAULT_WIRE_MASK) == 0) && (wired == 0)) { vm_fault_prefault(fs.map->pmap, vaddr, fs.entry); } VM_OBJECT_LOCK(fs.object); vm_page_lock_queues(); vm_page_flag_set(fs.m, PG_REFERENCED); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (fault_flags & VM_FAULT_WIRE_MASK) { if (wired) vm_page_wire(fs.m); else vm_page_unwire(fs.m, 1); } else { vm_page_activate(fs.m); } vm_page_unlock_queues(); vm_page_wakeup(fs.m); /* * Unlock everything, and return */ unlock_and_deallocate(&fs); PROC_LOCK(curproc); if ((curproc->p_sflag & PS_INMEM) && curproc->p_stats) { if (hardfault) { curproc->p_stats->p_ru.ru_majflt++; } else { curproc->p_stats->p_ru.ru_minflt++; } } PROC_UNLOCK(curproc); return (KERN_SUCCESS); } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry) { int i; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; vm_object_t object; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; object = entry->object.vm_object; starta = addra - PFBAK * PAGE_SIZE; if (starta < entry->start) { starta = entry->start; } else if (starta > addra) { starta = 0; } for (i = 0; i < PAGEORDER_SIZE; i++) { vm_object_t backing_object, lobject; addr = addra + prefault_pageorder[i]; if (addr > addra + (PFFOR * PAGE_SIZE)) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = object; VM_OBJECT_LOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { if (lobject->backing_object_offset & PAGE_MASK) break; pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_LOCK(backing_object); VM_OBJECT_UNLOCK(lobject); lobject = backing_object; } /* * give-up when a page is not in memory */ if (m == NULL) { VM_OBJECT_UNLOCK(lobject); break; } if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (m->busy == 0) && (m->flags & PG_FICTITIOUS) == 0) { vm_page_lock_queues(); if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) vm_page_deactivate(m); pmap_enter_quick(pmap, addr, m, entry->protection); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(lobject); } } /* * vm_fault_quick: * * Ensure that the requested virtual address, which may be in userland, * is valid. Fault-in the page if necessary. 
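/*
 * [Editor's sketch -- not part of this revision.]  vm_fault_prefault() above
 * walks the prefault_pageorder[] table, probing the faulting address plus
 * -1, +1, -2, +2, -3, +3, -4, +4 pages (PFBAK and PFFOR are both 4).  A
 * standalone generator of that offset sequence, in bytes:
 */
static void
prefault_offsets(long page_size, long out[8])
{
        int i;

        for (i = 0; i < 4; i++) {
                out[2 * i] = -(i + 1) * page_size;      /* -1, -2, -3, -4 */
                out[2 * i + 1] = (i + 1) * page_size;   /* +1, +2, +3, +4 */
        }
}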
Return -1 on failure. */ int vm_fault_quick(caddr_t v, int prot) { int r; if (prot & VM_PROT_WRITE) r = subyte(v, fubyte(v)); else r = fubyte(v); return(r); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t user_wire, boolean_t fictitious) { vm_offset_t va; int rv; /* * We simulate a fault to get the page and enter it in the physical * map. For user wiring, we only ask for read access on currently * read-only sections. */ for (va = start; va < end; va += PAGE_SIZE) { rv = vm_fault(map, va, user_wire ? VM_PROT_READ : VM_PROT_READ | VM_PROT_WRITE, user_wire ? VM_FAULT_USER_WIRE : VM_FAULT_CHANGE_WIRING); if (rv) { if (va != start) vm_fault_unwire(map, start, va, fictitious); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. */ void vm_fault_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t fictitious) { vm_paddr_t pa; vm_offset_t va; pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to get their * mappings from the physical map system. */ for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa != 0) { pmap_change_wiring(pmap, va, FALSE); if (!fictitious) { vm_page_lock_queues(); vm_page_unwire(PHYS_TO_VM_PAGE(pa), 1); vm_page_unlock_queues(); } } } } /* * Routine: * vm_fault_copy_entry * Function: * Copy all of the pages from a wired-down map entry to another. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { vm_object_t backing_object, dst_object, object; vm_object_t src_object; vm_ooffset_t dst_offset; vm_ooffset_t src_offset; vm_pindex_t pindex; vm_prot_t prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; #ifdef lint src_map++; #endif /* lint */ src_object = src_entry->object.vm_object; src_offset = src_entry->offset; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(dst_entry->end - dst_entry->start)); VM_OBJECT_LOCK(dst_object); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; prot = dst_entry->max_protection; /* * Loop through all of the pages in the entry's range, copying each * one from the source object (it should be there) to the destination * object. */ for (vaddr = dst_entry->start, dst_offset = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { /* * Allocate a page in the destination object */ do { dst_m = vm_page_alloc(dst_object, OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_UNLOCK(dst_object); VM_WAIT; VM_OBJECT_LOCK(dst_object); } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ VM_OBJECT_LOCK(src_object); object = src_object; pindex = 0; while ((src_m = vm_page_lookup(object, pindex + OFF_TO_IDX(dst_offset + src_offset))) == NULL && (src_entry->protection & VM_PROT_WRITE) == 0 && (backing_object = object->backing_object) != NULL) { /* * Allow fallback to backing objects if we are reading. 
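/*
 * [Editor's sketch -- not part of this revision.]  The prefault lookup above
 * and the copy loop just below both descend a chain of backing objects,
 * adding each object's backing_object_offset (converted to pages) to the
 * page index so that the same logical page is looked up one level down.  A
 * standalone model of that index translation, for a chain described by
 * per-level page offsets:
 */
static unsigned long
translate_pindex(unsigned long pindex, const unsigned long *level_offsets,
    int levels)
{
        int i;

        for (i = 0; i < levels; i++)
                pindex += level_offsets[i];     /* OFF_TO_IDX(backing_object_offset) */
        return (pindex);
}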
*/ VM_OBJECT_LOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); VM_OBJECT_UNLOCK(object); object = backing_object; } if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); pmap_copy_page(src_m, dst_m); VM_OBJECT_UNLOCK(object); dst_m->valid = VM_PAGE_BITS_ALL; VM_OBJECT_UNLOCK(dst_object); /* * Enter it in the pmap... */ pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE); /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_LOCK(dst_object); vm_page_lock_queues(); vm_page_activate(dst_m); vm_page_unlock_queues(); vm_page_wakeup(dst_m); } VM_OBJECT_UNLOCK(dst_object); } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. This routine brackets the viable * pages for the pages to be paged in. * * Inputs: * m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray * * This routine can't block. */ static int vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) vm_page_t m; int rbehind; int rahead; vm_page_t *marray; int *reqpage; { int i,j; vm_object_t object; vm_pindex_t pindex, startpindex, endpindex, tpindex; vm_page_t rtm; int cbehind, cahead; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); object = m->object; pindex = m->pindex; /* * we don't fault-ahead for device pager */ if (object->type == OBJT_DEVICE) { *reqpage = 0; marray[0] = m; return 1; } cbehind = cahead = 0; /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) { return 0; } if ((cbehind == 0) && (cahead == 0)) { *reqpage = 0; marray[0] = m; return 1; } if (rahead > cahead) { rahead = cahead; } if (rbehind > cbehind) { rbehind = cbehind; } /* * try to do any readahead that we might have free pages for. */ if ((rahead + rbehind) > - ((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) { + ((VMCNT_GET(free_count) + VMCNT_GET(cache_count)) - + VMCNT_GET(free_reserved))) { pagedaemon_wakeup(); marray[0] = m; *reqpage = 0; return 1; } /* * scan backward for the read behind pages -- in memory */ if (pindex > 0) { if (rbehind > pindex) { rbehind = pindex; startpindex = 0; } else { startpindex = pindex - rbehind; } if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL && rtm->pindex >= startpindex) startpindex = rtm->pindex + 1; for (i = 0, tpindex = startpindex; tpindex < pindex; i++, tpindex++) { rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (rtm == NULL) { vm_page_lock_queues(); for (j = 0; j < i; j++) { vm_page_free(marray[j]); } vm_page_unlock_queues(); marray[0] = m; *reqpage = 0; return 1; } marray[i] = rtm; } } else { startpindex = 0; i = 0; } marray[i] = m; /* page offset of the required page */ *reqpage = i; tpindex = pindex + 1; i++; /* * scan forward for the read ahead pages */ endpindex = tpindex + rahead; if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex) endpindex = rtm->pindex; if (endpindex > object->size) endpindex = object->size; for (; tpindex < endpindex; i++, tpindex++) { rtm = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL); if (rtm == NULL) { break; } marray[i] = rtm; } /* return number of bytes of pages */ return i; } Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 169666) +++ head/sys/vm/vm_glue.c (revision 169667) @@ -1,1011 +1,1011 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. 
All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include "opt_kstack_pages.h" #include "opt_kstack_max_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern int maxslp; /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits(void *); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler(void *); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_ANY, scheduler, NULL) #ifndef NO_SWAPPING static void swapout(struct proc *); #endif static volatile int proc0_rescan; /* * MPSAFE * * WARNING! 
This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. In most cases * just checking the vm_map_entry is sufficient within the kernel's address * space. */ int kernacc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to kernacc (%x)\n", rw)); if ((vm_offset_t)addr + len > kernel_map->max_offset || (vm_offset_t)addr + len < (vm_offset_t)addr) return (FALSE); prot = rw; saddr = trunc_page((vm_offset_t)addr); eaddr = round_page((vm_offset_t)addr + len); vm_map_lock_read(kernel_map); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); vm_map_unlock_read(kernel_map); return (rv == TRUE); } /* * MPSAFE * * WARNING! This code calls vm_map_check_protection() which only checks * the associated vm_map_entry range. It does not determine whether the * contents of the memory is actually readable or writable. vmapbuf(), * vm_fault_quick(), or copyin()/copout()/su*()/fu*() functions should be * used in conjuction with this call. */ int useracc(addr, len, rw) void *addr; int len, rw; { boolean_t rv; vm_prot_t prot; vm_map_t map; KASSERT((rw & ~VM_PROT_ALL) == 0, ("illegal ``rw'' argument to useracc (%x)\n", rw)); prot = rw; map = &curproc->p_vmspace->vm_map; if ((vm_offset_t)addr + len > vm_map_max(map) || (vm_offset_t)addr + len < (vm_offset_t)addr) { return (FALSE); } vm_map_lock_read(map); rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), prot); vm_map_unlock_read(map); return (rv == TRUE); } int vslock(void *addr, size_t len) { vm_offset_t end, last, start; vm_size_t npages; int error; last = (vm_offset_t)addr + len; start = trunc_page((vm_offset_t)addr); end = round_page(last); if (last < (vm_offset_t)addr || end < (vm_offset_t)addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); PROC_LOCK(curproc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&curproc->p_vmspace->vm_map))) > lim_cur(curproc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(curproc); return (ENOMEM); } PROC_UNLOCK(curproc); #if 0 /* * XXX - not yet * * The limit for transient usage of wired pages should be * larger than for "permanent" wired pages (mlock()). * * Also, the sysctl code, which is the only present user * of vslock(), does a hard loop on EAGAIN. */ - if (npages + cnt.v_wire_count > vm_page_max_wired) + if (npages + VMCNT_GET(wire_count) > vm_page_max_wired) return (EAGAIN); #endif error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); /* * Return EFAULT on error to match copy{in,out}() behaviour * rather than returning ENOMEM like mlock() would. */ return (error == KERN_SUCCESS ? 0 : EFAULT); } void vsunlock(void *addr, size_t len) { /* Rely on the parameter sanity checks performed by vslock(). */ (void)vm_map_unwire(&curproc->p_vmspace->vm_map, trunc_page((vm_offset_t)addr), round_page((vm_offset_t)addr + len), VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES); } /* * Pin the page contained within the given object at the given offset. If the * page is not resident, allocate and load it using the given object's pager. * Return the pinned page if successful; otherwise, return NULL. 
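vslock() above truncates the start address down and rounds the end up to page boundaries before converting the span to a page count and checking it against vm_page_max_wired and RLIMIT_MEMLOCK. A small sketch of that arithmetic follows, with simplified stand-ins for the machine-dependent trunc_page/round_page/atop macros.

#include <stddef.h>

#define	SKETCH_PAGE_SIZE	4096UL
#define	sketch_trunc_page(x)	((x) & ~(SKETCH_PAGE_SIZE - 1))
#define	sketch_round_page(x)	(((x) + SKETCH_PAGE_SIZE - 1) & \
				    ~(SKETCH_PAGE_SIZE - 1))
#define	sketch_atop(x)		((x) / SKETCH_PAGE_SIZE)

/*
 * Number of whole pages spanned by [addr, addr + len), as vslock()
 * computes it before comparing against the wiring limits.
 */
static size_t
sketch_span_pages(unsigned long addr, size_t len)
{
	unsigned long start, end;

	start = sketch_trunc_page(addr);
	end = sketch_round_page(addr + len);
	return ((size_t)sketch_atop(end - start));
}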
*/ static vm_page_t vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m, ma[1]; vm_pindex_t pindex; int rv; VM_OBJECT_LOCK(object); pindex = OFF_TO_IDX(offset); m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { ma[0] = m; rv = vm_pager_get_pages(object, ma, 1, 0); m = vm_page_lookup(object, pindex); if (m == NULL) goto out; if (m->valid == 0 || rv != VM_PAGER_OK) { vm_page_lock_queues(); vm_page_free(m); vm_page_unlock_queues(); m = NULL; goto out; } } vm_page_lock_queues(); vm_page_hold(m); vm_page_unlock_queues(); vm_page_wakeup(m); out: VM_OBJECT_UNLOCK(object); return (m); } /* * Return a CPU private mapping to the page at the given offset within the * given object. The page is pinned before it is mapped. */ struct sf_buf * vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset) { vm_page_t m; m = vm_imgact_hold_page(object, offset); if (m == NULL) return (NULL); sched_pin(); return (sf_buf_alloc(m, SFB_CPUPRIVATE)); } /* * Destroy the given CPU private mapping and unpin the page that it mapped. */ void vm_imgact_unmap_page(struct sf_buf *sf) { vm_page_t m; m = sf_buf_page(sf); sf_buf_free(sf); sched_unpin(); vm_page_lock_queues(); vm_page_unhold(m); vm_page_unlock_queues(); } #ifndef KSTACK_MAX_PAGES #define KSTACK_MAX_PAGES 32 #endif /* * Create the kernel stack (including pcb for i386) for a new thread. * This routine directly affects the fork perf for a process and * create performance for a thread. */ void vm_thread_new(struct thread *td, int pages) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i; /* Bounds check */ if (pages <= 1) pages = KSTACK_PAGES; else if (pages > KSTACK_MAX_PAGES) pages = KSTACK_MAX_PAGES; /* * Allocate an object for the kstack. */ ksobj = vm_object_allocate(OBJT_DEFAULT, pages); td->td_kstack_obj = ksobj; /* * Get a kernel virtual address for this thread's kstack. */ ks = kmem_alloc_nofault(kernel_map, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); if (ks == 0) panic("vm_thread_new: kstack allocation failed"); if (KSTACK_GUARD_PAGES != 0) { pmap_qremove(ks, KSTACK_GUARD_PAGES); ks += KSTACK_GUARD_PAGES * PAGE_SIZE; } td->td_kstack = ks; /* * Knowing the number of pages allocated is useful when you * want to deallocate them. */ td->td_kstack_pages = pages; /* * For the length of the stack, link in a real page of ram for each * page of stack. */ VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { /* * Get a kernel stack page. */ m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY | VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_WIRED); ma[i] = m; m->valid = VM_PAGE_BITS_ALL; } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(ks, ma, pages); } /* * Dispose of a thread's kernel stack. */ void vm_thread_dispose(struct thread *td) { vm_object_t ksobj; vm_offset_t ks; vm_page_t m; int i, pages; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; ks = td->td_kstack; pmap_qremove(ks, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_dispose: kstack already missing?"); vm_page_lock_queues(); vm_page_unwire(m, 0); vm_page_free(m); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); vm_object_deallocate(ksobj); kmem_free(kernel_map, ks - (KSTACK_GUARD_PAGES * PAGE_SIZE), (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE); } /* * Allow a thread's kernel stack to be paged out. 
*/ void vm_thread_swapout(struct thread *td) { vm_object_t ksobj; vm_page_t m; int i, pages; cpu_thread_swapout(td); pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; pmap_qremove(td->td_kstack, pages); VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_lookup(ksobj, i); if (m == NULL) panic("vm_thread_swapout: kstack already missing?"); vm_page_lock_queues(); vm_page_dirty(m); vm_page_unwire(m, 0); vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(ksobj); } /* * Bring the kernel stack for a specified thread back in. */ void vm_thread_swapin(struct thread *td) { vm_object_t ksobj; vm_page_t m, ma[KSTACK_MAX_PAGES]; int i, pages, rv; pages = td->td_kstack_pages; ksobj = td->td_kstack_obj; VM_OBJECT_LOCK(ksobj); for (i = 0; i < pages; i++) { m = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m->valid != VM_PAGE_BITS_ALL) { rv = vm_pager_get_pages(ksobj, &m, 1, 0); if (rv != VM_PAGER_OK) panic("vm_thread_swapin: cannot get kstack for proc: %d", td->td_proc->p_pid); m = vm_page_lookup(ksobj, i); m->valid = VM_PAGE_BITS_ALL; } ma[i] = m; vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); vm_page_wakeup(m); } VM_OBJECT_UNLOCK(ksobj); pmap_qenter(td->td_kstack, ma, pages); cpu_thread_swapin(td); } /* * Set up a variable-sized alternate kstack. */ void vm_thread_new_altkstack(struct thread *td, int pages) { td->td_altkstack = td->td_kstack; td->td_altkstack_obj = td->td_kstack_obj; td->td_altkstack_pages = td->td_kstack_pages; vm_thread_new(td, pages); } /* * Restore the original kstack. */ void vm_thread_dispose_altkstack(struct thread *td) { vm_thread_dispose(td); td->td_kstack = td->td_altkstack; td->td_kstack_obj = td->td_altkstack_obj; td->td_kstack_pages = td->td_altkstack_pages; td->td_altkstack = 0; td->td_altkstack_obj = NULL; td->td_altkstack_pages = 0; } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. The new process is set up so that it returns directly * to user mode to avoid stack copying and relocation problems. */ void vm_forkproc(td, p2, td2, flags) struct thread *td; struct proc *p2; struct thread *td2; int flags; { struct proc *p1 = td->td_proc; if ((flags & RFPROC) == 0) { /* * Divorce the memory, if it is shared, essentially * this changes shared memory amongst threads, into * COW locally. */ if ((flags & RFMEM) == 0) { if (p1->p_vmspace->vm_refcnt > 1) { vmspace_unshare(p1); } } cpu_fork(td, p2, td2, flags); return; } if (flags & RFMEM) { p2->p_vmspace = p1->p_vmspace; atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); } while (vm_page_count_severe()) { VM_WAIT; } if ((flags & RFMEM) == 0) { p2->p_vmspace = vmspace_fork(p1->p_vmspace); if (p1->p_vmspace->vm_shm) shmfork(p1, p2); } /* * cpu_fork will copy and update the pcb, set up the kernel stack, * and make the child ready to run. */ cpu_fork(td, p2, td2, flags); } /* * Called after process has been wait(2)'ed apon and is being reaped. * The idea is to reclaim resources that we could not reclaim while * the process was still executing. */ void vm_waitproc(p) struct proc *p; { vmspace_exitfree(p); /* and clean-out the vmspace */ } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. 
*/ static void vm_init_limits(udata) void *udata; { struct proc *p = udata; struct plimit *limp; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ limp = p->p_limit; limp->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz; limp->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz; limp->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz; limp->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz; /* limit the limit to no less than 2MB */ - rss_limit = max(cnt.v_free_count, 512); + rss_limit = max(VMCNT_GET(free_count), 512); limp->pl_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); limp->pl_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { #ifdef NO_SWAPPING PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_sflag & PS_INMEM) == 0) panic("faultin: proc swapped out with NO_SWAPPING!"); #else /* !NO_SWAPPING */ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * If another process is swapping in this process, * just wait until it finishes. */ if (p->p_sflag & PS_SWAPPINGIN) msleep(&p->p_sflag, &p->p_mtx, PVM, "faultin", 0); else if ((p->p_sflag & PS_INMEM) == 0) { /* * Don't let another thread swap process p out while we are * busy swapping it in. */ ++p->p_lock; mtx_lock_spin(&sched_lock); p->p_sflag |= PS_SWAPPINGIN; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapin(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGIN; p->p_sflag |= PS_INMEM; FOREACH_THREAD_IN_PROC(p, td) { TD_CLR_SWAPPED(td); if (TD_CAN_RUN(td)) setrunnable(td); } mtx_unlock_spin(&sched_lock); wakeup(&p->p_sflag); /* Allow other threads to swap p out now. */ --p->p_lock; } #endif /* NO_SWAPPING */ } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. * * XXXKSE - process with the thread with highest priority counts.. * * Giant is held on entry. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { struct proc *p; struct thread *td; int pri; struct proc *pp; int ppri; mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED); mtx_unlock(&Giant); loop: if (vm_page_count_min()) { VM_WAIT; mtx_lock_spin(&sched_lock); proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } pp = NULL; ppri = INT_MIN; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { continue; } mtx_lock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) { /* * An otherwise runnable thread of a process * swapped out has only the TDI_SWAPPED bit set. * */ if (td->td_inhibitors == TDI_SWAPPED) { pri = p->p_swtime + td->td_slptime; if ((p->p_sflag & PS_SWAPINREQ) == 0) { pri -= p->p_nice * 8; } /* * if this thread is higher priority * and there is enough space, then select * this process instead of the previous * selection. */ if (pri > ppri) { pp = p; ppri = pri; } } } mtx_unlock_spin(&sched_lock); } sx_sunlock(&allproc_lock); /* * Nothing to do, back to sleep. 
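scheduler() above picks its swapin candidate by scoring each swapped-out but otherwise runnable thread: swapped-out time plus sleep time, minus a nice penalty unless a swapin has already been requested, keeping the highest score. A compact sketch of that metric is below; the field names are illustrative stand-ins for p_swtime, td_slptime and p_nice.

struct swapin_candidate {
	int swtime;	/* seconds the process has been swapped out */
	int slptime;	/* seconds the thread has been sleeping */
	int nice;	/* process nice value */
};

/*
 * Swapin priority as computed in scheduler(): the nice penalty is
 * skipped once a swapin has already been requested for the process.
 */
static int
swapin_priority(const struct swapin_candidate *c, int swapinreq_pending)
{
	int pri;

	pri = c->swtime + c->slptime;
	if (!swapinreq_pending)
		pri -= c->nice * 8;
	return (pri);
}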
*/ if ((p = pp) == NULL) { mtx_lock_spin(&sched_lock); if (!proc0_rescan) { TD_SET_IWAIT(&thread0); mi_switch(SW_VOL, NULL); } proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } PROC_LOCK(p); /* * Another process may be bringing or may have already * brought this process in while we traverse all threads. * Or, this process may even be being swapped out again. */ if (p->p_sflag & (PS_INMEM | PS_SWAPPINGOUT | PS_SWAPPINGIN)) { PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPINREQ; mtx_unlock_spin(&sched_lock); /* * We would like to bring someone in. (only if there is space). * [What checks the space? ] */ faultin(p); PROC_UNLOCK(p); mtx_lock_spin(&sched_lock); p->p_swtime = 0; proc0_rescan = 0; mtx_unlock_spin(&sched_lock); goto loop; } void kick_proc0(void) { struct thread *td = &thread0; if (TD_AWAITING_INTR(td)) { CTR2(KTR_INTR, "%s: sched_add %d", __func__, 0); TD_CLR_IWAIT(td); sched_add(td, SRQ_INTR); } else { proc0_rescan = 1; CTR2(KTR_INTR, "%s: state %d", __func__, td->td_state); } } #ifndef NO_SWAPPING /* * Swap_idle_threshold1 is the guaranteed swapped in time for a process */ static int swap_idle_threshold1 = 2; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW, &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process"); /* * Swap_idle_threshold2 is the time that a process can be idle before * it will be swapped out, if idle swapping is enabled. */ static int swap_idle_threshold2 = 10; SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW, &swap_idle_threshold2, 0, "Time before a process will be swapped out"); /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. */ void swapout_procs(action) int action; { struct proc *p; struct thread *td; int didswap = 0; retry: sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { struct vmspace *vm; int minslptime = 100000; /* * Watch out for a process in * creation. It may have no * address space or lock yet. */ mtx_lock_spin(&sched_lock); if (p->p_state == PRS_NEW) { mtx_unlock_spin(&sched_lock); continue; } mtx_unlock_spin(&sched_lock); /* * An aio daemon switches its * address space while running. * Perform a quick check whether * a process has P_SYSTEM. */ if ((p->p_flag & P_SYSTEM) != 0) continue; /* * Do not swapout a process that * is waiting for VM data * structures as there is a possible * deadlock. Test this first as * this may block. * * Lock the map until swapout * finishes, or a thread of this * process may attempt to alter * the map. */ vm = vmspace_acquire_ref(p); if (vm == NULL) continue; if (!vm_map_trylock(&vm->vm_map)) goto nextproc1; PROC_LOCK(p); if (p->p_lock != 0 || (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT) ) != 0) { goto nextproc2; } /* * only aiod changes vmspace, however it will be * skipped because of the if statement above checking * for P_SYSTEM */ if ((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) != PS_INMEM) goto nextproc2; switch (p->p_state) { default: /* Don't swap out processes in any sort * of 'special' state. 
*/ break; case PRS_NORMAL: mtx_lock_spin(&sched_lock); /* * do not swapout a realtime process * Check all the thread groups.. */ FOREACH_THREAD_IN_PROC(p, td) { if (PRI_IS_REALTIME(td->td_pri_class)) goto nextproc; /* * Guarantee swap_idle_threshold1 * time in memory. */ if (td->td_slptime < swap_idle_threshold1) goto nextproc; /* * Do not swapout a process if it is * waiting on a critical event of some * kind or there is a thread whose * pageable memory may be accessed. * * This could be refined to support * swapping out a thread. */ if ((td->td_priority) < PSOCK || !thread_safetoswapout(td)) goto nextproc; /* * If the system is under memory stress, * or if we are swapping * idle processes >= swap_idle_threshold2, * then swap the process out. */ if (((action & VM_SWAP_NORMAL) == 0) && (((action & VM_SWAP_IDLE) == 0) || (td->td_slptime < swap_idle_threshold2))) goto nextproc; if (minslptime > td->td_slptime) minslptime = td->td_slptime; } /* * If the pageout daemon didn't free enough pages, * or if this process is idle and the system is * configured to swap proactively, swap it out. */ if ((action & VM_SWAP_NORMAL) || ((action & VM_SWAP_IDLE) && (minslptime > swap_idle_threshold2))) { swapout(p); didswap++; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); vmspace_free(vm); sx_sunlock(&allproc_lock); goto retry; } nextproc: mtx_unlock_spin(&sched_lock); } nextproc2: PROC_UNLOCK(p); vm_map_unlock(&vm->vm_map); nextproc1: vmspace_free(vm); continue; } sx_sunlock(&allproc_lock); /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapout(p) struct proc *p; { struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED); #if defined(SWAP_DEBUG) printf("swapping out %d\n", p->p_pid); #endif /* * The states of this process and its threads may have changed * by now. Assuming that there is only one pageout daemon thread, * this process should still be in memory. */ KASSERT((p->p_sflag & (PS_INMEM|PS_SWAPPINGOUT|PS_SWAPPINGIN)) == PS_INMEM, ("swapout: lost a swapout race?")); #if defined(INVARIANTS) /* * Make sure that all threads are safe to be swapped out. * * Alternatively, we could swap out only safe threads. */ FOREACH_THREAD_IN_PROC(p, td) { KASSERT(thread_safetoswapout(td), ("swapout: there is a thread not safe for swapout")); } #endif /* INVARIANTS */ ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace); p->p_sflag &= ~PS_INMEM; p->p_sflag |= PS_SWAPPINGOUT; PROC_UNLOCK(p); FOREACH_THREAD_IN_PROC(p, td) TD_SET_SWAPPED(td); mtx_unlock_spin(&sched_lock); FOREACH_THREAD_IN_PROC(p, td) vm_thread_swapout(td); PROC_LOCK(p); mtx_lock_spin(&sched_lock); p->p_sflag &= ~PS_SWAPPINGOUT; p->p_swtime = 0; } #endif /* !NO_SWAPPING */ Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 169666) +++ head/sys/vm/vm_map.c (revision 169667) @@ -1,3414 +1,3414 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
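swapout_procs() above applies two idle thresholds per thread before swapping a process out: swap_idle_threshold1 guarantees a minimum in-memory time, and swap_idle_threshold2 gates idle-driven swapouts when VM_SWAP_NORMAL is not set. The predicate below sketches that test; the SK_* flag values are illustrative, not the kernel's.

#define	SK_VM_SWAP_NORMAL	0x01
#define	SK_VM_SWAP_IDLE		0x02

static int sk_swap_idle_threshold1 = 2;		/* guaranteed in-memory time */
static int sk_swap_idle_threshold2 = 10;	/* idle time before swapout */

/* Non-zero if a thread idle for slptime seconds may be swapped out. */
static int
sk_thread_swapout_ok(int action, int slptime)
{

	if (slptime < sk_swap_idle_threshold1)
		return (0);
	if ((action & SK_VM_SWAP_NORMAL) == 0 &&
	    ((action & SK_VM_SWAP_IDLE) == 0 ||
	    slptime < sk_swap_idle_threshold2))
		return (0);
	return (1);
}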
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory mapping module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. 
* * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ static struct mtx map_sleep_mtx; static uma_zone_t mapentzone; static uma_zone_t kmapentzone; static uma_zone_t mapzone; static uma_zone_t vmspace_zone; static struct vm_object kmapentobj; static int vmspace_zinit(void *mem, int size, int flags); static void vmspace_zfini(void *mem, int size); static int vm_map_zinit(void *mem, int ize, int flags); static void vm_map_zfini(void *mem, int size); static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max); #ifdef INVARIANTS static void vm_map_zdtor(void *mem, int size, void *arg); static void vmspace_zdtor(void *mem, int size, void *arg); #endif /* * PROC_VMSPACE_{UN,}LOCK() can be a noop as long as vmspaces are type * stable. */ #define PROC_VMSPACE_LOCK(p) do { } while (0) #define PROC_VMSPACE_UNLOCK(p) do { } while (0) void vm_map_startup(void) { mtx_init(&map_sleep_mtx, "vm map sleep mutex", NULL, MTX_DEF); mapzone = uma_zcreate("MAP", sizeof(struct vm_map), NULL, #ifdef INVARIANTS vm_map_zdtor, #else NULL, #endif vm_map_zinit, vm_map_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_prealloc(mapzone, MAX_KMAP); kmapentzone = uma_zcreate("KMAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_MTXCLASS | UMA_ZONE_VM); uma_prealloc(kmapentzone, MAX_KMAPENT); mapentzone = uma_zcreate("MAP ENTRY", sizeof(struct vm_map_entry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); } static void vmspace_zfini(void *mem, int size) { struct vmspace *vm; vm = (struct vmspace *)mem; pmap_release(vmspace_pmap(vm)); vm_map_zfini(&vm->vm_map, sizeof(vm->vm_map)); } static int vmspace_zinit(void *mem, int size, int flags) { struct vmspace *vm; vm = (struct vmspace *)mem; (void)vm_map_zinit(&vm->vm_map, sizeof(vm->vm_map), flags); pmap_pinit(vmspace_pmap(vm)); return (0); } static void vm_map_zfini(void *mem, int size) { vm_map_t map; map = (vm_map_t)mem; mtx_destroy(&map->system_mtx); sx_destroy(&map->lock); } static int vm_map_zinit(void *mem, int size, int flags) { vm_map_t map; map = (vm_map_t)mem; map->nentries = 0; map->size = 0; mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); return (0); } #ifdef INVARIANTS static void vmspace_zdtor(void *mem, int size, void *arg) { struct vmspace *vm; vm = (struct vmspace *)mem; vm_map_zdtor(&vm->vm_map, sizeof(vm->vm_map), arg); } static void vm_map_zdtor(void *mem, int size, void *arg) { vm_map_t map; map = (vm_map_t)mem; KASSERT(map->nentries == 0, ("map %p nentries == %d on free.", map, map->nentries)); KASSERT(map->size == 0, ("map %p size == %lu on free.", map, (unsigned long)map->size)); } #endif /* INVARIANTS */ /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. 
*/ struct vmspace * vmspace_alloc(min, max) vm_offset_t min, max; { struct vmspace *vm; vm = uma_zalloc(vmspace_zone, M_WAITOK); CTR1(KTR_VM, "vmspace_alloc: %p", vm); _vm_map_init(&vm->vm_map, min, max); vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */ vm->vm_refcnt = 1; vm->vm_shm = NULL; vm->vm_swrss = 0; vm->vm_tsize = 0; vm->vm_dsize = 0; vm->vm_ssize = 0; vm->vm_taddr = 0; vm->vm_daddr = 0; vm->vm_maxsaddr = 0; return (vm); } void vm_init2(void) { - uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(cnt.v_page_count, + uma_zone_set_obj(kmapentzone, &kmapentobj, lmin(VMCNT_GET(page_count), (VM_MAX_KERNEL_ADDRESS - KERNBASE) / PAGE_SIZE) / 8 + maxproc * 2 + maxfiles); vmspace_zone = uma_zcreate("VMSPACE", sizeof(struct vmspace), NULL, #ifdef INVARIANTS vmspace_zdtor, #else NULL, #endif vmspace_zinit, vmspace_zfini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } static inline void vmspace_dofree(struct vmspace *vm) { CTR1(KTR_VM, "vmspace_free: %p", vm); /* * Make sure any SysV shm is freed, it might not have been in * exit1(). */ shmexit(vm); /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ (void)vm_map_remove(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); uma_zfree(vmspace_zone, vm); } void vmspace_free(struct vmspace *vm) { int refcnt; if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); do refcnt = vm->vm_refcnt; while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) vmspace_dofree(vm); } void vmspace_exitfree(struct proc *p) { struct vmspace *vm; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; p->p_vmspace = NULL; PROC_VMSPACE_UNLOCK(p); KASSERT(vm == &vmspace0, ("vmspace_exitfree: wrong vmspace")); vmspace_free(vm); } void vmspace_exit(struct thread *td) { int refcnt; struct vmspace *vm; struct proc *p; /* * Release user portion of address space. * This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * * The last exiting process to reach this point releases as * much of the environment as it can. vmspace_dofree() is the * slower fallback in case another process had a temporary * reference to the vmspace. */ p = td->td_proc; vm = p->p_vmspace; atomic_add_int(&vmspace0.vm_refcnt, 1); do { refcnt = vm->vm_refcnt; if (refcnt > 1 && p->p_vmspace != &vmspace0) { /* Switch now since other proc might free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1)); if (refcnt == 1) { if (p->p_vmspace != vm) { /* vmspace not yet freed, switch back */ PROC_VMSPACE_LOCK(p); p->p_vmspace = vm; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); } pmap_remove_pages(vmspace_pmap(vm)); /* Switch now since this proc will free vmspace */ PROC_VMSPACE_LOCK(p); p->p_vmspace = &vmspace0; PROC_VMSPACE_UNLOCK(p); pmap_activate(td); vmspace_dofree(vm); } } /* Acquire reference to vmspace owned by another process. 
*/ struct vmspace * vmspace_acquire_ref(struct proc *p) { struct vmspace *vm; int refcnt; PROC_VMSPACE_LOCK(p); vm = p->p_vmspace; if (vm == NULL) { PROC_VMSPACE_UNLOCK(p); return (NULL); } do { refcnt = vm->vm_refcnt; if (refcnt <= 0) { /* Avoid 0->1 transition */ PROC_VMSPACE_UNLOCK(p); return (NULL); } } while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt + 1)); if (vm != p->p_vmspace) { PROC_VMSPACE_UNLOCK(p); vmspace_free(vm); return (NULL); } PROC_VMSPACE_UNLOCK(p); return (vm); } void _vm_map_lock(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_lock_flags(&map->system_mtx, 0, file, line); else _sx_xlock(&map->lock, file, line); map->timestamp++; } void _vm_map_unlock(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_unlock_flags(&map->system_mtx, 0, file, line); else _sx_xunlock(&map->lock, file, line); } void _vm_map_lock_read(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_lock_flags(&map->system_mtx, 0, file, line); else _sx_xlock(&map->lock, file, line); } void _vm_map_unlock_read(vm_map_t map, const char *file, int line) { if (map->system_map) _mtx_unlock_flags(&map->system_mtx, 0, file, line); else _sx_xunlock(&map->lock, file, line); } int _vm_map_trylock(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !_mtx_trylock(&map->system_mtx, 0, file, line) : !_sx_try_xlock(&map->lock, file, line); if (error == 0) map->timestamp++; return (error == 0); } int _vm_map_trylock_read(vm_map_t map, const char *file, int line) { int error; error = map->system_map ? !_mtx_trylock(&map->system_mtx, 0, file, line) : !_sx_try_xlock(&map->lock, file, line); return (error == 0); } int _vm_map_lock_upgrade(vm_map_t map, const char *file, int line) { #ifdef INVARIANTS if (map->system_map) { _mtx_assert(&map->system_mtx, MA_OWNED, file, line); } else _sx_assert(&map->lock, SX_XLOCKED, file, line); #endif map->timestamp++; return (0); } void _vm_map_lock_downgrade(vm_map_t map, const char *file, int line) { #ifdef INVARIANTS if (map->system_map) { _mtx_assert(&map->system_mtx, MA_OWNED, file, line); } else _sx_assert(&map->lock, SX_XLOCKED, file, line); #endif } /* * vm_map_unlock_and_wait: */ int vm_map_unlock_and_wait(vm_map_t map, boolean_t user_wait) { mtx_lock(&map_sleep_mtx); vm_map_unlock(map); return (msleep(&map->root, &map_sleep_mtx, PDROP | PVM, "vmmaps", 0)); } /* * vm_map_wakeup: */ void vm_map_wakeup(vm_map_t map) { /* * Acquire and release map_sleep_mtx to prevent a wakeup() * from being performed (and lost) between the vm_map_unlock() * and the msleep() in vm_map_unlock_and_wait(). */ mtx_lock(&map_sleep_mtx); mtx_unlock(&map_sleep_mtx); wakeup(&map->root); } long vmspace_resident_count(struct vmspace *vmspace) { return pmap_resident_count(vmspace_pmap(vmspace)); } long vmspace_wired_count(struct vmspace *vmspace) { return pmap_wired_count(vmspace_pmap(vmspace)); } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap_t pmap, vm_offset_t min, vm_offset_t max) { vm_map_t result; result = uma_zalloc(mapzone, M_WAITOK); CTR1(KTR_VM, "vm_map_create: %p", result); _vm_map_init(result, min, max); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. 
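vmspace_acquire_ref() above bumps the vmspace reference count with a compare-and-swap loop that refuses to act once the count has reached zero, so a vmspace already being torn down is never resurrected. The sketch below shows the same acquire-if-live pattern using C11 atomics as a stand-in for the kernel's atomic_cmpset_int().

#include <stdatomic.h>
#include <stdbool.h>

/*
 * Take a reference only if the count is still positive; a count that
 * has dropped to zero means teardown may already be in progress.
 */
static bool
refcount_acquire_if_live(atomic_int *refcnt)
{
	int old;

	do {
		old = atomic_load(refcnt);
		if (old <= 0)
			return (false);		/* avoid the 0 -> 1 transition */
	} while (!atomic_compare_exchange_weak(refcnt, &old, old + 1));
	return (true);
}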
*/ static void _vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) { map->header.next = map->header.prev = &map->header; map->needs_wakeup = FALSE; map->system_map = 0; map->min_offset = min; map->max_offset = max; map->flags = 0; map->root = NULL; map->timestamp = 0; } void vm_map_init(vm_map_t map, vm_offset_t min, vm_offset_t max) { _vm_map_init(map, min, max); mtx_init(&map->system_mtx, "system map", NULL, MTX_DEF | MTX_DUPOK); sx_init(&map->lock, "user map"); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(vm_map_t map, vm_map_entry_t entry) { uma_zfree(map->system_map ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static vm_map_entry_t vm_map_entry_create(vm_map_t map) { vm_map_entry_t new_entry; if (map->system_map) new_entry = uma_zalloc(kmapentzone, M_NOWAIT); else new_entry = uma_zalloc(mapentzone, M_WAITOK); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return (new_entry); } /* * vm_map_entry_set_behavior: * * Set the expected access behavior, either normal, random, or * sequential. */ static inline void vm_map_entry_set_behavior(vm_map_entry_t entry, u_char behavior) { entry->eflags = (entry->eflags & ~MAP_ENTRY_BEHAV_MASK) | (behavior & MAP_ENTRY_BEHAV_MASK); } /* * vm_map_entry_set_max_free: * * Set the max_free field in a vm_map_entry. */ static inline void vm_map_entry_set_max_free(vm_map_entry_t entry) { entry->max_free = entry->adj_free; if (entry->left != NULL && entry->left->max_free > entry->max_free) entry->max_free = entry->left->max_free; if (entry->right != NULL && entry->right->max_free > entry->max_free) entry->max_free = entry->right->max_free; } /* * vm_map_entry_splay: * * The Sleator and Tarjan top-down splay algorithm with the * following variation. Max_free must be computed bottom-up, so * on the downward pass, maintain the left and right spines in * reverse order. Then, make a second pass up each side to fix * the pointers and compute max_free. The time bound is O(log n) * amortized. * * The new root is the vm_map_entry containing "addr", or else an * adjacent entry (lower or higher) if addr is not in the tree. * * The map must be locked, and leaves it so. * * Returns: the new root. */ static vm_map_entry_t vm_map_entry_splay(vm_offset_t addr, vm_map_entry_t root) { vm_map_entry_t llist, rlist; vm_map_entry_t ltree, rtree; vm_map_entry_t y; /* Special case of empty tree. */ if (root == NULL) return (root); /* * Pass One: Splay down the tree until we find addr or a NULL * pointer where addr would go. llist and rlist are the two * sides in reverse order (bottom-up), with llist linked by * the right pointer and rlist linked by the left pointer in * the vm_map_entry. Wait until Pass Two to set max_free on * the two spines. */ llist = NULL; rlist = NULL; for (;;) { /* root is never NULL in here. */ if (addr < root->start) { y = root->left; if (y == NULL) break; if (addr < y->start && y->left != NULL) { /* Rotate right and put y on rlist. */ root->left = y->right; y->right = root; vm_map_entry_set_max_free(root); root = y->left; y->left = rlist; rlist = y; } else { /* Put root on rlist. */ root->left = rlist; rlist = root; root = y; } } else { y = root->right; if (addr < root->end || y == NULL) break; if (addr >= y->end && y->right != NULL) { /* Rotate left and put y on llist. 
*/ root->right = y->left; y->left = root; vm_map_entry_set_max_free(root); root = y->right; y->right = llist; llist = y; } else { /* Put root on llist. */ root->right = llist; llist = root; root = y; } } } /* * Pass Two: Walk back up the two spines, flip the pointers * and set max_free. The subtrees of the root go at the * bottom of llist and rlist. */ ltree = root->left; while (llist != NULL) { y = llist->right; llist->right = ltree; vm_map_entry_set_max_free(llist); ltree = llist; llist = y; } rtree = root->right; while (rlist != NULL) { y = rlist->left; rlist->left = rtree; vm_map_entry_set_max_free(rlist); rtree = rlist; rlist = y; } /* * Final assembly: add ltree and rtree as subtrees of root. */ root->left = ltree; root->right = rtree; vm_map_entry_set_max_free(root); return (root); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static void vm_map_entry_link(vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) { CTR4(KTR_VM, "vm_map_entry_link: map %p, nentries %d, entry %p, after %p", map, map->nentries, entry, after_where); map->nentries++; entry->prev = after_where; entry->next = after_where->next; entry->next->prev = entry; after_where->next = entry; if (after_where != &map->header) { if (after_where != map->root) vm_map_entry_splay(after_where->start, map->root); entry->right = after_where->right; entry->left = after_where; after_where->right = NULL; after_where->adj_free = entry->start - after_where->end; vm_map_entry_set_max_free(after_where); } else { entry->right = map->root; entry->left = NULL; } entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); map->root = entry; } static void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev, root; if (entry != map->root) vm_map_entry_splay(entry->start, map->root); if (entry->left == NULL) root = entry->right; else { root = vm_map_entry_splay(entry->start, entry->left); root->right = entry->right; root->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - root->end; vm_map_entry_set_max_free(root); } map->root = root; prev = entry->prev; next = entry->next; next->prev = prev; prev->next = next; map->nentries--; CTR3(KTR_VM, "vm_map_entry_unlink: map %p, nentries %d, entry %p", map, map->nentries, entry); } /* * vm_map_entry_resize_free: * * Recompute the amount of free space following a vm_map_entry * and propagate that value up the tree. Call this function after * resizing a map entry in-place, that is, without a call to * vm_map_entry_link() or _unlink(). * * The map must be locked, and leaves it so. */ static void vm_map_entry_resize_free(vm_map_t map, vm_map_entry_t entry) { /* * Using splay trees without parent pointers, propagating * max_free up the tree is done by moving the entry to the * root and making the change there. */ if (entry != map->root) map->root = vm_map_entry_splay(entry->start, map->root); entry->adj_free = (entry->next == &map->header ? map->max_offset : entry->next->start) - entry->end; vm_map_entry_set_max_free(entry); } /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. 
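The splay code above keeps two derived fields per map entry: adj_free, the gap immediately following the entry, and max_free, the largest gap anywhere in its subtree, recomputed bottom-up by vm_map_entry_set_max_free(). A stripped-down sketch of that recomputation over an illustrative node type follows.

struct gap_node {
	struct gap_node	*left;
	struct gap_node	*right;
	unsigned long	adj_free;	/* gap after this entry */
	unsigned long	max_free;	/* largest gap in this subtree */
};

/* Recompute max_free from this node's own gap and its children. */
static void
gap_node_set_max_free(struct gap_node *n)
{

	n->max_free = n->adj_free;
	if (n->left != NULL && n->left->max_free > n->max_free)
		n->max_free = n->left->max_free;
	if (n->right != NULL && n->right->max_free > n->max_free)
		n->max_free = n->right->max_free;
}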
*/ boolean_t vm_map_lookup_entry( vm_map_t map, vm_offset_t address, vm_map_entry_t *entry) /* OUT */ { vm_map_entry_t cur; cur = vm_map_entry_splay(address, map->root); if (cur == NULL) *entry = &map->header; else { map->root = cur; if (address >= cur->start) { *entry = cur; if (cur->end > address) return (TRUE); } else *entry = cur->prev; } return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; /* * Check that the start and end points are not bogus. */ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) { protoeflags |= MAP_ENTRY_NOFAULT; KASSERT(object == NULL, ("vm_map_insert: paradoxical MAP_NOFAULT request")); } if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (object != NULL) { /* * OBJ_ONEMAPPING must be cleared unless this mapping * is trivially proven to be the only mapping for any * of the object's pages. (Object granularity * reference counting is insufficient to recognize * aliases with precision.) */ VM_OBJECT_LOCK(object); if (object->ref_count > 1 || object->shadow_count != 0) vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(object); } else if ((prev_entry != &map->header) && (prev_entry->eflags == protoeflags) && (prev_entry->end == start) && (prev_entry->wired_count == 0) && ((prev_entry->object.vm_object == NULL) || vm_object_coalesce(prev_entry->object.vm_object, prev_entry->offset, (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end)))) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) && (prev_entry->protection == prot) && (prev_entry->max_protection == max)) { map->size += (end - prev_entry->end); prev_entry->end = end; vm_map_entry_resize_free(map, prev_entry); vm_map_simplify_entry(map, prev_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); } /* * NOTE: if conditionals fail, object can be NULL here. 
This occurs * in things like the buffer map where we manage kva but do not manage * backing objects. */ /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->avail_ssize = 0; new_entry->inheritance = VM_INHERIT_DEFAULT; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; #if 0 /* * Temporarily removed to avoid MAP_STACK panic, due to * MAP_STACK being a huge hack. Will be added back in * when MAP_STACK (and the user stack mapping) is fixed. */ /* * It may be possible to simplify the entry */ vm_map_simplify_entry(map, new_entry); #endif if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { vm_map_pmap_enter(map, start, prot, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * vm_map_findspace: * * Find the first fit (lowest VM address) for "length" free bytes * beginning at address >= start in the given map. * * In a vm_map_entry, "adj_free" is the amount of free space * adjacent (higher address) to this entry, and "max_free" is the * maximum amount of contiguous free space in its subtree. This * allows finding a free region in one path down the tree, so * O(log n) amortized with splay trees. * * The map must be locked, and leaves it so. * * Returns: 0 on success, and starting address in *addr, * 1 if insufficient space. */ int vm_map_findspace(vm_map_t map, vm_offset_t start, vm_size_t length, vm_offset_t *addr) /* OUT */ { vm_map_entry_t entry; vm_offset_t end, st; /* * Request must fit within min/max VM address and must avoid * address wrap. */ if (start < map->min_offset) start = map->min_offset; if (start + length > map->max_offset || start + length < start) return (1); /* Empty tree means wide open address space. */ if (map->root == NULL) { *addr = start; goto found; } /* * After splay, if start comes before root node, then there * must be a gap from start to the root. */ map->root = vm_map_entry_splay(start, map->root); if (start + length <= map->root->start) { *addr = start; goto found; } /* * Root is the last node that might begin its gap before * start, and this is the last comparison where address * wrap might be a problem. */ st = (start > map->root->end) ? start : map->root->end; if (length <= map->root->end + map->root->adj_free - st) { *addr = st; goto found; } /* With max_free, can immediately tell if no solution. */ entry = map->root->right; if (entry == NULL || length > entry->max_free) return (1); /* * Search the right subtree in the order: left subtree, root, * right subtree (first fit). The previous splay implies that * all regions in the right subtree have addresses > start. */ while (entry != NULL) { if (entry->left != NULL && entry->left->max_free >= length) entry = entry->left; else if (entry->adj_free >= length) { *addr = entry->end; goto found; } else entry = entry->right; } /* Can't get here, so panic if we do. */ panic("vm_map_findspace: max_free corrupt"); found: /* Expand the kernel pmap, if necessary. */ if (map == kernel_map) { end = round_page(*addr + length); if (end > kernel_vm_end) pmap_growkernel(end); } return (0); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. 
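vm_map_findspace() above uses max_free to find the lowest free region of the requested length in a single pass: descend left whenever the left subtree can satisfy the request, take the current entry's gap if it fits, otherwise go right. The sketch below reproduces that descent over an illustrative node type; it omits the initial splay and the start-address and wrap handling.

struct fs_node {
	struct fs_node	*left;
	struct fs_node	*right;
	unsigned long	end;		/* end address of this entry */
	unsigned long	adj_free;	/* gap immediately after it */
	unsigned long	max_free;	/* largest gap in this subtree */
};

/*
 * Return the start of the lowest gap of at least "length" bytes in the
 * subtree, or 0 if max_free already says no such gap exists.
 */
static unsigned long
fs_find_gap(struct fs_node *n, unsigned long length)
{

	if (n == NULL || length > n->max_free)
		return (0);
	while (n != NULL) {
		if (n->left != NULL && n->left->max_free >= length)
			n = n->left;		/* a lower gap exists */
		else if (n->adj_free >= length)
			return (n->end);	/* first fit: use this gap */
		else
			n = n->right;
	}
	return (0);	/* unreachable if max_free is consistent */
}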
The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, boolean_t find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start; int result; start = *addr; vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); return (KERN_NO_SPACE); } start = *addr; } result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); vm_map_unlock(map); return (result); } /* * vm_map_simplify_entry: * * Simplify the given map entry by merging with either neighbor. This * routine also has the ability to merge with both neighbors. * * The map must be locked. * * This routine guarentees that the passed entry remains valid (though * possibly extended). When merging, this routine may delete one or * both neighbors. */ void vm_map_simplify_entry(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t next, prev; vm_size_t prevsize, esize; if (entry->eflags & (MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_IS_SUB_MAP)) return; prev = entry->prev; if (prev != &map->header) { prevsize = prev->end - prev->start; if ( (prev->end == entry->start) && (prev->object.vm_object == entry->object.vm_object) && (!prev->object.vm_object || (prev->offset + prevsize == entry->offset)) && (prev->eflags == entry->eflags) && (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && (prev->wired_count == entry->wired_count)) { vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (entry->prev != &map->header) vm_map_entry_resize_free(map, entry->prev); if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { esize = entry->end - entry->start; if ((entry->end == next->start) && (next->object.vm_object == entry->object.vm_object) && (!entry->object.vm_object || (entry->offset + esize == next->offset)) && (next->eflags == entry->eflags) && (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && (next->wired_count == entry->wired_count)) { vm_map_entry_unlink(map, next); entry->end = next->end; vm_map_entry_resize_free(map, entry); if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); vm_map_entry_dispose(map, next); } } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(vm_map_t map, vm_map_entry_t entry, vm_offset_t start) { vm_map_entry_t new_entry; /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. 
If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if ((endaddr) < (entry->end)) \ _vm_map_clip_end((map), (entry), (endaddr)); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(vm_map_t map, vm_map_entry_t entry, vm_offset_t end) { vm_map_entry_t new_entry; /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] 
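The clip routines above split a map entry at an address so that later operations see exact boundaries; the surviving entry keeps the high half and has its object offset advanced by the size of the portion split off. A sketch of _vm_map_clip_start()'s arithmetic over an illustrative entry type is below (object allocation, linking and referencing are omitted).

struct clip_entry {
	unsigned long	start;
	unsigned long	end;
	unsigned long	offset;		/* offset into the backing object */
};

/* Split *e at "split" (start < split < end); *lo receives the low half. */
static void
clip_start(struct clip_entry *e, unsigned long split, struct clip_entry *lo)
{

	*lo = *e;
	lo->end = split;
	e->offset += split - e->start;
	e->start = split;
}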
*/ int vm_map_submap( vm_map_t map, vm_offset_t start, vm_offset_t end, vm_map_t submap) { vm_map_entry_t entry; int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * The maximum number of pages to map */ #define MAX_INIT_PT 96 /* * vm_map_pmap_enter: * * Preload read-only mappings for the given object into the specified * map. This eliminates the soft faults on process startup and * immediately after an mmap(2). */ void vm_map_pmap_enter(vm_map_t map, vm_offset_t addr, vm_prot_t prot, vm_object_t object, vm_pindex_t pindex, vm_size_t size, int flags) { vm_offset_t start; vm_page_t p, p_start; vm_pindex_t psize, tmpidx; boolean_t are_queues_locked; if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == 0 || object == NULL) return; VM_OBJECT_LOCK(object); if (object->type == OBJT_DEVICE) { pmap_object_init_pt(map->pmap, addr, object, pindex, size); goto unlock_return; } psize = atop(size); if (object->type != OBJT_VNODE || ((flags & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && (object->resident_page_count > MAX_INIT_PT))) { goto unlock_return; } if (psize + pindex > object->size) { if (object->size < pindex) goto unlock_return; psize = object->size - pindex; } are_queues_locked = FALSE; start = 0; p_start = NULL; if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < pindex) { p = vm_page_splay(pindex, object->root); if ((object->root = p)->pindex < pindex) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (tmpidx = p->pindex - pindex) < psize; p = TAILQ_NEXT(p, listq)) { /* * don't allow an madvise to blow away our really * free pages allocating pv entries. */ if ((flags & MAP_PREFAULT_MADVISE) && - cnt.v_free_count < cnt.v_free_reserved) { + VMCNT_GET(free_count) < VMCNT_GET(free_reserved)) { psize = tmpidx; break; } if ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL && (p->busy == 0)) { if (p_start == NULL) { start = addr + ptoa(tmpidx); p_start = p; } if (!are_queues_locked) { are_queues_locked = TRUE; vm_page_lock_queues(); } if (VM_PAGE_INQUEUE1(p, PQ_CACHE)) vm_page_deactivate(p); } else if (p_start != NULL) { pmap_enter_object(map->pmap, start, addr + ptoa(tmpidx), p_start, prot); p_start = NULL; } } if (p_start != NULL) pmap_enter_object(map->pmap, start, addr + ptoa(psize), p_start, prot); if (are_queues_locked) vm_page_unlock_queues(); unlock_return: VM_OBJECT_UNLOCK(object); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. 
*/ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } vm_map_simplify_entry(map, current); current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a processes map handling the madvise * system call. Advisories are classified as either those effecting * the vm_map_entry structure, or those effecting the underlying * objects. */ int vm_map_madvise( vm_map_t map, vm_offset_t start, vm_offset_t end, int behav) { vm_map_entry_t current, entry; int modify_map = 0; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: modify_map = 1; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: vm_map_lock_read(map); break; default: return (KERN_INVALID_ARGUMENT); } /* * Locate starting entry and clip if necessary. */ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pindex; int count; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. 
*/ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { vm_offset_t useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; pindex = OFF_TO_IDX(current->offset); count = atop(current->end - current->start); useStart = current->start; if (current->start < start) { pindex += atop(start - current->start); count -= atop(start - current->start); useStart = start; } if (current->end > end) count -= atop(current->end - end); if (count <= 0) continue; vm_object_madvise(current->object.vm_object, pindex, count, behav); if (behav == MADV_WILLNEED) { vm_map_pmap_enter(map, useStart, current->protection, current->object.vm_object, pindex, (count << PAGE_SHIFT), MAP_PREFAULT_MADVISE ); } } vm_map_unlock_read(map); } return (0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. */ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_unwire: * * Implements both kernel and user unwiring. */ int vm_map_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_start; unsigned int last_timestamp; int rv; boolean_t need_wakeup, result, user_unwire; user_unwire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, user_unwire)) { /* * Allow interruption of user unwiring? */ } vm_map_lock(map); if (last_timestamp+1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * First_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) 
*/ entry->eflags |= MAP_ENTRY_IN_TRANSITION; /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } /* * If system unwiring, require that the entry is system wired. */ if (!user_unwire && vm_map_entry_system_wired_count(entry) == 0) { end = entry->end; rv = KERN_INVALID_ARGUMENT; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_unwire: lookup failed")); } entry = first_entry; while (entry != &map->header && entry->start < end) { if (rv == KERN_SUCCESS && (!user_unwire || (entry->eflags & MAP_ENTRY_USER_WIRED))) { if (user_unwire) entry->eflags &= ~MAP_ENTRY_USER_WIRED; entry->wired_count--; if (entry->wired_count == 0) { /* * Retain the map lock. */ vm_fault_unwire(map, entry->start, entry->end, entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE); } } KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("vm_map_unwire: in-transition flag missing")); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_wire: * * Implements both kernel and user wiring. */ int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) { vm_map_entry_t entry, first_entry, tmp_entry; vm_offset_t saved_end, saved_start; unsigned int last_timestamp; int rv; boolean_t fictitious, need_wakeup, result, user_wire; user_wire = (flags & VM_MAP_WIRE_USER) ? TRUE : FALSE; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &first_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) first_entry = first_entry->next; else { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } } last_timestamp = map->timestamp; entry = first_entry; while (entry != &map->header && entry->start < end) { if (entry->eflags & MAP_ENTRY_IN_TRANSITION) { /* * We have not yet clipped the entry. */ saved_start = (start >= entry->start) ? start : entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; if (vm_map_unlock_and_wait(map, user_wire)) { /* * Allow interruption of user wiring? */ } vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) { if (flags & VM_MAP_WIRE_HOLESOK) tmp_entry = tmp_entry->next; else { if (saved_start == start) { /* * first_entry has been deleted. */ vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } end = saved_start; rv = KERN_INVALID_ADDRESS; goto done; } } if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; } last_timestamp = map->timestamp; continue; } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); /* * Mark the entry in case the map lock is released. (See * above.) 
*/ entry->eflags |= MAP_ENTRY_IN_TRANSITION; /* * */ if (entry->wired_count == 0) { entry->wired_count++; saved_start = entry->start; saved_end = entry->end; fictitious = entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE; /* * Release the map lock, relying on the in-transition * mark. */ vm_map_unlock(map); rv = vm_fault_wire(map, saved_start, saved_end, user_wire, fictitious); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. The entry * may have been clipped, but NOT merged or * deleted. */ result = vm_map_lookup_entry(map, saved_start, &tmp_entry); KASSERT(result, ("vm_map_wire: lookup failed")); if (entry == first_entry) first_entry = tmp_entry; else first_entry = NULL; entry = tmp_entry; while (entry->end < saved_end) { if (rv != KERN_SUCCESS) { KASSERT(entry->wired_count == 1, ("vm_map_wire: bad count")); entry->wired_count = -1; } entry = entry->next; } } last_timestamp = map->timestamp; if (rv != KERN_SUCCESS) { KASSERT(entry->wired_count == 1, ("vm_map_wire: bad count")); /* * Assign an out-of-range value to represent * the failure to wire this entry. */ entry->wired_count = -1; end = entry->end; goto done; } } else if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) { entry->wired_count++; } /* * Check the map for holes in the specified region. * If VM_MAP_WIRE_HOLESOK was specified, skip this check. */ if (((flags & VM_MAP_WIRE_HOLESOK) == 0) && (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { end = entry->end; rv = KERN_INVALID_ADDRESS; goto done; } entry = entry->next; } rv = KERN_SUCCESS; done: need_wakeup = FALSE; if (first_entry == NULL) { result = vm_map_lookup_entry(map, start, &first_entry); if (!result && (flags & VM_MAP_WIRE_HOLESOK)) first_entry = first_entry->next; else KASSERT(result, ("vm_map_wire: lookup failed")); } entry = first_entry; while (entry != &map->header && entry->start < end) { if (rv == KERN_SUCCESS) { if (user_wire) entry->eflags |= MAP_ENTRY_USER_WIRED; } else if (entry->wired_count == -1) { /* * Wiring failed on this entry. Thus, unwiring is * unnecessary. */ entry->wired_count = 0; } else { if (!user_wire || (entry->eflags & MAP_ENTRY_USER_WIRED) == 0) entry->wired_count--; if (entry->wired_count == 0) { /* * Retain the map lock. */ vm_fault_unwire(map, entry->start, entry->end, entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE); } } KASSERT(entry->eflags & MAP_ENTRY_IN_TRANSITION, ("vm_map_wire: in-transition flag missing")); entry->eflags &= ~MAP_ENTRY_IN_TRANSITION; if (entry->eflags & MAP_ENTRY_NEEDS_WAKEUP) { entry->eflags &= ~MAP_ENTRY_NEEDS_WAKEUP; need_wakeup = TRUE; } vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); if (need_wakeup) vm_map_wakeup(map); return (rv); } /* * vm_map_sync * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * If the size of the region from start to end is zero, we are * supposed to flush all modified pages within the region containing * start. Unfortunately, a region can be split or coalesced with * neighboring regions, making it difficult to determine what the * original region was. Therefore, we approximate this requirement by * flushing the current region containing start. 
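 *
 * For illustration: since a zero-sized range means "the region
 * containing start", a call such as
 *
 *	(void)vm_map_sync(map, addr, addr, TRUE, FALSE);
 *
 * synchronously writes back every dirty page of the region containing
 * addr without invalidating any cached pages.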
* * Returns an error if any part of the specified range is not mapped. */ int vm_map_sync( vm_map_t map, vm_offset_t start, vm_offset_t end, boolean_t syncio, boolean_t invalidate) { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } else if (start == end) { start = entry->start; end = entry->end; } /* * Make a first pass to check for user-wired memory and holes. */ for (current = entry; current->start < end; current = current->next) { if (invalidate && (current->eflags & MAP_ENTRY_USER_WIRED)) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(map->pmap, start, end); /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } vm_object_sync(object, offset, size, syncio, invalidate); start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(vm_map_t map, vm_map_entry_t entry) { vm_fault_unwire(map, entry->start, entry->end, entry->object.vm_object != NULL && entry->object.vm_object->type == OBJT_DEVICE); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(vm_map_t map, vm_map_entry_t entry) { vm_object_t object; vm_pindex_t offidxstart, offidxend, count; vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 && (object = entry->object.vm_object) != NULL) { count = OFF_TO_IDX(entry->end - entry->start); offidxstart = OFF_TO_IDX(entry->offset); offidxend = offidxstart + count; VM_OBJECT_LOCK(object); if (object->ref_count != 1 && ((object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING || object == kernel_object || object == kmem_object)) { vm_object_collapse(object); vm_object_page_remove(object, offidxstart, offidxend, FALSE); if (object->type == OBJT_SWAP) swap_pager_freespace(object, offidxstart, count); if (offidxend >= object->size && offidxstart < object->size) object->size = offidxstart; } VM_OBJECT_UNLOCK(object); vm_object_deallocate(object); } vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. 
*/ int vm_map_delete(vm_map_t map, vm_offset_t start, vm_offset_t end) { vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); } /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; /* * Wait for wiring or unwiring of an entry to complete. * Also wait for any system wirings to disappear on * user maps. */ if ((entry->eflags & MAP_ENTRY_IN_TRANSITION) != 0 || (vm_map_pmap(map) != kernel_pmap && vm_map_entry_system_wired_count(entry) != 0)) { unsigned int last_timestamp; vm_offset_t saved_start; vm_map_entry_t tmp_entry; saved_start = entry->start; entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; last_timestamp = map->timestamp; (void) vm_map_unlock_and_wait(map, FALSE); vm_map_lock(map); if (last_timestamp + 1 != map->timestamp) { /* * Look again for the entry because the map was * modified while it was unlocked. * Specifically, the entry may have been * clipped, merged, or deleted. */ if (!vm_map_lookup_entry(map, saved_start, &tmp_entry)) entry = tmp_entry->next; else { entry = tmp_entry; vm_map_clip_start(map, entry, saved_start); } } continue; } vm_map_clip_end(map, entry, end); next = entry->next; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) { vm_map_entry_unwire(map, entry); } pmap_remove(map->pmap, entry->start, entry->end); /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) { int result; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified privilege on the * entire address region given. The entire region must be allocated. * * WARNING! This code does not and should not check whether the * contents of the region is accessible. For example a smaller file * might be mapped into a larger address space. * * NOTE! This code is also called by munmap(). * * The map must be locked. A read lock is sufficient. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) return (FALSE); entry = tmp_entry; while (start < end) { if (entry == &map->header) return (FALSE); /* * No holes allowed! */ if (start < entry->start) return (FALSE); /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) return (FALSE); /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. 
*/ static void vm_map_copy_entry( vm_map_t src_map, vm_map_t dst_map, vm_map_entry_t src_entry, vm_map_entry_t dst_entry) { vm_object_t src_object; if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. */ if ((src_object = src_entry->object.vm_object) != NULL) { VM_OBJECT_LOCK(src_object); if ((src_object->handle == NULL) && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_object_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference_locked(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(src_object); dst_entry->object.vm_object = src_object; src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->offset = src_entry->offset; } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. * Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_map_entry_forked: * Update the newly-forked vmspace each time a map entry is inherited * or copied. The values for vm_dsize and vm_tsize are approximate * (and mostly-obsolete ideas in the face of mmap(2) et al.) */ static void vmspace_map_entry_forked(const struct vmspace *vm1, struct vmspace *vm2, vm_map_entry_t entry) { vm_size_t entrysize; vm_offset_t newend; entrysize = entry->end - entry->start; vm2->vm_map.size += entrysize; if (entry->eflags & (MAP_ENTRY_GROWS_DOWN | MAP_ENTRY_GROWS_UP)) { vm2->vm_ssize += btoc(entrysize); } else if (entry->start >= (vm_offset_t)vm1->vm_daddr && entry->start < (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_daddr + ctob(vm1->vm_dsize)); vm2->vm_dsize += btoc(newend - entry->start); } else if (entry->start >= (vm_offset_t)vm1->vm_taddr && entry->start < (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)) { newend = MIN(entry->end, (vm_offset_t)vm1->vm_taddr + ctob(vm1->vm_tsize)); vm2->vm_tsize += btoc(newend - entry->start); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * XXX It might be worth coalescing the entries added to the new vmspace. * * The source map must not be locked. 
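 *
 * For illustration: the per-entry inheritance values consulted below
 * are normally set with vm_map_inherit() (see above), typically on
 * behalf of minherit(2):
 *
 *	VM_INHERIT_SHARE - parent and child reference the same object
 *	VM_INHERIT_COPY  - the child receives a copy-on-write copy
 *	VM_INHERIT_NONE  - the range is omitted from the child's map
 *
 * so, as a rough sketch,
 *
 *	(void)vm_map_inherit(&vm1->vm_map, start, end, VM_INHERIT_SHARE);
 *	vm2 = vmspace_fork(vm1);
 *
 * after which [start, end) is backed by the same object in both vmspaces.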
*/ struct vmspace * vmspace_fork(struct vmspace *vm1) { struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; vm_object_t object; vm_map_lock(old_map); vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); vm2->vm_taddr = vm1->vm_taddr; vm2->vm_daddr = vm1->vm_daddr; vm2->vm_maxsaddr = vm1->vm_maxsaddr; new_map = &vm2->vm_map; /* XXX */ new_map->timestamp = 1; old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = 0; } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, atop(old_entry->end - old_entry->start)); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; /* Transfer the second reference too. */ vm_object_reference( old_entry->object.vm_object); vm_object_deallocate(object); object = old_entry->object.vm_object; } VM_OBJECT_LOCK(object); vm_object_clear_flag(object, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(object); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vmspace_map_entry_forked(vm1, vm2, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry); break; } old_entry = old_entry->next; } vm_map_unlock(old_map); return (vm2); } int vm_map_stack(vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry, prev_entry; vm_offset_t bot, top; vm_size_t init_ssize; int orient, rv; rlim_t vmemlim; /* * The stack orientation is piggybacked with the cow argument. * Extract it into orient and mask the cow argument so that we * don't pass it around further. * NOTE: We explicitly allow bi-directional stacks. */ orient = cow & (MAP_STACK_GROWS_DOWN|MAP_STACK_GROWS_UP); cow &= ~orient; KASSERT(orient != 0, ("No stack grow direction")); if (addrbos < vm_map_min(map) || addrbos > map->max_offset) return (KERN_NO_SPACE); init_ssize = (max_ssize < sgrowsiz) ? 
max_ssize : sgrowsiz; PROC_LOCK(curthread->td_proc); vmemlim = lim_cur(curthread->td_proc, RLIMIT_VMEM); PROC_UNLOCK(curthread->td_proc); vm_map_lock(map); /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* If we would blow our VMEM resource limit, no go */ if (map->size + init_ssize > vmemlim) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* * If we can't accomodate max_ssize in the current mapping, no go. * However, we need to be aware that subsequent user mappings might * map into the space we have reserved for stack, and currently this * space is not protected. * * Hopefully we will at least detect this condition when we try to * grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* * We initially map a stack of only init_ssize. We will grow as * needed later. Depending on the orientation of the stack (i.e. * the grow direction) we either map at the top of the range, the * bottom of the range or in the middle. * * Note: we would normally expect prot and max to be VM_PROT_ALL, * and cow to be 0. Possibly we should eliminate these as input * parameters, and just pass these values here in the insert call. */ if (orient == MAP_STACK_GROWS_DOWN) bot = addrbos + max_ssize - init_ssize; else if (orient == MAP_STACK_GROWS_UP) bot = addrbos; else bot = round_page(addrbos + max_ssize/2 - init_ssize/2); top = bot + init_ssize; rv = vm_map_insert(map, NULL, 0, bot, top, prot, max, cow); /* Now set the avail_ssize amount. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, bot); new_entry = prev_entry->next; if (new_entry->end != top || new_entry->start != bot) panic("Bad entry start/end for new stack entry"); new_entry->avail_ssize = max_ssize - init_ssize; if (orient & MAP_STACK_GROWS_DOWN) new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; if (orient & MAP_STACK_GROWS_UP) new_entry->eflags |= MAP_ENTRY_GROWS_UP; } vm_map_unlock(map); return (rv); } /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the * desired address is already mapped, or if we successfully grow * the stack. Also returns KERN_SUCCESS if addr is outside the * stack range (this is strange, but preserves compatibility with * the grow function in vm_machdep.c). */ int vm_map_growstack(struct proc *p, vm_offset_t addr) { vm_map_entry_t next_entry, prev_entry; vm_map_entry_t new_entry, stack_entry; struct vmspace *vm = p->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t end; size_t grow_amount, max_grow; rlim_t stacklim, vmemlim; int is_procstack, rv; Retry: PROC_LOCK(p); stacklim = lim_cur(p, RLIMIT_STACK); vmemlim = lim_cur(p, RLIMIT_VMEM); PROC_UNLOCK(p); vm_map_lock_read(map); /* If addr is already in the entry range, no need to grow.*/ if (vm_map_lookup_entry(map, addr, &prev_entry)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } next_entry = prev_entry->next; if (!(prev_entry->eflags & MAP_ENTRY_GROWS_UP)) { /* * This entry does not grow upwards. Since the address lies * beyond this entry, the next entry (if one exists) has to * be a downward growable entry. The entry list header is * never a growable entry, so it suffices to check the flags. */ if (!(next_entry->eflags & MAP_ENTRY_GROWS_DOWN)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } stack_entry = next_entry; } else { /* * This entry grows upward. 
If the next entry does not at * least grow downwards, this is the entry we need to grow. * otherwise we have two possible choices and we have to * select one. */ if (next_entry->eflags & MAP_ENTRY_GROWS_DOWN) { /* * We have two choices; grow the entry closest to * the address to minimize the amount of growth. */ if (addr - prev_entry->end <= next_entry->start - addr) stack_entry = prev_entry; else stack_entry = next_entry; } else stack_entry = prev_entry; } if (stack_entry == next_entry) { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_DOWN, ("foo")); KASSERT(addr < stack_entry->start, ("foo")); end = (prev_entry != &map->header) ? prev_entry->end : stack_entry->start - stack_entry->avail_ssize; grow_amount = roundup(stack_entry->start - addr, PAGE_SIZE); max_grow = stack_entry->start - end; } else { KASSERT(stack_entry->eflags & MAP_ENTRY_GROWS_UP, ("foo")); KASSERT(addr >= stack_entry->end, ("foo")); end = (next_entry != &map->header) ? next_entry->start : stack_entry->end + stack_entry->avail_ssize; grow_amount = roundup(addr + 1 - stack_entry->end, PAGE_SIZE); max_grow = end - stack_entry->end; } if (grow_amount > stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* * If there is no longer enough space between the entries nogo, and * adjust the available space. Note: this should only happen if the * user has mapped into the stack area after the stack was created, * and is probably an error. * * This also effectively destroys any guard page the user might have * intended by limiting the stack size. */ if (grow_amount > max_grow) { if (vm_map_lock_upgrade(map)) goto Retry; stack_entry->avail_ssize = max_grow; vm_map_unlock(map); return (KERN_NO_SPACE); } is_procstack = (addr >= (vm_offset_t)vm->vm_maxsaddr) ? 1 : 0; /* * If this is the main process stack, see if we're over the stack * limit. */ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* Round up the grow amount modulo SGROWSIZ */ grow_amount = roundup (grow_amount, sgrowsiz); if (grow_amount > stack_entry->avail_ssize) grow_amount = stack_entry->avail_ssize; if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > stacklim)) { grow_amount = stacklim - ctob(vm->vm_ssize); } /* If we would blow our VMEM resource limit, no go */ if (map->size + grow_amount > vmemlim) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } if (vm_map_lock_upgrade(map)) goto Retry; if (stack_entry == next_entry) { /* * Growing downward. */ /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; /* * If this puts us into the previous entry, cut back our * growth to the available space. Also, see the note above. */ if (addr < end) { stack_entry->avail_ssize = max_grow; addr = end; } rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, p->p_sysent->sv_stackprot, VM_PROT_ALL, 0); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, addr); new_entry = prev_entry->next; KASSERT(new_entry == stack_entry->prev, ("foo")); KASSERT(new_entry->end == stack_entry->start, ("foo")); KASSERT(new_entry->start == addr, ("foo")); grow_amount = new_entry->end - new_entry->start; new_entry->avail_ssize = stack_entry->avail_ssize - grow_amount; stack_entry->eflags &= ~MAP_ENTRY_GROWS_DOWN; new_entry->eflags |= MAP_ENTRY_GROWS_DOWN; } } else { /* * Growing upward. 
*/ addr = stack_entry->end + grow_amount; /* * If this puts us into the next entry, cut back our growth * to the available space. Also, see the note above. */ if (addr > end) { stack_entry->avail_ssize = end - stack_entry->end; addr = end; } grow_amount = addr - stack_entry->end; /* Grow the underlying object if applicable. */ if (stack_entry->object.vm_object == NULL || vm_object_coalesce(stack_entry->object.vm_object, stack_entry->offset, (vm_size_t)(stack_entry->end - stack_entry->start), (vm_size_t)grow_amount)) { map->size += (addr - stack_entry->end); /* Update the current entry. */ stack_entry->end = addr; stack_entry->avail_ssize -= grow_amount; vm_map_entry_resize_free(map, stack_entry); rv = KERN_SUCCESS; if (next_entry != &map->header) vm_map_clip_start(map, next_entry, addr); } else rv = KERN_FAILURE; } if (rv == KERN_SUCCESS && is_procstack) vm->vm_ssize += btoc(grow_amount); vm_map_unlock(map); /* * Heed the MAP_WIREFUTURE flag if it was set for this process. */ if (rv == KERN_SUCCESS && (map->flags & MAP_WIREFUTURE)) { vm_map_wire(map, (stack_entry == next_entry) ? addr : addr - grow_amount, (stack_entry == next_entry) ? stack_entry->start : addr, (p->p_flag & P_SYSTEM) ? VM_MAP_WIRE_SYSTEM|VM_MAP_WIRE_NOHOLES : VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); } return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ void vmspace_exec(struct proc *p, vm_offset_t minuser, vm_offset_t maxuser) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; newvmspace = vmspace_alloc(minuser, maxuser); newvmspace->vm_swrss = oldvmspace->vm_swrss; /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other process's that are still using the vmspace to finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) /* XXXKSE ? */ pmap_activate(curthread); vmspace_free(oldvmspace); } /* * Unshare the specified VM space for forcing COW. This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ void vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; if (oldvmspace->vm_refcnt == 1) return; newvmspace = vmspace_fork(oldvmspace); PROC_VMSPACE_LOCK(p); p->p_vmspace = newvmspace; PROC_VMSPACE_UNLOCK(p); if (p == curthread->td_proc) /* XXXKSE ? */ pmap_activate(curthread); vmspace_free(oldvmspace); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. 
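 *
 * For illustration, a caller (vm_fault(), for instance) typically
 * follows this pattern:
 *
 *	result = vm_map_lookup(&map, vaddr, fault_type, &entry,
 *	    &object, &pindex, &prot, &wired);
 *	if (result != KERN_SUCCESS)
 *		return (result);
 *	... use object and pindex while the map remains read-locked ...
 *	vm_map_lookup_done(map, entry);
 *
 * The map is passed by reference because the lookup may descend into a
 * submap and hand back that map instead.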
*/ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; RetryLookup:; /* * Lookup the faulting address. */ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return (why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->root; *out_entry = entry; if (entry == NULL || (vaddr < entry->start) || (vaddr >= entry->end)) { /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) RETURN(KERN_INVALID_ADDRESS); entry = *out_entry; } /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. * Note the special case for MAP_ENTRY_COW * pages with an override. This is to implement a forced * COW for debuggers. */ if (fault_type & VM_PROT_OVERRIDE_WRITE) prot = entry->max_protection; else prot = entry->protection; fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type) { RETURN(KERN_PROTECTION_FAILURE); } if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { RETURN(KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. */ if (vm_map_lock_upgrade(map)) goto RetryLookup; vm_object_shadow( &entry->object.vm_object, &entry->offset, atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL && !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = 0; vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_locked: * * Lookup the faulting address. A version of vm_map_lookup that returns * KERN_FAILURE instead of blocking on map lock or memory allocation. 
*/ int vm_map_lookup_locked(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->root; *out_entry = entry; if (entry == NULL || (vaddr < entry->start) || (vaddr >= entry->end)) { /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, out_entry)) return (KERN_INVALID_ADDRESS); entry = *out_entry; } /* * Fail if the entry refers to a submap. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return (KERN_FAILURE); /* * Check whether this task is allowed to have this page. * Note the special case for MAP_ENTRY_COW * pages with an override. This is to implement a forced * COW for debuggers. */ if (fault_type & VM_PROT_OVERRIDE_WRITE) prot = entry->max_protection; else prot = entry->protection; fault_type &= VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE; if ((fault_type & prot) != fault_type) return (KERN_PROTECTION_FAILURE); if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) return (KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * Fail if the entry was copy-on-write for a write fault. */ if (fault_type & VM_PROT_WRITE) return (KERN_FAILURE); /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } /* * Fail if an object should be created. */ if (entry->object.vm_object == NULL && !map->system_map) return (KERN_FAILURE); /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; *out_prot = prot; return (KERN_SUCCESS); } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(vm_map_t map, vm_map_entry_t entry) { /* * Unlock the main-level map */ vm_map_unlock_read(map); } #include "opt_ddb.h" #ifdef DDB #include #include /* * vm_map_print: [ debug ] */ DB_SHOW_COMMAND(map, vm_map_print) { static int nlines; /* XXX convert args. 
*/ vm_map_t map = (vm_map_t)addr; boolean_t full = have_addr; vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); nlines++; if (!full && db_indent) return; db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p\n", (void *)entry, (void *)entry->start, (void *)entry->end); nlines++; { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { db_printf(", share=%p, offset=0x%jx\n", (void *)entry->object.sub_map, (uintmax_t)entry->offset); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((db_expr_t)(intptr_t) entry->object.sub_map, full, 0, (char *)0); db_indent -= 2; } } else { db_printf(", object=%p, offset=0x%jx", (void *)entry->object.vm_object, (uintmax_t)entry->offset); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? "needed" : "done"); db_printf("\n"); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, full, 0, (char *)0); nlines += 4; db_indent -= 2; } } } db_indent -= 2; if (db_indent == 0) nlines = 0; } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = (struct proc *) addr; } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL); } #endif /* DDB */ Index: head/sys/vm/vm_meter.c =================================================================== --- head/sys/vm/vm_meter.c (revision 169666) +++ head/sys/vm/vm_meter.c (revision 169667) @@ -1,390 +1,392 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vm_meter.c 8.4 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -struct vmmeter cnt; +volatile struct vmmeter cnt; int maxslp = MAXSLP; SYSCTL_UINT(_vm, VM_V_FREE_MIN, v_free_min, - CTLFLAG_RW, &cnt.v_free_min, 0, ""); + CTLFLAG_RW, VMCNT_PTR(free_min), 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_TARGET, v_free_target, - CTLFLAG_RW, &cnt.v_free_target, 0, ""); + CTLFLAG_RW, VMCNT_PTR(free_target), 0, ""); SYSCTL_UINT(_vm, VM_V_FREE_RESERVED, v_free_reserved, - CTLFLAG_RW, &cnt.v_free_reserved, 0, ""); + CTLFLAG_RW, VMCNT_PTR(free_reserved), 0, ""); SYSCTL_UINT(_vm, VM_V_INACTIVE_TARGET, v_inactive_target, - CTLFLAG_RW, &cnt.v_inactive_target, 0, ""); + CTLFLAG_RW, VMCNT_PTR(inactive_target), 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MIN, v_cache_min, - CTLFLAG_RW, &cnt.v_cache_min, 0, ""); + CTLFLAG_RW, VMCNT_PTR(cache_min), 0, ""); SYSCTL_UINT(_vm, VM_V_CACHE_MAX, v_cache_max, - CTLFLAG_RW, &cnt.v_cache_max, 0, ""); + CTLFLAG_RW, VMCNT_PTR(cache_max), 0, ""); SYSCTL_UINT(_vm, VM_V_PAGEOUT_FREE_MIN, v_pageout_free_min, - CTLFLAG_RW, &cnt.v_pageout_free_min, 0, ""); + CTLFLAG_RW, VMCNT_PTR(pageout_free_min), 0, ""); SYSCTL_UINT(_vm, OID_AUTO, v_free_severe, - CTLFLAG_RW, &cnt.v_free_severe, 0, ""); + CTLFLAG_RW, VMCNT_PTR(free_severe), 0, ""); static int sysctl_vm_loadavg(SYSCTL_HANDLER_ARGS) { #ifdef SCTL_MASK32 u_int32_t la[4]; if (req->flags & SCTL_MASK32) { la[0] = averunnable.ldavg[0]; la[1] = averunnable.ldavg[1]; la[2] = averunnable.ldavg[2]; la[3] = averunnable.fscale; return SYSCTL_OUT(req, la, sizeof(la)); } else #endif return SYSCTL_OUT(req, &averunnable, sizeof(averunnable)); } SYSCTL_PROC(_vm, VM_LOADAVG, loadavg, CTLTYPE_STRUCT|CTLFLAG_RD, NULL, 0, sysctl_vm_loadavg, "S,loadavg", "Machine loadaverage history"); static int vmtotal(SYSCTL_HANDLER_ARGS) { /* XXXKSE almost completely broken */ struct proc *p; struct vmtotal total; vm_map_entry_t entry; vm_object_t object; vm_map_t map; int paging; struct thread *td; struct vmspace *vm; bzero(&total, sizeof(total)); /* * Mark all objects as inactive. */ GIANT_REQUIRED; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { if (!VM_OBJECT_TRYLOCK(object)) { /* * Avoid a lock-order reversal. Consequently, * the reported number of active pages may be * greater than the actual number. */ continue; } vm_object_clear_flag(object, OBJ_ACTIVE); VM_OBJECT_UNLOCK(object); } mtx_unlock(&vm_object_list_mtx); /* * Calculate process statistics. 
*/ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_flag & P_SYSTEM) continue; mtx_lock_spin(&sched_lock); switch (p->p_state) { case PRS_NEW: mtx_unlock_spin(&sched_lock); continue; break; default: FOREACH_THREAD_IN_PROC(p, td) { /* Need new statistics XXX */ switch (td->td_state) { case TDS_INHIBITED: if (TD_ON_LOCK(td) || (td->td_inhibitors == TDI_SWAPPED)) { total.t_sw++; } else if (TD_IS_SLEEPING(td) || TD_AWAITING_INTR(td) || TD_IS_SUSPENDED(td)) { if (td->td_priority <= PZERO) total.t_dw++; else total.t_sl++; } break; case TDS_CAN_RUN: total.t_sw++; break; case TDS_RUNQ: case TDS_RUNNING: total.t_rq++; continue; default: break; } } } mtx_unlock_spin(&sched_lock); /* * Note active objects. */ paging = 0; vm = vmspace_acquire_ref(p); if (vm == NULL) continue; map = &vm->vm_map; vm_map_lock_read(map); for (entry = map->header.next; entry != &map->header; entry = entry->next) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) || (object = entry->object.vm_object) == NULL) continue; VM_OBJECT_LOCK(object); vm_object_set_flag(object, OBJ_ACTIVE); paging |= object->paging_in_progress; VM_OBJECT_UNLOCK(object); } vm_map_unlock_read(map); vmspace_free(vm); if (paging) total.t_pw++; } sx_sunlock(&allproc_lock); /* * Calculate object memory usage statistics. */ mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) { /* * Perform unsynchronized reads on the object to avoid * a lock-order reversal. In this case, the lack of * synchronization should not impair the accuracy of * the reported statistics. */ if (object->type == OBJT_DEVICE) { /* * Devices, like /dev/mem, will badly skew our totals. */ continue; } if (object->ref_count == 0) { /* * Also skip unreferenced objects, including * vnodes representing mounted file systems. */ continue; } total.t_vm += object->size; total.t_rm += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { total.t_avm += object->size; total.t_arm += object->resident_page_count; } if (object->shadow_count > 1) { /* shared object */ total.t_vmshr += object->size; total.t_rmshr += object->resident_page_count; if (object->flags & OBJ_ACTIVE) { total.t_avmshr += object->size; total.t_armshr += object->resident_page_count; } } } mtx_unlock(&vm_object_list_mtx); - total.t_free = cnt.v_free_count + cnt.v_cache_count; + total.t_free = VMCNT_GET(free_count) + VMCNT_GET(cache_count); return (sysctl_handle_opaque(oidp, &total, sizeof(total), req)); } /* * vcnt() - accumulate statistics from all cpus and the global cnt * structure. * * The vmmeter structure is now per-cpu as well as global. Those * statistics which can be kept on a per-cpu basis (to avoid cache * stalls between cpus) can be moved to the per-cpu vmmeter. Remaining * statistics, such as v_free_reserved, are left in the global * structure. 
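 *
 * For illustration, for a statistic that also has a per-cpu copy the
 * handler below effectively computes
 *
 *	total = global counter + sum over all cpus of the pc_cnt field
 *
 * so reading, say, vm.stats.sys.v_swtch via sysctl(8) reports the
 * system-wide total rather than the global counter alone.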
* * (sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req) */ static int vcnt(SYSCTL_HANDLER_ARGS) { int count = *(int *)arg1; - int offset = (char *)arg1 - (char *)&cnt; + int offset = (char *)arg1 - (char *)VMCNT; #ifdef SMP int i; for (i = 0; i < mp_ncpus; ++i) { struct pcpu *pcpu = pcpu_find(i); count += *(int *)((char *)&pcpu->pc_cnt + offset); } #else count += *(int *)((char *)PCPU_PTR(cnt) + offset); #endif return (SYSCTL_OUT(req, &count, sizeof(int))); } SYSCTL_PROC(_vm, VM_TOTAL, vmtotal, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, sizeof(struct vmtotal), vmtotal, "S,vmtotal", "System virtual memory statistics"); SYSCTL_NODE(_vm, OID_AUTO, stats, CTLFLAG_RW, 0, "VM meter stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, sys, CTLFLAG_RW, 0, "VM meter sys stats"); static SYSCTL_NODE(_vm_stats, OID_AUTO, vm, CTLFLAG_RW, 0, "VM meter vm stats"); SYSCTL_NODE(_vm_stats, OID_AUTO, misc, CTLFLAG_RW, 0, "VM meter misc stats"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_swtch, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_swtch, 0, vcnt, "IU", "Context switches"); + VMCNT_PTR(swtch), 0, vcnt, "IU", "Context switches"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_trap, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_trap, 0, vcnt, "IU", "Traps"); + VMCNT_PTR(trap), 0, vcnt, "IU", "Traps"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_syscall, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_syscall, 0, vcnt, "IU", "Syscalls"); + VMCNT_PTR(syscall), 0, vcnt, "IU", "Syscalls"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_intr, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_intr, 0, vcnt, "IU", "Hardware interrupts"); + VMCNT_PTR(intr), 0, vcnt, "IU", "Hardware interrupts"); SYSCTL_PROC(_vm_stats_sys, OID_AUTO, v_soft, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_soft, 0, vcnt, "IU", "Software interrupts"); + VMCNT_PTR(soft), 0, vcnt, "IU", "Software interrupts"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vm_faults, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vm_faults, 0, vcnt, "IU", "VM faults"); + VMCNT_PTR(vm_faults), 0, vcnt, "IU", "VM faults"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_faults, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_cow_faults, 0, vcnt, "IU", "COW faults"); + VMCNT_PTR(cow_faults), 0, vcnt, "IU", "COW faults"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cow_optim, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_cow_optim, 0, vcnt, "IU", "Optimized COW faults"); + VMCNT_PTR(cow_optim), 0, vcnt, "IU", "Optimized COW faults"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_zfod, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_zfod, 0, vcnt, "IU", "Zero fill"); + VMCNT_PTR(zfod), 0, vcnt, "IU", "Zero fill"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_ozfod, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_ozfod, 0, vcnt, "IU", "Optimized zero fill"); + VMCNT_PTR(ozfod), 0, vcnt, "IU", "Optimized zero fill"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapin, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_swapin, 0, vcnt, "IU", "Swapin operations"); + VMCNT_PTR(swapin), 0, vcnt, "IU", "Swapin operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swapout, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_swapout, 0, vcnt, "IU", "Swapout operations"); + VMCNT_PTR(swapout), 0, vcnt, "IU", "Swapout operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsin, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_swappgsin, 0, vcnt, "IU", "Swapin pages"); + VMCNT_PTR(swappgsin), 0, vcnt, "IU", "Swapin pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_swappgsout, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_swappgsout, 0, vcnt, "IU", "Swapout pages"); + VMCNT_PTR(swappgsout), 0, vcnt, "IU", "Swapout pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodein, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vnodein, 0, vcnt, "IU", 
"Vnodein operations"); + VMCNT_PTR(vnodein), 0, vcnt, "IU", "Vnodein operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodeout, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vnodeout, 0, vcnt, "IU", "Vnodeout operations"); + VMCNT_PTR(vnodeout), 0, vcnt, "IU", "Vnodeout operations"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsin, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vnodepgsin, 0, vcnt, "IU", "Vnodein pages"); + VMCNT_PTR(vnodepgsin), 0, vcnt, "IU", "Vnodein pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vnodepgsout, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vnodepgsout, 0, vcnt, "IU", "Vnodeout pages"); + VMCNT_PTR(vnodepgsout), 0, vcnt, "IU", "Vnodeout pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_intrans, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_intrans, 0, vcnt, "IU", "In transit page blocking"); + VMCNT_PTR(intrans), 0, vcnt, "IU", "In transit page blocking"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_reactivated, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_reactivated, 0, vcnt, "IU", "Reactivated pages"); + VMCNT_PTR(reactivated), 0, vcnt, "IU", "Reactivated pages"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdwakeups, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_pdwakeups, 0, vcnt, "IU", "Pagedaemon wakeups"); + VMCNT_PTR(pdwakeups), 0, vcnt, "IU", "Pagedaemon wakeups"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pdpages, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_pdpages, 0, vcnt, "IU", "Pagedaemon page scans"); + VMCNT_PTR(pdpages), 0, vcnt, "IU", "Pagedaemon page scans"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_dfree, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_dfree, 0, vcnt, "IU", ""); + VMCNT_PTR(dfree), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pfree, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_pfree, 0, vcnt, "IU", ""); + VMCNT_PTR(pfree), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_tfree, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_tfree, 0, vcnt, "IU", ""); + VMCNT_PTR(tfree), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_size, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_page_size, 0, vcnt, "IU", ""); + VMCNT_PTR(page_size), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_page_count, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_page_count, 0, vcnt, "IU", ""); + VMCNT_PTR(page_count), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_reserved, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_free_reserved, 0, vcnt, "IU", ""); + VMCNT_PTR(free_reserved), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_target, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_free_target, 0, vcnt, "IU", ""); + VMCNT_PTR(free_target), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_min, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_free_min, 0, vcnt, "IU", ""); + VMCNT_PTR(free_min), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_free_count, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_free_count, 0, vcnt, "IU", ""); + VMCNT_PTR(free_count), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_wire_count, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_wire_count, 0, vcnt, "IU", ""); + VMCNT_PTR(wire_count), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_active_count, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_active_count, 0, vcnt, "IU", ""); + VMCNT_PTR(active_count), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_target, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_inactive_target, 0, vcnt, "IU", ""); + VMCNT_PTR(inactive_target), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_inactive_count, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_inactive_count, 0, vcnt, "IU", ""); + VMCNT_PTR(inactive_count), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, 
OID_AUTO, v_cache_count, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_cache_count, 0, vcnt, "IU", ""); + VMCNT_PTR(cache_count), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_min, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_cache_min, 0, vcnt, "IU", ""); + VMCNT_PTR(cache_min), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_cache_max, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_cache_max, 0, vcnt, "IU", ""); + VMCNT_PTR(cache_max), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_pageout_free_min, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_pageout_free_min, 0, vcnt, "IU", ""); -SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_interrupt_free_min, 0, vcnt, "IU", ""); + VMCNT_PTR(pageout_free_min), 0, vcnt, "IU", ""); +SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_interrupt_free_min, CTLTYPE_UINT | + CTLFLAG_RD, VMCNT_PTR(interrupt_free_min), 0, vcnt, "IU", ""); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forks, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_forks, 0, vcnt, "IU", "Number of fork() calls"); + VMCNT_PTR(forks), 0, vcnt, "IU", "Number of fork() calls"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforks, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vforks, 0, vcnt, "IU", "Number of vfork() calls"); + VMCNT_PTR(vforks), 0, vcnt, "IU", "Number of vfork() calls"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforks, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_rforks, 0, vcnt, "IU", "Number of rfork() calls"); + VMCNT_PTR(rforks), 0, vcnt, "IU", "Number of rfork() calls"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreads, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_kthreads, 0, vcnt, "IU", "Number of fork() calls by kernel"); + VMCNT_PTR(kthreads), 0, vcnt, "IU", + "Number of fork() calls by kernel"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_forkpages, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_forkpages, 0, vcnt, "IU", "VM pages affected by fork()"); + VMCNT_PTR(forkpages), 0, vcnt, "IU", "VM pages affected by fork()"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_vforkpages, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_vforkpages, 0, vcnt, "IU", "VM pages affected by vfork()"); + VMCNT_PTR(vforkpages), 0, vcnt, "IU", "VM pages affected by vfork()"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_rforkpages, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_rforkpages, 0, vcnt, "IU", "VM pages affected by rfork()"); + VMCNT_PTR(rforkpages), 0, vcnt, "IU", "VM pages affected by rfork()"); SYSCTL_PROC(_vm_stats_vm, OID_AUTO, v_kthreadpages, CTLTYPE_UINT|CTLFLAG_RD, - &cnt.v_kthreadpages, 0, vcnt, "IU", "VM pages affected by fork() by kernel"); + VMCNT_PTR(kthreadpages), 0, vcnt, "IU", + "VM pages affected by fork() by kernel"); SYSCTL_INT(_vm_stats_misc, OID_AUTO, zero_page_count, CTLFLAG_RD, &vm_page_zero_count, 0, ""); #if 0 SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_mask, CTLFLAG_RD, &page_mask, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, page_shift, CTLFLAG_RD, &page_shift, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, first_page, CTLFLAG_RD, &first_page, 0, ""); SYSCTL_INT(_vm_stats_misc, OID_AUTO, last_page, CTLFLAG_RD, &last_page, 0, ""); #endif Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 169666) +++ head/sys/vm/vm_mmap.c (revision 169667) @@ -1,1427 +1,1427 @@ /*- * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_hwpmc_hooks.h" #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif static int max_proc_mmap; SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); /* * Set the maximum number of vm_map_entry structures per process. Roughly * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 * of our KVM malloc space still results in generous limits. We want a * default that is good enough to prevent the kernel running out of resources * if attacked from compromised user account but generous enough such that * multi-threaded processes are not unduly inconvenienced. 
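[Editor's illustration, not part of this revision: the vm.max_proc_mmap knob declared above is a plain SYSCTL_INT, so it can be inspected from userland with sysctl(3). A minimal sketch:]

#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	int max_proc_mmap;
	size_t len = sizeof(max_proc_mmap);

	/* Read the per-process vm_map_entry limit computed at boot. */
	if (sysctlbyname("vm.max_proc_mmap", &max_proc_mmap, &len, NULL, 0) == -1)
		err(1, "sysctlbyname(vm.max_proc_mmap)");
	printf("per-process vm_map_entry limit: %d\n", max_proc_mmap);
	return (0);
}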
*/ static void vmmapentry_rsrc_init(void *); SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) static void vmmapentry_rsrc_init(dummy) void *dummy; { max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); max_proc_mmap /= 100; } static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct vnode *, vm_ooffset_t, vm_object_t *); static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct cdev *, vm_ooffset_t, vm_object_t *); /* * MPSAFE */ /* ARGSUSED */ int sbrk(td, uap) struct thread *td; struct sbrk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sstk(td, uap) struct thread *td; struct sstk_args *uap; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(td, uap) struct thread *td; struct getpagesize_args *uap; { /* MP SAFE */ td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. * * Block devices can be mmap'd no matter what they represent. Cache coherency * is maintained as long as you do not write directly to the underlying * character device. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif /* * MPSAFE */ int mmap(td, uap) struct thread *td; struct mmap_args *uap; { #ifdef HWPMC_HOOKS struct pmckern_map_in pkm; #endif struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot, maxprot; void *handle; objtype_t handle_type; int flags, error; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; pos = uap->pos; fp = NULL; /* make sure mapping fits into numeric range etc */ if ((ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. 
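[Editor's worked example, assuming the usual 4 KB page size: for a request with pos = 0x12200, pageoff is 0x200, so a MAP_FIXED address of 0x30200 passes the check above (0x30200 - 0x200 = 0x30000 is page aligned; the mapping is placed at 0x30000 and mmap() later returns 0x30200), while a MAP_FIXED address of 0x30000 leaves addr & PAGE_MASK nonzero after the pageoff adjustment and the call fails with EINVAL.]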
*/ if (addr < vm_map_min(&vms->vm_map) || addr + size > vm_map_max(&vms->vm_map)) return (EINVAL); if (addr + size < addr) return (EINVAL); } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ PROC_LOCK(td->td_proc); if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td->td_proc, RLIMIT_DATA)); PROC_UNLOCK(td->td_proc); } if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; handle_type = OBJT_DEFAULT; maxprot = VM_PROT_ALL; pos = 0; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. * don't let the descriptor disappear on us if we block */ if ((error = fget(td, uap->fd, &fp)) != 0) goto done; if (fp->f_type != DTYPE_VNODE) { error = ENODEV; goto done; } /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if (fp->f_flag & FPOSIXSHM) flags |= MAP_NOSYNC; vp = fp->f_vnode; /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC) maxprot = VM_PROT_NONE; else maxprot = VM_PROT_EXECUTE; if (fp->f_flag & FREAD) { maxprot |= VM_PROT_READ; } else if (prot & PROT_READ) { error = EACCES; goto done; } /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character * device mappings), and we are trying to get write * permission although we opened it without asking * for it, bail out. */ if ((flags & MAP_SHARED) != 0) { if ((fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; } else if ((prot & PROT_WRITE) != 0) { error = EACCES; goto done; } } else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) { maxprot |= VM_PROT_WRITE; } handle = (void *)vp; handle_type = OBJT_VNODE; } /* * Do not allow more then a certain number of vm_map_entry structures * per process. Scale with the number of rforks sharing the map * to make the limit reasonable for threads. 
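[Editor's note, illustrative numbers only: vmmapentry_rsrc_init() above sizes max_proc_mmap as vm_kmem_size / sizeof(struct vm_map_entry) / 100, so with roughly 320 MB of kernel malloc KVA and ~100-byte entries the default lands near 33,000. A vmspace with vm_refcnt == 1 then gets ENOMEM from the check below once it accumulates about that many map entries, while a map shared by four rfork(RFMEM) peers (vm_refcnt == 4) is allowed roughly four times as many before the same check fires.]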
*/ if (max_proc_mmap && vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { error = ENOMEM; goto done; } error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, flags, handle_type, handle, pos); #ifdef HWPMC_HOOKS /* inform hwpmc(4) if an executable is being mapped */ if (error == 0 && handle_type == OBJT_VNODE && (prot & PROT_EXEC)) { pkm.pm_file = handle; pkm.pm_address = (uintptr_t) addr; PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm); } #endif if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(td, uap) struct thread *td; struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(td, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; int len; int flags; }; #endif /* * MPSAFE */ int msync(td, uap) struct thread *td; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. */ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_INVALID_ARGUMENT: return (EBUSY); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif /* * MPSAFE */ int munmap(td, uap) struct thread *td; struct munmap_args *uap; { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; #endif vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; if (size == 0) return (EINVAL); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... */ map = &td->td_proc->p_vmspace->vm_map; if (addr < vm_map_min(map) || addr + size > vm_map_max(map)) return (EINVAL); vm_map_lock(map); /* * Make sure entire range is allocated. */ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) { vm_map_unlock(map); return (EINVAL); } #ifdef HWPMC_HOOKS /* * Inform hwpmc if the address range being unmapped contains * an executable region. 
*/ if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry != &map->header && entry->start < addr + size; entry = entry->next) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); break; } } } #endif /* returns nothing but KERN_SUCCESS anyway */ vm_map_delete(map, addr, addr + size); vm_map_unlock(map); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif /* * MPSAFE */ int mprotect(td, uap) struct thread *td; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif /* * MPSAFE */ int minherit(td, uap) struct thread *td; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif /* * MPSAFE */ /* ARGSUSED */ int madvise(td, uap) struct thread *td; struct madvise_args *uap; { vm_offset_t start, end; vm_map_t map; struct proc *p; int error; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (uap->behav == MADV_PROTECT) { error = priv_check(td, PRIV_VM_MADV_PROTECT); if (error == 0) { p = td->td_proc; PROC_LOCK(p); p->p_flag |= P_PROTECTED; PROC_UNLOCK(p); } return (error); } /* * Check for illegal behavior */ if (uap->behav < 0 || uap->behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; if ((vm_offset_t)uap->addr < vm_map_min(map) || (vm_offset_t)uap->addr + uap->len > vm_map_max(map)) return (EINVAL); if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. 
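[Editor's illustration, not part of this revision: the advisory call whose argument checks appear above, as issued from userland. MADV_FREE and MADV_DONTNEED are hints; success does not mean the pages are released immediately. A minimal sketch:]

#include <sys/mman.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 1024 * getpagesize();
	char *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE,
	    -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0xa5, len);			/* fault the pages in */
	if (madvise(p, len, MADV_FREE) == -1)	/* hint: contents no longer needed */
		err(1, "madvise");
	munmap(p, len);
	return (0);
}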
*/ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); if (vm_map_madvise(map, start, end, uap->behav)) return (EINVAL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* * MPSAFE */ /* ARGSUSED */ int mincore(td, uap) struct thread *td; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); /* * Address of byte vector */ vec = uap->vec; pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * check for contiguity */ if (current->end < end && (entry->next == &map->header || current->next->start > current->end)) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ mincoreinfo = pmap_mincore(pmap, addr); if (!mincoreinfo) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); VM_OBJECT_LOCK(current->object.vm_object); m = vm_page_lookup(current->object.vm_object, pindex); /* * if the page is resident, then gather information about * it. */ if (m != NULL && m->valid != 0) { mincoreinfo = MINCORE_INCORE; vm_page_lock_queues(); if (m->dirty || pmap_is_modified(m)) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } vm_page_unlock_queues(); } VM_OBJECT_UNLOCK(current->object.vm_object); } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. 
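[Editor's illustration, not part of this revision: the byte vector this routine fills in, consumed from userland with mincore(2). Only half of the anonymous region is touched, so roughly half of the pages should report MINCORE_INCORE. A minimal sketch:]

#include <sys/mman.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	size_t pagesz = getpagesize(), npages = 64, resident = 0, i;
	char *p, *vec;

	p = mmap(NULL, npages * pagesz, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0, (npages / 2) * pagesz);	/* fault in the first half */
	vec = malloc(npages);
	if (vec == NULL)
		err(1, "malloc");
	if (mincore(p, npages * pagesz, vec) == -1)
		err(1, "mincore");
	for (i = 0; i < npages; i++)
		if (vec[i] & MINCORE_INCORE)
			resident++;
	printf("%zu of %zu pages resident\n", resident, npages);
	return (0);
}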
*/ while ((lastvecindex + 1) < vecindex) { error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while ((lastvecindex + 1) < vecindex) { error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int mlock(td, uap) struct thread *td; struct mlock_args *uap; { struct proc *proc; vm_offset_t addr, end, last, start; vm_size_t npages, size; int error; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); addr = (vm_offset_t)uap->addr; size = uap->len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_wired) return (ENOMEM); proc = td->td_proc; PROC_LOCK(proc); if (ptoa(npages + pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))) > lim_cur(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); - if (npages + cnt.v_wire_count > vm_page_max_wired) + if (npages + VMCNT_GET(wire_count) > vm_page_max_wired) return (EAGAIN); error = vm_map_wire(&proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int mlockall(td, uap) struct thread *td; struct mlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = 0; if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); #if 0 /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ PROC_LOCK(td->td_proc); if (map->size - ptoa(pmap_wired_count(vm_map_pmap(map)) > lim_cur(td->td_proc, RLIMIT_MEMLOCK))) { PROC_UNLOCK(td->td_proc); return (ENOMEM); } PROC_UNLOCK(td->td_proc); #else error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); error = (error == KERN_SUCCESS ? 
0 : EAGAIN); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif /* * MPSAFE */ int munlockall(td, uap) struct thread *td; struct munlockall_args *uap; { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int munlock(td, uap) struct thread *td; struct munlock_args *uap; { vm_offset_t addr, end, last, start; vm_size_t size; int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = (vm_offset_t)uap->addr; size = uap->len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * MPSAFE * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t foff, vm_object_t *objp) { struct vattr va; void *handle; vm_object_t obj; struct mount *mp; int error, flags, type; int vfslocked; mp = vp->v_mount; vfslocked = VFS_LOCK_GIANT(mp); if ((error = vget(vp, LK_EXCLUSIVE, td)) != 0) { VFS_UNLOCK_GIANT(vfslocked); return (error); } flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->handle != vp) { vput(vp); vp = (struct vnode*)obj->handle; vget(vp, LK_EXCLUSIVE, td); } type = OBJT_VNODE; handle = vp; } else if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = vp->v_rdev; /* XXX: lack thredref on device */ if(vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON) { *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; error = 0; goto done; } /* * cdevs does not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & PROT_WRITE) != 0) { error = EACCES; goto done; } if (flags & (MAP_PRIVATE|MAP_COPY)) { error = EINVAL; goto done; } /* * Force device mappings to be shared. */ flags |= MAP_SHARED; } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, td->td_ucred, td))) { goto done; } #ifdef MAC error = mac_check_vnode_mmap(td->td_ucred, vp, prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ if (vp->v_type == VREG) { objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; } obj = vm_pager_allocate(type, handle, objsize, prot, foff); if (obj == NULL) { error = (type == OBJT_DEVICE ? EINVAL : ENOMEM); goto done; } *objp = obj; *flagsp = flags; vfs_mark_atime(vp, td); done: vput(vp); VFS_UNLOCK_GIANT(vfslocked); return (error); } /* * vm_mmap_cdev() * * MPSAFE * * Helper function for vm_mmap. 
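[Editor's illustration, not part of this revision: mapping a character device whose driver sets D_MMAP_ANON takes the shortcut shown in these helpers and simply hands back anonymous memory; /dev/zero is assumed here as the classic example of such a device. A minimal sketch:]

#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 16 * getpagesize();
	char *p;
	int fd;

	fd = open("/dev/zero", O_RDWR);
	if (fd == -1)
		err(1, "open");
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	p[0] = 1;		/* behaves like ordinary MAP_ANON memory */
	munmap(p, len);
	close(fd);
	return (0);
}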
Perform sanity check specific for mmap * operations on cdevs. */ int vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, vm_ooffset_t foff, vm_object_t *objp) { vm_object_t obj; int flags; flags = *flagsp; /* XXX: lack thredref on device */ if (cdev->si_devsw->d_flags & D_MMAP_ANON) { *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs does not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & PROT_WRITE) != 0) return (EACCES); if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX error = mac_check_cdev_mmap(td->td_ucred, cdev, prot); if (error != 0) return (error); #endif obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, foff); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } /* * vm_mmap() * * MPSAFE * * Internal version of mmap. Currently used by mmap, exec, and sys5 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, objtype_t handle_type, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object; int rv = KERN_SUCCESS; int docow, error; struct thread *td = curthread; if (size == 0) return (0); size = round_page(size); PROC_LOCK(td->td_proc); if (td->td_proc->p_vmspace->vm_map.size + size > lim_cur(td->td_proc, RLIMIT_VMEM)) { PROC_UNLOCK(td->td_proc); return(ENOMEM); } PROC_UNLOCK(td->td_proc); /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ switch (handle_type) { case OBJT_DEVICE: error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, handle, foff, &object); break; case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, foff, &object); break; case OBJT_DEFAULT: if (handle == NULL) { error = 0; break; } /* FALLTHROUGH */ default: error = EINVAL; } if (error) return (error); if (flags & MAP_ANON) { object = NULL; docow = 0; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { docow = MAP_PREFAULT_PARTIAL; } if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; if (maxprot & VM_PROT_READ) maxprot |= VM_PROT_EXECUTE; #endif if (fitit) *addr = pmap_addr_hint(object, *addr, size); if (flags & MAP_STACK) rv = vm_map_stack(map, *addr, size, prot, maxprot, docow | MAP_STACK_GROWS_DOWN); else rv = vm_map_find(map, object, foff, addr, size, fitit, prot, maxprot, docow); if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. 
*/ vm_object_deallocate(object); } else if (flags & MAP_SHARED) { /* * Shared memory is also shared with children. */ rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) (void) vm_map_remove(map, *addr, *addr + size); } /* * If the process has requested that all future mappings * be wired, then heed this. */ if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE)) vm_map_wire(map, *addr, *addr + size, VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 169666) +++ head/sys/vm/vm_object.c (revision 169667) @@ -1,2210 +1,2210 @@ /*- * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Virtual memory object module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define EASY_SCAN_FACTOR 8 #define MSYNC_FLUSH_HARDSEQ 0x01 #define MSYNC_FLUSH_SOFTSEQ 0x02 /* * msync / VM object flushing optimizations */ static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ; SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags, CTLFLAG_RW, &msync_flush_flags, 0, ""); static int old_msync; SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0, "Use old (insecure) msync behavior"); static void vm_object_qcollapse(vm_object_t object); static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags); static void vm_object_vndeallocate(vm_object_t object); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. * * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ struct object_q vm_object_list; struct mtx vm_object_list_mtx; /* lock for object list and count */ struct vm_object kernel_object_store; struct vm_object kmem_object_store; SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats"); static long object_collapses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD, &object_collapses, 0, "VM object collapses"); static long object_bypasses; SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD, &object_bypasses, 0, "VM object bypasses"); /* * next_index determines the page color that is assigned to the next * allocated object. Accesses to next_index are not synchronized * because the effects of two or more object allocations using * next_index simultaneously are inconsequential. At any given time, * numerous objects have the same page color. 
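[Editor's worked example of the arithmetic in _vm_object_allocate() below, illustrative values: if PQ_COLORMASK were 255 and next_index currently held 250, an 8-page object would be assigned pg_color 250 and next_index would advance to (250 + 8) & 255 == 2. A race on the unsynchronized next_index only means a second allocation starts from the same color, which, as the comment says, is inconsequential.]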
*/ static int next_index; static uma_zone_t obj_zone; static int vm_object_zinit(void *mem, int size, int flags); #ifdef INVARIANTS static void vm_object_zdtor(void *mem, int size, void *arg); static void vm_object_zdtor(void *mem, int size, void *arg) { vm_object_t object; object = (vm_object_t)mem; KASSERT(TAILQ_EMPTY(&object->memq), ("object %p has resident pages", object)); KASSERT(object->paging_in_progress == 0, ("object %p paging_in_progress = %d", object, object->paging_in_progress)); KASSERT(object->resident_page_count == 0, ("object %p resident_page_count = %d", object, object->resident_page_count)); KASSERT(object->shadow_count == 0, ("object %p shadow_count = %d", object, object->shadow_count)); } #endif static int vm_object_zinit(void *mem, int size, int flags) { vm_object_t object; object = (vm_object_t)mem; bzero(&object->mtx, sizeof(object->mtx)); VM_OBJECT_LOCK_INIT(object, "standard object"); /* These are true for any object that has been freed */ object->paging_in_progress = 0; object->resident_page_count = 0; object->shadow_count = 0; return (0); } void _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) { int incr; TAILQ_INIT(&object->memq); LIST_INIT(&object->shadow_head); object->root = NULL; object->type = type; object->size = size; object->generation = 1; object->ref_count = 1; object->flags = 0; if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP)) object->flags = OBJ_ONEMAPPING; incr = PQ_MAXLENGTH; if (size <= incr) incr = size; object->pg_color = next_index; next_index = (object->pg_color + incr) & PQ_COLORMASK; object->handle = NULL; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; mtx_lock(&vm_object_list_mtx); TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init(void) { TAILQ_INIT(&vm_object_list); mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF); VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object"); _vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); /* * The lock portion of struct vm_object must be type stable due * to vm_pageout_fallback_object_lock locking a vm object * without holding any references to it. 
*/ obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL, #ifdef INVARIANTS vm_object_zdtor, #else NULL, #endif vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE); } void vm_object_clear_flag(vm_object_t object, u_short bits) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->flags &= ~bits; } void vm_object_pip_add(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress += i; } void vm_object_pip_subtract(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress -= i; } void vm_object_pip_wakeup(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); object->paging_in_progress--; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wakeupn(vm_object_t object, short i) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (i) object->paging_in_progress -= i; if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) { vm_object_clear_flag(object, OBJ_PIPWNT); wakeup(object); } } void vm_object_pip_wait(vm_object_t object, char *waitid) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0); } } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(objtype_t type, vm_pindex_t size) { vm_object_t object; object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK); _vm_object_allocate(type, size, object); return (object); } /* * vm_object_reference: * * Gets another reference to the given object. Note: OBJ_DEAD * objects can be referenced during final cleaning. */ void vm_object_reference(vm_object_t object) { struct vnode *vp; if (object == NULL) return; VM_OBJECT_LOCK(object); object->ref_count++; if (object->type == OBJT_VNODE) { int vfslocked; vp = object->handle; VM_OBJECT_UNLOCK(object); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vget(vp, LK_RETRY, curthread); VFS_UNLOCK_GIANT(vfslocked); } else VM_OBJECT_UNLOCK(object); } /* * vm_object_reference_locked: * * Gets another reference to the given object. * * The object must be locked. */ void vm_object_reference_locked(vm_object_t object) { struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT((object->flags & OBJ_DEAD) == 0, ("vm_object_reference_locked: dead object referenced")); object->ref_count++; if (object->type == OBJT_VNODE) { vp = object->handle; vref(vp); } } /* * Handle deallocating an object of type OBJT_VNODE. */ static void vm_object_vndeallocate(vm_object_t object) { struct vnode *vp = (struct vnode *) object->handle; VFS_ASSERT_GIANT(vp->v_mount); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); KASSERT(object->type == OBJT_VNODE, ("vm_object_vndeallocate: not a vnode object")); KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp")); #ifdef INVARIANTS if (object->ref_count == 0) { vprint("vm_object_vndeallocate", vp); panic("vm_object_vndeallocate: bad object reference count"); } #endif object->ref_count--; if (object->ref_count == 0) { mp_fixme("Unlocked vflag access."); vp->v_vflag &= ~VV_TEXT; } VM_OBJECT_UNLOCK(object); /* * vrele may need a vop lock */ vrele(vp); } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. 
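[Editor's schematic fragment, kernel context assumed; not buildable on its own and not part of this revision. It sketches the allocate/reference/deallocate pairing the comment below describes:]

	vm_object_t obj;

	obj = vm_object_allocate(OBJT_DEFAULT, atop(size));	/* ref_count == 1 */
	vm_object_reference(obj);				/* ref_count == 2 */
	/* ... hand obj to another consumer, use it unlocked ... */
	vm_object_deallocate(obj);				/* drops back to 1 */
	vm_object_deallocate(obj);				/* last reference: object may be terminated */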
*/ void vm_object_deallocate(vm_object_t object) { vm_object_t temp; while (object != NULL) { int vfslocked; vfslocked = 0; restart: VM_OBJECT_LOCK(object); if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *) object->handle; /* * Conditionally acquire Giant for a vnode-backed * object. We have to be careful since the type of * a vnode object can change while the object is * unlocked. */ if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) { vfslocked = 1; if (!mtx_trylock(&Giant)) { VM_OBJECT_UNLOCK(object); mtx_lock(&Giant); goto restart; } } vm_object_vndeallocate(object); VFS_UNLOCK_GIANT(vfslocked); return; } else /* * This is to handle the case that the object * changed type while we dropped its lock to * obtain Giant. */ VFS_UNLOCK_GIANT(vfslocked); KASSERT(object->ref_count != 0, ("vm_object_deallocate: object deallocated too many times: %d", object->type)); /* * If the reference count goes to 0 we start calling * vm_object_terminate() on the object chain. * A ref count of 1 may be a special case depending on the * shadow count being 0 or 1. */ object->ref_count--; if (object->ref_count > 1) { VM_OBJECT_UNLOCK(object); return; } else if (object->ref_count == 1) { if (object->shadow_count == 0) { vm_object_set_flag(object, OBJ_ONEMAPPING); } else if ((object->shadow_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = LIST_FIRST(&object->shadow_head); KASSERT(robject != NULL, ("vm_object_deallocate: ref_count: %d, shadow_count: %d", object->ref_count, object->shadow_count)); if (!VM_OBJECT_TRYLOCK(robject)) { /* * Avoid a potential deadlock. */ object->ref_count++; VM_OBJECT_UNLOCK(object); /* * More likely than not the thread * holding robject's lock has lower * priority than the current thread. * Let the lower priority thread run. */ pause("vmo_de", 1); continue; } /* * Collapse object into its shadow unless its * shadow is dead. In that case, object will * be deallocated by the thread that is * deallocating its shadow. */ if ((robject->flags & OBJ_DEAD) == 0 && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { robject->ref_count++; retry: if (robject->paging_in_progress) { VM_OBJECT_UNLOCK(object); vm_object_pip_wait(robject, "objde1"); temp = robject->backing_object; if (object == temp) { VM_OBJECT_LOCK(object); goto retry; } } else if (object->paging_in_progress) { VM_OBJECT_UNLOCK(robject); object->flags |= OBJ_PIPWNT; msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "objde2", 0); VM_OBJECT_LOCK(robject); temp = robject->backing_object; if (object == temp) { VM_OBJECT_LOCK(object); goto retry; } } else VM_OBJECT_UNLOCK(object); if (robject->ref_count == 1) { robject->ref_count--; object = robject; goto doterm; } object = robject; vm_object_collapse(object); VM_OBJECT_UNLOCK(object); continue; } VM_OBJECT_UNLOCK(robject); } VM_OBJECT_UNLOCK(object); return; } doterm: temp = object->backing_object; if (temp != NULL) { VM_OBJECT_LOCK(temp); LIST_REMOVE(object, shadow_list); temp->shadow_count--; temp->generation++; VM_OBJECT_UNLOCK(temp); object->backing_object = NULL; } /* * Don't double-terminate, we could be in a termination * recursion due to the terminate having to sync data * to disk. */ if ((object->flags & OBJ_DEAD) == 0) vm_object_terminate(object); else VM_OBJECT_UNLOCK(object); object = temp; } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. 
* * The object must be locked. * This routine may block. */ void vm_object_terminate(vm_object_t object) { vm_page_t p; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Make sure no one uses us. */ vm_object_set_flag(object, OBJ_DEAD); /* * wait for the pageout daemon to be done with the object */ vm_object_pip_wait(object, "objtrm"); KASSERT(!object->paging_in_progress, ("vm_object_terminate: pageout in progress")); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = (struct vnode *)object->handle; /* * Clean pages and flush buffers. */ vm_object_page_clean(object, 0, 0, OBJPC_SYNC); VM_OBJECT_UNLOCK(object); vinvalbuf(vp, V_SAVE, NULL, 0, 0); VM_OBJECT_LOCK(object); } KASSERT(object->ref_count == 0, ("vm_object_terminate: object with references, ref_count=%d", object->ref_count)); /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. Don't free wired pages, just * remove them from the object. */ vm_page_lock_queues(); while ((p = TAILQ_FIRST(&object->memq)) != NULL) { KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0, ("vm_object_terminate: freeing busy page %p " "p->busy = %d, p->flags %x\n", p, p->busy, p->flags)); if (p->wire_count == 0) { vm_page_free(p); - cnt.v_pfree++; + VMCNT_ADD(pfree, 1); } else { vm_page_remove(p); } } vm_page_unlock_queues(); /* * Let the pager know object is dead. */ vm_pager_deallocate(object); VM_OBJECT_UNLOCK(object); /* * Remove the object from the global object list. */ mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE(&vm_object_list, object, object_list); mtx_unlock(&vm_object_list_mtx); /* * Free the space for the object. */ uma_zfree(obj_zone, object); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. Leaves page * on whatever queue it is currently on. If NOSYNC is set then do not * write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC), * leaving the object dirty. * * When stuffing pages asynchronously, allow clustering. XXX we need a * synchronous clustering mode implementation. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. */ void vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags) { vm_page_t p, np; vm_pindex_t tstart, tend; vm_pindex_t pi; int clearobjflags; int pagerflags; int curgeneration; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK; pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0; vm_object_set_flag(object, OBJ_CLEANING); tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } vm_page_lock_queues(); /* * If the caller is smart and only msync()s a range he knows is * dirty, we may be able to avoid an object scan. This results in * a phenominal improvement in performance. We cannot do this * as a matter of course because the object may be huge - e.g. * the size might be in the gigabytes or terrabytes. 
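[Editor's illustration, not part of this revision: the userland pattern the comment above rewards -- msync(2)'ing only the range known to be dirty, so the limited scan can avoid walking every resident page of a large mapping. The file name is a placeholder and the file is assumed to be at least 64 KB. A minimal sketch:]

#include <sys/mman.h>
#include <sys/stat.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct stat st;
	char *p;
	int fd;

	fd = open("bigfile", O_RDWR);		/* placeholder path */
	if (fd == -1)
		err(1, "open");
	if (fstat(fd, &st) == -1)
		err(1, "fstat");
	p = mmap(NULL, (size_t)st.st_size, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 0, 64 * 1024);		/* dirty only the first 64 KB */
	if (msync(p, 64 * 1024, MS_ASYNC) == -1)/* flush just that window */
		err(1, "msync");
	munmap(p, (size_t)st.st_size);
	close(fd);
	return (0);
}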
*/ if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) { vm_pindex_t tscan; int scanlimit; int scanreset; scanreset = object->resident_page_count / EASY_SCAN_FACTOR; if (scanreset < 16) scanreset = 16; pagerflags |= VM_PAGER_IGNORE_CLEANCHK; scanlimit = scanreset; tscan = tstart; while (tscan < tend) { curgeneration = object->generation; p = vm_page_lookup(object, tscan); if (p == NULL || p->valid == 0 || VM_PAGE_INQUEUE1(p, PQ_CACHE)) { if (--scanlimit == 0) break; ++tscan; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { if (--scanlimit == 0) break; ++tscan; continue; } /* * If we have been asked to skip nosync pages and * this is a nosync page, we can't continue. */ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) { if (--scanlimit == 0) break; ++tscan; continue; } scanlimit = scanreset; /* * This returns 0 if it was unable to busy the first * page (i.e. had to sleep). */ tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags); } /* * If everything was dirty and we flushed it successfully, * and the requested range is not the entire object, we * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can * return immediately. */ if (tscan >= tend && (tstart || tend < object->size)) { vm_page_unlock_queues(); vm_object_clear_flag(object, OBJ_CLEANING); return; } pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK; } /* * Generally set CLEANCHK interlock and make the page read-only so * we can then clear the object flags. * * However, if this is a nosync mmap then the object is likely to * stay dirty so do not mess with the page and do not clear the * object flags. */ clearobjflags = 1; TAILQ_FOREACH(p, &object->memq, listq) { p->oflags |= VPO_CLEANCHK; if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) clearobjflags = 0; else pmap_remove_write(p); } if (clearobjflags && (tstart == 0) && (tend == object->size)) { struct vnode *vp; vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); if (vp->v_iflag & VI_OBJDIRTY) vp->v_iflag &= ~VI_OBJDIRTY; VI_UNLOCK(vp); } } rescan: curgeneration = object->generation; for (p = TAILQ_FIRST(&object->memq); p; p = np) { int n; np = TAILQ_NEXT(p, listq); again: pi = p->pindex; if ((p->oflags & VPO_CLEANCHK) == 0 || (pi < tstart) || (pi >= tend) || (p->valid == 0) || VM_PAGE_INQUEUE1(p, PQ_CACHE)) { p->oflags &= ~VPO_CLEANCHK; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0) { p->oflags &= ~VPO_CLEANCHK; continue; } /* * If we have been asked to skip nosync pages and this is a * nosync page, skip it. Note that the object flags were * not cleared in this case so we do not have to set them. */ if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) { p->oflags &= ~VPO_CLEANCHK; continue; } n = vm_object_page_collect_flush(object, p, curgeneration, pagerflags); if (n == 0) goto rescan; if (object->generation != curgeneration) goto rescan; /* * Try to optimize the next page. If we can't we pick up * our (random) scan where we left off. 
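[Editor's worked example, illustrative, of the clustering done by vm_object_page_collect_flush(), defined below: with vm_pageout_page_count at its usual value of 16, a flush starting at a dirty page with pindex 100 whose neighbours 101-103 and 98-99 are also dirty collects maxf == 3 look-ahead and maxb == 2 look-behind pages, assembles ma[] in ascending order as {98, 99, 100, 101, 102, 103}, and hands the six-page run to vm_pageout_flush() as a single clustered write; the return value (maxf + 1) then tells vm_object_page_clean() how far to advance its scan.]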
*/ if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) { if ((p = vm_page_lookup(object, pi + n)) != NULL) goto again; } } vm_page_unlock_queues(); #if 0 VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc); #endif vm_object_clear_flag(object, OBJ_CLEANING); return; } static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags) { int runlen; int maxf; int chkb; int maxb; int i; vm_pindex_t pi; vm_page_t maf[vm_pageout_page_count]; vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; mtx_assert(&vm_page_queue_mtx, MA_OWNED); pi = p->pindex; while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) { vm_page_lock_queues(); if (object->generation != curgeneration) { return(0); } } maxf = 0; for(i = 1; i < vm_pageout_page_count; i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi + i)) != NULL) { if ((tp->oflags & VPO_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->oflags & VPO_CLEANCHK) == 0) || (tp->busy != 0)) break; if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) { tp->oflags &= ~VPO_CLEANCHK; break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->oflags &= ~VPO_CLEANCHK; break; } maf[ i - 1 ] = tp; maxf++; continue; } break; } maxb = 0; chkb = vm_pageout_page_count - maxf; if (chkb) { for(i = 1; i < chkb;i++) { vm_page_t tp; if ((tp = vm_page_lookup(object, pi - i)) != NULL) { if ((tp->oflags & VPO_BUSY) || ((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 && (tp->oflags & VPO_CLEANCHK) == 0) || (tp->busy != 0)) break; if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) { tp->oflags &= ~VPO_CLEANCHK; break; } vm_page_test_dirty(tp); if ((tp->dirty & tp->valid) == 0) { tp->oflags &= ~VPO_CLEANCHK; break; } mab[ i - 1 ] = tp; maxb++; continue; } break; } } for(i = 0; i < maxb; i++) { int index = (maxb - i) - 1; ma[index] = mab[i]; ma[index]->oflags &= ~VPO_CLEANCHK; } p->oflags &= ~VPO_CLEANCHK; ma[maxb] = p; for(i = 0; i < maxf; i++) { int index = (maxb + i) + 1; ma[index] = maf[i]; ma[index]->oflags &= ~VPO_CLEANCHK; } runlen = maxb + maxf + 1; vm_pageout_flush(ma, runlen, pagerflags); for (i = 0; i < runlen; i++) { if (ma[i]->valid & ma[i]->dirty) { pmap_remove_write(ma[i]); ma[i]->oflags |= VPO_CLEANCHK; /* * maxf will end up being the actual number of pages * we wrote out contiguously, non-inclusive of the * first page. We do not count look-behind pages. */ if (i >= maxb + 1 && (maxf > i - maxb - 1)) maxf = i - maxb - 1; } } return(maxf + 1); } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * for semantic correctness. * * Note: certain anonymous maps, such as MAP_NOSYNC maps, * may start out with a NULL object. */ void vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size, boolean_t syncio, boolean_t invalidate) { vm_object_t backing_object; struct vnode *vp; struct mount *mp; int flags; if (object == NULL) return; VM_OBJECT_LOCK(object); while ((backing_object = object->backing_object) != NULL) { VM_OBJECT_LOCK(backing_object); offset += object->backing_object_offset; VM_OBJECT_UNLOCK(object); object = backing_object; if (object->size < OFF_TO_IDX(offset + size)) size = IDX_TO_OFF(object->size) - offset; } /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). 
* * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ if (object->type == OBJT_VNODE && (object->flags & OBJ_MIGHTBEDIRTY) != 0) { int vfslocked; vp = object->handle; VM_OBJECT_UNLOCK(object); (void) vn_start_write(vp, &mp, V_WAIT); vfslocked = VFS_LOCK_GIANT(vp->v_mount); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread); flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? OBJPC_INVAL : 0; VM_OBJECT_LOCK(object); vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); VM_OBJECT_UNLOCK(object); VOP_UNLOCK(vp, 0, curthread); VFS_UNLOCK_GIANT(vfslocked); vn_finished_write(mp); VM_OBJECT_LOCK(object); } if ((object->type == OBJT_VNODE || object->type == OBJT_DEVICE) && invalidate) { boolean_t purge; purge = old_msync || (object->type == OBJT_DEVICE); vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), purge ? FALSE : TRUE); } VM_OBJECT_UNLOCK(object); } /* * vm_object_madvise: * * Implements the madvise function at the object/page level. * * MADV_WILLNEED (any object) * * Activate the specified pages if they are resident. * * MADV_DONTNEED (any object) * * Deactivate the specified pages if they are resident. * * MADV_FREE (OBJT_DEFAULT/OBJT_SWAP objects, * OBJ_ONEMAPPING only) * * Deactivate and clean the specified pages if they are * resident. This permits the process to reuse the pages * without faulting or the kernel to reclaim the pages * without I/O. */ void vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise) { vm_pindex_t end, tpindex; vm_object_t backing_object, tobject; vm_page_t m; if (object == NULL) return; VM_OBJECT_LOCK(object); end = pindex + count; /* * Locate and adjust resident pages */ for (; pindex < end; pindex += 1) { relookup: tobject = object; tpindex = pindex; shadowlookup: /* * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages * and those pages must be OBJ_ONEMAPPING. */ if (advise == MADV_FREE) { if ((tobject->type != OBJT_DEFAULT && tobject->type != OBJT_SWAP) || (tobject->flags & OBJ_ONEMAPPING) == 0) { goto unlock_tobject; } } m = vm_page_lookup(tobject, tpindex); if (m == NULL) { /* * There may be swap even if there is no backing page */ if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); /* * next object */ backing_object = tobject->backing_object; if (backing_object == NULL) goto unlock_tobject; VM_OBJECT_LOCK(backing_object); tpindex += OFF_TO_IDX(tobject->backing_object_offset); if (tobject != object) VM_OBJECT_UNLOCK(tobject); tobject = backing_object; goto shadowlookup; } /* * If the page is busy or not in a normal active state, * we skip it. If the page is not managed there are no * page queues to mess with. Things can break if we mess * with pages in any of the below states. 
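 *
 * (As with vm_object_sync() above, the advice cases handled below
 * normally originate in a userland madvise(2) call.  An illustrative
 * caller, not part of this file:)
 */
#if 0
#include <sys/mman.h>

/*
 * Hint that a scratch region will not be needed again soon.
 * MADV_FREE lets the pages be reclaimed without being paged out;
 * MADV_WILLNEED/MADV_DONTNEED activate or deactivate resident pages
 * as described in the function header above.
 */
static void
drop_scratch_buffer(void *buf, size_t len)
{
	(void)madvise(buf, len, MADV_FREE);
}
#endif
/*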
*/ vm_page_lock_queues(); if (m->hold_count || m->wire_count || (m->flags & PG_UNMANAGED) || m->valid != VM_PAGE_BITS_ALL) { vm_page_unlock_queues(); goto unlock_tobject; } if ((m->oflags & VPO_BUSY) || m->busy) { vm_page_flag_set(m, PG_REFERENCED); vm_page_unlock_queues(); if (object != tobject) VM_OBJECT_UNLOCK(object); m->oflags |= VPO_WANTED; msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo", 0); VM_OBJECT_LOCK(object); goto relookup; } if (advise == MADV_WILLNEED) { vm_page_activate(m); } else if (advise == MADV_DONTNEED) { vm_page_dontneed(m); } else if (advise == MADV_FREE) { /* * Mark the page clean. This will allow the page * to be freed up by the system. However, such pages * are often reused quickly by malloc()/free() * so we do not do anything that would cause * a page fault if we can help it. * * Specifically, we do not try to actually free * the page now nor do we try to put it in the * cache (which would cause a page fault on reuse). * * But we do make the page is freeable as we * can without actually taking the step of unmapping * it. */ pmap_clear_modify(m); m->dirty = 0; m->act_count = 0; vm_page_dontneed(m); } vm_page_unlock_queues(); if (advise == MADV_FREE && tobject->type == OBJT_SWAP) swap_pager_freespace(tobject, tpindex, 1); unlock_tobject: if (tobject != object) VM_OBJECT_UNLOCK(tobject); } VM_OBJECT_UNLOCK(object); } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. */ void vm_object_shadow( vm_object_t *object, /* IN/OUT */ vm_ooffset_t *offset, /* IN/OUT */ vm_size_t length) { vm_object_t source; vm_object_t result; source = *object; /* * Don't create the new object if the old object isn't shared. */ if (source != NULL) { VM_OBJECT_LOCK(source); if (source->ref_count == 1 && source->handle == NULL && (source->type == OBJT_DEFAULT || source->type == OBJT_SWAP)) { VM_OBJECT_UNLOCK(source); return; } VM_OBJECT_UNLOCK(source); } /* * Allocate a new object with the given length. */ result = vm_object_allocate(OBJT_DEFAULT, length); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. * * Try to optimize the result object's page color when shadowing * in order to maintain page coloring consistency in the combined * shadowed object. */ result->backing_object = source; /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; if (source != NULL) { VM_OBJECT_LOCK(source); LIST_INSERT_HEAD(&source->shadow_head, result, shadow_list); source->shadow_count++; source->generation++; if (length < source->size) length = source->size; if (length > PQ_MAXLENGTH || source->generation > 1) length = PQ_MAXLENGTH; result->pg_color = (source->pg_color + length * source->generation) & PQ_COLORMASK; result->flags |= source->flags & OBJ_NEEDGIANT; VM_OBJECT_UNLOCK(source); next_index = (result->pg_color + PQ_MAXLENGTH) & PQ_COLORMASK; } /* * Return the new things */ *offset = 0; *object = result; } /* * vm_object_split: * * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. 
*/ void vm_object_split(vm_map_entry_t entry) { vm_page_t m, m_next; vm_object_t orig_object, new_object, source; vm_pindex_t idx, offidxstart; vm_size_t size; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; VM_OBJECT_UNLOCK(orig_object); offidxstart = OFF_TO_IDX(entry->offset); size = atop(entry->end - entry->start); /* * If swap_pager_copy() is later called, it will convert new_object * into a swap object. */ new_object = vm_object_allocate(OBJT_DEFAULT, size); /* * At this point, the new object is still private, so the order in * which the original and new objects are locked does not matter. */ VM_OBJECT_LOCK(new_object); VM_OBJECT_LOCK(orig_object); source = orig_object->backing_object; if (source != NULL) { VM_OBJECT_LOCK(source); if ((source->flags & OBJ_DEAD) != 0) { VM_OBJECT_UNLOCK(source); VM_OBJECT_UNLOCK(orig_object); VM_OBJECT_UNLOCK(new_object); vm_object_deallocate(new_object); VM_OBJECT_LOCK(orig_object); return; } LIST_INSERT_HEAD(&source->shadow_head, new_object, shadow_list); source->shadow_count++; source->generation++; vm_object_reference_locked(source); /* for new_object */ vm_object_clear_flag(source, OBJ_ONEMAPPING); VM_OBJECT_UNLOCK(source); new_object->backing_object_offset = orig_object->backing_object_offset + entry->offset; new_object->backing_object = source; } new_object->flags |= orig_object->flags & OBJ_NEEDGIANT; retry: if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) { if (m->pindex < offidxstart) { m = vm_page_splay(offidxstart, orig_object->root); if ((orig_object->root = m)->pindex < offidxstart) m = TAILQ_NEXT(m, listq); } } vm_page_lock_queues(); for (; m != NULL && (idx = m->pindex - offidxstart) < size; m = m_next) { m_next = TAILQ_NEXT(m, listq); /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if ((m->oflags & VPO_BUSY) || m->busy) { vm_page_flag_set(m, PG_REFERENCED); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(new_object); m->oflags |= VPO_WANTED; msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0); VM_OBJECT_LOCK(new_object); goto retry; } vm_page_rename(m, new_object, idx); /* page automatically made dirty by rename and cache handled */ vm_page_busy(m); } vm_page_unlock_queues(); if (orig_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case the orig_object's * and new_object's locks are released and reacquired. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); } VM_OBJECT_UNLOCK(orig_object); TAILQ_FOREACH(m, &new_object->memq, listq) vm_page_wakeup(m); VM_OBJECT_UNLOCK(new_object); entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); VM_OBJECT_LOCK(new_object); } #define OBSC_TEST_ALL_SHADOWED 0x0001 #define OBSC_COLLAPSE_NOWAIT 0x0002 #define OBSC_COLLAPSE_WAIT 0x0004 static int vm_object_backing_scan(vm_object_t object, int op) { int r = 1; vm_page_t p; vm_object_t backing_object; vm_pindex_t backing_offset_index; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED); backing_object = object->backing_object; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); /* * Initial conditions */ if (op & OBSC_TEST_ALL_SHADOWED) { /* * We do not want to have to test for the existence of * swap pages in the backing object. 
XXX but with the * new swapper this would be pretty easy to do. * * XXX what about anonymous MAP_SHARED memory that hasn't * been ZFOD faulted yet? If we do not test for this, the * shadow test may succeed! XXX */ if (backing_object->type != OBJT_DEFAULT) { return (0); } } if (op & OBSC_COLLAPSE_WAIT) { vm_object_set_flag(backing_object, OBJ_DEAD); } /* * Our scan */ p = TAILQ_FIRST(&backing_object->memq); while (p) { vm_page_t next = TAILQ_NEXT(p, listq); vm_pindex_t new_pindex = p->pindex - backing_offset_index; if (op & OBSC_TEST_ALL_SHADOWED) { vm_page_t pp; /* * Ignore pages outside the parent object's range * and outside the parent object's mapping of the * backing object. * * note that we do not busy the backing object's * page. */ if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { p = next; continue; } /* * See if the parent has the page or if the parent's * object pager has the page. If the parent has the * page but the page is not valid, the parent's * object pager must have the page. * * If this fails, the parent does not completely shadow * the object and we might as well give up now. */ pp = vm_page_lookup(object, new_pindex); if ( (pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, new_pindex, NULL, NULL) ) { r = 0; break; } } /* * Check for busy page */ if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) { vm_page_t pp; if (op & OBSC_COLLAPSE_NOWAIT) { if ((p->oflags & VPO_BUSY) || !p->valid || p->busy) { p = next; continue; } } else if (op & OBSC_COLLAPSE_WAIT) { if ((p->oflags & VPO_BUSY) || p->busy) { vm_page_lock_queues(); vm_page_flag_set(p, PG_REFERENCED); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); p->oflags |= VPO_WANTED; msleep(p, VM_OBJECT_MTX(backing_object), PDROP | PVM, "vmocol", 0); VM_OBJECT_LOCK(object); VM_OBJECT_LOCK(backing_object); /* * If we slept, anything could have * happened. Since the object is * marked dead, the backing offset * should not have changed so we * just restart our scan. */ p = TAILQ_FIRST(&backing_object->memq); continue; } } KASSERT( p->object == backing_object, ("vm_object_backing_scan: object mismatch") ); /* * Destroy any associated swap */ if (backing_object->type == OBJT_SWAP) { swap_pager_freespace( backing_object, p->pindex, 1 ); } if ( p->pindex < backing_offset_index || new_pindex >= object->size ) { /* * Page is out of the parent object's range, we * can simply destroy it. */ vm_page_lock_queues(); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock_queues(); p = next; continue; } pp = vm_page_lookup(object, new_pindex); if ( pp != NULL || vm_pager_has_page(object, new_pindex, NULL, NULL) ) { /* * page already exists in parent OR swap exists * for this location in the parent. Destroy * the original page from the backing object. * * Leave the parent's page alone */ vm_page_lock_queues(); KASSERT(!pmap_page_is_mapped(p), ("freeing mapped page %p", p)); if (p->wire_count == 0) vm_page_free(p); else vm_page_remove(p); vm_page_unlock_queues(); p = next; continue; } /* * Page does not exist in parent, rename the * page from the backing object to the main object. * * If the page was mapped to a process, it can remain * mapped through the rename. 
*/ vm_page_lock_queues(); vm_page_rename(p, object, new_pindex); vm_page_unlock_queues(); /* page automatically made dirty by rename */ } p = next; } return (r); } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(vm_object_t object) { vm_object_t backing_object = object->backing_object; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); VM_OBJECT_LOCK_ASSERT(backing_object, MA_OWNED); if (backing_object->ref_count != 1) return; vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT); } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(vm_object_t object) { VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); while (TRUE) { vm_object_t backing_object; /* * Verify that the conditions are right for collapse: * * The object exists and the backing object exists. */ if ((backing_object = object->backing_object) == NULL) break; /* * we check the backing object first, because it is most likely * not collapsable. */ VM_OBJECT_LOCK(backing_object); if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(backing_object); break; } if ( object->paging_in_progress != 0 || backing_object->paging_in_progress != 0 ) { vm_object_qcollapse(object); VM_OBJECT_UNLOCK(backing_object); break; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) have * the parent bypass the object if the parent happens to shadow * all the resident pages in the entire backing object. * * This is ignoring pager-backed pages such as swap pages. * vm_object_backing_scan fails the shadowing test in this * case. */ if (backing_object->ref_count == 1) { /* * If there is exactly one reference to the backing * object, we can collapse it into the parent. */ vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT); /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { /* * swap_pager_copy() can sleep, in which case * the backing_object's and object's locks are * released and reacquired. */ swap_pager_copy( backing_object, object, OFF_TO_IDX(object->backing_object_offset), TRUE); } /* * Object now shadows whatever backing_object did. * Note that the reference to * backing_object->backing_object moves from within * backing_object to within object. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; if (backing_object->backing_object) { VM_OBJECT_LOCK(backing_object->backing_object); LIST_REMOVE(backing_object, shadow_list); LIST_INSERT_HEAD( &backing_object->backing_object->shadow_head, object, shadow_list); /* * The shadow_count has not changed. */ backing_object->backing_object->generation++; VM_OBJECT_UNLOCK(backing_object->backing_object); } object->backing_object = backing_object->backing_object; object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. 
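 *
 * In outline, the collapse performed above amounts to the pointer
 * splice sketched below (a simplified model that ignores the locking,
 * page migration and pager copy the real code spends its effort on):
 */
#if 0
struct obj {
	struct obj	*backing;	/* analogue of backing_object */
	long		 backing_off;	/* analogue of backing_object_offset */
	int		 ref_count;
};

/*
 * The parent inherits its backing object's own backing object and the
 * accumulated offset; "middle" (ref_count == 1, no pages left) can
 * then simply be disposed of.
 */
static void
collapse_one_level(struct obj *parent)
{
	struct obj *middle = parent->backing;

	parent->backing = middle->backing;
	parent->backing_off += middle->backing_off;
}
#endif
/*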
*/ KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object)); VM_OBJECT_UNLOCK(backing_object); mtx_lock(&vm_object_list_mtx); TAILQ_REMOVE( &vm_object_list, backing_object, object_list ); mtx_unlock(&vm_object_list_mtx); uma_zfree(obj_zone, backing_object); object_collapses++; } else { vm_object_t new_backing_object; /* * If we do not entirely shadow the backing object, * there is nothing we can do so we give up. */ if (object->resident_page_count != object->size && vm_object_backing_scan(object, OBSC_TEST_ALL_SHADOWED) == 0) { VM_OBJECT_UNLOCK(backing_object); break; } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ LIST_REMOVE(object, shadow_list); backing_object->shadow_count--; backing_object->generation++; new_backing_object = backing_object->backing_object; if ((object->backing_object = new_backing_object) != NULL) { VM_OBJECT_LOCK(new_backing_object); LIST_INSERT_HEAD( &new_backing_object->shadow_head, object, shadow_list ); new_backing_object->shadow_count++; new_backing_object->generation++; vm_object_reference_locked(new_backing_object); VM_OBJECT_UNLOCK(new_backing_object); object->backing_object_offset += backing_object->backing_object_offset; } /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish. */ backing_object->ref_count--; VM_OBJECT_UNLOCK(backing_object); object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: * * Removes all physical pages in the given range from the * object's list of pages. If the range's end is zero, all * physical pages from the range's start to the end of the object * are deleted. * * The object must be locked. */ void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, boolean_t clean_only) { vm_page_t p, next; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (object->resident_page_count == 0) return; /* * Since physically-backed objects do not use managed pages, we can't * remove pages from the object (we must instead remove the page * references, and then destroy the object). */ KASSERT(object->type != OBJT_PHYS || object == kernel_object || object == kmem_object, ("attempt to remove pages from a physical object")); vm_object_pip_add(object, 1); again: vm_page_lock_queues(); if ((p = TAILQ_FIRST(&object->memq)) != NULL) { if (p->pindex < start) { p = vm_page_splay(start, object->root); if ((object->root = p)->pindex < start) p = TAILQ_NEXT(p, listq); } } /* * Assert: the variable p is either (1) the page with the * least pindex greater than or equal to the parameter pindex * or (2) NULL. */ for (; p != NULL && (p->pindex < end || end == 0); p = next) { next = TAILQ_NEXT(p, listq); if (p->wire_count != 0) { pmap_remove_all(p); if (!clean_only) p->valid = 0; continue; } if (vm_page_sleep_if_busy(p, TRUE, "vmopar")) goto again; if (clean_only && p->valid) { pmap_remove_write(p); if (p->valid & p->dirty) continue; } pmap_remove_all(p); vm_page_free(p); } vm_page_unlock_queues(); vm_object_pip_wakeup(object); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? 
* * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * prev_size Size of reference to prev_object * next_size Size of reference to the second object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset, vm_size_t prev_size, vm_size_t next_size) { vm_pindex_t next_pindex; if (prev_object == NULL) return (TRUE); VM_OBJECT_LOCK(prev_object); if (prev_object->type != OBJT_DEFAULT && prev_object->type != OBJT_SWAP) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->backing_object != NULL) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; next_pindex = OFF_TO_IDX(prev_offset) + prev_size; if ((prev_object->ref_count > 1) && (prev_object->size != next_pindex)) { VM_OBJECT_UNLOCK(prev_object); return (FALSE); } /* * Remove any pages that may still be in the object from a previous * deallocation. */ if (next_pindex < prev_object->size) { vm_object_page_remove(prev_object, next_pindex, next_pindex + next_size, FALSE); if (prev_object->type == OBJT_SWAP) swap_pager_freespace(prev_object, next_pindex, next_size); } /* * Extend the object if necessary. */ if (next_pindex + next_size > prev_object->size) prev_object->size = next_pindex + next_size; VM_OBJECT_UNLOCK(prev_object); return (TRUE); } void vm_object_set_writeable_dirty(vm_object_t object) { struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) return; vm_object_set_flag(object, OBJ_MIGHTBEDIRTY); if (object->type == OBJT_VNODE && (vp = (struct vnode *)object->handle) != NULL) { VI_LOCK(vp); vp->v_iflag |= VI_OBJDIRTY; VI_UNLOCK(vp); } } #include "opt_ddb.h" #ifdef DDB #include #include #include static int _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry) { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if (_vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { tmpm = entry->object.sub_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if (_vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if ((obj = entry->object.vm_object) != NULL) { for (; obj; obj = obj->backing_object) if (obj == object) { return 1; } } return 0; } static int vm_object_in_map(vm_object_t object) { struct proc *p; /* sx_slock(&allproc_lock); */ FOREACH_PROC_IN_SYSTEM(p) { if (!p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; if (_vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) { /* sx_sunlock(&allproc_lock); */ return 1; } } /* sx_sunlock(&allproc_lock); */ if (_vm_object_in_map(kernel_map, object, 0)) return 1; if (_vm_object_in_map(kmem_map, object, 0)) return 1; if (_vm_object_in_map(pager_map, object, 0)) return 1; if (_vm_object_in_map(buffer_map, object, 0)) return 1; return 0; } DB_SHOW_COMMAND(vmochk, vm_object_check) { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have 
zero ref counts. */ TAILQ_FOREACH(object, &vm_object_list, object_list) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { db_printf("vmochk: internal obj has zero ref count: %ld\n", (long)object->size); } if (!vm_object_in_map(object)) { db_printf( "vmochk: internal obj is not in a map: " "ref: %d, size: %lu: 0x%lx, backing_object: %p\n", object->ref_count, (u_long)object->size, (u_long)object->size, (void *)object->backing_object); } } } } /* * vm_object_print: [ debug ] */ DB_SHOW_COMMAND(object, vm_object_print_static) { /* XXX convert args. */ vm_object_t object = (vm_object_t)addr; boolean_t full = have_addr; vm_page_t p; /* XXX count is an (unused) arg. Avoid shadowing it. */ #define count was_count int count; if (object == NULL) return; db_iprintf( "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n", object, (int)object->type, (uintmax_t)object->size, object->resident_page_count, object->ref_count, object->flags); db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n", object->shadow_count, object->backing_object ? object->backing_object->ref_count : 0, object->backing_object, (uintmax_t)object->backing_object_offset); if (!full) return; db_indent += 2; count = 0; TAILQ_FOREACH(p, &object->memq, listq) { if (count == 0) db_iprintf("memory:="); else if (count == 6) { db_printf("\n"); db_iprintf(" ..."); count = 0; } else db_printf(","); count++; db_printf("(off=0x%jx,page=0x%jx)", (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p)); } if (count != 0) db_printf("\n"); db_indent -= 2; } /* XXX. */ #undef count /* XXX need this non-static entry for calling from vm_map_print. */ void vm_object_print( /* db_expr_t */ long addr, boolean_t have_addr, /* db_expr_t */ long count, char *modif) { vm_object_print_static(addr, have_addr, count, modif); } DB_SHOW_COMMAND(vmopag, vm_object_print_pages) { vm_object_t object; int nl = 0; int c; TAILQ_FOREACH(object, &vm_object_list, object_list) { vm_pindex_t idx, fidx; vm_pindex_t osize; vm_paddr_t pa = -1, padiff; int rcount; vm_page_t m; db_printf("new object: %p\n", (void *)object); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; fidx = 0; osize = object->size; if (osize > 128) osize = 128; for (idx = 0; idx < osize; idx++) { m = vm_page_lookup(object, idx); if (m == NULL) { if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; rcount = 0; } continue; } if (rcount && (VM_PAGE_TO_PHYS(m) == pa + rcount * PAGE_SIZE)) { ++rcount; continue; } if (rcount) { padiff = pa + rcount * PAGE_SIZE - VM_PAGE_TO_PHYS(m); padiff >>= PAGE_SHIFT; padiff &= PQ_COLORMASK; if (padiff == 0) { pa = VM_PAGE_TO_PHYS(m) - rcount * PAGE_SIZE; ++rcount; continue; } db_printf(" index(%ld)run(%d)pa(0x%lx)", (long)fidx, rcount, (long)pa); db_printf("pd(%ld)\n", (long)padiff); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } fidx = idx; pa = VM_PAGE_TO_PHYS(m); rcount = 1; } if (rcount) { db_printf(" index(%ld)run(%d)pa(0x%lx)\n", (long)fidx, rcount, (long)pa); if (nl > 18) { c = cngetc(); if (c != ' ') return; nl = 0; } nl++; } } } #endif /* DDB */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 169666) +++ head/sys/vm/vm_page.c (revision 169667) @@ -1,1814 +1,1816 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. 
* * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 */ /*- * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * GENERAL RULES ON VM_PAGE MANIPULATION * * - a pageq mutex is required when adding or removing a page from a * page queue (vm_page_queue[]), regardless of other mutexes or the * busy state of a page. * * - a hash chain mutex is required when associating or disassociating * a page from the VM PAGE CACHE hash table (vm_page_buckets), * regardless of other mutexes or the busy state of a page. * * - either a hash chain mutex OR a busied page is required in order * to modify the page flags. A hash chain mutex must be obtained in * order to busy a page. A page's flags cannot be modified by a * hash chain mutex if the page is marked busy. * * - The object memq mutex is held when inserting or removing * pages from an object (vm_page_insert() or vm_page_remove()). This * is different from the object's main mutex. 
* * Generally speaking, you have to be aware of side effects when running * vm_page ops. A vm_page_lookup() will return with the hash chain * locked, whether it was able to lookup the page or not. vm_page_free(), * vm_page_cache(), vm_page_activate(), and a number of other routines * will release the hash chain mutex for you. Intermediate manipulation * routines such as vm_page_flag_set() expect the hash chain to be held * on entry and the hash chain will remain held on return. * * pageq scanning can only occur with the pageq in question locked. * We have a known bottleneck with the active queue, but the cache * and free queues are actually arrays already. */ /* * Resident memory management module. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Associated with page of user-allocatable memory is a * page structure. */ struct mtx vm_page_queue_mtx; struct mtx vm_page_queue_free_mtx; vm_page_t vm_page_array = 0; int vm_page_array_size = 0; long first_page = 0; int vm_page_zero_count = 0; static int boot_pages = UMA_BOOT_PAGES; TUNABLE_INT("vm.boot_pages", &boot_pages); SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0, "number of pages allocated for bootstrapping the VM system"); /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. */ void vm_set_page_size(void) { - if (cnt.v_page_size == 0) - cnt.v_page_size = PAGE_SIZE; - if (((cnt.v_page_size - 1) & cnt.v_page_size) != 0) + if (VMCNT_GET(page_size) == 0) + VMCNT_SET(page_size, PAGE_SIZE); + if (((VMCNT_GET(page_size) - 1) & VMCNT_GET(page_size)) != 0) panic("vm_set_page_size: page size not a power of two"); } /* * vm_page_blacklist_lookup: * * See if a physical address in this page has been listed * in the blacklist tunable. Entries in the tunable are * separated by spaces or commas. If an invalid integer is * encountered then the rest of the string is skipped. */ static int vm_page_blacklist_lookup(char *list, vm_paddr_t pa) { vm_paddr_t bad; char *cp, *pos; for (pos = list; *pos != '\0'; pos = cp) { bad = strtoq(pos, &cp, 0); if (*cp != '\0') { if (*cp == ' ' || *cp == ',') { cp++; if (cp == pos) continue; } else break; } if (pa == trunc_page(bad)) return (1); } return (0); } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. 
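 *
 * During startup the "vm.blacklist" kernel environment variable
 * (typically set from loader.conf, e.g.
 * vm.blacklist="0x12345000,0x2468a000") is consulted through
 * vm_page_blacklist_lookup() above so that known-bad frames are never
 * placed on the free queues.  A userland harness following the same
 * parsing rules, for illustration only (hypothetical test code, not
 * part of the kernel):
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

/*
 * Entries are separated by spaces or commas, and a malformed entry
 * terminates the scan.  The mask below stands in for trunc_page()
 * assuming 4 KB pages.
 */
static int
blacklist_contains(const char *list, unsigned long long pa)
{
	unsigned long long bad;
	const char *pos;
	char *cp;

	for (pos = list; *pos != '\0'; pos = cp) {
		bad = strtoull(pos, &cp, 0);
		if (*cp != '\0') {
			if (*cp == ' ' || *cp == ',')
				cp++;
			else
				break;
		}
		if (pa == (bad & ~0xfffULL))
			return (1);
	}
	return (0);
}

int
main(void)
{
	/* Prints 1: 0x12345000 is listed. */
	printf("%d\n", blacklist_contains("0x12345000,0x2468a000",
	    0x12345000ULL));
	return (0);
}
#endif
/*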
*/ vm_offset_t vm_page_startup(vm_offset_t vaddr) { vm_offset_t mapped; vm_size_t npages; vm_paddr_t page_range; vm_paddr_t new_end; int i; vm_paddr_t pa; int nblocks; vm_paddr_t last_pa; char *list; /* the biggest memory array is the second group of pages */ vm_paddr_t end; vm_paddr_t biggestsize; vm_paddr_t low_water, high_water; int biggestone; vm_paddr_t total; total = 0; biggestsize = 0; biggestone = 0; nblocks = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } low_water = phys_avail[0]; high_water = phys_avail[1]; for (i = 0; phys_avail[i + 1]; i += 2) { vm_paddr_t size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } if (phys_avail[i] < low_water) low_water = phys_avail[i]; if (phys_avail[i + 1] > high_water) high_water = phys_avail[i + 1]; ++nblocks; total += size; } end = phys_avail[biggestone+1]; /* * Initialize the locks. */ mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF | MTX_RECURSE); mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL, MTX_DEF); /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ vm_pageq_init(); /* * Allocate memory for use when boot strapping the kernel memory * allocator. */ new_end = end - (boot_pages * UMA_SLAB_SIZE); new_end = trunc_page(new_end); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)mapped, end - new_end); uma_startup((void *)mapped, boot_pages); #if defined(__amd64__) || defined(__i386__) /* * Allocate a bitmap to indicate that a random physical page * needs to be included in a minidump. * * The amd64 port needs this to indicate which direct map pages * need to be dumped, via calls to dump_add_page()/dump_drop_page(). * * However, i386 still needs this workspace internally within the * minidump code. In theory, they are not needed on i386, but are * included should the sf_buf code decide to use them. */ page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE; vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY); new_end -= vm_page_dump_size; vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end, new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE); bzero((void *)vm_page_dump, vm_page_dump_size); #endif /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = low_water / PAGE_SIZE; #ifdef VM_PHYSSEG_SPARSE page_range = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) page_range += atop(phys_avail[i + 1] - phys_avail[i]); #elif defined(VM_PHYSSEG_DENSE) page_range = high_water / PAGE_SIZE - first_page; #else #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined." #endif npages = (total - (page_range * sizeof(struct vm_page)) - (end - new_end)) / PAGE_SIZE; end = new_end; /* * Reserve an unmapped guard page to trap access to vm_page_array[-1]. */ vaddr += PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ new_end = trunc_page(end - page_range * sizeof(struct vm_page)); mapped = pmap_map(&vaddr, new_end, end, VM_PROT_READ | VM_PROT_WRITE); vm_page_array = (vm_page_t) mapped; #ifdef __amd64__ /* * pmap_map on amd64 comes out of the direct-map, not kvm like i386, * so the pages must be tracked for a crashdump to include this data. 
* This includes the vm_page_array and the early UMA bootstrap pages. */ for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE) dump_add_page(pa); #endif phys_avail[biggestone + 1] = new_end; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; /* * This assertion tests the hypothesis that npages and total are * redundant. XXX */ page_range = 0; for (i = 0; phys_avail[i + 1] != 0; i += 2) page_range += atop(phys_avail[i + 1] - phys_avail[i]); KASSERT(page_range == npages, ("vm_page_startup: inconsistent page counts")); /* * Construct the free queue(s) in descending order (by physical * address) so that the first 16MB of physical memory is allocated * last rather than first. On large-memory machines, this avoids * the exhaustion of low physical memory before isa_dma_init has run. */ - cnt.v_page_count = 0; - cnt.v_free_count = 0; + VMCNT_SET(page_count, 0); + VMCNT_SET(free_count, 0); list = getenv("vm.blacklist"); for (i = 0; phys_avail[i + 1] != 0; i += 2) { pa = phys_avail[i]; last_pa = phys_avail[i + 1]; while (pa < last_pa) { if (list != NULL && vm_page_blacklist_lookup(list, pa)) printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa); else vm_pageq_add_new_page(pa); pa += PAGE_SIZE; } } freeenv(list); return (vaddr); } void vm_page_flag_set(vm_page_t m, unsigned short bits) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->flags |= bits; } void vm_page_flag_clear(vm_page_t m, unsigned short bits) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->flags &= ~bits; } void vm_page_busy(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); KASSERT((m->oflags & VPO_BUSY) == 0, ("vm_page_busy: page already busy!!!")); m->oflags |= VPO_BUSY; } /* * vm_page_flash: * * wakeup anyone waiting for the page. */ void vm_page_flash(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->oflags & VPO_WANTED) { m->oflags &= ~VPO_WANTED; wakeup(m); } } /* * vm_page_wakeup: * * clear the VPO_BUSY flag and wakeup anyone waiting for the * page. * */ void vm_page_wakeup(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); KASSERT(m->oflags & VPO_BUSY, ("vm_page_wakeup: page not busy!!!")); m->oflags &= ~VPO_BUSY; vm_page_flash(m); } void vm_page_io_start(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); m->busy++; } void vm_page_io_finish(vm_page_t m) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); m->busy--; if (m->busy == 0) vm_page_flash(m); } /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). */ void vm_page_hold(vm_page_t mem) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); mem->hold_count++; } void vm_page_unhold(vm_page_t mem) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); --mem->hold_count; KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!")); if (mem->hold_count == 0 && VM_PAGE_INQUEUE2(mem, PQ_HOLD)) vm_page_free_toq(mem); } /* * vm_page_free: * * Free a page. */ void vm_page_free(vm_page_t m) { m->flags &= ~PG_ZERO; vm_page_free_toq(m); } /* * vm_page_free_zero: * * Free a page to the zerod-pages queue */ void vm_page_free_zero(vm_page_t m) { m->flags |= PG_ZERO; vm_page_free_toq(m); } /* * vm_page_sleep: * * Sleep and release the page queues lock. * * The object containing the given page must be locked. 
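 *
 * A typical caller that needs exclusive use of a page combines this
 * with the busy/wakeup primitives above, looking roughly like the
 * fragment below ("object" and "pindex" are assumed to be supplied by
 * the caller; illustrative only):
 */
#if 0
	vm_page_t m;

	VM_OBJECT_LOCK(object);
	while ((m = vm_page_lookup(object, pindex)) != NULL &&
	    ((m->oflags & VPO_BUSY) != 0 || m->busy != 0))
		vm_page_sleep(m, "pgbusy");	/* object lock is reacquired */
	if (m != NULL) {
		vm_page_busy(m);		/* gain exclusive access */
		/* ... operate on the page ... */
		vm_page_wakeup(m);		/* clear VPO_BUSY, wake waiters */
	}
	VM_OBJECT_UNLOCK(object);
#endif
/*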
*/ void vm_page_sleep(vm_page_t m, const char *msg) { VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (!mtx_owned(&vm_page_queue_mtx)) vm_page_lock_queues(); vm_page_flag_set(m, PG_REFERENCED); vm_page_unlock_queues(); /* * It's possible that while we sleep, the page will get * unbusied and freed. If we are holding the object * lock, we will assume we hold a reference to the object * such that even if m->object changes, we can re-lock * it. */ m->oflags |= VPO_WANTED; msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0); } /* * vm_page_dirty: * * make page all dirty */ void vm_page_dirty(vm_page_t m) { KASSERT(VM_PAGE_GETKNOWNQUEUE1(m) != PQ_CACHE, ("vm_page_dirty: page in cache!")); KASSERT(VM_PAGE_GETKNOWNQUEUE1(m) != PQ_FREE, ("vm_page_dirty: page is free!")); m->dirty = VM_PAGE_BITS_ALL; } /* * vm_page_splay: * * Implements Sleator and Tarjan's top-down splay algorithm. Returns * the vm_page containing the given pindex. If, however, that * pindex is not found in the vm_object, returns a vm_page that is * adjacent to the pindex, coming before or after it. */ vm_page_t vm_page_splay(vm_pindex_t pindex, vm_page_t root) { struct vm_page dummy; vm_page_t lefttreemax, righttreemin, y; if (root == NULL) return (root); lefttreemax = righttreemin = &dummy; for (;; root = y) { if (pindex < root->pindex) { if ((y = root->left) == NULL) break; if (pindex < y->pindex) { /* Rotate right. */ root->left = y->right; y->right = root; root = y; if ((y = root->left) == NULL) break; } /* Link into the new root's right tree. */ righttreemin->left = root; righttreemin = root; } else if (pindex > root->pindex) { if ((y = root->right) == NULL) break; if (pindex > y->pindex) { /* Rotate left. */ root->right = y->left; y->left = root; root = y; if ((y = root->right) == NULL) break; } /* Link into the new root's left tree. */ lefttreemax->right = root; lefttreemax = root; } else break; } /* Assemble the new root. */ lefttreemax->right = root->left; righttreemin->left = root->right; root->left = dummy.right; root->right = dummy.left; return (root); } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object and object list. * * The pagetables are not updated but will presumably fault the page * in if necessary, or if a kernel page the caller will at some point * enter the page into the kernel's pmap. We are not allowed to block * here so we *can't* do this anyway. * * The object and page must be locked. * This routine may not block. */ void vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { vm_page_t root; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (m->object != NULL) panic("vm_page_insert: page already inserted"); /* * Record the object/offset pair in this page */ m->object = object; m->pindex = pindex; /* * Now link into the object's ordered list of backed pages. */ root = object->root; if (root == NULL) { m->left = NULL; m->right = NULL; TAILQ_INSERT_TAIL(&object->memq, m, listq); } else { root = vm_page_splay(pindex, root); if (pindex < root->pindex) { m->left = root->left; m->right = root; root->left = NULL; TAILQ_INSERT_BEFORE(root, m, listq); } else if (pindex == root->pindex) panic("vm_page_insert: offset already allocated"); else { m->right = root->right; m->left = root; root->right = NULL; TAILQ_INSERT_AFTER(&object->memq, root, m, listq); } } object->root = m; object->generation++; /* * show that the object has one more resident page. */ object->resident_page_count++; /* * Hold the vnode until the last page is released. 
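 *
 * (Aside: the ordered memq maintained above sits alongside the splay
 * tree rooted at object->root; a self-contained model of the top-down
 * splay used by vm_page_splay(), on plain integer keys, is sketched
 * below for illustration.)
 */
#if 0
#include <stddef.h>

struct node {
	int		 key;
	struct node	*left;
	struct node	*right;
};

/*
 * Top-down splay (Sleator/Tarjan): bring the node whose key equals
 * "key", or a neighbouring node if it is absent, to the root.
 */
static struct node *
splay(int key, struct node *root)
{
	struct node dummy, *ltreemax, *rtreemin, *y;

	if (root == NULL)
		return (root);
	dummy.left = dummy.right = NULL;
	ltreemax = rtreemin = &dummy;
	for (;; root = y) {
		if (key < root->key) {
			if ((y = root->left) == NULL)
				break;
			if (key < y->key) {
				/* Rotate right. */
				root->left = y->right;
				y->right = root;
				root = y;
				if ((y = root->left) == NULL)
					break;
			}
			/* Link into the right tree. */
			rtreemin->left = root;
			rtreemin = root;
		} else if (key > root->key) {
			if ((y = root->right) == NULL)
				break;
			if (key > y->key) {
				/* Rotate left. */
				root->right = y->left;
				y->left = root;
				root = y;
				if ((y = root->right) == NULL)
					break;
			}
			/* Link into the left tree. */
			ltreemax->right = root;
			ltreemax = root;
		} else
			break;
	}
	/* Reassemble the three trees. */
	ltreemax->right = root->left;
	rtreemin->left = root->right;
	root->left = dummy.right;
	root->right = dummy.left;
	return (root);
}
#endif
/*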
*/ if (object->resident_page_count == 1 && object->type == OBJT_VNODE) vhold((struct vnode *)object->handle); /* * Since we are inserting a new and possibly dirty page, * update the object's OBJ_MIGHTBEDIRTY flag. */ if (m->flags & PG_WRITEABLE) vm_object_set_writeable_dirty(object); } /* * vm_page_remove: * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list, but do not invalidate/terminate * the backing store. * * The object and page must be locked. * The underlying pmap entry (if any) is NOT removed here. * This routine may not block. */ void vm_page_remove(vm_page_t m) { vm_object_t object; vm_page_t root; if ((object = m->object) == NULL) return; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if (m->oflags & VPO_BUSY) { m->oflags &= ~VPO_BUSY; vm_page_flash(m); } mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Now remove from the object's list of backed pages. */ if (m != object->root) vm_page_splay(m->pindex, object->root); if (m->left == NULL) root = m->right; else { root = vm_page_splay(m->pindex, m->left); root->right = m->right; } object->root = root; TAILQ_REMOVE(&object->memq, m, listq); /* * And show that the object has one fewer resident page. */ object->resident_page_count--; object->generation++; /* * The vnode may now be recycled. */ if (object->resident_page_count == 0 && object->type == OBJT_VNODE) vdrop((struct vnode *)object->handle); m->object = NULL; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. * This routine may not block. * This is a critical path routine */ vm_page_t vm_page_lookup(vm_object_t object, vm_pindex_t pindex) { vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); if ((m = object->root) != NULL && m->pindex != pindex) { m = vm_page_splay(pindex, m); if ((object->root = m)->pindex != pindex) m = NULL; } return (m); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. * This routine may not block. * * Note: swap associated with the page must be invalidated by the move. We * have to do this for several reasons: (1) we aren't freeing the * page, (2) we are dirtying the page, (3) the VM system is probably * moving the page from object A to B, and will then later move * the backing store from A to B and we can't have a conflict. * * Note: we *always* dirty the page. It is necessary both for the * fact that we moved it, and because we may be invalidating * swap. If the page is on the cache, we have to deactivate it * or vm_page_dirty() will panic. Dirty pages are not allowed * on the cache. */ void vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) { vm_page_remove(m); vm_page_insert(m, new_object, new_pindex); if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) vm_page_deactivate(m); vm_page_dirty(m); } /* * vm_page_select_cache: * * Move a page of the given color from the cache queue to the free * queue. As pages might be found, but are not applicable, they are * deactivated. * * This routine may not block. 
*/ vm_page_t vm_page_select_cache(int color) { vm_object_t object; vm_page_t m; boolean_t was_trylocked; mtx_assert(&vm_page_queue_mtx, MA_OWNED); while ((m = vm_pageq_find(PQ_CACHE, color, FALSE)) != NULL) { KASSERT(m->dirty == 0, ("Found dirty cache page %p", m)); KASSERT(!pmap_page_is_mapped(m), ("Found mapped cache page %p", m)); KASSERT((m->flags & PG_UNMANAGED) == 0, ("Found unmanaged cache page %p", m)); KASSERT(m->wire_count == 0, ("Found wired cache page %p", m)); if (m->hold_count == 0 && (object = m->object, (was_trylocked = VM_OBJECT_TRYLOCK(object)) || VM_OBJECT_LOCKED(object))) { KASSERT((m->oflags & VPO_BUSY) == 0 && m->busy == 0, ("Found busy cache page %p", m)); vm_page_free(m); if (was_trylocked) VM_OBJECT_UNLOCK(object); break; } vm_page_deactivate(m); } return (m); } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request * VM_ALLOC_ZERO zero page * * This routine may not block. * * Additional special handling is required when called from an * interrupt (VM_ALLOC_INTERRUPT). We are not allowed to mess with * the page cache in this case. */ vm_page_t vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req) { vm_page_t m = NULL; int color, flags, page_req; page_req = req & VM_ALLOC_CLASS_MASK; KASSERT(curthread->td_intr_nesting_level == 0 || page_req == VM_ALLOC_INTERRUPT, ("vm_page_alloc(NORMAL|SYSTEM) in interrupt context")); if ((req & VM_ALLOC_NOOBJ) == 0) { KASSERT(object != NULL, ("vm_page_alloc: NULL object.")); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); color = (pindex + object->pg_color) & PQ_COLORMASK; } else color = pindex & PQ_COLORMASK; /* * The pager is allowed to eat deeper into the free page list. */ if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; loop: mtx_lock(&vm_page_queue_free_mtx); - if (cnt.v_free_count > cnt.v_free_reserved || + if (VMCNT_GET(free_count) > VMCNT_GET(free_reserved) || (page_req == VM_ALLOC_SYSTEM && - cnt.v_cache_count == 0 && - cnt.v_free_count > cnt.v_interrupt_free_min) || - (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) { + VMCNT_GET(cache_count) == 0 && + VMCNT_GET(free_count) > VMCNT_GET(interrupt_free_min)) || + (page_req == VM_ALLOC_INTERRUPT && VMCNT_GET(free_count) > 0)) { /* * Allocate from the free queue if the number of free pages * exceeds the minimum for the request class. */ m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0); } else if (page_req != VM_ALLOC_INTERRUPT) { mtx_unlock(&vm_page_queue_free_mtx); /* * Allocatable from cache (non-interrupt only). On success, * we must free the page and try again, thus ensuring that * cnt.v_*_free_min counters are replenished. 
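 *
 * A typical caller of vm_page_alloc() pairs the allocation classes
 * described above with a wait-and-retry loop, roughly as in the
 * fragment below ("object" and "pindex" are assumed to be supplied by
 * the caller with the object lock held; illustrative only):
 */
#if 0
	vm_page_t m;

	while ((m = vm_page_alloc(object, pindex,
	    VM_ALLOC_NORMAL | VM_ALLOC_ZERO)) == NULL) {
		VM_OBJECT_UNLOCK(object);
		VM_WAIT;	/* sleep until the pagedaemon frees memory */
		VM_OBJECT_LOCK(object);
		/* A real caller must recheck for a racing insertion here. */
	}
	if ((m->flags & PG_ZERO) == 0)
		pmap_zero_page(m);	/* VM_ALLOC_ZERO only prefers a pre-zeroed page */
	/* ... initialize the page contents ... */
	vm_page_wakeup(m);		/* clear VPO_BUSY once the page is ready */
#endif
/*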
*/ vm_page_lock_queues(); if ((m = vm_page_select_cache(color)) == NULL) { - KASSERT(cnt.v_cache_count == 0, + KASSERT(VMCNT_GET(cache_count) == 0, ("vm_page_alloc: cache queue is missing %d pages", - cnt.v_cache_count)); + VMCNT_GET(cache_count))); vm_page_unlock_queues(); atomic_add_int(&vm_pageout_deficit, 1); pagedaemon_wakeup(); if (page_req != VM_ALLOC_SYSTEM) return (NULL); mtx_lock(&vm_page_queue_free_mtx); - if (cnt.v_free_count <= cnt.v_interrupt_free_min) { + if (VMCNT_GET(free_count) <= + VMCNT_GET(interrupt_free_min)) { mtx_unlock(&vm_page_queue_free_mtx); return (NULL); } m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0); } else { vm_page_unlock_queues(); goto loop; } } else { /* * Not allocatable from cache from interrupt, give up. */ mtx_unlock(&vm_page_queue_free_mtx); atomic_add_int(&vm_pageout_deficit, 1); pagedaemon_wakeup(); return (NULL); } /* * At this point we had better have found a good page. */ KASSERT( m != NULL, ("vm_page_alloc(): missing page on free queue") ); /* * Remove from free queue */ vm_pageq_remove_nowakeup(m); /* * Initialize structure. Only the PG_ZERO flag is inherited. */ flags = 0; if (m->flags & PG_ZERO) { vm_page_zero_count--; if (req & VM_ALLOC_ZERO) flags = PG_ZERO; } if (object != NULL && object->type == OBJT_PHYS) flags |= PG_UNMANAGED; m->flags = flags; if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ)) m->oflags = 0; else m->oflags = VPO_BUSY; if (req & VM_ALLOC_WIRED) { - atomic_add_int(&cnt.v_wire_count, 1); + VMCNT_ADD(wire_count, 1); m->wire_count = 1; } else m->wire_count = 0; m->hold_count = 0; m->act_count = 0; m->busy = 0; m->valid = 0; KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m)); mtx_unlock(&vm_page_queue_free_mtx); if ((req & VM_ALLOC_NOOBJ) == 0) vm_page_insert(m, object, pindex); else m->pindex = pindex; /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ if (vm_paging_needed()) pagedaemon_wakeup(); return (m); } /* * vm_wait: (also see VM_WAIT macro) * * Block until free pages are available for allocation * - Called in various places before memory allocations. */ void vm_wait(void) { mtx_lock(&vm_page_queue_free_mtx); if (curproc == pageproc) { vm_pageout_pages_needed = 1; msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx, PDROP | PSWP, "VMWait", 0); } else { if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } - msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM, - "vmwait", 0); + msleep(VMCNT_PTR(free_count), &vm_page_queue_free_mtx, PDROP | + PVM, "vmwait", 0); } } /* * vm_waitpfault: (also see VM_WAITPFAULT macro) * * Block until free pages are available for allocation * - Called only in vm_fault so that processes page faulting * can be easily tracked. * - Sleeps at a lower priority than vm_wait() so that vm_wait()ing * processes will be able to grab memory first. Do not change * this balance without careful testing first. */ void vm_waitpfault(void) { mtx_lock(&vm_page_queue_free_mtx); if (!vm_pages_needed) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } - msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER, + msleep(VMCNT_PTR(free_count), &vm_page_queue_free_mtx, PDROP | PUSER, "pfault", 0); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * Ensure that act_count is at least ACT_INIT but do not otherwise * mess with it. * * The page queues must be locked. * This routine may not block. 
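 *
 * (On the VMCNT_*() accessors used in the conversions above and
 * throughout this revision: they replace direct access to the global
 * vmmeter "cnt" structure.  As a rough sketch of the intent, not the
 * authoritative definitions, they can be read as:)
 */
#if 0
#define	VMCNT_GET(name)		(cnt.v_ ## name)
#define	VMCNT_SET(name, val)	(cnt.v_ ## name = (val))
#define	VMCNT_ADD(name, val)	atomic_add_int(&cnt.v_ ## name, (val))
#define	VMCNT_DEC(name, val)	atomic_subtract_int(&cnt.v_ ## name, (val))
#define	VMCNT_PTR(name)		(&cnt.v_ ## name)
#endif
/*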
*/ void vm_page_activate(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) { if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) - cnt.v_reactivated++; + VMCNT_ADD(reactivated, 1); vm_pageq_remove(m); if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; vm_pageq_enqueue(PQ_ACTIVE, m); } } else { if (m->act_count < ACT_INIT) m->act_count = ACT_INIT; } } /* * vm_page_free_wakeup: * * Helper routine for vm_page_free_toq() and vm_page_cache(). This * routine is called when a page has been added to the cache or free * queues. * * The page queues must be locked. * This routine may not block. */ static inline void vm_page_free_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed && - cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) { + VMCNT_GET(cache_count) + VMCNT_GET(free_count) >= + VMCNT_GET(pageout_free_min)) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if (vm_pages_needed && !vm_page_count_min()) { vm_pages_needed = 0; - wakeup(&cnt.v_free_count); + wakeup(VMCNT_PTR(free_count)); } } /* * vm_page_free_toq: * * Returns the given page to the PQ_FREE list, * disassociating it with any VM object. * * Object and page must be locked prior to entry. * This routine may not block. */ void vm_page_free_toq(vm_page_t m) { struct vpgqueues *pq; if (VM_PAGE_GETQUEUE(m) != PQ_NONE) mtx_assert(&vm_page_queue_mtx, MA_OWNED); KASSERT(!pmap_page_is_mapped(m), ("vm_page_free_toq: freeing mapped page %p", m)); - cnt.v_tfree++; + VMCNT_ADD(tfree, 1); if (m->busy || VM_PAGE_INQUEUE1(m, PQ_FREE)) { printf( "vm_page_free: pindex(%lu), busy(%d), VPO_BUSY(%d), hold(%d)\n", (u_long)m->pindex, m->busy, (m->oflags & VPO_BUSY) ? 1 : 0, m->hold_count); if (VM_PAGE_INQUEUE1(m, PQ_FREE)) panic("vm_page_free: freeing free page"); else panic("vm_page_free: freeing busy page"); } /* * unqueue, then remove page. Note that we cannot destroy * the page here because we do not want to call the pager's * callback routine until after we've put the page on the * appropriate free queue. */ vm_pageq_remove_nowakeup(m); vm_page_remove(m); /* * If fictitious remove object association and * return, otherwise delay object association removal. */ if ((m->flags & PG_FICTITIOUS) != 0) { return; } m->valid = 0; vm_page_undirty(m); if (m->wire_count != 0) { if (m->wire_count > 1) { panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx", m->wire_count, (long)m->pindex); } panic("vm_page_free: freeing wired page"); } if (m->hold_count != 0) { m->flags &= ~PG_ZERO; vm_pageq_enqueue(PQ_HOLD, m); return; } VM_PAGE_SETQUEUE1(m, PQ_FREE); mtx_lock(&vm_page_queue_free_mtx); pq = &vm_page_queues[VM_PAGE_GETQUEUE(m)]; pq->lcnt++; ++(*pq->cnt); /* * Put zero'd pages on the end ( where we look for zero'd pages * first ) and non-zerod pages at the head. */ if (m->flags & PG_ZERO) { TAILQ_INSERT_TAIL(&pq->pl, m, pageq); ++vm_page_zero_count; } else { TAILQ_INSERT_HEAD(&pq->pl, m, pageq); vm_page_zero_idle_wakeup(); } vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. 
* This routine may not block. */ void vm_page_wire(vm_page_t m) { /* * Only bump the wire statistics if the page is not already wired, * and only unqueue the page if it is on some queue (if it is unmanaged * it is already off the queues). */ mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->flags & PG_FICTITIOUS) return; if (m->wire_count == 0) { if ((m->flags & PG_UNMANAGED) == 0) vm_pageq_remove(m); - atomic_add_int(&cnt.v_wire_count, 1); + VMCNT_ADD(wire_count, 1); } m->wire_count++; KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m)); } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * Many pages placed on the inactive queue should actually go * into the cache, but it is difficult to figure out which. What * we do instead, if the inactive target is well met, is to put * clean pages at the head of the inactive queue instead of the tail. * This will cause them to be moved to the cache more quickly and * if not actively re-referenced, freed more quickly. If we just * stick these pages at the end of the inactive queue, heavy filesystem * meta-data accesses can cause an unnecessary paging load on memory bound * processes. This optimization causes one-time-use metadata to be * reused more quickly. * * BUT, if we are in a low-memory situation we have no choice but to * put clean pages on the cache queue. * * A number of routines use vm_page_unwire() to guarantee that the page * will go into either the inactive or active queues, and will NEVER * be placed in the cache - for example, just after dirtying a page. * dirty pages in the cache are not allowed. * * The page queues must be locked. * This routine may not block. */ void vm_page_unwire(vm_page_t m, int activate) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->flags & PG_FICTITIOUS) return; if (m->wire_count > 0) { m->wire_count--; if (m->wire_count == 0) { - atomic_subtract_int(&cnt.v_wire_count, 1); + VMCNT_DEC(wire_count, 1); if (m->flags & PG_UNMANAGED) { ; } else if (activate) vm_pageq_enqueue(PQ_ACTIVE, m); else { vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_enqueue(PQ_INACTIVE, m); } } } else { panic("vm_page_unwire: invalid wire count: %d", m->wire_count); } } /* * Move the specified page to the inactive queue. If the page has * any associated swap, the swap is deallocated. * * Normally athead is 0 resulting in LRU operation. athead is set * to 1 if we want this page to be 'as if it were placed in the cache', * except without unmapping it from the process address space. * * This routine may not block. */ static inline void _vm_page_deactivate(vm_page_t m, int athead) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); /* * Ignore if already inactive. 
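 *
 * The athead argument selects between the two insertions that follow:
 * athead == 0 appends to the tail of PQ_INACTIVE for normal LRU aging,
 * while athead == 1 inserts at the head so the page is reclaimed almost as
 * quickly as a cached page would be, yet keeps its mappings intact.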
*/ if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE)) return; if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) { if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) - cnt.v_reactivated++; + VMCNT_ADD(reactivated, 1); vm_page_flag_clear(m, PG_WINATCFLS); vm_pageq_remove(m); if (athead) TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); else TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq); VM_PAGE_SETQUEUE2(m, PQ_INACTIVE); vm_page_queues[PQ_INACTIVE].lcnt++; - cnt.v_inactive_count++; + VMCNT_ADD(inactive_count, 1); } } void vm_page_deactivate(vm_page_t m) { _vm_page_deactivate(m, 0); } /* * vm_page_try_to_cache: * * Returns 0 on failure, 1 on success */ int vm_page_try_to_cache(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->dirty || m->hold_count || m->busy || m->wire_count || (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) { return (0); } pmap_remove_all(m); if (m->dirty) return (0); vm_page_cache(m); return (1); } /* * vm_page_try_to_free() * * Attempt to free the page. If we cannot free it, we do nothing. * 1 is returned on success, 0 on failure. */ int vm_page_try_to_free(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->object != NULL) VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->dirty || m->hold_count || m->busy || m->wire_count || (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED)) { return (0); } pmap_remove_all(m); if (m->dirty) return (0); vm_page_free(m); return (1); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). * * This routine may not block. */ void vm_page_cache(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if ((m->flags & PG_UNMANAGED) || (m->oflags & VPO_BUSY) || m->busy || m->hold_count || m->wire_count) { printf("vm_page_cache: attempting to cache busy page\n"); return; } if (VM_PAGE_INQUEUE1(m, PQ_CACHE)) return; /* * Remove all pmaps and indicate that the page is not * writeable or mapped. */ pmap_remove_all(m); if (m->dirty != 0) { panic("vm_page_cache: caching a dirty page, pindex: %ld", (long)m->pindex); } vm_pageq_remove_nowakeup(m); vm_pageq_enqueue(PQ_CACHE + m->pc, m); mtx_lock(&vm_page_queue_free_mtx); vm_page_free_wakeup(); mtx_unlock(&vm_page_queue_free_mtx); } /* * vm_page_dontneed * * Cache, deactivate, or do nothing as appropriate. This routine * is typically used by madvise() MADV_DONTNEED. * * Generally speaking we want to move the page into the cache so * it gets reused quickly. However, this can result in a silly syndrome * due to the page recycling too quickly. Small objects will not be * fully cached. On the otherhand, if we move the page to the inactive * queue we wind up with a problem whereby very large objects * unnecessarily blow away our inactive and cache queues. * * The solution is to move the pages based on a fixed weighting. We * either leave them alone, deactivate them, or move them to the cache, * where moving them to the cache has the highest weighting. * By forcing some pages into other queues we eventually force the * system to balance the queues, potentially recovering other unrelated * space from active. The idea is to not force this to happen too * often. 
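 *
 * Worked out against the dnw bit masks used below: over any window of 512
 * consecutive calls, (dnw & 0x01F0) == 0 holds for 16 of them, so about
 * 1 call in 32 leaves the page alone; (dnw & 0x0070) == 0 holds for a
 * further 48 (64 minus the 16 already counted), so 3 in 32 deactivate a
 * clean page normally; the remaining 28 in 32 push it to the head of the
 * inactive queue, the cache-like treatment described above.  A dirty page
 * that is not left alone always takes the normal deactivation path.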
*/ void vm_page_dontneed(vm_page_t m) { static int dnweight; int dnw; int head; mtx_assert(&vm_page_queue_mtx, MA_OWNED); dnw = ++dnweight; /* * occassionally leave the page alone */ if ((dnw & 0x01F0) == 0 || VM_PAGE_INQUEUE2(m, PQ_INACTIVE) || VM_PAGE_INQUEUE1(m, PQ_CACHE) ) { if (m->act_count >= ACT_INIT) --m->act_count; return; } if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty || (dnw & 0x0070) == 0) { /* * Deactivate the page 3 times out of 32. */ head = 0; } else { /* * Cache the page 28 times out of every 32. Note that * the page is deactivated instead of cached, but placed * at the head of the queue instead of the tail. */ head = 1; } _vm_page_deactivate(m, head); } /* * Grab a page, waiting until we are waken up due to the page * changing state. We keep on waiting, if the page continues * to be in the object. If the page doesn't exist, first allocate it * and then conditionally zero it. * * This routine may block. */ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) { vm_page_t m; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); retrylookup: if ((m = vm_page_lookup(object, pindex)) != NULL) { if (vm_page_sleep_if_busy(m, TRUE, "pgrbwt")) { if ((allocflags & VM_ALLOC_RETRY) == 0) return (NULL); goto retrylookup; } else { if ((allocflags & VM_ALLOC_WIRED) != 0) { vm_page_lock_queues(); vm_page_wire(m); vm_page_unlock_queues(); } if ((allocflags & VM_ALLOC_NOBUSY) == 0) vm_page_busy(m); return (m); } } m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY); if (m == NULL) { VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); if ((allocflags & VM_ALLOC_RETRY) == 0) return (NULL); goto retrylookup; } if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0) pmap_zero_page(m); return (m); } /* * Mapping function for valid bits or for dirty bits in * a page. May not block. * * Inputs are required to range within a page. */ inline int vm_page_bits(int base, int size) { int first_bit; int last_bit; KASSERT( base + size <= PAGE_SIZE, ("vm_page_bits: illegal base/size %d/%d", base, size) ); if (size == 0) /* handle degenerate case */ return (0); first_bit = base >> DEV_BSHIFT; last_bit = (base + size - 1) >> DEV_BSHIFT; return ((2 << last_bit) - (1 << first_bit)); } /* * vm_page_set_validclean: * * Sets portions of a page valid and clean. The arguments are expected * to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive * of any partial chunks touched by the range. The invalid portion of * such chunks will be zero'd. * * This routine may not block. * * (base + size) must be less then or equal to PAGE_SIZE. */ void vm_page_set_validclean(vm_page_t m, int base, int size) { int pagebits; int frag; int endoff; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (size == 0) /* handle degenerate case */ return; /* * If the base is not DEV_BSIZE aligned and the valid * bit is clear, we have to zero out a portion of the * first block. */ if ((frag = base & ~(DEV_BSIZE - 1)) != base && (m->valid & (1 << (base >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, frag, base - frag); /* * If the ending offset is not DEV_BSIZE aligned and the * valid bit is clear, we have to zero out a portion of * the last block. */ endoff = base + size; if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff && (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0) pmap_zero_page_area(m, endoff, DEV_BSIZE - (endoff & (DEV_BSIZE - 1))); /* * Set valid, clear dirty bits. 
If validating the entire * page we can safely clear the pmap modify bit. We also * use this opportunity to clear the VPO_NOSYNC flag. If a process * takes a write fault on a MAP_NOSYNC memory area the flag will * be set again. * * We set valid bits inclusive of any overlap, but we can only * clear dirty bits for DEV_BSIZE chunks that are fully within * the range. */ pagebits = vm_page_bits(base, size); m->valid |= pagebits; #if 0 /* NOT YET */ if ((frag = base & (DEV_BSIZE - 1)) != 0) { frag = DEV_BSIZE - frag; base += frag; size -= frag; if (size < 0) size = 0; } pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1)); #endif m->dirty &= ~pagebits; if (base == 0 && size == PAGE_SIZE) { pmap_clear_modify(m); m->oflags &= ~VPO_NOSYNC; } } void vm_page_clear_dirty(vm_page_t m, int base, int size) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->dirty &= ~vm_page_bits(base, size); } /* * vm_page_set_invalid: * * Invalidates DEV_BSIZE'd chunks within a page. Both the * valid and dirty bits for the effected areas are cleared. * * May not block. */ void vm_page_set_invalid(vm_page_t m, int base, int size) { int bits; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); bits = vm_page_bits(base, size); mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->valid == VM_PAGE_BITS_ALL && bits != 0) pmap_remove_all(m); m->valid &= ~bits; m->dirty &= ~bits; m->object->generation++; } /* * vm_page_zero_invalid() * * The kernel assumes that the invalid portions of a page contain * garbage, but such pages can be mapped into memory by user code. * When this occurs, we must zero out the non-valid portions of the * page so user code sees what it expects. * * Pages are most often semi-valid when the end of a file is mapped * into memory and the file's size is not page aligned. */ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid) { int b; int i; VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); /* * Scan the valid bits looking for invalid sections that * must be zerod. Invalid sub-DEV_BSIZE'd areas ( where the * valid bit may be set ) have already been zerod by * vm_page_set_validclean(). */ for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) { if (i == (PAGE_SIZE / DEV_BSIZE) || (m->valid & (1 << i)) ) { if (i > b) { pmap_zero_page_area(m, b << DEV_BSHIFT, (i - b) << DEV_BSHIFT); } b = i + 1; } } /* * setvalid is TRUE when we can safely set the zero'd areas * as being valid. We can do this if there are no cache consistancy * issues. e.g. it is ok to do with UFS, but not ok to do with NFS. */ if (setvalid) m->valid = VM_PAGE_BITS_ALL; } /* * vm_page_is_valid: * * Is (partial) page valid? Note that the case where size == 0 * will return FALSE in the degenerate case where the page is * entirely invalid, and TRUE otherwise. * * May not block. */ int vm_page_is_valid(vm_page_t m, int base, int size) { int bits = vm_page_bits(base, size); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } /* * update dirty bits from pmap/mmu. May not block. 
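 *
 * The valid and dirty masks tested here and manipulated by the routines
 * above are built by vm_page_bits(), one bit per DEV_BSIZE chunk.  For
 * example, assuming DEV_BSIZE is 512 (DEV_BSHIFT 9) and PAGE_SIZE is 4096:
 *
 *	vm_page_bits(0, 4096)    == 0xff	(bits 0-7, the whole page)
 *	vm_page_bits(1024, 1024) == 0x0c	(bits 2-3, bytes 1024-2047)
 *
 * so vm_page_is_valid(m, 1024, 1024) reduces to checking that
 * (m->valid & 0x0c) == 0x0c.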
*/ void vm_page_test_dirty(vm_page_t m) { if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(m)) { vm_page_dirty(m); } } int so_zerocp_fullpage = 0; void vm_page_cowfault(vm_page_t m) { vm_page_t mnew; vm_object_t object; vm_pindex_t pindex; object = m->object; pindex = m->pindex; retry_alloc: pmap_remove_all(m); vm_page_remove(m); mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY); if (mnew == NULL) { vm_page_insert(m, object, pindex); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); VM_WAIT; VM_OBJECT_LOCK(object); vm_page_lock_queues(); goto retry_alloc; } if (m->cow == 0) { /* * check to see if we raced with an xmit complete when * waiting to allocate a page. If so, put things back * the way they were */ vm_page_free(mnew); vm_page_insert(m, object, pindex); } else { /* clear COW & copy page */ if (!so_zerocp_fullpage) pmap_copy_page(m, mnew); mnew->valid = VM_PAGE_BITS_ALL; vm_page_dirty(mnew); mnew->wire_count = m->wire_count - m->cow; m->wire_count = m->cow; } } void vm_page_cowclear(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); if (m->cow) { m->cow--; /* * let vm_fault add back write permission lazily */ } /* * sf_buf_free() will free the page, so we needn't do it here */ } void vm_page_cowsetup(vm_page_t m) { mtx_assert(&vm_page_queue_mtx, MA_OWNED); m->cow++; pmap_remove_write(m); } #include "opt_ddb.h" #ifdef DDB #include #include DB_SHOW_COMMAND(page, vm_page_print_page_info) { - db_printf("cnt.v_free_count: %d\n", cnt.v_free_count); - db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); - db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); - db_printf("cnt.v_active_count: %d\n", cnt.v_active_count); - db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); - db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); - db_printf("cnt.v_free_min: %d\n", cnt.v_free_min); - db_printf("cnt.v_free_target: %d\n", cnt.v_free_target); - db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); - db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); + db_printf("cnt.v_free_count: %d\n", VMCNT_GET(free_count)); + db_printf("cnt.v_cache_count: %d\n", VMCNT_GET(cache_count)); + db_printf("cnt.v_inactive_count: %d\n", VMCNT_GET(inactive_count)); + db_printf("cnt.v_active_count: %d\n", VMCNT_GET(active_count)); + db_printf("cnt.v_wire_count: %d\n", VMCNT_GET(wire_count)); + db_printf("cnt.v_free_reserved: %d\n", VMCNT_GET(free_reserved)); + db_printf("cnt.v_free_min: %d\n", VMCNT_GET(free_min)); + db_printf("cnt.v_free_target: %d\n", VMCNT_GET(free_target)); + db_printf("cnt.v_cache_min: %d\n", VMCNT_GET(cache_min)); + db_printf("cnt.v_inactive_target: %d\n", VMCNT_GET(inactive_target)); } DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info) { int i; db_printf("PQ_FREE:"); for (i = 0; i < PQ_NUMCOLORS; i++) { db_printf(" %d", vm_page_queues[PQ_FREE + i].lcnt); } db_printf("\n"); db_printf("PQ_CACHE:"); for (i = 0; i < PQ_NUMCOLORS; i++) { db_printf(" %d", vm_page_queues[PQ_CACHE + i].lcnt); } db_printf("\n"); db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n", vm_page_queues[PQ_ACTIVE].lcnt, vm_page_queues[PQ_INACTIVE].lcnt); } #endif /* DDB */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 169666) +++ head/sys/vm/vm_pageout.c (revision 169667) @@ -1,1637 +1,1637 @@ /*- * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. 
* Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static int vm_pageout_clean(vm_page_t); static void vm_pageout_scan(int pass); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon(void); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) #endif int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit; /* Estimated number of pages deficit */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif static int vm_max_launder = 32; static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; static int vm_pageout_full_stats_interval = 0; static int vm_pageout_algorithm=0; static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; #else static int vm_swap_enabled=1; static int vm_swap_idle_enabled=0; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt"); SYSCTL_INT(_vm, OID_AUTO, max_launder, CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, ""); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) static void vm_pageout_map_deactivate_pages(vm_map_t, long); static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long); static void vm_req_vmdaemon(void); #endif static void vm_pageout_page_stats(void); /* * vm_pageout_fallback_object_lock: * * 
Lock vm object currently associated with `m'. VM_OBJECT_TRYLOCK is * known to have failed and page queue must be either PQ_ACTIVE or * PQ_INACTIVE. To avoid lock order violation, unlock the page queues * while locking the vm object. Use marker page to detect page queue * changes and maintain notion of next page on page queue. Return * TRUE if no changes were detected, FALSE otherwise. vm object is * locked on return. * * This function depends on both the lock portion of struct vm_object * and normal struct vm_page being type stable. */ static boolean_t vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next) { struct vm_page marker; boolean_t unchanged; u_short queue; vm_object_t object; /* * Initialize our marker */ bzero(&marker, sizeof(marker)); marker.flags = PG_FICTITIOUS | PG_MARKER; marker.oflags = VPO_BUSY; marker.queue = m->queue; marker.wire_count = 1; queue = m->queue; object = m->object; TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl, m, &marker, pageq); vm_page_unlock_queues(); VM_OBJECT_LOCK(object); vm_page_lock_queues(); /* Page queue might have changed. */ *next = TAILQ_NEXT(&marker, pageq); unchanged = (m->queue == queue && m->object == object && &marker == TAILQ_NEXT(m, pageq)); TAILQ_REMOVE(&vm_page_queues[queue].pl, &marker, pageq); return (unchanged); } /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. */ static int vm_pageout_clean(m) vm_page_t m; { vm_object_t object; vm_page_t mc[2*vm_pageout_page_count]; int pageout_count; int ib, is, page_base; vm_pindex_t pindex = m->pindex; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED); /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ /* * Don't mess with the page if it's busy, held, or special */ if ((m->hold_count != 0) || ((m->busy != 0) || (m->oflags & VPO_BUSY) || (m->flags & PG_UNMANAGED))) { return 0; } mc[vm_pageout_page_count] = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying * align the clusters (which leave sporatic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. 
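 *
 * Concretely, with the default vm_pageout_page_count of 16 the mc[] array
 * has 32 slots and the passed-in page sits at index 16 (page_base).  The
 * reverse scan fills slots 15, 14, ... by decrementing page_base, the
 * forward scan fills 17, 18, ... and the final vm_pageout_flush() call
 * covers mc[page_base .. page_base + pageout_count - 1].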
*/ object = m->object; more: while (ib && pageout_count < vm_pageout_page_count) { vm_page_t p; if (ib > pindex) { ib = 0; break; } if ((p = vm_page_lookup(object, pindex - ib)) == NULL) { ib = 0; break; } if (VM_PAGE_INQUEUE1(p, PQ_CACHE) || (p->oflags & VPO_BUSY) || p->busy || (p->flags & PG_UNMANAGED)) { ib = 0; break; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0 || p->queue != PQ_INACTIVE || p->wire_count != 0 || /* may be held by buf cache */ p->hold_count != 0) { /* may be undergoing I/O */ ib = 0; break; } mc[--page_base] = p; ++pageout_count; ++ib; /* * alignment boundry, stop here and switch directions. Do * not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { vm_page_t p; if ((p = vm_page_lookup(object, pindex + is)) == NULL) break; if (VM_PAGE_INQUEUE1(p, PQ_CACHE) || (p->oflags & VPO_BUSY) || p->busy || (p->flags & PG_UNMANAGED)) { break; } vm_page_test_dirty(p); if ((p->dirty & p->valid) == 0 || p->queue != PQ_INACTIVE || p->wire_count != 0 || /* may be held by buf cache */ p->hold_count != 0) { /* may be undergoing I/O */ break; } mc[page_base + pageout_count] = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past a page boundry. This catches boundry * conditions. */ if (ib && pageout_count < vm_pageout_page_count) goto more; /* * we allow reads during pageouts... */ return (vm_pageout_flush(&mc[page_base], pageout_count, 0)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i; mtx_assert(&vm_page_queue_mtx, MA_OWNED); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); vm_page_io_start(mc[i]); pmap_remove_write(mc[i]); } vm_page_unlock_queues(); vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)), pageout_status); vm_page_lock_queues(); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT((mt->flags & PG_WRITEABLE) == 0, ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ pmap_clear_modify(mt); vm_page_undirty(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). 
*/ vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); if (vm_page_count_severe()) vm_page_try_to_cache(mt); } } return numpagedout; } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. */ static void vm_pageout_object_deactivate_pages(pmap, first_object, desired) pmap_t pmap; vm_object_t first_object; long desired; { vm_object_t backing_object, object; vm_page_t p, next; int actcount, rcount, remove_mode; VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED); if (first_object->type == OBJT_DEVICE || first_object->type == OBJT_PHYS) return; for (object = first_object;; object = backing_object) { if (pmap_resident_count(pmap) <= desired) goto unlock_return; if (object->paging_in_progress) goto unlock_return; remove_mode = 0; if (object->shadow_count > 1) remove_mode = 1; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = TAILQ_FIRST(&object->memq); vm_page_lock_queues(); while (p && (rcount-- > 0)) { if (pmap_resident_count(pmap) <= desired) { vm_page_unlock_queues(); goto unlock_return; } next = TAILQ_NEXT(p, listq); - cnt.v_pdpages++; + VMCNT_ADD(pdpages, 1); if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || (p->oflags & VPO_BUSY) || (p->flags & PG_UNMANAGED) || !pmap_page_exists_quick(pmap, p)) { p = next; continue; } actcount = pmap_ts_referenced(p); if (actcount) { vm_page_flag_set(p, PG_REFERENCED); } else if (p->flags & PG_REFERENCED) { actcount = 1; } if ((p->queue != PQ_ACTIVE) && (p->flags & PG_REFERENCED)) { vm_page_activate(p); p->act_count += actcount; vm_page_flag_clear(p, PG_REFERENCED); } else if (p->queue == PQ_ACTIVE) { if ((p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) { pmap_remove_all(p); vm_page_deactivate(p); } else { vm_pageq_requeue(p); } } else { vm_page_activate(p); vm_page_flag_clear(p, PG_REFERENCED); if (p->act_count < (ACT_MAX - ACT_ADVANCE)) p->act_count += ACT_ADVANCE; vm_pageq_requeue(p); } } else if (p->queue == PQ_INACTIVE) { pmap_remove_all(p); } p = next; } vm_page_unlock_queues(); if ((backing_object = object->backing_object) == NULL) goto unlock_return; VM_OBJECT_LOCK(backing_object); if (object != first_object) VM_OBJECT_UNLOCK(object); } unlock_return: if (object != first_object) VM_OBJECT_UNLOCK(object); } /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; long desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; int nothingwired; if (!vm_map_trylock(map)) return; bigobj = NULL; nothingwired = TRUE; /* * first, search out the biggest object, and try to free pages from * that. 
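 *
 * Both this pass and the second pass further down stop as soon as
 * pmap_resident_count() for the map's pmap drops to the caller's "desired"
 * value; a call with desired == 0 against a map with nothing wired also
 * removes every mapping at the end so the page table pages themselves can
 * be reclaimed.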
*/ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) { if (obj->shadow_count <= 1 && (bigobj == NULL || bigobj->resident_page_count < obj->resident_page_count)) { if (bigobj != NULL) VM_OBJECT_UNLOCK(bigobj); bigobj = obj; } else VM_OBJECT_UNLOCK(obj); } } if (tmpe->wired_count > 0) nothingwired = FALSE; tmpe = tmpe->next; } if (bigobj != NULL) { vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired); VM_OBJECT_UNLOCK(bigobj); } /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj != NULL) { VM_OBJECT_LOCK(obj); vm_pageout_object_deactivate_pages(map->pmap, obj, desired); VM_OBJECT_UNLOCK(obj); } } tmpe = tmpe->next; } /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0 && nothingwired) { pmap_remove(vm_map_pmap(map), vm_map_min(map), vm_map_max(map)); } vm_map_unlock(map); } #endif /* !defined(NO_SWAPPING) */ /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static void vm_pageout_scan(int pass) { vm_page_t m, next; struct vm_page marker; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; struct proc *p, *bigproc; struct thread *td; vm_offset_t size, bigsize; vm_object_t object; int actcount, cache_cur, cache_first_failure; static int cache_last_free; int vnodes_skipped = 0; int maxlaunder; mtx_lock(&Giant); /* * Decrease registered cache sizes. */ EVENTHANDLER_INVOKE(vm_lowmem, 0); /* * We do this explicitly after the caches have been drained above. */ uma_reclaim(); addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit); /* * Calculate the number of pages we want to either free or move * to the cache. */ page_shortage = vm_paging_target() + addl_page_shortage_init; /* * Initialize our marker */ bzero(&marker, sizeof(marker)); marker.flags = PG_FICTITIOUS | PG_MARKER; marker.oflags = VPO_BUSY; marker.queue = PQ_INACTIVE; marker.wire_count = 1; /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. Note that m->act_count * is not used to form decisions for the inactive queue, only for the * active queue. * * maxlaunder limits the number of dirty pages we flush per scan. * For most systems a smaller value (16 or 32) is more robust under * extreme memory and disk pressure because any unnecessary writes * to disk can result in extreme performance degredation. However, * systems with excessive dirty pages (especially when MAP_NOSYNC is * used) will die horribly with limited laundering. If the pageout * daemon cannot clean enough pages in the first pass, we let it go * all out in succeeding passes. 
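 *
 * With the defaults above, pass 0 therefore launders at most
 * vm_max_launder (32) dirty pages per scan, while any later pass raises
 * the limit to 10000, effectively unlimited for that scan.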
*/ if ((maxlaunder = vm_max_launder) <= 1) maxlaunder = 1; if (pass) maxlaunder = 10000; vm_page_lock_queues(); rescan0: addl_page_shortage = addl_page_shortage_init; - maxscan = cnt.v_inactive_count; + maxscan = VMCNT_GET(inactive_count); for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl); m != NULL && maxscan-- > 0 && page_shortage > 0; m = next) { - cnt.v_pdpages++; + VMCNT_ADD(pdpages, 1); if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE) { goto rescan0; } next = TAILQ_NEXT(m, pageq); object = m->object; /* * skip marker pages */ if (m->flags & PG_MARKER) continue; /* * A held page may be undergoing I/O, so skip it. */ if (m->hold_count) { vm_pageq_requeue(m); addl_page_shortage++; continue; } /* * Don't mess with busy pages, keep in the front of the * queue, most likely are being paged out. */ if (!VM_OBJECT_TRYLOCK(object) && (!vm_pageout_fallback_object_lock(m, &next) || m->hold_count != 0)) { VM_OBJECT_UNLOCK(object); addl_page_shortage++; continue; } if (m->busy || (m->oflags & VPO_BUSY)) { VM_OBJECT_UNLOCK(object); addl_page_shortage++; continue; } /* * If the object is not being used, we ignore previous * references. */ if (object->ref_count == 0) { vm_page_flag_clear(m, PG_REFERENCED); pmap_clear_reference(m); /* * Otherwise, if the page has been referenced while in the * inactive queue, we bump the "activation count" upwards, * making it less likely that the page will be added back to * the inactive queue prematurely again. Here we check the * page tables (or emulated bits, if any), given the upper * level VM system not knowing anything about existing * references. */ } else if (((m->flags & PG_REFERENCED) == 0) && (actcount = pmap_ts_referenced(m))) { vm_page_activate(m); VM_OBJECT_UNLOCK(object); m->act_count += (actcount + ACT_ADVANCE); continue; } /* * If the upper level VM system knows about any page * references, we activate the page. We also set the * "activation count" higher than normal so that we will less * likely place pages back onto the inactive queue again. */ if ((m->flags & PG_REFERENCED) != 0) { vm_page_flag_clear(m, PG_REFERENCED); actcount = pmap_ts_referenced(m); vm_page_activate(m); VM_OBJECT_UNLOCK(object); m->act_count += (actcount + ACT_ADVANCE + 1); continue; } /* * If the upper level VM system doesn't know anything about * the page being dirty, we have to check for it again. As * far as the VM code knows, any partially dirty pages are * fully dirty. */ if (m->dirty == 0 && !pmap_is_modified(m)) { /* * Avoid a race condition: Unless write access is * removed from the page, another processor could * modify it before all access is removed by the call * to vm_page_cache() below. If vm_page_cache() finds * that the page has been modified when it removes all * access, it panics because it cannot cache dirty * pages. In principle, we could eliminate just write * access here rather than all access. In the expected * case, when there are no last instant modifications * to the page, removing all access will be cheaper * overall. */ if ((m->flags & PG_WRITEABLE) != 0) pmap_remove_all(m); } else { vm_page_dirty(m); } if (m->valid == 0) { /* * Invalid pages can be easily freed */ vm_page_free(m); - cnt.v_dfree++; + VMCNT_ADD(dfree, 1); --page_shortage; } else if (m->dirty == 0) { /* * Clean pages can be placed onto the cache queue. * This effectively frees them. */ vm_page_cache(m); --page_shortage; } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { /* * Dirty pages need to be paged out, but flushing * a page is extremely expensive verses freeing * a clean page. 
Rather then artificially limiting * the number of pages we can flush, we instead give * dirty pages extra priority on the inactive queue * by forcing them to be cycled through the queue * twice before being flushed, after which the * (now clean) page will cycle through once more * before being freed. This significantly extends * the thrash point for a heavily loaded machine. */ vm_page_flag_set(m, PG_WINATCFLS); vm_pageq_requeue(m); } else if (maxlaunder > 0) { /* * We always want to try to flush some dirty pages if * we encounter them, to keep the system stable. * Normally this number is small, but under extreme * pressure where there are insufficient clean pages * on the inactive queue, we may have to go all out. */ int swap_pageouts_ok; struct vnode *vp = NULL; struct mount *mp; if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { swap_pageouts_ok = 1; } else { swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && vm_page_count_min()); } /* * We don't bother paging objects that are "dead". * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(object); vm_pageq_requeue(m); continue; } /* * Following operations may unlock * vm_page_queue_mtx, invalidating the 'next' * pointer. To prevent an inordinate number * of restarts we use our marker to remember * our place. * */ TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl, m, &marker, pageq); /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * The previous code skipped locked vnodes and, worse, * reordered pages in the queue. This results in * completely non-deterministic operation and, on a * busy system, can lead to extremely non-optimal * pageouts. For example, it can cause clean pages * to be freed and dirty pages to be moved to the end * of the queue. Since dirty pages are also moved to * the end of the queue once-cleaned, this gives * way too large a weighting to defering the freeing * of dirty pages. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vp = object->handle; mp = NULL; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { ++pageout_lock_miss; if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vp = NULL; goto unlock_and_continue; } vm_page_unlock_queues(); VI_LOCK(vp); VM_OBJECT_UNLOCK(object); if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_TIMELOCK, curthread)) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); ++pageout_lock_miss; vn_finished_write(mp); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vp = NULL; goto unlock_and_continue; } VM_OBJECT_LOCK(object); vm_page_lock_queues(); /* * The page might have been moved to another * queue during potential blocking in vget() * above. The page might have been freed and * reused for another vnode. The object might * have been reused for another vnode. 
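 *
 * Hence the four checks below: the page must still be on PQ_INACTIVE,
 * must still belong to the object we locked, that object must still be
 * backed by the vnode we got, and the marker must still immediately
 * follow the page; if any of these fail we skip the page rather than act
 * on stale state.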
*/ if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE || m->object != object || object->handle != vp || TAILQ_NEXT(m, pageq) != &marker) { if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } /* * The page may have been busied during the * blocking in vput(); We don't move the * page back onto the end of the queue so that * statistics are more correct if we don't. */ if (m->busy || (m->oflags & VPO_BUSY)) { goto unlock_and_continue; } /* * If the page has become held it might * be undergoing I/O, so skip it */ if (m->hold_count) { vm_pageq_requeue(m); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; goto unlock_and_continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. * * decrement page_shortage on success to account for * the (future) cleaned page. Otherwise we could wind * up laundering or cleaning too many pages. */ if (vm_pageout_clean(m) != 0) { --page_shortage; --maxlaunder; } unlock_and_continue: VM_OBJECT_UNLOCK(object); if (vp) { vm_page_unlock_queues(); vput(vp); vn_finished_write(mp); vm_page_lock_queues(); } next = TAILQ_NEXT(&marker, pageq); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl, &marker, pageq); continue; } VM_OBJECT_UNLOCK(object); } /* * Compute the number of pages we want to try to move from the * active queue to the inactive queue. */ - page_shortage = vm_paging_target() + - cnt.v_inactive_target - cnt.v_inactive_count; + page_shortage = vm_paging_target() + VMCNT_GET(inactive_target) - + VMCNT_GET(inactive_count); page_shortage += addl_page_shortage; /* * Scan the active queue for things we can deactivate. We nominally * track the per-page activity counter and use it to locate * deactivation candidates. */ - pcount = cnt.v_active_count; + pcount = VMCNT_GET(active_count); m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl); while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE), ("vm_pageout_scan: page %p isn't active", m)); next = TAILQ_NEXT(m, pageq); object = m->object; if ((m->flags & PG_MARKER) != 0) { m = next; continue; } if (!VM_OBJECT_TRYLOCK(object) && !vm_pageout_fallback_object_lock(m, &next)) { VM_OBJECT_UNLOCK(object); m = next; continue; } /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->oflags & VPO_BUSY) || (m->hold_count != 0)) { VM_OBJECT_UNLOCK(object); vm_pageq_requeue(m); m = next; continue; } /* * The count for pagedaemon pages is done after checking the * page for eligibility... */ - cnt.v_pdpages++; + VMCNT_ADD(pdpages, 1); /* * Check to see "how much" the page has been used. */ actcount = 0; if (object->ref_count != 0) { if (m->flags & PG_REFERENCED) { actcount += 1; } actcount += pmap_ts_referenced(m); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } } /* * Since we have "tested" this bit, we need to clear it now. */ vm_page_flag_clear(m, PG_REFERENCED); /* * Only if an object is currently being used, do we use the * page activation count stats. 
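 *
 * The net effect is a simple aging scheme: a page referenced since the
 * last scan gains ACT_ADVANCE plus the count returned by
 * pmap_ts_referenced() (capped at ACT_MAX), while an unreferenced page
 * loses ACT_DECLINE per scan.  Assuming the historical values
 * ACT_ADVANCE == 3, ACT_DECLINE == 1 and ACT_MAX == 64, a page that stops
 * being referenced survives roughly act_count more passes over the active
 * queue before it becomes a deactivation candidate.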
*/ if (actcount && (object->ref_count != 0)) { vm_pageq_requeue(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm || object->ref_count == 0 || m->act_count == 0) { page_shortage--; if (object->ref_count == 0) { pmap_remove_all(m); if (m->dirty == 0) vm_page_cache(m); else vm_page_deactivate(m); } else { vm_page_deactivate(m); } } else { vm_pageq_requeue(m); } } VM_OBJECT_UNLOCK(object); m = next; } /* * We try to maintain some *really* free pages, this allows interrupt * code to be guaranteed space. Since both cache and free queues * are considered basically 'free', moving pages from cache to free * does not effect other calculations. */ cache_cur = cache_last_free; cache_first_failure = -1; - while (cnt.v_free_count < cnt.v_free_reserved && (cache_cur = - (cache_cur + PQ_PRIME2) & PQ_COLORMASK) != cache_first_failure) { + while (VMCNT_GET(free_count) < VMCNT_GET(free_reserved) && + (cache_cur = (cache_cur + PQ_PRIME2) & PQ_COLORMASK) != + cache_first_failure) { TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE + cache_cur].pl, pageq) { KASSERT(m->dirty == 0, ("Found dirty cache page %p", m)); KASSERT(!pmap_page_is_mapped(m), ("Found mapped cache page %p", m)); KASSERT((m->flags & PG_UNMANAGED) == 0, ("Found unmanaged cache page %p", m)); KASSERT(m->wire_count == 0, ("Found wired cache page %p", m)); if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object = m->object)) { KASSERT((m->oflags & VPO_BUSY) == 0 && m->busy == 0, ("Found busy cache page %p", m)); vm_page_free(m); VM_OBJECT_UNLOCK(object); - cnt.v_dfree++; + VMCNT_ADD(dfree, 1); cache_last_free = cache_cur; cache_first_failure = -1; break; } } if (m == NULL && cache_first_failure == -1) cache_first_failure = cache_cur; } vm_page_unlock_queues(); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_pageout_req_swapout |= VM_SWAP_IDLE; vm_req_vmdaemon(); lsec = time_second; } } #endif /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. */ if (vm_paging_target() > 0) { if (vnodes_skipped && vm_page_count_min()) (void) speedup_syncer(); #if !defined(NO_SWAPPING) if (vm_swap_enabled && vm_page_count_target()) { vm_req_vmdaemon(); vm_pageout_req_swapout |= VM_SWAP_NORMAL; } #endif } /* * If we are critically low on one of RAM or swap and low on * the other, kill the largest process. However, we avoid * doing this on the first pass in order to give ourselves a * chance to flush out dirty vnode-backed pages and to allow * active pages to be moved to the inactive queue and reclaimed. * * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of it's child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ if (pass != 0 && ((swap_pager_avail < 64 && vm_page_count_min()) || (swap_pager_full && vm_paging_target() > 0))) { bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { int breakout; if (PROC_TRYLOCK(p) == 0) continue; /* * If this is a system or protected process, skip it. 
*/ if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) || (p->p_flag & P_PROTECTED) || ((p->p_pid < 48) && (swap_pager_avail != 0))) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ mtx_lock_spin(&sched_lock); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { breakout = 1; break; } } if (breakout) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); continue; } mtx_unlock_spin(&sched_lock); /* * get the process size */ if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) { PROC_UNLOCK(p); continue; } size = vmspace_swap_count(p->p_vmspace); vm_map_unlock_read(&p->p_vmspace->vm_map); size += vmspace_resident_count(p->p_vmspace); /* * if the this process is bigger than the biggest one * remember it. */ if (size > bigsize) { if (bigproc != NULL) PROC_UNLOCK(bigproc); bigproc = p; bigsize = size; } else PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); if (bigproc != NULL) { killproc(bigproc, "out of swap space"); mtx_lock_spin(&sched_lock); sched_nice(bigproc, PRIO_MIN); mtx_unlock_spin(&sched_lock); PROC_UNLOCK(bigproc); - wakeup(&cnt.v_free_count); + wakeup(VMCNT_PTR(free_count)); } } mtx_unlock(&Giant); } /* * This routine tries to maintain the pseudo LRU active queue, * so that during long periods of time where there is no paging, * that some statistic accumulation still occurs. This code * helps the situation where paging just starts to occur. */ static void vm_pageout_page_stats() { vm_object_t object; vm_page_t m,next; int pcount,tpcount; /* Number of pages to check */ static int fullintervalcount = 0; int page_shortage; mtx_assert(&vm_page_queue_mtx, MA_OWNED); page_shortage = - (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); + (VMCNT_GET(inactive_target) + VMCNT_GET(cache_max) + + VMCNT_GET(free_min)) - (VMCNT_GET(free_count) + + VMCNT_GET(inactive_count) + VMCNT_GET(cache_count)); if (page_shortage <= 0) return; - pcount = cnt.v_active_count; + pcount = VMCNT_GET(active_count); fullintervalcount += vm_pageout_stats_interval; if (fullintervalcount < vm_pageout_full_stats_interval) { - tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count; + tpcount = (vm_pageout_stats_max * VMCNT_GET(active_count)) / + VMCNT_GET(page_count); if (pcount > tpcount) pcount = tpcount; } else { fullintervalcount = 0; } m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl); while ((m != NULL) && (pcount-- > 0)) { int actcount; KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE), ("vm_pageout_page_stats: page %p isn't active", m)); next = TAILQ_NEXT(m, pageq); object = m->object; if ((m->flags & PG_MARKER) != 0) { m = next; continue; } if (!VM_OBJECT_TRYLOCK(object) && !vm_pageout_fallback_object_lock(m, &next)) { VM_OBJECT_UNLOCK(object); m = next; continue; } /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->oflags & VPO_BUSY) || (m->hold_count != 0)) { VM_OBJECT_UNLOCK(object); vm_pageq_requeue(m); m = next; continue; } actcount = 0; if (m->flags & PG_REFERENCED) { vm_page_flag_clear(m, PG_REFERENCED); actcount += 1; } actcount += pmap_ts_referenced(m); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; vm_pageq_requeue(m); } else { if (m->act_count == 0) { /* * We turn off page access, so that we have * more accurate RSS stats. 
We don't do this * in the normal page deactivation when the * system is loaded VM wise, because the * cost of the large number of page protect * operations would be higher than the value * of doing the operation. */ pmap_remove_all(m); vm_page_deactivate(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); vm_pageq_requeue(m); } } VM_OBJECT_UNLOCK(object); m = next; } } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { int error, pass; /* * Initialize some paging parameters. */ - cnt.v_interrupt_free_min = 2; - if (cnt.v_page_count < 2000) + VMCNT_SET(interrupt_free_min, 2); + if (VMCNT_GET(page_count) < 2000) vm_pageout_page_count = 8; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ - if (cnt.v_page_count > 1024) - cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; - else - cnt.v_free_min = 4; - cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + - cnt.v_interrupt_free_min; - cnt.v_free_reserved = vm_pageout_page_count + - cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_NUMCOLORS; - cnt.v_free_severe = cnt.v_free_min / 2; - cnt.v_free_min += cnt.v_free_reserved; - cnt.v_free_severe += cnt.v_free_reserved; + VMCNT_SET(free_min, (VMCNT_GET(page_count) > 1024) ? (4 + + (VMCNT_GET(page_count) - 1024) / 200) : 4); + VMCNT_SET(pageout_free_min, (2 * MAXBSIZE) / PAGE_SIZE + + VMCNT_GET(interrupt_free_min)); + VMCNT_SET(free_reserved, vm_pageout_page_count + + VMCNT_GET(pageout_free_min) + (VMCNT_GET(page_count) / 768) + + PQ_NUMCOLORS); + VMCNT_SET(free_severe, VMCNT_GET(free_min) / 2); + VMCNT_ADD(free_min, VMCNT_GET(free_reserved)); + VMCNT_ADD(free_severe, VMCNT_GET(free_reserved)); /* * v_free_target and v_cache_min control pageout hysteresis. Note * that these are more a measure of the VM cache queue hysteresis * then the VM free queue. Specifically, v_free_target is the * high water mark (free+cache pages). * * v_free_reserved + v_cache_min (mostly means v_cache_min) is the * low water mark, while v_free_min is the stop. v_cache_min must * be big enough to handle memory needs while the pageout daemon * is signalled and run to free more pages. */ - if (cnt.v_free_count > 6144) - cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved; - else - cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; + VMCNT_SET(free_target, ((VMCNT_GET(free_count) > 6144) ? 
4 : 2) * + VMCNT_GET(free_min) + VMCNT_GET(free_reserved)); - if (cnt.v_free_count > 2048) { - cnt.v_cache_min = cnt.v_free_target; - cnt.v_cache_max = 2 * cnt.v_cache_min; - cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; + if (VMCNT_GET(free_count) > 2048) { + VMCNT_SET(cache_min, VMCNT_GET(free_target)); + VMCNT_SET(cache_max, 2 * VMCNT_GET(cache_min)); + VMCNT_SET(inactive_target, (3 * VMCNT_GET(free_target) / 2)); } else { - cnt.v_cache_min = 0; - cnt.v_cache_max = 0; - cnt.v_inactive_target = cnt.v_free_count / 4; + VMCNT_SET(cache_min, 0); + VMCNT_SET(cache_max, 0); + VMCNT_SET(inactive_target, VMCNT_GET(free_count) / 4); } - if (cnt.v_inactive_target > cnt.v_free_count / 3) - cnt.v_inactive_target = cnt.v_free_count / 3; + if (VMCNT_GET(inactive_target) > VMCNT_GET(free_count) / 3) + VMCNT_SET(inactive_target, VMCNT_GET(free_count) / 3); /* XXX does not really belong here */ if (vm_page_max_wired == 0) - vm_page_max_wired = cnt.v_free_count / 3; + vm_page_max_wired = VMCNT_GET(free_count) / 3; if (vm_pageout_stats_max == 0) - vm_pageout_stats_max = cnt.v_free_target; + vm_pageout_stats_max = VMCNT_GET(free_target); /* * Set interval in seconds for stats scan. */ if (vm_pageout_stats_interval == 0) vm_pageout_stats_interval = 5; if (vm_pageout_full_stats_interval == 0) vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; swap_pager_swap_init(); pass = 0; /* * The pageout daemon is never done, so loop forever. */ while (TRUE) { /* * If we have enough free memory, wakeup waiters. Do * not clear vm_pages_needed until we reach our target, * otherwise we may be woken up over and over again and * waste a lot of cpu. */ mtx_lock(&vm_page_queue_free_mtx); if (vm_pages_needed && !vm_page_count_min()) { if (!vm_paging_needed()) vm_pages_needed = 0; - wakeup(&cnt.v_free_count); + wakeup(VMCNT_PTR(free_count)); } if (vm_pages_needed) { /* * Still not done, take a second pass without waiting * (unlimited dirty cleaning), otherwise sleep a bit * and try again. */ ++pass; if (pass > 1) msleep(&vm_pages_needed, &vm_page_queue_free_mtx, PVM, "psleep", hz / 2); } else { /* * Good enough, sleep & handle stats. Prime the pass * for the next run. */ if (pass > 1) pass = 1; else pass = 0; error = msleep(&vm_pages_needed, &vm_page_queue_free_mtx, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { mtx_unlock(&vm_page_queue_free_mtx); pass = 0; vm_page_lock_queues(); vm_pageout_page_stats(); vm_page_unlock_queues(); continue; } } if (vm_pages_needed) - cnt.v_pdwakeups++; + VMCNT_ADD(pdwakeups, 1); mtx_unlock(&vm_page_queue_free_mtx); vm_pageout_scan(pass); } } /* * Unless the free page queue lock is held by the caller, this function * should be regarded as advisory. Specifically, the caller should * not msleep() on &cnt.v_free_count following this function unless * the free page queue lock is held until the msleep() is performed. 
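 *
 * The overall handshake, pieced together from this file and vm_page.c
 * above: a starved allocator calls pagedaemon_wakeup() (or vm_wait()),
 * which sets vm_pages_needed and wakes the daemon via
 * wakeup(&vm_pages_needed); once the daemon's scan brings the system out
 * of vm_page_count_min() it does wakeup(VMCNT_PTR(free_count)), which is
 * exactly the channel vm_wait() and vm_waitpfault() sleep on.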
*/ void pagedaemon_wakeup() { if (!vm_pages_needed && curthread->td_proc != pageproc) { vm_pages_needed = 1; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } static void vm_daemon() { struct rlimit rsslim; struct proc *p; struct thread *td; int breakout; mtx_lock(&Giant); while (TRUE) { tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(vm_pageout_req_swapout); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ PROC_LOCK(p); if (p->p_flag & (P_SYSTEM | P_WEXIT)) { PROC_UNLOCK(p); continue; } /* * if the process is in a non-running type state, * don't touch it. */ mtx_lock_spin(&sched_lock); breakout = 0; FOREACH_THREAD_IN_PROC(p, td) { if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td)) { breakout = 1; break; } } mtx_unlock_spin(&sched_lock); if (breakout) { PROC_UNLOCK(p); continue; } /* * get a limit */ lim_rlimit(p, RLIMIT_RSS, &rsslim); limit = OFF_TO_IDX( qmin(rsslim.rlim_cur, rsslim.rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_sflag & PS_INMEM) == 0) limit = 0; /* XXX */ PROC_UNLOCK(p); size = vmspace_resident_count(p->p_vmspace); if (limit >= 0 && size >= limit) { vm_pageout_map_deactivate_pages( &p->p_vmspace->vm_map, limit); } } sx_sunlock(&allproc_lock); } } #endif /* !defined(NO_SWAPPING) */ Index: head/sys/vm/vm_pageq.c =================================================================== --- head/sys/vm/vm_pageq.c (revision 169666) +++ head/sys/vm/vm_pageq.c (revision 169667) @@ -1,330 +1,330 @@ /*- * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include "opt_vmpage.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vm_coloring_init(void); void setPQL2(int *const size, int *const ways); struct vpgqueues vm_page_queues[PQ_MAXCOUNT]; struct pq_coloring page_queue_coloring; static int pq_cachesize = 0; /* size of the cache in KB */ static int pq_cachenways = 0; /* associativity of the cache */ SYSCTL_NODE(_vm_stats, OID_AUTO, pagequeue, CTLFLAG_RW, 0, "VM meter stats"); SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, page_colors, CTLFLAG_RD, &(PQ_NUMCOLORS), 0, "Number of colors in the page queue"); SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, cachesize, CTLFLAG_RD, &pq_cachesize, 0, "Size of the processor cache in KB"); SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, cachenways, CTLFLAG_RD, &pq_cachenways, 0, "Associativity of the processor cache"); SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, prime1, CTLFLAG_RD, &(PQ_PRIME1), 0, "Cache tuning value"); SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, prime2, CTLFLAG_RD, &(PQ_PRIME2), 0, "Cache tuning value"); static void vm_coloring_init(void) { #ifdef PQ_NOOPT PQ_NUMCOLORS = PQ_PRIME1 = PQ_PRIME2 = 1; #else setPQL2(&pq_cachesize, &pq_cachenways); CTASSERT(PAGE_SIZE/1024 > 0); if (pq_cachesize > 0 && pq_cachenways > 0) PQ_NUMCOLORS = pq_cachesize / (PAGE_SIZE/1024) / \ pq_cachenways; else PQ_NUMCOLORS = 32; if (PQ_MAXCOLORS < PQ_NUMCOLORS) { printf("VM-PQ color limit (PQ_MAXCOLORS=%u) exceeded (%u), see vm_page.h", PQ_MAXCOLORS, PQ_NUMCOLORS); PQ_NUMCOLORS = PQ_MAXCOLORS; } if (PQ_NUMCOLORS >= 128) { PQ_PRIME1 = 31; PQ_PRIME2 = 23; } else if (PQ_NUMCOLORS >= 64) { PQ_PRIME1 = 13; PQ_PRIME2 = 7; } else if (PQ_NUMCOLORS >= 32) { PQ_PRIME1 = 9; PQ_PRIME2 = 5; } else if (PQ_NUMCOLORS >= 16) { PQ_PRIME1 = 5; PQ_PRIME2 = 3; } else PQ_NUMCOLORS = PQ_PRIME1 = PQ_PRIME2 = 1; #endif /* * PQ_CACHE represents a * PQ_NUMCOLORS consecutive queue. */ PQ_COLORMASK = PQ_NUMCOLORS - 1; PQ_INACTIVE = 1 + PQ_NUMCOLORS; PQ_ACTIVE = 2 + PQ_NUMCOLORS; PQ_CACHE = 3 + PQ_NUMCOLORS; PQ_HOLD = 3 + 2 * PQ_NUMCOLORS; PQ_COUNT = 4 + 2 * PQ_NUMCOLORS; PQ_MAXLENGTH = PQ_NUMCOLORS / 3 + PQ_PRIME1; #if 0 /* XXX: is it possible to allocate vm_page_queues[PQ_COUNT] here? 
*/ #error XXX: vm_page_queues = malloc(PQ_COUNT * sizeof(struct vpgqueues)); #endif if (bootverbose) if (PQ_NUMCOLORS > 1) printf("Using %d colors for the VM-PQ tuning (%d, %d)\n", PQ_NUMCOLORS, pq_cachesize, pq_cachenways); } void vm_pageq_init(void) { int i; vm_coloring_init(); for (i = 0; i < PQ_NUMCOLORS; ++i) { - vm_page_queues[PQ_FREE+i].cnt = &cnt.v_free_count; + vm_page_queues[PQ_FREE+i].cnt = VMCNT_PTR(free_count); } for (i = 0; i < PQ_NUMCOLORS; ++i) { - vm_page_queues[PQ_CACHE + i].cnt = &cnt.v_cache_count; + vm_page_queues[PQ_CACHE + i].cnt = VMCNT_PTR(cache_count); } - vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count; - vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count; - vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count; + vm_page_queues[PQ_INACTIVE].cnt = VMCNT_PTR(inactive_count); + vm_page_queues[PQ_ACTIVE].cnt = VMCNT_PTR(active_count); + vm_page_queues[PQ_HOLD].cnt = VMCNT_PTR(active_count); for (i = 0; i < PQ_COUNT; i++) { TAILQ_INIT(&vm_page_queues[i].pl); } } void vm_pageq_requeue(vm_page_t m) { int queue = VM_PAGE_GETQUEUE(m); struct vpgqueues *vpq; if (queue != PQ_NONE) { vpq = &vm_page_queues[queue]; TAILQ_REMOVE(&vpq->pl, m, pageq); TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); } } /* * vm_pageq_enqueue: */ void vm_pageq_enqueue(int queue, vm_page_t m) { struct vpgqueues *vpq; vpq = &vm_page_queues[queue]; VM_PAGE_SETQUEUE2(m, queue); TAILQ_INSERT_TAIL(&vpq->pl, m, pageq); ++*vpq->cnt; ++vpq->lcnt; } /* * vm_add_new_page: * * Add a new page to the freelist for use by the system. */ void vm_pageq_add_new_page(vm_paddr_t pa) { vm_page_t m; - atomic_add_int(&cnt.v_page_count, 1); + VMCNT_ADD(page_count, 1); m = PHYS_TO_VM_PAGE(pa); m->phys_addr = pa; m->flags = 0; m->pc = (pa >> PAGE_SHIFT) & PQ_COLORMASK; pmap_page_init(m); mtx_lock(&vm_page_queue_free_mtx); vm_pageq_enqueue(m->pc + PQ_FREE, m); mtx_unlock(&vm_page_queue_free_mtx); } /* * vm_pageq_remove_nowakeup: * * vm_page_unqueue() without any wakeup * * The queue containing the given page must be locked. * This routine may not block. */ void vm_pageq_remove_nowakeup(vm_page_t m) { int queue = VM_PAGE_GETQUEUE(m); struct vpgqueues *pq; if (queue != PQ_NONE) { pq = &vm_page_queues[queue]; VM_PAGE_SETQUEUE2(m, PQ_NONE); TAILQ_REMOVE(&pq->pl, m, pageq); (*pq->cnt)--; pq->lcnt--; } } /* * vm_pageq_remove: * * Remove a page from its queue. * * The queue containing the given page must be locked. * This routine may not block. */ void vm_pageq_remove(vm_page_t m) { int queue = VM_PAGE_GETQUEUE(m); struct vpgqueues *pq; if (queue != PQ_NONE) { VM_PAGE_SETQUEUE2(m, PQ_NONE); pq = &vm_page_queues[queue]; TAILQ_REMOVE(&pq->pl, m, pageq); (*pq->cnt)--; pq->lcnt--; if (VM_PAGE_RESOLVEQUEUE(m, queue) == PQ_CACHE) { if (vm_paging_needed()) pagedaemon_wakeup(); } } } #ifndef PQ_NOOPT /* * vm_pageq_find: * * Find a page on the specified queue with color optimization. * * The page coloring optimization attempts to locate a page * that does not overload other nearby pages in the object in * the cpu's L2 cache. We need this optimization because cpu * caches tend to be physical caches, while object spaces tend * to be virtual. * * The specified queue must be locked. * This routine may not block. * * This routine may only be called from the vm_pageq_find() * function in this file. */ static inline vm_page_t _vm_pageq_find(int basequeue, int index) { int i; vm_page_t m = NULL; struct vpgqueues *pq; pq = &vm_page_queues[basequeue]; /* * Note that for the first loop, index+i and index-i wind up at the * same place. 
Even though this is not totally optimal, we've already * blown it by missing the cache case so we do not care. */ for (i = PQ_NUMCOLORS / 2; i > 0; --i) { if ((m = TAILQ_FIRST(&pq[(index + i) & PQ_COLORMASK].pl)) \ != NULL) break; if ((m = TAILQ_FIRST(&pq[(index - i) & PQ_COLORMASK].pl)) \ != NULL) break; } return (m); } #endif /* PQ_NOOPT */ vm_page_t vm_pageq_find(int basequeue, int index, boolean_t prefer_zero) { vm_page_t m; #ifndef PQ_NOOPT if (PQ_NUMCOLORS > 1) { if (prefer_zero) { m = TAILQ_LAST(&vm_page_queues[basequeue+index].pl, \ pglist); } else { m = TAILQ_FIRST(&vm_page_queues[basequeue+index].pl); } if (m == NULL) { m = _vm_pageq_find(basequeue, index); } } else { #endif if (prefer_zero) { m = TAILQ_LAST(&vm_page_queues[basequeue].pl, pglist); } else { m = TAILQ_FIRST(&vm_page_queues[basequeue].pl); } #ifndef PQ_NOOPT } #endif return (m); } Index: head/sys/vm/vm_zeroidle.c =================================================================== --- head/sys/vm/vm_zeroidle.c (revision 169666) +++ head/sys/vm/vm_zeroidle.c (revision 169667) @@ -1,186 +1,187 @@ /*- * Copyright (c) 1994 John Dyson * Copyright (c) 2001 Matt Dillon * * All Rights Reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ * from: FreeBSD: .../i386/vm_machdep.c,v 1.165 2001/07/04 23:27:04 dillon */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int cnt_prezero; SYSCTL_INT(_vm_stats_misc, OID_AUTO, cnt_prezero, CTLFLAG_RD, &cnt_prezero, 0, ""); static int idlezero_enable_default = 1; TUNABLE_INT("vm.idlezero_enable", &idlezero_enable_default); /* Defer setting the enable flag until the kthread is running. */ static int idlezero_enable = 0; SYSCTL_INT(_vm, OID_AUTO, idlezero_enable, CTLFLAG_RW, &idlezero_enable, 0, ""); static int idlezero_maxrun = 16; SYSCTL_INT(_vm, OID_AUTO, idlezero_maxrun, CTLFLAG_RW, &idlezero_maxrun, 0, ""); TUNABLE_INT("vm.idlezero_maxrun", &idlezero_maxrun); /* * Implement the pre-zeroed page mechanism. 
*/ #define ZIDLE_LO(v) ((v) * 2 / 3) #define ZIDLE_HI(v) ((v) * 4 / 5) static boolean_t wakeup_needed = FALSE; static int zero_state; static int vm_page_zero_check(void) { if (!idlezero_enable) return (0); /* * Attempt to maintain approximately 1/2 of our free pages in a * PG_ZERO'd state. Add some hysteresis to (attempt to) avoid * generally zeroing a page when the system is near steady-state. * Otherwise we might get 'flutter' during disk I/O / IPC or * fast sleeps. We also do not want to be continuously zeroing * pages because doing so may flush our L1 and L2 caches too much. */ - if (zero_state && vm_page_zero_count >= ZIDLE_LO(cnt.v_free_count)) + if (zero_state && vm_page_zero_count >= + ZIDLE_LO(VMCNT_GET(free_count))) return (0); - if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) + if (vm_page_zero_count >= ZIDLE_HI(VMCNT_GET(free_count))) return (0); return (1); } static void vm_page_zero_idle(void) { static int free_rover; vm_page_t m; mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); zero_state = 0; m = vm_pageq_find(PQ_FREE, free_rover, FALSE); if (m != NULL && (m->flags & PG_ZERO) == 0) { vm_pageq_remove_nowakeup(m); mtx_unlock(&vm_page_queue_free_mtx); pmap_zero_page_idle(m); mtx_lock(&vm_page_queue_free_mtx); m->flags |= PG_ZERO; vm_pageq_enqueue(PQ_FREE + m->pc, m); ++vm_page_zero_count; ++cnt_prezero; - if (vm_page_zero_count >= ZIDLE_HI(cnt.v_free_count)) + if (vm_page_zero_count >= ZIDLE_HI(VMCNT_GET(free_count))) zero_state = 1; } free_rover = (free_rover + PQ_PRIME2) & PQ_COLORMASK; } /* Called by vm_page_free to hint that a new page is available. */ void vm_page_zero_idle_wakeup(void) { mtx_assert(&vm_page_queue_free_mtx, MA_OWNED); if (wakeup_needed && vm_page_zero_check()) { wakeup_needed = FALSE; wakeup(&zero_state); } } static void vm_pagezero(void __unused *arg) { idlezero_enable = idlezero_enable_default; mtx_lock(&vm_page_queue_free_mtx); for (;;) { if (vm_page_zero_check()) { vm_page_zero_idle(); #ifndef PREEMPTION if (sched_runnable()) { mtx_lock_spin(&sched_lock); mi_switch(SW_VOL, NULL); mtx_unlock_spin(&sched_lock); } #endif } else { wakeup_needed = TRUE; msleep(&zero_state, &vm_page_queue_free_mtx, 0, "pgzero", hz * 300); } } } static struct proc *pagezero_proc; static void pagezero_start(void __unused *arg) { int error; struct thread *td; error = kthread_create(vm_pagezero, NULL, &pagezero_proc, RFSTOPPED, 0, "pagezero"); if (error) panic("pagezero_start: error %d\n", error); /* * We're an idle task, don't count us in the load. */ PROC_LOCK(pagezero_proc); pagezero_proc->p_flag |= P_NOLOAD; PROC_UNLOCK(pagezero_proc); mtx_lock_spin(&sched_lock); td = FIRST_THREAD_IN_PROC(pagezero_proc); sched_class(td, PRI_IDLE); sched_prio(td, PRI_MAX_IDLE); sched_add(td, SRQ_BORING); mtx_unlock_spin(&sched_lock); } SYSINIT(pagezero, SI_SUB_KTHREAD_VM, SI_ORDER_ANY, pagezero_start, NULL) Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 169666) +++ head/sys/vm/vnode_pager.c (revision 169667) @@ -1,1222 +1,1223 @@ /*- * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run); static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m); static int vnode_pager_input_old(vm_object_t object, vm_page_t m); static void vnode_pager_dealloc(vm_object_t); static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int); static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *); static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *); static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t); struct pagerops vnodepagerops = { .pgo_alloc = vnode_pager_alloc, .pgo_dealloc = vnode_pager_dealloc, .pgo_getpages = vnode_pager_getpages, .pgo_putpages = vnode_pager_putpages, .pgo_haspage = vnode_pager_haspage, }; int vnode_pbuf_freecnt; /* Create the VM system backing object for this vnode */ int vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td) { vm_object_t object; vm_ooffset_t size = isize; struct vattr va; if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE) return (0); while ((object = vp->v_object) != NULL) { VM_OBJECT_LOCK(object); if (!(object->flags & OBJ_DEAD)) { VM_OBJECT_UNLOCK(object); return (0); } VOP_UNLOCK(vp, 0, td); vm_object_set_flag(object, OBJ_DISCONNECTWNT); msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vodead", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } if (size == 0) { if (vn_isdisk(vp, NULL)) { size = IDX_TO_OFF(INT_MAX); } else { if (VOP_GETATTR(vp, &va, td->td_ucred, td) != 0) return (0); size = va.va_size; } } object = vnode_pager_alloc(vp, size, 0, 0); /* * Dereference the reference we just created. This assumes * that the object is associated with the vp. */ VM_OBJECT_LOCK(object); object->ref_count--; VM_OBJECT_UNLOCK(object); vrele(vp); KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object")); return (0); } void vnode_destroy_vobject(struct vnode *vp) { struct vm_object *obj; obj = vp->v_object; if (obj == NULL) return; ASSERT_VOP_LOCKED(vp, "vnode_destroy_vobject"); VM_OBJECT_LOCK(obj); if (obj->ref_count == 0) { /* * vclean() may be called twice. The first time * removes the primary reference to the object, * the second time goes one further and is a * special-case to terminate the object. * * don't double-terminate the object */ if ((obj->flags & OBJ_DEAD) == 0) vm_object_terminate(obj); else VM_OBJECT_UNLOCK(obj); } else { /* * Woe to the process that tries to page now :-). */ vm_pager_deallocate(obj); VM_OBJECT_UNLOCK(obj); } vp->v_object = NULL; } /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. * * MPSAFE */ vm_object_t vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t offset) { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; ASSERT_VOP_LOCKED(vp, "vnode_pager_alloc"); /* * If the object is being terminated, wait for it to * go away. 
*/ while ((object = vp->v_object) != NULL) { VM_OBJECT_LOCK(object); if ((object->flags & OBJ_DEAD) == 0) break; vm_object_set_flag(object, OBJ_DISCONNECTWNT); msleep(object, VM_OBJECT_MTX(object), PDROP | PVM, "vadead", 0); } if (vp->v_usecount == 0) panic("vnode_pager_alloc: no vnode reference"); if (object == NULL) { /* * And an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size))); object->un_pager.vnp.vnp_size = size; object->handle = handle; if (VFS_NEEDSGIANT(vp->v_mount)) vm_object_set_flag(object, OBJ_NEEDGIANT); vp->v_object = object; } else { object->ref_count++; VM_OBJECT_UNLOCK(object); } vref(vp); return (object); } /* * The object must be locked. */ static void vnode_pager_dealloc(object) vm_object_t object; { struct vnode *vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); vm_object_pip_wait(object, "vnpdea"); object->handle = NULL; object->type = OBJT_DEAD; if (object->flags & OBJ_DISCONNECTWNT) { vm_object_clear_flag(object, OBJ_DISCONNECTWNT); wakeup(object); } ASSERT_VOP_LOCKED(vp, "vnode_pager_dealloc"); vp->v_object = NULL; vp->v_vflag &= ~VV_TEXT; } static boolean_t vnode_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; int vfslocked; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); /* * If no vp or vp is doomed or marked transparent to VM, we do not * have the page. */ if (vp == NULL || vp->v_iflag & VI_DOOMED) return FALSE; /* * If the offset is beyond end of file we do * not have the page. */ if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } VM_OBJECT_UNLOCK(object); vfslocked = VFS_LOCK_GIANT(vp->v_mount); err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(object); if (err) return TRUE; if (bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { int numafter; *after *= pagesperblock; numafter = pagesperblock - (poff + 1); if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) { numafter = OFF_TO_IDX(object->un_pager.vnp.vnp_size) - pindex; } *after += numafter; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(vp, nsize) struct vnode *vp; vm_ooffset_t nsize; { vm_object_t object; vm_page_t m; vm_pindex_t nobjsize; if ((object = vp->v_object) == NULL) return; VM_OBJECT_LOCK(object); if (nsize == object->un_pager.vnp.vnp_size) { /* * Hasn't changed size */ VM_OBJECT_UNLOCK(object); return; } nobjsize = OFF_TO_IDX(nsize + PAGE_MASK); if (nsize < object->un_pager.vnp.vnp_size) { /* * File has shrunk. 
Toss any cached pages beyond the new EOF. */ if (nobjsize < object->size) vm_object_page_remove(object, nobjsize, object->size, FALSE); /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode. * * XXX for some reason (I don't know yet), if we take a * completely invalid page and mark it partially valid * it can screw up NFS reads, so we don't allow the case. */ if ((nsize & PAGE_MASK) && (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL && m->valid != 0) { int base = (int)nsize & PAGE_MASK; int size = PAGE_SIZE - base; /* * Clear out partial-page garbage in case * the page has been mapped. */ pmap_zero_page_area(m, base, size); /* * XXX work around SMP data integrity race * by unmapping the page from user processes. * The garbage we just cleared may be mapped * to a user process running on another cpu * and this code is not running through normal * I/O channels which handle SMP issues for * us, so unmap page to synchronize all cpus. * * XXX should vm_pager_unmap_page() have * dealt with this? */ vm_page_lock_queues(); pmap_remove_all(m); /* * Clear out partial-page dirty bits. This * has the side effect of setting the valid * bits, but that is ok. There are a bunch * of places in the VM system where we expected * m->dirty == VM_PAGE_BITS_ALL. The file EOF * case is one of them. If the page is still * partially dirty, make it fully dirty. * * note that we do not clear out the valid * bits. This would prevent bogus_page * replacement from working properly. */ vm_page_set_validclean(m, base, size); if (m->dirty != 0) m->dirty = VM_PAGE_BITS_ALL; vm_page_unlock_queues(); } } object->un_pager.vnp.vnp_size = nsize; object->size = nobjsize; VM_OBJECT_UNLOCK(object); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress, int *run) { int bsize; int err; daddr_t vblock; daddr_t voffset; if (address < 0) return -1; if (vp->v_iflag & VI_DOOMED) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL); if (err == 0) { if (*rtaddress != -1) *rtaddress += voffset / DEV_BSIZE; if (run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return (err); } /* * small block filesystem vnode pager input */ static int vnode_pager_input_smlfs(object, m) vm_object_t object; vm_page_t m; { int i; struct vnode *vp; struct bufobj *bo; struct buf *bp; struct sf_buf *sf; daddr_t fileaddr; vm_offset_t bsize; int error = 0; vp = object->handle; if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &bo, 0, NULL, NULL); sf = sf_buf_alloc(m, 0); for (i = 0; i < PAGE_SIZE / bsize; i++) { vm_ooffset_t address; if (vm_page_bits(i * bsize, bsize) & m->valid) continue; address = IDX_TO_OFF(m->pindex) + i * bsize; if (address >= object->un_pager.vnp.vnp_size) { fileaddr = -1; } else { error = vnode_pager_addr(vp, address, &fileaddr, NULL); if (error) break; } if (fileaddr != -1) { bp = getpbuf(&vnode_pbuf_freecnt); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_data = (caddr_t)sf_buf_kva(sf) + i * bsize; bp->b_blkno = fileaddr; pbgetbo(bo, bp); bp->b_bcount = 
bsize; bp->b_bufsize = bsize; bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnsrd"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); if (error) break; VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); } else { VM_OBJECT_LOCK(object); vm_page_lock_queues(); vm_page_set_validclean(m, (i * bsize) & PAGE_MASK, bsize); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); bzero((caddr_t)sf_buf_kva(sf) + i * bsize, bsize); } } sf_buf_free(sf); vm_page_lock_queues(); pmap_clear_modify(m); vm_page_unlock_queues(); if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(object, m) vm_object_t object; vm_page_t m; { struct uio auio; struct iovec aiov; int error; int size; struct sf_buf *sf; struct vnode *vp; VM_OBJECT_LOCK_ASSERT(object, MA_OWNED); error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); vp = object->handle; VM_OBJECT_UNLOCK(object); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ sf = sf_buf_alloc(m, 0); aiov.iov_base = (caddr_t)sf_buf_kva(sf); aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_td = curthread; error = VOP_READ(vp, &auio, 0, curthread->td_ucred); if (!error) { int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t)sf_buf_kva(sf) + count, PAGE_SIZE - count); } sf_buf_free(sf); VM_OBJECT_LOCK(object); } vm_page_lock_queues(); pmap_clear_modify(m); vm_page_undirty(m); vm_page_unlock_queues(); if (!error) m->valid = VM_PAGE_BITS_ALL; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ /* * Local media VFS's that do not implement their own VOP_GETPAGES * should have their VOP_GETPAGES call to vnode_pager_generic_getpages() * to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_GETPAGES. */ static int vnode_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { int rtval; struct vnode *vp; int bytes = count * PAGE_SIZE; int vfslocked; vp = object->handle; VM_OBJECT_UNLOCK(object); vfslocked = VFS_LOCK_GIANT(vp->v_mount); rtval = VOP_GETPAGES(vp, m, bytes, reqpage, 0); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: FS getpages not implemented\n")); VFS_UNLOCK_GIANT(vfslocked); VM_OBJECT_LOCK(object); return rtval; } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_GETPAGES. 
*/ int vnode_pager_generic_getpages(vp, m, bytecount, reqpage) struct vnode *vp; vm_page_t *m; int bytecount; int reqpage; { vm_object_t object; vm_offset_t kva; off_t foff, tfoff, nextoff; int i, j, size, bsize, first; daddr_t firstaddr, reqblock; struct bufobj *bo; int runpg; int runend; struct buf *bp; int count; int error; object = vp->v_object; count = bytecount / PAGE_SIZE; KASSERT(vp->v_type != VCHR && vp->v_type != VBLK, ("vnode_pager_generic_getpages does not support devices")); if (vp->v_iflag & VI_DOOMED) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ /* * originally, we did not check for an error return value -- assuming * an fs always has a bmap entry point -- that assumption is wrong!!! */ foff = IDX_TO_OFF(m[reqpage]->pindex); /* * if we can't bmap, use old VOP code */ error = VOP_BMAP(vp, foff / bsize, &bo, &reqblock, NULL, NULL); if (error == EOPNOTSUPP) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); - cnt.v_vnodein++; - cnt.v_vnodepgsin++; + VMCNT_ADD(vnodein, 1); + VMCNT_ADD(vnodepgsin, 1); error = vnode_pager_input_old(object, m[reqpage]); VM_OBJECT_UNLOCK(object); return (error); } else if (error != 0) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (VM_PAGER_ERROR); /* * if the blocksize is smaller than a page size, then use * special small filesystem code. NFS sometimes has a small * blocksize, but it can handle large reads itself. */ } else if ((PAGE_SIZE / bsize) > 1 && (vp->v_mount->mnt_stat.f_type != nfs_mount_type)) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); - cnt.v_vnodein++; - cnt.v_vnodepgsin++; + VMCNT_ADD(vnodein, 1); + VMCNT_ADD(vnodepgsin, 1); return vnode_pager_input_smlfs(object, m[reqpage]); } /* * If we have a completely valid page available to us, we can * clean up and return. Otherwise we have to re-read the * media. 
*/ VM_OBJECT_LOCK(object); if (m[reqpage]->valid == VM_PAGE_BITS_ALL) { vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return VM_PAGER_OK; } else if (reqblock == -1) { pmap_zero_page(m[reqpage]); vm_page_undirty(m[reqpage]); m[reqpage]->valid = VM_PAGE_BITS_ALL; vm_page_lock_queues(); for (i = 0; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (VM_PAGER_OK); } m[reqpage]->valid = 0; VM_OBJECT_UNLOCK(object); /* * here on direct device I/O */ firstaddr = -1; /* * calculate the run that includes the required page */ for (first = 0, i = 0; i < count; i = runend) { if (vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &firstaddr, &runpg) != 0) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (; i < count; i++) if (i != reqpage) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); return (VM_PAGER_ERROR); } if (firstaddr == -1) { VM_OBJECT_LOCK(object); if (i == reqpage && foff < object->un_pager.vnp.vnp_size) { panic("vnode_pager_getpages: unexpected missing page: firstaddr: %jd, foff: 0x%jx%08jx, vnp_size: 0x%jx%08jx", (intmax_t)firstaddr, (uintmax_t)(foff >> 32), (uintmax_t)foff, (uintmax_t) (object->un_pager.vnp.vnp_size >> 32), (uintmax_t)object->un_pager.vnp.vnp_size); } vm_page_lock_queues(); vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); runend = i + 1; first = runend; continue; } runend = i + runpg; if (runend <= reqpage) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (j = i; j < runend; j++) vm_page_free(m[j]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); } else { if (runpg < (count - first)) { VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = first + runpg; i < count; i++) vm_page_free(m[i]); vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); count = first + runpg; } break; } first = runend; } /* * the first and last page have been calculated now, move input pages * to be zero based... */ if (first != 0) { m += first; count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = IDX_TO_OFF(m[0]->pindex); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; KASSERT(count > 0, ("zero count")); if ((foff + size) > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - foff; KASSERT(size > 0, ("zero size")); /* * round up physical size for real devices. 
*/ if (1) { int secmask = bo->bo_bsize - 1; KASSERT(secmask < PAGE_SIZE && secmask > 0, ("vnode_pager_generic_getpages: sector size %d too large", secmask + 1)); size = (size + secmask) & ~secmask; } bp = getpbuf(&vnode_pbuf_freecnt); kva = (vm_offset_t) bp->b_data; /* * and map the pages to be read into the kva */ pmap_qenter(kva, m, count); /* build a minimal buffer header */ bp->b_iocmd = BIO_READ; bp->b_iodone = bdone; KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred")); KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred")); bp->b_rcred = crhold(curthread->td_ucred); bp->b_wcred = crhold(curthread->td_ucred); bp->b_blkno = firstaddr; pbgetbo(bo, bp); bp->b_bcount = size; bp->b_bufsize = size; bp->b_runningbufspace = bp->b_bufsize; atomic_add_int(&runningbufspace, bp->b_runningbufspace); - cnt.v_vnodein++; - cnt.v_vnodepgsin += count; + VMCNT_ADD(vnodein, 1); + VMCNT_ADD(vnodepgsin, count); /* do the input */ bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); bwait(bp, PVM, "vnread"); if ((bp->b_ioflags & BIO_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } pmap_qremove(kva, count); /* * free the buffer header back to the swap buffer pool */ pbrelbo(bp); relpbuf(bp, &vnode_pbuf_freecnt); VM_OBJECT_LOCK(object); vm_page_lock_queues(); for (i = 0, tfoff = foff; i < count; i++, tfoff = nextoff) { vm_page_t mt; nextoff = tfoff + PAGE_SIZE; mt = m[i]; if (nextoff <= object->un_pager.vnp.vnp_size) { /* * Read filled up entire page. */ mt->valid = VM_PAGE_BITS_ALL; vm_page_undirty(mt); /* should be an assert? XXX */ pmap_clear_modify(mt); } else { /* * Read did not fill up entire page. Since this * is getpages, the page may be mapped, so we have * to zero the invalid portions of the page even * though we aren't setting them valid. * * Currently we do not set the entire page valid, * we just try to clear the piece that we couldn't * read. */ vm_page_set_validclean(mt, 0, object->un_pager.vnp.vnp_size - tfoff); /* handled by vm_fault now */ /* vm_page_zero_invalid(mt, FALSE); */ } if (i != reqpage) { /* * whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere. (it already is in the object). Result: * It appears that empirical results show that * deactivating pages is best. */ /* * just in case someone was asking for this page we * now tell them that it is ok to use */ if (!error) { if (mt->oflags & VPO_WANTED) vm_page_activate(mt); else vm_page_deactivate(mt); vm_page_wakeup(mt); } else { vm_page_free(mt); } } } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); if (error) { printf("vnode_pager_getpages: I/O read error\n"); } return (error ? VM_PAGER_ERROR : VM_PAGER_OK); } /* * EOPNOTSUPP is no longer legal. For local media VFS's that do not * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to * vnode_pager_generic_putpages() to implement the previous behaviour. * * All other FS's should use the bypass to get to the local media * backing vp's VOP_PUTPAGES. */ static void vnode_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int rtval; struct vnode *vp; struct mount *mp; int bytes = count * PAGE_SIZE; /* * Force synchronous operation if we are extremely low on memory * to prevent a low-memory deadlock. VOP operations often need to * allocate more memory to initiate the I/O ( i.e. do a BMAP * operation ).
The swapper handles the case by limiting the amount * of asynchronous I/O, but that sort of solution doesn't scale well * for the vnode pager without a lot of work. * * Also, the backing vnode's iodone routine may not wake the pageout * daemon up. This should be probably be addressed XXX. */ - if ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min) + if ((VMCNT_GET(free_count) + VMCNT_GET(cache_count)) < + VMCNT_GET(pageout_free_min)) sync |= OBJPC_SYNC; /* * Call device-specific putpages function */ vp = object->handle; VM_OBJECT_UNLOCK(object); if (vp->v_type != VREG) mp = NULL; rtval = VOP_PUTPAGES(vp, m, bytes, sync, rtvals, 0); KASSERT(rtval != EOPNOTSUPP, ("vnode_pager: stale FS putpages\n")); VM_OBJECT_LOCK(object); } /* * This is now called from local media FS's to operate against their * own vnodes if they fail to implement VOP_PUTPAGES. * * This is typically called indirectly via the pageout daemon and * clustering has already typically occured, so in general we ask the * underlying filesystem to write the data out asynchronously rather * then delayed. */ int vnode_pager_generic_putpages(vp, m, bytecount, flags, rtvals) struct vnode *vp; vm_page_t *m; int bytecount; int flags; int *rtvals; { int i; vm_object_t object; int count; int maxsize, ncount; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int error; int ioflags; int ppscheck = 0; static struct timeval lastfail; static int curfail; object = vp->v_object; count = bytecount / PAGE_SIZE; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; if ((int64_t)m[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%lx(%lx)\n", (long)m[0]->pindex, (u_long)m[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(m[0]->pindex); /* * If the page-aligned write is larger then the actual file we * have to invalidate pages occuring beyond the file EOF. However, * there is an edge case where a file may not be page-aligned where * the last page is partially invalid. In this case the filesystem * may not properly clear the dirty bits for the entire page (which * could be VM_PAGE_BITS_ALL due to the page having been mmap()d). * With the page locked we are free to fix-up the dirty bits here. * * We do not under any circumstances truncate the valid bits, as * this will screw up bogus page replacement. */ if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) { int pgoff; maxsize = object->un_pager.vnp.vnp_size - poffset; ncount = btoc(maxsize); if ((pgoff = (int)maxsize & PAGE_MASK) != 0) { vm_page_lock_queues(); vm_page_clear_dirty(m[ncount - 1], pgoff, PAGE_SIZE - pgoff); vm_page_unlock_queues(); } } else { maxsize = 0; ncount = 0; } if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } } } /* * pageouts are already clustered, use IO_ASYNC t o force a bawrite() * rather then a bdwrite() to prevent paging I/O from saturating * the buffer cache. Dummy-up the sequential heuristic to cause * large ranges to cluster. If neither IO_SYNC or IO_ASYNC is set, * the system decides how to cluster. */ ioflags = IO_VMIO; if (flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) ioflags |= IO_SYNC; else if ((flags & VM_PAGER_CLUSTER_OK) == 0) ioflags |= IO_ASYNC; ioflags |= (flags & VM_PAGER_PUT_INVAL) ? 
IO_INVAL: 0; ioflags |= IO_SEQMAX << IO_SEQSHIFT; aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_td = (struct thread *) 0; error = VOP_WRITE(vp, &auio, ioflags, curthread->td_ucred); - cnt.v_vnodeout++; - cnt.v_vnodepgsout += ncount; + VMCNT_ADD(vnodeout, 1); + VMCNT_ADD(vnodepgsout, ncount); if (error) { if ((ppscheck = ppsratecheck(&lastfail, &curfail, 1))) printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { if (ppscheck || ppsratecheck(&lastfail, &curfail, 1)) printf("vnode_pager_putpages: residual I/O %d at %lu\n", auio.uio_resid, (u_long)m[0]->pindex); } for (i = 0; i < ncount; i++) { rtvals[i] = VM_PAGER_OK; } return rtvals[0]; } struct vnode * vnode_pager_lock(vm_object_t first_object) { struct vnode *vp; vm_object_t backing_object, object; VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED); for (object = first_object; object != NULL; object = backing_object) { if (object->type != OBJT_VNODE) { if ((backing_object = object->backing_object) != NULL) VM_OBJECT_LOCK(backing_object); if (object != first_object) VM_OBJECT_UNLOCK(object); continue; } retry: if (object->flags & OBJ_DEAD) { if (object != first_object) VM_OBJECT_UNLOCK(object); return NULL; } vp = object->handle; VI_LOCK(vp); VM_OBJECT_UNLOCK(object); if (first_object != object) VM_OBJECT_UNLOCK(first_object); VFS_ASSERT_GIANT(vp->v_mount); if (vget(vp, LK_CANRECURSE | LK_INTERLOCK | LK_RETRY | LK_SHARED, curthread)) { VM_OBJECT_LOCK(first_object); if (object != first_object) VM_OBJECT_LOCK(object); if (object->type != OBJT_VNODE) { if (object != first_object) VM_OBJECT_UNLOCK(object); return NULL; } printf("vnode_pager_lock: retrying\n"); goto retry; } VM_OBJECT_LOCK(first_object); return (vp); } return NULL; }
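
Note on the counter wrappers used throughout the hunks above: every direct access to a cnt.v_* field is rewritten in terms of VMCNT_GET, VMCNT_SET, VMCNT_ADD and VMCNT_PTR. The actual macro definitions belong to sys/vmmeter.h in this revision and are not part of this diff, so the sketch below is only an illustrative approximation of how such wrappers could map back onto the existing global struct vmmeter cnt; in particular, whether VMCNT_SET is a plain store is an assumption here. The atomic form of VMCNT_ADD is suggested by the vm_pageq_add_new_page() hunk, where atomic_add_int(&cnt.v_page_count, 1) becomes VMCNT_ADD(page_count, 1).

/*
 * Illustrative sketch only -- not the definitions shipped in sys/vmmeter.h.
 * Assumes the pre-existing global counter block "struct vmmeter cnt", whose
 * fields are all spelled v_<name>, as seen in the removed (-) lines above.
 */
#include <sys/param.h>
#include <sys/vmmeter.h>
#include <machine/atomic.h>

#define VMCNT_GET(name)		(cnt.v_ ## name)		/* read a counter */
#define VMCNT_PTR(name)		(&cnt.v_ ## name)		/* address, e.g. for wakeup()/msleep() */
#define VMCNT_SET(name, val)	(cnt.v_ ## name = (val))	/* plain store; boot-time tuning (assumed) */
#define VMCNT_ADD(name, num)	atomic_add_int(&cnt.v_ ## name, (num))	/* atomic increment */

With wrappers of this shape, the pageout daemon's cnt.v_pdwakeups++ becomes VMCNT_ADD(pdwakeups, 1) and wakeup(&cnt.v_free_count) becomes wakeup(VMCNT_PTR(free_count)), which is the pattern the converted hunks follow.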