Index: head/sys/amd64/amd64/trap.c
===================================================================
--- head/sys/amd64/amd64/trap.c	(revision 170516)
+++ head/sys/amd64/amd64/trap.c	(revision 170517)
@@ -1,908 +1,904 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * AMD64 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/tss.h>
 
 /* C-level exception and system call handlers defined in this file. */
 extern void trap(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
 
 /* File-local helpers used by trap(). */
 static int trap_pfault(struct trapframe *, int);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 /* Double fault handler; prints a banner and panics. */
 void dblfault_handler(void);
 
 /*
  * Human-readable trap names, indexed by trap number (frame->tf_trapno).
  * MAX_TRAP_MSG is the highest valid index; users of the table compare the
  * trap number against it before indexing.
  */
 #define MAX_TRAP_MSG		30
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"trace trap",				/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 };
 
 #ifdef KDB
 /*
  * machdep.kdb_on_nmi: when nonzero, trap() enters the kernel debugger on
  * an NMI for which isa_nmi() returns 0.
  */
 static int kdb_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
 	&kdb_on_nmi, 0, "Go to KDB on NMI");
 #endif
 /*
  * machdep.panic_on_nmi: when nonzero, trap() treats an NMI for which
  * isa_nmi() returns nonzero as fatal; when zero such an NMI is ignored in
  * kernel mode.
  */
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
 	&panic_on_nmi, 0, "Panic on NMI");
 
 /* System call names; used below only to label diagnostic messages. */
 extern char *syscallnames[];
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  */
 
 /*
  * trap() - common C-level handler for processor exceptions.
  *
  * frame is the register state saved when the exception was taken;
  * frame->tf_trapno selects the handling below.  Traps taken from user mode
  * (ISPL(frame->tf_cs) == SEL_UPL) are translated into a signal number and
  * signal code and posted with trapsignal().  Traps taken from kernel mode
  * are either recovered in place by adjusting the frame or handed to
  * trap_fatal().
  */
 void
 trap(struct trapframe *frame)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	/*
 	 * i: signal number to post for a user-mode trap; ucode: its signal
 	 * code; addr: address reported in the signal information.
 	 */
 	int i = 0, ucode = 0, type, code;
 	register_t addr = 0;
 	ksiginfo_t ksi;
 
 	PCPU_INC(cnt.v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 #ifdef STOP_NMI
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
 #endif /* STOP_NMI */
 #endif /* SMP */
 
 #ifdef KDB
 	/* A trap taken while the debugger is active is handed back to it. */
 	if (kdb_active) {
 		kdb_reenter();
 		goto out;
 	}
 #endif
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * CPU PMCs interrupt using an NMI.  If the PMC module is
 	 * active, pass the 'rip' value to the PMC module's interrupt
 	 * handler.  A return value of '1' from the handler means that
 	 * the NMI was handled by it and we can return immediately.
 	 */
 	if (type == T_NMI && pmc_intr &&
 	    (*pmc_intr)(PCPU_GET(cpuid), (uintptr_t) frame->tf_rip,
 		TRAPF_USERMODE(frame)))
 		goto out;
 #endif
 
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (ISPL(frame->tf_cs) == SEL_UPL)
 			printf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curproc->p_comm, type);
 		else if (type != T_NMI && type != T_BPTFLT &&
 		    type != T_TRCTRAP) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 			/*
 			 * We shouldn't enable interrupts while holding a
 			 * spin lock or servicing an NMI.
 			 */
 			if (type != T_NMI && td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 
 	code = frame->tf_err;
 	if (type == T_PAGEFLT) {
 		/*
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0)
 			trap_fatal(frame, frame->tf_addr);
 	}
 
         if (ISPL(frame->tf_cs) == SEL_UPL) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		/* Default signal address is the trapping instruction. */
 		addr = frame->tf_rip;
 		if (td->td_ucred != p->p_ucred) 
 			cred_update_thread(td);
 
 		/*
 		 * Each arm selects the signal (i) and signal code (ucode)
 		 * to post, or jumps to a label when no signal is needed.
 		 */
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			i = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
 			frame->tf_rflags &= ~PSL_T;
 			i = SIGTRAP;
 			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 			ucode = fputrap();
 			/* fputrap() returning -1 means no signal is posted. */
 			if (ucode == -1)
 				goto userout;
 			i = SIGFPE;
 			break;
 
 		case T_PROTFLT:		/* general protection fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_STKFLT:		/* stack fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			i = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		/*
 		 * Note that the default label shares this arm: any trap
 		 * type not listed in this switch is reported as SIGBUS.
 		 */
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		case T_PAGEFLT:		/* page fault */
 			addr = frame->tf_addr;
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				thread_user_enter(td);
 #endif
 			/*
 			 * Result handling: 0 means the fault was resolved
 			 * and we return through userret(); -1 means return
 			 * without userret(); anything else is a signal
 			 * number.
 			 */
 			i = trap_pfault(frame, TRUE);
 			if (i == -1)
 				goto userout;
 			if (i == 0)
 				goto user;
 
 			/*
 			 * trap_pfault() reports a protection failure as
 			 * SIGBUS; it is delivered as SIGSEGV with
 			 * SEGV_ACCERR instead.
 			 */
 			if (i == SIGSEGV)
 				ucode = SEGV_MAPERR;
 			else {
 				i = SIGSEGV; /* XXX hack */
 				ucode = SEGV_ACCERR;
 			}
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			i = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 			/* machine/parity/power fail/"kitchen sink" faults */
 			/* XXX Giant */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto userout;
 			} else if (panic_on_nmi)
 				panic("NMI indicates hardware failure");
 			break;
 #endif /* DEV_ISA */
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			i = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			i = SIGFPE;
 			break;
 
 		case T_DNA:
 			/* transparent fault (due to context switch "late") */
 			if (fpudna())
 				goto userout;
 			printf("pid %d killed due to lack of floating point\n",
 				p->p_pid);
 			i = SIGKILL;
 			ucode = 0;
 			break;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 			ucode = 0; /* XXX */
 			i = SIGFPE;
 			break;
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		/*
 		 * Arms that recover jump to out; arms that break fall
 		 * through to trap_fatal() after the switch.
 		 */
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			/*
 			 * The result is ignored: in kernel mode
 			 * trap_pfault() calls trap_fatal() itself when the
 			 * fault cannot be resolved.
 			 */
 			(void) trap_pfault(frame, FALSE);
 			goto out;
 
 		case T_DNA:
 			/*
 			 * The kernel is apparently using fpu for copying.
 			 * XXX this should be fatal unless the kernel has
 			 * registered such use.
 			 */
 			if (fpudna()) {
 				printf("fpudna in kernel mode!\n");
 				goto out;
 			}
 			break;
 
 		case T_STKFLT:		/* stack fault */
 			break;
 
 		case T_PROTFLT:		/* general protection fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %rip's and %rsp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
 			if (frame->tf_rip == (long)doreti_iret) {
 				frame->tf_rip = (long)doreti_iret_fault;
 				goto out;
 			}
 			/* Resume at the registered fault handler, if any. */
 			if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
 				frame->tf_rip =
 				    (long)PCPU_GET(curpcb)->pcb_onfault;
 				goto out;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_rflags & PSL_NT) {
 				frame->tf_rflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
 			/*
 			 * Ignore debug register trace traps due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			if (user_dbreg_trap()) {
 				/*
 				 * Reset breakpoint bits because the
 				 * processor doesn't
 				 */
 				/* XXX check upper bits here */
 				load_dr6(rdr6() & 0xfffffff0);
 				goto out;
 			}
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			if (kdb_trap(type, 0, frame))
 				goto out;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 			/* XXX Giant */
 			/* machine/parity/power fail/"kitchen sink" faults */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto out;
 			} else if (panic_on_nmi == 0)
 				goto out;
 			/* FALLTHROUGH */
 #endif /* DEV_ISA */
 		}
 
 		/* Any kernel trap not recovered above is fatal. */
 		trap_fatal(frame, 0);
 		goto out;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
 	/* Post the signal selected by the user-mode switch above. */
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_trapno = type;
 	ksi.ksi_addr = (void *)addr;
 	trapsignal(td, &ksi);
 
 #ifdef DEBUG
 	if (type <= MAX_TRAP_MSG) {
 		uprintf("fatal process exception: %s",
 			trap_msg[type]);
 		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 			uprintf(", fault VA = 0x%lx", frame->tf_addr);
 		uprintf("\n");
 	}
 #endif
 
 	/*
 	 * user: return to user mode through userret().
 	 * userout/out: return without calling userret().
 	 */
 user:
 	userret(td, frame);
 	mtx_assert(&Giant, MA_NOTOWNED);
 userout:
 out:
 	return;
 }
 
 /*
  * trap_pfault() - try to resolve a page fault through vm_fault().
  *
  * frame is the trap frame of the fault (frame->tf_addr holds the faulting
  * address, frame->tf_err the page fault error bits).  usermode is nonzero
  * when the fault was taken from user mode.
  *
  * Returns 0 when the fault was resolved, or when a kernel-mode fault was
  * redirected to the pcb_onfault handler.  For an unresolved kernel-mode
  * fault trap_fatal() is called and -1 is returned.  For an unresolved
  * user-mode fault the signal to post is returned: SIGBUS when vm_fault()
  * reported KERN_PROTECTION_FAILURE, SIGSEGV otherwise.
  */
 static int
 trap_pfault(struct trapframe *frame, int usermode)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct vmspace *vm;
 	vm_offset_t eva, va;
 	vm_map_t map;
 	vm_prot_t ftype;
 	int rv;
 
 	rv = 0;
 	eva = frame->tf_addr;
 	va = trunc_page(eva);
 
 	/* Select the map that covers the faulting page. */
 	if (va >= KERNBASE) {
 		/* User mode may never fault in kernel address space. */
 		if (usermode)
 			goto nogo;
 		map = kernel_map;
 	} else {
 		/*
 		 * Fault on non-kernel virtual memory.  Without a process,
 		 * or without an address space for it, the fault cannot be
 		 * resolved.
 		 */
 		vm = (p != NULL) ? p->p_vmspace : NULL;
 		if (vm == NULL)
 			goto nogo;
 		map = &vm->vm_map;
 	}
 
 	/*
 	 * Derive the access type from the error code.  PGEX_I is defined
 	 * only if the execute disable bit capability is supported and
 	 * enabled, hence the additional pg_nx test.
 	 */
 	ftype = VM_PROT_READ;
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 
 	if (map == kernel_map) {
 		/*
 		 * No process locking or stack concerns for faults in the
 		 * kernel map.
 		 */
 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	} else {
 		/*
 		 * Hold p_lock across the fault so that swapout leaves the
 		 * process alone during this critical time.
 		 */
 		PROC_LOCK(p);
 		++p->p_lock;
 		PROC_UNLOCK(p);
 
 		/* Fault in the user page. */
 		rv = vm_fault(map, va, ftype,
 		    (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY :
 		    VM_FAULT_NORMAL);
 
 		PROC_LOCK(p);
 		--p->p_lock;
 		PROC_UNLOCK(p);
 	}
 	if (rv == KERN_SUCCESS)
 		return (0);
 
 nogo:
 	/* User mode: tell the caller which signal to deliver. */
 	if (usermode)
 		return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 
 	/*
 	 * Kernel mode: outside of interrupt context a registered onfault
 	 * handler takes over; anything else is fatal.
 	 */
 	if (td->td_intr_nesting_level == 0 &&
 	    PCPU_GET(curpcb)->pcb_onfault != NULL) {
 		frame->tf_rip = (long)PCPU_GET(curpcb)->pcb_onfault;
 		return (0);
 	}
 	trap_fatal(frame, eva);
 	return (-1);
 }
 
 /*
  * trap_fatal() - report an unrecoverable trap on the console and panic.
  *
  * frame is the trap frame of the offending trap; eva is the faulting
  * virtual address and is printed only for page faults.  With KDB compiled
  * in, the function returns to its caller when kdb_trap() reports that it
  * handled the trap; otherwise it ends in panic().
  */
 static void
 trap_fatal(struct trapframe *frame, vm_offset_t eva)
 {
 	struct soft_segment_descriptor softseg;
 	char *msg;
 	long sp;
 	int code, from_user, ss, type;
 
 	type = frame->tf_trapno;
 	code = frame->tf_err;
 	from_user = (ISPL(frame->tf_cs) == SEL_UPL);
 	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg);
 
 	msg = (type <= MAX_TRAP_MSG) ? trap_msg[type] : "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    from_user ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 
 	/* Page faults additionally report the address and decoded cause. */
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%lx\n", eva);
 		printf("fault code		= %s %s %s, %s\n",
 		    code & PGEX_U ? "user" : "supervisor",
 		    code & PGEX_W ? "write" : "read",
 		    code & PGEX_I ? "instruction" : "data",
 		    code & PGEX_P ? "protection violation" :
 		    "page not present");
 	}
 	printf("instruction pointer	= 0x%lx:0x%lx\n",
 	    frame->tf_cs & 0xffff, frame->tf_rip);
 
 	/*
 	 * For a kernel-mode trap the kernel data selector and the address
 	 * of the frame's tf_rsp slot are reported instead of the saved
 	 * ss/rsp values.
 	 */
 	if (!from_user) {
 		ss = GSEL(GDATA_SEL, SEL_KPL);
 		sp = (long)&frame->tf_rsp;
 	} else {
 		ss = frame->tf_ss & 0xffff;
 		sp = frame->tf_rsp;
 	}
 	printf("stack pointer	        = 0x%x:0x%lx\n", ss, sp);
 	printf("frame pointer	        = 0x%x:0x%lx\n", ss, frame->tf_rbp);
 	printf("code segment		= base 0x%lx, limit 0x%lx, type 0x%x\n",
 	    softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, long %d, def32 %d, gran %d\n",
 	    softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long,
 	    softseg.ssd_def32, softseg.ssd_gran);
 
 	/* Decode the interesting rflags bits. */
 	printf("processor eflags	= ");
 	if (frame->tf_rflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_rflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_rflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_rflags & PSL_RF)
 		printf("resume, ");
 	printf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
 
 	printf("current process		= ");
 	if (!curproc) {
 		printf("Idle\n");
 	} else {
 		printf("%lu (%s)\n",
 		    (u_long)curproc->p_pid, curproc->p_comm ?
 		    curproc->p_comm : "");
 	}
 
 #ifdef KDB
 	/* Give the debugger a chance; if it handles the trap, do not panic. */
 	if ((debugger_on_panic || kdb_active) && kdb_trap(type, 0, frame))
 		return;
 #endif
 	printf("trap number		= %d\n", type);
 	if (type > MAX_TRAP_MSG)
 		panic("unknown/reserved trap");
 	else
 		panic("%s", trap_msg[type]);
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  */
 void
 dblfault_handler(void)
 {
 
 	/* Announce the double fault before anything else can go wrong. */
 	printf("\nFatal double fault\n");
 #ifdef SMP
 	/*
 	 * Identify the CPU.  Kept as two separate prints in case of a
 	 * trap on an unmapped page.
 	 */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	/* There is no recovery from a double fault. */
 	panic("double fault");
 }
 
 /*
  *	syscall -	system call request C handler
  *
  *	A system call is essentially treated as a trap.
  */
 void
 syscall(struct trapframe *frame)
 {
 	/*
 	 * params: user address of the arguments that did not fit in
 	 * registers.  args: the assembled argument vector.  reg/regcnt:
 	 * index of the first register argument to copy and the number of
 	 * arguments taken from registers.  orig_tf_rflags: rflags as they
 	 * were on entry, consulted for PSL_T after the call.
 	 */
 	caddr_t params;
 	struct sysent *callp;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	register_t orig_tf_rflags;
 	int error;
 	int narg;
 	register_t args[8];
 	register_t *argp;
 	u_int code;
 	int reg, regcnt;
 	ksiginfo_t ksi;
 
-	/*
-	 * note: PCPU_INC() can only be used if we can afford
-	 * occassional inaccuracy in the count.
-	 */
 	PCPU_INC(cnt.v_syscall);
 
 #ifdef DIAGNOSTIC
 	if (ISPL(frame->tf_cs) != SEL_UPL) {
 		panic("syscall");
 		/* NOT REACHED */
 	}
 #endif
 
 	reg = 0;
 	regcnt = 6;
 	td->td_pticks = 0;
 	td->td_frame = frame;
 	if (td->td_ucred != p->p_ucred) 
 		cred_update_thread(td);
 #ifdef KSE
 	if (p->p_flag & P_SA)
 		thread_user_enter(td);
 #endif
 	/*
 	 * Stack-passed arguments are read starting one register_t above
 	 * the user stack pointer (presumably skipping the return address
 	 * -- TODO confirm).
 	 */
 	params = (caddr_t)frame->tf_rsp + sizeof(register_t);
 	code = frame->tf_rax;
 	orig_tf_rflags = frame->tf_rflags;
 
 	if (p->p_sysent->sv_prepsyscall) {
 		/*
 		 * The prep code is MP aware.
 		 */
 		(*p->p_sysent->sv_prepsyscall)(frame, (int *)args, &code, &params);
 	} else {
 		/*
 		 * Indirect system call: the real number is in %rdi, so
 		 * the register arguments start one slot later and there
 		 * is one fewer of them.
 		 */
 		if (code == SYS_syscall || code == SYS___syscall) {
 			code = frame->tf_rdi;
 			reg++;
 			regcnt--;
 		}
 	}
 
  	if (p->p_sysent->sv_mask)
  		code &= p->p_sysent->sv_mask;
 
 	/* Out-of-range numbers are dispatched to table entry 0. */
  	if (code >= p->p_sysent->sv_size)
  		callp = &p->p_sysent->sv_table[0];
   	else
  		callp = &p->p_sysent->sv_table[code];
 
 	narg = callp->sy_narg;
 
 	/*
 	 * copyin and the ktrsyscall()/ktrsysret() code is MP-aware
 	 */
 	KASSERT(narg <= sizeof(args) / sizeof(args[0]),
 	    ("Too many syscall arguments!"));
 	error = 0;
 	/*
 	 * Copy the register arguments out of the trap frame, starting at
 	 * tf_rdi (offset by reg), then fetch any remaining arguments from
 	 * user memory at params.
 	 */
 	argp = &frame->tf_rdi;
 	argp += reg;
 	bcopy(argp, args, sizeof(args[0]) * regcnt);
 	if (narg > regcnt) {
 		KASSERT(params != NULL, ("copyin args with no params!"));
 		error = copyin(params, &args[regcnt],
 	    		(narg - regcnt) * sizeof(args[0]));
 	}
 	argp = &args[0];
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSCALL))
 		ktrsyscall(code, narg, argp);
 #endif
 
 	CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 
 	td->td_syscalls++;
 
 	/* The handler runs only if all arguments were fetched. */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->tf_rdx;
 
 		STOPEVENT(p, S_SCE, narg);
 
 		PTRACESTOP_SC(p, td, S_PT_SCE);
 
 		AUDIT_SYSCALL_ENTER(code, td);
 		error = (*callp->sy_call)(td, argp);
 		AUDIT_SYSCALL_EXIT(error, td);
 	}
 
 	/*
 	 * Publish the result in the frame: values in %rax/%rdx with the
 	 * carry flag clear on success, the error number in %rax with the
 	 * carry flag set on failure.
 	 */
 	switch (error) {
 	case 0:
 		frame->tf_rax = td->td_retval[0];
 		frame->tf_rdx = td->td_retval[1];
 		frame->tf_rflags &= ~PSL_C;
 		break;
 
 	case ERESTART:
 		/*
 		 * Reconstruct pc, we know that 'syscall' is 2 bytes.
 		 * We have to do a full context restore so that %r10
 		 * (which was holding the value of %rcx) is restored for
 		 * the next iteration.
 		 */
 		frame->tf_rip -= frame->tf_err;
 		frame->tf_r10 = frame->tf_rcx;
 		td->td_pcb->pcb_flags |= PCB_FULLCTX;
 		break;
 
 	case EJUSTRETURN:
 		/* The frame is left exactly as the handler set it up. */
 		break;
 
 	default:
 		/* Translate the error through the ABI's table, if any. */
  		if (p->p_sysent->sv_errsize) {
  			if (error >= p->p_sysent->sv_errsize)
   				error = -1;	/* XXX */
    			else
   				error = p->p_sysent->sv_errtbl[error];
 		}
 		frame->tf_rax = error;
 		frame->tf_rflags |= PSL_C;
 		break;
 	}
 
 	/*
 	 * Traced syscall.
 	 */
 	if (orig_tf_rflags & PSL_T) {
 		frame->tf_rflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)frame->tf_rip;
 		trapsignal(td, &ksi);
 	}
 
 	/*
 	 * Check for misbehavior.
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???");
 	KASSERT(td->td_critnest == 0,
 	    ("System call %s returning in a critical section",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"));
 	KASSERT(td->td_locks == 0,
 	    ("System call %s returning with %d locks held",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???",
 	    td->td_locks));
 
 	/*
 	 * Handle reschedule and other end-of-syscall issues
 	 */
 	userret(td, frame);
 
 	CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(code, error, td->td_retval[0]);
 #endif
 
 	/*
 	 * This works because errno is findable through the
 	 * register set.  If we ever support an emulation where this
 	 * is not the case, this code will need to be revisited.
 	 */
 	STOPEVENT(p, S_SCX, code);
 
 	PTRACESTOP_SC(p, td, S_PT_SCX);
 }
Index: head/sys/amd64/ia32/ia32_syscall.c
===================================================================
--- head/sys/amd64/ia32/ia32_syscall.c	(revision 170516)
+++ head/sys/amd64/ia32/ia32_syscall.c	(revision 170517)
@@ -1,284 +1,279 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * IA-32 (32-bit compatibility) system call handling for amd64
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_isa.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/ptrace.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 
 /* Form the symbol name of an IDT entry point by prefixing it with 'X'. */
 #define	IDTVEC(name)	__CONCAT(X,name)
 
 /* IDT entry points installed and removed by the SYSINIT hooks below. */
 extern inthand_t IDTVEC(int0x80_syscall), IDTVEC(rsvd);
 /* System call names; used only to label diagnostic messages. */
 extern const char *freebsd32_syscallnames[];
 
 void ia32_syscall(struct trapframe *frame);	/* Called from asm code */
 
 /*
  * ia32_syscall() - C handler for 32-bit system calls; called from
  * assembly code.
  *
  * frame is the saved register state: tf_rax carries the system call
  * number and tf_rsp the user stack from which the arguments are fetched
  * as 32-bit words.  The arguments are widened to 64 bits before the
  * handler from the process's system call table is invoked.  The result is
  * returned to the caller through the frame (tf_rax/tf_rdx and the carry
  * flag in tf_rflags).
  */
 void
 ia32_syscall(struct trapframe *frame)
 {
 	caddr_t params;
 	int i;
 	struct sysent *callp;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	register_t orig_tf_rflags;
 	int error;
 	int narg;
 	u_int32_t args[8];
 	u_int64_t args64[8];
 	u_int code;
 	ksiginfo_t ksi;
 
-	/*
-	 * note: PCPU_INC() can only be used if we can afford
-	 * occassional inaccuracy in the count.
-	 */
 	PCPU_INC(cnt.v_syscall);
-
 	td->td_pticks = 0;
 	td->td_frame = frame;
 	if (td->td_ucred != p->p_ucred)
 		cred_update_thread(td);
 
 	/*
 	 * Start from a zeroed argument vector.  args[] is widened into
 	 * args64[] below even when copyin() fails or is skipped, and the
 	 * result is handed to ktrsyscall(); without this, stale kernel
 	 * stack contents would be read in those cases.
 	 */
 	bzero(args, sizeof(args));
 
 	/* Arguments are read starting one 32-bit word above the user %rsp. */
 	params = (caddr_t)frame->tf_rsp + sizeof(u_int32_t);
 	code = frame->tf_rax;
 	orig_tf_rflags = frame->tf_rflags;
 
 	if (p->p_sysent->sv_prepsyscall) {
 		/*
 		 * The prep code is MP aware.
 		 */
 		(*p->p_sysent->sv_prepsyscall)(frame, args, &code, &params);
 	} else {
 		/*
 		 * Need to check if this is a 32 bit or 64 bit syscall.
 		 * fuword is MP aware.
 		 */
 		if (code == SYS_syscall) {
 			/*
 			 * Code is first argument, followed by actual args.
 			 */
 			code = fuword32(params);
 			params += sizeof(int);
 		} else if (code == SYS___syscall) {
 			/*
 			 * Like syscall, but code is a quad, so as to maintain
 			 * quad alignment for the rest of the arguments.
 			 * We use a 32-bit fetch in case params is not
 			 * aligned.
 			 */
 			code = fuword32(params);
 			params += sizeof(quad_t);
 		}
 	}
 
 	if (p->p_sysent->sv_mask)
 		code &= p->p_sysent->sv_mask;
 
 	/* Out-of-range numbers are dispatched to table entry 0. */
 	if (code >= p->p_sysent->sv_size)
 		callp = &p->p_sysent->sv_table[0];
 	else
 		callp = &p->p_sysent->sv_table[code];
 
 	narg = callp->sy_narg;
 
 	/*
 	 * Both argument arrays hold a fixed number of entries; assert the
 	 * bound the same way syscall() does before filling them.
 	 */
 	KASSERT(narg <= sizeof(args) / sizeof(args[0]),
 	    ("Too many syscall arguments!"));
 
 	/*
 	 * copyin and the ktrsyscall()/ktrsysret() code is MP-aware
 	 */
 	if (params != NULL && narg != 0)
 		error = copyin(params, (caddr_t)args,
 		    (u_int)(narg * sizeof(int)));
 	else
 		error = 0;
 
 	/* Widen the 32-bit arguments for the 64-bit handler interface. */
 	for (i = 0; i < narg; i++)
 		args64[i] = args[i];
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSCALL))
 		ktrsyscall(code, narg, args64);
 #endif
 	CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 
 	/* The handler runs only if all arguments were fetched. */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->tf_rdx;
 
 		STOPEVENT(p, S_SCE, narg);
 
 		PTRACESTOP_SC(p, td, S_PT_SCE);
 
 		AUDIT_SYSCALL_ENTER(code, td);
 		error = (*callp->sy_call)(td, args64);
 		AUDIT_SYSCALL_EXIT(error, td);
 	}
 
 	/*
 	 * Publish the result in the frame: values in %rax/%rdx with the
 	 * carry flag clear on success, the error number in %rax with the
 	 * carry flag set on failure.
 	 */
 	switch (error) {
 	case 0:
 		frame->tf_rax = td->td_retval[0];
 		frame->tf_rdx = td->td_retval[1];
 		frame->tf_rflags &= ~PSL_C;
 		break;
 
 	case ERESTART:
 		/*
 		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
 		 * int 0x80 is 2 bytes. We saved this in tf_err.
 		 */
 		frame->tf_rip -= frame->tf_err;
 		break;
 
 	case EJUSTRETURN:
 		/* The frame is left exactly as the handler set it up. */
 		break;
 
 	default:
 		/* Translate the error through the ABI's table, if any. */
 		if (p->p_sysent->sv_errsize) {
 			if (error >= p->p_sysent->sv_errsize)
 				error = -1;	/* XXX */
 			else
 				error = p->p_sysent->sv_errtbl[error];
 		}
 		frame->tf_rax = error;
 		frame->tf_rflags |= PSL_C;
 		break;
 	}
 
 	/*
 	 * Traced syscall.
 	 */
 	if (orig_tf_rflags & PSL_T) {
 		frame->tf_rflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)frame->tf_rip;
 		trapsignal(td, &ksi);
 	}
 
 	/*
 	 * Check for misbehavior.
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???");
 	KASSERT(td->td_critnest == 0,
 	    ("System call %s returning in a critical section",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???"));
 	KASSERT(td->td_locks == 0,
 	    ("System call %s returning with %d locks held",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? freebsd32_syscallnames[code] : "???",
 	    td->td_locks));
 
 	/*
 	 * Handle reschedule and other end-of-syscall issues
 	 */
 	userret(td, frame);
 
 	CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(code, error, td->td_retval[0]);
 #endif
 
 	/*
 	 * This works because errno is findable through the
 	 * register set.  If we ever support an emulation where this
 	 * is not the case, this code will need to be revisited.
 	 */
 	STOPEVENT(p, S_SCX, code);
 
 	PTRACESTOP_SC(p, td, S_PT_SCX);
 }
 
 
 /*
  * Install the int0x80_syscall entry point in the IDT_SYSCALL slot as an
  * interrupt gate with SEL_UPL privilege.  The argument is not used.
  */
 static void
 ia32_syscall_enable(void *dummy)
 {
 	setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall),
 	    SDT_SYSIGT, SEL_UPL, 0);
 }
 
 /*
  * Point the IDT_SYSCALL slot back at the rsvd entry point with SEL_KPL
  * privilege.  The argument is not used.
  */
 static void
 ia32_syscall_disable(void *dummy)
 {
 	setidt(IDT_SYSCALL, &IDTVEC(rsvd),
 	    SDT_SYSIGT, SEL_KPL, 0);
 }
 
 /*
  * Register the hooks above: ia32_syscall_enable() with SYSINIT and
  * ia32_syscall_disable() with SYSUNINIT, both at SI_SUB_EXEC.
  */
 SYSINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_enable, NULL);
 SYSUNINIT(ia32_syscall, SI_SUB_EXEC, SI_ORDER_ANY, ia32_syscall_disable, NULL);
Index: head/sys/i386/i386/trap.c
===================================================================
--- head/sys/i386/i386/trap.c	(revision 170516)
+++ head/sys/i386/i386/trap.c	(revision 170517)
@@ -1,1094 +1,1090 @@
 /*-
  * Copyright (C) 1994, David Greenman
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the University of Utah, and William Jolitz.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * 386 Trap and System call handling
  */
 
 #include "opt_clock.h"
 #include "opt_cpu.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_isa.h"
 #include "opt_kdb.h"
 #include "opt_ktrace.h"
 #include "opt_npx.h"
 #include "opt_trap.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/ptrace.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_extern.h>
 
 #include <machine/cpu.h>
 #include <machine/intr_machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/tss.h>
 #include <machine/vm86.h>
 
 #ifdef POWERFAIL_NMI
 #include <sys/syslog.h>
 #include <machine/clock.h>
 #endif
 
 /*
  * Entry points defined in this file.  trap() is called from the assembly
  * IDT gate entry routines (see the comment above trap()); syscall() is
  * presumably reached the same way -- confirm against the assembly stubs.
  */
 extern void trap(struct trapframe *frame);
 extern void syscall(struct trapframe *frame);
 
 /* Fault-handling helpers and the double fault handler defined below. */
 static int trap_pfault(struct trapframe *, int, vm_offset_t);
 static void trap_fatal(struct trapframe *, vm_offset_t);
 void dblfault_handler(void);
 
 /*
  * The lcall syscall entry vector; trap() compares the saved %eip against
  * its address when handling kernel-mode trace traps.
  */
 extern inthand_t IDTVEC(lcall_syscall);
 
 /*
  * Human-readable names for trap numbers, indexed directly by trap type
  * (the T_* constant noted beside each entry).  MAX_TRAP_MSG is the
  * highest valid index; users of the table check "type <= MAX_TRAP_MSG"
  * before indexing.  Unused trap numbers map to the empty string.
  */
 #define MAX_TRAP_MSG		30
 static char *trap_msg[] = {
 	"",					/*  0 unused */
 	"privileged instruction fault",		/*  1 T_PRIVINFLT */
 	"",					/*  2 unused */
 	"breakpoint instruction fault",		/*  3 T_BPTFLT */
 	"",					/*  4 unused */
 	"",					/*  5 unused */
 	"arithmetic trap",			/*  6 T_ARITHTRAP */
 	"",					/*  7 unused */
 	"",					/*  8 unused */
 	"general protection fault",		/*  9 T_PROTFLT */
 	"trace trap",				/* 10 T_TRCTRAP */
 	"",					/* 11 unused */
 	"page fault",				/* 12 T_PAGEFLT */
 	"",					/* 13 unused */
 	"alignment fault",			/* 14 T_ALIGNFLT */
 	"",					/* 15 unused */
 	"",					/* 16 unused */
 	"",					/* 17 unused */
 	"integer divide fault",			/* 18 T_DIVIDE */
 	"non-maskable interrupt trap",		/* 19 T_NMI */
 	"overflow trap",			/* 20 T_OFLOW */
 	"FPU bounds check fault",		/* 21 T_BOUND */
 	"FPU device not available",		/* 22 T_DNA */
 	"double fault",				/* 23 T_DOUBLEFLT */
 	"FPU operand fetch fault",		/* 24 T_FPOPFLT */
 	"invalid TSS fault",			/* 25 T_TSSFLT */
 	"segment not present fault",		/* 26 T_SEGNPFLT */
 	"stack fault",				/* 27 T_STKFLT */
 	"machine check trap",			/* 28 T_MCHK */
 	"SIMD floating-point exception",	/* 29 T_XMMFLT */
 	"reserved (unknown) fault",		/* 30 T_RESERVED */
 };
 
 /*
  * Set elsewhere; consulted by trap_pfault() to recognize the Pentium
  * F00F workaround fault on the IDT.
  */
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 extern int has_f00f_bug;
 #endif
 
 /*
  * Run-time tunables (machdep.kdb_on_nmi, machdep.panic_on_nmi) that select
  * what the T_NMI cases in trap() do; both default to 1.
  */
 #ifdef KDB
 static int kdb_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
 	&kdb_on_nmi, 0, "Go to KDB on NMI");
 #endif
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
 	&panic_on_nmi, 0, "Panic on NMI");
 
 /* System call names, used by syscall() in its diagnostic messages. */
 extern char *syscallnames[];
 
 /*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this
  * frame after the exception has been processed.
  *
  * The handler takes one of two paths, chosen from the saved %cs and
  * %eflags in the frame:
  *
  *  - User trap (SEL_UPL, or PSL_VM without PCB_VM86CALL set): the trap
  *    type is mapped to a signal number (i) and signal code (ucode),
  *    optionally translated through the ABI's sv_transtrap hook, and
  *    delivered with trapsignal().  Control then reaches the "user"
  *    label, which calls userret().
  *
  *  - Kernel trap: each case either recovers and jumps to "out", or
  *    breaks out of the switch, in which case trap_fatal() is called.
  *
  * The "userout" and "out" labels return without calling userret().
  */
 
 void
 trap(struct trapframe *frame)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	int i = 0, ucode = 0, type, code;
 	register_t addr = 0;
 	vm_offset_t eva;
 	ksiginfo_t ksi;
 #ifdef POWERFAIL_NMI
 	static int lastalert = 0;
 #endif
 
 	PCPU_INC(cnt.v_trap);
 	type = frame->tf_trapno;
 
 #ifdef SMP
 #ifdef STOP_NMI
 	/* Handler for NMI IPIs used for stopping CPUs. */
 	if (type == T_NMI) {
 	         if (ipi_nmi_handler() == 0)
 	                   goto out;
 	}
 #endif /* STOP_NMI */
 #endif /* SMP */
 
 #ifdef KDB
 	/* A trap taken while the debugger is active goes back to it. */
 	if (kdb_active) {
 		kdb_reenter();
 		goto out;
 	}
 #endif
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * CPU PMCs interrupt using an NMI so we check for that first.
 	 * If the HWPMC module is active, 'pmc_hook' will point to
 	 * the function to be called.  A return value of '1' from the
 	 * hook means that the NMI was handled by it and that we can
 	 * return immediately.
 	 */
 	if (type == T_NMI && pmc_intr &&
 	    (*pmc_intr)(PCPU_GET(cpuid), (uintptr_t) frame->tf_eip,
 		TRAPF_USERMODE(frame)))
 	    goto out;
 #endif
 
 	if ((frame->tf_eflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled
 		 * interrupts and then trapped.  Enabling interrupts
 		 * now is wrong, but it is better than running with
 		 * interrupts disabled until they are accidentally
 		 * enabled later.
 		 */
 		if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
 			printf(
 			    "pid %ld (%s): trap %d with interrupts disabled\n",
 			    (long)curproc->p_pid, curproc->p_comm, type);
 		else if (type != T_BPTFLT && type != T_TRCTRAP &&
 			 frame->tf_eip != (int)cpu_switch_load_gs) {
 			/*
 			 * XXX not quite right, since this may be for a
 			 * multiple fault in user mode.
 			 */
 			printf("kernel trap %d with interrupts disabled\n",
 			    type);
 			/*
 			 * Page faults need interrupts disabled until later,
 			 * and we shouldn't enable interrupts while holding
 			 * a spin lock or if servicing an NMI.
 			 */
 			if (type != T_NMI && type != T_PAGEFLT &&
 			    td->td_md.md_spinlock_count == 0)
 				enable_intr();
 		}
 	}
 
 	eva = 0;
 	code = frame->tf_err;
 	if (type == T_PAGEFLT) {
 		/*
 		 * For some Cyrix CPUs, %cr2 is clobbered by
 		 * interrupts.  This problem is worked around by using
 		 * an interrupt gate for the pagefault handler.  We
 		 * are finally ready to read %cr2 and then must
 		 * reenable interrupts.
 		 *
 		 * If we get a page fault while in a critical section, then
 		 * it is most likely a fatal kernel page fault.  The kernel
 		 * is already going to panic trying to get a sleep lock to
 		 * do the VM lookup, so just consider it a fatal trap so the
 		 * kernel can print out a useful trap message and even get
 		 * to the debugger.
 		 *
 		 * If we get a page fault while holding a non-sleepable
 		 * lock, then it is most likely a fatal kernel page fault.
 		 * If WITNESS is enabled, then it's going to whine about
 		 * bogus LORs with various VM locks, so just skip to the
 		 * fatal trap handling directly.
 		 */
 		eva = rcr2();
 		if (td->td_critnest != 0 ||
 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
 		    "Kernel page fault") != 0)
 			trap_fatal(frame, eva);
 		else
 			enable_intr();
 	}
 
         if ((ISPL(frame->tf_cs) == SEL_UPL) ||
 	    ((frame->tf_eflags & PSL_VM) && 
 		!(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) {
 		/* user trap */
 
 		td->td_pticks = 0;
 		td->td_frame = frame;
 		/* Default signal address; the page fault case overrides it. */
 		addr = frame->tf_eip;
 		if (td->td_ucred != p->p_ucred) 
 			cred_update_thread(td);
 
 		switch (type) {
 		case T_PRIVINFLT:	/* privileged instruction fault */
 			i = SIGILL;
 			ucode = ILL_PRVOPC;
 			break;
 
 		case T_BPTFLT:		/* bpt instruction fault */
 		case T_TRCTRAP:		/* trace trap */
 			enable_intr();
 			frame->tf_eflags &= ~PSL_T;
 			i = SIGTRAP;
 			ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
 			break;
 
 		case T_ARITHTRAP:	/* arithmetic trap */
 #ifdef DEV_NPX
 			ucode = npxtrap();
 			if (ucode == -1)
 				goto userout;
 #else
 			ucode = 0;
 #endif
 			i = SIGFPE;
 			break;
 
 			/*
 			 * The following two traps can happen in
 			 * vm86 mode, and, if so, we want to handle
 			 * them specially.
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
 			if (frame->tf_eflags & PSL_VM) {
 				/*
 				 * A zero return means no signal is needed;
 				 * otherwise the return value is used as the
 				 * signal number.
 				 */
 				i = vm86_emulate((struct vm86frame *)frame);
 				if (i == 0)
 					goto user;
 				break;
 			}
 			i = SIGBUS;
 			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
 			break;
 		case T_SEGNPFLT:	/* segment not present fault */
 			i = SIGBUS;
 			ucode = BUS_ADRERR;
 			break;
 		case T_TSSFLT:		/* invalid TSS fault */
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 		case T_DOUBLEFLT:	/* double fault */
 		default:
 			i = SIGBUS;
 			ucode = BUS_OBJERR;
 			break;
 
 		/*
 		 * The cases below still belong to this switch even though
 		 * they follow the default label.
 		 */
 		case T_PAGEFLT:		/* page fault */
 #ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				thread_user_enter(td);
 #endif
 
 			i = trap_pfault(frame, TRUE, eva);
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 			if (i == -2) {
 				/*
 				 * The f00f hack workaround has triggered, so
 				 * treat the fault as an illegal instruction 
 				 * (T_PRIVINFLT) instead of a page fault.
 				 */
 				type = frame->tf_trapno = T_PRIVINFLT;
 
 				/* Proceed as in that case. */
 				ucode = ILL_PRVOPC;
 				i = SIGILL;
 				break;
 			}
 #endif
 			if (i == -1)
 				goto userout;
 			if (i == 0)
 				goto user;
 
 			/*
 			 * Any failure is reported as SIGSEGV; only the code
 			 * distinguishes the signal trap_pfault() returned.
 			 */
 			if (i == SIGSEGV)
 				ucode = SEGV_MAPERR;
 			else {
 				i = SIGSEGV; /* XXX hack */
 				ucode = SEGV_ACCERR;
 			}
 			addr = eva;
 			break;
 
 		case T_DIVIDE:		/* integer divide fault */
 			ucode = FPE_INTDIV;
 			i = SIGFPE;
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 #ifdef POWERFAIL_NMI
 #ifndef TIMER_FREQ
 #  define TIMER_FREQ 1193182
 #endif
 			mtx_lock(&Giant);
 			if (time_second - lastalert > 10) {
 				log(LOG_WARNING, "NMI: power fail\n");
 				sysbeep(TIMER_FREQ/880, hz);
 				lastalert = time_second;
 			}
 			mtx_unlock(&Giant);
 			goto userout;
 #else /* !POWERFAIL_NMI */
 			/* machine/parity/power fail/"kitchen sink" faults */
 			/* XXX Giant */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto userout;
 			} else if (panic_on_nmi)
 				panic("NMI indicates hardware failure");
 			/*
 			 * NOTE(review): when panic_on_nmi is 0 this breaks
 			 * with i still 0, so trapsignal() below is reached
 			 * with signal number 0 unless sv_transtrap changes
 			 * it -- confirm this is intended.
 			 */
 			break;
 #endif /* POWERFAIL_NMI */
 #endif /* DEV_ISA */
 
 		case T_OFLOW:		/* integer overflow fault */
 			ucode = FPE_INTOVF;
 			i = SIGFPE;
 			break;
 
 		case T_BOUND:		/* bounds check fault */
 			ucode = FPE_FLTSUB;
 			i = SIGFPE;
 			break;
 
 		case T_DNA:
 #ifdef DEV_NPX
 			/* transparent fault (due to context switch "late") */
 			if (npxdna())
 				goto userout;
 #endif
 			printf("pid %d killed due to lack of floating point\n",
 				p->p_pid);
 			i = SIGKILL;
 			ucode = 0;
 			break;
 
 		case T_FPOPFLT:		/* FPU operand fetch fault */
 			ucode = ILL_COPROC;
 			i = SIGILL;
 			break;
 
 		case T_XMMFLT:		/* SIMD floating-point exception */
 			ucode = 0; /* XXX */
 			i = SIGFPE;
 			break;
 		}
 	} else {
 		/* kernel trap */
 
 		KASSERT(cold || td->td_ucred != NULL,
 		    ("kernel trap doesn't have ucred"));
 		switch (type) {
 		case T_PAGEFLT:			/* page fault */
 			(void) trap_pfault(frame, FALSE, eva);
 			goto out;
 
 		case T_DNA:
 #ifdef DEV_NPX
 			/*
 			 * The kernel is apparently using npx for copying.
 			 * XXX this should be fatal unless the kernel has
 			 * registered such use.
 			 */
 			if (npxdna())
 				goto out;
 #endif
 			break;
 
 			/*
 			 * The following two traps can happen in
 			 * vm86 mode, and, if so, we want to handle
 			 * them specially.
 			 */
 		case T_PROTFLT:		/* general protection fault */
 		case T_STKFLT:		/* stack fault */
 			if (frame->tf_eflags & PSL_VM) {
 				i = vm86_emulate((struct vm86frame *)frame);
 				if (i != 0)
 					/*
 					 * returns to original process
 					 */
 					vm86_trap((struct vm86frame *)frame);
 				goto out;
 			}
 			if (type == T_STKFLT)
 				break;
 
 			/* FALL THROUGH */
 
 		case T_SEGNPFLT:	/* segment not present fault */
 			if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)
 				break;
 
 			/*
 			 * Invalid %fs's and %gs's can be created using
 			 * procfs or PT_SETREGS or by invalidating the
 			 * underlying LDT entry.  This causes a fault
 			 * in kernel mode when the kernel attempts to
 			 * switch contexts.  Lose the bad context
 			 * (XXX) so that we can continue, and generate
 			 * a signal.
 			 */
 			if (frame->tf_eip == (int)cpu_switch_load_gs) {
 				PCPU_GET(curpcb)->pcb_gs = 0;
 #if 0				
 				PROC_LOCK(p);
 				psignal(p, SIGBUS);
 				PROC_UNLOCK(p);
 #endif				
 				goto out;
 			}
 
 			if (td->td_intr_nesting_level != 0)
 				break;
 
 			/*
 			 * Invalid segment selectors and out of bounds
 			 * %eip's and %esp's can be set up in user mode.
 			 * This causes a fault in kernel mode when the
 			 * kernel tries to return to user mode.  We want
 			 * to get this fault so that we can fix the
 			 * problem here and not have to check all the
 			 * selectors and pointers when the user changes
 			 * them.
 			 */
 			if (frame->tf_eip == (int)doreti_iret) {
 				frame->tf_eip = (int)doreti_iret_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_ds) {
 				frame->tf_eip = (int)doreti_popl_ds_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_es) {
 				frame->tf_eip = (int)doreti_popl_es_fault;
 				goto out;
 			}
 			if (frame->tf_eip == (int)doreti_popl_fs) {
 				frame->tf_eip = (int)doreti_popl_fs_fault;
 				goto out;
 			}
 			/* Resume at the registered fault handler, if any. */
 			if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
 				frame->tf_eip =
 				    (int)PCPU_GET(curpcb)->pcb_onfault;
 				goto out;
 			}
 			break;
 
 		case T_TSSFLT:
 			/*
 			 * PSL_NT can be set in user mode and isn't cleared
 			 * automatically when the kernel is entered.  This
 			 * causes a TSS fault when the kernel attempts to
 			 * `iret' because the TSS link is uninitialized.  We
 			 * want to get this fault so that we can fix the
 			 * problem here and not every time the kernel is
 			 * entered.
 			 */
 			if (frame->tf_eflags & PSL_NT) {
 				frame->tf_eflags &= ~PSL_NT;
 				goto out;
 			}
 			break;
 
 		case T_TRCTRAP:	 /* trace trap */
 			if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
 				/*
 				 * We've just entered system mode via the
 				 * syscall lcall.  Continue single stepping
 				 * silently until the syscall handler has
 				 * saved the flags.
 				 */
 				goto out;
 			}
 			if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
 				/*
 				 * The syscall handler has now saved the
 				 * flags.  Stop single stepping it.
 				 */
 				frame->tf_eflags &= ~PSL_T;
 				goto out;
 			}
 			/*
 			 * Ignore debug register trace traps due to
 			 * accesses in the user's address space, which
 			 * can happen under several conditions such as
 			 * if a user sets a watchpoint on a buffer and
 			 * then passes that buffer to a system call.
 			 * We still want to get TRCTRAPS for addresses
 			 * in kernel space because that is useful when
 			 * debugging the kernel.
 			 */
 			/* XXX Giant */
 			if (user_dbreg_trap() && 
 			   !(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) {
 				/*
 				 * Reset breakpoint bits because the
 				 * processor doesn't
 				 */
 				load_dr6(rdr6() & 0xfffffff0);
 				goto out;
 			}
 			/*
 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
 			 */
 		case T_BPTFLT:
 			/*
 			 * If KDB is enabled, let it handle the debugger trap.
 			 * Otherwise, debugger traps "can't happen".
 			 */
 #ifdef KDB
 			if (kdb_trap(type, 0, frame))
 				goto out;
 #endif
 			break;
 
 #ifdef DEV_ISA
 		case T_NMI:
 #ifdef POWERFAIL_NMI
 			mtx_lock(&Giant);
 			if (time_second - lastalert > 10) {
 				log(LOG_WARNING, "NMI: power fail\n");
 				sysbeep(TIMER_FREQ/880, hz);
 				lastalert = time_second;
 			}
 			mtx_unlock(&Giant);
 			goto out;
 #else /* !POWERFAIL_NMI */
 			/* XXX Giant */
 			/* machine/parity/power fail/"kitchen sink" faults */
 			if (isa_nmi(code) == 0) {
 #ifdef KDB
 				/*
 				 * NMI can be hooked up to a pushbutton
 				 * for debugging.
 				 */
 				if (kdb_on_nmi) {
 					printf ("NMI ... going to debugger\n");
 					kdb_trap(type, 0, frame);
 				}
 #endif /* KDB */
 				goto out;
 			} else if (panic_on_nmi == 0)
 				goto out;
 			/* FALLTHROUGH */
 #endif /* POWERFAIL_NMI */
 #endif /* DEV_ISA */
 		}
 
 		/* Every kernel case that did not recover ends up here. */
 		trap_fatal(frame, eva);
 		goto out;
 	}
 
 	/* Translate fault for emulators (e.g. Linux) */
 	if (*p->p_sysent->sv_transtrap)
 		i = (*p->p_sysent->sv_transtrap)(i, type);
 
 	/* Deliver the signal chosen by the user-trap switch above. */
 	ksiginfo_init_trap(&ksi);
 	ksi.ksi_signo = i;
 	ksi.ksi_code = ucode;
 	ksi.ksi_addr = (void *)addr;
 	ksi.ksi_trapno = type;
 	trapsignal(td, &ksi);
 
 #ifdef DEBUG
 	if (type <= MAX_TRAP_MSG) {
 		uprintf("fatal process exception: %s",
 			trap_msg[type]);
 		if ((type == T_PAGEFLT) || (type == T_PROTFLT))
 			uprintf(", fault VA = 0x%lx", (u_long)eva);
 		uprintf("\n");
 	}
 #endif
 
 user:
 	/* Return path for user traps: do the userret() end-of-trap work. */
 	userret(td, frame);
 	mtx_assert(&Giant, MA_NOTOWNED);
 userout:
 out:
 	/* Paths that jump here return without calling userret(). */
 	return;
 }
 
 /*
  * Resolve a page fault at address "eva" by calling vm_fault() on the
  * kernel map or on the current process's map.
  *
  * frame:    trap frame of the fault; tf_err supplies the fault code and
  *           is overwritten with eva when a user fault fails.
  * usermode: non-zero when the fault was taken from user mode.
  * eva:      faulting virtual address.
  *
  * Returns:
  *    0  the fault was resolved, or a kernel fault was redirected to the
  *       pcb_onfault handler;
  *   -1  an unrecoverable kernel fault, after trap_fatal() has been called;
  *   -2  the address is the F00F workaround IDT entry (only when the
  *       F00F hack is compiled in and has_f00f_bug is set);
  *   SIGBUS or SIGSEGV for a user fault that could not be resolved
  *       (SIGBUS when vm_fault() reported KERN_PROTECTION_FAILURE).
  */
 static int
 trap_pfault(frame, usermode, eva)
 	struct trapframe *frame;
 	int usermode;
 	vm_offset_t eva;
 {
 	vm_offset_t va;
 	struct vmspace *vm = NULL;
 	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 
 	va = trunc_page(eva);
 	if (va >= KERNBASE) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
 		 * An exception:  if the faulting address is the invalid
 		 * instruction entry in the IDT, then the Intel Pentium
 		 * F00F bug workaround was triggered, and we need to
 		 * treat it as an illegal instruction, and not a page
 		 * fault.
 		 */
 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
 		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
 			return -2;
 #endif
 		if (usermode)
 			goto nogo;
 
 		map = kernel_map;
 	} else {
 		/*
 		 * This is a fault on non-kernel virtual memory.
 		 * vm is initialized above to NULL. If curproc is NULL
 		 * or curproc->p_vmspace is NULL the fault is fatal.
 		 */
 		if (p != NULL)
 			vm = p->p_vmspace;
 
 		if (vm == NULL)
 			goto nogo;
 
 		map = &vm->vm_map;
 	}
 
 	/*
 	 * PGEX_I is defined only if the execute disable bit capability is
 	 * supported and enabled.
 	 */
 	if (frame->tf_err & PGEX_W)
 		ftype = VM_PROT_WRITE;
 #ifdef PAE
 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
 		ftype = VM_PROT_EXECUTE;
 #endif
 	else
 		ftype = VM_PROT_READ;
 
 	if (map != kernel_map) {
 		/*
 		 * Keep swapout from messing with us during this
 		 *	critical time.
 		 */
 		PROC_LOCK(p);
 		++p->p_lock;
 		PROC_UNLOCK(p);
 
 		/* Fault in the user page: */
 		rv = vm_fault(map, va, ftype,
 			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
 						      : VM_FAULT_NORMAL);
 
 		PROC_LOCK(p);
 		--p->p_lock;
 		PROC_UNLOCK(p);
 	} else {
 		/*
 		 * Don't have to worry about process locking or stacks in the
 		 * kernel.
 		 */
 		rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
 	}
 	if (rv == KERN_SUCCESS)
 		return (0);
 nogo:
 	if (!usermode) {
 		/*
 		 * Kernel fault: resume at the registered onfault handler
 		 * if there is one and we are not nested in an interrupt;
 		 * otherwise the fault is fatal.
 		 */
 		if (td->td_intr_nesting_level == 0 &&
 		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
 			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
 			return (0);
 		}
 		trap_fatal(frame, eva);
 		return (-1);
 	}
 
 	/* kludge to pass faulting virtual address to sendsig */
 	frame->tf_err = eva;
 
 	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }
 
 /*
  * Report an unrecoverable trap: print the trap type, fault address (for
  * page faults), instruction/stack/frame pointers, code segment descriptor
  * and eflags, plus the current process.  With KDB compiled in, and if
  * debugger_on_panic or kdb_active is set, the debugger is entered first;
  * if kdb_trap() reports the trap as handled this function returns.
  * Otherwise it ends by calling panic().
  *
  * frame: trap frame of the fatal trap.
  * eva:   faulting virtual address (printed only for T_PAGEFLT).
  */
 static void
 trap_fatal(frame, eva)
 	struct trapframe *frame;
 	vm_offset_t eva;
 {
 	int code, type, ss, esp;
 	struct soft_segment_descriptor softseg;
 	char *msg;
 
 	code = frame->tf_err;
 	type = frame->tf_trapno;
 	/* Decode the GDT descriptor selected by the saved %cs. */
 	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
 
 	if (type <= MAX_TRAP_MSG)
 		msg = trap_msg[type];
 	else
 		msg = "UNKNOWN";
 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
 	    frame->tf_eflags & PSL_VM ? "vm86" :
 	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	if (type == T_PAGEFLT) {
 		printf("fault virtual address	= 0x%x\n", eva);
 		printf("fault code		= %s %s, %s\n",
 			code & PGEX_U ? "user" : "supervisor",
 			code & PGEX_W ? "write" : "read",
 			code & PGEX_P ? "protection violation" : "page not present");
 	}
 	printf("instruction pointer	= 0x%x:0x%x\n",
 	       frame->tf_cs & 0xffff, frame->tf_eip);
 	/*
 	 * For user and vm86 traps the frame's saved %ss:%esp are printed;
 	 * for kernel traps the kernel data selector and the address of the
 	 * frame's tf_esp slot are used instead.
 	 */
         if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
 		ss = frame->tf_ss & 0xffff;
 		esp = frame->tf_esp;
 	} else {
 		ss = GSEL(GDATA_SEL, SEL_KPL);
 		esp = (int)&frame->tf_esp;
 	}
 	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
 	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
 	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
 	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
 	       softseg.ssd_gran);
 	printf("processor eflags	= ");
 	if (frame->tf_eflags & PSL_T)
 		printf("trace trap, ");
 	if (frame->tf_eflags & PSL_I)
 		printf("interrupt enabled, ");
 	if (frame->tf_eflags & PSL_NT)
 		printf("nested task, ");
 	if (frame->tf_eflags & PSL_RF)
 		printf("resume, ");
 	if (frame->tf_eflags & PSL_VM)
 		printf("vm86, ");
 	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
 	printf("current process		= ");
 	if (curproc) {
 		printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm);
 	} else {
 		printf("Idle\n");
 	}
 
 #ifdef KDB
 	if (debugger_on_panic || kdb_active) {
 		frame->tf_err = eva;	/* smuggle fault address to ddb */
 		if (kdb_trap(type, 0, frame)) {
 			frame->tf_err = code;	/* restore error code */
 			return;
 		}
 		frame->tf_err = code;		/* restore error code */
 	}
 #endif
 	printf("trap number		= %d\n", type);
 	if (type <= MAX_TRAP_MSG)
 		panic("%s", trap_msg[type]);
 	else
 		panic("unknown/reserved trap");
 }
 
 /*
  * Double fault handler. Called when a fault occurs while writing
  * a frame for a trap/exception onto the stack. This usually occurs
  * when the stack overflows (such is the case with infinite recursion,
  * for example).
  *
  * XXX Note that the current PTD gets replaced by IdlePTD when the
  * task switch occurs. This means that the stack that was active at
  * the time of the double fault is not available at <kstack> unless
  * the machine was idle when the double fault occurred. The downside
  * of this is that "trace <ebp>" in ddb won't work.
  *
  * The handler prints the %eip, %esp and %ebp values saved in the per-CPU
  * common_tss (and the CPU identity on SMP kernels), then calls panic().
  */
 void
 dblfault_handler()
 {
 	printf("\nFatal double fault:\n");
 	printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
 	printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
 	printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
 #ifdef SMP
 	/* two separate prints in case of a trap on an unmapped page */
 	printf("cpuid = %d; ", PCPU_GET(cpuid));
 	printf("apic id = %02x\n", PCPU_GET(apic_id));
 #endif
 	panic("double fault");
 }
 
 /*
  *	syscall -	system call request C handler
  *
  *	A system call is essentially treated as a trap.
  *
  *	The handler reads the syscall number from %eax and the arguments
  *	from the user stack (or lets the ABI's sv_prepsyscall hook do so),
  *	looks up the sysent entry, copies in the arguments, invokes the
  *	handler, and stores the result in the frame: %eax/%edx with the
  *	carry flag cleared on success, or the (possibly ABI-translated)
  *	error number in %eax with the carry flag set on failure.
  */
 void
 syscall(struct trapframe *frame)
 {
 	caddr_t params;
 	struct sysent *callp;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	register_t orig_tf_eflags;
 	int error;
 	int narg;
 	int args[8];
 	u_int code;
 	ksiginfo_t ksi;
 
-	/*
-	 * note: PCPU_INC() can only be used if we can afford
-	 * occassional inaccuracy in the count.
-	 */
 	PCPU_INC(cnt.v_syscall);
 
 #ifdef DIAGNOSTIC
 	if (ISPL(frame->tf_cs) != SEL_UPL) {
 		panic("syscall");
 		/* NOT REACHED */
 	}
 #endif
 
 	td->td_pticks = 0;
 	td->td_frame = frame;
 	if (td->td_ucred != p->p_ucred) 
 		cred_update_thread(td);
 #ifdef KSE
 	if (p->p_flag & P_SA)
 		thread_user_enter(td);
 #endif
 	/* Arguments start one int above the user stack pointer. */
 	params = (caddr_t)frame->tf_esp + sizeof(int);
 	code = frame->tf_eax;
 	/* Remember the entry flags for the traced-syscall check below. */
 	orig_tf_eflags = frame->tf_eflags;
 
 	if (p->p_sysent->sv_prepsyscall) {
 		/*
 		 * The prep code is MP aware.
 		 */
 		(*p->p_sysent->sv_prepsyscall)(frame, args, &code, &params);
 	} else {
 		/*
 		 * Need to check if this is a 32 bit or 64 bit syscall.
 		 * fuword is MP aware.
 		 */
 		if (code == SYS_syscall) {
 			/*
 			 * Code is first argument, followed by actual args.
 			 */
 			code = fuword(params);
 			params += sizeof(int);
 		} else if (code == SYS___syscall) {
 			/*
 			 * Like syscall, but code is a quad, so as to maintain
 			 * quad alignment for the rest of the arguments.
 			 */
 			code = fuword(params);
 			params += sizeof(quad_t);
 		}
 	}
 
  	if (p->p_sysent->sv_mask)
  		code &= p->p_sysent->sv_mask;
 
 	/* Out-of-range syscall numbers are dispatched to table entry 0. */
  	if (code >= p->p_sysent->sv_size)
  		callp = &p->p_sysent->sv_table[0];
   	else
  		callp = &p->p_sysent->sv_table[code];
 
 	narg = callp->sy_narg;
 
 	/*
 	 * copyin and the ktrsyscall()/ktrsysret() code is MP-aware
 	 */
 	if (params != NULL && narg != 0)
 		error = copyin(params, (caddr_t)args,
 		    (u_int)(narg * sizeof(int)));
 	else
 		error = 0;
 		
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSCALL))
 		ktrsyscall(code, narg, args);
 #endif
 
 	CTR4(KTR_SYSC, "syscall enter thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 
 	td->td_syscalls++;
 
 	/* The handler is called only if the arguments were copied in. */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 		td->td_retval[1] = frame->tf_edx;
 
 		STOPEVENT(p, S_SCE, narg);
 
 		PTRACESTOP_SC(p, td, S_PT_SCE);
 
 		AUDIT_SYSCALL_ENTER(code, td);
 		error = (*callp->sy_call)(td, args);
 		AUDIT_SYSCALL_EXIT(error, td);
 	}
 
 	switch (error) {
 	case 0:
 		frame->tf_eax = td->td_retval[0];
 		frame->tf_edx = td->td_retval[1];
 		frame->tf_eflags &= ~PSL_C;
 		break;
 
 	case ERESTART:
 		/*
 		 * Reconstruct pc, assuming lcall $X,y is 7 bytes,
 		 * int 0x80 is 2 bytes. We saved this in tf_err.
 		 */
 		frame->tf_eip -= frame->tf_err;
 		break;
 
 	case EJUSTRETURN:
 		/* Leave the frame's registers untouched. */
 		break;
 
 	default:
 		/* Translate the error through the ABI's table, if it has one. */
  		if (p->p_sysent->sv_errsize) {
  			if (error >= p->p_sysent->sv_errsize)
   				error = -1;	/* XXX */
    			else
   				error = p->p_sysent->sv_errtbl[error];
 		}
 		frame->tf_eax = error;
 		frame->tf_eflags |= PSL_C;
 		break;
 	}
 
 	/*
 	 * Traced syscall.
 	 */
 	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
 		frame->tf_eflags &= ~PSL_T;
 		ksiginfo_init_trap(&ksi);
 		ksi.ksi_signo = SIGTRAP;
 		ksi.ksi_code = TRAP_TRACE;
 		ksi.ksi_addr = (void *)frame->tf_eip;
 		trapsignal(td, &ksi);
 	}
 
 	/*
 	 * Check for misbehavior.
 	 */
 	WITNESS_WARN(WARN_PANIC, NULL, "System call %s returning",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???");
 	KASSERT(td->td_critnest == 0,
 	    ("System call %s returning in a critical section",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???"));
 	KASSERT(td->td_locks == 0,
 	    ("System call %s returning with %d locks held",
 	    (code >= 0 && code < SYS_MAXSYSCALL) ? syscallnames[code] : "???",
 	    td->td_locks));
 
 	/*
 	 * Handle reschedule and other end-of-syscall issues
 	 */
 	userret(td, frame);
 
 	CTR4(KTR_SYSC, "syscall exit thread %p pid %d proc %s code %d", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm, code);
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(code, error, td->td_retval[0]);
 #endif
 
 	/*
 	 * This works because errno is findable through the
 	 * register set.  If we ever support an emulation where this
 	 * is not the case, this code will need to be revisited.
 	 */
 	STOPEVENT(p, S_SCX, code);
 
 	PTRACESTOP_SC(p, td, S_PT_SCX);
 }
 
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c	(revision 170516)
+++ head/sys/kern/kern_fork.c	(revision 170517)
@@ -1,821 +1,821 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/sysctl.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/syscall.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/unistd.h>	
 #include <sys/sx.h>
 #include <sys/signalvar.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct fork_args {
 	int     dummy;
 };
 #endif
 
 /* ARGSUSED */
 int
 fork(td, uap)
 	struct thread *td;
 	struct fork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 int
 vfork(td, uap)
 	struct thread *td;
 	struct vfork_args *uap;
 {
 	int error;
 	struct proc *p2;
 
 	error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2->p_pid;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 int
 rfork(td, uap)
 	struct thread *td;
 	struct rfork_args *uap;
 {
 	struct proc *p2;
 	int error;
 
 	/* Don't allow kernel-only flags. */
 	if ((uap->flags & RFKERNELONLY) != 0)
 		return (EINVAL);
 
 	AUDIT_ARG(fflags, uap->flags);
 	error = fork1(td, uap->flags, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2 ? p2->p_pid : 0;
 		td->td_retval[1] = 0;
 	}
 	return (error);
 }
 
 int	nprocs = 1;		/* process 0 */
 int	lastpid = 0;
 SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0, 
     "Last used PID");
 
 /*
  * Random component to lastpid generation.  We mix in a random factor to make
  * it a little harder to predict.  We sanity check the modulus value to avoid
  * doing it in critical paths.  Don't let it be too small or we pointlessly
  * waste randomness entropy, and don't let it be impossibly large.  Using a
  * modulus that is too big causes a LOT more process table scans and slows
  * down fork processing as the pidchecked caching is defeated.
  */
 static int randompid = 0;
 
 static int
 sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
 {
 	int error, pid;
 
 	error = sysctl_wire_old_buffer(req, sizeof(int));
 	if (error != 0)
 		return(error);
 	sx_xlock(&allproc_lock);
 	pid = randompid;
 	error = sysctl_handle_int(oidp, &pid, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
 			pid = PID_MAX - 100;
 		else if (pid < 2)			/* NOP */
 			pid = 0;
 		else if (pid < 100)			/* Make it reasonable */
 			pid = 100;
 		randompid = pid;
 	}
 	sx_xunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
 
 int
 fork1(td, flags, pages, procp)
 	struct thread *td;
 	int flags;
 	int pages;
 	struct proc **procp;
 {
 	struct proc *p1, *p2, *pptr;
 	struct proc *newproc;
 	int ok, trypid;
 	static int curfail, pidchecked = 0;
 	static struct timeval lastfail;
 	struct filedesc *fd;
 	struct filedesc_to_leader *fdtol;
 	struct thread *td2;
 	struct sigacts *newsigacts;
 	int error;
 
 	/* Can't copy and clear. */
 	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
 		return (EINVAL);
 
 	p1 = td->td_proc;
 
 	/*
 	 * Here we don't create a new process, but we divorce
 	 * certain parts of a process from itself.
 	 */
 	if ((flags & RFPROC) == 0) {
 		if ((p1->p_flag & P_HADTHREADS) &&
 		    (flags & (RFCFDG | RFFDG))) {
 			PROC_LOCK(p1);
 			if (thread_single(SINGLE_BOUNDARY)) {
 				PROC_UNLOCK(p1);
 				return (ERESTART);
 			}
 			PROC_UNLOCK(p1);
 		}
 
 		vm_forkproc(td, NULL, NULL, flags);
 
 		/*
 		 * Close all file descriptors.
 		 */
 		if (flags & RFCFDG) {
 			struct filedesc *fdtmp;
 			fdtmp = fdinit(td->td_proc->p_fd);
 			fdfree(td);
 			p1->p_fd = fdtmp;
 		}
 
 		/*
 		 * Unshare file descriptors (from parent).
 		 */
 		if (flags & RFFDG) 
 			fdunshare(p1, td);
 
 		if ((p1->p_flag & P_HADTHREADS) &&
 		    (flags & (RFCFDG | RFFDG))) {
 			PROC_LOCK(p1);
 			thread_single_end();
 			PROC_UNLOCK(p1);
 		}
 		*procp = NULL;
 		return (0);
 	}
 
 	/*
 	 * Note 1:1 allows for forking with one thread coming out on the
 	 * other side with the expectation that the process is about to
 	 * exec.
 	 */
 	if (p1->p_flag & P_HADTHREADS) {
 		/*
 		 * Idle the other threads for a second.
 		 * Since the user space is copied, it must remain stable.
 		 * In addition, all threads (from the user perspective)
 		 * need to either be suspended or in the kernel,
 		 * where they will try restart in the parent and will
 		 * be aborted in the child.
 		 */
 		PROC_LOCK(p1);
 		if (thread_single(SINGLE_NO_EXIT)) {
 			/* Abort. Someone else is single threading before us. */
 			PROC_UNLOCK(p1);
 			return (ERESTART);
 		}
 		PROC_UNLOCK(p1);
 		/*
 		 * All other activity in this process
 		 * is now suspended at the user boundary,
 		 * (or other safe places if we think of any).
 		 */
 	}
 
 	/* Allocate new proc. */
 	newproc = uma_zalloc(proc_zone, M_WAITOK);
 #ifdef MAC
 	mac_init_proc(newproc);
 #endif
 	knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL);
 	STAILQ_INIT(&newproc->p_ktr);
 
 	/* We have to lock the process tree while we look for a pid. */
 	sx_slock(&proctree_lock);
 
 	/*
 	 * Although process entries are dynamically created, we still keep
 	 * a global limit on the maximum number we will create.  Don't allow
 	 * a nonprivileged user to use the last ten processes; don't let root
 	 * exceed the limit. The variable nprocs is the current number of
 	 * processes, maxproc is the limit.
 	 */
 	sx_xlock(&allproc_lock);
 	if ((nprocs >= maxproc - 10 &&
 	    priv_check_cred(td->td_ucred, PRIV_MAXPROC, SUSER_RUID) != 0) ||
 	    nprocs >= maxproc) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the count of procs running with this uid. Don't allow
 	 * a nonprivileged user to exceed their current limit.
 	 *
 	 * XXXRW: Can we avoid privilege here if it's not needed?
 	 */
 	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, SUSER_RUID |
 	    SUSER_ALLOWJAIL);
 	if (error == 0)
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
 	else {
 		PROC_LOCK(p1);
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
 		    lim_cur(p1, RLIMIT_NPROC));
 		PROC_UNLOCK(p1);
 	}
 	if (!ok) {
 		error = EAGAIN;
 		goto fail;
 	}
 
 	/*
 	 * Increment the nprocs resource before blocking can occur.  There
 	 * are hard-limits as to the number of processes that can run.
 	 */
 	nprocs++;
 
 	/*
 	 * Find an unused process ID.  We remember a range of unused IDs
 	 * ready to use (from lastpid+1 through pidchecked-1).
 	 *
 	 * If RFHIGHPID is set (used during system boot), do not allocate
 	 * low-numbered pids.
 	 */
 	trypid = lastpid + 1;
 	if (flags & RFHIGHPID) {
 		if (trypid < 10)
 			trypid = 10;
 	} else {
 		if (randompid)
 			trypid += arc4random() % randompid;
 	}
 retry:
 	/*
 	 * If the process ID prototype has wrapped around,
 	 * restart somewhat above 0, as the low-numbered procs
 	 * tend to include daemons that don't exit.
 	 */
 	if (trypid >= PID_MAX) {
 		trypid = trypid % PID_MAX;
 		if (trypid < 100)
 			trypid += 100;
 		pidchecked = 0;
 	}
 	if (trypid >= pidchecked) {
 		int doingzomb = 0;
 
 		pidchecked = PID_MAX;
 		/*
 		 * Scan the active and zombie procs to check whether this pid
 		 * is in use.  Remember the lowest pid that's greater
 		 * than trypid, so we can avoid checking for a while.
 		 */
 		p2 = LIST_FIRST(&allproc);
 again:
 		for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
 			while (p2->p_pid == trypid ||
 			    (p2->p_pgrp != NULL &&
 			    (p2->p_pgrp->pg_id == trypid ||
 			    (p2->p_session != NULL &&
 			    p2->p_session->s_sid == trypid)))) {
 				trypid++;
 				if (trypid >= pidchecked)
 					goto retry;
 			}
 			if (p2->p_pid > trypid && pidchecked > p2->p_pid)
 				pidchecked = p2->p_pid;
 			if (p2->p_pgrp != NULL) {
 				if (p2->p_pgrp->pg_id > trypid &&
 				    pidchecked > p2->p_pgrp->pg_id)
 					pidchecked = p2->p_pgrp->pg_id;
 				if (p2->p_session != NULL &&
 				    p2->p_session->s_sid > trypid &&
 				    pidchecked > p2->p_session->s_sid)
 					pidchecked = p2->p_session->s_sid;
 			}
 		}
 		if (!doingzomb) {
 			doingzomb = 1;
 			p2 = LIST_FIRST(&zombproc);
 			goto again;
 		}
 	}
 	sx_sunlock(&proctree_lock);
 
 	/*
 	 * RFHIGHPID does not mess with the lastpid counter during boot.
 	 */
 	if (flags & RFHIGHPID)
 		pidchecked = 0;
 	else
 		lastpid = trypid;
 
 	p2 = newproc;
 	td2 = FIRST_THREAD_IN_PROC(newproc);
 	p2->p_state = PRS_NEW;		/* protect against others */
 	p2->p_pid = trypid;
 	/*
 	 * Allow the scheduler to initialize the child.
 	 */
 	thread_lock(td);
 	sched_fork(td, td2);
 	thread_unlock(td);
 	AUDIT_ARG(pid, p2->p_pid);
 	LIST_INSERT_HEAD(&allproc, p2, p_list);
 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	sx_xunlock(&allproc_lock);
 
 	bcopy(&p1->p_startcopy, &p2->p_startcopy,
 	    __rangeof(struct proc, p_startcopy, p_endcopy));
 	PROC_UNLOCK(p1);
 
 	bzero(&p2->p_startzero,
 	    __rangeof(struct proc, p_startzero, p_endzero));
 
 	p2->p_ucred = crhold(td->td_ucred);
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Malloc things while we don't hold any locks.
 	 */
 	if (flags & RFSIGSHARE)
 		newsigacts = NULL;
 	else
 		newsigacts = sigacts_alloc();
 
 	/*
 	 * Copy filedesc.
 	 */
 	if (flags & RFCFDG) {
 		fd = fdinit(p1->p_fd);
 		fdtol = NULL;
 	} else if (flags & RFFDG) {
 		fd = fdcopy(p1->p_fd);
 		fdtol = NULL;
 	} else {
 		fd = fdshare(p1->p_fd);
 		if (p1->p_fdtol == NULL)
 			p1->p_fdtol =
 				filedesc_to_leader_alloc(NULL,
 							 NULL,
 							 p1->p_leader);
 		if ((flags & RFTHREAD) != 0) {
 			/*
 			 * Shared file descriptor table and
 			 * shared process leaders.
 			 */
 			fdtol = p1->p_fdtol;
 			FILEDESC_XLOCK(p1->p_fd);
 			fdtol->fdl_refcount++;
 			FILEDESC_XUNLOCK(p1->p_fd);
 		} else {
 			/* 
 			 * Shared file descriptor table, and
 			 * different process leaders 
 			 */
 			fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
 							 p1->p_fd,
 							 p2);
 		}
 	}
 	/*
 	 * Make a proc table entry for the new process.
 	 * Start by zeroing the section of proc that is zero-initialized,
 	 * then copy the section that is copied directly from the parent.
 	 */
 	/* Allocate and switch to an alternate kstack if specified. */
 	if (pages != 0)
 		vm_thread_new_altkstack(td2, pages);
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	bzero(&td2->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
 
 	td2->td_sigstk = td->td_sigstk;
 	td2->td_sigmask = td->td_sigmask;
 
 	/*
 	 * Duplicate sub-structures as needed.
 	 * Increase reference counts on shared objects.
 	 */
 	p2->p_flag = 0;
 	if (p1->p_flag & P_PROFIL)
 		startprofclock(p2);
 	PROC_SLOCK(p2);
 	p2->p_sflag = PS_INMEM;
 	PROC_SUNLOCK(p2);
 	td2->td_ucred = crhold(p2->p_ucred);
 	pargs_hold(p2->p_args);
 
 	if (flags & RFSIGSHARE) {
 		p2->p_sigacts = sigacts_hold(p1->p_sigacts);
 	} else {
 		sigacts_copy(newsigacts, p1->p_sigacts);
 		p2->p_sigacts = newsigacts;
 	}
 	if (flags & RFLINUXTHPN) 
 	        p2->p_sigparent = SIGUSR1;
 	else
 	        p2->p_sigparent = SIGCHLD;
 
 	p2->p_textvp = p1->p_textvp;
 	p2->p_fd = fd;
 	p2->p_fdtol = fdtol;
 
 	/*
 	 * p_limit is copy-on-write.  Bump its refcount.
 	 */
 	lim_fork(p1, p2);
 
 	pstats_fork(p1->p_stats, p2->p_stats);
 
 	PROC_UNLOCK(p1);
 	PROC_UNLOCK(p2);
 
 	/* Bump references to the text vnode (for procfs) */
 	if (p2->p_textvp)
 		vref(p2->p_textvp);
 
 	/*
 	 * Set up linkage for kernel based threading.
 	 */
 	if ((flags & RFTHREAD) != 0) {
 		mtx_lock(&ppeers_lock);
 		p2->p_peers = p1->p_peers;
 		p1->p_peers = p2;
 		p2->p_leader = p1->p_leader;
 		mtx_unlock(&ppeers_lock);
 		PROC_LOCK(p1->p_leader);
 		if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(p1->p_leader);
 			/*
 			 * The task leader is exiting, so process p1 is
 			 * going to be killed shortly.  Since p1 obviously
 			 * isn't dead yet, we know that the leader is either
 			 * sending SIGKILL's to all the processes in this
 			 * task or is sleeping waiting for all the peers to
 			 * exit.  We let p1 complete the fork, but we need
 			 * to go ahead and kill the new process p2 since
 			 * the task leader may not get a chance to send
 			 * SIGKILL to it.  We leave it on the list so that
 			 * the task leader will wait for this new process
 			 * to commit suicide.
 			 */
 			PROC_LOCK(p2);
 			psignal(p2, SIGKILL);
 			PROC_UNLOCK(p2);
 		} else
 			PROC_UNLOCK(p1->p_leader);
 	} else {
 		p2->p_peers = NULL;
 		p2->p_leader = p2;
 	}
 
 	sx_xlock(&proctree_lock);
 	PGRP_LOCK(p1->p_pgrp);
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
 	/*
 	 * Preserve some more flags in subprocess.  P_PROFIL has already
 	 * been preserved.
 	 */
 	p2->p_flag |= p1->p_flag & P_SUGID;
 	td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
 	SESS_LOCK(p1->p_session);
 	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
 		p2->p_flag |= P_CONTROLT;
 	SESS_UNLOCK(p1->p_session);
 	if (flags & RFPPWAIT)
 		p2->p_flag |= P_PPWAIT;
 
 	p2->p_pgrp = p1->p_pgrp;
 	LIST_INSERT_AFTER(p1, p2, p_pglist);
 	PGRP_UNLOCK(p1->p_pgrp);
 	LIST_INIT(&p2->p_children);
 
 	callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
 
 #ifdef KTRACE
 	/*
 	 * Copy traceflag and tracefile if enabled.
 	 */
 	mtx_lock(&ktrace_mtx);
 	KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
 	if (p1->p_traceflag & KTRFAC_INHERIT) {
 		p2->p_traceflag = p1->p_traceflag;
 		if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
 			VREF(p2->p_tracevp);
 			KASSERT(p1->p_tracecred != NULL,
 			    ("ktrace vnode with no cred"));
 			p2->p_tracecred = crhold(p1->p_tracecred);
 		}
 	}
 	mtx_unlock(&ktrace_mtx);
 #endif
 
 	/*
 	 * If PF_FORK is set, the child process inherits the
 	 * procfs ioctl flags from its parent.
 	 */
 	if (p1->p_pfsflags & PF_FORK) {
 		p2->p_stops = p1->p_stops;
 		p2->p_pfsflags = p1->p_pfsflags;
 	}
 
 	/*
 	 * This begins the section where we must prevent the parent
 	 * from being swapped.
 	 */
 	_PHOLD(p1);
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Attach the new process to its parent.
 	 *
 	 * If RFNOWAIT is set, the newly created process becomes a child
 	 * of init.  This effectively disassociates the child from the
 	 * parent.
 	 */
 	if (flags & RFNOWAIT)
 		pptr = initproc;
 	else
 		pptr = p1;
 	p2->p_pptr = pptr;
 	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
 	sx_xunlock(&proctree_lock);
 
 	/* Inform accounting that we have forked. */
 	p2->p_acflag = AFORK;
 	PROC_UNLOCK(p2);
 
 	/*
 	 * Finish creating the child process.  It will return via a different
 	 * execution path later.  (ie: directly into user mode)
 	 */
 	vm_forkproc(td, p2, td2, flags);
 
 	if (flags == (RFFDG | RFPROC)) {
-		atomic_add_int(&cnt.v_forks, 1);
-		atomic_add_int(&cnt.v_forkpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_forks);
+		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
-		atomic_add_int(&cnt.v_vforks, 1);
-		atomic_add_int(&cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_vforks);
+		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (p1 == &proc0) {
-		atomic_add_int(&cnt.v_kthreads, 1);
-		atomic_add_int(&cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_kthreads);
+		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else {
-		atomic_add_int(&cnt.v_rforks, 1);
-		atomic_add_int(&cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_rforks);
+		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	}
 
 	/*
 	 * Both processes are set up, now check if any loadable modules want
 	 * to adjust anything.
 	 *   What if they have an error? XXX
 	 */
 	EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
 
 	/*
 	 * Set the child start time and mark the process as being complete.
 	 */
 	microuptime(&p2->p_stats->p_start);
 	PROC_SLOCK(p2);
 	p2->p_state = PRS_NORMAL;
 	PROC_SUNLOCK(p2);
 
 	/*
 	 * If RFSTOPPED not requested, make child runnable and add to
 	 * run queue.
 	 */
 	if ((flags & RFSTOPPED) == 0) {
 		thread_lock(td2);
 		TD_SET_CAN_RUN(td2);
 		sched_add(td2, SRQ_BORING);
 		thread_unlock(td2);
 	}
 
 	/*
 	 * Now can be swapped.
 	 */
 	PROC_LOCK(p1);
 	_PRELE(p1);
 
 	/*
 	 * Tell any interested parties about the new process.
 	 */
 	KNOTE_LOCKED(&p1->p_klist, NOTE_FORK | p2->p_pid);
 
 	PROC_UNLOCK(p1);
 
 	/*
 	 * Preserve synchronization semantics of vfork.  If waiting for
 	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
 	 * proc (in case of exit).
 	 */
 	PROC_LOCK(p2);
 	while (p2->p_flag & P_PPWAIT)
 		msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0);
 	PROC_UNLOCK(p2);
 
 	/*
 	 * If other threads are waiting, let them continue now.
 	 */
 	if (p1->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p1);
 		thread_single_end();
 		PROC_UNLOCK(p1);
 	}
 
 	/*
 	 * Return child proc pointer to parent.
 	 */
 	*procp = p2;
 	return (0);
 fail:
 	sx_sunlock(&proctree_lock);
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
 		    td->td_ucred->cr_ruid);
 	sx_xunlock(&allproc_lock);
 #ifdef MAC
 	mac_destroy_proc(newproc);
 #endif
 	uma_zfree(proc_zone, newproc);
 	if (p1->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p1);
 		thread_single_end();
 		PROC_UNLOCK(p1);
 	}
 	pause("fork", hz / 2);
 	return (error);
 }
 
 /*
  * Handle the return of a child process from fork1().  This function
  * is called from the MD fork_trampoline() entry point.
  */
 void
 fork_exit(callout, arg, frame)
 	void (*callout)(void *, struct trapframe *);
 	void *arg;
 	struct trapframe *frame;
 {
 	struct proc *p;
 	struct thread *td;
 
 	td = curthread;
 	p = td->td_proc;
 	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
 
 	CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)",
 		td, td->td_sched, p->p_pid, p->p_comm);
 
 	sched_fork_exit(td);
 	/*
 	 * cpu_set_fork_handler intercepts this function call to
 	 * have this call a non-return function to stay in kernel mode.
 	 * initproc has its own fork handler, but it does return.
 	 */
 	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
 	callout(arg, frame);
 
 	/*
 	 * Check if a kernel thread misbehaved and returned from its main
 	 * function.
 	 */
 	if (p->p_flag & P_KTHREAD) {
 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
 		    p->p_comm, p->p_pid);
 		kthread_exit(0);
 	}
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	EVENTHANDLER_INVOKE(schedtail, p);
 }
 
 /*
  * Simplified back end of syscall(), used when returning from fork()
  * directly into user mode.  Giant is not held on entry, and must not
  * be held on return.  This function is passed in to fork_exit() as the
  * first parameter and is called when returning to a new userland process.
  */
 void
 fork_return(td, frame)
 	struct thread *td;
 	struct trapframe *frame;
 {
 
 	userret(td, frame);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(SYS_fork, 0, 0);
 #endif
 	mtx_assert(&Giant, MA_NOTOWNED);
 }
Index: head/sys/sys/vmmeter.h
===================================================================
--- head/sys/sys/vmmeter.h	(revision 170516)
+++ head/sys/sys/vmmeter.h	(revision 170517)
@@ -1,210 +1,216 @@
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vmmeter.h	8.2 (Berkeley) 7/10/94
  * $FreeBSD$
  */
 
 #ifndef _SYS_VMMETER_H_
 #define _SYS_VMMETER_H_
 
 /*
  * System wide statistics counters.
+ * Locking:
+ *      a - locked by atomic operations
+ *      c - constant after initialization
+ *      f - locked by vm_page_queue_free_mtx
+ *      p - locked by being in the PCPU and atomic with respect to interrupts
+ *      q - locked by vm_page_queue_mtx
  */
 struct vmmeter {
 	/*
 	 * General system activity.
 	 */
-	u_int v_swtch;		/* context switches */
-	u_int v_trap;		/* calls to trap */
-	u_int v_syscall;	/* calls to syscall() */
-	u_int v_intr;		/* device interrupts */
-	u_int v_soft;		/* software interrupts */
+	u_int v_swtch;		/* (p) context switches */
+	u_int v_trap;		/* (p) calls to trap */
+	u_int v_syscall;	/* (p) calls to syscall() */
+	u_int v_intr;		/* (p) device interrupts */
+	u_int v_soft;		/* (p) software interrupts */
 	/*
 	 * Virtual memory activity.
 	 */
-	u_int v_vm_faults;	/* number of address memory faults */
-	u_int v_cow_faults;	/* number of copy-on-writes */
-	u_int v_cow_optim;	/* number of optimized copy-on-writes */
-	u_int v_zfod;		/* pages zero filled on demand */
-	u_int v_ozfod;		/* optimized zero fill pages */
-	u_int v_swapin;		/* swap pager pageins */
-	u_int v_swapout;	/* swap pager pageouts */
-	u_int v_swappgsin;	/* swap pager pages paged in */
-	u_int v_swappgsout;	/* swap pager pages paged out */
-	u_int v_vnodein;	/* vnode pager pageins */
-	u_int v_vnodeout;	/* vnode pager pageouts */
-	u_int v_vnodepgsin;	/* vnode_pager pages paged in */
-	u_int v_vnodepgsout;	/* vnode pager pages paged out */
-	u_int v_intrans;	/* intransit blocking page faults */
-	u_int v_reactivated;	/* number of pages reactivated from free list */
-	u_int v_pdwakeups;	/* number of times daemon has awaken from sleep */
-	u_int v_pdpages;	/* number of pages analyzed by daemon */
+	u_int v_vm_faults;	/* (p) address memory faults */
+	u_int v_cow_faults;	/* (p) copy-on-write faults */
+	u_int v_cow_optim;	/* (p) optimized copy-on-write faults */
+	u_int v_zfod;		/* (p) pages zero filled on demand */
+	u_int v_ozfod;		/* (p) optimized zero fill pages */
+	u_int v_swapin;		/* (p) swap pager pageins */
+	u_int v_swapout;	/* (p) swap pager pageouts */
+	u_int v_swappgsin;	/* (p) swap pager pages paged in */
+	u_int v_swappgsout;	/* (p) swap pager pages paged out */
+	u_int v_vnodein;	/* (p) vnode pager pageins */
+	u_int v_vnodeout;	/* (p) vnode pager pageouts */
+	u_int v_vnodepgsin;	/* (p) vnode_pager pages paged in */
+	u_int v_vnodepgsout;	/* (p) vnode pager pages paged out */
+	u_int v_intrans;	/* (p) intransit blocking page faults */
+	u_int v_reactivated;	/* (q) pages reactivated from free list */
+	u_int v_pdwakeups;	/* (f) times daemon has awakened from sleep */
+	u_int v_pdpages;	/* (q) pages analyzed by daemon */
 
-	u_int v_dfree;		/* pages freed by daemon */
-	u_int v_pfree;		/* pages freed by exiting processes */
-	u_int v_tfree;		/* total pages freed */
+	u_int v_dfree;		/* (q) pages freed by daemon */
+	u_int v_pfree;		/* (q) pages freed by exiting processes */
+	u_int v_tfree;		/* (p) total pages freed */
 	/*
 	 * Distribution of page usages.
 	 */
-	u_int v_page_size;	/* page size in bytes */
-	u_int v_page_count;	/* total number of pages in system */
-	u_int v_free_reserved;	/* number of pages reserved for deadlock */
-	u_int v_free_target;	/* number of pages desired free */
-	u_int v_free_min;	/* minimum number of pages desired free */
-	u_int v_free_count;	/* number of pages free */
-	u_int v_wire_count;	/* number of pages wired down */
-	u_int v_active_count;	/* number of pages active */
-	u_int v_inactive_target; /* number of pages desired inactive */
-	u_int v_inactive_count;	/* number of pages inactive */
-	u_int v_cache_count;	/* number of pages on buffer cache queue */
-	u_int v_cache_min;	/* min number of pages desired on cache queue */
-	u_int v_cache_max;	/* max number of pages in cached obj */
-	u_int v_pageout_free_min;   /* min number pages reserved for kernel */
-	u_int v_interrupt_free_min; /* reserved number of pages for int code */
-	u_int v_free_severe;	/* severe depletion of pages below this pt */
+	u_int v_page_size;	/* (c) page size in bytes */
+	u_int v_page_count;	/* (c) total number of pages in system */
+	u_int v_free_reserved;	/* (c) pages reserved for deadlock */
+	u_int v_free_target;	/* (c) pages desired free */
+	u_int v_free_min;	/* (c) minimum pages desired free */
+	u_int v_free_count;	/* (f) pages free */
+	u_int v_wire_count;	/* (a) pages wired down */
+	u_int v_active_count;	/* (q) pages active */
+	u_int v_inactive_target; /* (c) pages desired inactive */
+	u_int v_inactive_count;	/* (q) pages inactive */
+	u_int v_cache_count;	/* (q) pages on buffer cache queue */
+	u_int v_cache_min;	/* (c) min pages desired on cache queue */
+	u_int v_cache_max;	/* (c) max pages in cached obj */
+	u_int v_pageout_free_min;   /* (c) min pages reserved for kernel */
+	u_int v_interrupt_free_min; /* (c) reserved pages for int code */
+	u_int v_free_severe;	/* (c) severe page depletion point */
 	/*
 	 * Fork/vfork/rfork activity.
 	 */
-	u_int v_forks;		/* number of fork() calls */
-	u_int v_vforks;		/* number of vfork() calls */
-	u_int v_rforks;		/* number of rfork() calls */
-	u_int v_kthreads;	/* number of fork() calls by kernel */
-	u_int v_forkpages;	/* number of VM pages affected by fork() */
-	u_int v_vforkpages;	/* number of VM pages affected by vfork() */
-	u_int v_rforkpages;	/* number of VM pages affected by rfork() */
-	u_int v_kthreadpages;	/* number of VM pages affected by fork() by kernel */
+	u_int v_forks;		/* (p) fork() calls */
+	u_int v_vforks;		/* (p) vfork() calls */
+	u_int v_rforks;		/* (p) rfork() calls */
+	u_int v_kthreads;	/* (p) fork() calls by kernel */
+	u_int v_forkpages;	/* (p) VM pages affected by fork() */
+	u_int v_vforkpages;	/* (p) VM pages affected by vfork() */
+	u_int v_rforkpages;	/* (p) VM pages affected by rfork() */
+	u_int v_kthreadpages;	/* (p) VM pages affected by fork() by kernel */
 };
 #ifdef _KERNEL
 
 extern struct vmmeter cnt;
 
 /*
  * Return TRUE if we are under our reserved low-free-pages threshold
  */
 
 static __inline 
 int
 vm_page_count_reserved(void)
 {
     return (cnt.v_free_reserved > (cnt.v_free_count + cnt.v_cache_count));
 }
 
 /*
  * Return TRUE if we are under our severe low-free-pages threshold
  *
  * This routine is typically used at the user<->system interface to determine
  * whether we need to block in order to avoid a low memory deadlock.
  */
 
 static __inline 
 int
 vm_page_count_severe(void)
 {
     return (cnt.v_free_severe > (cnt.v_free_count + cnt.v_cache_count));
 }
 
 /*
  * Return TRUE if we are under our minimum low-free-pages threshold.
  *
  * This routine is typically used within the system to determine whether
  * we can execute potentially very expensive code in terms of memory.  It
  * is also used by the pageout daemon to calculate when to sleep, when
  * to wake waiters up, and when (after making a pass) to become more
  * desparate.
  */
 
 static __inline 
 int
 vm_page_count_min(void)
 {
     return (cnt.v_free_min > (cnt.v_free_count + cnt.v_cache_count));
 }
 
 /*
  * Return TRUE if we have not reached our free page target during
  * free page recovery operations.
  */
 
 static __inline 
 int
 vm_page_count_target(void)
 {
     return (cnt.v_free_target > (cnt.v_free_count + cnt.v_cache_count));
 }
 
 /*
  * Return the number of pages we need to free-up or cache
  * A positive number indicates that we do not have enough free pages.
  */
 
 static __inline 
 int
 vm_paging_target(void)
 {
     return (
 	(cnt.v_free_target + cnt.v_cache_min) -
 	(cnt.v_free_count + cnt.v_cache_count)
     );
 }
 
 /*
  * Returns TRUE if the pagedaemon needs to be woken up.
  */
 
 static __inline 
 int
 vm_paging_needed(void)
 {
     return (
 	(cnt.v_free_reserved + cnt.v_cache_min) >
 	(cnt.v_free_count + cnt.v_cache_count)
     );
 }
 
 #endif
 
 /* systemwide totals computed every five seconds */
 struct vmtotal {
 	int16_t	t_rq;		/* length of the run queue */
 	int16_t	t_dw;		/* jobs in ``disk wait'' (neg priority) */
 	int16_t	t_pw;		/* jobs in page wait */
 	int16_t	t_sl;		/* jobs sleeping in core */
 	int16_t	t_sw;		/* swapped out runnable/short block jobs */
 	int32_t	t_vm;		/* total virtual memory */
 	int32_t	t_avm;		/* active virtual memory */
 	int32_t	t_rm;		/* total real memory in use */
 	int32_t	t_arm;		/* active real memory */
 	int32_t	t_vmshr;	/* shared virtual memory */
 	int32_t	t_avmshr;	/* active shared virtual memory */
 	int32_t	t_rmshr;	/* shared real memory */
 	int32_t	t_armshr;	/* active shared real memory */
 	int32_t	t_free;		/* free memory pages */
 };
 
 #endif
Index: head/sys/vm/vm_object.c
===================================================================
--- head/sys/vm/vm_object.c	(revision 170516)
+++ head/sys/vm/vm_object.c	(revision 170517)
@@ -1,2210 +1,2210 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_object.c	8.5 (Berkeley) 3/22/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Virtual memory object module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>		/* for curproc, pageproc */
 #include <sys/socket.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #define EASY_SCAN_FACTOR       8
 
 #define MSYNC_FLUSH_HARDSEQ	0x01
 #define MSYNC_FLUSH_SOFTSEQ	0x02
 
 /*
  * msync / VM object flushing optimizations
  */
 static int msync_flush_flags = MSYNC_FLUSH_HARDSEQ | MSYNC_FLUSH_SOFTSEQ;
 SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags,
         CTLFLAG_RW, &msync_flush_flags, 0, "");
 
 static int old_msync;
 SYSCTL_INT(_vm, OID_AUTO, old_msync, CTLFLAG_RW, &old_msync, 0,
     "Use old (insecure) msync behavior");
 
 static void	vm_object_qcollapse(vm_object_t object);
 static int	vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
 static void	vm_object_vndeallocate(vm_object_t object);
 
 /*
  *	Virtual memory objects maintain the actual data
  *	associated with allocated virtual memory.  A given
  *	page of memory exists within exactly one object.
  *
  *	An object is only deallocated when all "references"
  *	are given up.  Only one "reference" to a given
  *	region of an object should be writeable.
  *
  *	Associated with each object is a list of all resident
  *	memory pages belonging to that object; this list is
  *	maintained by the "vm_page" module, and locked by the object's
  *	lock.
  *
  *	Each object also records a "pager" routine which is
  *	used to retrieve (and store) pages to the proper backing
  *	storage.  In addition, objects may be backed by other
  *	objects from which they were virtual-copied.
  *
  *	The only items within the object structure which are
  *	modified after time of creation are:
  *		reference count		locked by object's lock
  *		pager routine		locked by object's lock
  *
  */
 
 struct object_q vm_object_list;
 struct mtx vm_object_list_mtx;	/* lock for object list and count */
 
 struct vm_object kernel_object_store;
 struct vm_object kmem_object_store;
 
 SYSCTL_NODE(_vm_stats, OID_AUTO, object, CTLFLAG_RD, 0, "VM object stats");
 
 static long object_collapses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, collapses, CTLFLAG_RD,
     &object_collapses, 0, "VM object collapses");
 
 static long object_bypasses;
 SYSCTL_LONG(_vm_stats_object, OID_AUTO, bypasses, CTLFLAG_RD,
     &object_bypasses, 0, "VM object bypasses");
 
 /*
  * next_index determines the page color that is assigned to the next
  * allocated object.  Accesses to next_index are not synchronized
  * because the effects of two or more object allocations using
  * next_index simultaneously are inconsequential.  At any given time,
  * numerous objects have the same page color.
  */
 static int next_index;
 
 static uma_zone_t obj_zone;
 
 static int vm_object_zinit(void *mem, int size, int flags);
 
 #ifdef INVARIANTS
 static void vm_object_zdtor(void *mem, int size, void *arg);
 
 static void
 vm_object_zdtor(void *mem, int size, void *arg)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	KASSERT(TAILQ_EMPTY(&object->memq),
 	    ("object %p has resident pages",
 	    object));
 	KASSERT(object->paging_in_progress == 0,
 	    ("object %p paging_in_progress = %d",
 	    object, object->paging_in_progress));
 	KASSERT(object->resident_page_count == 0,
 	    ("object %p resident_page_count = %d",
 	    object, object->resident_page_count));
 	KASSERT(object->shadow_count == 0,
 	    ("object %p shadow_count = %d",
 	    object, object->shadow_count));
 }
 #endif
 
 static int
 vm_object_zinit(void *mem, int size, int flags)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)mem;
 	bzero(&object->mtx, sizeof(object->mtx));
 	VM_OBJECT_LOCK_INIT(object, "standard object");
 
 	/* These are true for any object that has been freed */
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
 	return (0);
 }
 
 void
 _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object)
 {
 	int incr;
 
 	TAILQ_INIT(&object->memq);
 	LIST_INIT(&object->shadow_head);
 
 	object->root = NULL;
 	object->type = type;
 	object->size = size;
 	object->generation = 1;
 	object->ref_count = 1;
 	object->flags = 0;
 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
 		object->flags = OBJ_ONEMAPPING;
 	incr = PQ_MAXLENGTH;
 	if (size <= incr)
 		incr = size;
 	object->pg_color = next_index;
 	next_index = (object->pg_color + incr) & PQ_COLORMASK;
 	object->handle = NULL;
 	object->backing_object = NULL;
 	object->backing_object_offset = (vm_ooffset_t) 0;
 
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_INSERT_TAIL(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 }
 
 /*
  *	vm_object_init:
  *
  *	Initialize the VM objects module.
  */
 void
 vm_object_init(void)
 {
 	TAILQ_INIT(&vm_object_list);
 	mtx_init(&vm_object_list_mtx, "vm object_list", NULL, MTX_DEF);
 	
 	VM_OBJECT_LOCK_INIT(&kernel_object_store, "kernel object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kernel_object);
 
 	VM_OBJECT_LOCK_INIT(&kmem_object_store, "kmem object");
 	_vm_object_allocate(OBJT_PHYS, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS),
 	    kmem_object);
 
 	/*
 	 * The lock portion of struct vm_object must be type stable due
 	 * to vm_pageout_fallback_object_lock locking a vm object
 	 * without holding any references to it.
 	 */
 	obj_zone = uma_zcreate("VM OBJECT", sizeof (struct vm_object), NULL,
 #ifdef INVARIANTS
 	    vm_object_zdtor,
 #else
 	    NULL,
 #endif
 	    vm_object_zinit, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM|UMA_ZONE_NOFREE);
 }
 
 void
 vm_object_clear_flag(vm_object_t object, u_short bits)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->flags &= ~bits;
 }
 
 void
 vm_object_pip_add(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress += i;
 }
 
 void
 vm_object_pip_subtract(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress -= i;
 }
 
 void
 vm_object_pip_wakeup(vm_object_t object)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	object->paging_in_progress--;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wakeupn(vm_object_t object, short i)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (i)
 		object->paging_in_progress -= i;
 	if ((object->flags & OBJ_PIPWNT) && object->paging_in_progress == 0) {
 		vm_object_clear_flag(object, OBJ_PIPWNT);
 		wakeup(object);
 	}
 }
 
 void
 vm_object_pip_wait(vm_object_t object, char *waitid)
 {
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	while (object->paging_in_progress) {
 		object->flags |= OBJ_PIPWNT;
 		msleep(object, VM_OBJECT_MTX(object), PVM, waitid, 0);
 	}
 }
 
 /*
  *	vm_object_allocate:
  *
  *	Returns a new object with the given size.
  */
 vm_object_t
 vm_object_allocate(objtype_t type, vm_pindex_t size)
 {
 	vm_object_t object;
 
 	object = (vm_object_t)uma_zalloc(obj_zone, M_WAITOK);
 	_vm_object_allocate(type, size, object);
 	return (object);
 }
 
 
 /*
  *	vm_object_reference:
  *
  *	Gets another reference to the given object.  Note: OBJ_DEAD
  *	objects can be referenced during final cleaning.
  */
 void
 vm_object_reference(vm_object_t object)
 {
 	struct vnode *vp;
 
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		int vfslocked;
 
 		vp = object->handle;
 		VM_OBJECT_UNLOCK(object);
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vget(vp, LK_RETRY, curthread);
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else
 		VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_reference_locked:
  *
  *	Gets another reference to the given object.
  *
  *	The object must be locked.
  */
 void
 vm_object_reference_locked(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT((object->flags & OBJ_DEAD) == 0,
 	    ("vm_object_reference_locked: dead object referenced"));
 	object->ref_count++;
 	if (object->type == OBJT_VNODE) {
 		vp = object->handle;
 		vref(vp);
 	}
 }
 
 /*
  * Handle deallocating an object of type OBJT_VNODE.
  */
 static void
 vm_object_vndeallocate(vm_object_t object)
 {
 	struct vnode *vp = (struct vnode *) object->handle;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	KASSERT(object->type == OBJT_VNODE,
 	    ("vm_object_vndeallocate: not a vnode object"));
 	KASSERT(vp != NULL, ("vm_object_vndeallocate: missing vp"));
 #ifdef INVARIANTS
 	if (object->ref_count == 0) {
 		vprint("vm_object_vndeallocate", vp);
 		panic("vm_object_vndeallocate: bad object reference count");
 	}
 #endif
 
 	object->ref_count--;
 	if (object->ref_count == 0) {
 		mp_fixme("Unlocked vflag access.");
 		vp->v_vflag &= ~VV_TEXT;
 	}
 	VM_OBJECT_UNLOCK(object);
 	/*
 	 * vrele may need a vop lock
 	 */
 	vrele(vp);
 }
 
 /*
  *	vm_object_deallocate:
  *
  *	Release a reference to the specified object,
  *	gained either through a vm_object_allocate
  *	or a vm_object_reference call.  When all references
  *	are gone, storage associated with this object
  *	may be relinquished.
  *
  *	No object may be locked.
  */
 void
 vm_object_deallocate(vm_object_t object)
 {
 	vm_object_t temp;
 
 	while (object != NULL) {
 		int vfslocked;
 
 		vfslocked = 0;
 	restart:
 		VM_OBJECT_LOCK(object);
 		if (object->type == OBJT_VNODE) {
 			struct vnode *vp = (struct vnode *) object->handle;
 
 			/*
 			 * Conditionally acquire Giant for a vnode-backed
 			 * object.  We have to be careful since the type of
 			 * a vnode object can change while the object is
 			 * unlocked.
 			 */
 			if (VFS_NEEDSGIANT(vp->v_mount) && !vfslocked) {
 				vfslocked = 1;
 				if (!mtx_trylock(&Giant)) {
 					VM_OBJECT_UNLOCK(object);
 					mtx_lock(&Giant);
 					goto restart;
 				}
 			}
 			vm_object_vndeallocate(object);
 			VFS_UNLOCK_GIANT(vfslocked);
 			return;
 		} else
 			/*
 			 * This is to handle the case that the object
 			 * changed type while we dropped its lock to
 			 * obtain Giant.
 			 */
 			VFS_UNLOCK_GIANT(vfslocked);
 
 		KASSERT(object->ref_count != 0,
 			("vm_object_deallocate: object deallocated too many times: %d", object->type));
 
 		/*
 		 * If the reference count goes to 0 we start calling
 		 * vm_object_terminate() on the object chain.
 		 * A ref count of 1 may be a special case depending on the
 		 * shadow count being 0 or 1.
 		 */
 		object->ref_count--;
 		if (object->ref_count > 1) {
 			VM_OBJECT_UNLOCK(object);
 			return;
 		} else if (object->ref_count == 1) {
 			if (object->shadow_count == 0) {
 				vm_object_set_flag(object, OBJ_ONEMAPPING);
 			} else if ((object->shadow_count == 1) &&
 			    (object->handle == NULL) &&
 			    (object->type == OBJT_DEFAULT ||
 			     object->type == OBJT_SWAP)) {
 				vm_object_t robject;
 
 				robject = LIST_FIRST(&object->shadow_head);
 				KASSERT(robject != NULL,
 				    ("vm_object_deallocate: ref_count: %d, shadow_count: %d",
 					 object->ref_count,
 					 object->shadow_count));
 				if (!VM_OBJECT_TRYLOCK(robject)) {
 					/*
 					 * Avoid a potential deadlock.
 					 */
 					object->ref_count++;
 					VM_OBJECT_UNLOCK(object);
 					/*
 					 * More likely than not the thread
 					 * holding robject's lock has lower
 					 * priority than the current thread.
 					 * Let the lower priority thread run.
 					 */
 					pause("vmo_de", 1);
 					continue;
 				}
 				/*
 				 * Collapse object into its shadow unless its
 				 * shadow is dead.  In that case, object will
 				 * be deallocated by the thread that is
 				 * deallocating its shadow.
 				 */
 				if ((robject->flags & OBJ_DEAD) == 0 &&
 				    (robject->handle == NULL) &&
 				    (robject->type == OBJT_DEFAULT ||
 				     robject->type == OBJT_SWAP)) {
 
 					robject->ref_count++;
 retry:
 					if (robject->paging_in_progress) {
 						VM_OBJECT_UNLOCK(object);
 						vm_object_pip_wait(robject,
 						    "objde1");
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_LOCK(object);
 							goto retry;
 						}
 					} else if (object->paging_in_progress) {
 						VM_OBJECT_UNLOCK(robject);
 						object->flags |= OBJ_PIPWNT;
 						msleep(object,
 						    VM_OBJECT_MTX(object),
 						    PDROP | PVM, "objde2", 0);
 						VM_OBJECT_LOCK(robject);
 						temp = robject->backing_object;
 						if (object == temp) {
 							VM_OBJECT_LOCK(object);
 							goto retry;
 						}
 					} else
 						VM_OBJECT_UNLOCK(object);
 
 					if (robject->ref_count == 1) {
 						robject->ref_count--;
 						object = robject;
 						goto doterm;
 					}
 					object = robject;
 					vm_object_collapse(object);
 					VM_OBJECT_UNLOCK(object);
 					continue;
 				}
 				VM_OBJECT_UNLOCK(robject);
 			}
 			VM_OBJECT_UNLOCK(object);
 			return;
 		}
 doterm:
 		temp = object->backing_object;
 		if (temp != NULL) {
 			VM_OBJECT_LOCK(temp);
 			LIST_REMOVE(object, shadow_list);
 			temp->shadow_count--;
 			temp->generation++;
 			VM_OBJECT_UNLOCK(temp);
 			object->backing_object = NULL;
 		}
 		/*
 		 * Don't double-terminate, we could be in a termination
 		 * recursion due to the terminate having to sync data
 		 * to disk.
 		 */
 		if ((object->flags & OBJ_DEAD) == 0)
 			vm_object_terminate(object);
 		else
 			VM_OBJECT_UNLOCK(object);
 		object = temp;
 	}
 }
 
 /*
  *	vm_object_terminate actually destroys the specified object, freeing
  *	up all previously used resources.
  *
  *	The object must be locked.
  *	This routine may block.
  */
 void
 vm_object_terminate(vm_object_t object)
 {
 	vm_page_t p;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/*
 	 * Make sure no one uses us.
 	 */
 	vm_object_set_flag(object, OBJ_DEAD);
 
 	/*
 	 * wait for the pageout daemon to be done with the object
 	 */
 	vm_object_pip_wait(object, "objtrm");
 
 	KASSERT(!object->paging_in_progress,
 		("vm_object_terminate: pageout in progress"));
 
 	/*
 	 * Clean and free the pages, as appropriate. All references to the
 	 * object are gone, so we don't need to lock it.
 	 */
 	if (object->type == OBJT_VNODE) {
 		struct vnode *vp = (struct vnode *)object->handle;
 
 		/*
 		 * Clean pages and flush buffers.
 		 */
 		vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
 		VM_OBJECT_UNLOCK(object);
 
 		vinvalbuf(vp, V_SAVE, NULL, 0, 0);
 
 		VM_OBJECT_LOCK(object);
 	}
 
 	KASSERT(object->ref_count == 0, 
 		("vm_object_terminate: object with references, ref_count=%d",
 		object->ref_count));
 
 	/*
 	 * Now free any remaining pages. For internal objects, this also
 	 * removes them from paging queues. Don't free wired pages, just
 	 * remove them from the object. 
 	 */
 	vm_page_lock_queues();
 	while ((p = TAILQ_FIRST(&object->memq)) != NULL) {
 		KASSERT(!p->busy && (p->oflags & VPO_BUSY) == 0,
 			("vm_object_terminate: freeing busy page %p "
 			"p->busy = %d, p->flags %x\n", p, p->busy, p->flags));
 		if (p->wire_count == 0) {
 			vm_page_free(p);
-			PCPU_INC(cnt.v_pfree);
+			cnt.v_pfree++;
 		} else {
 			vm_page_remove(p);
 		}
 	}
 	vm_page_unlock_queues();
 
 	/*
 	 * Let the pager know object is dead.
 	 */
 	vm_pager_deallocate(object);
 	VM_OBJECT_UNLOCK(object);
 
 	/*
 	 * Remove the object from the global object list.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_REMOVE(&vm_object_list, object, object_list);
 	mtx_unlock(&vm_object_list_mtx);
 
 	/*
 	 * Free the space for the object.
 	 */
 	uma_zfree(obj_zone, object);
 }
 
 /*
  *	vm_object_page_clean
  *
  *	Clean all dirty pages in the specified range of object.  Leaves page 
  * 	on whatever queue it is currently on.   If NOSYNC is set then do not
  *	write out pages with VPO_NOSYNC set (originally comes from MAP_NOSYNC),
  *	leaving the object dirty.
  *
  *	When stuffing pages asynchronously, allow clustering.  XXX we need a
  *	synchronous clustering mode implementation.
  *
  *	Odd semantics: if start == end, we clean everything.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int flags)
 {
 	vm_page_t p, np;
 	vm_pindex_t tstart, tend;
 	vm_pindex_t pi;
 	int clearobjflags;
 	int pagerflags;
 	int curgeneration;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->type != OBJT_VNODE ||
 		(object->flags & OBJ_MIGHTBEDIRTY) == 0)
 		return;
 
 	pagerflags = (flags & (OBJPC_SYNC | OBJPC_INVAL)) ? VM_PAGER_PUT_SYNC : VM_PAGER_CLUSTER_OK;
 	pagerflags |= (flags & OBJPC_INVAL) ? VM_PAGER_PUT_INVAL : 0;
 
 	vm_object_set_flag(object, OBJ_CLEANING);
 
 	tstart = start;
 	if (end == 0) {
 		tend = object->size;
 	} else {
 		tend = end;
 	}
 
 	vm_page_lock_queues();
 	/*
 	 * If the caller is smart and only msync()s a range he knows is
 	 * dirty, we may be able to avoid an object scan.  This results in
 	 * a phenominal improvement in performance.  We cannot do this
 	 * as a matter of course because the object may be huge - e.g.
 	 * the size might be in the gigabytes or terrabytes.
 	 */
 	if (msync_flush_flags & MSYNC_FLUSH_HARDSEQ) {
 		vm_pindex_t tscan;
 		int scanlimit;
 		int scanreset;
 
 		scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
 		if (scanreset < 16)
 			scanreset = 16;
 		pagerflags |= VM_PAGER_IGNORE_CLEANCHK;
 
 		scanlimit = scanreset;
 		tscan = tstart;
 		while (tscan < tend) {
 			curgeneration = object->generation;
 			p = vm_page_lookup(object, tscan);
 			if (p == NULL || p->valid == 0 ||
 			    VM_PAGE_INQUEUE1(p, PQ_CACHE)) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
 				continue;
 			}
 			vm_page_test_dirty(p);
 			if ((p->dirty & p->valid) == 0) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
 				continue;
 			}
 			/*
 			 * If we have been asked to skip nosync pages and 
 			 * this is a nosync page, we can't continue.
 			 */
 			if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
 				if (--scanlimit == 0)
 					break;
 				++tscan;
 				continue;
 			}
 			scanlimit = scanreset;
 
 			/*
 			 * This returns 0 if it was unable to busy the first
 			 * page (i.e. had to sleep).
 			 */
 			tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags);
 		}
 
 		/*
 		 * If everything was dirty and we flushed it successfully,
 		 * and the requested range is not the entire object, we
 		 * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
 		 * return immediately.
 		 */
 		if (tscan >= tend && (tstart || tend < object->size)) {
 			vm_page_unlock_queues();
 			vm_object_clear_flag(object, OBJ_CLEANING);
 			return;
 		}
 		pagerflags &= ~VM_PAGER_IGNORE_CLEANCHK;
 	}
 
 	/*
 	 * Generally set CLEANCHK interlock and make the page read-only so
 	 * we can then clear the object flags.
 	 *
 	 * However, if this is a nosync mmap then the object is likely to 
 	 * stay dirty so do not mess with the page and do not clear the
 	 * object flags.
 	 */
 	clearobjflags = 1;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		p->oflags |= VPO_CLEANCHK;
 		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC))
 			clearobjflags = 0;
 		else
 			pmap_remove_write(p);
 	}
 
 	if (clearobjflags && (tstart == 0) && (tend == object->size)) {
 		struct vnode *vp;
 
 		vm_object_clear_flag(object, OBJ_MIGHTBEDIRTY);
 		if (object->type == OBJT_VNODE &&
 		    (vp = (struct vnode *)object->handle) != NULL) {
 			VI_LOCK(vp);
 			if (vp->v_iflag & VI_OBJDIRTY)
 				vp->v_iflag &= ~VI_OBJDIRTY;
 			VI_UNLOCK(vp);
 		}
 	}
 
 rescan:
 	curgeneration = object->generation;
 
 	for (p = TAILQ_FIRST(&object->memq); p; p = np) {
 		int n;
 
 		np = TAILQ_NEXT(p, listq);
 
 again:
 		pi = p->pindex;
 		if ((p->oflags & VPO_CLEANCHK) == 0 ||
 			(pi < tstart) || (pi >= tend) ||
 			(p->valid == 0) ||
 		    VM_PAGE_INQUEUE1(p, PQ_CACHE)) {
 			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0) {
 			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		/*
 		 * If we have been asked to skip nosync pages and this is a
 		 * nosync page, skip it.  Note that the object flags were
 		 * not cleared in this case so we do not have to set them.
 		 */
 		if ((flags & OBJPC_NOSYNC) && (p->oflags & VPO_NOSYNC)) {
 			p->oflags &= ~VPO_CLEANCHK;
 			continue;
 		}
 
 		n = vm_object_page_collect_flush(object, p,
 			curgeneration, pagerflags);
 		if (n == 0)
 			goto rescan;
 
 		if (object->generation != curgeneration)
 			goto rescan;
 
 		/*
 		 * Try to optimize the next page.  If we can't we pick up
 		 * our (random) scan where we left off.
 		 */
 		if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQ) {
 			if ((p = vm_page_lookup(object, pi + n)) != NULL)
 				goto again;
 		}
 	}
 	vm_page_unlock_queues();
 #if 0
 	VOP_FSYNC(vp, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
 #endif
 
 	vm_object_clear_flag(object, OBJ_CLEANING);
 	return;
 }
 
 static int
 vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
 {
 	int runlen;
 	int maxf;
 	int chkb;
 	int maxb;
 	int i;
 	vm_pindex_t pi;
 	vm_page_t maf[vm_pageout_page_count];
 	vm_page_t mab[vm_pageout_page_count];
 	vm_page_t ma[vm_pageout_page_count];
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	pi = p->pindex;
 	while (vm_page_sleep_if_busy(p, TRUE, "vpcwai")) {
 		vm_page_lock_queues();
 		if (object->generation != curgeneration) {
 			return(0);
 		}
 	}
 	maxf = 0;
 	for(i = 1; i < vm_pageout_page_count; i++) {
 		vm_page_t tp;
 
 		if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
 			if ((tp->oflags & VPO_BUSY) ||
 				((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
 				 (tp->oflags & VPO_CLEANCHK) == 0) ||
 				(tp->busy != 0))
 				break;
 			if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) {
 				tp->oflags &= ~VPO_CLEANCHK;
 				break;
 			}
 			vm_page_test_dirty(tp);
 			if ((tp->dirty & tp->valid) == 0) {
 				tp->oflags &= ~VPO_CLEANCHK;
 				break;
 			}
 			maf[ i - 1 ] = tp;
 			maxf++;
 			continue;
 		}
 		break;
 	}
 
 	maxb = 0;
 	chkb = vm_pageout_page_count -  maxf;
 	if (chkb) {
 		for(i = 1; i < chkb;i++) {
 			vm_page_t tp;
 
 			if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
 				if ((tp->oflags & VPO_BUSY) ||
 					((pagerflags & VM_PAGER_IGNORE_CLEANCHK) == 0 &&
 					 (tp->oflags & VPO_CLEANCHK) == 0) ||
 					(tp->busy != 0))
 					break;
 				if (VM_PAGE_INQUEUE1(tp, PQ_CACHE)) {
 					tp->oflags &= ~VPO_CLEANCHK;
 					break;
 				}
 				vm_page_test_dirty(tp);
 				if ((tp->dirty & tp->valid) == 0) {
 					tp->oflags &= ~VPO_CLEANCHK;
 					break;
 				}
 				mab[ i - 1 ] = tp;
 				maxb++;
 				continue;
 			}
 			break;
 		}
 	}
 
 	for(i = 0; i < maxb; i++) {
 		int index = (maxb - i) - 1;
 		ma[index] = mab[i];
 		ma[index]->oflags &= ~VPO_CLEANCHK;
 	}
 	p->oflags &= ~VPO_CLEANCHK;
 	ma[maxb] = p;
 	for(i = 0; i < maxf; i++) {
 		int index = (maxb + i) + 1;
 		ma[index] = maf[i];
 		ma[index]->oflags &= ~VPO_CLEANCHK;
 	}
 	runlen = maxb + maxf + 1;
 
 	vm_pageout_flush(ma, runlen, pagerflags);
 	for (i = 0; i < runlen; i++) {
 		if (ma[i]->valid & ma[i]->dirty) {
 			pmap_remove_write(ma[i]);
 			ma[i]->oflags |= VPO_CLEANCHK;
 
 			/*
 			 * maxf will end up being the actual number of pages
 			 * we wrote out contiguously, non-inclusive of the
 			 * first page.  We do not count look-behind pages.
 			 */
 			if (i >= maxb + 1 && (maxf > i - maxb - 1))
 				maxf = i - maxb - 1;
 		}
 	}
 	return(maxf + 1);
 }
 
 /*
  * Note that there is absolutely no sense in writing out
  * anonymous objects, so we track down the vnode object
  * to write out.
  * We invalidate (remove) all pages from the address space
  * for semantic correctness.
  *
  * Note: certain anonymous maps, such as MAP_NOSYNC maps,
  * may start out with a NULL object.
  */
 void
 vm_object_sync(vm_object_t object, vm_ooffset_t offset, vm_size_t size,
     boolean_t syncio, boolean_t invalidate)
 {
 	vm_object_t backing_object;
 	struct vnode *vp;
 	struct mount *mp;
 	int flags;
 
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	while ((backing_object = object->backing_object) != NULL) {
 		VM_OBJECT_LOCK(backing_object);
 		offset += object->backing_object_offset;
 		VM_OBJECT_UNLOCK(object);
 		object = backing_object;
 		if (object->size < OFF_TO_IDX(offset + size))
 			size = IDX_TO_OFF(object->size) - offset;
 	}
 	/*
 	 * Flush pages if writing is allowed, invalidate them
 	 * if invalidation requested.  Pages undergoing I/O
 	 * will be ignored by vm_object_page_remove().
 	 *
 	 * We cannot lock the vnode and then wait for paging
 	 * to complete without deadlocking against vm_fault.
 	 * Instead we simply call vm_object_page_remove() and
 	 * allow it to block internally on a page-by-page
 	 * basis when it encounters pages undergoing async
 	 * I/O.
 	 */
 	if (object->type == OBJT_VNODE &&
 	    (object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		int vfslocked;
 		vp = object->handle;
 		VM_OBJECT_UNLOCK(object);
 		(void) vn_start_write(vp, &mp, V_WAIT);
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
 		flags = (syncio || invalidate) ? OBJPC_SYNC : 0;
 		flags |= invalidate ? OBJPC_INVAL : 0;
 		VM_OBJECT_LOCK(object);
 		vm_object_page_clean(object,
 		    OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK),
 		    flags);
 		VM_OBJECT_UNLOCK(object);
 		VOP_UNLOCK(vp, 0, curthread);
 		VFS_UNLOCK_GIANT(vfslocked);
 		vn_finished_write(mp);
 		VM_OBJECT_LOCK(object);
 	}
 	if ((object->type == OBJT_VNODE ||
 	     object->type == OBJT_DEVICE) && invalidate) {
 		boolean_t purge;
 		purge = old_msync || (object->type == OBJT_DEVICE);
 		vm_object_page_remove(object,
 		    OFF_TO_IDX(offset),
 		    OFF_TO_IDX(offset + size + PAGE_MASK),
 		    purge ? FALSE : TRUE);
 	}
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
  *
  *	MADV_WILLNEED	(any object)
  *
  *	    Activate the specified pages if they are resident.
  *
  *	MADV_DONTNEED	(any object)
  *
  *	    Deactivate the specified pages if they are resident.
  *
  *	MADV_FREE	(OBJT_DEFAULT/OBJT_SWAP objects,
  *			 OBJ_ONEMAPPING only)
  *
  *	    Deactivate and clean the specified pages if they are
  *	    resident.  This permits the process to reuse the pages
  *	    without faulting or the kernel to reclaim the pages
  *	    without I/O.
  */
 void
 vm_object_madvise(vm_object_t object, vm_pindex_t pindex, int count, int advise)
 {
 	vm_pindex_t end, tpindex;
 	vm_object_t backing_object, tobject;
 	vm_page_t m;
 
 	if (object == NULL)
 		return;
 	VM_OBJECT_LOCK(object);
 	end = pindex + count;
 	/*
 	 * Locate and adjust resident pages
 	 */
 	for (; pindex < end; pindex += 1) {
 relookup:
 		tobject = object;
 		tpindex = pindex;
 shadowlookup:
 		/*
 		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
 		 * and those pages must be OBJ_ONEMAPPING.
 		 */
 		if (advise == MADV_FREE) {
 			if ((tobject->type != OBJT_DEFAULT &&
 			     tobject->type != OBJT_SWAP) ||
 			    (tobject->flags & OBJ_ONEMAPPING) == 0) {
 				goto unlock_tobject;
 			}
 		}
 		m = vm_page_lookup(tobject, tpindex);
 		if (m == NULL) {
 			/*
 			 * There may be swap even if there is no backing page
 			 */
 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
 			/*
 			 * next object
 			 */
 			backing_object = tobject->backing_object;
 			if (backing_object == NULL)
 				goto unlock_tobject;
 			VM_OBJECT_LOCK(backing_object);
 			tpindex += OFF_TO_IDX(tobject->backing_object_offset);
 			if (tobject != object)
 				VM_OBJECT_UNLOCK(tobject);
 			tobject = backing_object;
 			goto shadowlookup;
 		}
 		/*
 		 * If the page is busy or not in a normal active state,
 		 * we skip it.  If the page is not managed there are no
 		 * page queues to mess with.  Things can break if we mess
 		 * with pages in any of the below states.
 		 */
 		vm_page_lock_queues();
 		if (m->hold_count ||
 		    m->wire_count ||
 		    (m->flags & PG_UNMANAGED) ||
 		    m->valid != VM_PAGE_BITS_ALL) {
 			vm_page_unlock_queues();
 			goto unlock_tobject;
 		}
 		if ((m->oflags & VPO_BUSY) || m->busy) {
 			vm_page_flag_set(m, PG_REFERENCED);
 			vm_page_unlock_queues();
 			if (object != tobject)
 				VM_OBJECT_UNLOCK(object);
 			m->oflags |= VPO_WANTED;
 			msleep(m, VM_OBJECT_MTX(tobject), PDROP | PVM, "madvpo", 0);
 			VM_OBJECT_LOCK(object);
   			goto relookup;
 		}
 		if (advise == MADV_WILLNEED) {
 			vm_page_activate(m);
 		} else if (advise == MADV_DONTNEED) {
 			vm_page_dontneed(m);
 		} else if (advise == MADV_FREE) {
 			/*
 			 * Mark the page clean.  This will allow the page
 			 * to be freed up by the system.  However, such pages
 			 * are often reused quickly by malloc()/free()
 			 * so we do not do anything that would cause
 			 * a page fault if we can help it.
 			 *
 			 * Specifically, we do not try to actually free
 			 * the page now nor do we try to put it in the
 			 * cache (which would cause a page fault on reuse).
 			 *
 			 * But we do make the page as freeable as we
 			 * can without actually taking the step of unmapping
 			 * it.
 			 */
 			pmap_clear_modify(m);
 			m->dirty = 0;
 			m->act_count = 0;
 			vm_page_dontneed(m);
 		}
 		vm_page_unlock_queues();
 		if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 			swap_pager_freespace(tobject, tpindex, 1);
 unlock_tobject:
 		if (tobject != object)
 			VM_OBJECT_UNLOCK(tobject);
 	}	
 	VM_OBJECT_UNLOCK(object);
 }
 
 /*
  *	vm_object_shadow:
  *
  *	Interpose a fresh OBJT_DEFAULT object in front of the object
  *	named by *object, backed by that object starting at *offset.
  *
  *	No shadow is created when the existing object is an unnamed
  *	default/swap object holding a single reference, since such an
  *	object is not shared.
  *
  *	On return *object and *offset describe the object to use; after
  *	a shadow has been created *offset is 0.  The caller's reference
  *	to the old object is taken over by the shadow, so the old
  *	object's reference count is left unchanged.
  */
 void
 vm_object_shadow(
 	vm_object_t *object,	/* IN/OUT */
 	vm_ooffset_t *offset,	/* IN/OUT */
 	vm_size_t length)
 {
 	vm_object_t backing, shadow;
 	int unshared;
 
 	backing = *object;
 
 	/*
 	 * A private, unnamed default/swap object can be used as it is.
 	 */
 	if (backing != NULL) {
 		VM_OBJECT_LOCK(backing);
 		unshared = backing->ref_count == 1 &&
 		    backing->handle == NULL &&
 		    (backing->type == OBJT_DEFAULT ||
 		     backing->type == OBJT_SWAP);
 		VM_OBJECT_UNLOCK(backing);
 		if (unshared)
 			return;
 	}
 
 	/*
 	 * Build the shadow and point it at the old object.  The offset
 	 * handed in by the caller becomes the shadow's backing offset.
 	 */
 	shadow = vm_object_allocate(OBJT_DEFAULT, length);
 	shadow->backing_object = backing;
 	shadow->backing_object_offset = *offset;
 
 	if (backing != NULL) {
 		VM_OBJECT_LOCK(backing);
 		LIST_INSERT_HEAD(&backing->shadow_head, shadow, shadow_list);
 		backing->shadow_count++;
 		backing->generation++;
 
 		/*
 		 * Derive the shadow's page color from the backing
 		 * object's so that coloring stays consistent across
 		 * the combined chain.
 		 */
 		if (length < backing->size)
 			length = backing->size;
 		if (length > PQ_MAXLENGTH || backing->generation > 1)
 			length = PQ_MAXLENGTH;
 		shadow->pg_color = (backing->pg_color +
 		    length * backing->generation) & PQ_COLORMASK;
 		shadow->flags |= backing->flags & OBJ_NEEDGIANT;
 		VM_OBJECT_UNLOCK(backing);
 		next_index = (shadow->pg_color + PQ_MAXLENGTH) & PQ_COLORMASK;
 	}
 
 	/*
 	 * Hand the shadow back to the caller.
 	 */
 	*offset = 0;
 	*object = shadow;
 }
 
 /*
  *	vm_object_split:
  *
  * Split the pages in a map entry into a new object.  This affords
  * easier removal of unused pages, and keeps object inheritance from
  * being a negative impact on memory usage.
  *
  * Nothing is done unless the entry's object is an OBJT_DEFAULT or
  * OBJT_SWAP object with more than one reference.
  *
  * Locking: this function unlocks and relocks the entry's object, so the
  * caller presumably holds that lock on entry (NOTE(review): confirm
  * against the callers).  After a successful split it returns with the
  * new object locked and the original object unlocked; when it gives up
  * because the backing object is dead, the original object is locked
  * again before returning.
  */
 void
 vm_object_split(vm_map_entry_t entry)
 {
 	vm_page_t m, m_next;
 	vm_object_t orig_object, new_object, source;
 	vm_pindex_t idx, offidxstart;
 	vm_size_t size;
 
 	orig_object = entry->object.vm_object;
 	if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP)
 		return;
 	if (orig_object->ref_count <= 1)
 		return;
 	/* orig_object stays unlocked while the new object is allocated. */
 	VM_OBJECT_UNLOCK(orig_object);
 
 	/* First page index and page count of the range the entry maps. */
 	offidxstart = OFF_TO_IDX(entry->offset);
 	size = atop(entry->end - entry->start);
 
 	/*
 	 * If swap_pager_copy() is later called, it will convert new_object
 	 * into a swap object.
 	 */
 	new_object = vm_object_allocate(OBJT_DEFAULT, size);
 
 	/*
 	 * At this point, the new object is still private, so the order in
 	 * which the original and new objects are locked does not matter.
 	 */
 	VM_OBJECT_LOCK(new_object);
 	VM_OBJECT_LOCK(orig_object);
 	source = orig_object->backing_object;
 	if (source != NULL) {
 		VM_OBJECT_LOCK(source);
 		if ((source->flags & OBJ_DEAD) != 0) {
 			/*
 			 * The backing object is marked dead: abandon the
 			 * split, discard the new object and return with
 			 * orig_object locked.
 			 */
 			VM_OBJECT_UNLOCK(source);
 			VM_OBJECT_UNLOCK(orig_object);
 			VM_OBJECT_UNLOCK(new_object);
 			vm_object_deallocate(new_object);
 			VM_OBJECT_LOCK(orig_object);
 			return;
 		}
 		/*
 		 * Make new_object a second shadow of source, with its own
 		 * reference, and clear source's OBJ_ONEMAPPING flag.
 		 */
 		LIST_INSERT_HEAD(&source->shadow_head,
 				  new_object, shadow_list);
 		source->shadow_count++;
 		source->generation++;
 		vm_object_reference_locked(source);	/* for new_object */
 		vm_object_clear_flag(source, OBJ_ONEMAPPING);
 		VM_OBJECT_UNLOCK(source);
 		new_object->backing_object_offset = 
 			orig_object->backing_object_offset + entry->offset;
 		new_object->backing_object = source;
 	}
 	new_object->flags |= orig_object->flags & OBJ_NEEDGIANT;
 retry:
 	/*
 	 * Position m on the first resident page of orig_object whose index
 	 * is at or beyond offidxstart (or NULL if there is none).
 	 */
 	if ((m = TAILQ_FIRST(&orig_object->memq)) != NULL) {
 		if (m->pindex < offidxstart) {
 			m = vm_page_splay(offidxstart, orig_object->root);
 			if ((orig_object->root = m)->pindex < offidxstart)
 				m = TAILQ_NEXT(m, listq);
 		}
 	}
 	vm_page_lock_queues();
 	/* idx is the page's index relative to the start of the entry. */
 	for (; m != NULL && (idx = m->pindex - offidxstart) < size;
 	    m = m_next) {
 		m_next = TAILQ_NEXT(m, listq);
 
 		/*
 		 * We must wait for pending I/O to complete before we can
 		 * rename the page.
 		 *
 		 * We do not have to VM_PROT_NONE the page as mappings should
 		 * not be changed by this operation.
 		 */
 		if ((m->oflags & VPO_BUSY) || m->busy) {
 			vm_page_flag_set(m, PG_REFERENCED);
 			vm_page_unlock_queues();
 			VM_OBJECT_UNLOCK(new_object);
 			m->oflags |= VPO_WANTED;
 			/*
 			 * PDROP is not passed, so orig_object's mutex is
 			 * held again when msleep() returns; new_object is
 			 * relocked by hand and the scan starts over.
 			 */
 			msleep(m, VM_OBJECT_MTX(orig_object), PVM, "spltwt", 0);
 			VM_OBJECT_LOCK(new_object);
 			goto retry;
 		}
 		vm_page_rename(m, new_object, idx);
 		/* page automatically made dirty by rename and cache handled */
 		/* The page is left busy until the wakeup pass further down. */
 		vm_page_busy(m);
 	}
 	vm_page_unlock_queues();
 	if (orig_object->type == OBJT_SWAP) {
 		/*
 		 * swap_pager_copy() can sleep, in which case the orig_object's
 		 * and new_object's locks are released and reacquired.
 		 */
 		swap_pager_copy(orig_object, new_object, offidxstart, 0);
 	}
 	VM_OBJECT_UNLOCK(orig_object);
 	/* Un-busy every page that was moved into new_object. */
 	TAILQ_FOREACH(m, &new_object->memq, listq)
 		vm_page_wakeup(m);
 	VM_OBJECT_UNLOCK(new_object);
 	/*
 	 * Switch the entry over to new_object and release the reference it
 	 * held on orig_object; new_object is returned locked.
 	 */
 	entry->object.vm_object = new_object;
 	entry->offset = 0LL;
 	vm_object_deallocate(orig_object);
 	VM_OBJECT_LOCK(new_object);
 }
 
 /*
  * Operation flags accepted by vm_object_backing_scan().
  */
 #define	OBSC_TEST_ALL_SHADOWED	0x0001
 #define	OBSC_COLLAPSE_NOWAIT	0x0002
 #define	OBSC_COLLAPSE_WAIT	0x0004
 
 /*
  *	vm_object_backing_scan:
  *
  *	Walk the resident pages of object's backing object and apply the
  *	operation(s) selected by op:
  *
  *	OBSC_TEST_ALL_SHADOWED	check that every backing page that falls
  *				within object is also covered by a page
  *				with valid data in object or by object's
  *				pager; fails at once if the backing
  *				object is not OBJT_DEFAULT.
  *	OBSC_COLLAPSE_NOWAIT	free or move backing pages, skipping any
  *				page that is busy or has no valid data.
  *	OBSC_COLLAPSE_WAIT	mark the backing object OBJ_DEAD, then
  *				free or move backing pages, sleeping on
  *				busy pages and restarting the scan.
  *
  *	Returns 0 when an OBSC_TEST_ALL_SHADOWED check fails and 1
  *	otherwise.  Both object and its backing object must be locked.
  */
 static int
 vm_object_backing_scan(vm_object_t object, int op)
 {
 	int r = 1;
 	vm_page_t p;
 	vm_object_t backing_object;
 	vm_pindex_t backing_offset_index;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(object->backing_object, MA_OWNED);
 
 	backing_object = object->backing_object;
 	/* Page index within backing_object that corresponds to object's page 0. */
 	backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
 	/*
 	 * Initial conditions
 	 */
 	if (op & OBSC_TEST_ALL_SHADOWED) {
 		/*
 		 * We do not want to have to test for the existence of
 		 * swap pages in the backing object.  XXX but with the
 		 * new swapper this would be pretty easy to do.
 		 *
 		 * XXX what about anonymous MAP_SHARED memory that hasn't
 		 * been ZFOD faulted yet?  If we do not test for this, the
 		 * shadow test may succeed! XXX
 		 */
 		if (backing_object->type != OBJT_DEFAULT) {
 			return (0);
 		}
 	}
 	if (op & OBSC_COLLAPSE_WAIT) {
 		vm_object_set_flag(backing_object, OBJ_DEAD);
 	}
 
 	/*
 	 * Our scan
 	 */
 	p = TAILQ_FIRST(&backing_object->memq);
 	while (p) {
 		/* Fetched up front because p may be freed or renamed below. */
 		vm_page_t next = TAILQ_NEXT(p, listq);
 		/* Index the page would have inside object. */
 		vm_pindex_t new_pindex = p->pindex - backing_offset_index;
 
 		if (op & OBSC_TEST_ALL_SHADOWED) {
 			vm_page_t pp;
 
 			/*
 			 * Ignore pages outside the parent object's range
 			 * and outside the parent object's mapping of the 
 			 * backing object.
 			 *
 			 * note that we do not busy the backing object's
 			 * page.
 			 */
 			if (
 			    p->pindex < backing_offset_index ||
 			    new_pindex >= object->size
 			) {
 				p = next;
 				continue;
 			}
 
 			/*
 			 * See if the parent has the page or if the parent's
 			 * object pager has the page.  If the parent has the
 			 * page but the page is not valid, the parent's
 			 * object pager must have the page.
 			 *
 			 * If this fails, the parent does not completely shadow
 			 * the object and we might as well give up now.
 			 */
 
 			pp = vm_page_lookup(object, new_pindex);
 			if (
 			    (pp == NULL || pp->valid == 0) &&
 			    !vm_pager_has_page(object, new_pindex, NULL, NULL)
 			) {
 				r = 0;
 				break;
 			}
 		}
 
 		/*
 		 * Check for busy page
 		 */
 		if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
 			vm_page_t pp;
 
 			if (op & OBSC_COLLAPSE_NOWAIT) {
 				/* Never sleep: leave such pages where they are. */
 				if ((p->oflags & VPO_BUSY) ||
 				    !p->valid || 
 				    p->busy) {
 					p = next;
 					continue;
 				}
 			} else if (op & OBSC_COLLAPSE_WAIT) {
 				if ((p->oflags & VPO_BUSY) || p->busy) {
 					vm_page_lock_queues();
 					vm_page_flag_set(p, PG_REFERENCED);
 					vm_page_unlock_queues();
 					VM_OBJECT_UNLOCK(object);
 					p->oflags |= VPO_WANTED;
 					/*
 					 * PDROP: backing_object's mutex is
 					 * not held when msleep() returns, so
 					 * both locks are retaken below.
 					 */
 					msleep(p, VM_OBJECT_MTX(backing_object),
 					    PDROP | PVM, "vmocol", 0);
 					VM_OBJECT_LOCK(object);
 					VM_OBJECT_LOCK(backing_object);
 					/*
 					 * If we slept, anything could have
 					 * happened.  Since the object is
 					 * marked dead, the backing offset
 					 * should not have changed so we
 					 * just restart our scan.
 					 */
 					p = TAILQ_FIRST(&backing_object->memq);
 					continue;
 				}
 			}
 
 			KASSERT(
 			    p->object == backing_object,
 			    ("vm_object_backing_scan: object mismatch")
 			);
 
 			/*
 			 * Destroy any associated swap
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				swap_pager_freespace(
 				    backing_object, 
 				    p->pindex,
 				    1
 				);
 			}
 
 			if (
 			    p->pindex < backing_offset_index ||
 			    new_pindex >= object->size
 			) {
 				/*
 				 * Page is out of the parent object's range, we 
 				 * can simply destroy it. 
 				 */
 				vm_page_lock_queues();
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
 				/* A wired page is only detached, not freed. */
 				if (p->wire_count == 0)
 					vm_page_free(p);
 				else
 					vm_page_remove(p);
 				vm_page_unlock_queues();
 				p = next;
 				continue;
 			}
 
 			pp = vm_page_lookup(object, new_pindex);
 			if (
 			    pp != NULL ||
 			    vm_pager_has_page(object, new_pindex, NULL, NULL)
 			) {
 				/*
 				 * page already exists in parent OR swap exists
 				 * for this location in the parent.  Destroy 
 				 * the original page from the backing object.
 				 *
 				 * Leave the parent's page alone
 				 */
 				vm_page_lock_queues();
 				KASSERT(!pmap_page_is_mapped(p),
 				    ("freeing mapped page %p", p));
 				/* A wired page is only detached, not freed. */
 				if (p->wire_count == 0)
 					vm_page_free(p);
 				else
 					vm_page_remove(p);
 				vm_page_unlock_queues();
 				p = next;
 				continue;
 			}
 
 			/*
 			 * Page does not exist in parent, rename the
 			 * page from the backing object to the main object. 
 			 *
 			 * If the page was mapped to a process, it can remain 
 			 * mapped through the rename.
 			 */
 			vm_page_lock_queues();
 			vm_page_rename(p, object, new_pindex);
 			vm_page_unlock_queues();
 			/* page automatically made dirty by rename */
 		}
 		p = next;
 	}
 	return (r);
 }
 
 
 /*
  * Opportunistic form of collapse that may be used earlier, including
  * while paging_in_progress is set on an object.  It does not do the
  * whole job (pages that cannot be handled without waiting are left in
  * the backing object), but it should plug most of the remaining leaks.
  *
  * Both the object and its backing object must be locked.
  */
 static void
 vm_object_qcollapse(vm_object_t object)
 {
 	vm_object_t backing;
 
 	backing = object->backing_object;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(backing, MA_OWNED);
 
 	/* Pages are only pulled up from a singly-referenced backing object. */
 	if (backing->ref_count == 1)
 		vm_object_backing_scan(object, OBSC_COLLAPSE_NOWAIT);
 }
 
 /*
  *	vm_object_collapse:
  *
  *	Collapse an object with the object backing it.
  *	Pages in the backing object are moved into the
  *	parent, and the backing object is deallocated.
  *
  *	The loop repeats with the object's new backing object after
  *	each successful collapse or bypass, and stops as soon as
  *	neither is possible.
  *
  *	The object must be locked by the caller (asserted below).
  */
 void
 vm_object_collapse(vm_object_t object)
 {
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	
 	while (TRUE) {
 		vm_object_t backing_object;
 
 		/*
 		 * Verify that the conditions are right for collapse:
 		 *
 		 * The object exists and the backing object exists.
 		 */
 		if ((backing_object = object->backing_object) == NULL)
 			break;
 
 		/*
 		 * we check the backing object first, because it is most likely
 		 * not collapsible.
 		 *
 		 * Both objects must be unnamed (no handle), of type
 		 * OBJT_DEFAULT or OBJT_SWAP, and not marked OBJ_DEAD.
 		 */
 		VM_OBJECT_LOCK(backing_object);
 		if (backing_object->handle != NULL ||
 		    (backing_object->type != OBJT_DEFAULT &&
 		     backing_object->type != OBJT_SWAP) ||
 		    (backing_object->flags & OBJ_DEAD) ||
 		    object->handle != NULL ||
 		    (object->type != OBJT_DEFAULT &&
 		     object->type != OBJT_SWAP) ||
 		    (object->flags & OBJ_DEAD)) {
 			VM_OBJECT_UNLOCK(backing_object);
 			break;
 		}
 
 		/*
 		 * With paging in progress on either object only the
 		 * non-waiting partial collapse is attempted, and the loop
 		 * ends.
 		 */
 		if (
 		    object->paging_in_progress != 0 ||
 		    backing_object->paging_in_progress != 0
 		) {
 			vm_object_qcollapse(object);
 			VM_OBJECT_UNLOCK(backing_object);
 			break;
 		}
 		/*
 		 * We know that we can either collapse the backing object (if
 		 * the parent is the only reference to it) or (perhaps) have
 		 * the parent bypass the object if the parent happens to shadow
 		 * all the resident pages in the entire backing object.
 		 *
 		 * This is ignoring pager-backed pages such as swap pages.
 		 * vm_object_backing_scan fails the shadowing test in this
 		 * case.
 		 */
 		if (backing_object->ref_count == 1) {
 			/*
 			 * If there is exactly one reference to the backing
 			 * object, we can collapse it into the parent.  
 			 */
 			vm_object_backing_scan(object, OBSC_COLLAPSE_WAIT);
 
 			/*
 			 * Move the pager from backing_object to object.
 			 */
 			if (backing_object->type == OBJT_SWAP) {
 				/*
 				 * swap_pager_copy() can sleep, in which case
 				 * the backing_object's and object's locks are
 				 * released and reacquired.
 				 */
 				swap_pager_copy(
 				    backing_object,
 				    object,
 				    OFF_TO_IDX(object->backing_object_offset), TRUE);
 			}
 			/*
 			 * Object now shadows whatever backing_object did.
 			 * Note that the reference to 
 			 * backing_object->backing_object moves from within 
 			 * backing_object to within object.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			backing_object->generation++;
 			if (backing_object->backing_object) {
 				/*
 				 * In the grandparent's shadow list, replace
 				 * backing_object by object.
 				 */
 				VM_OBJECT_LOCK(backing_object->backing_object);
 				LIST_REMOVE(backing_object, shadow_list);
 				LIST_INSERT_HEAD(
 				    &backing_object->backing_object->shadow_head,
 				    object, shadow_list);
 				/*
 				 * The shadow_count has not changed.
 				 */
 				backing_object->backing_object->generation++;
 				VM_OBJECT_UNLOCK(backing_object->backing_object);
 			}
 			object->backing_object = backing_object->backing_object;
 			object->backing_object_offset +=
 			    backing_object->backing_object_offset;
 
 			/*
 			 * Discard backing_object.
 			 *
 			 * Since the backing object has no pages, no pager left,
 			 * and no object references within it, all that is
 			 * necessary is to dispose of it.
 			 */
 			KASSERT(backing_object->ref_count == 1, ("backing_object %p was somehow re-referenced during collapse!", backing_object));
 			VM_OBJECT_UNLOCK(backing_object);
 
 			/* Unlink it from the global object list and free it. */
 			mtx_lock(&vm_object_list_mtx);
 			TAILQ_REMOVE(
 			    &vm_object_list, 
 			    backing_object,
 			    object_list
 			);
 			mtx_unlock(&vm_object_list_mtx);
 
 			uma_zfree(obj_zone, backing_object);
 
 			object_collapses++;
 		} else {
 			vm_object_t new_backing_object;
 
 			/*
 			 * If we do not entirely shadow the backing object,
 			 * there is nothing we can do so we give up.
 			 *
 			 * The scan is skipped when every page of object is
 			 * resident.
 			 */
 			if (object->resident_page_count != object->size &&
 			    vm_object_backing_scan(object,
 			    OBSC_TEST_ALL_SHADOWED) == 0) {
 				VM_OBJECT_UNLOCK(backing_object);
 				break;
 			}
 
 			/*
 			 * Make the parent shadow the next object in the
 			 * chain.  Deallocating backing_object will not remove
 			 * it, since its reference count is at least 2.
 			 */
 			LIST_REMOVE(object, shadow_list);
 			backing_object->shadow_count--;
 			backing_object->generation++;
 
 			new_backing_object = backing_object->backing_object;
 			if ((object->backing_object = new_backing_object) != NULL) {
 				/*
 				 * Unlike the collapse case, object gains a
 				 * new shadow link and a new reference here;
 				 * backing_object keeps its own.
 				 */
 				VM_OBJECT_LOCK(new_backing_object);
 				LIST_INSERT_HEAD(
 				    &new_backing_object->shadow_head,
 				    object,
 				    shadow_list
 				);
 				new_backing_object->shadow_count++;
 				new_backing_object->generation++;
 				vm_object_reference_locked(new_backing_object);
 				VM_OBJECT_UNLOCK(new_backing_object);
 				object->backing_object_offset +=
 					backing_object->backing_object_offset;
 			}
 
 			/*
 			 * Drop the reference count on backing_object. Since
 			 * its ref_count was at least 2, it will not vanish.
 			 */
 			backing_object->ref_count--;
 			VM_OBJECT_UNLOCK(backing_object);
 			object_bypasses++;
 		}
 
 		/*
 		 * Try again with this object's new backing object.
 		 */
 	}
 }
 
 /*
  *	vm_object_page_remove:
  *
  *	Removes all physical pages in the given range from the
  *	object's list of pages.  If the range's end is zero, all
  *	physical pages from the range's start to the end of the object
  *	are deleted.  The range is [start, end): the page at index
  *	end itself is not touched.
  *
  *	Wired pages are never freed: their mappings are removed and,
  *	unless clean_only is set, they are marked invalid.  When
  *	clean_only is set, a page with valid data has its write
  *	mappings removed (pmap_remove_write()) and is kept if any of
  *	that valid data is dirty.
  *
  *	The object must be locked.
  */
 void
 vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
     boolean_t clean_only)
 {
 	vm_page_t p, next;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->resident_page_count == 0)
 		return;
 
 	/*
 	 * Since physically-backed objects do not use managed pages, we can't
 	 * remove pages from the object (we must instead remove the page
 	 * references, and then destroy the object).
 	 */
 	KASSERT(object->type != OBJT_PHYS || object == kernel_object ||
 	    object == kmem_object,
 	    ("attempt to remove pages from a physical object"));
 
 	/* Paired with the vm_object_pip_wakeup() at the end. */
 	vm_object_pip_add(object, 1);
 again:
 	vm_page_lock_queues();
 	if ((p = TAILQ_FIRST(&object->memq)) != NULL) {
 		if (p->pindex < start) {
 			p = vm_page_splay(start, object->root);
 			if ((object->root = p)->pindex < start)
 				p = TAILQ_NEXT(p, listq);
 		}
 	}
 	/*
 	 * Assert: the variable p is either (1) the page with the
 	 * least pindex greater than or equal to the parameter start
 	 * or (2) NULL.
 	 */
 	for (;
 	     p != NULL && (p->pindex < end || end == 0);
 	     p = next) {
 		/* Fetched first because p may be freed below. */
 		next = TAILQ_NEXT(p, listq);
 
 		if (p->wire_count != 0) {
 			pmap_remove_all(p);
 			if (!clean_only)
 				p->valid = 0;
 			continue;
 		}
 		/* A nonzero return restarts the whole scan from "again". */
 		if (vm_page_sleep_if_busy(p, TRUE, "vmopar"))
 			goto again;
 		if (clean_only && p->valid) {
 			pmap_remove_write(p);
 			if (p->valid & p->dirty)
 				continue;
 		}
 		pmap_remove_all(p);
 		vm_page_free(p);
 	}
 	vm_page_unlock_queues();
 	vm_object_pip_wakeup(object);
 }
 
 /*
  *	Routine:	vm_object_coalesce
  *	Function:	Try to let prev_object also back the region that
  *			immediately follows the one it backs already,
  *			growing the object if required.
  *
  *	Returns TRUE if the following region can use prev_object
  *	(always the case when prev_object is NULL), FALSE otherwise.
  *
  *	NOTE:	There is no second object: only the case where the
  *		following region has no object of its own is handled
  *		(otherwise, which object would be locked first?).
  *
  *	Parameters:
  *		prev_object	Object backing the first region
  *		prev_offset	Offset of the first region in prev_object
  *		prev_size	Size of the first region, in bytes
  *		next_size	Size of the following region, in bytes
  *
  *	Conditions:
  *	The object must *not* be locked; it is locked and unlocked here.
  */
 boolean_t
 vm_object_coalesce(vm_object_t prev_object, vm_ooffset_t prev_offset,
 	vm_size_t prev_size, vm_size_t next_size)
 {
 	vm_pindex_t next_pindex;
 	boolean_t merged;
 
 	if (prev_object == NULL)
 		return (TRUE);
 
 	merged = FALSE;
 	VM_OBJECT_LOCK(prev_object);
 
 	/* Only default and swap objects can be extended. */
 	if (prev_object->type != OBJT_DEFAULT &&
 	    prev_object->type != OBJT_SWAP)
 		goto done;
 
 	/*
 	 * Collapse first; if the object still shadows another one
 	 * afterwards, pages beyond the first region may be in use.
 	 */
 	vm_object_collapse(prev_object);
 	if (prev_object->backing_object != NULL)
 		goto done;
 
 	/* From here on the sizes are counted in pages. */
 	prev_size >>= PAGE_SHIFT;
 	next_size >>= PAGE_SHIFT;
 	next_pindex = OFF_TO_IDX(prev_offset) + prev_size;
 
 	/*
 	 * A multiply-referenced object may only be extended at its
 	 * very end.
 	 */
 	if (prev_object->ref_count > 1 &&
 	    prev_object->size != next_pindex)
 		goto done;
 
 	/*
 	 * Throw away pages and swap space left over in the new range
 	 * from an earlier deallocation.
 	 */
 	if (next_pindex < prev_object->size) {
 		vm_object_page_remove(prev_object, next_pindex,
 		    next_pindex + next_size, FALSE);
 		if (prev_object->type == OBJT_SWAP)
 			swap_pager_freespace(prev_object, next_pindex,
 			    next_size);
 	}
 
 	/*
 	 * Grow the object when the new range ends beyond it.
 	 */
 	if (next_pindex + next_size > prev_object->size)
 		prev_object->size = next_pindex + next_size;
 
 	merged = TRUE;
 done:
 	VM_OBJECT_UNLOCK(prev_object);
 	return (merged);
 }
 
 /*
  * Flag the object as possibly containing dirty pages.  The first time
  * the flag is set on a vnode-backed object that still has a vnode
  * attached, VI_OBJDIRTY is also set on that vnode.
  *
  * The object must be locked.
  */
 void
 vm_object_set_writeable_dirty(vm_object_t object)
 {
 	struct vnode *vp;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Already flagged: nothing more to do. */
 	if (object->flags & OBJ_MIGHTBEDIRTY)
 		return;
 	vm_object_set_flag(object, OBJ_MIGHTBEDIRTY);
 
 	if (object->type != OBJT_VNODE)
 		return;
 	vp = (struct vnode *)object->handle;
 	if (vp == NULL)
 		return;
 
 	VI_LOCK(vp);
 	vp->v_iflag |= VI_OBJDIRTY;
 	VI_UNLOCK(vp);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <sys/cons.h>
 
 #include <ddb/ddb.h>
 
 /*
  * Return 1 if object can be reached from map, 0 otherwise.
  *
  * With a NULL entry every entry of the map is examined.  With a
  * non-NULL entry only that entry is examined: a sub-map entry is
  * searched recursively, any other entry is matched against its object
  * and that object's chain of backing objects.
  */
 static int
 _vm_object_in_map(vm_map_t map, vm_object_t object, vm_map_entry_t entry)
 {
 	vm_map_t submap;
 	vm_map_entry_t cur;
 	vm_object_t link;
 	int remaining;
 
 	if (map == NULL)
 		return 0;
 
 	if (entry == NULL) {
 		/* Whole-map query: try each entry, at most nentries of them. */
 		cur = map->header.next;
 		for (remaining = map->nentries;
 		    remaining-- && cur != &map->header;
 		    cur = cur->next) {
 			if (_vm_object_in_map(map, object, cur))
 				return 1;
 		}
 		return 0;
 	}
 
 	if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) {
 		/* Descend into the sub-map, one entry at a time. */
 		submap = entry->object.sub_map;
 		cur = submap->header.next;
 		for (remaining = submap->nentries;
 		    remaining-- && cur != &submap->header;
 		    cur = cur->next) {
 			if (_vm_object_in_map(submap, object, cur))
 				return 1;
 		}
 		return 0;
 	}
 
 	/* Ordinary entry: follow its object's backing chain. */
 	for (link = entry->object.vm_object; link != NULL;
 	    link = link->backing_object) {
 		if (link == object)
 			return 1;
 	}
 	return 0;
 }
 
 /*
  * Return 1 if object is found in the address map of any process or in
  * one of the kernel_map, kmem_map, pager_map or buffer_map maps, and 0
  * otherwise.
  *
  * The process list is walked without taking allproc_lock.
  */
 static int
 vm_object_in_map(vm_object_t object)
 {
 	struct proc *p;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		/* Processes without an address space have nothing to search. */
 		if (p->p_vmspace == NULL)
 			continue;
 		if (_vm_object_in_map(&p->p_vmspace->vm_map, object, NULL))
 			return 1;
 	}
 
 	/* Not in any process: fall back to the fixed kernel maps. */
 	return (_vm_object_in_map(kernel_map, object, NULL) ||
 	    _vm_object_in_map(kmem_map, object, NULL) ||
 	    _vm_object_in_map(pager_map, object, NULL) ||
 	    _vm_object_in_map(buffer_map, object, NULL));
 }
 
 /*
  * DDB "show vmochk": sanity-check every internal object, i.e. every
  * unnamed OBJT_DEFAULT or OBJT_SWAP object on the global object list.
  * An object is reported when its reference count is zero and when it
  * cannot be found in any map.
  */
 DB_SHOW_COMMAND(vmochk, vm_object_check)
 {
 	vm_object_t obj;
 
 	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 		/* Named objects and other object types are not checked. */
 		if (obj->handle != NULL)
 			continue;
 		if (obj->type != OBJT_DEFAULT && obj->type != OBJT_SWAP)
 			continue;
 
 		if (obj->ref_count == 0)
 			db_printf("vmochk: internal obj has zero ref count: %ld\n",
 			    (long)obj->size);
 
 		if (vm_object_in_map(obj))
 			continue;
 		db_printf(
 		    "vmochk: internal obj is not in a map: "
 		    "ref: %d, size: %lu: 0x%lx, backing_object: %p\n",
 		    obj->ref_count, (u_long)obj->size,
 		    (u_long)obj->size,
 		    (void *)obj->backing_object);
 	}
 }
 
 /*
  *	vm_object_print:	[ debug ]
  */
 DB_SHOW_COMMAND(object, vm_object_print_static)
 {
 	/* XXX convert args. */
 	vm_object_t object = (vm_object_t)addr;
 	boolean_t full = have_addr;
 
 	vm_page_t p;
 
 	/* XXX count is an (unused) arg.  Avoid shadowing it. */
 #define	count	was_count
 
 	int count;
 
 	if (object == NULL)
 		return;
 
 	db_iprintf(
 	    "Object %p: type=%d, size=0x%jx, res=%d, ref=%d, flags=0x%x\n",
 	    object, (int)object->type, (uintmax_t)object->size,
 	    object->resident_page_count, object->ref_count, object->flags);
 	db_iprintf(" sref=%d, backing_object(%d)=(%p)+0x%jx\n",
 	    object->shadow_count, 
 	    object->backing_object ? object->backing_object->ref_count : 0,
 	    object->backing_object, (uintmax_t)object->backing_object_offset);
 
 	if (!full)
 		return;
 
 	db_indent += 2;
 	count = 0;
 	TAILQ_FOREACH(p, &object->memq, listq) {
 		if (count == 0)
 			db_iprintf("memory:=");
 		else if (count == 6) {
 			db_printf("\n");
 			db_iprintf(" ...");
 			count = 0;
 		} else
 			db_printf(",");
 		count++;
 
 		db_printf("(off=0x%jx,page=0x%jx)",
 		    (uintmax_t)p->pindex, (uintmax_t)VM_PAGE_TO_PHYS(p));
 	}
 	if (count != 0)
 		db_printf("\n");
 	db_indent -= 2;
 }
 
 /* XXX. */
 #undef count
 
 /*
  * Non-static entry point to the "show object" DDB command, needed so
  * that it can be called from vm_map_print.  All four arguments are
  * passed through unchanged to vm_object_print_static().
  */
 void
 vm_object_print(
         /* db_expr_t */ long addr,
 	boolean_t have_addr,
 	/* db_expr_t */ long count,
 	char *modif)
 {
 	vm_object_print_static(addr, have_addr, count, modif);
 }
 
 /*
  * Output pacing for "show vmopag".  *nl counts the lines printed since
  * the last pause.  Once it exceeds 18 a character is read from the
  * console: a space resets the counter and lets the listing go on, any
  * other character aborts it.  Returns nonzero when the listing must
  * stop.
  */
 static int
 vm_object_print_pages_pause(int *nl)
 {
 	if (*nl > 18) {
 		if (cngetc() != ' ')
 			return (1);
 		*nl = 0;
 	}
 	++*nl;
 	return (0);
 }
 
 /*
  * DDB "show vmopag": for every object on the global object list, print
  * the runs of resident pages found among its first 128 page indices.
  * A run is extended by a page whose physical address directly follows
  * the run, or whose distance from that address, counted in pages, is
  * zero after masking with PQ_COLORMASK.
  */
 DB_SHOW_COMMAND(vmopag, vm_object_print_pages)
 {
 	vm_object_t obj;
 	int lines = 0;
 
 	TAILQ_FOREACH(obj, &vm_object_list, object_list) {
 		vm_pindex_t idx, first_idx;
 		vm_pindex_t limit;
 		vm_paddr_t pa = -1, padiff;
 		int run;
 		vm_page_t m;
 
 		db_printf("new object: %p\n", (void *)obj);
 		if (vm_object_print_pages_pause(&lines))
 			return;
 
 		run = 0;
 		first_idx = 0;
 		limit = obj->size;
 		if (limit > 128)
 			limit = 128;
 
 		for (idx = 0; idx < limit; idx++) {
 			m = vm_page_lookup(obj, idx);
 			if (m == NULL) {
 				/* A hole ends the current run, if any. */
 				if (run == 0)
 					continue;
 				db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 				    (long)first_idx, run, (long)pa);
 				if (vm_object_print_pages_pause(&lines))
 					return;
 				run = 0;
 				continue;
 			}
 
 			if (run != 0) {
 				/* Physically contiguous with the run. */
 				if (VM_PAGE_TO_PHYS(m) ==
 				    pa + run * PAGE_SIZE) {
 					++run;
 					continue;
 				}
 
 				padiff = pa + run * PAGE_SIZE -
 				    VM_PAGE_TO_PHYS(m);
 				padiff >>= PAGE_SHIFT;
 				padiff &= PQ_COLORMASK;
 				if (padiff == 0) {
 					/* Rebase the run on this page. */
 					pa = VM_PAGE_TO_PHYS(m) -
 					    run * PAGE_SIZE;
 					++run;
 					continue;
 				}
 
 				/* The run is broken: report it. */
 				db_printf(" index(%ld)run(%d)pa(0x%lx)",
 				    (long)first_idx, run, (long)pa);
 				db_printf("pd(%ld)\n", (long)padiff);
 				if (vm_object_print_pages_pause(&lines))
 					return;
 			}
 
 			/* Start a new run at this page. */
 			first_idx = idx;
 			pa = VM_PAGE_TO_PHYS(m);
 			run = 1;
 		}
 
 		/* Report a run that reaches the end of the scanned range. */
 		if (run != 0) {
 			db_printf(" index(%ld)run(%d)pa(0x%lx)\n",
 			    (long)first_idx, run, (long)pa);
 			if (vm_object_print_pages_pause(&lines))
 				return;
 		}
 	}
 }
 #endif /* DDB */
Index: head/sys/vm/vm_page.c
===================================================================
--- head/sys/vm/vm_page.c	(revision 170516)
+++ head/sys/vm/vm_page.c	(revision 170517)
@@ -1,1819 +1,1814 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
  */
 
 /*-
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *			GENERAL RULES ON VM_PAGE MANIPULATION
  *
  *	- a pageq mutex is required when adding or removing a page from a
  *	  page queue (vm_page_queue[]), regardless of other mutexes or the
  *	  busy state of a page.
  *
  *	- a hash chain mutex is required when associating or disassociating
  *	  a page from the VM PAGE CACHE hash table (vm_page_buckets),
  *	  regardless of other mutexes or the busy state of a page.
  *
  *	- either a hash chain mutex OR a busied page is required in order
  *	  to modify the page flags.  A hash chain mutex must be obtained in
  *	  order to busy a page.  A page's flags cannot be modified by a
  *	  hash chain mutex if the page is marked busy.
  *
  *	- The object memq mutex is held when inserting or removing
  *	  pages from an object (vm_page_insert() or vm_page_remove()).  This
  *	  is different from the object's main mutex.
  *
  *	Generally speaking, you have to be aware of side effects when running
  *	vm_page ops.  A vm_page_lookup() will return with the hash chain
  *	locked, whether it was able to lookup the page or not.  vm_page_free(),
  *	vm_page_cache(), vm_page_activate(), and a number of other routines
  *	will release the hash chain mutex for you.  Intermediate manipulation
  *	routines such as vm_page_flag_set() expect the hash chain to be held
  *	on entry and the hash chain will remain held on return.
  *
  *	pageq scanning can only occur with the pageq in question locked.
  *	We have a known bottleneck with the active queue, but the cache
  *	and free queues are actually arrays already. 
  */
 
 /*
  *	Resident memory management module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <machine/md_var.h>
 
 /*
  *	Associated with each page of user-allocatable memory is a
  *	page structure.
  */
 
 struct mtx vm_page_queue_mtx;
 struct mtx vm_page_queue_free_mtx;
 
 vm_page_t vm_page_array = 0;
 int vm_page_array_size = 0;
 long first_page = 0;
 int vm_page_zero_count = 0;
 
 static int boot_pages = UMA_BOOT_PAGES;
 TUNABLE_INT("vm.boot_pages", &boot_pages);
 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
 	"number of pages allocated for bootstrapping the VM system");
 
 /*
  *	vm_set_page_size:
  *
  *	Sets the page size, perhaps based upon the memory
  *	size.  Must be called before any use of page-size
  *	dependent functions.
  */
 void
 vm_set_page_size(void)
 {
 	/* Fall back to the compile-time page size if none has been set. */
 	if (cnt.v_page_size == 0)
 		cnt.v_page_size = PAGE_SIZE;
 
 	/* A power of two shares no set bits with its predecessor. */
 	if ((cnt.v_page_size & (cnt.v_page_size - 1)) != 0)
 		panic("vm_set_page_size: page size not a power of two");
 }
 
 /*
  *	vm_page_blacklist_lookup:
  *
  *	See if a physical address in this page has been listed
  *	in the blacklist tunable.  Entries in the tunable are
  *	separated by spaces or commas.  If an invalid integer is
  *	encountered then the rest of the string is skipped.
  */
 static int
 vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
 	/*
 	 * Walk the list one token at a time.  Returns 1 if some listed
 	 * address, truncated to a page boundary, equals pa; 0 otherwise.
 	 */
 	for (pos = list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
 		if (cp == pos) {
 			/*
 			 * Nothing was converted, so "bad" is not a real
 			 * entry and must not be compared (it would read as
 			 * address 0).  Step over an empty field left by a
 			 * leading or doubled separator; anything else is an
 			 * invalid integer and ends the scan.
 			 */
 			if (*cp != ' ' && *cp != ',')
 				break;
 			cp++;
 			continue;
 		}
 		if (*cp != '\0') {
 			/* A number must be followed by a separator or the end. */
 			if (*cp != ' ' && *cp != ',')
 				break;
 			cp++;
 		}
 		if (pa == trunc_page(bad))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.
  *
  *	Allocates memory for the page cells, and
  *	for the object/offset-to-page hash table headers.
  *	Each page cell is initialized and placed on the free list.
  */
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
 	vm_offset_t mapped;
 	vm_size_t npages;
 	vm_paddr_t page_range;
 	vm_paddr_t new_end;
 	int i;
 	vm_paddr_t pa;
 	int nblocks;
 	vm_paddr_t last_pa;
 	char *list;
 
 	/* the biggest memory array is the second group of pages */
 	vm_paddr_t end;
 	vm_paddr_t biggestsize;
 	vm_paddr_t low_water, high_water;
 	int biggestone;
 
 	vm_paddr_t total;
 
 	total = 0;
 	biggestsize = 0;
 	biggestone = 0;
 	nblocks = 0;
 	vaddr = round_page(vaddr);
 
 	/*
 	 * Page-align every phys_avail[] range: starts are rounded up and
 	 * ends are truncated down.  The array is a list of (start, end)
 	 * pairs terminated by a zero end address.
 	 */
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		phys_avail[i] = round_page(phys_avail[i]);
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
 
 	low_water = phys_avail[0];
 	high_water = phys_avail[1];
 
 	/*
 	 * Find the largest range, the lowest and highest addresses, the
 	 * number of ranges and the total number of bytes they cover.
 	 */
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
 
 		if (size > biggestsize) {
 			biggestone = i;
 			biggestsize = size;
 		}
 		if (phys_avail[i] < low_water)
 			low_water = phys_avail[i];
 		if (phys_avail[i + 1] > high_water)
 			high_water = phys_avail[i + 1];
 		++nblocks;
 		total += size;
 	}
 
 	/* Everything allocated below is carved off the top of that range. */
 	end = phys_avail[biggestone+1];
 
 	/*
 	 * Initialize the locks.
 	 */
 	mtx_init(&vm_page_queue_mtx, "vm page queue mutex", NULL, MTX_DEF |
 	    MTX_RECURSE);
 	mtx_init(&vm_page_queue_free_mtx, "vm page queue free mutex", NULL,
 	    MTX_DEF);
 
 	/*
 	 * Initialize the queue headers for the free queue, the active queue
 	 * and the inactive queue.
 	 */
 	vm_pageq_init();
 
 	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
 	 */
 	new_end = end - (boot_pages * UMA_SLAB_SIZE);
 	new_end = trunc_page(new_end);
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
 #if defined(__amd64__) || defined(__i386__)
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
 	 *
 	 * The amd64 port needs this to indicate which direct map pages
 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 	 *
 	 * However, i386 still needs this workspace internally within the
 	 * minidump code.  In theory, they are not needed on i386, but are
 	 * included should the sf_buf code decide to use them.
 	 */
 	/* One bit per page frame up to the end of the last range. */
 	page_range = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE;
 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
 	new_end -= vm_page_dump_size;
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
 #endif
 	/*
 	 * Compute the number of pages of memory that will be available for
 	 * use (taking into account the overhead of a page structure per
 	 * page).
 	 */
 	first_page = low_water / PAGE_SIZE;
 #ifdef VM_PHYSSEG_SPARSE
 	/* Sparse: one vm_page per page actually present in phys_avail[]. */
 	page_range = 0;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
 #elif defined(VM_PHYSSEG_DENSE)
 	/* Dense: one vm_page per frame between the low and high marks. */
 	page_range = high_water / PAGE_SIZE - first_page;
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 	/*
 	 * Usable pages: the total, less the vm_page array itself and less
 	 * what has already been carved off above (end - new_end).
 	 */
 	npages = (total - (page_range * sizeof(struct vm_page)) -
 	    (end - new_end)) / PAGE_SIZE;
 	end = new_end;
 
 	/*
 	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
 	 */
 	vaddr += PAGE_SIZE;
 
 	/*
 	 * Initialize the mem entry structures now, and put them in the free
 	 * queue.
 	 */
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	vm_page_array = (vm_page_t) mapped;
 #ifdef __amd64__
 	/*
 	 * pmap_map on amd64 comes out of the direct-map, not kvm like i386,
 	 * so the pages must be tracked for a crashdump to include this data.
 	 * This includes the vm_page_array and the early UMA bootstrap pages.
 	 */
 	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif	
 	/* Shrink the largest range so the carved-off memory is never freed. */
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
 	 * Clear all of the page structures
 	 */
 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
 	vm_page_array_size = page_range;
 
 	/*
 	 * This assertion tests the hypothesis that npages and total are
 	 * redundant.  XXX
 	 */
 	page_range = 0;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
 	KASSERT(page_range == npages,
 	    ("vm_page_startup: inconsistent page counts"));
 
 	/*
 	 * Construct the free queue(s) in descending order (by physical
 	 * address) so that the first 16MB of physical memory is allocated
 	 * last rather than first.  On large-memory machines, this avoids
 	 * the exhaustion of low physical memory before isa_dma_init has run.
 	 */
 	cnt.v_page_count = 0;
 	cnt.v_free_count = 0;
 	/* Pages named in the "vm.blacklist" tunable are never added. */
 	list = getenv("vm.blacklist");
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		pa = phys_avail[i];
 		last_pa = phys_avail[i + 1];
 		while (pa < last_pa) {
 			if (list != NULL &&
 			    vm_page_blacklist_lookup(list, pa))
 				printf("Skipping page with pa 0x%jx\n",
 				    (uintmax_t)pa);
 			else
 				vm_pageq_add_new_page(pa);
 			pa += PAGE_SIZE;
 		}
 	}
 	freeenv(list);
 	/*
 	 * Hand back vaddr as advanced by the guard page above; it was also
 	 * passed by reference to each pmap_map() call.
 	 */
 	return (vaddr);
 }
 
 /*
  * Turn on the given bits in the page's flags word.  The caller must own
  * the page queue mutex.
  */
 void
 vm_page_flag_set(vm_page_t m, unsigned short bits)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	m->flags = m->flags | bits;
 }
 
 /*
  * Turn off the given bits in the page's flags word.  The caller must own
  * the page queue mutex.
  */
 void
 vm_page_flag_clear(vm_page_t m, unsigned short bits)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	m->flags = m->flags & ~bits;
 }
 
 /*
  * Mark the page busy by setting VPO_BUSY.  The page's object must be
  * locked and the page must not already be busy.
  */
 void
 vm_page_busy(vm_page_t m)
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	KASSERT(!(m->oflags & VPO_BUSY),
 	    ("vm_page_busy: page already busy!!!"));
 	m->oflags = m->oflags | VPO_BUSY;
 }
 
 /*
  *      vm_page_flash:
  *
  *      wakeup anyone waiting for the page.
  */
 void
 vm_page_flash(vm_page_t m)
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/* Nobody registered interest in this page: nothing to do. */
 	if ((m->oflags & VPO_WANTED) == 0)
 		return;
 
 	/* Clear the request first, then wake the sleepers on the page. */
 	m->oflags = m->oflags & ~VPO_WANTED;
 	wakeup(m);
 }
 
 /*
  *      vm_page_wakeup:
  *
  *      clear the VPO_BUSY flag and wakeup anyone waiting for the
  *      page.
  *
  */
 void
 vm_page_wakeup(vm_page_t m)
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	KASSERT((m->oflags & VPO_BUSY) != 0,
 	    ("vm_page_wakeup: page not busy!!!"));
 
 	/* Drop the busy bit, then notify anyone waiting on the page. */
 	m->oflags = m->oflags & ~VPO_BUSY;
 	vm_page_flash(m);
 }
 
 /*
  * Note the start of an I/O on the page by bumping its busy count.
  * The page's object must be locked.
  */
 void
 vm_page_io_start(vm_page_t m)
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	++m->busy;
 }
 
 /*
  * Note the end of an I/O on the page by dropping its busy count; when the
  * count reaches zero, wake anyone waiting for the page.  The page's object
  * must be locked.
  */
 void
 vm_page_io_finish(vm_page_t m)
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if (--m->busy == 0)
 		vm_page_flash(m);
 }
 
 /*
  * Keep a page from being freed by the page daemon.  This has much
  * the same effect as wiring, but with much lower overhead, and it
  * should be used only for *very* temporary holding ("wiring").
  */
 void
 vm_page_hold(vm_page_t mem)
 {
 
 	/* The hold count is only changed under the page queue mutex. */
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	++mem->hold_count;
 }
 
 /*
  * Release one hold on the page.  If that was the last hold and the page
  * is sitting on the PQ_HOLD queue, finish freeing it.  The page queue
  * mutex must be held.
  */
 void
 vm_page_unhold(vm_page_t mem)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	mem->hold_count--;
 	KASSERT(mem->hold_count >= 0, ("vm_page_unhold: hold count < 0!!!"));
 
 	/* Still held, or not parked on the hold queue: nothing more to do. */
 	if (mem->hold_count != 0 || !VM_PAGE_INQUEUE2(mem, PQ_HOLD))
 		return;
 	vm_page_free_toq(mem);
 }
 
 /*
  *	vm_page_free:
  *
  *	Free a page.
  */
 void
 vm_page_free(vm_page_t m)
 {
 
 	/* The page is not known to be zero-filled: clear PG_ZERO first. */
 	m->flags = m->flags & ~PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  *	vm_page_free_zero:
  *
  *	Free a page to the zeroed-pages queue
  */
 void
 vm_page_free_zero(vm_page_t m)
 {
 
 	/* Tag the page as zero-filled before handing it to the free path. */
 	m->flags = m->flags | PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  *	vm_page_sleep:
  *
  *	Sleep and release the page queues lock.
  *
  *	The object containing the given page must be locked.
  */
 void
 vm_page_sleep(vm_page_t m, const char *msg)
 {
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	/*
 	 * Mark the page referenced.  The page queues lock is acquired here
 	 * only if the caller does not already own it, but it is released
 	 * once unconditionally before sleeping.
 	 */
 	if (!mtx_owned(&vm_page_queue_mtx))
 		vm_page_lock_queues();
 	vm_page_flag_set(m, PG_REFERENCED);
 	vm_page_unlock_queues();
 
 	/*
 	 * It's possible that while we sleep, the page will get
 	 * unbusied and freed.  If we are holding the object
 	 * lock, we will assume we hold a reference to the object
 	 * such that even if m->object changes, we can re-lock
 	 * it.
 	 */
 	/*
 	 * Ask vm_page_flash() for a wakeup, then sleep on the page with the
 	 * object mutex as the interlock (no PDROP, so msleep() is expected
 	 * to return with that mutex held again).
 	 */
 	m->oflags |= VPO_WANTED;
 	msleep(m, VM_OBJECT_MTX(m->object), PVM, msg, 0);
 }
 
 /*
  *	vm_page_dirty:
  *
  *	make page all dirty
  */
 void
 vm_page_dirty(vm_page_t m)
 {
 	/* Pages on the cache or free queues must never be dirtied. */
 	KASSERT(VM_PAGE_GETKNOWNQUEUE1(m) != PQ_CACHE,
 	    ("vm_page_dirty: page in cache!"));
 	KASSERT(VM_PAGE_GETKNOWNQUEUE1(m) != PQ_FREE,
 	    ("vm_page_dirty: page is free!"));
 	/* Set every bit of the page's dirty mask. */
 	m->dirty = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_splay:
  *
  *	Implements Sleator and Tarjan's top-down splay algorithm.  Returns
  *	the vm_page containing the given pindex.  If, however, that
  *	pindex is not found in the vm_object, returns a vm_page that is
  *	adjacent to the pindex, coming before or after it.
  */
 vm_page_t
 vm_page_splay(vm_pindex_t pindex, vm_page_t root)
 {
 	/*
 	 * dummy is a scratch header.  Subtrees split off with smaller keys
 	 * are chained through ->right starting at dummy.right; subtrees
 	 * split off with larger keys are chained through ->left starting
 	 * at dummy.left.
 	 */
 	struct vm_page dummy;
 	vm_page_t lefttreemax, righttreemin, y;
 
 	/* An empty tree splays to itself. */
 	if (root == NULL)
 		return (root);
 	lefttreemax = righttreemin = &dummy;
 	for (;; root = y) {
 		if (pindex < root->pindex) {
 			/* Target lies to the left; stop if there is no left. */
 			if ((y = root->left) == NULL)
 				break;
 			if (pindex < y->pindex) {
 				/* Rotate right. */
 				root->left = y->right;
 				y->right = root;
 				root = y;
 				if ((y = root->left) == NULL)
 					break;
 			}
 			/* Link into the new root's right tree. */
 			righttreemin->left = root;
 			righttreemin = root;
 		} else if (pindex > root->pindex) {
 			/* Mirror image of the case above. */
 			if ((y = root->right) == NULL)
 				break;
 			if (pindex > y->pindex) {
 				/* Rotate left. */
 				root->right = y->left;
 				y->left = root;
 				root = y;
 				if ((y = root->right) == NULL)
 					break;
 			}
 			/* Link into the new root's left tree. */
 			lefttreemax->right = root;
 			lefttreemax = root;
 		} else
 			break;	/* exact match: root is the wanted page */
 	}
 	/* Assemble the new root. */
 	/*
 	 * Hang root's remaining children off the ends of the two collected
 	 * trees, then make those trees root's children.  If a side collected
 	 * nothing, its pointer still aims at dummy, so the first two stores
 	 * fill in dummy and the last two simply copy the child back.
 	 */
 	lefttreemax->right = root->left;
 	righttreemin->left = root->right;
 	root->left = dummy.right;
 	root->right = dummy.left;
 	return (root);
 }
 
 /*
  *	vm_page_insert:		[ internal use only ]
  *
  *	Inserts the given mem entry into the object and object list.
  *
  *	The pagetables are not updated but will presumably fault the page
  *	in if necessary, or if a kernel page the caller will at some point
  *	enter the page into the kernel's pmap.  We are not allowed to block
  *	here so we *can't* do this anyway.
  *
  *	The object and page must be locked.
  *	This routine may not block.
  */
 void
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t root;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (m->object != NULL)
 		panic("vm_page_insert: page already inserted");
 
 	/*
 	 * Record the object/offset pair in this page
 	 */
 	m->object = object;
 	m->pindex = pindex;
 
 	/*
 	 * Now link into the object's ordered list of backed pages.
 	 */
 	root = object->root;
 	if (root == NULL) {
 		/* First page of the object: a one-node tree and list. */
 		m->left = NULL;
 		m->right = NULL;
 		TAILQ_INSERT_TAIL(&object->memq, m, listq);
 	} else {
 		/*
 		 * Splay on pindex, then split the tree around the resulting
 		 * root so that m becomes the new root.  The memq list is
 		 * updated in step: m goes immediately before or after the
 		 * old root, on the same side as in the tree.
 		 */
 		root = vm_page_splay(pindex, root);
 		if (pindex < root->pindex) {
 			m->left = root->left;
 			m->right = root;
 			root->left = NULL;
 			TAILQ_INSERT_BEFORE(root, m, listq);
 		} else if (pindex == root->pindex)
 			panic("vm_page_insert: offset already allocated");
 		else {
 			m->right = root->right;
 			m->left = root;
 			root->right = NULL;
 			TAILQ_INSERT_AFTER(&object->memq, root, m, listq);
 		}
 	}
 	object->root = m;
 	object->generation++;
 
 	/*
 	 * show that the object has one more resident page.
 	 */
 	object->resident_page_count++;
 	/*
 	 * Hold the vnode until the last page is released.
 	 */
 	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 		vhold((struct vnode *)object->handle);
 
 	/*
 	 * Since we are inserting a new and possibly dirty page,
 	 * update the object's OBJ_MIGHTBEDIRTY flag.
 	 */
 	if (m->flags & PG_WRITEABLE)
 		vm_object_set_writeable_dirty(object);
 }
 
 /*
  *	vm_page_remove:
  *				NOTE: used by device pager as well -wfj
  *
  *	Removes the given mem entry from the object/offset-page
  *	table and the object page list, but do not invalidate/terminate
  *	the backing store.
  *
  *	The object and page must be locked.
  *	The underlying pmap entry (if any) is NOT removed here.
  *	This routine may not block.
  */
 void
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
 	vm_page_t root;
 
 	/* A page that belongs to no object has nothing to be removed from. */
 	if ((object = m->object) == NULL)
 		return;
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/* Removing a busy page unbusies it and wakes any waiters. */
 	if (m->oflags & VPO_BUSY) {
 		m->oflags &= ~VPO_BUSY;
 		vm_page_flash(m);
 	}
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
 	/*
 	 * Splay on m's own index first.  The return value is discarded;
 	 * the code below relies on m->left and m->right afterwards, so m
 	 * is presumably at the top of the tree at that point.
 	 */
 	if (m != object->root)
 		vm_page_splay(m->pindex, object->root);
 	if (m->left == NULL)
 		root = m->right;
 	else {
 		/*
 		 * Splay the left subtree on m's index and attach m's right
 		 * subtree as the right child of the resulting root, which
 		 * becomes the new root of the object's tree.
 		 */
 		root = vm_page_splay(m->pindex, m->left);
 		root->right = m->right;
 	}
 	object->root = root;
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
 	 * And show that the object has one fewer resident page.
 	 */
 	object->resident_page_count--;
 	object->generation++;
 	/*
 	 * The vnode may now be recycled.
 	 */
 	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
 		vdrop((struct vnode *)object->handle);
 
 	m->object = NULL;
 }
 
 /*
  *	vm_page_lookup:
  *
  *	Returns the page associated with the object/offset
  *	pair specified; if none is found, NULL is returned.
  *
  *	The object must be locked.
  *	This routine may not block.
  *	This is a critical path routine
  */
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t top;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
 	/* Empty tree, or the wanted page is already at the root. */
 	top = object->root;
 	if (top == NULL || top->pindex == pindex)
 		return (top);
 
 	/*
 	 * Splay on pindex and remember the new root, whether or not it
 	 * turns out to be the page that was asked for.
 	 */
 	top = vm_page_splay(pindex, top);
 	object->root = top;
 	if (top->pindex != pindex)
 		return (NULL);
 	return (top);
 }
 
 /*
  *	vm_page_rename:
  *
  *	Move the given memory entry from its
  *	current object to the specified target object/offset.
  *
  *	The object must be locked.
  *	This routine may not block.
  *
  *	Note: swap associated with the page must be invalidated by the move.  We
  *	      have to do this for several reasons:  (1) we aren't freeing the
  *	      page, (2) we are dirtying the page, (3) the VM system is probably
  *	      moving the page from object A to B, and will then later move
  *	      the backing store from A to B and we can't have a conflict.
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
  *	      swap.  If the page is on the cache, we have to deactivate it
  *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
  *	      on the cache.
  */
 void
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
 
 	/* Detach from the current object, then attach at the new location. */
 	vm_page_remove(m);
 	vm_page_insert(m, new_object, new_pindex);
 	/*
 	 * vm_page_dirty() asserts that the page is not on the cache queue,
 	 * so a cached page is deactivated before being dirtied.
 	 */
 	if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
 		vm_page_deactivate(m);
 	vm_page_dirty(m);
 }
 
 /*
  *	vm_page_select_cache:
  *
  *	Move a page of the given color from the cache queue to the free
  *	queue.  As pages might be found, but are not applicable, they are
  *	deactivated.
  *
  *	This routine may not block.
  */
 vm_page_t
 vm_page_select_cache(int color)
 {
 	vm_object_t object;
 	vm_page_t m;
 	boolean_t was_trylocked;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/*
 	 * Keep pulling candidates of the requested color off the cache
 	 * queue until one can be freed or the queue has none left.
 	 */
 	while ((m = vm_pageq_find(PQ_CACHE, color, FALSE)) != NULL) {
 		KASSERT(m->dirty == 0, ("Found dirty cache page %p", m));
 		KASSERT(!pmap_page_is_mapped(m),
 		    ("Found mapped cache page %p", m));
 		KASSERT((m->flags & PG_UNMANAGED) == 0,
 		    ("Found unmanaged cache page %p", m));
 		KASSERT(m->wire_count == 0, ("Found wired cache page %p", m));
 		/*
 		 * The page is usable only if it is not held and its object
 		 * lock is either acquired here without blocking or already
 		 * locked.  was_trylocked records which, so that only a lock
 		 * taken here is dropped again.
 		 */
 		if (m->hold_count == 0 && (object = m->object,
 		    (was_trylocked = VM_OBJECT_TRYLOCK(object)) ||
 		    VM_OBJECT_LOCKED(object))) {
 			KASSERT((m->oflags & VPO_BUSY) == 0 && m->busy == 0,
 			    ("Found busy cache page %p", m));
 			vm_page_free(m);
 			if (was_trylocked)
 				VM_OBJECT_UNLOCK(object);
 			break;
 		}
 		/* Not usable right now: deactivate it and try another page. */
 		vm_page_deactivate(m);
 	}
 	/* The page that was freed, or NULL if the loop ran out of pages. */
 	return (m);
 }
 
 /*
  *	vm_page_alloc:
  *
  *	Allocate and return a memory cell associated
  *	with this VM object/offset pair.
  *
  *	page_req classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *	VM_ALLOC_ZERO		zero page
  *
  *	This routine may not block.
  *
  *	Additional special handling is required when called from an
  *	interrupt (VM_ALLOC_INTERRUPT).  We are not allowed to mess with
  *	the page cache in this case.
  */
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
 	vm_page_t m = NULL;
 	int color, flags, page_req;
 
 	/* Separate the request class from the modifier bits in req. */
 	page_req = req & VM_ALLOC_CLASS_MASK;
 	KASSERT(curthread->td_intr_nesting_level == 0 ||
 	    page_req == VM_ALLOC_INTERRUPT,
 	    ("vm_page_alloc(NORMAL|SYSTEM) in interrupt context"));
 
 	/*
 	 * Pick the page color from pindex, offset by the object's own color
 	 * when an object is supplied.
 	 */
 	if ((req & VM_ALLOC_NOOBJ) == 0) {
 		KASSERT(object != NULL,
 		    ("vm_page_alloc: NULL object."));
 		VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 		color = (pindex + object->pg_color) & PQ_COLORMASK;
 	} else
 		color = pindex & PQ_COLORMASK;
 
 	/*
 	 * The pager is allowed to eat deeper into the free page list.
 	 */
 	if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) {
 		page_req = VM_ALLOC_SYSTEM;
 	};
 
 loop:
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (cnt.v_free_count > cnt.v_free_reserved ||
 	    (page_req == VM_ALLOC_SYSTEM && 
 	     cnt.v_cache_count == 0 && 
 	     cnt.v_free_count > cnt.v_interrupt_free_min) ||
 	    (page_req == VM_ALLOC_INTERRUPT && cnt.v_free_count > 0)) {
 		/*
 		 * Allocate from the free queue if the number of free pages
 		 * exceeds the minimum for the request class.
 		 */
 		m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0);
 	} else if (page_req != VM_ALLOC_INTERRUPT) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		/*
 		 * Allocatable from cache (non-interrupt only).  On success,
 		 * we must free the page and try again, thus ensuring that
 		 * cnt.v_*_free_min counters are replenished.
 		 */
 		vm_page_lock_queues();
 		if ((m = vm_page_select_cache(color)) == NULL) {
 			KASSERT(cnt.v_cache_count == 0,
 			    ("vm_page_alloc: cache queue is missing %d pages",
 			    cnt.v_cache_count));
 			vm_page_unlock_queues();
 			/* Record the shortfall and wake the page daemon. */
 			atomic_add_int(&vm_pageout_deficit, 1);
 			pagedaemon_wakeup();
 
 			if (page_req != VM_ALLOC_SYSTEM) 
 				return (NULL);
 
 			/*
 			 * System requests get one more try at the free queue,
 			 * down to the interrupt reserve.  The free queue mutex
 			 * is retaken here and stays held into the common code
 			 * below.
 			 */
 			mtx_lock(&vm_page_queue_free_mtx);
 			if (cnt.v_free_count <= cnt.v_interrupt_free_min) {
 				mtx_unlock(&vm_page_queue_free_mtx);
 				return (NULL);
 			}
 			m = vm_pageq_find(PQ_FREE, color, (req & VM_ALLOC_ZERO) != 0);
 		} else {
 			/* A cache page was freed; retry from the top. */
 			vm_page_unlock_queues();
 			goto loop;
 		}
 	} else {
 		/*
 		 * Not allocatable from cache from interrupt, give up.
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit, 1);
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 
 	/*
 	 *  At this point we had better have found a good page.
 	 */
 
 	KASSERT(
 	    m != NULL,
 	    ("vm_page_alloc(): missing page on free queue")
 	);
 
 	/*
 	 * Remove from free queue
 	 */
 	vm_pageq_remove_nowakeup(m);
 
 	/*
 	 * Initialize structure.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if (m->flags & PG_ZERO) {
 		vm_page_zero_count--;
 		/* PG_ZERO survives only if the caller asked for a zero page. */
 		if (req & VM_ALLOC_ZERO)
 			flags = PG_ZERO;
 	}
 	if (object != NULL && object->type == OBJT_PHYS)
 		flags |= PG_UNMANAGED;
 	m->flags = flags;
 	/* The page starts out busy unless NOBUSY or NOOBJ was requested. */
 	if (req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ))
 		m->oflags = 0;
 	else
 		m->oflags = VPO_BUSY;
 	if (req & VM_ALLOC_WIRED) {
 		atomic_add_int(&cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	} else
 		m->wire_count = 0;
 	m->hold_count = 0;
 	m->act_count = 0;
 	m->busy = 0;
 	m->valid = 0;
 	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
 	mtx_unlock(&vm_page_queue_free_mtx);
 
 	/* Without an object, only the requested pindex is recorded. */
 	if ((req & VM_ALLOC_NOOBJ) == 0)
 		vm_page_insert(m, object, pindex);
 	else
 		m->pindex = pindex;
 
 	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 
 	return (m);
 }
 
 /*
  *	vm_wait:	(also see VM_WAIT macro)
  *
  *	Block until free pages are available for allocation
  *	- Called in various places before memory allocations.
  */
 void
 vm_wait(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (curproc != pageproc) {
 		/*
 		 * Ordinary caller: request a page daemon run if one has
 		 * not been requested yet, then sleep on the free count.
 		 * PDROP leaves the free queue mutex released on return.
 		 */
 		if (vm_pages_needed == 0) {
 			vm_pages_needed = 1;
 			wakeup(&vm_pages_needed);
 		}
 		msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
 		    "vmwait", 0);
 		return;
 	}
 
 	/* The page daemon itself waits on its own private channel. */
 	vm_pageout_pages_needed = 1;
 	msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 	    PDROP | PSWP, "VMWait", 0);
 }
 
 /*
  *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
  *
  *	Block until free pages are available for allocation
  *	- Called only in vm_fault so that processes page faulting
  *	  can be easily tracked.
  *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
  *	  processes will be able to grab memory first.  Do not change
  *	  this balance without careful testing first.
  */
 void
 vm_waitpfault(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 
 	/* Request a page daemon run unless one is already pending. */
 	if (vm_pages_needed == 0) {
 		vm_pages_needed = 1;
 		wakeup(&vm_pages_needed);
 	}
 
 	/* Sleep on the free count; PDROP leaves the mutex released. */
 	msleep(&cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
 	    "pfault", 0);
 }
 
 /*
  *	vm_page_activate:
  *
  *	Put the specified page on the active list (if appropriate).
  *	Ensure that act_count is at least ACT_INIT but do not otherwise
  *	mess with it.
  *
  *	The page queues must be locked.
  *	This routine may not block.
  */
 void
 vm_page_activate(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (VM_PAGE_GETKNOWNQUEUE2(m) != PQ_ACTIVE) {
 		/* A page coming back out of the cache queue is counted. */
 		if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
-			PCPU_INC(cnt.v_reactivated);
+			cnt.v_reactivated++;
 		vm_pageq_remove(m);
 		/* Wired and unmanaged pages are left off the active queue. */
 		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
 			if (m->act_count < ACT_INIT)
 				m->act_count = ACT_INIT;
 			vm_pageq_enqueue(PQ_ACTIVE, m);
 		}
 	} else {
 		/* Already active: only raise act_count to the minimum. */
 		if (m->act_count < ACT_INIT)
 			m->act_count = ACT_INIT;
 	}
 }
 
 /*
  *	vm_page_free_wakeup:
  *
  *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
  *	routine is called when a page has been added to the cache or free
  *	queues.
  *
  *	The free page queue mutex (vm_page_queue_free_mtx) must be held.
  *	This routine may not block.
  */
 static inline void
 vm_page_free_wakeup(void)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	/*
 	 * if pageout daemon needs pages, then tell it that there are
 	 * some free.
 	 */
 	/* This is the channel the page daemon sleeps on in vm_wait(). */
 	if (vm_pageout_pages_needed &&
 	    cnt.v_cache_count + cnt.v_free_count >= cnt.v_pageout_free_min) {
 		wakeup(&vm_pageout_pages_needed);
 		vm_pageout_pages_needed = 0;
 	}
 	/*
 	 * wakeup processes that are waiting on memory if we hit a
 	 * high water mark. And wakeup scheduler process if we have
 	 * lots of memory. this process will swapin processes.
 	 */
 	/* This is the channel vm_wait() and vm_waitpfault() sleep on. */
 	if (vm_pages_needed && !vm_page_count_min()) {
 		vm_pages_needed = 0;
 		wakeup(&cnt.v_free_count);
 	}
 }
 
 /*
  *	vm_page_free_toq:
  *
  *	Returns the given page to the PQ_FREE list,
  *	disassociating it with any VM object.
  *
  *	Object and page must be locked prior to entry.
  *	This routine may not block.
  */
 
 void
 vm_page_free_toq(vm_page_t m)
 {
 	struct vpgqueues *pq;
 
 	/* The page queue mutex is only required if the page is on a queue. */
 	if (VM_PAGE_GETQUEUE(m) != PQ_NONE)
 		mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	KASSERT(!pmap_page_is_mapped(m),
 	    ("vm_page_free_toq: freeing mapped page %p", m));
 	PCPU_INC(cnt.v_tfree);
 
 	/* Freeing a busy or already-free page is fatal; report it first. */
 	if (m->busy || VM_PAGE_INQUEUE1(m, PQ_FREE)) {
 		printf(
 		"vm_page_free: pindex(%lu), busy(%d), VPO_BUSY(%d), hold(%d)\n",
 		    (u_long)m->pindex, m->busy, (m->oflags & VPO_BUSY) ? 1 : 0,
 		    m->hold_count);
 		if (VM_PAGE_INQUEUE1(m, PQ_FREE))
 			panic("vm_page_free: freeing free page");
 		else
 			panic("vm_page_free: freeing busy page");
 	}
 
 	/*
 	 * unqueue, then remove page.  Note that we cannot destroy
 	 * the page here because we do not want to call the pager's
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
 	vm_pageq_remove_nowakeup(m);
 	vm_page_remove(m);
 
 	/*
 	 * If fictitious remove object association and
 	 * return, otherwise delay object association removal.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		return;
 	}
 
 	m->valid = 0;
 	vm_page_undirty(m);
 
 	/* A page that is still wired must never be freed. */
 	if (m->wire_count != 0) {
 		if (m->wire_count > 1) {
 			panic("vm_page_free: invalid wire count (%d), pindex: 0x%lx",
 				m->wire_count, (long)m->pindex);
 		}
 		panic("vm_page_free: freeing wired page");
 	}
 	/*
 	 * A held page is parked on PQ_HOLD instead of being freed;
 	 * vm_page_unhold() calls back here once the last hold is dropped.
 	 */
 	if (m->hold_count != 0) {
 		m->flags &= ~PG_ZERO;
 		vm_pageq_enqueue(PQ_HOLD, m);
 		return;
 	}
 	/* Tag the page as free, then update the free queue under its mutex. */
 	VM_PAGE_SETQUEUE1(m, PQ_FREE);
 	mtx_lock(&vm_page_queue_free_mtx);
 	pq = &vm_page_queues[VM_PAGE_GETQUEUE(m)];
 	pq->lcnt++;
 	++(*pq->cnt);
 
 	/*
 	 * Put zeroed pages at the end (where we look for zeroed pages
 	 * first) and non-zeroed pages at the head.
 	 */
 	if (m->flags & PG_ZERO) {
 		TAILQ_INSERT_TAIL(&pq->pl, m, pageq);
 		++vm_page_zero_count;
 	} else {
 		TAILQ_INSERT_HEAD(&pq->pl, m, pageq);
 		vm_page_zero_idle_wakeup();
 	}
 	vm_page_free_wakeup();
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  *	vm_page_wire:
  *
  *	Add one wiring to the page.  On the first wiring the global
  *	wired-page counter is incremented and, unless the page is
  *	unmanaged, the page is taken off its paging queue.
  *	Fictitious pages are left untouched.
  *
  *	The page queues must be locked.
  *	This routine may not block.
  */
 void
 vm_page_wire(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * Statistics and queue membership only change on the transition
 	 * from unwired to wired.
 	 */
 	if (m->wire_count == 0) {
 		if (!(m->flags & PG_UNMANAGED))
 			vm_pageq_remove(m);
 		atomic_add_int(&cnt.v_wire_count, 1);
 	}
 	++m->wire_count;
 	KASSERT(m->wire_count != 0,
 	    ("vm_page_wire: wire_count overflow m=%p", m));
 }
 
 /*
  *	vm_page_unwire:
  *
  *	Release one wiring of this page, potentially
  *	enabling it to be paged again.
  *
  *	Many pages placed on the inactive queue should actually go
  *	into the cache, but it is difficult to figure out which.  What
  *	we do instead, if the inactive target is well met, is to put
  *	clean pages at the head of the inactive queue instead of the tail.
  *	This will cause them to be moved to the cache more quickly and
  *	if not actively re-referenced, freed more quickly.  If we just
  *	stick these pages at the end of the inactive queue, heavy filesystem
  *	meta-data accesses can cause an unnecessary paging load on memory bound 
  *	processes.  This optimization causes one-time-use metadata to be
  *	reused more quickly.
  *
  *	BUT, if we are in a low-memory situation we have no choice but to
  *	put clean pages on the cache queue.
  *
  *	A number of routines use vm_page_unwire() to guarantee that the page
  *	will go into either the inactive or active queues, and will NEVER
  *	be placed in the cache - for example, just after dirtying a page.
  *	dirty pages in the cache are not allowed.
  *
  *	The page queues must be locked.
  *	This routine may not block.
  */
 void
 vm_page_unwire(vm_page_t m, int activate)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	if (!(m->wire_count > 0)) {
 		/* Releasing a wiring that was never taken is fatal. */
 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
 	} else if (--m->wire_count == 0) {
 		/*
 		 * Last wiring released: fix up the global counter and,
 		 * for managed pages only, put the page back on the
 		 * active or the inactive queue as requested.
 		 */
 		atomic_subtract_int(&cnt.v_wire_count, 1);
 		if ((m->flags & PG_UNMANAGED) == 0) {
 			if (activate) {
 				vm_pageq_enqueue(PQ_ACTIVE, m);
 			} else {
 				vm_page_flag_clear(m, PG_WINATCFLS);
 				vm_pageq_enqueue(PQ_INACTIVE, m);
 			}
 		}
 	}
 }
 
 
 /*
  * Move the specified page to the inactive queue.  If the page has
  * any associated swap, the swap is deallocated.
  *
  * Normally athead is 0 resulting in LRU operation.  athead is set
  * to 1 if we want this page to be 'as if it were placed in the cache',
  * except without unmapping it from the process address space.
  *
  * This routine may not block.
  */
 static inline void
 _vm_page_deactivate(vm_page_t m, int athead)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/*
 	 * Ignore if already inactive.
 	 */
 	if (VM_PAGE_INQUEUE2(m, PQ_INACTIVE))
 		return;
 	/*
 	 * Wired and unmanaged pages are never moved to the inactive queue.
 	 */
 	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
 		/* A page coming off a cache queue is counted as reactivated. */
 		if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
-			PCPU_INC(cnt.v_reactivated);
+			cnt.v_reactivated++;
 		vm_page_flag_clear(m, PG_WINATCFLS);
 		vm_pageq_remove(m);
 		/* athead selects the head of the inactive queue over its tail. */
 		if (athead)
 			TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
 		else
 			TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE].pl, m, pageq);
 		VM_PAGE_SETQUEUE2(m, PQ_INACTIVE);
 		vm_page_queues[PQ_INACTIVE].lcnt++;
-
-		/*
-		 * Just not use an atomic here since vm_page_queues_lock
-		 * alredy protects this field.
-		 */
 		cnt.v_inactive_count++;
 	}
 }
 
 /*
  * Public wrapper around _vm_page_deactivate() that always passes
  * athead == 0, i.e. requests insertion at the tail of the inactive queue.
  */
 void
 vm_page_deactivate(vm_page_t m)
 {
 
 	_vm_page_deactivate(m, 0);
 }
 
 /*
  * vm_page_try_to_cache:
  *
  * Hand the page to vm_page_cache() if it is clean and not in use.
  * Returns 1 when the page was passed on, 0 when it was left alone.
  */
 int
 vm_page_try_to_cache(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/* Dirty, held, busy, wired or unmanaged pages are not eligible. */
 	if (m->dirty != 0 || m->hold_count != 0 || m->busy != 0 ||
 	    m->wire_count != 0)
 		return (0);
 	if ((m->oflags & VPO_BUSY) != 0 || (m->flags & PG_UNMANAGED) != 0)
 		return (0);
 
 	/* Re-check the dirty state once all mappings have been removed. */
 	pmap_remove_all(m);
 	if (m->dirty != 0)
 		return (0);
 
 	vm_page_cache(m);
 	return (1);
 }
 
 /*
  * vm_page_try_to_free()
  *
  *	Free the page if it is clean and not in use; otherwise leave it
  *	alone.  Returns 1 when the page was freed, 0 when it was not.
  */
 int
 vm_page_try_to_free(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	/* The object lock is only checked when the page has an object. */
 	if (m->object != NULL)
 		VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/* Dirty, held, busy, wired or unmanaged pages are not eligible. */
 	if (m->dirty != 0 || m->hold_count != 0 || m->busy != 0 ||
 	    m->wire_count != 0)
 		return (0);
 	if ((m->oflags & VPO_BUSY) != 0 || (m->flags & PG_UNMANAGED) != 0)
 		return (0);
 
 	/* Re-check the dirty state once all mappings have been removed. */
 	pmap_remove_all(m);
 	if (m->dirty != 0)
 		return (0);
 
 	vm_page_free(m);
 	return (1);
 }
 
 /*
  * vm_page_cache
  *
  * Put the specified page onto the page cache queue (if appropriate).
  *
  * This routine may not block.
  */
 void
 vm_page_cache(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/* Refuse, with a console message, any page that is in use. */
 	if ((m->flags & PG_UNMANAGED) != 0 || (m->oflags & VPO_BUSY) != 0 ||
 	    m->busy != 0 || m->hold_count != 0 || m->wire_count != 0) {
 		printf("vm_page_cache: attempting to cache busy page\n");
 		return;
 	}
 
 	/* Nothing to do when the page already sits on a cache queue. */
 	if (VM_PAGE_INQUEUE1(m, PQ_CACHE))
 		return;
 
 	/*
 	 * Drop every mapping of the page.  A page that is dirty at this
 	 * point must not be cached.
 	 */
 	pmap_remove_all(m);
 	if (m->dirty != 0)
 		panic("vm_page_cache: caching a dirty page, pindex: %ld",
 		    (long)m->pindex);
 
 	/* Move the page to the cache queue indexed by m->pc. */
 	vm_pageq_remove_nowakeup(m);
 	vm_pageq_enqueue(PQ_CACHE + m->pc, m);
 
 	/* The wakeup is issued under the free queue mutex. */
 	mtx_lock(&vm_page_queue_free_mtx);
 	vm_page_free_wakeup();
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  * vm_page_dontneed
  *
  *	Cache, deactivate, or do nothing as appropriate.  This routine
  *	is typically used by madvise() MADV_DONTNEED.
  *
  *	Generally speaking we want to move the page into the cache so
  *	it gets reused quickly.  However, this can result in a silly syndrome
  *	due to the page recycling too quickly.  Small objects will not be
  *	fully cached.  On the other hand, if we move the page to the inactive
  *	queue we wind up with a problem whereby very large objects 
  *	unnecessarily blow away our inactive and cache queues.
  *
  *	The solution is to move the pages based on a fixed weighting.  We
  *	either leave them alone, deactivate them, or move them to the cache,
  *	where moving them to the cache has the highest weighting.
  *	By forcing some pages into other queues we eventually force the
  *	system to balance the queues, potentially recovering other unrelated
  *	space from active.  The idea is to not force this to happen too
  *	often.
  */
 void
 vm_page_dontneed(vm_page_t m)
 {
 	static int dnweight;
 	int athead;
 	int weight;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	weight = ++dnweight;
 
 	/*
 	 * Occasionally (whenever the 0x01F0 bits of the call counter are
 	 * all clear) leave the page where it is.  Pages that are already
 	 * inactive or cached are always left alone.  In either case the
 	 * activity count is aged by one if it is at least ACT_INIT.
 	 */
 	if ((weight & 0x01F0) == 0 || VM_PAGE_INQUEUE2(m, PQ_INACTIVE) ||
 	    VM_PAGE_INQUEUE1(m, PQ_CACHE)) {
 		if (m->act_count >= ACT_INIT)
 			m->act_count--;
 		return;
 	}
 
 	/* Pick up a modification that so far is only known to the pmap. */
 	if (m->dirty == 0 && pmap_is_modified(m))
 		vm_page_dirty(m);
 
 	/*
 	 * Dirty pages, and clean pages whenever the 0x0070 bits of the
 	 * counter are all clear, are deactivated normally (queue tail).
 	 * All other clean pages are deactivated at the head of the
 	 * inactive queue, which stands in for moving them to the cache.
 	 */
 	athead = (m->dirty || (weight & 0x0070) == 0) ? 0 : 1;
 	_vm_page_deactivate(m, athead);
 }
 
 /*
  * Grab a page, waiting until we are woken up due to the page
  * changing state.  We keep on waiting, if the page continues
  * to be in the object.  If the page doesn't exist, first allocate it
  * and then conditionally zero it.
  *
  * This routine may block.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		/*
 		 * vm_page_sleep_if_busy() returned non-zero (presumably it
 		 * slept on the busy page), so the lookup result is not used:
 		 * retry only when the caller passed VM_ALLOC_RETRY,
 		 * otherwise give up.
 		 */
 		if (vm_page_sleep_if_busy(m, TRUE, "pgrbwt")) {
 			if ((allocflags & VM_ALLOC_RETRY) == 0)
 				return (NULL);
 			goto retrylookup;
 		} else {
 			/*
 			 * Existing, non-busy page: wire it (under the page
 			 * queues lock) and/or busy it as requested.
 			 */
 			if ((allocflags & VM_ALLOC_WIRED) != 0) {
 				vm_page_lock_queues();
 				vm_page_wire(m);
 				vm_page_unlock_queues();
 			}
 			if ((allocflags & VM_ALLOC_NOBUSY) == 0)
 				vm_page_busy(m);
 			return (m);
 		}
 	}
 	/* No page at this index: allocate one, hiding VM_ALLOC_RETRY. */
 	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY);
 	if (m == NULL) {
 		/*
 		 * Allocation failed.  The object lock is dropped around
 		 * VM_WAIT and retaken before either failing or retrying.
 		 */
 		VM_OBJECT_UNLOCK(object);
 		VM_WAIT;
 		VM_OBJECT_LOCK(object);
 		if ((allocflags & VM_ALLOC_RETRY) == 0)
 			return (NULL);
 		goto retrylookup;
 	}
 	/* Zero the new page on request unless it is already marked PG_ZERO. */
 	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	return (m);
 }
 
 /*
  * Mapping function for valid bits or for dirty bits in
  * a page.  May not block.
  *
  * Inputs are required to range within a page.
  */
 inline int
 vm_page_bits(int base, int size)
 {
 	int hi;
 	int lo;
 
 	KASSERT(base + size <= PAGE_SIZE,
 	    ("vm_page_bits: illegal base/size %d/%d", base, size));
 
 	/* An empty range touches no chunk at all. */
 	if (size == 0)
 		return (0);
 
 	/* Indices of the first and the last DEV_BSIZE chunk touched. */
 	lo = base >> DEV_BSHIFT;
 	hi = (base + size - 1) >> DEV_BSHIFT;
 
 	/* Mask with bits lo through hi (inclusive) set. */
 	return ((2 << hi) - (1 << lo));
 }
 
 /*
  *	vm_page_set_validclean:
  *
  *	Sets portions of a page valid and clean.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zero'd.
  *
  *	This routine may not block.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  */
 void
 vm_page_set_validclean(vm_page_t m, int base, int size)
 {
 	int pagebits;
 	int frag;
 	int endoff;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.  frag is base rounded down to a DEV_BSIZE
 	 * boundary, so [frag, base) is the part that gets zeroed.
 	 */
 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
 	    (m->valid & (1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block, from endoff up to the next DEV_BSIZE
 	 * boundary.
 	 */
 	endoff = base + size;
 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
 	    (m->valid & (1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
 	 * page we can safely clear the pmap modify bit.  We also
 	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
 	 *
 	 * We set valid bits inclusive of any overlap, but we can only
 	 * clear dirty bits for DEV_BSIZE chunks that are fully within
 	 * the range.
 	 *
 	 * NOTE: the code that would narrow the mask to fully covered
 	 * chunks is compiled out below ("#if 0"), so as written the dirty
 	 * bits are cleared with the same inclusive mask as the valid bits.
 	 */
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
 #if 0	/* NOT YET */
 	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 		frag = DEV_BSIZE - frag;
 		base += frag;
 		size -= frag;
 		if (size < 0)
 			size = 0;
 	}
 	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
 #endif
 	m->dirty &= ~pagebits;
 	if (base == 0 && size == PAGE_SIZE) {
 		pmap_clear_modify(m);
 		m->oflags &= ~VPO_NOSYNC;
 	}
 }
 
 /*
  *	vm_page_clear_dirty:
  *
  *	Clear the dirty bits selected by vm_page_bits(base, size).
  *
  *	The page queues must be locked.
  */
 void
 vm_page_clear_dirty(vm_page_t m, int base, int size)
 {
 	int mask;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	mask = vm_page_bits(base, size);
 	m->dirty &= ~mask;
 }
 
 /*
  *	vm_page_set_invalid:
  *
  *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
  *	valid and dirty bits for the affected areas are cleared.
  *
  *	May not block.
  */
 void
 vm_page_set_invalid(vm_page_t m, int base, int size)
 {
 	int mask;
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 	mask = vm_page_bits(base, size);
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 
 	/*
 	 * A fully valid page that is about to lose valid bits has all
 	 * of its mappings removed first.
 	 */
 	if (mask != 0 && m->valid == VM_PAGE_BITS_ALL)
 		pmap_remove_all(m);
 
 	m->valid &= ~mask;
 	m->dirty &= ~mask;
 	++m->object->generation;
 }
 
 /*
  * vm_page_zero_invalid()
  *
  *	The kernel assumes that the invalid portions of a page contain 
  *	garbage, but such pages can be mapped into memory by user code.
  *	When this occurs, we must zero out the non-valid portions of the
  *	page so user code sees what it expects.
  *
  *	Pages are most often semi-valid when the end of a file is mapped 
  *	into memory and the file's size is not page aligned.
  */
 void
 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 {
 	int chunk;
 	int run_start;
 
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/*
 	 * Walk the DEV_BSIZE chunks of the page.  Every valid chunk, and
 	 * the end of the page, terminates a run of invalid chunks that
 	 * started at run_start; a non-empty run is zeroed in one call.
 	 * Partially invalid chunks whose valid bit is set are not touched
 	 * here.
 	 */
 	run_start = 0;
 	for (chunk = 0; chunk <= PAGE_SIZE / DEV_BSIZE; chunk++) {
 		if (chunk != (PAGE_SIZE / DEV_BSIZE) &&
 		    (m->valid & (1 << chunk)) == 0)
 			continue;
 		if (chunk > run_start)
 			pmap_zero_page_area(m, run_start << DEV_BSHIFT,
 			    (chunk - run_start) << DEV_BSHIFT);
 		run_start = chunk + 1;
 	}
 
 	/*
 	 * The caller passes setvalid when the freshly zeroed areas may be
 	 * treated as valid data.
 	 */
 	if (setvalid)
 		m->valid = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_is_valid:
  *
  *	Is (partial) page valid?  Note that the case where size == 0
  *	will return FALSE in the degenerate case where the page is
  *	entirely invalid, and TRUE otherwise.
  *
  *	May not block.
  */
 int
 vm_page_is_valid(vm_page_t m, int base, int size)
 {
 	int wanted;
 
 	wanted = vm_page_bits(base, size);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/* An entirely invalid page is never valid, even for size == 0. */
 	if (m->valid == 0)
 		return (0);
 
 	/* Every chunk touched by the range must have its valid bit set. */
 	return ((m->valid & wanted) == wanted ? 1 : 0);
 }
 
 /*
  * Mark the page dirty if the pmap reports it as modified.  The pmap is
  * not consulted when every dirty bit is already set.  May not block.
  */
 void
 vm_page_test_dirty(vm_page_t m)
 {
 
 	if (m->dirty == VM_PAGE_BITS_ALL)
 		return;
 	if (pmap_is_modified(m))
 		vm_page_dirty(m);
 }
 
 int so_zerocp_fullpage = 0;
 
 /*
  *	vm_page_cowfault:
  *
  *	Replace page "m" in its object by a freshly allocated page at the
  *	same index.  "m" is unmapped and removed from the object first.
  *	If the allocation fails, "m" is reinserted, both locks are dropped
  *	around VM_WAIT, and the whole sequence is retried.
  *
  *	Once a new page is available:
  *	  - if m->cow has dropped to zero in the meantime, the new page is
  *	    freed and "m" is put back into the object;
  *	  - otherwise the contents are copied (unless so_zerocp_fullpage is
  *	    set), the new page is marked fully valid and dirty, and the
  *	    wirings are split: the new page receives wire_count - cow,
  *	    "m" keeps exactly its cow count.
  *
  *	The page queues and the page's object must be locked on entry;
  *	both locks are released and reacquired if the routine has to wait
  *	for memory.  This routine may block.
  */
 void
 vm_page_cowfault(vm_page_t m)
 {
 	vm_page_t mnew;
 	vm_object_t object;
 	vm_pindex_t pindex;
 
 	/*
 	 * Both locks are unlocked below on the allocation-failure path,
 	 * so they have to be held by the caller.  Assert that up front,
 	 * as vm_page_cowclear() and vm_page_cowsetup() do for the page
 	 * queues lock.
 	 */
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	object = m->object;
 	pindex = m->pindex;
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 
  retry_alloc:
 	pmap_remove_all(m);
 	vm_page_remove(m);
 	mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (mnew == NULL) {
 		/* Put the page back before sleeping without the locks. */
 		vm_page_insert(m, object, pindex);
 		vm_page_unlock_queues();
 		VM_OBJECT_UNLOCK(object);
 		VM_WAIT;
 		VM_OBJECT_LOCK(object);
 		vm_page_lock_queues();
 		goto retry_alloc;
 	}
 
 	if (m->cow == 0) {
 		/* 
 		 * check to see if we raced with an xmit complete when 
 		 * waiting to allocate a page.  If so, put things back 
 		 * the way they were 
 		 */
 		vm_page_free(mnew);
 		vm_page_insert(m, object, pindex);
 	} else { /* clear COW & copy page */
 		if (!so_zerocp_fullpage)
 			pmap_copy_page(m, mnew);
 		mnew->valid = VM_PAGE_BITS_ALL;
 		vm_page_dirty(mnew);
 		/* Split the wirings between the new and the old page. */
 		mnew->wire_count = m->wire_count - m->cow;
 		m->wire_count = m->cow;
 	}
 }
 
 /*
  *	vm_page_cowclear:
  *
  *	Drop one copy-on-write reference from the page, if it has any.
  *	Write permission is not restored here; it is left to vm_fault
  *	to add it back lazily.  The page is not freed here either
  *	(sf_buf_free() is expected to take care of that).
  *
  *	The page queues must be locked.
  */
 void 
 vm_page_cowclear(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	if (m->cow != 0)
 		m->cow--;
 }
 
 /*
  *	vm_page_cowsetup:
  *
  *	Take one copy-on-write reference on the page and remove write
  *	access from its mappings.
  *
  *	The page queues must be locked.
  */
 void
 vm_page_cowsetup(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	++m->cow;
 	pmap_remove_write(m);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 /*
  * DDB "show page" command: print the global page counters and the
  * paging thresholds stored in "cnt", one value per line.
  */
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
 	/* Current queue populations. */
 	db_printf("cnt.v_free_count: %d\n", cnt.v_free_count);
 	db_printf("cnt.v_cache_count: %d\n", cnt.v_cache_count);
 	db_printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count);
 	db_printf("cnt.v_active_count: %d\n", cnt.v_active_count);
 	db_printf("cnt.v_wire_count: %d\n", cnt.v_wire_count);
 	/* Thresholds and targets. */
 	db_printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved);
 	db_printf("cnt.v_free_min: %d\n", cnt.v_free_min);
 	db_printf("cnt.v_free_target: %d\n", cnt.v_free_target);
 	db_printf("cnt.v_cache_min: %d\n", cnt.v_cache_min);
 	db_printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target);
 }
 
 /*
  * DDB "show pageq" command: print the local length counter (lcnt) of
  * every free and cache queue, followed by those of the active and
  * inactive queues.
  */
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
 	int color;
 
 	/* One length per free queue. */
 	db_printf("PQ_FREE:");
 	for (color = 0; color < PQ_NUMCOLORS; color++)
 		db_printf(" %d", vm_page_queues[PQ_FREE + color].lcnt);
 	db_printf("\n");
 
 	/* One length per cache queue. */
 	db_printf("PQ_CACHE:");
 	for (color = 0; color < PQ_NUMCOLORS; color++)
 		db_printf(" %d", vm_page_queues[PQ_CACHE + color].lcnt);
 	db_printf("\n");
 
 	db_printf("PQ_ACTIVE: %d, PQ_INACTIVE: %d\n",
 	    vm_page_queues[PQ_ACTIVE].lcnt,
 	    vm_page_queues[PQ_INACTIVE].lcnt);
 }
 #endif /* DDB */
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c	(revision 170516)
+++ head/sys/vm/vm_pageout.c	(revision 170517)
@@ -1,1647 +1,1642 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Yahoo! Technologies Norway AS
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	The proverbial page-out daemon.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <machine/mutex.h>
 
 /*
  * System initialization
  */
 
 /* the kernel process "vm_pageout"*/
 static void vm_pageout(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(int pass);
 
 /* Filled in through page_kp below with the pagedaemon's process. */
 struct proc *pageproc;
 
 /*
  * Descriptor passed to kproc_start() by the SYSINIT below; it names the
  * process "pagedaemon", with vm_pageout() as its entry point.
  */
 static struct kproc_desc page_kp = {
 	"pagedaemon",
 	vm_pageout,
 	&pageproc
 };
 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)
 
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
 static void vm_daemon(void);
 static struct	proc *vmproc;
 
 /*
  * Same arrangement for "vmdaemon", which only exists when swapping is
  * compiled in.
  */
 static struct kproc_desc vm_kp = {
 	"vmdaemon",
 	vm_daemon,
 	&vmproc
 };
 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
 #endif
 
 
 int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
 
 #if !defined(NO_SWAPPING)
 static int vm_pageout_req_swapout;	/* XXX */
 static int vm_daemon_needed;
 #endif
 /*
  * Tunables.  Each one is exported through a SYSCTL_INT declaration
  * further down; the descriptions here repeat the sysctl help strings.
  */
 static int vm_max_launder = 32;		/* limit dirty flushes in pageout */
 /* max pageout stats scan length; interval for partial stats scan */
 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
 static int vm_pageout_full_stats_interval = 0;	/* interval for full stats scan */
 static int vm_pageout_algorithm=0;	/* LRU page mgmt */
 static int defer_swap_pageouts=0;	/* give preference to dirty pages in mem */
 static int disable_swap_pageouts=0;	/* disallow swapout of dirty pages */
 
 /*
  * Process swapout switches.  Kernels built with NO_SWAPPING start with
  * both disabled; otherwise whole-process swapout is enabled by default.
  */
 #if defined(NO_SWAPPING)
 static int vm_swap_enabled=0;
 static int vm_swap_idle_enabled=0;
 #else
 static int vm_swap_enabled=1;
 static int vm_swap_idle_enabled=0;
 #endif
 
 /*
  * sysctl nodes under "vm" exporting the pageout tunables declared above.
  */
 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
 	CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");
 
 SYSCTL_INT(_vm, OID_AUTO, max_launder,
 	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
 	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
 	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
 	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");
 
 /*
  * With NO_SWAPPING the two swapout switches are exported read-only
  * (CTLFLAG_RD); otherwise they are writable.
  */
 #if defined(NO_SWAPPING)
 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 	CTLFLAG_RD, &vm_swap_enabled, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
 	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
 #else
 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
 	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
 	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
 #endif
 
 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
 	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");
 
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
 /* Read-only statistic, described as vget() lock misses during pageout. */
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
 /*
  * Default for vm_pageout_page_count, which vm_pageout_clean() uses as
  * the upper bound on the number of pages in one pageout cluster.
  */
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
 int vm_page_max_wired;		/* XXX max # of wired pages system-wide */
 
 #if !defined(NO_SWAPPING)
 static void vm_pageout_map_deactivate_pages(vm_map_t, long);
 static void vm_pageout_object_deactivate_pages(pmap_t, vm_object_t, long);
 static void vm_req_vmdaemon(void);
 #endif
 static void vm_pageout_page_stats(void);
 
 /*
  * vm_pageout_fallback_object_lock:
  * 
  * Lock vm object currently associated with `m'. VM_OBJECT_TRYLOCK is
  * known to have failed and page queue must be either PQ_ACTIVE or
  * PQ_INACTIVE.  To avoid lock order violation, unlock the page queues
  * while locking the vm object.  Use marker page to detect page queue
  * changes and maintain notion of next page on page queue.  Return
  * TRUE if no changes were detected, FALSE otherwise.  vm object is
  * locked on return.
  * 
  * This function depends on both the lock portion of struct vm_object
  * and normal struct vm_page being type stable.
  */
 static boolean_t
 vm_pageout_fallback_object_lock(vm_page_t m, vm_page_t *next)
 {
 	struct vm_page marker;
 	boolean_t unchanged;
 	u_short queue;
 	vm_object_t object;
 
 	/*
 	 * Initialize our marker
 	 */
 	bzero(&marker, sizeof(marker));
 	marker.flags = PG_FICTITIOUS | PG_MARKER;
 	marker.oflags = VPO_BUSY;
 	marker.queue = m->queue;
 	marker.wire_count = 1;
 
 	/*
 	 * Remember which queue and object the page belonged to while the
 	 * page queues lock is still held, for comparison after relocking.
 	 */
 	queue = m->queue;
 	object = m->object;
 	
 	/* Park the marker directly behind the page on its queue. */
 	TAILQ_INSERT_AFTER(&vm_page_queues[queue].pl,
 			   m, &marker, pageq);
 	/*
 	 * Drop the page queues lock before blocking on the object lock,
 	 * then take the page queues lock again.
 	 */
 	vm_page_unlock_queues();
 	VM_OBJECT_LOCK(object);
 	vm_page_lock_queues();
 
 	/* Page queue might have changed. */
 	*next = TAILQ_NEXT(&marker, pageq);
 	/*
 	 * Nothing changed only if the page is still on the same queue, still
 	 * belongs to the same object, and the marker still follows it.
 	 */
 	unchanged = (m->queue == queue &&
 		     m->object == object &&
 		     &marker == TAILQ_NEXT(m, pageq));
 	/* The marker lives on this stack frame; unlink it before returning. */
 	TAILQ_REMOVE(&vm_page_queues[queue].pl,
 		     &marker, pageq);
 	return (unchanged);
 }
 
 /*
  * vm_pageout_clean:
  *
  * Clean the page and remove it from the laundry.
  * 
  * We set the busy bit to cause potential page faults on this page to
  * block.  Note the careful timing, however, the busy bit isn't set till
  * late and we cannot do anything that will mess with the page.
  */
 static int
 vm_pageout_clean(m)
 	vm_page_t m;
 {
 	vm_object_t object;
 	/*
 	 * Cluster buffer, twice the maximum cluster size: "m" is stored in
 	 * the middle so the cluster can grow downwards (reverse scan) as
 	 * well as upwards (forward scan).
 	 */
 	vm_page_t mc[2*vm_pageout_page_count];
 	int pageout_count;
 	/*
 	 * ib / is: distance from pindex of the next candidate in the reverse
 	 * and forward scans; ib == 0 means the reverse scan has ended.
 	 * page_base: index in mc[] of the first page of the cluster.
 	 */
 	int ib, is, page_base;
 	vm_pindex_t pindex = m->pindex;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
 
 	/*
 	 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
 	 * with the new swapper, but we could have serious problems paging
 	 * out other object types if there is insufficient memory.  
 	 *
 	 * Unfortunately, checking free memory here is far too late, so the
 	 * check has been moved up a procedural level.
 	 */
 
 	/*
 	 * Don't mess with the page if it's busy, held, or special
 	 */
 	if ((m->hold_count != 0) ||
 	    ((m->busy != 0) || (m->oflags & VPO_BUSY) ||
 	     (m->flags & PG_UNMANAGED))) {
 		return 0;
 	}
 
 	mc[vm_pageout_page_count] = m;
 	pageout_count = 1;
 	page_base = vm_pageout_page_count;
 	ib = 1;
 	is = 1;
 
 	/*
 	 * Scan object for clusterable pages.
 	 *
 	 * We can cluster ONLY if: ->> the page is NOT
 	 * clean, wired, busy, held, or mapped into a
 	 * buffer, and one of the following:
 	 * 1) The page is inactive, or a seldom used
 	 *    active page.
 	 * -or-
 	 * 2) we force the issue.
 	 *
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
 	 * due to flushing pages out of order and not trying
 	 * align the clusters (which leave sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a 
 	 * forward scan if room remains.
 	 */
 	object = m->object;
 more:
 	/* Reverse scan: extend the cluster towards lower page indices. */
 	while (ib && pageout_count < vm_pageout_page_count) {
 		vm_page_t p;
 
 		/* Going back ib pages would fall below index 0. */
 		if (ib > pindex) {
 			ib = 0;
 			break;
 		}
 
 		if ((p = vm_page_lookup(object, pindex - ib)) == NULL) {
 			ib = 0;
 			break;
 		}
 		if (VM_PAGE_INQUEUE1(p, PQ_CACHE) ||
 		    (p->oflags & VPO_BUSY) || p->busy ||
 		    (p->flags & PG_UNMANAGED)) {
 			ib = 0;
 			break;
 		}
 		/* Refresh p->dirty from the pmap before testing it. */
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0 ||
 		    p->queue != PQ_INACTIVE ||
 		    p->wire_count != 0 ||	/* may be held by buf cache */
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			ib = 0;
 			break;
 		}
 		mc[--page_base] = p;
 		++pageout_count;
 		++ib;
 		/*
 		 * alignment boundary, stop here and switch directions.  Do
 		 * not clear ib.
 		 */
 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
 			break;
 	}
 
 	/* Forward scan: extend the cluster towards higher page indices. */
 	while (pageout_count < vm_pageout_page_count && 
 	    pindex + is < object->size) {
 		vm_page_t p;
 
 		if ((p = vm_page_lookup(object, pindex + is)) == NULL)
 			break;
 		if (VM_PAGE_INQUEUE1(p, PQ_CACHE) ||
 		    (p->oflags & VPO_BUSY) || p->busy ||
 		    (p->flags & PG_UNMANAGED)) {
 			break;
 		}
 		vm_page_test_dirty(p);
 		if ((p->dirty & p->valid) == 0 ||
 		    p->queue != PQ_INACTIVE ||
 		    p->wire_count != 0 ||	/* may be held by buf cache */
 		    p->hold_count != 0) {	/* may be undergoing I/O */
 			break;
 		}
 		mc[page_base + pageout_count] = p;
 		++pageout_count;
 		++is;
 	}
 
 	/*
 	 * If we exhausted our forward scan, continue with the reverse scan
 	 * when possible, even past a page boundary.  This catches boundary
 	 * conditions.
 	 */
 	if (ib && pageout_count < vm_pageout_page_count)
 		goto more;
 
 	/*
 	 * we allow reads during pageouts...
 	 *
 	 * The return value is whatever vm_pageout_flush() reports for the
 	 * assembled cluster.
 	 */
 	return (vm_pageout_flush(&mc[page_base], pageout_count, 0));
 }
 
 /*
  * vm_pageout_flush() - launder the given pages
  *
  *	The given pages are laundered.  Note that we setup for the start of
  *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
  *	reference count all in here rather than in the parent.  If we want
  *	the parent to do more sophisticated things we may have to change
  *	the ordering.
  */
 int
 vm_pageout_flush(vm_page_t *mc, int count, int flags)
 {
 	/* All pages of the cluster are taken to belong to mc[0]'s object. */
 	vm_object_t object = mc[0]->object;
 	int pageout_status[count];	/* per-page result filled in by the pager */
 	int numpagedout = 0;		/* pages reported as OK or still pending */
 	int i;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	/*
 	 * Initiate I/O.  Bump the vm_page_t->busy counter and
 	 * mark the pages read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
 	 *
 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
 	 * edge case with file fragments.
 	 */
 	for (i = 0; i < count; i++) {
 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
 		vm_page_io_start(mc[i]);
 		pmap_remove_write(mc[i]);
 	}
 	/*
 	 * The page queues lock is not held across the pager call; it is
 	 * retaken before the per-page results are examined.
 	 */
 	vm_page_unlock_queues();
 	vm_object_pip_add(object, count);
 
 	/* Pageouts on kernel_object are forced to be synchronous. */
 	vm_pager_put_pages(object, mc, count,
 	    (flags | ((object == kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
 	    pageout_status);
 
 	vm_page_lock_queues();
 	for (i = 0; i < count; i++) {
 		vm_page_t mt = mc[i];
 
 		KASSERT((mt->flags & PG_WRITEABLE) == 0,
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
 			 * Page outside of range of object. Right now we
 			 * essentially lose the changes by pretending it
 			 * worked.
 			 */
 			pmap_clear_modify(mt);
 			vm_page_undirty(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
 			 * If page couldn't be paged out, then reactivate the
 			 * page so it doesn't clog the inactive list.  (We
 			 * will try paging out it again later).
 			 */
 			vm_page_activate(mt);
 			break;
 		case VM_PAGER_AGAIN:
 			/* Nothing to do; the page is simply left as it is. */
 			break;
 		}
 
 		/*
 		 * If the operation is still going, leave the page busy to
 		 * block all other accesses. Also, leave the paging in
 		 * progress indicator set so that we don't attempt an object
 		 * collapse.
 		 */
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_io_finish(mt);
 			/* Under severe shortage try to cache the page now. */
 			if (vm_page_count_severe())
 				vm_page_try_to_cache(mt);
 		}
 	}
 	return numpagedout;
 }
 
 #if !defined(NO_SWAPPING)
 /*
  *	vm_pageout_object_deactivate_pages
  *
  *	deactivate enough pages to satisfy the inactive target
  *	requirements or if vm_page_proc_limit is set, then
  *	deactivate all of the pages in the object and its
  *	backing_objects.
  *
  *	The object and map must be locked.
  */
 static void
 vm_pageout_object_deactivate_pages(pmap, first_object, desired)
 	pmap_t pmap;
 	vm_object_t first_object;
 	long desired;
 {
 	vm_object_t backing_object, object;
 	vm_page_t p, next;
 	int actcount, rcount, remove_mode;
 
 	VM_OBJECT_LOCK_ASSERT(first_object, MA_OWNED);
 	if (first_object->type == OBJT_DEVICE || first_object->type == OBJT_PHYS)
 		return;
 	for (object = first_object;; object = backing_object) {
 		if (pmap_resident_count(pmap) <= desired)
 			goto unlock_return;
 		if (object->paging_in_progress)
 			goto unlock_return;
 
 		remove_mode = 0;
 		if (object->shadow_count > 1)
 			remove_mode = 1;
 		/*
 		 * scan the objects entire memory queue
 		 */
 		rcount = object->resident_page_count;
 		p = TAILQ_FIRST(&object->memq);
 		vm_page_lock_queues();
 		while (p && (rcount-- > 0)) {
 			if (pmap_resident_count(pmap) <= desired) {
 				vm_page_unlock_queues();
 				goto unlock_return;
 			}
 			next = TAILQ_NEXT(p, listq);
-			PCPU_INC(cnt.v_pdpages);
+			cnt.v_pdpages++;
 			if (p->wire_count != 0 ||
 			    p->hold_count != 0 ||
 			    p->busy != 0 ||
 			    (p->oflags & VPO_BUSY) ||
 			    (p->flags & PG_UNMANAGED) ||
 			    !pmap_page_exists_quick(pmap, p)) {
 				p = next;
 				continue;
 			}
 			actcount = pmap_ts_referenced(p);
 			if (actcount) {
 				vm_page_flag_set(p, PG_REFERENCED);
 			} else if (p->flags & PG_REFERENCED) {
 				actcount = 1;
 			}
 			if ((p->queue != PQ_ACTIVE) &&
 				(p->flags & PG_REFERENCED)) {
 				vm_page_activate(p);
 				p->act_count += actcount;
 				vm_page_flag_clear(p, PG_REFERENCED);
 			} else if (p->queue == PQ_ACTIVE) {
 				if ((p->flags & PG_REFERENCED) == 0) {
 					p->act_count -= min(p->act_count, ACT_DECLINE);
 					if (!remove_mode && (vm_pageout_algorithm || (p->act_count == 0))) {
 						pmap_remove_all(p);
 						vm_page_deactivate(p);
 					} else {
 						vm_pageq_requeue(p);
 					}
 				} else {
 					vm_page_activate(p);
 					vm_page_flag_clear(p, PG_REFERENCED);
 					if (p->act_count < (ACT_MAX - ACT_ADVANCE))
 						p->act_count += ACT_ADVANCE;
 					vm_pageq_requeue(p);
 				}
 			} else if (p->queue == PQ_INACTIVE) {
 				pmap_remove_all(p);
 			}
 			p = next;
 		}
 		vm_page_unlock_queues();
 		if ((backing_object = object->backing_object) == NULL)
 			goto unlock_return;
 		VM_OBJECT_LOCK(backing_object);
 		if (object != first_object)
 			VM_OBJECT_UNLOCK(object);
 	}
 unlock_return:
 	if (object != first_object)
 		VM_OBJECT_UNLOCK(object);
 }
 
 /*
  * deactivate some number of pages in a map, try to do it fairly, but
  * that is really hard to do.
  */
 static void
 vm_pageout_map_deactivate_pages(map, desired)
 	vm_map_t map;
 	long desired;
 {
 	vm_map_entry_t tmpe;
 	vm_object_t obj, bigobj;
 	int nothingwired;
 
 	if (!vm_map_trylock(map))
 		return;
 
 	bigobj = NULL;
 	nothingwired = TRUE;
 
 	/*
 	 * first, search out the biggest object, and try to free pages from
 	 * that.
 	 */
 	tmpe = map->header.next;
 	while (tmpe != &map->header) {
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL && VM_OBJECT_TRYLOCK(obj)) {
 				if (obj->shadow_count <= 1 &&
 				    (bigobj == NULL ||
 				     bigobj->resident_page_count < obj->resident_page_count)) {
 					if (bigobj != NULL)
 						VM_OBJECT_UNLOCK(bigobj);
 					bigobj = obj;
 				} else
 					VM_OBJECT_UNLOCK(obj);
 			}
 		}
 		if (tmpe->wired_count > 0)
 			nothingwired = FALSE;
 		tmpe = tmpe->next;
 	}
 
 	if (bigobj != NULL) {
 		vm_pageout_object_deactivate_pages(map->pmap, bigobj, desired);
 		VM_OBJECT_UNLOCK(bigobj);
 	}
 	/*
 	 * Next, hunt around for other pages to deactivate.  We actually
 	 * do this search sort of wrong -- .text first is not the best idea.
 	 */
 	tmpe = map->header.next;
 	while (tmpe != &map->header) {
 		if (pmap_resident_count(vm_map_pmap(map)) <= desired)
 			break;
 		if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 			obj = tmpe->object.vm_object;
 			if (obj != NULL) {
 				VM_OBJECT_LOCK(obj);
 				vm_pageout_object_deactivate_pages(map->pmap, obj, desired);
 				VM_OBJECT_UNLOCK(obj);
 			}
 		}
 		tmpe = tmpe->next;
 	}
 
 	/*
 	 * Remove all mappings if a process is swapped out, this will free page
 	 * table pages.
 	 */
 	if (desired == 0 && nothingwired) {
 		pmap_remove(vm_map_pmap(map), vm_map_min(map),
 		    vm_map_max(map));
 	}
 	vm_map_unlock(map);
 }
 #endif		/* !defined(NO_SWAPPING) */
 
 /*
  *	vm_pageout_scan does the dirty work for the pageout daemon.
  */
 static void
 vm_pageout_scan(int pass)
 {
 	vm_page_t m, next;
 	struct vm_page marker;
 	int page_shortage, maxscan, pcount;
 	int addl_page_shortage, addl_page_shortage_init;
 	struct proc *p, *bigproc;
 	struct thread *td;
 	vm_offset_t size, bigsize;
 	vm_object_t object;
 	int actcount, cache_cur, cache_first_failure;
 	static int cache_last_free;
 	int vnodes_skipped = 0;
 	int maxlaunder;
 
 	mtx_lock(&Giant);
 	/*
 	 * Decrease registered cache sizes.
 	 */
 	EVENTHANDLER_INVOKE(vm_lowmem, 0);
 	/*
 	 * We do this explicitly after the caches have been drained above.
 	 */
 	uma_reclaim();
 
 	addl_page_shortage_init = atomic_readandclear_int(&vm_pageout_deficit);
 
 	/*
 	 * Calculate the number of pages we want to either free or move
 	 * to the cache.
 	 */
 	page_shortage = vm_paging_target() + addl_page_shortage_init;
 
 	/*
 	 * Initialize our marker
 	 */
 	bzero(&marker, sizeof(marker));
 	marker.flags = PG_FICTITIOUS | PG_MARKER;
 	marker.oflags = VPO_BUSY;
 	marker.queue = PQ_INACTIVE;
 	marker.wire_count = 1;
 
 	/*
 	 * Start scanning the inactive queue for pages we can move to the
 	 * cache or free.  The scan will stop when the target is reached or
 	 * we have scanned the entire inactive queue.  Note that m->act_count
 	 * is not used to form decisions for the inactive queue, only for the
 	 * active queue.
 	 *
 	 * maxlaunder limits the number of dirty pages we flush per scan.
 	 * For most systems a smaller value (16 or 32) is more robust under
 	 * extreme memory and disk pressure because any unnecessary writes
 	 * to disk can result in extreme performance degredation.  However,
 	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
 	 * used) will die horribly with limited laundering.  If the pageout
 	 * daemon cannot clean enough pages in the first pass, we let it go
 	 * all out in succeeding passes.
 	 */
 	if ((maxlaunder = vm_max_launder) <= 1)
 		maxlaunder = 1;
 	if (pass)
 		maxlaunder = 10000;
 	vm_page_lock_queues();
 rescan0:
 	addl_page_shortage = addl_page_shortage_init;
 	maxscan = cnt.v_inactive_count;
 
 	for (m = TAILQ_FIRST(&vm_page_queues[PQ_INACTIVE].pl);
 	     m != NULL && maxscan-- > 0 && page_shortage > 0;
 	     m = next) {
 
-		PCPU_INC(cnt.v_pdpages);
+		cnt.v_pdpages++;
 
 		if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE) {
 			goto rescan0;
 		}
 
 		next = TAILQ_NEXT(m, pageq);
 		object = m->object;
 
 		/*
 		 * skip marker pages
 		 */
 		if (m->flags & PG_MARKER)
 			continue;
 
 		/*
 		 * A held page may be undergoing I/O, so skip it.
 		 */
 		if (m->hold_count) {
 			vm_pageq_requeue(m);
 			addl_page_shortage++;
 			continue;
 		}
 		/*
 		 * Don't mess with busy pages, keep in the front of the
 		 * queue, most likely are being paged out.
 		 */
 		if (!VM_OBJECT_TRYLOCK(object) &&
 		    (!vm_pageout_fallback_object_lock(m, &next) ||
 		     m->hold_count != 0)) {
 			VM_OBJECT_UNLOCK(object);
 			addl_page_shortage++;
 			continue;
 		}
 		if (m->busy || (m->oflags & VPO_BUSY)) {
 			VM_OBJECT_UNLOCK(object);
 			addl_page_shortage++;
 			continue;
 		}
 
 		/*
 		 * If the object is not being used, we ignore previous 
 		 * references.
 		 */
 		if (object->ref_count == 0) {
 			vm_page_flag_clear(m, PG_REFERENCED);
 			pmap_clear_reference(m);
 
 		/*
 		 * Otherwise, if the page has been referenced while in the 
 		 * inactive queue, we bump the "activation count" upwards, 
 		 * making it less likely that the page will be added back to 
 		 * the inactive queue prematurely again.  Here we check the 
 		 * page tables (or emulated bits, if any), given the upper 
 		 * level VM system not knowing anything about existing 
 		 * references.
 		 */
 		} else if (((m->flags & PG_REFERENCED) == 0) &&
 			(actcount = pmap_ts_referenced(m))) {
 			vm_page_activate(m);
 			VM_OBJECT_UNLOCK(object);
 			m->act_count += (actcount + ACT_ADVANCE);
 			continue;
 		}
 
 		/*
 		 * If the upper level VM system knows about any page 
 		 * references, we activate the page.  We also set the 
 		 * "activation count" higher than normal so that we will less 
 		 * likely place pages back onto the inactive queue again.
 		 */
 		if ((m->flags & PG_REFERENCED) != 0) {
 			vm_page_flag_clear(m, PG_REFERENCED);
 			actcount = pmap_ts_referenced(m);
 			vm_page_activate(m);
 			VM_OBJECT_UNLOCK(object);
 			m->act_count += (actcount + ACT_ADVANCE + 1);
 			continue;
 		}
 
 		/*
 		 * If the upper level VM system doesn't know anything about 
 		 * the page being dirty, we have to check for it again.  As 
 		 * far as the VM code knows, any partially dirty pages are 
 		 * fully dirty.
 		 */
 		if (m->dirty == 0 && !pmap_is_modified(m)) {
 			/*
 			 * Avoid a race condition: Unless write access is
 			 * removed from the page, another processor could
 			 * modify it before all access is removed by the call
 			 * to vm_page_cache() below.  If vm_page_cache() finds
 			 * that the page has been modified when it removes all
 			 * access, it panics because it cannot cache dirty
 			 * pages.  In principle, we could eliminate just write
 			 * access here rather than all access.  In the expected
 			 * case, when there are no last instant modifications
 			 * to the page, removing all access will be cheaper
 			 * overall.
 			 */
 			if ((m->flags & PG_WRITEABLE) != 0)
 				pmap_remove_all(m);
 		} else {
 			vm_page_dirty(m);
 		}
 
 		if (m->valid == 0) {
 			/*
 			 * Invalid pages can be easily freed
 			 */
 			vm_page_free(m);
-			PCPU_INC(cnt.v_dfree);
+			cnt.v_dfree++;
 			--page_shortage;
 		} else if (m->dirty == 0) {
 			/*
 			 * Clean pages can be placed onto the cache queue.
 			 * This effectively frees them.
 			 */
 			vm_page_cache(m);
 			--page_shortage;
 		} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
 			/*
 			 * Dirty pages need to be paged out, but flushing
 			 * a page is extremely expensive verses freeing
 			 * a clean page.  Rather then artificially limiting
 			 * the number of pages we can flush, we instead give
 			 * dirty pages extra priority on the inactive queue
 			 * by forcing them to be cycled through the queue
 			 * twice before being flushed, after which the
 			 * (now clean) page will cycle through once more
 			 * before being freed.  This significantly extends
 			 * the thrash point for a heavily loaded machine.
 			 */
 			vm_page_flag_set(m, PG_WINATCFLS);
 			vm_pageq_requeue(m);
 		} else if (maxlaunder > 0) {
 			/*
 			 * We always want to try to flush some dirty pages if
 			 * we encounter them, to keep the system stable.
 			 * Normally this number is small, but under extreme
 			 * pressure where there are insufficient clean pages
 			 * on the inactive queue, we may have to go all out.
 			 */
 			int swap_pageouts_ok;
 			struct vnode *vp = NULL;
 			struct mount *mp;
 
 			if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) {
 				swap_pageouts_ok = 1;
 			} else {
 				swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts);
 				swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts &&
 				vm_page_count_min());
 										
 			}
 
 			/*
 			 * We don't bother paging objects that are "dead".  
 			 * Those objects are in a "rundown" state.
 			 */
 			if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
 				VM_OBJECT_UNLOCK(object);
 				vm_pageq_requeue(m);
 				continue;
 			}
 
 			/*
 			 * Following operations may unlock
 			 * vm_page_queue_mtx, invalidating the 'next'
 			 * pointer.  To prevent an inordinate number
 			 * of restarts we use our marker to remember
 			 * our place.
 			 *
 			 */
 			TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE].pl,
 					   m, &marker, pageq);
 			/*
 			 * The object is already known NOT to be dead.   It
 			 * is possible for the vget() to block the whole
 			 * pageout daemon, but the new low-memory handling
 			 * code should prevent it.
 			 *
 			 * The previous code skipped locked vnodes and, worse,
 			 * reordered pages in the queue.  This results in
 			 * completely non-deterministic operation and, on a
 			 * busy system, can lead to extremely non-optimal
 			 * pageouts.  For example, it can cause clean pages
 			 * to be freed and dirty pages to be moved to the end
 			 * of the queue.  Since dirty pages are also moved to
 			 * the end of the queue once-cleaned, this gives
 			 * way too large a weighting to defering the freeing
 			 * of dirty pages.
 			 *
 			 * We can't wait forever for the vnode lock, we might
 			 * deadlock due to a vn_read() getting stuck in
 			 * vm_wait while holding this vnode.  We skip the 
 			 * vnode if we can't get it in a reasonable amount
 			 * of time.
 			 */
 			if (object->type == OBJT_VNODE) {
 				vp = object->handle;
 				mp = NULL;
 				if (vp->v_type == VREG &&
 				    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 					++pageout_lock_miss;
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					vp = NULL;
 					goto unlock_and_continue;
 				}
 				vm_page_unlock_queues();
 				VI_LOCK(vp);
 				VM_OBJECT_UNLOCK(object);
 				if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK |
 				    LK_TIMELOCK, curthread)) {
 					VM_OBJECT_LOCK(object);
 					vm_page_lock_queues();
 					++pageout_lock_miss;
 					vn_finished_write(mp);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					vp = NULL;
 					goto unlock_and_continue;
 				}
 				VM_OBJECT_LOCK(object);
 				vm_page_lock_queues();
 				/*
 				 * The page might have been moved to another
 				 * queue during potential blocking in vget()
 				 * above.  The page might have been freed and
 				 * reused for another vnode.  The object might
 				 * have been reused for another vnode.
 				 */
 				if (VM_PAGE_GETQUEUE(m) != PQ_INACTIVE ||
 				    m->object != object ||
 				    object->handle != vp ||
 				    TAILQ_NEXT(m, pageq) != &marker) {
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					goto unlock_and_continue;
 				}
 	
 				/*
 				 * The page may have been busied during the
 				 * blocking in vput();  We don't move the
 				 * page back onto the end of the queue so that
 				 * statistics are more correct if we don't.
 				 */
 				if (m->busy || (m->oflags & VPO_BUSY)) {
 					goto unlock_and_continue;
 				}
 
 				/*
 				 * If the page has become held it might
 				 * be undergoing I/O, so skip it
 				 */
 				if (m->hold_count) {
 					vm_pageq_requeue(m);
 					if (object->flags & OBJ_MIGHTBEDIRTY)
 						vnodes_skipped++;
 					goto unlock_and_continue;
 				}
 			}
 
 			/*
 			 * If a page is dirty, then it is either being washed
 			 * (but not yet cleaned) or it is still in the
 			 * laundry.  If it is still in the laundry, then we
 			 * start the cleaning operation. 
 			 *
 			 * decrement page_shortage on success to account for
 			 * the (future) cleaned page.  Otherwise we could wind
 			 * up laundering or cleaning too many pages.
 			 */
 			if (vm_pageout_clean(m) != 0) {
 				--page_shortage;
 				--maxlaunder;
 			}
 unlock_and_continue:
 			VM_OBJECT_UNLOCK(object);
 			if (vp) {
 				vm_page_unlock_queues();
 				vput(vp);
 				vn_finished_write(mp);
 				vm_page_lock_queues();
 			}
 			next = TAILQ_NEXT(&marker, pageq);
 			TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE].pl,
 				     &marker, pageq);
 			continue;
 		}
 		VM_OBJECT_UNLOCK(object);
 	}
 
 	/*
 	 * Compute the number of pages we want to try to move from the
 	 * active queue to the inactive queue.
 	 */
 	page_shortage = vm_paging_target() +
 		cnt.v_inactive_target - cnt.v_inactive_count;
 	page_shortage += addl_page_shortage;
 
 	/*
 	 * Scan the active queue for things we can deactivate. We nominally
 	 * track the per-page activity counter and use it to locate
 	 * deactivation candidates.
 	 */
 	pcount = cnt.v_active_count;
 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
 
 	while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) {
 
 		KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE),
 		    ("vm_pageout_scan: page %p isn't active", m));
 
 		next = TAILQ_NEXT(m, pageq);
 		object = m->object;
 		if ((m->flags & PG_MARKER) != 0) {
 			m = next;
 			continue;
 		}
 		if (!VM_OBJECT_TRYLOCK(object) &&
 		    !vm_pageout_fallback_object_lock(m, &next)) {
 			VM_OBJECT_UNLOCK(object);
 			m = next;
 			continue;
 		}
 
 		/*
 		 * Don't deactivate pages that are busy.
 		 */
 		if ((m->busy != 0) ||
 		    (m->oflags & VPO_BUSY) ||
 		    (m->hold_count != 0)) {
 			VM_OBJECT_UNLOCK(object);
 			vm_pageq_requeue(m);
 			m = next;
 			continue;
 		}
 
 		/*
 		 * The count for pagedaemon pages is done after checking the
 		 * page for eligibility...
 		 */
-		PCPU_INC(cnt.v_pdpages);
+		cnt.v_pdpages++;
 
 		/*
 		 * Check to see "how much" the page has been used.
 		 */
 		actcount = 0;
 		if (object->ref_count != 0) {
 			if (m->flags & PG_REFERENCED) {
 				actcount += 1;
 			}
 			actcount += pmap_ts_referenced(m);
 			if (actcount) {
 				m->act_count += ACT_ADVANCE + actcount;
 				if (m->act_count > ACT_MAX)
 					m->act_count = ACT_MAX;
 			}
 		}
 
 		/*
 		 * Since we have "tested" this bit, we need to clear it now.
 		 */
 		vm_page_flag_clear(m, PG_REFERENCED);
 
 		/*
 		 * Only if an object is currently being used, do we use the
 		 * page activation count stats.
 		 */
 		if (actcount && (object->ref_count != 0)) {
 			vm_pageq_requeue(m);
 		} else {
 			m->act_count -= min(m->act_count, ACT_DECLINE);
 			if (vm_pageout_algorithm ||
 			    object->ref_count == 0 ||
 			    m->act_count == 0) {
 				page_shortage--;
 				if (object->ref_count == 0) {
 					pmap_remove_all(m);
 					if (m->dirty == 0)
 						vm_page_cache(m);
 					else
 						vm_page_deactivate(m);
 				} else {
 					vm_page_deactivate(m);
 				}
 			} else {
 				vm_pageq_requeue(m);
 			}
 		}
 		VM_OBJECT_UNLOCK(object);
 		m = next;
 	}
 
 	/*
 	 * We try to maintain some *really* free pages, this allows interrupt
 	 * code to be guaranteed space.  Since both cache and free queues 
 	 * are considered basically 'free', moving pages from cache to free
 	 * does not effect other calculations.
 	 */
 	cache_cur = cache_last_free;
 	cache_first_failure = -1;
 	while (cnt.v_free_count < cnt.v_free_reserved && (cache_cur =
 	    (cache_cur + PQ_PRIME2) & PQ_COLORMASK) != cache_first_failure) {
 		TAILQ_FOREACH(m, &vm_page_queues[PQ_CACHE + cache_cur].pl,
 		    pageq) {
 			KASSERT(m->dirty == 0,
 			    ("Found dirty cache page %p", m));
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("Found mapped cache page %p", m));
 			KASSERT((m->flags & PG_UNMANAGED) == 0,
 			    ("Found unmanaged cache page %p", m));
 			KASSERT(m->wire_count == 0,
 			    ("Found wired cache page %p", m));
 			if (m->hold_count == 0 && VM_OBJECT_TRYLOCK(object =
 			    m->object)) {
 				KASSERT((m->oflags & VPO_BUSY) == 0 &&
 				    m->busy == 0, ("Found busy cache page %p",
 				    m));
 				vm_page_free(m);
 				VM_OBJECT_UNLOCK(object);
-				PCPU_INC(cnt.v_dfree);
+				cnt.v_dfree++;
 				cache_last_free = cache_cur;
 				cache_first_failure = -1;
 				break;
 			}
 		}
 		if (m == NULL && cache_first_failure == -1)
 			cache_first_failure = cache_cur;
 	}
 	vm_page_unlock_queues();
 #if !defined(NO_SWAPPING)
 	/*
 	 * Idle process swapout -- run once per second.
 	 */
 	if (vm_swap_idle_enabled) {
 		static long lsec;
 		if (time_second != lsec) {
 			vm_pageout_req_swapout |= VM_SWAP_IDLE;
 			vm_req_vmdaemon();
 			lsec = time_second;
 		}
 	}
 #endif
 		
 	/*
 	 * If we didn't get enough free pages, and we have skipped a vnode
 	 * in a writeable object, wakeup the sync daemon.  And kick swapout
 	 * if we did not get enough free pages.
 	 */
 	if (vm_paging_target() > 0) {
 		if (vnodes_skipped && vm_page_count_min())
 			(void) speedup_syncer();
 #if !defined(NO_SWAPPING)
 		if (vm_swap_enabled && vm_page_count_target()) {
 			vm_req_vmdaemon();
 			vm_pageout_req_swapout |= VM_SWAP_NORMAL;
 		}
 #endif
 	}
 
 	/*
 	 * If we are critically low on one of RAM or swap and low on
 	 * the other, kill the largest process.  However, we avoid
 	 * doing this on the first pass in order to give ourselves a
 	 * chance to flush out dirty vnode-backed pages and to allow
 	 * active pages to be moved to the inactive queue and reclaimed.
 	 *
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
 	 * deadlock if process B is bigproc and one of it's child processes
 	 * attempts to propagate a signal to B while we are waiting for A's
 	 * lock while walking this list.  To avoid this, we don't block on
 	 * the process lock but just skip a process if it is already locked.
 	 */
 	if (pass != 0 &&
 	    ((swap_pager_avail < 64 && vm_page_count_min()) ||
 	     (swap_pager_full && vm_paging_target() > 0))) {
 		bigproc = NULL;
 		bigsize = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			int breakout;
 
 			if (PROC_TRYLOCK(p) == 0)
 				continue;
 			/*
 			 * If this is a system or protected process, skip it.
 			 */
 			if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
 			    (p->p_flag & P_PROTECTED) ||
 			    ((p->p_pid < 48) && (swap_pager_avail != 0))) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * If the process is in a non-running type state,
 			 * don't touch it.  Check all the threads individually.
 			 */
 			PROC_SLOCK(p);
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td)) {
 					thread_unlock(td);
 					breakout = 1;
 					break;
 				}
 				thread_unlock(td);
 			}
 			PROC_SUNLOCK(p);
 			if (breakout) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * get the process size
 			 */
 			if (!vm_map_trylock_read(&p->p_vmspace->vm_map)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			size = vmspace_swap_count(p->p_vmspace);
 			vm_map_unlock_read(&p->p_vmspace->vm_map);
 			size += vmspace_resident_count(p->p_vmspace);
 			/*
 			 * if the this process is bigger than the biggest one
 			 * remember it.
 			 */
 			if (size > bigsize) {
 				if (bigproc != NULL)
 					PROC_UNLOCK(bigproc);
 				bigproc = p;
 				bigsize = size;
 			} else
 				PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		if (bigproc != NULL) {
 			killproc(bigproc, "out of swap space");
 			PROC_SLOCK(bigproc);
 			sched_nice(bigproc, PRIO_MIN);
 			PROC_SUNLOCK(bigproc);
 			PROC_UNLOCK(bigproc);
 			wakeup(&cnt.v_free_count);
 		}
 	}
 	mtx_unlock(&Giant);
 }
 
 /*
  * This routine tries to maintain the pseudo LRU active queue,
  * so that during long periods of time where there is no paging,
  * that some statistic accumulation still occurs.  This code
  * helps the situation where paging just starts to occur.
  */
 static void
 vm_pageout_page_stats()
 {
 	vm_object_t object;
 	vm_page_t m,next;
 	int pcount,tpcount;		/* Number of pages to check */
 	static int fullintervalcount = 0;
 	int page_shortage;
 
 	mtx_assert(&vm_page_queue_mtx, MA_OWNED);
 	page_shortage = 
 	    (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) -
 	    (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count);
 
 	if (page_shortage <= 0)
 		return;
 
 	pcount = cnt.v_active_count;
 	fullintervalcount += vm_pageout_stats_interval;
 	if (fullintervalcount < vm_pageout_full_stats_interval) {
 		tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count;
 		if (pcount > tpcount)
 			pcount = tpcount;
 	} else {
 		fullintervalcount = 0;
 	}
 
 	m = TAILQ_FIRST(&vm_page_queues[PQ_ACTIVE].pl);
 	while ((m != NULL) && (pcount-- > 0)) {
 		int actcount;
 
 		KASSERT(VM_PAGE_INQUEUE2(m, PQ_ACTIVE),
 		    ("vm_pageout_page_stats: page %p isn't active", m));
 
 		next = TAILQ_NEXT(m, pageq);
 		object = m->object;
 
 		if ((m->flags & PG_MARKER) != 0) {
 			m = next;
 			continue;
 		}
 		if (!VM_OBJECT_TRYLOCK(object) &&
 		    !vm_pageout_fallback_object_lock(m, &next)) {
 			VM_OBJECT_UNLOCK(object);
 			m = next;
 			continue;
 		}
 
 		/*
 		 * Don't deactivate pages that are busy.
 		 */
 		if ((m->busy != 0) ||
 		    (m->oflags & VPO_BUSY) ||
 		    (m->hold_count != 0)) {
 			VM_OBJECT_UNLOCK(object);
 			vm_pageq_requeue(m);
 			m = next;
 			continue;
 		}
 
 		actcount = 0;
 		if (m->flags & PG_REFERENCED) {
 			vm_page_flag_clear(m, PG_REFERENCED);
 			actcount += 1;
 		}
 
 		actcount += pmap_ts_referenced(m);
 		if (actcount) {
 			m->act_count += ACT_ADVANCE + actcount;
 			if (m->act_count > ACT_MAX)
 				m->act_count = ACT_MAX;
 			vm_pageq_requeue(m);
 		} else {
 			if (m->act_count == 0) {
 				/*
 				 * We turn off page access, so that we have
 				 * more accurate RSS stats.  We don't do this
 				 * in the normal page deactivation when the
 				 * system is loaded VM wise, because the
 				 * cost of the large number of page protect
 				 * operations would be higher than the value
 				 * of doing the operation.
 				 */
 				pmap_remove_all(m);
 				vm_page_deactivate(m);
 			} else {
 				m->act_count -= min(m->act_count, ACT_DECLINE);
 				vm_pageq_requeue(m);
 			}
 		}
 		VM_OBJECT_UNLOCK(object);
 		m = next;
 	}
 }
 
 /*
  *	vm_pageout is the high level pageout daemon.
  */
 static void
 vm_pageout()
 {
 	int error, pass;
 
 	/*
 	 * Initialize some paging parameters.
 	 */
 	cnt.v_interrupt_free_min = 2;
 	if (cnt.v_page_count < 2000)
 		vm_pageout_page_count = 8;
 
 	/*
 	 * v_free_reserved needs to include enough for the largest
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging. 
 	 */
 	if (cnt.v_page_count > 1024)
 		cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200;
 	else
 		cnt.v_free_min = 4;
 	cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
 	    cnt.v_interrupt_free_min;
 	cnt.v_free_reserved = vm_pageout_page_count +
 	    cnt.v_pageout_free_min + (cnt.v_page_count / 768) + PQ_NUMCOLORS;
 	cnt.v_free_severe = cnt.v_free_min / 2;
-
-	/*
-	 * Here adds don't need to be atomic since we are only initializing
-	 * v_free_min and v_free_severe.
-	 */
 	cnt.v_free_min += cnt.v_free_reserved;
 	cnt.v_free_severe += cnt.v_free_reserved;
 
 	/*
 	 * v_free_target and v_cache_min control pageout hysteresis.  Note
 	 * that these are more a measure of the VM cache queue hysteresis
 	 * then the VM free queue.  Specifically, v_free_target is the
 	 * high water mark (free+cache pages).
 	 *
 	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
 	 * low water mark, while v_free_min is the stop.  v_cache_min must
 	 * be big enough to handle memory needs while the pageout daemon
 	 * is signalled and run to free more pages.
 	 */
 	if (cnt.v_free_count > 6144)
 		cnt.v_free_target = 4 * cnt.v_free_min + cnt.v_free_reserved;
 	else
 		cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved;
 
 	if (cnt.v_free_count > 2048) {
 		cnt.v_cache_min = cnt.v_free_target;
 		cnt.v_cache_max = 2 * cnt.v_cache_min;
 		cnt.v_inactive_target = (3 * cnt.v_free_target) / 2;
 	} else {
 		cnt.v_cache_min = 0;
 		cnt.v_cache_max = 0;
 		cnt.v_inactive_target = cnt.v_free_count / 4;
 	}
 	if (cnt.v_inactive_target > cnt.v_free_count / 3)
 		cnt.v_inactive_target = cnt.v_free_count / 3;
 
 	/* XXX does not really belong here */
 	if (vm_page_max_wired == 0)
 		vm_page_max_wired = cnt.v_free_count / 3;
 
 	if (vm_pageout_stats_max == 0)
 		vm_pageout_stats_max = cnt.v_free_target;
 
 	/*
 	 * Set interval in seconds for stats scan.
 	 */
 	if (vm_pageout_stats_interval == 0)
 		vm_pageout_stats_interval = 5;
 	if (vm_pageout_full_stats_interval == 0)
 		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;
 
 	swap_pager_swap_init();
 	pass = 0;
 	/*
 	 * The pageout daemon is never done, so loop forever.
 	 */
 	while (TRUE) {
 		/*
 		 * If we have enough free memory, wakeup waiters.  Do
 		 * not clear vm_pages_needed until we reach our target,
 		 * otherwise we may be woken up over and over again and
 		 * waste a lot of cpu.
 		 */
 		mtx_lock(&vm_page_queue_free_mtx);
 		if (vm_pages_needed && !vm_page_count_min()) {
 			if (!vm_paging_needed())
 				vm_pages_needed = 0;
 			wakeup(&cnt.v_free_count);
 		}
 		if (vm_pages_needed) {
 			/*
 			 * Still not done, take a second pass without waiting
 			 * (unlimited dirty cleaning), otherwise sleep a bit
 			 * and try again.
 			 */
 			++pass;
 			if (pass > 1)
 				msleep(&vm_pages_needed,
 				    &vm_page_queue_free_mtx, PVM, "psleep",
 				    hz / 2);
 		} else {
 			/*
 			 * Good enough, sleep & handle stats.  Prime the pass
 			 * for the next run.
 			 */
 			if (pass > 1)
 				pass = 1;
 			else
 				pass = 0;
 			error = msleep(&vm_pages_needed,
 			    &vm_page_queue_free_mtx, PVM, "psleep",
 			    vm_pageout_stats_interval * hz);
 			if (error && !vm_pages_needed) {
 				mtx_unlock(&vm_page_queue_free_mtx);
 				pass = 0;
 				vm_page_lock_queues();
 				vm_pageout_page_stats();
 				vm_page_unlock_queues();
 				continue;
 			}
 		}
 		if (vm_pages_needed)
-			PCPU_INC(cnt.v_pdwakeups);
+			cnt.v_pdwakeups++;
 		mtx_unlock(&vm_page_queue_free_mtx);
 		vm_pageout_scan(pass);
 	}
 }
 
 /*
  * Unless the free page queue lock is held by the caller, this function
  * should be regarded as advisory.  Specifically, the caller should
  * not msleep() on &cnt.v_free_count following this function unless
  * the free page queue lock is held until the msleep() is performed.
  */
 void
 pagedaemon_wakeup()
 {
 
 	if (!vm_pages_needed && curthread->td_proc != pageproc) {
 		vm_pages_needed = 1;
 		wakeup(&vm_pages_needed);
 	}
 }
 
 #if !defined(NO_SWAPPING)
 static void
 vm_req_vmdaemon()
 {
 	static int lastrun = 0;
 
 	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
 		wakeup(&vm_daemon_needed);
 		lastrun = ticks;
 	}
 }
 
 static void
 vm_daemon()
 {
 	struct rlimit rsslim;
 	struct proc *p;
 	struct thread *td;
 	int breakout;
 
 	mtx_lock(&Giant);
 	while (TRUE) {
 		tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0);
 		if (vm_pageout_req_swapout) {
 			swapout_procs(vm_pageout_req_swapout);
 			vm_pageout_req_swapout = 0;
 		}
 		/*
 		 * scan the processes for exceeding their rlimits or if
 		 * process is swapped out -- deactivate pages
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			vm_pindex_t limit, size;
 
 			/*
 			 * if this is a system process or if we have already
 			 * looked at this process, skip it.
 			 */
 			PROC_LOCK(p);
 			if (p->p_flag & (P_SYSTEM | P_WEXIT)) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * if the process is in a non-running type state,
 			 * don't touch it.
 			 */
 			PROC_SLOCK(p);
 			breakout = 0;
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (!TD_ON_RUNQ(td) &&
 				    !TD_IS_RUNNING(td) &&
 				    !TD_IS_SLEEPING(td)) {
 					thread_unlock(td);
 					breakout = 1;
 					break;
 				}
 				thread_unlock(td);
 			}
 			PROC_SUNLOCK(p);
 			if (breakout) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			/*
 			 * get a limit
 			 */
 			lim_rlimit(p, RLIMIT_RSS, &rsslim);
 			limit = OFF_TO_IDX(
 			    qmin(rsslim.rlim_cur, rsslim.rlim_max));
 
 			/*
 			 * let processes that are swapped out really be
 			 * swapped out set the limit to nothing (will force a
 			 * swap-out.)
 			 */
 			if ((p->p_sflag & PS_INMEM) == 0)
 				limit = 0;	/* XXX */
 			PROC_UNLOCK(p);
 
 			size = vmspace_resident_count(p->p_vmspace);
 			if (limit >= 0 && size >= limit) {
 				vm_pageout_map_deactivate_pages(
 				    &p->p_vmspace->vm_map, limit);
 			}
 		}
 		sx_sunlock(&allproc_lock);
 	}
 }
 #endif			/* !defined(NO_SWAPPING) */
Index: head/sys/vm/vm_pageq.c
===================================================================
--- head/sys/vm/vm_pageq.c	(revision 170516)
+++ head/sys/vm/vm_pageq.c	(revision 170517)
@@ -1,330 +1,330 @@
 /*-
  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vmpage.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/linker_set.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
 /* File-local coloring setup, run from vm_pageq_init(). */
 static void vm_coloring_init(void);
 /*
  * Not defined in this file; vm_coloring_init() calls it with pointers to
  * pq_cachesize and pq_cachenways and then reads both values back.
  */
 void setPQL2(int *const size, int *const ways);
 
 /* The page queues themselves, indexed by queue number. */
 struct vpgqueues vm_page_queues[PQ_MAXCOUNT];
 /* NOTE(review): presumably the storage behind the PQ_* tunables -- confirm in vm_page.h. */
 struct pq_coloring page_queue_coloring;
 
 static int pq_cachesize = 0;	/* size of the cache in KB */
 static int pq_cachenways = 0;	/* associativity of the cache */
 
 /* Read-only sysctl exports of the coloring parameters under vm.stats.pagequeue. */
 SYSCTL_NODE(_vm_stats, OID_AUTO, pagequeue, CTLFLAG_RW, 0, "VM meter stats");
 SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, page_colors, CTLFLAG_RD,
     &(PQ_NUMCOLORS), 0, "Number of colors in the page queue");
 SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, cachesize, CTLFLAG_RD,
     &pq_cachesize, 0, "Size of the processor cache in KB");
 SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, cachenways, CTLFLAG_RD,
     &pq_cachenways, 0, "Associativity of the processor cache");
 SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, prime1, CTLFLAG_RD,
     &(PQ_PRIME1), 0, "Cache tuning value");
 SYSCTL_INT(_vm_stats_pagequeue, OID_AUTO, prime2, CTLFLAG_RD,
     &(PQ_PRIME2), 0, "Cache tuning value");
 
 /*
  *	vm_coloring_init:
  *
  *	Choose the number of page colors and the two tuning primes, then
  *	derive the queue indices that depend on the color count.
  *
  *	With PQ_NOOPT defined coloring is disabled (one color).  Otherwise
  *	the color count is computed from the cache size and associativity
  *	reported by setPQL2(), falls back to 32 when either value is not
  *	positive, and is clamped to PQ_MAXCOLORS.  Fewer than 16 colors
  *	also disables coloring.
  */
 static void
 vm_coloring_init(void)
 {
 #ifdef PQ_NOOPT
 	PQ_NUMCOLORS = PQ_PRIME1 = PQ_PRIME2 = 1;
 #else
 
 	setPQL2(&pq_cachesize, &pq_cachenways);
 
 	/* The division below needs a page size of at least 1KB. */
 	CTASSERT(PAGE_SIZE/1024 > 0);
 
 	if (pq_cachesize > 0 && pq_cachenways > 0)
 		PQ_NUMCOLORS = pq_cachesize / (PAGE_SIZE/1024) /
 		    pq_cachenways;
 	else
 		PQ_NUMCOLORS = 32;
 
 	if (PQ_MAXCOLORS < PQ_NUMCOLORS) {
 		/*
 		 * Terminate the message with a newline so that the next
 		 * console message does not run onto the same line.
 		 */
 		printf("VM-PQ color limit (PQ_MAXCOLORS=%u) exceeded (%u), "
 		    "see vm_page.h\n", PQ_MAXCOLORS, PQ_NUMCOLORS);
 		PQ_NUMCOLORS = PQ_MAXCOLORS;
 	}
 
 	/* Pick the tuning primes from the color count. */
 	if (PQ_NUMCOLORS >= 128) {
 		PQ_PRIME1 = 31;
 		PQ_PRIME2 = 23;
 	} else if (PQ_NUMCOLORS >= 64) {
 		PQ_PRIME1 = 13;
 		PQ_PRIME2 = 7;
 	} else if (PQ_NUMCOLORS >= 32) {
 		PQ_PRIME1 = 9;
 		PQ_PRIME2 = 5;
 	} else if (PQ_NUMCOLORS >= 16) {
 		PQ_PRIME1 = 5;
 		PQ_PRIME2 = 3;
 	} else
 		PQ_NUMCOLORS = PQ_PRIME1 = PQ_PRIME2 = 1;
 #endif
 
 	/*
 	 * PQ_CACHE is the first of PQ_NUMCOLORS consecutive queues, which
 	 * is why PQ_HOLD starts PQ_NUMCOLORS entries after it.
 	 *
 	 * NOTE(review): PQ_COLORMASK is only a proper bit mask when
 	 * PQ_NUMCOLORS is a power of two; the cache-derived value above
 	 * is not rounded -- confirm that setPQL2() guarantees this.
 	 */
 	PQ_COLORMASK = PQ_NUMCOLORS - 1;
 	PQ_INACTIVE  = 1 + PQ_NUMCOLORS;
 	PQ_ACTIVE    = 2 + PQ_NUMCOLORS;
 	PQ_CACHE     = 3 + PQ_NUMCOLORS;
 	PQ_HOLD      = 3 + 2 * PQ_NUMCOLORS;
 	PQ_COUNT     = 4 + 2 * PQ_NUMCOLORS;
 	PQ_MAXLENGTH = PQ_NUMCOLORS / 3 + PQ_PRIME1;
 
 #if 0
 	/* XXX: is it possible to allocate vm_page_queues[PQ_COUNT] here? */
 #error XXX: vm_page_queues = malloc(PQ_COUNT * sizeof(struct vpgqueues));
 #endif
 
 	/* Report the result only on verbose boots, and only if coloring is on. */
 	if (bootverbose && PQ_NUMCOLORS > 1) {
 		printf("Using %d colors for the VM-PQ tuning (%d, %d)\n",
 		    PQ_NUMCOLORS, pq_cachesize, pq_cachenways);
 	}
 }
 
 /*
  *	vm_pageq_init:
  *
  *	Run the coloring setup, attach every page queue to the global
  *	counter it maintains, and initialize each queue's list head.
  */
 void
 vm_pageq_init(void)
 {
 	int color;
 	int q;
 
 	vm_coloring_init();
 
 	/* All colors of the free queue share a single counter. */
 	for (color = 0; color < PQ_NUMCOLORS; color++)
 		vm_page_queues[PQ_FREE + color].cnt = &cnt.v_free_count;
 
 	/* Likewise for all colors of the cache queue. */
 	for (color = 0; color < PQ_NUMCOLORS; color++)
 		vm_page_queues[PQ_CACHE + color].cnt = &cnt.v_cache_count;
 
 	/* The hold queue is accounted against v_active_count as well. */
 	vm_page_queues[PQ_INACTIVE].cnt = &cnt.v_inactive_count;
 	vm_page_queues[PQ_ACTIVE].cnt = &cnt.v_active_count;
 	vm_page_queues[PQ_HOLD].cnt = &cnt.v_active_count;
 
 	for (q = 0; q < PQ_COUNT; q++)
 		TAILQ_INIT(&vm_page_queues[q].pl);
 }
 
 /*
  *	vm_pageq_requeue:
  *
  *	Move a page to the tail of the queue it is currently on.  A page
  *	that is on no queue (PQ_NONE) is left untouched.
  */
 void
 vm_pageq_requeue(vm_page_t m)
 {
 	struct vpgqueues *cur;
 	int q;
 
 	q = VM_PAGE_GETQUEUE(m);
 	if (q == PQ_NONE)
 		return;
 
 	cur = &vm_page_queues[q];
 	TAILQ_REMOVE(&cur->pl, m, pageq);
 	TAILQ_INSERT_TAIL(&cur->pl, m, pageq);
 }
 
 /*
  *	vm_pageq_enqueue:
  *
  *	Record "queue" as the page's queue, append the page to the tail
  *	of that queue, and bump both the shared counter the queue points
  *	at and the queue's own length.
  */
 void
 vm_pageq_enqueue(int queue, vm_page_t m)
 {
 	struct vpgqueues *target = &vm_page_queues[queue];
 
 	VM_PAGE_SETQUEUE2(m, queue);
 	TAILQ_INSERT_TAIL(&target->pl, m, pageq);
 	(*target->cnt)++;
 	target->lcnt++;
 }
 
 /*
  *	vm_pageq_add_new_page:
  *
  *	Add a new page to the freelist for use by the system.
  *
  *	Counts the page in cnt.v_page_count, initializes the vm_page
  *	that describes physical address "pa", and places it on the free
  *	queue of its color while holding the free page queue mutex.
  */
 void
 vm_pageq_add_new_page(vm_paddr_t pa)
 {
 	vm_page_t m;
 
-	atomic_add_int(&cnt.v_page_count, 1);
+	cnt.v_page_count++;
 	m = PHYS_TO_VM_PAGE(pa);
 	m->phys_addr = pa;
 	m->flags = 0;
 	/* The page color is the page frame number masked by the color mask. */
 	m->pc = (pa >> PAGE_SHIFT) & PQ_COLORMASK;
 	pmap_page_init(m);
 	mtx_lock(&vm_page_queue_free_mtx);
 	vm_pageq_enqueue(m->pc + PQ_FREE, m);
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  *	vm_pageq_remove_nowakeup:
  *
  *	Take a page off whatever queue it is on and adjust that queue's
  *	counters.  Unlike vm_pageq_remove(), the page daemon is never
  *	woken up from here.  A page on no queue (PQ_NONE) is ignored.
  *
  *	The queue containing the given page must be locked.
  *	This routine may not block.
  */
 void
 vm_pageq_remove_nowakeup(vm_page_t m)
 {
 	struct vpgqueues *old;
 	int q;
 
 	q = VM_PAGE_GETQUEUE(m);
 	if (q == PQ_NONE)
 		return;
 
 	old = &vm_page_queues[q];
 	VM_PAGE_SETQUEUE2(m, PQ_NONE);
 	TAILQ_REMOVE(&old->pl, m, pageq);
 	--(*old->cnt);
 	--old->lcnt;
 }
 
 /*
  *	vm_pageq_remove:
  *
  *	Take a page off whatever queue it is on and adjust that queue's
  *	counters.  If the page came from the cache queue and paging is
  *	needed, the page daemon is woken up.  A page on no queue
  *	(PQ_NONE) is ignored.
  *
  *	The queue containing the given page must be locked.
  *	This routine may not block.
  */
 void
 vm_pageq_remove(vm_page_t m)
 {
 	struct vpgqueues *old;
 	int q;
 
 	q = VM_PAGE_GETQUEUE(m);
 	if (q == PQ_NONE)
 		return;
 
 	VM_PAGE_SETQUEUE2(m, PQ_NONE);
 	old = &vm_page_queues[q];
 	TAILQ_REMOVE(&old->pl, m, pageq);
 	--(*old->cnt);
 	--old->lcnt;
 
 	if (VM_PAGE_RESOLVEQUEUE(m, q) == PQ_CACHE && vm_paging_needed())
 		pagedaemon_wakeup();
 }
 
 #ifndef PQ_NOOPT
 
 /*
  *	_vm_pageq_find:
  *
  *	Fallback search used by vm_pageq_find() when the queue for the
  *	requested color is empty.  Starting half the color space away
  *	from "index" and moving inwards, look at the queues on either
  *	side of the requested color and return the first page found, or
  *	NULL if all of them are empty.  The queue at "index" itself is
  *	not examined here.
  *
  *	The page coloring optimization attempts to locate a page
  *	that does not overload other nearby pages in the object in
  *	the cpu's L2 cache.  We need this optimization because cpu
  *	caches tend to be physical caches, while object spaces tend
  *	to be virtual.
  *
  *	The specified queue must be locked.
  *	This routine may not block.
  *
  *	This routine may only be called from the vm_pageq_find()
  *	function in this file.
  */
 static inline vm_page_t
 _vm_pageq_find(int basequeue, int index)
 {
 	struct vpgqueues *base = &vm_page_queues[basequeue];
 	vm_page_t found;
 	int dist;
 
 	/*
 	 * On the first pass index+dist and index-dist name the same
 	 * queue.  That is wasteful, but the fast path has already been
 	 * missed by the time we get here, so it does not matter.
 	 */
 	for (dist = PQ_NUMCOLORS / 2; dist > 0; dist--) {
 		found = TAILQ_FIRST(&base[(index + dist) & PQ_COLORMASK].pl);
 		if (found != NULL)
 			return (found);
 
 		found = TAILQ_FIRST(&base[(index - dist) & PQ_COLORMASK].pl);
 		if (found != NULL)
 			return (found);
 	}
 	return (NULL);
 }
 #endif /* PQ_NOOPT */
 
 /*
  *	vm_pageq_find:
  *
  *	Return a page from the queue selected by "basequeue" and the
  *	color "index", or NULL if none is available.  When "prefer_zero"
  *	is set the page is taken from the tail of the queue, otherwise
  *	from the head.
  *
  *	With coloring active (more than one color) an empty queue for the
  *	requested color falls back to _vm_pageq_find(), which searches
  *	the other colors.  Without coloring, "index" is ignored and only
  *	the base queue is consulted.
  */
 vm_page_t
 vm_pageq_find(int basequeue, int index, boolean_t prefer_zero)
 {
 	struct vpgqueues *vpq;
 	vm_page_t m;
 
 #ifndef PQ_NOOPT
 	if (PQ_NUMCOLORS > 1) {
 		vpq = &vm_page_queues[basequeue + index];
 		m = prefer_zero ? TAILQ_LAST(&vpq->pl, pglist) :
 		    TAILQ_FIRST(&vpq->pl);
 		if (m == NULL)
 			m = _vm_pageq_find(basequeue, index);
 		return (m);
 	}
 #endif
 	vpq = &vm_page_queues[basequeue];
 	m = prefer_zero ? TAILQ_LAST(&vpq->pl, pglist) :
 	    TAILQ_FIRST(&vpq->pl);
 	return (m);
 }