Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c	(revision 71982)
+++ head/sys/amd64/amd64/machdep.c	(revision 71983)
@@ -1,2529 +1,2530 @@
/*-
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #include #include #ifdef SMP #include #endif #ifdef PERFMON #include #endif #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; static void osendsig __P((sig_t catcher, int sig, sigset_t *mask, u_long code)); static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS) { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). 
 */
	error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr,
		msgbufp->msg_size-msgbufp->msg_bufr,req);
	if(error) return(error);
	if(msgbufp->msg_bufr>0) {
		error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr,
			msgbufp->msg_bufr,req);
	}
	return(error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD,
	0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer");

static int msgbuf_clear;

static int
sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS)
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
	if (!error && req->newptr) {
		/* Clear the buffer and reset write pointer */
		bzero(msgbufp->msg_ptr,msgbufp->msg_size);
		msgbufp->msg_bufr=msgbufp->msg_bufx=0;
		msgbuf_clear=0;
	}
	return (error);
}

SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW,
	&msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I",
	"Clear kernel message buffer");

int bootverbose = 0, Maxmem = 0;
long dumplo;

vm_offset_t phys_avail[10];

/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)

static vm_offset_t buffer_sva, buffer_eva;
vm_offset_t clean_sva, clean_eva;
static vm_offset_t pager_sva, pager_eva;

static struct trapframe proc0_tf;
#ifndef SMP
static struct globaldata __globaldata;
#endif

struct cpuhead cpuhead;

struct mtx sched_lock;
struct mtx Giant;

static void
cpu_startup(dummy)
	void *dummy;
{
	register unsigned i;
	register caddr_t v;
	vm_offset_t maxaddr;
	vm_size_t size = 0;
	int firstaddr;
	vm_offset_t minaddr;
	int physmem_est;

	if (boothowto & RB_VERBOSE)
		bootverbose++;

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	printf("%s", version);
	earlysetcpuclass();
	startrtclock();
	printcpuinfo();
	panicifcpuunsupported();
#ifdef PERFMON
	perfmon_init();
#endif
	printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem),
	    ptoa(Maxmem) / 1024);
	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			unsigned int size1 = phys_avail[indx + 1] - phys_avail[indx];

			printf("0x%08x - 0x%08x, %u bytes (%u pages)\n",
			    phys_avail[indx], phys_avail[indx + 1] - 1, size1,
			    size1 / PAGE_SIZE);
		}
	}

	/*
	 * Calculate callout wheel size
	 */
	for (callwheelsize = 1, callwheelbits = 0;
	     callwheelsize < ncallout;
	     callwheelsize <<= 1, ++callwheelbits)
		;
	callwheelmask = callwheelsize - 1;

	/*
	 * Allocate space for system data structures.
	 * The first available kernel virtual address is in "v".
	 * As pages of kernel virtual memory are allocated, "v" is incremented.
	 * As pages of memory are allocated and cleared,
	 * "firstaddr" is incremented.
	 * An index into the kernel page table corresponding to the
	 * virtual memory address maintained in "v" is kept in "mapaddr".
	 */

	/*
	 * Make two passes.  The first pass calculates how much memory is
	 * needed and allocates it.  The second pass assigns virtual
	 * addresses to the various data structures.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;

#define	valloc(name, type, num) \
	    (name) = (type *)v; v = (caddr_t)((name)+(num))
#define	valloclim(name, type, num, lim) \
	    (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))

	valloc(callout, struct callout, ncallout);
	valloc(callwheel, struct callout_tailq, callwheelsize);

	/*
	 * Discount the physical memory larger than the size of kernel_map
	 * to avoid eating up all of KVA space.
	 */
	if (kernel_map->first_free == NULL) {
		printf("Warning: no free entries in kernel_map.\n");
		physmem_est = physmem;
	} else
		physmem_est = min(physmem,
		    kernel_map->max_offset - kernel_map->min_offset);

	/*
	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
	 * For the first 64MB of ram nominally allocate sufficient buffers to
	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
	 * buffers to cover 1/20 of our ram over 64MB.
	 *
	 * factor represents the 1/4 x ram conversion.
	 */
	if (nbuf == 0) {
		int factor = 4 * BKVASIZE / PAGE_SIZE;

		nbuf = 50;
		if (physmem_est > 1024)
			nbuf += min((physmem_est - 1024) / factor,
			    16384 / factor);
		if (physmem_est > 16384)
			nbuf += (physmem_est - 16384) * 2 / (factor * 5);
	}

	/*
	 * Do not allow the buffer_map to be more then 1/2 the size of the
	 * kernel_map.
	 */
	if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) /
	    (BKVASIZE * 2)) {
		nbuf = (kernel_map->max_offset - kernel_map->min_offset) /
		    (BKVASIZE * 2);
		printf("Warning: nbufs capped at %d\n", nbuf);
	}

	nswbuf = max(min(nbuf/4, 256), 16);

	valloc(swbuf, struct buf, nswbuf);
	valloc(buf, struct buf, nbuf);
	v = bufhashinit(v);

	/*
	 * End of first pass, size has been calculated so allocate memory
	 */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = (int)kmem_alloc(kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}

	/*
	 * End of second pass, addresses have been assigned
	 */
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");

	clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva,
			(nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size);
	buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva,
				(nbuf*BKVASIZE));
+	buffer_map->system_map = 1;
	pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva,
				(nswbuf*MAXPHYS) + pager_map_size);
	pager_map->system_map = 1;
	exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr,
				(16*(ARG_MAX+(PAGE_SIZE*3))));

	/*
	 * XXX: Mbuf system machine-specific initializations should
	 *	go here, if anywhere.
	 */

	/*
	 * Initialize callouts
	 */
	SLIST_INIT(&callfree);
	for (i = 0; i < ncallout; i++) {
		callout_init(&callout[i], 0);
		callout[i].c_flags = CALLOUT_LOCAL_ALLOC;
		SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle);
	}

	for (i = 0; i < callwheelsize; i++) {
		TAILQ_INIT(&callwheel[i]);
	}

	mtx_init(&callout_lock, "callout", MTX_SPIN | MTX_RECURSE);

#if defined(USERCONFIG)
	userconfig();
	cninit();		/* the preferred console may have changed */
#endif

	printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count),
	    ptoa(cnt.v_free_count) / 1024);

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	SLIST_INIT(&cpuhead);
	SLIST_INSERT_HEAD(&cpuhead, GLOBALDATA, gd_allcpu);

#ifdef SMP
	/*
	 * OK, enough kmem_alloc/malloc state should be up, lets get on with it!
	 */
	mp_start();			/* fire up the APs and APICs */
	mp_announce();
#endif  /* SMP */
	cpu_setregs();
}

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by kcall
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
*/ static void osendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct osigframe sf; struct osigframe *fp; struct proc *p; struct sigacts *psp; struct trapframe *regs; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; regs = p->p_md.md_regs; oonstack = sigonstack(regs->tf_esp); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *fp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)fp) == 0 || !useracc((caddr_t)fp, sizeof(*fp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); PROC_UNLOCK(p); psignal(p, SIGILL); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. 
*/ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct sigframe sf; struct proc *p; struct sigacts *psp; struct trapframe *regs; struct sigframe *sfp; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; if (SIGISMEMBER(psp->ps_osigset, sig)) { PROC_UNLOCK(p); osendsig(catcher, sig, mask, code); return; } regs = p->p_md.md_regs; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *sfp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)sfp) == 0 || !useracc((caddr_t)sfp, sizeof(*sfp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG printf("process %d has trashed its stack\n", p->p_pid); #endif PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); PROC_UNLOCK(p); psignal(p, SIGILL); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill siginfo structure. */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void *)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. 
* We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. * * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int osigreturn(p, uap) struct proc *p; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct trapframe *regs; struct osigcontext *scp; int eflags; regs = p->p_md.md_regs; scp = uap->sigcntxp; if (!useracc((caddr_t)scp, sizeof(*scp), VM_PROT_READ)) return (EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (scp->sc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(p->p_sigmask, scp->sc_mask); SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return (EJUSTRETURN); } int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap; { struct trapframe *regs; ucontext_t *ucp; int cs, eflags; ucp = uap->sigcntxp; if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ)) return (EFAULT); if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516) return (osigreturn(p, (struct osigreturn_args *)uap)); /* * Since ucp is not an osigcontext but a ucontext_t, we have to * check again if all of it is accessible. A ucontext_t is * much larger, so instead of just checking for the pointer * being valid for the size of an osigcontext, now check for * it being valid for a whole, new-style ucontext_t. */ if (!useracc((caddr_t)ucp, sizeof(*ucp), VM_PROT_READ)) return (EFAULT); regs = p->p_md.md_regs; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (ucp->uc_mcontext.mc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif p->p_sigmask = ucp->uc_sigmask; SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. This currently only works in * the !SMP case, as there is no clean way to ensure that a CPU will be * woken when there is work available for it. */ static int cpu_idle_hlt = 1; SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); /* * Note that we have to be careful here to avoid a race between checking * procrunnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. */ void cpu_idle(void) { #ifndef SMP if (cpu_idle_hlt) { disable_intr(); if (procrunnable()) enable_intr(); else { enable_intr(); __asm __volatile("hlt"); } } #endif } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_md.md_regs; struct pcb *pcb = &p->p_addr->u_pcb; #ifdef USER_LDT /* was i386_user_cleanup() in NetBSD */ user_ldt_free(pcb); #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ if (pcb == PCPU_GET(curpcb)) load_gs(_udatasel); else pcb->pcb_gs = _udatasel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. 
*/ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #ifdef DEV_NPX /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifndef I386_CPU cr0 |= CR0_WP | CR0_AM; #endif load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 
/* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 9 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 
1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } #define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. 
*/ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; struct vm86frame vmf; struct vm86context vmc; vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; const char *cp; struct bios_smap *smap; bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. 
*/ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. 
*/ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa(Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. */ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { int x; struct gate_descriptor *gdp; int gsel_tss; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int off; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { printf("WARNING: loader(8) metadata is missing!\n"); } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct privatespace)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0].globaldata; #else gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct globaldata)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &__globaldata; gdt_segs[GPROC0_SEL].ssd_base = (int) &__globaldata.gd_common_tss; __globaldata.gd_prvspace = &__globaldata; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* setup curproc so that mutexes work */ PCPU_SET(curproc, &proc0); LIST_INIT(&proc0.p_heldmtx); LIST_INIT(&proc0.p_contested); mtx_init(&sched_lock, "sched lock", MTX_SPIN | MTX_RECURSE); #ifdef SMP /* * Interrupts can happen very early, so initialize imen_mtx here, rather * than in init_locks(). */ mtx_init(&imen_mtx, "imen", MTX_SPIN); #endif /* * Giant is used early for at least debugger traps and unexpected traps. */ mtx_init(&Giant, "Giant", MTX_DEF | MTX_RECURSE); mtx_enter(&Giant, MTX_DEF); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT PCPU_SET(currentldt, _default_ldt); #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * We need this mutex before the console probe. */ mtx_init(&clock_lock, "clk", MTX_SPIN | MTX_RECURSE); /* * Initialize the console before we print anything out. */ cninit(); #ifdef DEV_ISA isa_defaultirq(); #endif #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! 
*/ PCPU_SET(common_tss.tss_esp0, (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_schednest = 0; proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if 
((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFL_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); return (0); } int fill_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; pcb = &p->p_addr->u_pcb; dbregs->dr0 = pcb->pcb_dr0; dbregs->dr1 = pcb->pcb_dr1; dbregs->dr2 = pcb->pcb_dr2; dbregs->dr3 = pcb->pcb_dr3; dbregs->dr4 = 0; dbregs->dr5 = 0; dbregs->dr6 = pcb->pcb_dr6; dbregs->dr7 = pcb->pcb_dr7; return (0); } int set_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; int i; u_int32_t mask1, mask2; /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; i++, mask1 <<= 2, mask2 <<= 2) if ((dbregs->dr7 & mask1) == mask2) return (EINVAL); if (dbregs->dr7 & 0x0000fc00) return (EINVAL); pcb = &p->p_addr->u_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). 
Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space, unless, perhaps, we were called by * uid 0. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (suser(p) != 0) { if (dbregs->dr7 & 0x3) { /* dr0 is enabled */ if (dbregs->dr0 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<2)) { /* dr1 is enabled */ if (dbregs->dr1 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<4)) { /* dr2 is enabled */ if (dbregs->dr2 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<6)) { /* dr3 is enabled */ if (dbregs->dr3 >= VM_MAXUSER_ADDRESS) return (EINVAL); } } pcb->pcb_dr0 = dbregs->dr0; pcb->pcb_dr1 = dbregs->dr1; pcb->pcb_dr2 = dbregs->dr2; pcb->pcb_dr3 = dbregs->dr3; pcb->pcb_dr6 = dbregs->dr6; pcb->pcb_dr7 = dbregs->dr7; pcb->pcb_flags |= PCB_DBREGS; return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct bio *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->bio_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->bio_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->bio_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->bio_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->bio_blkno + p->p_offset <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->bio_blkno < 0 || bp->bio_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->bio_blkno == maxsz) { bp->bio_resid = bp->bio_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->bio_blkno; if (sz <= 0) { bp->bio_error = EINVAL; goto bad; } bp->bio_bcount = sz << DEV_BSHIFT; } bp->bio_pblkno = bp->bio_blkno + p->p_offset; return(1); bad: bp->bio_flags |= BIO_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 71982) +++ head/sys/i386/i386/machdep.c (revision 71983) @@ -1,2529 +1,2530 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 * $FreeBSD$ */ #include "opt_atalk.h" #include "opt_compat.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_isa.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_npx.h" #include "opt_perfmon.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #include #include #ifdef SMP #include #endif #ifdef PERFMON #include #endif #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) #define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; static void osendsig __P((sig_t catcher, int sig, sigset_t *mask, u_long code)); static int sysctl_hw_physmem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); static int sysctl_hw_availpages(SYSCTL_HANDLER_ARGS) { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf(SYSCTL_HANDLER_ARGS) { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). 
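The handler that follows linearizes the circular kernel message buffer by exporting the span from the split index to the end of the buffer first, then the span from the start of the buffer up to that index. A minimal userspace sketch of the same unwinding, with hypothetical names (ring, size and widx stand in for msg_ptr, msg_size and the split index msg_bufr), not the kernel code itself:

#include <stdio.h>
#include <string.h>

/*
 * Copy a circular buffer out in linear (oldest-first) order.  This
 * mirrors the two sysctl_handle_opaque() calls below: first the tail
 * [widx, size), then the head [0, widx).
 */
static size_t
unwind_ring(const char *ring, size_t size, size_t widx, char *out)
{
    size_t n = 0;

    memcpy(out + n, ring + widx, size - widx);
    n += size - widx;
    memcpy(out + n, ring, widx);
    n += widx;
    return (n);
}

int
main(void)
{
    char ring[8] = { 'F', 'G', 'H', 'A', 'B', 'C', 'D', 'E' };
    char out[9] = { 0 };

    unwind_ring(ring, sizeof(ring), 3, out);
    printf("%s\n", out);    /* prints ABCDEFGH */
    return (0);
}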
*/ error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr, msgbufp->msg_size-msgbufp->msg_bufr,req); if(error) return(error); if(msgbufp->msg_bufr>0) { error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr, msgbufp->msg_bufr,req); } return(error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer"); static int msgbuf_clear; static int sysctl_machdep_msgbuf_clear(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { /* Clear the buffer and reset write pointer */ bzero(msgbufp->msg_ptr,msgbufp->msg_size); msgbufp->msg_bufr=msgbufp->msg_bufx=0; msgbuf_clear=0; } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW, &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I", "Clear kernel message buffer"); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; static struct trapframe proc0_tf; #ifndef SMP static struct globaldata __globaldata; #endif struct cpuhead cpuhead; struct mtx sched_lock; struct mtx Giant; static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; int physmem_est; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf("%s", version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { unsigned int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); /* * Discount the physical memory larger than the size of kernel_map * to avoid eating up all of KVA space. 
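The buffer-cache sizing that follows caps the memory estimate by the size of the kernel map and then derives nbuf in two tiers, with a final cap so buffer KVA stays under half of the kernel map. A compilable sketch of that arithmetic; PAGE_SIZE and BKVASIZE values are assumed here, and the estimate is treated as a page count:

#include <stdio.h>

/* Illustrative values; the kernel takes these from its param headers. */
#define PAGE_SIZE   4096
#define BKVASIZE    16384

/*
 * Mirror of the nbuf heuristic below: a floor of 50 buffers, a first
 * tier proportional to memory up to 16384 pages, a smaller second
 * tier beyond that, then a cap of half the kernel map's KVA.
 */
static int
size_nbuf(int physmem_est /* pages */, unsigned long kva_bytes)
{
    int factor = 4 * BKVASIZE / PAGE_SIZE;
    int nbuf = 50;
    int add;

    if (physmem_est > 1024) {
        /* first tier, capped at the first 16384 pages */
        add = (physmem_est - 1024) / factor;
        if (add > 16384 / factor)
            add = 16384 / factor;
        nbuf += add;
    }
    if (physmem_est > 16384)
        /* second tier: a smaller share of the memory beyond that */
        nbuf += (physmem_est - 16384) * 2 / (factor * 5);
    /* never let buffer KVA exceed half of the kernel map */
    if (nbuf > (int)(kva_bytes / (BKVASIZE * 2)))
        nbuf = kva_bytes / (BKVASIZE * 2);
    return (nbuf);
}

int
main(void)
{
    /* 128MB of RAM (32768 pages) and a 256MB kernel map, both assumed. */
    printf("nbuf = %d\n", size_nbuf(32768, 256UL << 20));
    return (0);
}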
*/ if (kernel_map->first_free == NULL) { printf("Warning: no free entries in kernel_map.\n"); physmem_est = physmem; } else physmem_est = min(physmem, kernel_map->max_offset - kernel_map->min_offset); /* * The nominal buffer size (and minimum KVA allocation) is BKVASIZE. * For the first 64MB of ram nominally allocate sufficient buffers to * cover 1/4 of our ram. Beyond the first 64MB allocate additional * buffers to cover 1/20 of our ram over 64MB. * * factor represents the 1/4 x ram conversion. */ if (nbuf == 0) { int factor = 4 * BKVASIZE / PAGE_SIZE; nbuf = 50; if (physmem_est > 1024) nbuf += min((physmem_est - 1024) / factor, 16384 / factor); if (physmem_est > 16384) nbuf += (physmem_est - 16384) * 2 / (factor * 5); } /* * Do not allow the buffer_map to be more then 1/2 the size of the * kernel_map. */ if (nbuf > (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2)) { nbuf = (kernel_map->max_offset - kernel_map->min_offset) / (BKVASIZE * 2); printf("Warning: nbufs capped at %d\n", nbuf); } nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); v = bufhashinit(v); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); + buffer_map->system_map = 1; pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * XXX: Mbuf system machine-specific initializations should * go here, if anywhere. */ /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i], 0); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } mtx_init(&callout_lock, "callout", MTX_SPIN | MTX_RECURSE); #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); SLIST_INIT(&cpuhead); SLIST_INSERT_HEAD(&cpuhead, GLOBALDATA, gd_allcpu); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ cpu_setregs(); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. 
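osendsig() and sendsig() below place the signal frame either at the top of the alternate signal stack or immediately below the interrupted user stack pointer. A small sketch of just that placement decision; the type and function names here (sigaltstk, place_sigframe) are illustrative stand-ins, not kernel types:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the per-process alternate signal stack. */
struct sigaltstk {
    uintptr_t ss_sp;    /* base of the alternate stack */
    size_t    ss_size;  /* size of the alternate stack */
};

/*
 * Pick the signal frame address the way the handlers below do: on the
 * alternate stack if one is configured, requested for this signal and
 * not already in use, otherwise just below the interrupted %esp.
 */
static uintptr_t
place_sigframe(const struct sigaltstk *ss, int use_altstack, int onstack,
    uintptr_t user_esp, size_t framesz)
{
    if (use_altstack && !onstack)
        return (ss->ss_sp + ss->ss_size - framesz);
    return (user_esp - framesz);
}

int
main(void)
{
    struct sigaltstk ss = { 0x20000000, 0x10000 };  /* assumed layout */

    printf("altstack frame at %#lx\n",
        (unsigned long)place_sigframe(&ss, 1, 0, 0xbfbfe000, 0x200));
    printf("normal frame at   %#lx\n",
        (unsigned long)place_sigframe(&ss, 0, 0, 0xbfbfe000, 0x200));
    return (0);
}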
*/ static void osendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct osigframe sf; struct osigframe *fp; struct proc *p; struct sigacts *psp; struct trapframe *regs; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; regs = p->p_md.md_regs; oonstack = sigonstack(regs->tf_esp); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { fp = (struct osigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct osigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else fp = (struct osigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *fp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)fp) == 0 || !useracc((caddr_t)fp, sizeof(*fp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); PROC_UNLOCK(p); psignal(p, SIGILL); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_arg2 = (register_t)&fp->sf_siginfo; sf.sf_siginfo.si_signo = sig; sf.sf_siginfo.si_code = code; sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher; } else { /* Old FreeBSD-style arguments. */ sf.sf_arg2 = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* Save most if not all of trap frame. */ sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax; sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx; sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx; sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx; sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi; sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi; sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs; sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds; sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss; sf.sf_siginfo.si_sc.sc_es = regs->tf_es; sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs; sf.sf_siginfo.si_sc.sc_gs = rgs(); sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp; /* Build the signal context to be used by osigreturn(). */ sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0; SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask); sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp; sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp; sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip; sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags; sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno; sf.sf_siginfo.si_sc.sc_err = regs->tf_err; /* * If we're a vm86 process, we want to save the segment registers. * We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { /* XXX confusing names: `tf' isn't a trapframe; `regs' is. 
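When the interrupted context is a vm86 one without VME support, the signal context receives the software-emulated VIF/VIP bits merged into the hardware eflags image, as the assignment to sc_ps below does. A minimal sketch of that merge, using the architectural i386 EFLAGS bit values:

#include <stdio.h>

/* i386 EFLAGS bits, as defined in the kernel's psl.h */
#define PSL_VIF 0x00080000  /* virtual interrupt flag */
#define PSL_VIP 0x00100000  /* virtual interrupt pending */

/*
 * Build the eflags value reported to a vm86 signal handler: hardware
 * bits from the trapframe, VIF/VIP from the software copy kept in the
 * vm86 state.
 */
static unsigned int
vm86_report_eflags(unsigned int tf_eflags, unsigned int vm86_eflags)
{
    return ((tf_eflags & ~(PSL_VIF | PSL_VIP)) |
        (vm86_eflags & (PSL_VIF | PSL_VIP)));
}

int
main(void)
{
    /* Hardware image with VIP set, software copy with VIF set. */
    printf("%#x\n", vm86_report_eflags(0x00100246, 0x00080200));
    return (0);
}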
*/ struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs; sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs; sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es; sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_siginfo.si_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* See sendsig() for comments. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(*fp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)fp; regs->tf_eip = PS_STRINGS - szosigcode; regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } void sendsig(catcher, sig, mask, code) sig_t catcher; int sig; sigset_t *mask; u_long code; { struct sigframe sf; struct proc *p; struct sigacts *psp; struct trapframe *regs; struct sigframe *sfp; int oonstack; p = curproc; PROC_LOCK(p); psp = p->p_sigacts; if (SIGISMEMBER(psp->ps_osigset, sig)) { PROC_UNLOCK(p); osendsig(catcher, sig, mask, code); return; } regs = p->p_md.md_regs; oonstack = sigonstack(regs->tf_esp); /* Save user context. */ bzero(&sf, sizeof(sf)); sf.sf_uc.uc_sigmask = *mask; sf.sf_uc.uc_stack = p->p_sigstk; sf.sf_uc.uc_stack.ss_flags = (p->p_flag & P_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0; sf.sf_uc.uc_mcontext.mc_gs = rgs(); bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs)); /* Allocate and validate space for the signal handler context. */ if ((p->p_flag & P_ALTSTACK) != 0 && !oonstack && SIGISMEMBER(psp->ps_sigonstack, sig)) { sfp = (struct sigframe *)(p->p_sigstk.ss_sp + p->p_sigstk.ss_size - sizeof(struct sigframe)); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) p->p_sigstk.ss_flags |= SS_ONSTACK; #endif } else sfp = (struct sigframe *)regs->tf_esp - 1; PROC_UNLOCK(p); /* * grow_stack() will return 0 if *sfp does not fit inside the stack * and the stack can not be grown. * useracc() will return FALSE if access is denied. */ if (grow_stack(p, (int)sfp) == 0 || !useracc((caddr_t)sfp, sizeof(*sfp), VM_PROT_WRITE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ #ifdef DEBUG printf("process %d has trashed its stack\n", p->p_pid); #endif PROC_LOCK(p); SIGACTION(p, SIGILL) = SIG_DFL; SIGDELSET(p->p_sigignore, SIGILL); SIGDELSET(p->p_sigcatch, SIGILL); SIGDELSET(p->p_sigmask, SIGILL); PROC_UNLOCK(p); psignal(p, SIGILL); return; } /* Translate the signal if appropriate. */ if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)]; /* Build the argument list for the signal handler. */ sf.sf_signum = sig; sf.sf_ucontext = (register_t)&sfp->sf_uc; PROC_LOCK(p); if (SIGISMEMBER(p->p_sigacts->ps_siginfo, sig)) { /* Signal handler installed with SA_SIGINFO. */ sf.sf_siginfo = (register_t)&sfp->sf_si; sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; /* Fill siginfo structure. */ sf.sf_si.si_signo = sig; sf.sf_si.si_code = code; sf.sf_si.si_addr = (void *)regs->tf_err; } else { /* Old FreeBSD-style arguments. */ sf.sf_siginfo = code; sf.sf_addr = regs->tf_err; sf.sf_ahu.sf_handler = catcher; } PROC_UNLOCK(p); /* * If we're a vm86 process, we want to save the segment registers. 
* We also change eflags to be our emulated eflags, not the actual * eflags. */ if (regs->tf_eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs; sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs; sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es; sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds; if (vm86->vm86_has_vme == 0) sf.sf_uc.uc_mcontext.mc_eflags = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP)); /* * We should never have PSL_T set when returning from vm86 * mode. It may be set here if we deliver a signal before * getting to vm86 mode, so turn it off. * * Clear PSL_NT to inhibit T_TSSFLT faults on return from * syscalls made by the signal handler. This just avoids * wasting time for our lazy fixup of such faults. PSL_NT * does nothing in vm86 mode, but vm86 programs can set it * almost legitimately in probes for old cpu types. */ tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP); } /* Copy the sigframe out to the user's stack. */ if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); } regs->tf_esp = (int)sfp; regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode); regs->tf_cs = _ucodesel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; load_gs(_udatasel); regs->tf_ss = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int osigreturn(p, uap) struct proc *p; struct osigreturn_args /* { struct osigcontext *sigcntxp; } */ *uap; { struct trapframe *regs; struct osigcontext *scp; int eflags; regs = p->p_md.md_regs; scp = uap->sigcntxp; if (!useracc((caddr_t)scp, sizeof(*scp), VM_PROT_READ)) return (EFAULT); eflags = scp->sc_ps; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } tf->tf_vm86_ds = scp->sc_ds; tf->tf_vm86_es = scp->sc_es; tf->tf_vm86_fs = scp->sc_fs; tf->tf_vm86_gs = scp->sc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ if (!CS_SECURE(scp->sc_cs)) { trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } regs->tf_ds = scp->sc_ds; regs->tf_es = scp->sc_es; regs->tf_fs = scp->sc_fs; } /* Restore remaining registers. */ regs->tf_eax = scp->sc_eax; regs->tf_ebx = scp->sc_ebx; regs->tf_ecx = scp->sc_ecx; regs->tf_edx = scp->sc_edx; regs->tf_esi = scp->sc_esi; regs->tf_edi = scp->sc_edi; regs->tf_cs = scp->sc_cs; regs->tf_ss = scp->sc_ss; regs->tf_isp = scp->sc_isp; PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (scp->sc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif SIGSETOLD(p->p_sigmask, scp->sc_mask); SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); regs->tf_ebp = scp->sc_fp; regs->tf_esp = scp->sc_sp; regs->tf_eip = scp->sc_pc; regs->tf_eflags = eflags; return (EJUSTRETURN); } int sigreturn(p, uap) struct proc *p; struct sigreturn_args /* { ucontext_t *sigcntxp; } */ *uap; { struct trapframe *regs; ucontext_t *ucp; int cs, eflags; ucp = uap->sigcntxp; if (!useracc((caddr_t)ucp, sizeof(struct osigcontext), VM_PROT_READ)) return (EFAULT); if (((struct osigcontext *)ucp)->sc_trapno == 0x01d516) return (osigreturn(p, (struct osigreturn_args *)uap)); /* * Since ucp is not an osigcontext but a ucontext_t, we have to * check again if all of it is accessible. A ucontext_t is * much larger, so instead of just checking for the pointer * being valid for the size of an osigcontext, now check for * it being valid for a whole, new-style ucontext_t. */ if (!useracc((caddr_t)ucp, sizeof(*ucp), VM_PROT_READ)) return (EFAULT); regs = p->p_md.md_regs; eflags = ucp->uc_mcontext.mc_eflags; if (eflags & PSL_VM) { struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs; struct vm86_kernel *vm86; /* * if pcb_ext == 0 or vm86_inited == 0, the user hasn't * set up the vm86 area, and we can't enter vm86 mode. */ if (p->p_addr->u_pcb.pcb_ext == 0) return (EINVAL); vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86; if (vm86->vm86_inited == 0) return (EINVAL); /* Go back to user mode if both flags are set. */ if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) trapsignal(p, SIGBUS, 0); if (vm86->vm86_has_vme) { eflags = (tf->tf_eflags & ~VME_USERCHANGE) | (eflags & VME_USERCHANGE) | PSL_VM; } else { vm86->vm86_eflags = eflags; /* save VIF, VIP */ eflags = (tf->tf_eflags & ~VM_USERCHANGE) | (eflags & VM_USERCHANGE) | PSL_VM; } bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe)); tf->tf_eflags = eflags; tf->tf_vm86_ds = tf->tf_ds; tf->tf_vm86_es = tf->tf_es; tf->tf_vm86_fs = tf->tf_fs; tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs; tf->tf_ds = _udatasel; tf->tf_es = _udatasel; tf->tf_fs = _udatasel; } else { /* * Don't allow users to change privileged or reserved flags. */ /* * XXX do allow users to change the privileged flag PSL_RF. * The cpu sets PSL_RF in tf_eflags for faults. Debuggers * should sometimes set it there too. tf_eflags is kept in * the signal context during signal handling and there is no * other place to remember it, so the PSL_RF bit may be * corrupted by the signal handler without us knowing. 
* Corruption of the PSL_RF bit at worst causes one more or * one less debugger trap, so allowing it is fairly harmless. */ if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) { printf("sigreturn: eflags = 0x%x\n", eflags); return (EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. */ cs = ucp->uc_mcontext.mc_cs; if (!CS_SECURE(cs)) { printf("sigreturn: cs = 0x%x\n", cs); trapsignal(p, SIGBUS, T_PROTFLT); return (EINVAL); } bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs)); } PROC_LOCK(p); #if defined(COMPAT_43) || defined(COMPAT_SUNOS) if (ucp->uc_mcontext.mc_onstack & 1) p->p_sigstk.ss_flags |= SS_ONSTACK; else p->p_sigstk.ss_flags &= ~SS_ONSTACK; #endif p->p_sigmask = ucp->uc_sigmask; SIG_CANTMASK(p->p_sigmask); PROC_UNLOCK(p); return (EJUSTRETURN); } /* * Machine dependent boot() routine * * I haven't seen anything to put here yet * Possibly some stuff might be grafted back here from boot() */ void cpu_boot(int howto) { } /* * Shutdown the CPU as much as possible */ void cpu_halt(void) { for (;;) __asm__ ("hlt"); } /* * Hook to idle the CPU when possible. This currently only works in * the !SMP case, as there is no clean way to ensure that a CPU will be * woken when there is work available for it. */ static int cpu_idle_hlt = 1; SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW, &cpu_idle_hlt, 0, "Idle loop HLT enable"); /* * Note that we have to be careful here to avoid a race between checking * procrunnable() and actually halting. If we don't do this, we may waste * the time between calling hlt and the next interrupt even though there * is a runnable process. */ void cpu_idle(void) { #ifndef SMP if (cpu_idle_hlt) { disable_intr(); if (procrunnable()) enable_intr(); else { enable_intr(); __asm __volatile("hlt"); } } #endif } /* * Clear registers on exec */ void setregs(p, entry, stack, ps_strings) struct proc *p; u_long entry; u_long stack; u_long ps_strings; { struct trapframe *regs = p->p_md.md_regs; struct pcb *pcb = &p->p_addr->u_pcb; #ifdef USER_LDT /* was i386_user_cleanup() in NetBSD */ user_ldt_free(pcb); #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ if (pcb == PCPU_GET(curpcb)) load_gs(_udatasel); else pcb->pcb_gs = _udatasel; /* * Reset the hardware debug registers if they were in use. * They won't have any meaning for the newly exec'd process. */ if (pcb->pcb_flags & PCB_DBREGS) { pcb->pcb_dr0 = 0; pcb->pcb_dr1 = 0; pcb->pcb_dr2 = 0; pcb->pcb_dr3 = 0; pcb->pcb_dr6 = 0; pcb->pcb_dr7 = 0; if (pcb == PCPU_GET(curpcb)) { /* * Clear the debug registers on the running * CPU, otherwise they will end up affecting * the next process we switch to. */ reset_dbregs(); } pcb->pcb_flags &= ~PCB_DBREGS; } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. 
*/ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #ifdef DEV_NPX /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. */ p->p_retval[1] = 0; } void cpu_setregs(void) { unsigned int cr0; cr0 = rcr0(); cr0 |= CR0_NE; /* Done by npxinit() */ cr0 |= CR0_MP | CR0_TS; /* Done at every execve() too. */ #ifndef I386_CPU cr0 |= CR0_WP | CR0_AM; #endif load_cr0(cr0); load_gs(_udatasel); } static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */ static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 
/* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */ { 0x400, /* segment base address */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 9 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE32_SEL 10 BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSCODE16_SEL 11 BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSDATA_SEL 12 BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSUTIL_SEL 13 BIOS 16-bit interface (Utility) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GBIOSARGS_SEL 14 BIOS 16-bit interface (Arguments) */ { 0, /* segment base address (overwritten) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 
1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } #define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/bound pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * If we cannot accurately determine the physical memory map, then use * value from the 0xE801 call, and failing that, the RTC. * * Total memory size may be set by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. 
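getmemsize() below collects the INT 15h/E820 ranges into physmap[] as base/end pairs, skipping overlapping or non-monotonic regions and coalescing a region that starts exactly where the previous one ended. A standalone sketch of that accumulation step; the sample ranges in main() are assumed, not taken from real firmware:

#include <stdio.h>
#include <stdint.h>

#define PHYSMAP_SIZE    (2 * 8)     /* base/end pairs, as above */

/*
 * Accumulate one usable (base, length) range into a physmap-style
 * array of base/end pairs: drop regions that overlap or go backwards,
 * merge regions that extend the previous one, otherwise start a new
 * pair.  Returns the updated pair index, or -1 when the array is full.
 */
static int
physmap_insert(uint64_t physmap[], int idx, uint64_t base, uint64_t length)
{
    int i;

    if (length == 0)
        return (idx);
    for (i = 0; i <= idx; i += 2)
        if (base < physmap[i + 1])
            return (idx);   /* overlapping or non-monotonic */
    if (base == physmap[idx + 1]) {
        physmap[idx + 1] += length; /* extends the previous range */
        return (idx);
    }
    if (idx + 2 == PHYSMAP_SIZE)
        return (-1);        /* too many segments */
    idx += 2;
    physmap[idx] = base;
    physmap[idx + 1] = base + length;
    return (idx);
}

int
main(void)
{
    uint64_t physmap[PHYSMAP_SIZE] = { 0 };
    int idx = 0;

    /* Assumed sample ranges: 639K of base memory, 127MB above 1MB. */
    idx = physmap_insert(physmap, idx, 0x00000000, 0x0009fc00);
    idx = physmap_insert(physmap, idx, 0x00100000, 0x07f00000);
    printf("%d pairs, last ends at %#llx\n", idx / 2 + 1,
        (unsigned long long)physmap[idx + 1]);
    return (0);
}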
*/ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; struct vm86frame vmf; struct vm86context vmc; vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; const char *cp; struct bios_smap *smap; bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. */ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. 
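The mappings below are installed by writing page-table entries directly: the page-aligned physical address OR'ed with the valid/writable (and, for the vm86 page table, user) bits. A small sketch of that PTE composition, using the conventional i386 bit values:

#include <stdio.h>
#include <stdint.h>

/* i386 page-table entry bits, as in the kernel's pmap headers. */
#define PG_V        0x001   /* valid */
#define PG_RW       0x002   /* writable */
#define PG_U        0x004   /* user accessible */
#define PAGE_SHIFT  12
#define PAGE_MASK   ((1u << PAGE_SHIFT) - 1)

/*
 * Build the PTE value the mappings below store: the page-aligned
 * physical address with the access bits OR'ed into the low bits.
 */
static uint32_t
make_pte(uint32_t pa, uint32_t flags)
{
    return ((pa & ~PAGE_MASK) | flags);
}

int
main(void)
{
    /* Page 1, writable by the kernel, as for the vm86 scratch page. */
    printf("pte = %#x\n", make_pte(1 << PAGE_SHIFT, PG_V | PG_RW));
    /* The same page made visible to vm86 (user) code. */
    printf("pte = %#x\n", make_pte(1 << PAGE_SHIFT, PG_V | PG_RW | PG_U));
    return (0);
}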
*/ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = sizeof(struct bios_smap); i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= 0xffffffff) { printf("%uK of memory above 4GB ignored\n", (u_int)(smap->length / 1024)); goto next_run; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] += smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); if (physmap[1] != 0) goto physmap_done; /* * If we failed above, try memory map with INT 15:E801 */ vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory. */ extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; physmap_done: /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". We may adjust this * based on ``hw.physmem'' and the results of the memory test. */ Maxmem = atop(physmap[physmap_idx + 1]); #ifdef MAXMEM Maxmem = MAXMEM / 4; #endif /* * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. This overrides MAXMEM. 
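The hw.physmem handling that follows parses a byte count with an optional k/m/g suffix via deliberate switch fall-through and rejects values that overflow during the shifts. A userspace sketch of the same parsing, with strtoull() standing in for the kernel's strtouq():

#include <stdio.h>
#include <stdlib.h>

/*
 * Parse a memory size with an optional k/m/g suffix: shift left by 10
 * per step of the suffix and treat any value that wrapped below the
 * original number as invalid.
 */
static unsigned long long
parse_physmem(const char *cp)
{
    unsigned long long allow, sanity;
    char *ep;

    sanity = allow = strtoull(cp, &ep, 0);
    if (ep != cp && *ep != '\0') {
        switch (*ep) {
        case 'g': case 'G':
            allow <<= 10;
            /* FALLTHROUGH */
        case 'm': case 'M':
            allow <<= 10;
            /* FALLTHROUGH */
        case 'k': case 'K':
            allow <<= 10;
            break;
        default:
            allow = sanity = 0;
        }
        if (allow < sanity)
            allow = 0;  /* overflowed: reject */
    }
    return (allow);
}

int
main(void)
{
    printf("%llu\n", parse_physmem("64m"));     /* 67108864 */
    printf("%llu\n", parse_physmem("128k"));    /* 131072 */
    printf("%llu\n", parse_physmem("1x"));      /* 0: bad suffix */
    return (0);
}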
*/ if ((cp = getenv("hw.physmem")) != NULL) { u_int64_t AllowMem, sanity; char *ep; sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Ignoring invalid memory size of '%s'\n", cp); else Maxmem = atop(AllowMem); } if (atop(physmap[physmap_idx + 1]) != Maxmem && (boothowto & RB_VERBOSE)) printf("Physical memory use set to %uK\n", Maxmem * 4); /* * If Maxmem has been increased beyond what the system has detected, * extend the last memory segment to the new limit. */ if (atop(physmap[physmap_idx + 1]) < Maxmem) physmap[physmap_idx + 1] = ptoa(Maxmem); /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. */ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
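The loop that follows pops trailing phys_avail chunks too small to hold a page plus the message buffer, then carves the buffer off the top of the last remaining chunk. A sketch of that reservation with an assumed MSGBUF_SIZE and a made-up two-chunk layout:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE   4096
#define MSGBUF_SIZE 32768       /* illustrative; really set via opt_msgbuf.h */
#define round_page(x)   (((x) + PAGE_SIZE - 1) & ~(uint32_t)(PAGE_SIZE - 1))

/*
 * Discard trailing phys_avail chunks that cannot hold a page plus the
 * message buffer, trim the buffer off the last remaining chunk, and
 * return that chunk's new end (avail_end).
 */
static uint32_t
reserve_msgbuf(uint32_t phys_avail[], int pa_indx)
{
    while (phys_avail[pa_indx - 1] + PAGE_SIZE +
        round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) {
        phys_avail[pa_indx--] = 0;
        phys_avail[pa_indx--] = 0;
    }
    phys_avail[pa_indx] -= round_page(MSGBUF_SIZE);
    return (phys_avail[pa_indx]);
}

int
main(void)
{
    /* Two chunks; the trailing one is only a single page (assumed). */
    uint32_t phys_avail[10] = {
        0x00001000, 0x07ff0000,     /* big low chunk */
        0x07ff8000, 0x07ff9000,     /* tiny trailing chunk */
    };

    printf("avail_end = %#x\n", reserve_msgbuf(phys_avail, 3));
    return (0);
}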
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { int x; struct gate_descriptor *gdp; int gsel_tss; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int off; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } else { printf("WARNING: loader(8) metadata is missing!\n"); } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct privatespace)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0].globaldata; #else gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct globaldata)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &__globaldata; gdt_segs[GPROC0_SEL].ssd_base = (int) &__globaldata.gd_common_tss; __globaldata.gd_prvspace = &__globaldata; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* setup curproc so that mutexes work */ PCPU_SET(curproc, &proc0); LIST_INIT(&proc0.p_heldmtx); LIST_INIT(&proc0.p_contested); mtx_init(&sched_lock, "sched lock", MTX_SPIN | MTX_RECURSE); #ifdef SMP /* * Interrupts can happen very early, so initialize imen_mtx here, rather * than in init_locks(). */ mtx_init(&imen_mtx, "imen", MTX_SPIN); #endif /* * Giant is used early for at least debugger traps and unexpected traps. */ mtx_init(&Giant, "Giant", MTX_DEF | MTX_RECURSE); mtx_enter(&Giant, MTX_DEF); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
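The ssd_limit assignments that follow convert an exclusive end address into a page-granular, inclusive segment limit. A one-line sketch of that conversion, with an assumed 3GB end address:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT  12
#define i386_btop(x)    ((uint32_t)(x) >> PAGE_SHIFT)   /* bytes to pages */

/*
 * With the granularity bit set, a descriptor limit is counted in pages
 * and is inclusive, hence the trailing "- 1".
 */
static uint32_t
seg_limit_pages(uint32_t end_addr)
{
    return (i386_btop(end_addr) - 1);
}

int
main(void)
{
    printf("limit = %#x pages\n", seg_limit_pages(0xc0000000));
    return (0);
}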
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT PCPU_SET(currentldt, _default_ldt); #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * We need this mutex before the console probe. */ mtx_init(&clock_lock, "clk", MTX_SPIN | MTX_RECURSE); /* * Initialize the console before we print anything out. */ cninit(); #ifdef DEV_ISA isa_defaultirq(); #endif #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! 
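 *
 * The arithmetic that follows, spelled out:
 *
 *	tss_esp0 = (int)proc0.p_addr + UPAGES * PAGE_SIZE - 16;
 *	tss_ss0  = GSEL(GDATA_SEL, SEL_KPL);
 *
 * i.e. the ring-0 stack pointer is parked at the top of proc0's U
 * area, backed off by 16 bytes of slop, and paired with the kernel
 * data selector; the CPU loads ss0:esp0 from the TSS on every
 * ring 3 -> ring 0 trap, interrupt or syscall gate transition.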
*/ PCPU_SET(common_tss.tss_esp0, (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16); PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL)); gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd); PCPU_SET(common_tssd, *PCPU_GET(tss_gdt)); PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16); ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. */ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; proc0.p_addr->u_pcb.pcb_schednest = 0; proc0.p_addr->u_pcb.pcb_ext = 0; proc0.p_md.md_regs = &proc0_tf; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if 
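/*
 * Purely illustrative sketch of the bounds checking ptrace_read_u_check()
 * performs (range_ok() is an invented name, not a kernel routine):
 *
 *	static int
 *	range_ok(vm_offset_t addr, size_t len, vm_offset_t lo, vm_offset_t hi)
 *	{
 *		if (addr + len < addr)
 *			return (0);
 *		return (addr >= lo && addr + len <= hi);
 *	}
 *
 * The wrap-around test must come first; without it a huge len could make
 * addr + len appear small and slip past the upper bound.
 */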
((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFL_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); return (0); } int fill_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; pcb = &p->p_addr->u_pcb; dbregs->dr0 = pcb->pcb_dr0; dbregs->dr1 = pcb->pcb_dr1; dbregs->dr2 = pcb->pcb_dr2; dbregs->dr3 = pcb->pcb_dr3; dbregs->dr4 = 0; dbregs->dr5 = 0; dbregs->dr6 = pcb->pcb_dr6; dbregs->dr7 = pcb->pcb_dr7; return (0); } int set_dbregs(p, dbregs) struct proc *p; struct dbreg *dbregs; { struct pcb *pcb; int i; u_int32_t mask1, mask2; /* * Don't let an illegal value for dr7 get set. Specifically, * check for undefined settings. Setting these bit patterns * result in undefined behaviour and can lead to an unexpected * TRCTRAP. */ for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 8; i++, mask1 <<= 2, mask2 <<= 2) if ((dbregs->dr7 & mask1) == mask2) return (EINVAL); if (dbregs->dr7 & 0x0000fc00) return (EINVAL); pcb = &p->p_addr->u_pcb; /* * Don't let a process set a breakpoint that is not within the * process's address space. If a process could do this, it * could halt the system by setting a breakpoint in the kernel * (if ddb was enabled). 
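 *
 * As an aside on the dr7 sanity loop above: bits 16-31 of %dr7 hold
 * eight 2-bit fields, R/W0, LEN0, ... R/W3, LEN3, and the loop
 *
 *	mask1 = 0x3 << 16;  mask2 = 0x2 << 16;
 *	for (i = 0; i < 8; i++, mask1 <<= 2, mask2 <<= 2)
 *		if ((dbregs->dr7 & mask1) == mask2)
 *			return (EINVAL);
 *
 * rejects any field set to binary 10, which is the "undefined setting"
 * referred to above (an I/O breakpoint without CR4.DE, or an
 * unsupported length encoding).
 *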
Thus, we need to check to make sure * that no breakpoints are being enabled for addresses outside * process's address space, unless, perhaps, we were called by * uid 0. * * XXX - what about when the watched area of the user's * address space is written into from within the kernel * ... wouldn't that still cause a breakpoint to be generated * from within kernel mode? */ if (suser(p) != 0) { if (dbregs->dr7 & 0x3) { /* dr0 is enabled */ if (dbregs->dr0 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<2)) { /* dr1 is enabled */ if (dbregs->dr1 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<4)) { /* dr2 is enabled */ if (dbregs->dr2 >= VM_MAXUSER_ADDRESS) return (EINVAL); } if (dbregs->dr7 & (0x3<<6)) { /* dr3 is enabled */ if (dbregs->dr3 >= VM_MAXUSER_ADDRESS) return (EINVAL); } } pcb->pcb_dr0 = dbregs->dr0; pcb->pcb_dr1 = dbregs->dr1; pcb->pcb_dr2 = dbregs->dr2; pcb->pcb_dr3 = dbregs->dr3; pcb->pcb_dr6 = dbregs->dr6; pcb->pcb_dr7 = dbregs->dr7; pcb->pcb_flags |= PCB_DBREGS; return (0); } /* * Return > 0 if a hardware breakpoint has been hit, and the * breakpoint was in user space. Return 0, otherwise. */ int user_dbreg_trap(void) { u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */ u_int32_t bp; /* breakpoint bits extracted from dr6 */ int nbp; /* number of breakpoints that triggered */ caddr_t addr[4]; /* breakpoint addresses */ int i; dr7 = rdr7(); if ((dr7 & 0x000000ff) == 0) { /* * all GE and LE bits in the dr7 register are zero, * thus the trap couldn't have been caused by the * hardware debug registers */ return 0; } nbp = 0; dr6 = rdr6(); bp = dr6 & 0x0000000f; if (!bp) { /* * None of the breakpoint bits are set meaning this * trap was not caused by any of the debug registers */ return 0; } /* * at least one of the breakpoints were hit, check to see * which ones and if any of them are user space addresses */ if (bp & 0x01) { addr[nbp++] = (caddr_t)rdr0(); } if (bp & 0x02) { addr[nbp++] = (caddr_t)rdr1(); } if (bp & 0x04) { addr[nbp++] = (caddr_t)rdr2(); } if (bp & 0x08) { addr[nbp++] = (caddr_t)rdr3(); } for (i=0; i /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct bio *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->bio_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->bio_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->bio_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->bio_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->bio_blkno + p->p_offset <= DOSBBSECTOR && (bp->bio_cmd == BIO_WRITE) && wlabel == 0) { bp->bio_error = EROFS; goto bad; } #endif /* beyond partition? 
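 *
 * Example with made-up geometry: for a 1000-sector partition and a
 * 16-sector request, bio_blkno == 1000 completes as a zero-length EOF
 * (bio_resid = bio_bcount, return 0), bio_blkno == 992 is truncated to
 * the 8 sectors that still fit, and bio_blkno == 1008 fails with EINVAL.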
*/ if (bp->bio_blkno < 0 || bp->bio_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->bio_blkno == maxsz) { bp->bio_resid = bp->bio_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->bio_blkno; if (sz <= 0) { bp->bio_error = EINVAL; goto bad; } bp->bio_bcount = sz << DEV_BSHIFT; } bp->bio_pblkno = bp->bio_blkno + p->p_offset; return(1); bad: bp->bio_flags |= BIO_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 71982) +++ head/sys/kern/vfs_bio.c (revision 71983) @@ -1,3230 +1,3237 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * * $FreeBSD$ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf *buf; /* buffer header pool */ struct swqueue bswlist; struct mtx buftimelock; /* Interlock on setting prio and timo */ static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); static void vfs_backgroundwritedone(struct buf *bp); static int flushbufqueues(void); static int bd_request; static void buf_daemon __P((void)); /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; int vmiodirenable = FALSE; int runningbufspace; static vm_offset_t bogus_offset; static int bufspace, maxbufspace, bufmallocspace, maxbufmallocspace, lobufspace, hibufspace; static int bufreusecnt, bufdefragcnt, buffreekvacnt; static int needsbuffer; static int lorunningspace, hirunningspace, runningbufreq; static int numdirtybuffers, lodirtybuffers, hidirtybuffers; static int numfreebuffers, lofreebuffers, hifreebuffers; static int getnewbufcalls; static int getnewbufrestarts; SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0, ""); SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0, ""); static int bufhashmask; static LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; struct 
bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; char *buf_wmesg = BUF_WMESG; extern int vm_swap_size; #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ /* * Buffer hash table code. Note that the logical block scans linearly, which * gives us some L1 cache locality. */ static __inline struct bufhashhdr * bufhash(struct vnode *vnp, daddr_t bn) { return(&bufhashtbl[(((uintptr_t)(vnp) >> 7) + (int)bn) & bufhashmask]); } /* * numdirtywakeup: * * If someone is blocked due to there being too many dirty buffers, * and numdirtybuffers is now reasonable, wake them up. */ static __inline void numdirtywakeup(int level) { if (numdirtybuffers <= level) { if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) { needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH; wakeup(&needsbuffer); } } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery. * getnewbuf() will block on this flag when it is unable to free * sufficient buffer space. Buffer space becomes recoverable when * bp's get placed back in the queues. */ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } } /* * runningbufwakeup() - in-progress I/O accounting. * */ static __inline void runningbufwakeup(struct buf *bp) { if (bp->b_runningbufspace) { runningbufspace -= bp->b_runningbufspace; bp->b_runningbufspace = 0; if (runningbufreq && runningbufspace <= lorunningspace) { runningbufreq = 0; wakeup(&runningbufreq); } } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { ++numfreebuffers; if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } } /* * waitrunningbufspace() * * runningbufspace is a measure of the amount of I/O currently * running. This routine is used in async-write situations to * prevent creating huge backups of pending writes to a device. * Only asynchronous writes are governed by this function. * * Reads will adjust runningbufspace, but will not block based on it. * The read load has a side effect of reducing the allowed write load. * * This does NOT turn an async write into a sync write. It waits * for earlier writes to complete and generally returns before the * caller's write has reached the device. */ static __inline void waitrunningbufspace(void) { while (runningbufspace > hirunningspace) { ++runningbufreq; tsleep(&runningbufreq, PVM, "wdrain", 0); } } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. 
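 *
 * Example (sizes illustrative): when allocbuf() grows a 2K buffer to
 * 4K, this is called for the newly added range; vm_page_is_valid() is
 * asked whether those bytes are already valid in the backing page, and
 * if they are not, B_CACHE is cleared so the caller knows the buffer
 * must be read or explicitly validated before its contents are trusted.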
*/ static __inline__ void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } static __inline__ void bd_wakeup(int dirtybuflevel) { if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) { bd_request = 1; wakeup(&bd_request); } } /* * bd_speedup - speedup the buffer cache flushing code */ static __inline__ void bd_speedup(void) { bd_wakeup(1); } /* * Initialize buffer headers and related structures. */ caddr_t bufhashinit(caddr_t vaddr) { /* first, make a null hash table */ for (bufhashmask = 8; bufhashmask < nbuf / 4; bufhashmask <<= 1) ; bufhashtbl = (void *)vaddr; vaddr = vaddr + sizeof(*bufhashtbl) * bufhashmask; --bufhashmask; return(vaddr); } void bufinit(void) { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); mtx_init(&buftimelock, "buftime lock", MTX_DEF); for (i = 0; i <= bufhashmask; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is the absolute maximum amount of buffer space we are * allowed to reserve in KVM and in real terms. The absolute maximum * is nominally used by buf_daemon. hibufspace is the nominal maximum * used by most other processes. The differential is required to * ensure that buf_daemon is able to run when other processes might * be blocked waiting for buffer space. * * maxbufspace is based on BKVASIZE. Allocating buffers larger then * this may result in KVM fragmentation which is not handled optimally * by the system. */ maxbufspace = nbuf * BKVASIZE; hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); lobufspace = hibufspace - MAXBSIZE; lorunningspace = 512 * 1024; hirunningspace = 1024 * 1024; /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. */ hidirtybuffers = nbuf / 4 + 20; numdirtybuffers = 0; /* * To support extreme low-memory systems, make sure hidirtybuffers cannot * eat up all available buffer space. This occurs when our minimum cannot * be met. We try to size hidirtybuffers to 3/4 our buffer space assuming * BKVASIZE'd (8K) buffers. */ while (hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) { hidirtybuffers >>= 1; } lodirtybuffers = hidirtybuffers / 2; /* * Try to keep the number of free buffers in the specified range, * and give special processes (e.g. like buf_daemon) access to an * emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; /* * Maximum number of async ops initiated per buf_daemon loop. 
This is * somewhat of a hack at the moment, we really need to limit ourselves * based on the number of bytes of I/O in-transit that were initiated * from buf_daemon. */ bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); cnt.v_wire_count++; } /* * bfreekva() - free the kva allocation for a buffer. * * Must be called at splbio() or higher as this is the only locking for * buffer_map. * * Since this call frees up buffer space, we call bufspacewakeup(). */ static void bfreekva(struct buf * bp) { if (bp->b_kvasize) { ++buffreekvacnt; bufspace -= bp->b_kvasize; vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; bufspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. */ void bremfree(struct buf * bp) { int s = splbio(); int old_qindex = bp->b_qindex; if (bp->b_qindex != QUEUE_NONE) { KASSERT(BUF_REFCNT(bp) == 1, ("bremfree: bp %p not locked",bp)); TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { if (BUF_REFCNT(bp) <= 1) panic("bremfree: removing a buffer not on a queue"); } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. */ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { case QUEUE_DIRTY: case QUEUE_CLEAN: case QUEUE_EMPTY: case QUEUE_EMPTYKVA: --numfreebuffers; break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != PCPU_GET(idleproc)) curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); return (bufwait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. 
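 *
 * Typical consumer pattern for these routines (a sketch only; locking
 * and error handling vary by caller):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine or modify bp->b_data ...
 *	bqrelse(bp);		(or bdwrite(bp) if it was dirtied)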
*/ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != PCPU_GET(idleproc)) curproc->p_stats->p_ru.ru_inblock++; bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != PCPU_GET(idleproc)) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_ASYNC; rabp->b_flags &= ~B_INVAL; rabp->b_ioflags &= ~BIO_ERROR; rabp->b_iocmd = BIO_READ; if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); VOP_STRATEGY(vp, rabp); } else { brelse(rabp); } } if (readwait) { rv = bufwait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ int bwrite(struct buf * bp) { int oldflags, s; struct buf *newbp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not busy???"); s = splbio(); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ if (bp->b_xflags & BX_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { splx(s); bdwrite(bp); return (0); } bp->b_xflags |= BX_BKGRDWAIT; tsleep(&bp->b_xflags, PRIBIO, "biord", 0); if (bp->b_xflags & BX_BKGRDINPROG) panic("bwrite: still writing"); } /* Mark the buffer clean */ bundirty(bp); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if ((bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { if (bp->b_iodone != NULL) { printf("bp->b_iodone = %p\n", bp->b_iodone); panic("bwrite: need chained iodone"); } /* get a new block */ newbp = geteblk(bp->b_bufsize); /* set it to be identical to the old block */ memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); bgetvp(bp->b_vp, newbp); newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = vfs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; /* move over the dependencies */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_movedeps(bp, newbp); /* * Initiate write on the copy, release the original to * the B_LOCKED queue so that it cannot go away until * the background write completes. If not locked it could go * away and then be reconstituted while it was being written. 
* If the reconstituted buffer were written, we could end up * with two background copies being written at the same time. */ bp->b_xflags |= BX_BKGRDINPROG; bp->b_flags |= B_LOCKED; bqrelse(bp); bp = newbp; } bp->b_flags &= ~B_DONE; bp->b_ioflags &= ~BIO_ERROR; bp->b_flags |= B_WRITEINPROG | B_CACHE; bp->b_iocmd = BIO_WRITE; bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != PCPU_GET(idleproc)) curproc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) BUF_KERNPROC(bp); BUF_STRATEGY(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = bufwait(bp); brelse(bp); return (rtval); } else { /* * don't allow the async write to saturate the I/O * system. There is no chance of deadlock here because * we are blocking on I/O that is already in-progress. */ waitrunningbufspace(); } return (0); } /* * Complete a background write started from bwrite. */ static void vfs_backgroundwritedone(bp) struct buf *bp; { struct buf *origbp; /* * Find the original buffer that we are writing. */ if ((origbp = gbincore(bp->b_vp, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * Process dependencies then return any unfinished ones. */ if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); if (LIST_FIRST(&bp->b_dep) != NULL) buf_movedeps(bp, origbp); /* * Clear the BX_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BX_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_xflags & BX_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_xflags &= ~BX_BKGRDINPROG; if (origbp->b_xflags & BX_BKGRDWAIT) { origbp->b_xflags &= ~BX_BKGRDWAIT; wakeup(&origbp->b_xflags); } /* * Clear the B_LOCKED flag and remove it from the locked * queue if it currently resides there. */ origbp->b_flags &= ~B_LOCKED; if (BUF_LOCK(origbp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { bremfree(origbp); bqrelse(origbp); } /* * This buffer is marked B_NOCACHE, so when it is released * by biodone, it will be tossed. We mark it with BIO_READ * to avoid biodone doing a second vwakeup. */ bp->b_flags |= B_NOCACHE; bp->b_iocmd = BIO_READ; bp->b_flags &= ~(B_CACHE | B_DONE); bp->b_iodone = 0; bufdone(bp); } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf * bp) { if (BUF_REFCNT(bp) == 0) panic("bdwrite: buffer is not busy"); if (bp->b_flags & B_INVAL) { brelse(bp); return; } bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. 
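 *
 * Example (illustrative): if the second page of an 8K, page-aligned
 * buffer was dirtied through a shared mapping, vfs_setdirty() widens
 * [b_dirtyoff, b_dirtyend) to cover at least bytes 4096-8191 (clipped
 * to b_bcount), so the eventual write pushes out the mmap'ed
 * modification along with anything written through the buffer itself.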
*/ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); /* * Wakeup the buffer flushing daemon if we have a lot of dirty * buffers (midpoint between our recovery point and our stall * point). */ bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); /* * note: we cannot initiate I/O from a bdwrite even if we wanted to, * due to the softdep code. */ } /* * bdirty: * * Turn buffer into delayed write request. We must clear BIO_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bdirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_RELBUF); bp->b_iocmd = BIO_WRITE; if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; bd_wakeup((lodirtybuffers + hidirtybuffers) / 2); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; numdirtywakeup(lodirtybuffers); } /* * Since it is now being written, we can clear its deferred write flag. */ bp->b_flags &= ~B_DEFERRED; } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) BUF_WRITE(bp); } /* * bowrite: * * Ordered write. Start output on a buffer, and flag it so that the * device will write it in the order it was queued. The buffer is * released when the output completes. bwrite() ( or the VOP routine * anyway ) is responsible for handling B_INVAL buffers. */ int bowrite(struct buf * bp) { bp->b_ioflags |= BIO_ORDERED; bp->b_flags |= B_ASYNC; return (BUF_WRITE(bp)); } /* * bwillwrite: * * Called prior to the locking of any vnodes when we are expecting to * write. We do not want to starve the buffer cache with too many * dirty buffers so we block here. By blocking prior to the locking * of any vnodes we attempt to avoid the situation where a locked vnode * prevents the various system daemons from flushing related buffers. 
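 *
 * Sketch of the intended call pattern (caller side, illustrative only;
 * the vnode-layer calls and arguments are merely indicative):
 *
 *	bwillwrite();
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
 *	error = VOP_WRITE(vp, uio, ioflag, cred);
 *	VOP_UNLOCK(vp, 0, p);
 *
 * The throttling sleep therefore happens while no vnode lock is held;
 * once numdirtybuffers climbs to hidirtybuffers, callers block here
 * until the buf daemon has worked the count back down.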
*/ void bwillwrite(void) { if (numdirtybuffers >= hidirtybuffers) { int s; s = splbio(); while (numdirtybuffers >= hidirtybuffers) { bd_wakeup(1); needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH; tsleep(&needsbuffer, (PRIBIO + 4), "flswai", 0); } splx(s); } } /* * Return true if we have too many dirty buffers. */ int buf_dirty_count_severe(void) { return(numdirtybuffers >= hidirtybuffers); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. */ void brelse(struct buf * bp) { int s; KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); s = splbio(); if (bp->b_flags & B_LOCKED) bp->b_ioflags &= ~BIO_ERROR; if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) { /* * Failed write, redirty. Must clear BIO_ERROR to prevent * pages from being scrapped. If B_INVAL is set then * this case is not run and the next case is run to * destroy the buffer. B_INVAL can occur if the buffer * is outside the range supported by the underlying device. */ bp->b_ioflags &= ~BIO_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) || (bp->b_ioflags & BIO_ERROR) || bp->b_iocmd == BIO_DELETE || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); if (bp->b_flags & B_DELWRI) { --numdirtybuffers; numdirtywakeup(lodirtybuffers); } bp->b_flags &= ~(B_DELWRI | B_CACHE); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. * * If B_DELWRI is not set we may have to set B_RELBUF if we are low * on pages to return pages to the VM page queues. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; else if (vm_page_count_severe() && !(bp->b_xflags & BX_BKGRDINPROG)) bp->b_flags |= B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. BIO_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. If the * buffer has a background write in progress, we need to keep it * around to prevent it from being reconstituted and starting a second * background write. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; struct vnode *vp; vp = bp->b_vp; /* * Get the base offset and length of the buffer. 
Note that * for block sizes that are less then PAGE_SIZE, the b_data * base of the buffer does not represent exactly b_offset and * neither b_offset nor b_size are necessarily page aligned. * Instead, the starting position of b_offset is: * * b_data + (b_offset & PAGE_MASK) * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { int had_bogus = 0; m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); /* * If we hit a bogus page, fixup *all* the bogus pages * now. */ if (m == bogus_page) { VOP_GETVOBJECT(vp, &obj); poff = OFF_TO_IDX(bp->b_offset); had_bogus = 1; for (j = i; j < bp->b_npages; j++) { vm_page_t mtmp; mtmp = bp->b_pages[j]; if (mtmp == bogus_page) { mtmp = vm_page_lookup(obj, poff + j); if (!mtmp) { panic("brelse: page missing\n"); } bp->b_pages[j] = mtmp; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } m = bp->b_pages[i]; } if ((bp->b_flags & B_NOCACHE) || (bp->b_ioflags & BIO_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_set_invalid(m, poffset, presid); if (had_bogus) printf("avoided corruption bug in bogus_page/brelse code\n"); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~BX_BKGRDWRITE; if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 1"); if (bp->b_kvasize) { bp->b_qindex = QUEUE_EMPTYKVA; } else { bp->b_qindex = QUEUE_EMPTY; } TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) || (bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_INVAL; bp->b_xflags &= ~BX_BKGRDWRITE; if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 2"); bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* remaining buffers */ } else { switch(bp->b_flags & (B_DELWRI|B_AGE)) { case B_DELWRI | B_AGE: bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist); break; case B_DELWRI: bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); break; case B_AGE: bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); break; default: bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); break; } } /* * If B_INVAL, clear B_DELWRI. We've already placed the buffer * on the correct queue. 
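 *
 * For reference, the placement decided just above (a head insertion
 * means the buffer is reused or reclaimed sooner):
 *
 *	no memory          ->  head of QUEUE_EMPTY or QUEUE_EMPTYKVA
 *	junk contents      ->  head of QUEUE_CLEAN (marked B_INVAL)
 *	B_LOCKED           ->  tail of QUEUE_LOCKED
 *	B_DELWRI | B_AGE   ->  head of QUEUE_DIRTY
 *	B_DELWRI           ->  tail of QUEUE_DIRTY
 *	B_AGE              ->  head of QUEUE_CLEAN
 *	otherwise          ->  tail of QUEUE_CLEAN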
*/ if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { bp->b_flags &= ~B_DELWRI; --numdirtybuffers; numdirtywakeup(lodirtybuffers); } /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* - * Something we can maybe free. + * Something we can maybe free or reuse */ - if (bp->b_bufsize || bp->b_kvasize) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_ioflags &= ~BIO_ORDERED; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("brelse: not dirty"); splx(s); } /* * Release a buffer back to the appropriate queue but do not try to free * it. The buffer is expected to be used again soon. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. */ void bqrelse(struct buf * bp) { int s; s = splbio(); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } if (bp->b_flags & B_LOCKED) { bp->b_ioflags &= ~BIO_ERROR; bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_DELWRI) { bp->b_qindex = QUEUE_DIRTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); } else if (vm_page_count_severe()) { /* * We are too low on memory, we have to try to free the * buffer (most importantly: the wired pages making up its * backing store) *now*. */ splx(s); brelse(bp); return; } else { bp->b_qindex = QUEUE_CLEAN; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } if ((bp->b_flags & B_LOCKED) == 0 && ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { bufcountwakeup(); } /* - * Something we can maybe wakeup + * Something we can maybe free or reuse. */ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); bp->b_ioflags &= ~BIO_ORDERED; if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); splx(s); } static void vfs_vmio_release(bp) struct buf *bp; { int i, s; vm_page_t m; s = splvm(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has * no valid data. 
*/ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } else if (vm_page_count_severe()) { vm_page_try_to_cache(m); } } } splx(s); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) { bufspacewakeup(); bp->b_bufsize = 0; } bp->b_npages = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block is currently memory resident. */ struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = bufhash(vp, blkno); /* Search hash chain */ LIST_FOREACH(bp, bh, b_hash) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } } return (bp); } /* * vfs_bio_awrite: * * Implement clustered async writes for clearing out B_DELWRI buffers. * This is much better then the old way of writing only one buffer at * a time. Note that we may not be presented with the buffers in the * correct order, so we search for the cluster in both directions. */ int vfs_bio_awrite(struct buf * bp) { int i; int j; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files. If * we find a clusterable block we could be in the middle of a cluster * rather then at the beginning. */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } for (j = 1; i + j <= maxcl && j <= lblkno; j++) { if ((bpa = gbincore(vp, lblkno - j)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno - ((j * size) >> DEV_BSHIFT))) break; } else { break; } } --j; ncl = i + j; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno - j, ncl); splx(s); return nwritten; } } BUF_LOCK(bp, LK_EXCLUSIVE); bremfree(bp); bp->b_flags |= B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) BUF_WRITE(bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) * If we have to flush dirty buffers ( but we try to avoid this ) * * To avoid VFS layer recursion we do not flush dirty buffers ourselves. * Instead we ask the buf daemon to do it for us. We attempt to * avoid piecemeal wakeups of the pageout daemon. 
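 *
 * Rough picture of the queue selection that follows (descriptive, not
 * normative):
 *
 *	QUEUE_EMPTYKVA	tried first: a bare header that already owns a
 *			kva reservation is the cheapest thing to recycle.
 *	QUEUE_CLEAN	considered when EMPTYKVA is empty and we are
 *			either defragging or bufspace >= lobufspace;
 *			recycling one may mean releasing VMIO pages.
 *	QUEUE_EMPTY	last resort, and only when we are not defragging
 *			and reserving the new kva would still leave
 *			bufspace under hibufspace.
 *
 * If no candidate is found the buf daemon is kicked and we sleep on
 * needsbuffer until space or headers are freed.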
*/ static struct buf * getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; int defrag = 0; int nqindex; static int flushingbufs; /* * We can't afford to block since we might be holding a vnode lock, * which may prevent system daemons from running. We deal with * low-memory situations by proactively returning memory and running * async I/O rather then sync I/O. */ ++getnewbufcalls; --getnewbufrestarts; restart: ++getnewbufrestarts; /* * Setup for scan. If we do not have enough free buffers, * we setup a degenerate case that immediately fails. Note * that if we are specially marked process, we are allowed to * dip into our reserves. * * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN * * We start with EMPTYKVA. If the list is empty we backup to EMPTY. * However, there are a number of cases (defragging, reusing, ...) * where we cannot backup. */ nqindex = QUEUE_EMPTYKVA; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); if (nbp == NULL) { /* * If no EMPTYKVA buffers and we are either * defragging or reusing, locate a CLEAN buffer * to free or reuse. If bufspace useage is low * skip this step so we can allocate a new buffer. */ if (defrag || bufspace >= lobufspace) { nqindex = QUEUE_CLEAN; nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); } /* - * Nada. If we are allowed to allocate an EMPTY - * buffer, go get one. + * If we could not find or were not allowed to reuse a + * CLEAN buffer, check to see if it is ok to use an EMPTY + * buffer. We can only use an EMPTY buffer if allocating + * its KVA would not otherwise run us out of buffer space. */ - if (nbp == NULL && defrag == 0 && bufspace < hibufspace) { + if (nbp == NULL && defrag == 0 && + bufspace + maxsize < hibufspace) { nqindex = QUEUE_EMPTY; nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ while ((bp = nbp) != NULL) { int qindex = nqindex; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). */ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: nqindex = QUEUE_EMPTYKVA; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* fall through */ case QUEUE_EMPTYKVA: nqindex = QUEUE_CLEAN; if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* fall through */ case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * Sanity Checks */ KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); /* * If we are defragging then we need a buffer with * b_kvasize != 0. XXX this situation should no longer * occur, if defrag is non-zero the buffer's b_kvasize * should also be non-zero at this point. XXX */ if (defrag && bp->b_kvasize == 0) { printf("Warning: defrag empty buffer %p\n", bp); continue; } /* * Start freeing the bp. This is somewhat involved. nbp * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("getnewbuf: locked buf"); bremfree(bp); if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. 
*/ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL) buf_deallocate(bp); if (bp->b_xflags & BX_BKGRDINPROG) panic("losing buffer 3"); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_ioflags = 0; bp->b_xflags = 0; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; LIST_INIT(&bp->b_dep); /* * If we are defragging then free the buffer. */ if (defrag) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); defrag = 0; goto restart; } + /* + * If we are overcomitted then recover the buffer and its + * KVM space. This occurs in rare situations when multiple + * processes are blocked in getnewbuf() or allocbuf(). + */ if (bufspace >= hibufspace) flushingbufs = 1; if (flushingbufs && bp->b_kvasize != 0) { bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (bufspace < lobufspace) flushingbufs = 0; break; } /* * If we exhausted our list, sleep as appropriate. We may have to * wakeup various daemons and write out some dirty buffers. * * Generally we are sleeping due to insufficient buffer space. */ if (bp == NULL) { int flags; char *waitmsg; if (defrag) { flags = VFS_BIO_NEED_BUFSPACE; waitmsg = "nbufkv"; } else if (bufspace >= hibufspace) { waitmsg = "nbufbs"; flags = VFS_BIO_NEED_BUFSPACE; } else { waitmsg = "newbuf"; flags = VFS_BIO_NEED_ANY; } bd_speedup(); /* heeeelp */ needsbuffer |= flags; while (needsbuffer & flags) { if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) return (NULL); } } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. In order * to keep fragmentation sane we only allocate kva in * BKVASIZE chunks. */ maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; if (maxsize != bp->b_kvasize) { vm_offset_t addr = 0; bfreekva(bp); if (vm_map_findspace(buffer_map, vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. We * must defragment the map. */ ++bufdefragcnt; defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; bufspace += bp->b_kvasize; ++bufreusecnt; } } bp->b_data = bp->b_kvabase; } return(bp); } /* * buf_daemon: * * buffer flushing daemon. Buffers are normally flushed by the * update daemon but if it cannot keep up this process starts to * take the load in an attempt to prevent getnewbuf() from blocking. */ static struct proc *bufdaemonproc; static struct kproc_desc buf_kp = { "bufdaemon", buf_daemon, &bufdaemonproc }; SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) static void buf_daemon() { int s; mtx_enter(&Giant, MTX_DEF); /* * This process needs to be suspended prior to shutdown sync. */ EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc, SHUTDOWN_PRI_LAST); /* * This process is allowed to take the buffer cache to the limit */ curproc->p_flag |= P_BUFEXHAUST; s = splbio(); for (;;) { kthread_suspend_check(bufdaemonproc); bd_request = 0; /* * Do the flush. 
Limit the amount of in-transit I/O we * allow to build up, otherwise we would completely saturate * the I/O system. Wakeup any waiting processes before we * normally would so they can run in parallel with our drain. */ while (numdirtybuffers > lodirtybuffers) { if (flushbufqueues() == 0) break; waitrunningbufspace(); numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2); } /* * Only clear bd_request if we have reached our low water * mark. The buf_daemon normally waits 5 seconds and * then incrementally flushes any dirty buffers that have * built up, within reason. * * If we were unable to hit our low water mark and couldn't * find any flushable buffers, we sleep half a second. * Otherwise we loop immediately. */ if (numdirtybuffers <= lodirtybuffers) { /* * We reached our low water mark, reset the * request and sleep until we are needed again. * The sleep is just so the suspend code works. */ bd_request = 0; tsleep(&bd_request, PVM, "psleep", hz); } else { /* * We couldn't find any flushable dirty buffers but * still have too many dirty buffers, we * have to sleep and try again. (rare) */ tsleep(&bd_request, PVM, "qsleep", hz / 2); } } } /* * flushbufqueues: * * Try to flush a buffer in the dirty queue. We must be careful to * free up B_INVAL buffers instead of write them, which NFS is * particularly sensitive to. */ static int flushbufqueues(void) { struct buf *bp; int r = 0; bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); while (bp) { KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); if ((bp->b_flags & B_DELWRI) != 0 && (bp->b_xflags & BX_BKGRDINPROG) == 0) { if (bp->b_flags & B_INVAL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("flushbufqueues: locked buf"); bremfree(bp); brelse(bp); ++r; break; } if (LIST_FIRST(&bp->b_dep) != NULL && (bp->b_flags & B_DEFERRED) == 0 && buf_countdeps(bp, 0)) { TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], bp, b_freelist); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); bp->b_flags |= B_DEFERRED; bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); continue; } vfs_bio_awrite(bp); ++r; break; } bp = TAILQ_NEXT(bp, b_freelist); } return (r); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; int s = splbio(); bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. */ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if (VOP_GETVOBJECT(vp, &obj) != 0 || (vp->v_flag & VOBJBUF) == 0) return 0; size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) return 0; } return 1; } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. 
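The drain loop above runs until numdirtybuffers falls back to lodirtybuffers, and wakes waiters early once it passes the midpoint between the two watermarks. This small model (ordinary C, invented watermark numbers, one buffer written per flushbufqueues() call) shows the hysteresis without the sleeps, splbio() and locking.

#include <stdio.h>

/* Invented watermarks; the kernel derives the real ones from nbuf. */
static int lodirtybuffers = 100;
static int hidirtybuffers = 200;
static int numdirtybuffers = 250;

/* In this model each flushbufqueues() call writes back exactly one buffer. */
static int
flushbufqueues(void)
{
	if (numdirtybuffers == 0)
		return (0);
	numdirtybuffers--;
	return (1);
}

int
main(void)
{
	int early_wakeups = 0;

	/* The drain loop from buf_daemon(), stripped to its decision logic. */
	while (numdirtybuffers > lodirtybuffers) {
		if (flushbufqueues() == 0)
			break;
		/* Sleepers are released once we dip below (lo + hi) / 2. */
		if (numdirtybuffers <= (lodirtybuffers + hidirtybuffers) / 2)
			early_wakeups++;
	}
	printf("stopped at %d dirty buffers, %d early wakeup opportunities\n",
	    numdirtybuffers, early_wakeups);
	return (0);
}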
* * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) == 0) return; object = bp->b_pages[0]->object; if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p writeable but not mightbedirty\n", object); if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) { vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_test_dirty(bp->b_pages[i]); } /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. 
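The boffset/eoffset computation in vfs_setdirty() above can be hard to follow because b_offset is usually not page aligned. The following standalone arithmetic sketch (hypothetical 4K pages, made-up buffer geometry and per-page dirty bits) reproduces just that range calculation.

#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed 4K pages */
#define PAGE_SIZE  (1 << PAGE_SHIFT)
#define PAGE_MASK  (PAGE_SIZE - 1)

int
main(void)
{
	/* Hypothetical buffer: 3 backing pages, starting 1K into page 0. */
	int dirty[3] = { 0, 1, 1 };	/* per-page dirty bits after the scan */
	int npages = 3;
	long b_offset = 1024;		/* file offset of the buffer */
	long b_bcount = 8192;		/* valid bytes in the buffer */
	long boffset, eoffset;
	int i;

	/* First dirty page determines the start of the range. */
	for (i = 0; i < npages && dirty[i] == 0; i++)
		;
	boffset = ((long)i << PAGE_SHIFT) - (b_offset & PAGE_MASK);

	/* Last dirty page determines the end, clamped to b_bcount. */
	for (i = npages - 1; i >= 0 && dirty[i] == 0; i--)
		;
	eoffset = ((long)(i + 1) << PAGE_SHIFT) - (b_offset & PAGE_MASK);
	if (eoffset > b_bcount)
		eoffset = b_bcount;

	printf("dirty range within the buffer: [%ld, %ld)\n", boffset, eoffset);
	return (0);
}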
*/ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); s = splbio(); loop: /* * Block if we are low on buffers. Certain processes are allowed * to completely exhaust the buffer cache. * * If this check ever becomes a bottleneck it may be better to * move it into the else, when gbincore() fails. At the moment * it isn't a problem. * * XXX remove if 0 sections (clean this up after its proven) */ if (numfreebuffers == 0) { if (curproc == PCPU_GET(idleproc)) return NULL; needsbuffer |= VFS_BIO_NEED_ANY; } if ((bp = gbincore(vp, blkno))) { /* * Buffer is in-core. If the buffer is not busy, it must * be on a queue. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "getblk", slpflag, slptimeo) == ENOLCK) goto loop; splx(s); return (struct buf *) NULL; } /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. Ohterwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; BUF_WRITE(bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { BUF_WRITE(bp); goto loop; } splx(s); bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). */ int bsize, maxsize, vmio; off_t offset; if (vn_isdisk(vp, NULL)) bsize = DEV_BSIZE; else if (vp->v_mountedhere) bsize = vp->v_mountedhere->mnt_stat.f_iosize; else if (vp->v_mount) bsize = vp->v_mount->mnt_stat.f_iosize; else bsize = size; offset = (off_t)blkno * bsize; vmio = (VOP_GETVOBJECT(vp, NULL) == 0) && (vp->v_flag & VOBJBUF); maxsize = vmio ? 
size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here, * and we've locked the buffer. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = bufhash(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. */ struct buf * geteblk(int size) { struct buf *bp; int s; int maxsize; maxsize = (size + BKVAMASK) & ~BKVAMASK; s = splbio(); while ((bp = getnewbuf(0, 0, size, maxsize)) == 0); splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) newbsize = mbsize; else #endif newbsize = round_page(size); if (newbsize < bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); if (bp->b_bufsize) { bufmallocspace -= bp->b_bufsize; bufspacewakeup(); bp->b_bufsize = 0; } bp->b_data = bp->b_kvabase; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } #endif vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * We only use malloced memory on the first allocation. 
* and revert to page-allocated memory when the buffer * grows. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufmallocspace += mbsize; return 1; } #endif origbuf = NULL; origbufsize = 0; #if !defined(NO_B_MALLOC) /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. */ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; if (bp->b_bufsize) { bufmallocspace -= bp->b_bufsize; bufspacewakeup(); bp->b_bufsize = 0; } bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } #endif vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); #if !defined(NO_B_MALLOC) if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } #endif } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); #endif /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_busy(m, TRUE, "biodep")) ; bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; VOP_GETVOBJECT(vp, &obj); while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { /* * note: must allocate system pages * since blocking here could intefere * with paging I/O, no matter which * process we are. */ m = vm_page_alloc(obj, pi, VM_ALLOC_SYSTEM); if (m == NULL) { VM_WAIT; vm_pageout_deficit += desiredpages - bp->b_npages; } else { vm_page_wire(m); vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test PG_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ if (vm_page_sleep_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? 
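allocbuf()'s VMIO path sizes the buffer in two steps: round the byte count up to DEV_BSIZE, then count how many backing pages are needed once the in-page part of b_offset is added. A minimal sketch of that arithmetic follows (assumed 4K pages and 512-byte DEV_BSIZE; num_pages() is modeled here as a round-up-to-pages helper).

#include <stdio.h>

#define DEV_BSIZE  512			/* assumed */
#define PAGE_SHIFT 12			/* assumed 4K pages */
#define PAGE_SIZE  (1 << PAGE_SHIFT)
#define PAGE_MASK  (PAGE_SIZE - 1)

/* Round a byte count up to whole pages, modeling the kernel's num_pages(). */
static long
num_pages(long bytes)
{
	return ((bytes + PAGE_MASK) >> PAGE_SHIFT);
}

int
main(void)
{
	long b_offset = 6144;		/* hypothetical: buffer starts 2K into a page */
	long size = 7000;		/* requested buffer size in bytes */
	long newbsize, desiredpages;

	newbsize = (size + DEV_BSIZE - 1) & ~(long)(DEV_BSIZE - 1);
	desiredpages = num_pages((b_offset & PAGE_MASK) + newbsize);
	printf("size %ld -> newbsize %ld, %ld backing page(s)\n",
	    size, newbsize, desiredpages);
	return (0);
}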
*/ if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } vm_page_flag_clear(m, PG_ZERO); vm_page_wire(m); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } /* * bufwait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into a EINTR * error and cleared. */ int bufwait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) { if (bp->b_iocmd == BIO_READ) tsleep(bp, PRIBIO, "biord", 0); else tsleep(bp, PRIBIO, "biowr", 0); } splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_ioflags & BIO_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Call back function from struct bio back up to struct buf. * The corresponding initialization lives in sys/conf.h:DEV_STRATEGY(). */ void bufdonebio(struct bio *bp) { bufdone(bp->bio_caller2); } /* * bufdone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. * In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. 
*/ void bufdone(struct buf *bp) { int s, error; void (*biodone) __P((struct buf *)); s = splbio(); KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; runningbufwakeup(bp); if (bp->b_iocmd == BIO_DELETE) { brelse(bp); splx(s); return; } if (bp->b_iocmd == BIO_WRITE) { vwakeup(bp); } /* call optional completion function if requested */ if (bp->b_iodone != NULL) { biodone = bp->b_iodone; bp->b_iodone = NULL; (*biodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); if (bp->b_flags & B_VMIO) { int i; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; error = VOP_GETVOBJECT(vp, &obj); #if defined(VFS_BIO_DEBUG) if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if (error) { panic("biodone: missing VM object"); } if ((vp->v_flag & VOBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); if (error) { panic("biodone: no object"); } #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. */ iosize = bp->b_bcount - bp->b_resid; if (bp->b_iocmd == BIO_READ && !(bp->b_flags & (B_INVAL|B_NOCACHE)) && !(bp->b_ioflags & BIO_ERROR)) { bp->b_flags |= B_CACHE; } for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; int resid; resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; if (resid > iosize) resid = iosize; /* * cleanup bogus pages, restoring the originals */ m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%lu)/m->pindex(%d) mismatch\n", (unsigned long)foff, m->pindex); } #endif /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } vm_page_flag_clear(m, PG_ZERO); /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (!vn_isdisk(vp, NULL)) printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", m->valid, m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; iosize -= resid; } if (obj) vm_object_pip_wakeupn(obj, 0); } /* * For asynchronous completions, release the buffer now. 
The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR)) brelse(bp); else bqrelse(bp); } else { wakeup(bp); } splx(s); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; runningbufwakeup(bp); if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj; VOP_GETVOBJECT(vp, &obj); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); if (!m) { panic("vfs_unbusy_pages: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } vm_object_pip_wakeupn(obj, 0); } } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as BIO_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. */ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i, bogus; bp->b_runningbufspace = bp->b_bufsize; runningbufspace += bp->b_runningbufspace; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj; vm_ooffset_t foff; VOP_GETVOBJECT(vp, &obj); foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); vfs_setdirty(bp); retry: for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (vm_page_sleep_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). 
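vfs_page_set_valid() above clamps each per-page range to both the next page boundary and the end of the buffer. This small model (assumed 4K pages, hypothetical buffer offsets) walks a buffer the same way the bufdone() loop does and prints the range each page would have marked valid and clean.

#include <stdio.h>

#define PAGE_SIZE 4096			/* assumed */
#define PAGE_MASK (PAGE_SIZE - 1)

int
main(void)
{
	/* Hypothetical buffer covering file bytes [8704, 13824). */
	long long b_offset = 8704, b_bcount = 5120;
	long long off, soff, eoff;

	/* Visit the buffer a page at a time, as the completion loop does. */
	for (off = b_offset; off < b_offset + b_bcount;
	    off = (off + PAGE_SIZE) & ~(long long)PAGE_MASK) {
		soff = off;
		eoff = (off + PAGE_SIZE) & ~(long long)PAGE_MASK;
		if (eoff > b_offset + b_bcount)
			eoff = b_offset + b_bcount;
		printf("in-page offset %4lld: bytes [%lld, %lld) become valid+clean\n",
		    soff & PAGE_MASK, soff, eoff);
	}
	return (0);
}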
The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ vm_page_protect(m, VM_PROT_NONE); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; } if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. */ static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_ooffset_t noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } } } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { if (bp->b_flags & B_VMIO) { int i; int n; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear BIO_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. 
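vfs_bio_set_validclean() above splits a byte range, given relative to b_offset, across the buffer's pages. The sketch below (assumed 4K pages, invented offsets, and without the b_npages bound the kernel loop also applies) shows how base, n, and size evolve.

#include <stdio.h>

#define PAGE_SIZE 4096			/* assumed */
#define PAGE_MASK (PAGE_SIZE - 1)

int
main(void)
{
	/* Hypothetical: the buffer starts 1K into its first page. */
	int b_offset_pageoff = 1024;	/* bp->b_offset & PAGE_MASK */
	int base = 2048;		/* range start, relative to b_offset */
	int size = 6000;		/* range length in bytes */
	int i, n;

	base += b_offset_pageoff;
	n = PAGE_SIZE - (base & PAGE_MASK);	/* bytes left in the first page */
	for (i = base / PAGE_SIZE; size > 0; ++i) {
		if (n > size)
			n = size;
		printf("page %d: bytes [%d, %d) set valid+clean\n",
		    i, base & PAGE_MASK, (base & PAGE_MASK) + n);
		base += n;
		size -= n;
		n = PAGE_SIZE;
	}
	return (0);
}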
*/ void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) != mask)) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid |= mask; bp->b_resid = 0; return; } ea = sa = bp->b_data; for(i=0;i<bp->b_npages;i++,sa=ea) { int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)(vm_offset_t)ulmin( (u_long)(vm_offset_t)ea, (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(sa, ea - sa); } } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<<j)) == 0) bzero(sa, DEV_BSIZE); } } bp->b_pages[i]->valid |= mask; vm_page_flag_clear(bp->b_pages[i], PG_ZERO); } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_free_pages get pages into * a buffer's address space. The pages are anonymous and are * not associated with a file object. */ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: /* * note: must allocate system pages since blocking here * could interfere with paging I/O, no matter which * process we are.
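The per-page loop in vfs_bio_clrbuf() above builds a bitmask of the DEV_BSIZE-sized chunks of a page that the buffer actually covers, and only zeroes or validates those. A one-shot illustration of that mask arithmetic follows (assumed 512-byte DEV_BSIZE and hypothetical sa/ea values; each bit corresponds to one block of the page's valid map).

#include <stdio.h>

#define DEV_BSIZE 512			/* assumed */

int
main(void)
{
	/*
	 * Hypothetical page: the buffer covers bytes [1024, 3072) of it,
	 * i.e. blocks 2..5 of the eight DEV_BSIZE blocks in a 4K page.
	 */
	int sa = 1024, ea = 3072;
	int j = sa / DEV_BSIZE;
	int mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;

	printf("valid mask for the blocks covered by the buffer: 0x%02x\n", mask);
	return (0);
}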
*/ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_SYSTEM); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; goto tryagain; } vm_page_wire(p); p->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(p, PG_ZERO); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[index] = p; vm_page_wakeup(p); } bp->b_npages = index; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { if (p->busy) { printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", bp->b_blkno, bp->b_lblkno); } bp->b_pages[index] = NULL; pmap_kremove(pg); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); } } bp->b_npages = newnpages; } #include "opt_ddb.h" #ifdef DDB #include <ddb/ddb.h> DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer <addr>\n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, " "b_blkno = %d, b_pblkno = %d\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, bp->b_blkno, bp->b_pblkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 71982) +++ head/sys/vm/vm_map.c (revision 71983) @@ -1,3078 +1,3094 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * * $FreeBSD$ */ /* * Virtual memory mapping module. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one map to * another, and then marking both regions as copy-on-write. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. 
*/ static struct vm_zone kmapentzone_store, mapentzone_store, mapzone_store; static vm_zone_t mapentzone, kmapentzone, mapzone, vmspace_zone; static struct vm_object kmapentobj, mapentobj, mapobj; static struct vm_map_entry map_entry_init[MAX_MAPENT]; static struct vm_map_entry kmap_entry_init[MAX_KMAPENT]; static struct vm_map map_init[MAX_KMAP]; static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static vm_map_entry_t vm_map_entry_create __P((vm_map_t)); static void vm_map_entry_delete __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_dispose __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_unwire __P((vm_map_t, vm_map_entry_t)); static void vm_map_copy_entry __P((vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t)); static void vm_map_split __P((vm_map_entry_t)); void vm_map_startup() { mapzone = &mapzone_store; zbootinit(mapzone, "MAP", sizeof (struct vm_map), map_init, MAX_KMAP); kmapentzone = &kmapentzone_store; zbootinit(kmapentzone, "KMAP ENTRY", sizeof (struct vm_map_entry), kmap_entry_init, MAX_KMAPENT); mapentzone = &mapentzone_store; zbootinit(mapentzone, "MAP ENTRY", sizeof (struct vm_map_entry), map_entry_init, MAX_MAPENT); } /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * The remaining fields must be initialized by the caller. */ struct vmspace * vmspace_alloc(min, max) vm_offset_t min, max; { struct vmspace *vm; vm = zalloc(vmspace_zone); vm_map_init(&vm->vm_map, min, max); pmap_pinit(vmspace_pmap(vm)); vm->vm_map.pmap = vmspace_pmap(vm); /* XXX */ vm->vm_refcnt = 1; vm->vm_shm = NULL; return (vm); } void vm_init2(void) { zinitna(kmapentzone, &kmapentobj, NULL, 0, cnt.v_page_count / 4, ZONE_INTERRUPT, 1); zinitna(mapentzone, &mapentobj, NULL, 0, 0, 0, 1); zinitna(mapzone, &mapobj, NULL, 0, 0, 0, 1); vmspace_zone = zinit("VMSPACE", sizeof (struct vmspace), 0, 0, 3); pmap_init2(); vm_object_init2(); } void vmspace_free(vm) struct vmspace *vm; { if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (--vm->vm_refcnt == 0) { /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ vm_map_lock(&vm->vm_map); (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); vm_map_unlock(&vm->vm_map); pmap_release(vmspace_pmap(vm)); vm_map_destroy(&vm->vm_map); zfree(vmspace_zone, vm); } } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap, min, max) pmap_t pmap; vm_offset_t min, max; { vm_map_t result; result = zalloc(mapzone); vm_map_init(result, min, max); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. 
*/ void vm_map_init(map, min, max) struct vm_map *map; vm_offset_t min, max; { map->header.next = map->header.prev = &map->header; map->nentries = 0; map->size = 0; map->system_map = 0; map->min_offset = min; map->max_offset = max; map->first_free = &map->header; map->hint = &map->header; map->timestamp = 0; lockinit(&map->lock, PVM, "thrd_sleep", 0, LK_NOPAUSE); } void vm_map_destroy(map) struct vm_map *map; { lockdestroy(&map->lock); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(map, entry) vm_map_t map; vm_map_entry_t entry; { zfree((map->system_map || !mapentzone) ? kmapentzone : mapentzone, entry); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. This routine is */ static vm_map_entry_t vm_map_entry_create(map) vm_map_t map; { vm_map_entry_t new_entry; new_entry = zalloc((map->system_map || !mapentzone) ? kmapentzone : mapentzone); if (new_entry == NULL) panic("vm_map_entry_create: kernel resources exhausted"); return(new_entry); } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ static __inline void vm_map_entry_link(vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry) { map->nentries++; entry->prev = after_where; entry->next = after_where->next; entry->next->prev = entry; after_where->next = entry; } static __inline void vm_map_entry_unlink(vm_map_t map, vm_map_entry_t entry) { vm_map_entry_t prev = entry->prev; vm_map_entry_t next = entry->next; next->prev = prev; prev->next = next; map->nentries--; } /* * SAVE_HINT: * * Saves the specified entry as the hint for * future lookups. */ #define SAVE_HINT(map,value) \ (map)->hint = (value); /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry(map, address, entry) vm_map_t map; vm_offset_t address; vm_map_entry_t *entry; /* OUT */ { vm_map_entry_t cur; vm_map_entry_t last; /* * Start looking either from the head of the list, or from the hint. */ cur = map->hint; if (cur == &map->header) cur = cur->next; if (address >= cur->start) { /* * Go from hint to end of list. * * But first, make a quick check to see if we are already looking * at the entry we want (which is usually the case). Note also * that we don't need to save the hint here... it is the same * hint (unless we are at the header, in which case the hint * didn't buy us anything anyway). */ last = &map->header; if ((cur != last) && (cur->end > address)) { *entry = cur; return (TRUE); } } else { /* * Go from start to hint, *inclusively* */ last = cur->next; cur = map->header.next; } /* * Search linearly */ while (cur != last) { if (cur->end > address) { if (address >= cur->start) { /* * Save this lookup for future hints, and * return */ *entry = cur; SAVE_HINT(map, cur); return (TRUE); } break; } cur = cur->next; } *entry = cur->prev; SAVE_HINT(map, *entry); return (FALSE); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. 
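vm_map_lookup_entry() above is a hint-accelerated linear search over a circular, header-terminated list. The standalone model below (struct entry, lookup(), and the two ranges in main() are invented; only the control flow mirrors the routine above) can be compiled and run to see both the "contained" and the "gap" cases.

#include <stdio.h>

/* Minimal stand-in for a vm_map_entry: [start, end) plus list links. */
struct entry {
	unsigned long start, end;
	struct entry *prev, *next;
};

/*
 * Model of vm_map_lookup_entry(): returns 1 and *out = containing entry,
 * or 0 and *out = the entry immediately preceding the address.
 */
static int
lookup(struct entry *header, struct entry **hint, unsigned long addr,
    struct entry **out)
{
	struct entry *cur = *hint, *last;

	if (cur == header)
		cur = cur->next;
	if (addr >= cur->start) {
		/* Hint is usable: scan forward from it to the header. */
		last = header;
		if (cur != last && cur->end > addr) {
			*out = cur;
			return (1);
		}
	} else {
		/* Address precedes the hint: scan from the start to the hint. */
		last = cur->next;
		cur = header->next;
	}
	while (cur != last) {
		if (cur->end > addr) {
			if (addr >= cur->start) {
				*out = *hint = cur;
				return (1);
			}
			break;
		}
		cur = cur->next;
	}
	*out = *hint = cur->prev;
	return (0);
}

int
main(void)
{
	struct entry header = { 0, 0, NULL, NULL };
	struct entry a = { 0x1000, 0x3000, NULL, NULL };
	struct entry b = { 0x5000, 0x8000, NULL, NULL };
	struct entry *hint = &header, *res;
	int hit;

	header.prev = &b; header.next = &a;
	a.prev = &header; a.next = &b;
	b.prev = &a; b.next = &header;

	hit = lookup(&header, &hint, 0x2000, &res);
	printf("0x2000: hit=%d, entry [%#lx, %#lx)\n", hit, res->start, res->end);
	hit = lookup(&header, &hint, 0x4000, &res);
	printf("0x4000: hit=%d, preceding entry [%#lx, %#lx)\n", hit, res->start, res->end);
	return (0);
}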
*/ int vm_map_insert(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t start, vm_offset_t end, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t new_entry; vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; vm_eflags_t protoeflags; /* * Check that the start and end points are not bogus. */ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); protoeflags = 0; if (cow & MAP_COPY_ON_WRITE) protoeflags |= MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY; if (cow & MAP_NOFAULT) { protoeflags |= MAP_ENTRY_NOFAULT; KASSERT(object == NULL, ("vm_map_insert: paradoxical MAP_NOFAULT request")); } if (cow & MAP_DISABLE_SYNCER) protoeflags |= MAP_ENTRY_NOSYNC; if (cow & MAP_DISABLE_COREDUMP) protoeflags |= MAP_ENTRY_NOCOREDUMP; if (object) { /* * When object is non-NULL, it could be shared with another * process. We have to set or clear OBJ_ONEMAPPING * appropriately. */ if ((object->ref_count > 1) || (object->shadow_count != 0)) { vm_object_clear_flag(object, OBJ_ONEMAPPING); } } else if ((prev_entry != &map->header) && (prev_entry->eflags == protoeflags) && (prev_entry->end == start) && (prev_entry->wired_count == 0) && ((prev_entry->object.vm_object == NULL) || vm_object_coalesce(prev_entry->object.vm_object, OFF_TO_IDX(prev_entry->offset), (vm_size_t)(prev_entry->end - prev_entry->start), (vm_size_t)(end - prev_entry->end)))) { /* * We were able to extend the object. Determine if we * can extend the previous map entry to include the * new range as well. */ if ((prev_entry->inheritance == VM_INHERIT_DEFAULT) && (prev_entry->protection == prot) && (prev_entry->max_protection == max)) { map->size += (end - prev_entry->end); prev_entry->end = end; + vm_map_simplify_entry(map, prev_entry); return (KERN_SUCCESS); } /* * If we can extend the object but cannot extend the * map entry, we have to create a new map entry. We * must bump the ref count on the extended object to - * account for it. + * account for it. object may be NULL. */ object = prev_entry->object.vm_object; offset = prev_entry->offset + (prev_entry->end - prev_entry->start); vm_object_reference(object); } /* * NOTE: if conditionals fail, object can be NULL here. This occurs * in things like the buffer map where we manage kva but do not manage * backing objects. 
*/ /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->eflags = protoeflags; new_entry->object.vm_object = object; new_entry->offset = offset; new_entry->avail_ssize = 0; new_entry->inheritance = VM_INHERIT_DEFAULT; new_entry->protection = prot; new_entry->max_protection = max; new_entry->wired_count = 0; /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * Update the free space hint */ if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) { map->first_free = new_entry; } + /* + * It may be possible to simplify the entry + */ + vm_map_simplify_entry(map, new_entry); + if (cow & (MAP_PREFAULT|MAP_PREFAULT_PARTIAL)) { pmap_object_init_pt(map->pmap, start, object, OFF_TO_IDX(offset), end - start, cow & MAP_PREFAULT_PARTIAL); } return (KERN_SUCCESS); } /* * Find sufficient space for `length' bytes in the given map, starting at * `start'. The map must be locked. Returns 0 on success, 1 on no space. */ int vm_map_findspace(map, start, length, addr) vm_map_t map; vm_offset_t start; vm_size_t length; vm_offset_t *addr; { vm_map_entry_t entry, next; vm_offset_t end; if (start < map->min_offset) start = map->min_offset; if (start > map->max_offset) return (1); /* * Look for the first possible address; if there's already something * at this address, we have to start after it. */ if (start == map->min_offset) { if ((entry = map->first_free) != &map->header) start = entry->end; } else { vm_map_entry_t tmp; if (vm_map_lookup_entry(map, start, &tmp)) start = tmp->end; entry = tmp; } /* * Look through the rest of the map, trying to fit a new region in the * gap between existing regions, or after the very last region. */ for (;; start = (entry = next)->end) { /* * Find the end of the proposed new region. Be sure we didn't * go beyond the end of the map, or wrap around the address; * if so, we lose. Otherwise, if this is the last entry, or * if the proposed new region fits before the next entry, we * win. */ end = start + length; if (end > map->max_offset || end < start) return (1); next = entry->next; if (next == &map->header || next->start >= end) break; } SAVE_HINT(map, entry); *addr = start; if (map == kernel_map) { vm_offset_t ksize; if ((ksize = round_page(start + length)) > kernel_vm_end) { pmap_growkernel(ksize); } } return (0); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. * * If object is non-NULL, ref count must be bumped by caller * prior to making call to account for the new entry. */ int vm_map_find(vm_map_t map, vm_object_t object, vm_ooffset_t offset, vm_offset_t *addr, /* IN/OUT */ vm_size_t length, boolean_t find_space, vm_prot_t prot, vm_prot_t max, int cow) { vm_offset_t start; int result, s = 0; start = *addr; if (map == kmem_map || map == mb_map) s = splvm(); vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); if (map == kmem_map || map == mb_map) splx(s); return (KERN_NO_SPACE); } start = *addr; } result = vm_map_insert(map, object, offset, start, start + length, prot, max, cow); vm_map_unlock(map); if (map == kmem_map || map == mb_map) splx(s); return (result); } /* * vm_map_simplify_entry: * - * Simplify the given map entry by merging with either neighbor. 
+ * Simplify the given map entry by merging with either neighbor. This + * routine also has the ability to merge with both neighbors. + * + * The map must be locked. + * + * This routine guarentees that the passed entry remains valid (though + * possibly extended). When merging, this routine may delete one or + * both neighbors. */ void vm_map_simplify_entry(map, entry) vm_map_t map; vm_map_entry_t entry; { vm_map_entry_t next, prev; vm_size_t prevsize, esize; if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) return; prev = entry->prev; if (prev != &map->header) { prevsize = prev->end - prev->start; if ( (prev->end == entry->start) && (prev->object.vm_object == entry->object.vm_object) && (!prev->object.vm_object || (prev->offset + prevsize == entry->offset)) && (prev->eflags == entry->eflags) && (prev->protection == entry->protection) && (prev->max_protection == entry->max_protection) && (prev->inheritance == entry->inheritance) && (prev->wired_count == entry->wired_count)) { if (map->first_free == prev) map->first_free = entry; if (map->hint == prev) map->hint = entry; vm_map_entry_unlink(map, prev); entry->start = prev->start; entry->offset = prev->offset; if (prev->object.vm_object) vm_object_deallocate(prev->object.vm_object); vm_map_entry_dispose(map, prev); } } next = entry->next; if (next != &map->header) { esize = entry->end - entry->start; if ((entry->end == next->start) && (next->object.vm_object == entry->object.vm_object) && (!entry->object.vm_object || (entry->offset + esize == next->offset)) && (next->eflags == entry->eflags) && (next->protection == entry->protection) && (next->max_protection == entry->max_protection) && (next->inheritance == entry->inheritance) && (next->wired_count == entry->wired_count)) { if (map->first_free == next) map->first_free = entry; if (map->hint == next) map->hint = entry; vm_map_entry_unlink(map, next); entry->end = next->end; if (next->object.vm_object) vm_object_deallocate(next->object.vm_object); vm_map_entry_dispose(map, next); } } } /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(map, entry, start) vm_map_t map; vm_map_entry_t entry; vm_offset_t start; { vm_map_entry_t new_entry; /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ vm_map_simplify_entry(map, entry); /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. 
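vm_map_simplify_entry() above merges an entry with a neighbor only when the address ranges abut, the backing offsets line up, and every attribute matches. This toy version (one prot field standing in for the full attribute comparison; all names invented) shows the prev-side merge that the new vm_map_insert() callers in this revision rely on.

#include <stdio.h>

/* Toy entry: a range plus one attribute standing in for the full set. */
struct ent {
	unsigned long start, end, offset;
	int prot;
	int valid;
};

/*
 * Merge e with its predecessor when the ranges and backing offsets abut
 * and the attributes match; the same shape of test is applied to the
 * successor in the real routine.
 */
static void
merge_prev(struct ent *prev, struct ent *e)
{
	if (prev->valid && prev->end == e->start &&
	    prev->offset + (prev->end - prev->start) == e->offset &&
	    prev->prot == e->prot) {
		e->start = prev->start;
		e->offset = prev->offset;
		prev->valid = 0;		/* prev is unlinked and disposed of */
	}
}

int
main(void)
{
	struct ent prev = { 0x1000, 0x2000, 0,      3, 1 };
	struct ent e    = { 0x2000, 0x4000, 0x1000, 3, 1 };

	merge_prev(&prev, &e);
	printf("entry now [%#lx, %#lx), prev %s\n",
	    e.start, e.end, prev.valid ? "kept" : "removed");
	return (0);
}

The passed entry absorbs its neighbor rather than the other way around, which is why the comment added above can promise that the caller's entry pointer stays valid.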
*/ - if (entry->object.vm_object == NULL) { + if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if (endaddr < entry->end) \ _vm_map_clip_end(map, entry, endaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_end(map, entry, end) vm_map_t map; vm_map_entry_t entry; vm_offset_t end; { vm_map_entry_t new_entry; /* * If there is no object backing this entry, we might as well create * one now. If we defer it, an object can get created after the map * is clipped, and individual objects will be created for the split-up * map. This is a bit of a hack, but is also about the best place to * put this improvement. */ - if (entry->object.vm_object == NULL) { + if (entry->object.vm_object == NULL && !map->system_map) { vm_object_t object; object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->object.vm_object = object; entry->offset = 0; } /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_reference(new_entry->object.vm_object); } } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this rage after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] 
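Both clip routines split an entry at an address and adjust the tail's backing offset by the amount clipped off the front. A minimal sketch of _vm_map_clip_end()'s bookkeeping follows (invented struct and values, no object allocation or reference counting).

#include <stdio.h>

struct ent {
	unsigned long start, end, offset;
};

/*
 * Model of _vm_map_clip_end(): split *e at addr and return the new entry
 * covering [addr, old end) with its backing offset adjusted.
 */
static struct ent
clip_end(struct ent *e, unsigned long addr)
{
	struct ent tail = *e;

	tail.start = addr;
	tail.offset += addr - e->start;
	e->end = addr;
	return (tail);
}

int
main(void)
{
	struct ent e = { 0x10000, 0x20000, 0x5000 };
	struct ent tail = clip_end(&e, 0x14000);

	printf("head [%#lx, %#lx) off %#lx, tail [%#lx, %#lx) off %#lx\n",
	    e.start, e.end, e.offset, tail.start, tail.end, tail.offset);
	return (0);
}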
*/ int vm_map_submap(map, start, end, submap) vm_map_t map; vm_offset_t start; vm_offset_t end; vm_map_t submap; { vm_map_entry_t entry; int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && ((entry->eflags & MAP_ENTRY_COW) == 0) && (entry->object.vm_object == NULL)) { entry->object.sub_map = submap; entry->eflags |= MAP_ENTRY_IS_SUB_MAP; result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t new_prot, boolean_t set_max) { vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else { entry = entry->next; } /* * Make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.] */ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { #define MASK(entry) (((entry)->eflags & MAP_ENTRY_COW) ? ~VM_PROT_WRITE : \ VM_PROT_ALL) pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(current)); #undef MASK } vm_map_simplify_entry(map, current); current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_madvise: * * This routine traverses a process's map, handling the madvise * system call. Advisories are classified as either those affecting * the vm_map_entry structure or those affecting the underlying * objects. */ int vm_map_madvise(map, start, end, behav) vm_map_t map; vm_offset_t start, end; int behav; { vm_map_entry_t current, entry; int modify_map = 0; /* * Some madvise calls directly modify the vm_map_entry, in which case * we need to use an exclusive lock on the map and we need to perform * various clipping operations. Otherwise we only need a read-lock * on the map. */ switch(behav) { case MADV_NORMAL: case MADV_SEQUENTIAL: case MADV_RANDOM: case MADV_NOSYNC: case MADV_AUTOSYNC: case MADV_NOCORE: case MADV_CORE: modify_map = 1; vm_map_lock(map); break; case MADV_WILLNEED: case MADV_DONTNEED: case MADV_FREE: vm_map_lock_read(map); break; default: return (KERN_INVALID_ARGUMENT); } /* * Locate starting entry and clip if necessary.
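 *
 * (For reference, these behaviors are normally requested from user
 * space through madvise(2); a minimal, illustrative caller -- user
 * level code, not part of this file, with a made-up function name:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	void
 *	advise_sequential(void *base, size_t len)
 *	{
 *		if (madvise(base, len, MADV_SEQUENTIAL) == -1)
 *			warn("madvise");
 *	}
 *
 * MADV_SEQUENTIAL is handled on the vm_map_entry side of the switch
 * above, while MADV_WILLNEED and friends go to the object side.)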
*/ VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { if (modify_map) vm_map_clip_start(map, entry, start); } else { entry = entry->next; } if (modify_map) { /* * madvise behaviors that are implemented in the vm_map_entry. * * We clip the vm_map_entry so that behavioral changes are * limited to the specified address range. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; vm_map_clip_end(map, current, end); switch (behav) { case MADV_NORMAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_NORMAL); break; case MADV_SEQUENTIAL: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_SEQUENTIAL); break; case MADV_RANDOM: vm_map_entry_set_behavior(current, MAP_ENTRY_BEHAV_RANDOM); break; case MADV_NOSYNC: current->eflags |= MAP_ENTRY_NOSYNC; break; case MADV_AUTOSYNC: current->eflags &= ~MAP_ENTRY_NOSYNC; break; case MADV_NOCORE: current->eflags |= MAP_ENTRY_NOCOREDUMP; break; case MADV_CORE: current->eflags &= ~MAP_ENTRY_NOCOREDUMP; break; default: break; } vm_map_simplify_entry(map, current); } vm_map_unlock(map); } else { vm_pindex_t pindex; int count; /* * madvise behaviors that are implemented in the underlying * vm_object. * * Since we don't clip the vm_map_entry, we have to clip * the vm_object pindex and count. */ for (current = entry; (current != &map->header) && (current->start < end); current = current->next ) { vm_offset_t useStart; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) continue; pindex = OFF_TO_IDX(current->offset); count = atop(current->end - current->start); useStart = current->start; if (current->start < start) { pindex += atop(start - current->start); count -= atop(start - current->start); useStart = start; } if (current->end > end) count -= atop(current->end - end); if (count <= 0) continue; vm_object_madvise(current->object.vm_object, pindex, count, behav); if (behav == MADV_WILLNEED) { pmap_object_init_pt( map->pmap, useStart, current->object.vm_object, pindex, (count << PAGE_SHIFT), 0 ); } } vm_map_unlock_read(map); } return(0); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. 
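 *
 * (The user-visible interface to this is minherit(2); for example, a
 * region can be kept shared with children across fork().  Illustrative
 * user-level sketch, not part of this file, with a made-up function
 * name:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	void
 *	share_with_children(void *base, size_t len)
 *	{
 *		if (minherit(base, len, INHERIT_SHARE) == -1)
 *			warn("minherit");
 *	}
 *
 * INHERIT_SHARE, INHERIT_COPY and INHERIT_NONE correspond to the
 * VM_INHERIT_* values checked below.)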
*/ int vm_map_inherit(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_inherit_t new_inheritance) { vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; vm_map_simplify_entry(map, entry); entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * Implement the semantics of mlock */ int vm_map_user_pageable(map, start, end, new_pageable) vm_map_t map; vm_offset_t start; vm_offset_t end; boolean_t new_pageable; { vm_map_entry_t entry; vm_map_entry_t start_entry; vm_offset_t estart; int rv; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } if (new_pageable) { entry = start_entry; vm_map_clip_start(map, entry, start); /* * Now decrement the wiring count for each region. If a region * becomes completely unwired, unwire its physical pages and * mappings. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->eflags & MAP_ENTRY_USER_WIRED) { vm_map_clip_end(map, entry, end); entry->eflags &= ~MAP_ENTRY_USER_WIRED; entry->wired_count--; if (entry->wired_count == 0) vm_fault_unwire(map, entry->start, entry->end); } vm_map_simplify_entry(map,entry); entry = entry->next; } } else { entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { if (entry->eflags & MAP_ENTRY_USER_WIRED) { entry = entry->next; continue; } if (entry->wired_count != 0) { entry->wired_count++; entry->eflags |= MAP_ENTRY_USER_WIRED; entry = entry->next; continue; } /* Here on entry being newly wired */ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) { vm_object_shadow(&entry->object.vm_object, &entry->offset, atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; - } else if (entry->object.vm_object == NULL) { + } else if (entry->object.vm_object == NULL && + !map->system_map) { entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); entry->wired_count++; entry->eflags |= MAP_ENTRY_USER_WIRED; estart = entry->start; /* First we need to allow map modifications */ vm_map_set_recursive(map); vm_map_lock_downgrade(map); map->timestamp++; rv = vm_fault_user_wire(map, entry->start, entry->end); if (rv) { entry->wired_count--; entry->eflags &= ~MAP_ENTRY_USER_WIRED; vm_map_clear_recursive(map); vm_map_unlock(map); (void) vm_map_user_pageable(map, start, entry->start, TRUE); return rv; } vm_map_clear_recursive(map); if (vm_map_lock_upgrade(map)) { vm_map_lock(map); if (vm_map_lookup_entry(map, estart, &entry) == FALSE) { vm_map_unlock(map); (void) vm_map_user_pageable(map, start, estart, TRUE); return (KERN_INVALID_ADDRESS); } } vm_map_simplify_entry(map,entry); } } map->timestamp++; vm_map_unlock(map); return KERN_SUCCESS; } /* * vm_map_pageable: * * Sets the pageability of the specified address * range in the target map. 
Regions specified * as not pageable require locked-down physical * memory and physical page maps. * * The map must not be locked, but a reference * must remain to the map throughout the call. */ int vm_map_pageable(map, start, end, new_pageable) vm_map_t map; vm_offset_t start; vm_offset_t end; boolean_t new_pageable; { vm_map_entry_t entry; vm_map_entry_t start_entry; vm_offset_t failed = 0; int rv; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); /* * Only one pageability change may take place at one time, since * vm_fault assumes it will be called only once for each * wiring/unwiring. Therefore, we have to make sure we're actually * changing the pageability for the entire region. We do so before * making any changes. */ if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } entry = start_entry; /* * Actions are rather different for wiring and unwiring, so we have * two separate cases. */ if (new_pageable) { vm_map_clip_start(map, entry, start); /* * Unwiring. First ensure that the range to be unwired is * really wired down and that there are no holes. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0 || (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Now decrement the wiring count for each region. If a region * becomes completely unwired, unwire its physical pages and * mappings. */ entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->wired_count--; if (entry->wired_count == 0) vm_fault_unwire(map, entry->start, entry->end); vm_map_simplify_entry(map, entry); entry = entry->next; } } else { /* * Wiring. We must do this in two passes: * * 1. Holding the write lock, we create any shadow or zero-fill * objects that need to be created. Then we clip each map * entry to the region to be wired and increment its wiring * count. We create objects before clipping the map entries * to avoid object proliferation. * * 2. We downgrade to a read lock, and call vm_fault_wire to * fault in the pages for any newly wired area (wired_count is * 1). * * Downgrading to a read lock for vm_fault_wire avoids a possible * deadlock with another process that may have faulted on one * of the pages to be wired (it would mark the page busy, * blocking us, then in turn block on the map lock that we * hold). Because of problems in the recursive lock package, * we cannot upgrade to a write lock in vm_map_lookup. Thus, * any actions that require the write lock must be done * beforehand. Because we keep the read lock on the map, the * copy-on-write status of the entries we modify here cannot * change. */ /* * Pass 1. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0) { /* * Perform actions of vm_map_lookup that need * the write lock on the map: create a shadow * object for a copy-on-write region, or an * object for a zero-fill region. * * We don't have to do this for entries that * point to sub maps, because we won't * hold the lock on the sub map. 
*/ if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { int copyflag = entry->eflags & MAP_ENTRY_NEEDS_COPY; if (copyflag && ((entry->protection & VM_PROT_WRITE) != 0)) { vm_object_shadow(&entry->object.vm_object, &entry->offset, atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; - } else if (entry->object.vm_object == NULL) { + } else if (entry->object.vm_object == NULL && + !map->system_map) { entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } } } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); entry->wired_count++; /* * Check for holes */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { /* * Found one. Object creation actions do not * need to be undone, but the wired counts * need to be restored. */ while (entry != &map->header && entry->end > start) { entry->wired_count--; entry = entry->prev; } vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Pass 2. */ /* * HACK HACK HACK HACK * * If we are wiring in the kernel map or a submap of it, * unlock the map to avoid deadlocks. We trust that the * kernel is well-behaved, and therefore will not do * anything destructive to this region of the map while * we have it unlocked. We cannot trust user processes * to do the same. * * HACK HACK HACK HACK */ if (vm_map_pmap(map) == kernel_pmap) { vm_map_unlock(map); /* trust me ... */ } else { vm_map_lock_downgrade(map); } rv = 0; entry = start_entry; while (entry != &map->header && entry->start < end) { /* * If vm_fault_wire fails for any page we need to undo * what has been done. We decrement the wiring count * for those pages which have not yet been wired (now) * and unwire those that have (later). * * XXX this violates the locking protocol on the map, * needs to be fixed. */ if (rv) entry->wired_count--; else if (entry->wired_count == 1) { rv = vm_fault_wire(map, entry->start, entry->end); if (rv) { failed = entry->start; entry->wired_count--; } } entry = entry->next; } if (vm_map_pmap(map) == kernel_pmap) { vm_map_lock(map); } if (rv) { vm_map_unlock(map); (void) vm_map_pageable(map, start, failed, TRUE); return (rv); } vm_map_simplify_entry(map, start_entry); } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_clean * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_clean(map, start, end, syncio, invalidate) vm_map_t map; vm_offset_t start; vm_offset_t end; boolean_t syncio; boolean_t invalidate; { vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } /* * Make a first pass to check for holes. 
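 *
 * (Cleaning of this kind is what user space requests through msync(2);
 * an illustrative caller -- user-level code, not part of this file,
 * with a made-up function name.  Roughly, MS_SYNC asks for synchronous
 * writes and MS_INVALIDATE asks for invalidation:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	void
 *	flush_region(void *base, size_t len)
 *	{
 *		if (msync(base, len, MS_SYNC | MS_INVALIDATE) == -1)
 *			warn("msync");
 *	}
 * )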
*/ for (current = entry; current->start < end; current = current->next) { if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } if (invalidate) pmap_remove(vm_map_pmap(map), start, end); /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.sub_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } /* * Note that there is absolutely no sense in writing out * anonymous objects, so we track down the vnode object * to write out. * We invalidate (remove) all pages from the address space * anyway, for semantic correctness. */ while (object->backing_object) { object = object->backing_object; offset += object->backing_object_offset; if (object->size < OFF_TO_IDX( offset + size)) size = IDX_TO_OFF(object->size) - offset; } if (object && (object->type == OBJT_VNODE) && (current->protection & VM_PROT_WRITE)) { /* * Flush pages if writing is allowed, invalidate them * if invalidation requested. Pages undergoing I/O * will be ignored by vm_object_page_remove(). * * We cannot lock the vnode and then wait for paging * to complete without deadlocking against vm_fault. * Instead we simply call vm_object_page_remove() and * allow it to block internally on a page-by-page * basis when it encounters pages undergoing async * I/O. */ int flags; vm_object_reference(object); vn_lock(object->handle, LK_EXCLUSIVE | LK_RETRY, curproc); flags = (syncio || invalidate) ? OBJPC_SYNC : 0; flags |= invalidate ? OBJPC_INVAL : 0; vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), flags); if (invalidate) { /*vm_object_pip_wait(object, "objmcl");*/ vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size + PAGE_MASK), FALSE); } VOP_UNLOCK(object->handle, 0, curproc); vm_object_deallocate(object); } start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] */ static void vm_map_entry_unwire(map, entry) vm_map_t map; vm_map_entry_t entry; { vm_fault_unwire(map, entry->start, entry->end); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(map, entry) vm_map_t map; vm_map_entry_t entry; { vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { vm_object_deallocate(entry->object.vm_object); } vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. 
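 *
 * (vm_map_remove, below, is the exported form; from user space the
 * equivalent operation on a mapping is munmap(2).  Illustrative
 * user-level sketch, not part of this file, with a made-up function
 * name:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <err.h>
 *
 *	void
 *	drop_mapping(void *base, size_t len)
 *	{
 *		if (munmap(base, len) == -1)
 *			warn("munmap");
 *	}
 * )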
*/ int vm_map_delete(map, start, end) vm_map_t map; vm_offset_t start; vm_offset_t end; { vm_object_t object; vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); /* * Fix the lookup hint now, rather than each time though the * loop. */ SAVE_HINT(map, entry->prev); } /* * Save the free space hint */ if (entry == &map->header) { map->first_free = &map->header; } else if (map->first_free->start >= start) { map->first_free = entry->prev; } /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; vm_offset_t s, e; vm_pindex_t offidxstart, offidxend, count; vm_map_clip_end(map, entry, end); s = entry->start; e = entry->end; next = entry->next; offidxstart = OFF_TO_IDX(entry->offset); count = OFF_TO_IDX(e - s); object = entry->object.vm_object; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ if (entry->wired_count != 0) { vm_map_entry_unwire(map, entry); } offidxend = offidxstart + count; if ((object == kernel_object) || (object == kmem_object)) { vm_object_page_remove(object, offidxstart, offidxend, FALSE); } else { pmap_remove(map->pmap, s, e); if (object != NULL && object->ref_count != 1 && (object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_collapse(object); vm_object_page_remove(object, offidxstart, offidxend, FALSE); if (object->type == OBJT_SWAP) { swap_pager_freespace(object, offidxstart, count); } if (offidxend >= object->size && offidxstart < object->size) { object->size = offidxstart; } } } /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(map, start, end) vm_map_t map; vm_offset_t start; vm_offset_t end; { int result, s = 0; if (map == kmem_map || map == mb_map) s = splvm(); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); if (map == kmem_map || map == mb_map) splx(s); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified * privilege on the entire address region given. * The entire region must be allocated. */ boolean_t vm_map_check_protection(vm_map_t map, vm_offset_t start, vm_offset_t end, vm_prot_t protection) { vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) { return (FALSE); } entry = tmp_entry; while (start < end) { if (entry == &map->header) { return (FALSE); } /* * No holes allowed! */ if (start < entry->start) { return (FALSE); } /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) { return (FALSE); } /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * Split the pages in a map entry into a new object. This affords * easier removal of unused pages, and keeps object inheritance from * being a negative impact on memory usage. 
*/ static void vm_map_split(entry) vm_map_entry_t entry; { vm_page_t m; vm_object_t orig_object, new_object, source; vm_offset_t s, e; vm_pindex_t offidxstart, offidxend, idx; vm_size_t size; vm_ooffset_t offset; orig_object = entry->object.vm_object; if (orig_object->type != OBJT_DEFAULT && orig_object->type != OBJT_SWAP) return; if (orig_object->ref_count <= 1) return; offset = entry->offset; s = entry->start; e = entry->end; offidxstart = OFF_TO_IDX(offset); offidxend = offidxstart + OFF_TO_IDX(e - s); size = offidxend - offidxstart; new_object = vm_pager_allocate(orig_object->type, NULL, IDX_TO_OFF(size), VM_PROT_ALL, 0LL); if (new_object == NULL) return; source = orig_object->backing_object; if (source != NULL) { vm_object_reference(source); /* Referenced by new_object */ TAILQ_INSERT_TAIL(&source->shadow_head, new_object, shadow_list); vm_object_clear_flag(source, OBJ_ONEMAPPING); new_object->backing_object_offset = orig_object->backing_object_offset + IDX_TO_OFF(offidxstart); new_object->backing_object = source; source->shadow_count++; source->generation++; } for (idx = 0; idx < size; idx++) { vm_page_t m; retry: m = vm_page_lookup(orig_object, offidxstart + idx); if (m == NULL) continue; /* * We must wait for pending I/O to complete before we can * rename the page. * * We do not have to VM_PROT_NONE the page as mappings should * not be changed by this operation. */ if (vm_page_sleep_busy(m, TRUE, "spltwt")) goto retry; vm_page_busy(m); vm_page_rename(m, new_object, idx); /* page automatically made dirty by rename and cache handled */ vm_page_busy(m); } if (orig_object->type == OBJT_SWAP) { vm_object_pip_add(orig_object, 1); /* * copy orig_object pages into new_object * and destroy unneeded pages in * shadow object. */ swap_pager_copy(orig_object, new_object, offidxstart, 0); vm_object_pip_wakeup(orig_object); } for (idx = 0; idx < size; idx++) { m = vm_page_lookup(new_object, idx); if (m) { vm_page_wakeup(m); } } entry->object.vm_object = new_object; entry->offset = 0LL; vm_object_deallocate(orig_object); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry(src_map, dst_map, src_entry, dst_entry) vm_map_t src_map, dst_map; vm_map_entry_t src_entry, dst_entry; { vm_object_t src_object; if ((dst_entry->eflags|src_entry->eflags) & MAP_ENTRY_IS_SUB_MAP) return; if (src_entry->wired_count == 0) { /* * If the source entry is marked needs_copy, it is already * write-protected. */ if ((src_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } /* * Make a copy of the object. 
*/ if ((src_object = src_entry->object.vm_object) != NULL) { if ((src_object->handle == NULL) && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) { vm_object_collapse(src_object); if ((src_object->flags & (OBJ_NOSPLIT|OBJ_ONEMAPPING)) == OBJ_ONEMAPPING) { vm_map_split(src_entry); src_object = src_entry->object.vm_object; } } vm_object_reference(src_object); vm_object_clear_flag(src_object, OBJ_ONEMAPPING); dst_entry->object.vm_object = src_object; src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY); dst_entry->offset = src_entry->offset; } else { dst_entry->object.vm_object = NULL; dst_entry->offset = 0; } pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. * Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * The source map must not be locked. */ struct vmspace * vmspace_fork(vm1) struct vmspace *vm1; { struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; vm_object_t object; vm_map_lock(old_map); vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset); bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); new_map = &vm2->vm_map; /* XXX */ new_map->timestamp = 1; old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, creating the shared object if necessary. */ object = old_entry->object.vm_object; if (object == NULL) { object = vm_object_allocate(OBJT_DEFAULT, atop(old_entry->end - old_entry->start)); old_entry->object.vm_object = object; old_entry->offset = (vm_offset_t) 0; } /* * Add the reference before calling vm_object_shadow * to insure that a shadow object is created. */ vm_object_reference(object); if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) { vm_object_shadow(&old_entry->object.vm_object, &old_entry->offset, atop(old_entry->end - old_entry->start)); old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; object = old_entry->object.vm_object; } vm_object_clear_flag(object, OBJ_ONEMAPPING); /* * Clone the entry, referencing the shared object. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. 
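 *
 * (This is the copy-on-write path taken for ordinary private memory
 * across fork(2): parent and child share pages read-only until one of
 * them writes.  Illustrative user-level check that a child's write
 * stays private -- not part of this file, names are made up:
 *
 *	#include <sys/types.h>
 *	#include <sys/wait.h>
 *	#include <unistd.h>
 *
 *	static int value = 1;
 *
 *	void
 *	demo_private_copy(void)
 *	{
 *		pid_t pid = fork();
 *
 *		if (pid == 0) {
 *			value = 2;		// touches the child's copy only
 *			_exit(0);
 *		}
 *		waitpid(pid, NULL, 0);
 *		// the parent still sees value == 1 here
 *	}
 * )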
*/ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->eflags &= ~MAP_ENTRY_USER_WIRED; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; vm_map_entry_link(new_map, new_map->header.prev, new_entry); vm_map_copy_entry(old_map, new_map, old_entry, new_entry); break; } old_entry = old_entry->next; } new_map->size = old_map->size; vm_map_unlock(old_map); return (vm2); } int vm_map_stack (vm_map_t map, vm_offset_t addrbos, vm_size_t max_ssize, vm_prot_t prot, vm_prot_t max, int cow) { vm_map_entry_t prev_entry; vm_map_entry_t new_stack_entry; vm_size_t init_ssize; int rv; if (VM_MIN_ADDRESS > 0 && addrbos < VM_MIN_ADDRESS) return (KERN_NO_SPACE); if (max_ssize < SGROWSIZ) init_ssize = max_ssize; else init_ssize = SGROWSIZ; vm_map_lock(map); /* If addr is already mapped, no go */ if (vm_map_lookup_entry(map, addrbos, &prev_entry)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* If we can't accommodate max_ssize in the current mapping, * no go. However, we need to be aware that subsequent user * mappings might map into the space we have reserved for * stack, and currently this space is not protected. * * Hopefully we will at least detect this condition * when we try to grow the stack. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < addrbos + max_ssize)) { vm_map_unlock(map); return (KERN_NO_SPACE); } /* We initially map a stack of only init_ssize. We will * grow as needed later. Since this is to be a grow * down stack, we map at the top of the range. * * Note: we would normally expect prot and max to be * VM_PROT_ALL, and cow to be 0. Possibly we should * eliminate these as input parameters, and just * pass these values here in the insert call. */ rv = vm_map_insert(map, NULL, 0, addrbos + max_ssize - init_ssize, addrbos + max_ssize, prot, max, cow); /* Now set the avail_ssize amount */ if (rv == KERN_SUCCESS){ if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, addrbos + max_ssize - init_ssize); new_stack_entry = prev_entry->next; if (new_stack_entry->end != addrbos + max_ssize || new_stack_entry->start != addrbos + max_ssize - init_ssize) panic ("Bad entry start/end for new stack entry"); else new_stack_entry->avail_ssize = max_ssize - init_ssize; } vm_map_unlock(map); return (rv); } /* Attempts to grow a vm stack entry. Returns KERN_SUCCESS if the * desired address is already mapped, or if we successfully grow * the stack. Also returns KERN_SUCCESS if addr is outside the * stack range (this is strange, but preserves compatibility with * the grow function in vm_machdep.c). */ int vm_map_growstack (struct proc *p, vm_offset_t addr) { vm_map_entry_t prev_entry; vm_map_entry_t stack_entry; vm_map_entry_t new_stack_entry; struct vmspace *vm = p->p_vmspace; vm_map_t map = &vm->vm_map; vm_offset_t end; int grow_amount; int rv; int is_procstack; Retry: vm_map_lock_read(map); /* If addr is already in the entry range, no need to grow.*/ if (vm_map_lookup_entry(map, addr, &prev_entry)) { vm_map_unlock_read(map); return (KERN_SUCCESS); } if ((stack_entry = prev_entry->next) == &map->header) { vm_map_unlock_read(map); return (KERN_SUCCESS); } if (prev_entry == &map->header) end = stack_entry->start - stack_entry->avail_ssize; else end = prev_entry->end; /* This next test mimics the old grow function in vm_machdep.c. * It really doesn't quite make sense, but we do it anyway * for compatibility. * * If this is not a growable stack, return success. This signals the * caller to proceed as it normally would with normal vm.
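 *
 * (Grow-down stack entries of this kind are created by vm_map_stack
 * above, e.g. for mmap(2) with MAP_STACK.  Illustrative user-level
 * sketch, assuming MAP_STACK is available and using a made-up function
 * name -- not part of this file:
 *
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *
 *	void *
 *	alloc_stack(size_t max_size)
 *	{
 *		return (mmap(NULL, max_size, PROT_READ | PROT_WRITE,
 *		    MAP_ANON | MAP_STACK, -1, 0));
 *	}
 * )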
*/ if (stack_entry->avail_ssize < 1 || addr >= stack_entry->start || addr < stack_entry->start - stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_SUCCESS); } /* Find the minimum grow amount */ grow_amount = roundup (stack_entry->start - addr, PAGE_SIZE); if (grow_amount > stack_entry->avail_ssize) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* If there is no longer enough space between the entries, * no go; adjust the available space. Note: this * should only happen if the user has mapped into the * stack area after the stack was created, and is * probably an error. * * This also effectively destroys any guard page the user * might have intended by limiting the stack size. */ if (grow_amount > stack_entry->start - end) { if (vm_map_lock_upgrade(map)) goto Retry; stack_entry->avail_ssize = stack_entry->start - end; vm_map_unlock(map); return (KERN_NO_SPACE); } is_procstack = addr >= (vm_offset_t)vm->vm_maxsaddr; /* If this is the main process stack, see if we're over the * stack limit. */ if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > p->p_rlimit[RLIMIT_STACK].rlim_cur)) { vm_map_unlock_read(map); return (KERN_NO_SPACE); } /* Round up the grow amount modulo SGROWSIZ */ grow_amount = roundup (grow_amount, SGROWSIZ); if (grow_amount > stack_entry->avail_ssize) { grow_amount = stack_entry->avail_ssize; } if (is_procstack && (ctob(vm->vm_ssize) + grow_amount > p->p_rlimit[RLIMIT_STACK].rlim_cur)) { grow_amount = p->p_rlimit[RLIMIT_STACK].rlim_cur - ctob(vm->vm_ssize); } if (vm_map_lock_upgrade(map)) goto Retry; /* Get the preliminary new entry start value */ addr = stack_entry->start - grow_amount; /* If this puts us into the previous entry, cut back our growth * to the available space. Also, see the note above. */ if (addr < end) { stack_entry->avail_ssize = stack_entry->start - end; addr = end; } rv = vm_map_insert(map, NULL, 0, addr, stack_entry->start, VM_PROT_ALL, VM_PROT_ALL, 0); /* Adjust the available stack space by the amount we grew. */ if (rv == KERN_SUCCESS) { if (prev_entry != &map->header) vm_map_clip_end(map, prev_entry, addr); new_stack_entry = prev_entry->next; if (new_stack_entry->end != stack_entry->start || new_stack_entry->start != addr) panic ("Bad stack grow start/end in new stack entry"); else { new_stack_entry->avail_ssize = stack_entry->avail_ssize - (new_stack_entry->end - new_stack_entry->start); if (is_procstack) vm->vm_ssize += btoc(new_stack_entry->end - new_stack_entry->start); } } vm_map_unlock(map); return (rv); } /* * Unshare the specified VM space for exec. If other processes are * mapped to it, then create a new one. The new vmspace is null. */ void vmspace_exec(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; vm_map_t map = &p->p_vmspace->vm_map; newvmspace = vmspace_alloc(map->min_offset, map->max_offset); bcopy(&oldvmspace->vm_startcopy, &newvmspace->vm_startcopy, (caddr_t) (newvmspace + 1) - (caddr_t) &newvmspace->vm_startcopy); /* * This code is written like this for prototype purposes. The * goal is to avoid running down the vmspace here, but let the * other processes that are still using the vmspace finally * run it down. Even though there is little or no chance of blocking * here, it is a good idea to keep this form for future mods. */ vmspace_free(oldvmspace); p->p_vmspace = newvmspace; pmap_pinit2(vmspace_pmap(newvmspace)); if (p == curproc) pmap_activate(p); } /* * Unshare the specified VM space for forcing COW.
This * is called by rfork, for the (RFMEM|RFPROC) == 0 case. */ void vmspace_unshare(struct proc *p) { struct vmspace *oldvmspace = p->p_vmspace; struct vmspace *newvmspace; if (oldvmspace->vm_refcnt == 1) return; newvmspace = vmspace_fork(oldvmspace); vmspace_free(oldvmspace); p->p_vmspace = newvmspace; pmap_pinit2(vmspace_pmap(newvmspace)); if (p == curproc) pmap_activate(p); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(vm_map_t *var_map, /* IN/OUT */ vm_offset_t vaddr, vm_prot_t fault_typea, vm_map_entry_t *out_entry, /* OUT */ vm_object_t *object, /* OUT */ vm_pindex_t *pindex, /* OUT */ vm_prot_t *out_prot, /* OUT */ boolean_t *wired) /* OUT */ { vm_map_entry_t entry; vm_map_t map = *var_map; vm_prot_t prot; vm_prot_t fault_type = fault_typea; RetryLookup:; /* * Lookup the faulting address. */ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return(why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->hint; *out_entry = entry; if ((entry == &map->header) || (vaddr < entry->start) || (vaddr >= entry->end)) { vm_map_entry_t tmp_entry; /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) RETURN(KERN_INVALID_ADDRESS); entry = tmp_entry; *out_entry = entry; } /* * Handle submaps. */ if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. * Note the special case for MAP_ENTRY_COW * pages with an override. This is to implement a forced * COW for debuggers. */ if (fault_type & VM_PROT_OVERRIDE_WRITE) prot = entry->max_protection; else prot = entry->protection; fault_type &= (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE); if ((fault_type & prot) != fault_type) { RETURN(KERN_PROTECTION_FAILURE); } if ((entry->eflags & MAP_ENTRY_USER_WIRED) && (entry->eflags & MAP_ENTRY_COW) && (fault_type & VM_PROT_WRITE) && (fault_typea & VM_PROT_OVERRIDE_WRITE) == 0) { RETURN(KERN_PROTECTION_FAILURE); } /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If the entry was copy-on-write, we either ... */ if (entry->eflags & MAP_ENTRY_NEEDS_COPY) { /* * If we want to write the page, we may as well handle that * now since we've got the map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the map to the new * object. 
*/ if (vm_map_lock_upgrade(map)) goto RetryLookup; vm_object_shadow( &entry->object.vm_object, &entry->offset, atop(entry->end - entry->start)); entry->eflags &= ~MAP_ENTRY_NEEDS_COPY; vm_map_lock_downgrade(map); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= ~VM_PROT_WRITE; } } /* * Create an object if necessary. */ - if (entry->object.vm_object == NULL) { + if (entry->object.vm_object == NULL && + !map->system_map) { if (vm_map_lock_upgrade(map)) goto RetryLookup; entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, atop(entry->end - entry->start)); entry->offset = 0; vm_map_lock_downgrade(map); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((vaddr - entry->start) + entry->offset); *object = entry->object.vm_object; /* * Return whether this is the only map sharing this data. */ *out_prot = prot; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(map, entry) vm_map_t map; vm_map_entry_t entry; { /* * Unlock the main-level map */ vm_map_unlock_read(map); } /* * Implement uiomove with VM operations. This handles (and collateral changes) * support every combination of source object modification, and COW type * operations. */ int vm_uiomove(mapa, srcobject, cp, cnta, uaddra, npages) vm_map_t mapa; vm_object_t srcobject; off_t cp; int cnta; vm_offset_t uaddra; int *npages; { vm_map_t map; vm_object_t first_object, oldobject, object; vm_map_entry_t entry; vm_prot_t prot; boolean_t wired; int tcnt, rv; vm_offset_t uaddr, start, end, tend; vm_pindex_t first_pindex, osize, oindex; off_t ooffset; int cnt; if (npages) *npages = 0; cnt = cnta; uaddr = uaddra; while (cnt > 0) { map = mapa; if ((vm_map_lookup(&map, uaddr, VM_PROT_READ, &entry, &first_object, &first_pindex, &prot, &wired)) != KERN_SUCCESS) { return EFAULT; } vm_map_clip_start(map, entry, uaddr); tcnt = cnt; tend = uaddr + tcnt; if (tend > entry->end) { tcnt = entry->end - uaddr; tend = entry->end; } vm_map_clip_end(map, entry, tend); start = entry->start; end = entry->end; osize = atop(tcnt); oindex = OFF_TO_IDX(cp); if (npages) { vm_pindex_t idx; for (idx = 0; idx < osize; idx++) { vm_page_t m; if ((m = vm_page_lookup(srcobject, oindex + idx)) == NULL) { vm_map_lookup_done(map, entry); return 0; } /* * disallow busy or invalid pages, but allow * m->busy pages if they are entirely valid. */ if ((m->flags & PG_BUSY) || ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL)) { vm_map_lookup_done(map, entry); return 0; } } } /* * If we are changing an existing map entry, just redirect * the object, and change mappings. 
*/ if ((first_object->type == OBJT_VNODE) && ((oldobject = entry->object.vm_object) == first_object)) { if ((entry->offset != cp) || (oldobject != srcobject)) { /* * Remove old window into the file */ pmap_remove (map->pmap, uaddr, tend); /* * Force copy on write for mmaped regions */ vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize); /* * Point the object appropriately */ if (oldobject != srcobject) { /* * Set the object optimization hint flag */ vm_object_set_flag(srcobject, OBJ_OPT); vm_object_reference(srcobject); entry->object.vm_object = srcobject; if (oldobject) { vm_object_deallocate(oldobject); } } entry->offset = cp; map->timestamp++; } else { pmap_remove (map->pmap, uaddr, tend); } } else if ((first_object->ref_count == 1) && (first_object->size == osize) && ((first_object->type == OBJT_DEFAULT) || (first_object->type == OBJT_SWAP)) ) { oldobject = first_object->backing_object; if ((first_object->backing_object_offset != cp) || (oldobject != srcobject)) { /* * Remove old window into the file */ pmap_remove (map->pmap, uaddr, tend); /* * Remove unneeded old pages */ vm_object_page_remove(first_object, 0, 0, 0); /* * Invalidate swap space */ if (first_object->type == OBJT_SWAP) { swap_pager_freespace(first_object, 0, first_object->size); } /* * Force copy on write for mmaped regions */ vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize); /* * Point the object appropriately */ if (oldobject != srcobject) { /* * Set the object optimization hint flag */ vm_object_set_flag(srcobject, OBJ_OPT); vm_object_reference(srcobject); if (oldobject) { TAILQ_REMOVE(&oldobject->shadow_head, first_object, shadow_list); oldobject->shadow_count--; /* XXX bump generation? */ vm_object_deallocate(oldobject); } TAILQ_INSERT_TAIL(&srcobject->shadow_head, first_object, shadow_list); srcobject->shadow_count++; /* XXX bump generation? */ first_object->backing_object = srcobject; } first_object->backing_object_offset = cp; map->timestamp++; } else { pmap_remove (map->pmap, uaddr, tend); } /* * Otherwise, we have to do a logical mmap. */ } else { vm_object_set_flag(srcobject, OBJ_OPT); vm_object_reference(srcobject); pmap_remove (map->pmap, uaddr, tend); vm_object_pmap_copy_1 (srcobject, oindex, oindex + osize); vm_map_lock_upgrade(map); if (entry == &map->header) { map->first_free = &map->header; } else if (map->first_free->start >= start) { map->first_free = entry->prev; } SAVE_HINT(map, entry->prev); vm_map_entry_delete(map, entry); object = srcobject; ooffset = cp; rv = vm_map_insert(map, object, ooffset, start, tend, VM_PROT_ALL, VM_PROT_ALL, MAP_COPY_ON_WRITE); if (rv != KERN_SUCCESS) panic("vm_uiomove: could not insert new entry: %d", rv); } /* * Map the window directly, if it is already in memory */ pmap_object_init_pt(map->pmap, uaddr, srcobject, oindex, tcnt, 0); map->timestamp++; vm_map_unlock(map); cnt -= tcnt; uaddr += tcnt; cp += tcnt; if (npages) *npages += osize; } return 0; } /* * Performs the copy_on_write operations necessary to allow the virtual copies * into user space to work. This has to be called for write(2) system calls * from other processes, file unlinking, and file size shrinkage. 
*/ void vm_freeze_copyopts(object, froma, toa) vm_object_t object; vm_pindex_t froma, toa; { int rv; vm_object_t robject; vm_pindex_t idx; if ((object == NULL) || ((object->flags & OBJ_OPT) == 0)) return; if (object->shadow_count > object->ref_count) panic("vm_freeze_copyopts: sc > rc"); while((robject = TAILQ_FIRST(&object->shadow_head)) != NULL) { vm_pindex_t bo_pindex; vm_page_t m_in, m_out; bo_pindex = OFF_TO_IDX(robject->backing_object_offset); vm_object_reference(robject); vm_object_pip_wait(robject, "objfrz"); if (robject->ref_count == 1) { vm_object_deallocate(robject); continue; } vm_object_pip_add(robject, 1); for (idx = 0; idx < robject->size; idx++) { m_out = vm_page_grab(robject, idx, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m_out->valid == 0) { m_in = vm_page_grab(object, bo_pindex + idx, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if (m_in->valid == 0) { rv = vm_pager_get_pages(object, &m_in, 1, 0); if (rv != VM_PAGER_OK) { printf("vm_freeze_copyopts: cannot read page from file: %lx\n", (long)m_in->pindex); continue; } vm_page_deactivate(m_in); } vm_page_protect(m_in, VM_PROT_NONE); pmap_copy_page(VM_PAGE_TO_PHYS(m_in), VM_PAGE_TO_PHYS(m_out)); m_out->valid = m_in->valid; vm_page_dirty(m_out); vm_page_activate(m_out); vm_page_wakeup(m_in); } vm_page_wakeup(m_out); } object->shadow_count--; object->ref_count--; TAILQ_REMOVE(&object->shadow_head, robject, shadow_list); robject->backing_object = NULL; robject->backing_object_offset = 0; vm_object_pip_wakeup(robject); vm_object_deallocate(robject); } vm_object_clear_flag(object, OBJ_OPT); } #include "opt_ddb.h" #ifdef DDB #include #include /* * vm_map_print: [ debug ] */ DB_SHOW_COMMAND(map, vm_map_print) { static int nlines; /* XXX convert args. */ vm_map_t map = (vm_map_t)addr; boolean_t full = have_addr; vm_map_entry_t entry; db_iprintf("Task map %p: pmap=%p, nentries=%d, version=%u\n", (void *)map, (void *)map->pmap, map->nentries, map->timestamp); nlines++; if (!full && db_indent) return; db_indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { db_iprintf("map entry %p: start=%p, end=%p\n", (void *)entry, (void *)entry->start, (void *)entry->end); nlines++; { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; db_iprintf(" prot=%x/%x/%s", entry->protection, entry->max_protection, inheritance_name[(int)(unsigned char)entry->inheritance]); if (entry->wired_count != 0) db_printf(", wired"); } if (entry->eflags & MAP_ENTRY_IS_SUB_MAP) { /* XXX no %qd in kernel. Truncate entry->offset. */ db_printf(", share=%p, offset=0x%lx\n", (void *)entry->object.sub_map, (long)entry->offset); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.sub_map != entry->object.sub_map)) { db_indent += 2; vm_map_print((db_expr_t)(intptr_t) entry->object.sub_map, full, 0, (char *)0); db_indent -= 2; } } else { /* XXX no %qd in kernel. Truncate entry->offset. */ db_printf(", object=%p, offset=0x%lx", (void *)entry->object.vm_object, (long)entry->offset); if (entry->eflags & MAP_ENTRY_COW) db_printf(", copy (%s)", (entry->eflags & MAP_ENTRY_NEEDS_COPY) ? 
"needed" : "done"); db_printf("\n"); nlines++; if ((entry->prev == &map->header) || (entry->prev->object.vm_object != entry->object.vm_object)) { db_indent += 2; vm_object_print((db_expr_t)(intptr_t) entry->object.vm_object, full, 0, (char *)0); nlines += 4; db_indent -= 2; } } } db_indent -= 2; if (db_indent == 0) nlines = 0; } DB_SHOW_COMMAND(procvm, procvm) { struct proc *p; if (have_addr) { p = (struct proc *) addr; } else { p = curproc; } db_printf("p = %p, vmspace = %p, map = %p, pmap = %p\n", (void *)p, (void *)p->p_vmspace, (void *)&p->p_vmspace->vm_map, (void *)vmspace_pmap(p->p_vmspace)); vm_map_print((db_expr_t)(intptr_t)&p->p_vmspace->vm_map, 1, 0, NULL); } #endif /* DDB */