diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c index 9c9cacd08b35..12aa8c708c95 100644 --- a/sys/amd64/amd64/machdep.c +++ b/sys/amd64/amd64/machdep.c @@ -1,2051 +1,2053 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.348 1999/07/02 04:33:05 peter Exp $ + * $Id: machdep.c,v 1.349 1999/07/02 20:33:32 msmith Exp $ */ #include "apm.h" #include "ether.h" #include "npx.h" #include "opt_atalk.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #include #endif #ifdef PERFMON #include #endif #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); 
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, ""); int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); static int sysctl_hw_availpages SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf SYSCTL_HANDLER_ARGS { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). 
*/ error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr, msgbufp->msg_size-msgbufp->msg_bufr,req); if(error) return(error); if(msgbufp->msg_bufr>0) { error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr, msgbufp->msg_bufr,req); } return(error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer"); static int msgbuf_clear; static int sysctl_machdep_msgbuf_clear SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { /* Clear the buffer and reset write pointer */ bzero(msgbufp->msg_ptr,msgbufp->msg_size); msgbufp->msg_bufr=msgbufp->msg_bufx=0; msgbuf_clear=0; } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW, &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I", "Clear kernel message buffer"); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. 
*/ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); + if( physmem > 65536) + nbuf += (physmem - 65536) / 20; } - nswbuf = max(min(nbuf/4, 64), 16); + nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. 
*/ { vm_offset_t mb_map_size; int xclusters; /* Allow override of NMBCLUSTERS from the kernel environment */ if (getenv_int("kern.ipc.nmbclusters", &xclusters) && xclusters > nmbclusters) nmbclusters = xclusters; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i]); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } void netisr_sysinit(data) void *data; { const struct netisrtab *nit; nit = (const struct netisrtab *)data; register_netisr(nit->nit_num, nit->nit_isr); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. 
*/
/*
 * sendsig(catcher, sig, mask, code):
 *
 * Deliver a signal to the current process: build a signal frame
 * (struct sigframe) on the user stack -- or on the alternate signal
 * stack if one is configured and requested for this signal -- then
 * redirect the trapframe so that the process resumes in user mode in
 * the signal trampoline, which invokes `catcher' and eventually calls
 * sigreturn() below.
 *
 *	catcher	- user-mode handler address
 *	sig	- signal number (may be translated through the ABI's
 *		  sv_sigtbl for emulated binaries, see below)
 *	mask	- signal mask for sigreturn() to restore
 *	code	- machine-dependent code passed to the handler
 */
void
sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig, mask;
	u_long code;
{
	register struct proc *p = curproc;
	register struct trapframe *regs;
	register struct sigframe *fp;	/* user address of the frame */
	struct sigframe sf;		/* kernel copy, copyout()ed below */
	struct sigacts *psp = p->p_sigacts;
	int oonstack;			/* already on the sigstack? */

	regs = p->p_md.md_regs;
	oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK;
	/*
	 * Allocate and validate space for the signal handler context.
	 */
	if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack &&
	    (psp->ps_sigonstack & sigmask(sig))) {
		/* place the frame at the top of the alternate stack */
		fp = (struct sigframe *)(psp->ps_sigstk.ss_sp +
		    psp->ps_sigstk.ss_size - sizeof(struct sigframe));
		psp->ps_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		/* push the frame just below the current user %esp */
		fp = (struct sigframe *)regs->tf_esp - 1;
	}

	/*
	 * grow() will return FALSE if the fp will not fit inside the stack
	 *	and the stack can not be grown. useracc will return FALSE
	 *	if access is denied.
	 */
	if ((grow_stack (p, (int)fp) == FALSE) ||
	    (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		SIGACTION(p, SIGILL) = SIG_DFL;
		sig = sigmask(SIGILL);
		p->p_sigignore &= ~sig;
		p->p_sigcatch &= ~sig;
		p->p_sigmask &= ~sig;
		psignal(p, SIGILL);
		return;
	}

	/*
	 * Build the argument list for the signal handler.
	 */
	if (p->p_sysent->sv_sigtbl) {
		/* translate the signal number for emulated ABIs */
		if (sig < p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[sig];
		else
			sig = p->p_sysent->sv_sigsize + 1;
	}
	sf.sf_signum = sig;
	sf.sf_code = code;
	sf.sf_scp = &fp->sf_sc;
	/*
	 * NOTE(review): tf_err apparently doubles as the fault address
	 * for the handler's sf_addr argument -- confirm against trap.c.
	 */
	sf.sf_addr = (char *) regs->tf_err;
	sf.sf_handler = catcher;

	/* save scratch registers */
	sf.sf_sc.sc_eax = regs->tf_eax;
	sf.sf_sc.sc_ebx = regs->tf_ebx;
	sf.sf_sc.sc_ecx = regs->tf_ecx;
	sf.sf_sc.sc_edx = regs->tf_edx;
	sf.sf_sc.sc_esi = regs->tf_esi;
	sf.sf_sc.sc_edi = regs->tf_edi;
	sf.sf_sc.sc_cs = regs->tf_cs;
	sf.sf_sc.sc_ds = regs->tf_ds;
	sf.sf_sc.sc_ss = regs->tf_ss;
	sf.sf_sc.sc_es = regs->tf_es;
	sf.sf_sc.sc_fs = regs->tf_fs;
	sf.sf_sc.sc_isp = regs->tf_isp;

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	sf.sf_sc.sc_onstack = oonstack;
	sf.sf_sc.sc_mask = mask;
	sf.sf_sc.sc_sp = regs->tf_esp;
	sf.sf_sc.sc_fp = regs->tf_ebp;
	sf.sf_sc.sc_pc = regs->tf_eip;
	sf.sf_sc.sc_ps = regs->tf_eflags;
	sf.sf_sc.sc_trapno = regs->tf_trapno;
	sf.sf_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;

		sf.sf_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_sc.sc_es = tf->tf_vm86_es;
		sf.sf_sc.sc_ds = tf->tf_vm86_ds;

		/* without VME hardware, VIF/VIP live in the emulated flags */
		if (vm86->vm86_has_vme == 0)
			sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP))
			    | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * We should never have PSL_T set when returning from vm86
		 * mode.  It may be set here if we deliver a signal before
		 * getting to vm86 mode, so turn it off.
		 *
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(p, SIGILL);
	}

	/*
	 * Resume at the signal trampoline, which sits sv_szsigcode bytes
	 * below PS_STRINGS, with %esp pointing at the new frame.
	 */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.
Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
int
sigreturn(p, uap)
	struct proc *p;
	struct sigreturn_args /* {
		struct sigcontext *sigcntxp;
	} */ *uap;
{
	register struct sigcontext *scp;	/* user-supplied context */
	register struct sigframe *fp;		/* enclosing signal frame */
	register struct trapframe *regs = p->p_md.md_regs;
	int eflags;

	/*
	 * (XXX old comment) regs->tf_esp points to the return address.
	 * The user scp pointer is above that.
	 * The return address is faked in the signal trampoline code
	 * for consistency.
	 */
	scp = uap->sigcntxp;
	/* back up from the sigcontext to the start of the sigframe */
	fp = (struct sigframe *)
	     ((caddr_t)scp - offsetof(struct sigframe, sf_sc));

	if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0)
		return(EFAULT);

	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		/* returning to vm86 mode: restore the vm86 trapframe */
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		/* only the user-changeable eflags bits may be taken over */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
#ifdef DEBUG
			printf("sigreturn: eflags = 0x%x\n", eflags);
#endif
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
		if (!CS_SECURE(scp->sc_cs)) {
#ifdef DEBUG
			printf("sigreturn: cs = 0x%x\n", scp->sc_cs);
#endif
			trapsignal(p, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* restore scratch registers */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;

	if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0)
		return(EINVAL);

	/* restore signal-stack flag and the blocked-signal mask */
	if (scp->sc_onstack & 01)
		p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK;
	/* never let the user block the unmaskable signals */
	p->p_sigmask = scp->sc_mask & ~sigcantmask;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;
	/* the trapframe itself carries the return state */
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible
 */
void
cpu_halt(void)
{
	/* spin in hlt; each interrupt wakes us only to halt again */
	for (;;)
		__asm__ ("hlt");
}

/*
 * Clear registers on exec
 */
void
setregs(p, entry, stack, ps_strings)
	struct proc *p;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe
*regs = p->p_md.md_regs; struct pcb *pcb = &p->p_addr->u_pcb; #ifdef USER_LDT /* was i386_user_cleanup() in NetBSD */ if (pcb->pcb_ldt) { if (pcb == curpcb) { lldt(_default_ldt); currentldt = _default_ldt; } kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt, pcb->pcb_ldt_len * sizeof(union descriptor)); pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0; } #endif bzero((char *)regs, sizeof(struct trapframe)); regs->tf_eip = entry; regs->tf_esp = stack; regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T); regs->tf_ss = _udatasel; regs->tf_ds = _udatasel; regs->tf_es = _udatasel; regs->tf_fs = _udatasel; regs->tf_cs = _ucodesel; /* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */ regs->tf_ebx = ps_strings; /* reset %gs as well */ pcb->pcb_gs = _udatasel; if (pcb == curpcb) { load_gs(_udatasel); } /* * Initialize the math emulator (if any) for the current process. * Actually, just clear the bit that says that the emulator has * been initialized. Initialization is delayed until the process * traps to the emulator (if it is done at all) mainly because * emulators don't provide an entry point for initialization. */ p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP; /* * Arrange to trap the next npx or `fwait' instruction (see npx.c * for why fwait must be trapped at least if there is an npx or an * emulator). This is mainly to handle the case where npx0 is not * configured, since the npx routines normally set up the trap * otherwise. It should be done only at boot time, but doing it * here allows modifying `npx_exists' for testing the emulator on * systems with an npx. */ load_cr0(rcr0() | CR0_MP | CR0_TS); #if NNPX > 0 /* Initialize the npx (if any) for the current process. */ npxinit(__INITIAL_NPXCW__); #endif /* * XXX - Linux emulator * Make sure sure edx is 0x0 on entry. Linux binaries depend * on it. 
*/ p->p_retval[1] = 0; } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock, CTLFLAG_RW, &wall_cmos_clock, 0, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int _default_ldt; #ifdef SMP union descriptor gdt[NGDT * NCPU]; /* global descriptor table */ #else union descriptor gdt[NGDT]; /* global descriptor table */ #endif static struct gate_descriptor idt0[NIDT]; struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ #ifdef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif #ifndef SMP extern struct segment_descriptor common_tssd, *tss_gdt; #endif int private_tss; /* flag indicating private tss */ #if defined(I586_CPU) && !defined(NO_F00F_HACK) extern int has_f00f_bug; #endif static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, 
/* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment 
descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 8 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 9 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 10 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 11 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* 
segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip; ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), 
IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall); void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } #define PHYSMAP_SIZE (2 * 8) /* * Populate the (physmap) array with base/length pairs describing the * available physical memory in the system, then test this memory and * build the phys_avail array describing the actually-available memory. * * Total memory size may be constrained by the kernel environment variable * hw.physmem or the compile-time define MAXMEM. * * If we cannot accurately determine the physical memory map, and the * value from the RTC seems dubious, trust the value of hw.physmem/MAXMEM * instead, but require a speculative probe of memory. */ static void getmemsize(int first) { int i, physmap_idx, pa_indx; u_int basemem, extmem; int speculative_mprobe = FALSE; struct vm86frame vmf; struct vm86context vmc; vm_offset_t pa, physmap[PHYSMAP_SIZE]; pt_entry_t pte; u_int64_t AllowMem, MaxMem, sanity; const char *cp, *ep; struct { u_int64_t base; u_int64_t length; u_int32_t type; } *smap; bzero(&vmf, sizeof(struct vm86frame)); bzero(physmap, sizeof(physmap)); /* * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes * for the appropriate modifiers. * After this calculation, AllowMem is either 0 (no memory size cap) * or the maximum memory size desired in bytes. 
*/ AllowMem = 0; if ((cp = getenv("hw.physmem")) != NULL) { sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Warning: invalid memory limit '%s' specified\n", cp); } #ifdef MAXMEM if (AllowMem == 0) AllowMem = MAXMEM * (u_int64_t)1024; #endif if ((AllowMem != 0) && (boothowto & RB_VERBOSE)) printf("Physical memory use limited to %uk\n", (u_int)(AllowMem / 1024)); MaxMem = AllowMem; if (AllowMem == 0) AllowMem = (u_int64_t)1 << 32; /* 4GB limit imposed by 32-bit pmap */ /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. 
*/ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ #define SMAPSIZ sizeof(*smap) #define SMAP_SIG 0x534D4150 /* 'SMAP' */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = SMAPSIZ; i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= AllowMem) { printf("%uk of memory above %uk ignored\n", (u_int)(smap->length / 1024), (u_int)(AllowMem / 1024)); goto next_run; } if ((smap->base + smap->length) >= AllowMem) { printf("%uk region truncated to %uk to fit %uk limit\n", (u_int)(smap->length / 1024), (u_int)((AllowMem - smap->base) / 1024), (u_int)(AllowMem / 1024)); smap->length = AllowMem - smap->base; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] 
+= smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); /* * If we failed above, try memory map with INT 15:E801 */ if (physmap[1] == 0) { vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory, or * hw.physmem/MAXMEM overrides. */ if (MaxMem > (1024 * 1024)) { /* < 1MB is insane */ extmem = (MaxMem / 1024) - 1024; } else { extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); } /* * If the value from the RTC is >= 16M, there is a good * chance that it's lying. Compaq systems never report * more than 16M, and no system can honestly report more * than 64M. We should end up here only on extremely * old and broken systems. In any case, qualify the value * that we've got here by actually checking for physical * memory later on. */ if (extmem >= 16 * 1024) speculative_mprobe = TRUE; #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We fiddle it again * later based on the results of the memory test. */ Maxmem = physmap[physmap_idx + 1] / PAGE_SIZE; /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; if (boothowto & RB_VERBOSE) printf("Testing memory %uk to %uk\n", (u_int)(physmap[i] / 1024), (u_int)((physmap[i] + physmap[i+1]) / 1024)); end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) end += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { int x; struct gate_descriptor *gdp; int gsel_tss; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int off; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct privatespace)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0]; #else gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GPROC0_SEL].ssd_base = (int) &common_tss; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT currentldt = _default_ldt; #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, 
&IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; common_tssd = *tss_gdt; common_tss.tss_ioopt = (sizeof common_tss) << 16; ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. 
*/ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; #ifdef SMP proc0.p_addr->u_pcb.pcb_mpnest = 1; #endif proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); 
} int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if ((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; 
struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? 
*/ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ diff --git a/sys/i386/i386/machdep.c b/sys/i386/i386/machdep.c index 9c9cacd08b35..12aa8c708c95 100644 --- a/sys/i386/i386/machdep.c +++ b/sys/i386/i386/machdep.c @@ -1,2051 +1,2053 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.348 1999/07/02 04:33:05 peter Exp $ + * $Id: machdep.c,v 1.349 1999/07/02 20:33:32 msmith Exp $ */ #include "apm.h" #include "ether.h" #include "npx.h" #include "opt_atalk.h" #include "opt_cpu.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_ipx.h" #include "opt_maxmem.h" #include "opt_msgbuf.h" #include "opt_perfmon.h" #include "opt_smp.h" #include "opt_sysvipc.h" #include "opt_user_ldt.h" #include "opt_userconfig.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* pcb.h included via sys/user.h */ #ifdef SMP #include #include #endif #ifdef PERFMON #include #endif #ifdef OLD_BUS_ARCH #include #endif #include #include #include #include #include extern void init386 __P((int first)); extern void dblfault_handler __P((void)); extern void printcpuinfo(void); /* XXX header file */ extern void earlysetcpuclass(void); /* same header file */ extern void finishidentcpu(void); extern void panicifcpuunsupported(void); extern void initializecpu(void); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static MALLOC_DEFINE(M_MBUF, "mbuf", "mbuf"); int _udatasel, _ucodesel; u_int atdevbase; #if defined(SWTCH_OPTIM_STATS) extern int swtch_optim_stats; SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats, CTLFLAG_RD, &swtch_optim_stats, 0, ""); SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count, CTLFLAG_RD, &tlb_flush_count, 0, ""); #endif #ifdef PC98 static int ispc98 = 1; #else static int ispc98 = 0; #endif SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 
0, ""); int physmem = 0; int cold = 1; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); static int sysctl_hw_availpages SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, i386_btop(avail_end - avail_start), req); return (error); } SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_availpages, "I", ""); static int sysctl_machdep_msgbuf SYSCTL_HANDLER_ARGS { int error; /* Unwind the buffer, so that it's linear (possibly starting with * some initial nulls). */ error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr+msgbufp->msg_bufr, msgbufp->msg_size-msgbufp->msg_bufr,req); if(error) return(error); if(msgbufp->msg_bufr>0) { error=sysctl_handle_opaque(oidp,msgbufp->msg_ptr, msgbufp->msg_bufr,req); } return(error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf, CTLTYPE_STRING|CTLFLAG_RD, 0, 0, sysctl_machdep_msgbuf, "A","Contents of kernel message buffer"); static int msgbuf_clear; static int sysctl_machdep_msgbuf_clear SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) { /* Clear the buffer and reset write pointer */ bzero(msgbufp->msg_ptr,msgbufp->msg_size); msgbufp->msg_bufr=msgbufp->msg_bufx=0; msgbuf_clear=0; } return (error); } SYSCTL_PROC(_machdep, OID_AUTO, msgbuf_clear, CTLTYPE_INT|CTLFLAG_RW, &msgbuf_clear, 0, sysctl_machdep_msgbuf_clear, "I", "Clear kernel message buffer"); int bootverbose = 0, Maxmem = 0; long dumplo; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END 
((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Good {morning,afternoon,evening,night}. */ printf(version); earlysetcpuclass(); startrtclock(); printcpuinfo(); panicifcpuunsupported(); #ifdef PERFMON perfmon_init(); #endif printf("real memory = %u (%uK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (bootverbose) { int indx; printf("Physical memory chunk(s):\n"); for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) { int size1 = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08x - 0x%08x, %u bytes (%u pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size1, size1 / PAGE_SIZE); } } /* * Calculate callout wheel size */ for (callwheelsize = 1, callwheelbits = 0; callwheelsize < ncallout; callwheelsize <<= 1, ++callwheelbits) ; callwheelmask = callwheelsize - 1; /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. 
*/ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); valloc(callwheel, struct callout_tailq, callwheelsize); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! */ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 8, 2048); + if( physmem > 65536) + nbuf += (physmem - 65536) / 20; } - nswbuf = max(min(nbuf/4, 64), 16); + nswbuf = max(min(nbuf/4, 256), 16); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*BKVASIZE) + (nswbuf*MAXPHYS) + pager_map_size); buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE)); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size); pager_map->system_map = 1; exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*(ARG_MAX+(PAGE_SIZE*3)))); /* * Finally, allocate mbuf pool. 
Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ { vm_offset_t mb_map_size; int xclusters; /* Allow override of NMBCLUSTERS from the kernel environment */ if (getenv_int("kern.ipc.nmbclusters", &xclusters) && xclusters > nmbclusters) nmbclusters = xclusters; mb_map_size = nmbufs * MSIZE + nmbclusters * MCLBYTES; mb_map_size = roundup2(mb_map_size, max(MCLBYTES, PAGE_SIZE)); mclrefcnt = malloc(mb_map_size / MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, mb_map_size / MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, mb_map_size); mb_map->system_map = 1; } /* * Initialize callouts */ SLIST_INIT(&callfree); for (i = 0; i < ncallout; i++) { callout_init(&callout[i]); callout[i].c_flags = CALLOUT_LOCAL_ALLOC; SLIST_INSERT_HEAD(&callfree, &callout[i], c_links.sle); } for (i = 0; i < callwheelsize; i++) { TAILQ_INIT(&callwheel[i]); } #if defined(USERCONFIG) userconfig(); cninit(); /* the preferred console may have changed */ #endif printf("avail memory = %u (%uK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); #ifdef SMP /* * OK, enough kmem_alloc/malloc state should be up, lets get on with it! */ mp_start(); /* fire up the APs and APICs */ mp_announce(); #endif /* SMP */ } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } void netisr_sysinit(data) void *data; { const struct netisrtab *nit; nit = (const struct netisrtab *)data; register_netisr(nit->nit_num, nit->nit_isr); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * at top to call routine, followed by kcall * to sigreturn routine below. 
After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(catcher, sig, mask, code)
	sig_t catcher;
	int sig, mask;
	u_long code;
{
	register struct proc *p = curproc;
	register struct trapframe *regs;
	register struct sigframe *fp;
	struct sigframe sf;
	struct sigacts *psp = p->p_sigacts;
	int oonstack;

	regs = p->p_md.md_regs;
	oonstack = psp->ps_sigstk.ss_flags & SS_ONSTACK;
	/*
	 * Allocate and validate space for the signal handler context:
	 * use the alternate signal stack if configured, requested for
	 * this signal, and not already in use; otherwise push the frame
	 * just below the current user %esp.
	 */
	if ((psp->ps_flags & SAS_ALTSTACK) && !oonstack &&
	    (psp->ps_sigonstack & sigmask(sig))) {
		fp = (struct sigframe *)(psp->ps_sigstk.ss_sp +
		    psp->ps_sigstk.ss_size - sizeof(struct sigframe));
		psp->ps_sigstk.ss_flags |= SS_ONSTACK;
	} else {
		fp = (struct sigframe *)regs->tf_esp - 1;
	}

	/*
	 * grow() will return FALSE if the fp will not fit inside the stack
	 *	and the stack can not be grown. useracc will return FALSE
	 *	if access is denied.
	 */
	if ((grow_stack (p, (int)fp) == FALSE) ||
	    (useracc((caddr_t)fp, sizeof(struct sigframe), B_WRITE) == FALSE)) {
		/*
		 * Process has trashed its stack; give it an illegal
		 * instruction to halt it in its tracks.
		 */
		SIGACTION(p, SIGILL) = SIG_DFL;
		/* note: `sig' is reused as a signal MASK from here on */
		sig = sigmask(SIGILL);
		p->p_sigignore &= ~sig;
		p->p_sigcatch &= ~sig;
		p->p_sigmask &= ~sig;
		psignal(p, SIGILL);
		return;
	}

	/*
	 * Build the argument list for the signal handler,
	 * translating the signal number through the emulation's
	 * signal table if one is present.
	 */
	if (p->p_sysent->sv_sigtbl) {
		if (sig < p->p_sysent->sv_sigsize)
			sig = p->p_sysent->sv_sigtbl[sig];
		else
			sig = p->p_sysent->sv_sigsize + 1;
	}
	sf.sf_signum = sig;
	sf.sf_code = code;
	sf.sf_scp = &fp->sf_sc;
	sf.sf_addr = (char *) regs->tf_err;
	sf.sf_handler = catcher;

	/* save scratch registers */
	sf.sf_sc.sc_eax = regs->tf_eax;
	sf.sf_sc.sc_ebx = regs->tf_ebx;
	sf.sf_sc.sc_ecx = regs->tf_ecx;
	sf.sf_sc.sc_edx = regs->tf_edx;
	sf.sf_sc.sc_esi = regs->tf_esi;
	sf.sf_sc.sc_edi = regs->tf_edi;
	sf.sf_sc.sc_cs = regs->tf_cs;
	sf.sf_sc.sc_ds = regs->tf_ds;
	sf.sf_sc.sc_ss = regs->tf_ss;
	sf.sf_sc.sc_es = regs->tf_es;
	sf.sf_sc.sc_fs = regs->tf_fs;
	sf.sf_sc.sc_isp = regs->tf_isp;

	/*
	 * Build the signal context to be used by sigreturn.
	 */
	sf.sf_sc.sc_onstack = oonstack;
	sf.sf_sc.sc_mask = mask;
	sf.sf_sc.sc_sp = regs->tf_esp;
	sf.sf_sc.sc_fp = regs->tf_ebp;
	sf.sf_sc.sc_pc = regs->tf_eip;
	sf.sf_sc.sc_ps = regs->tf_eflags;
	sf.sf_sc.sc_trapno = regs->tf_trapno;
	sf.sf_sc.sc_err = regs->tf_err;

	/*
	 * If we're a vm86 process, we want to save the segment registers.
	 * We also change eflags to be our emulated eflags, not the actual
	 * eflags.
	 */
	if (regs->tf_eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;

		sf.sf_sc.sc_gs = tf->tf_vm86_gs;
		sf.sf_sc.sc_fs = tf->tf_vm86_fs;
		sf.sf_sc.sc_es = tf->tf_vm86_es;
		sf.sf_sc.sc_ds = tf->tf_vm86_ds;

		/* without VME hardware support, VIF/VIP live in software */
		if (vm86->vm86_has_vme == 0)
			sf.sf_sc.sc_ps = (tf->tf_eflags & ~(PSL_VIF | PSL_VIP))
			    | (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));

		/*
		 * We should never have PSL_T set when returning from vm86
		 * mode.  It may be set here if we deliver a signal before
		 * getting to vm86 mode, so turn it off.
		 *
		 * Clear PSL_NT to inhibit T_TSSFLT faults on return from
		 * syscalls made by the signal handler.  This just avoids
		 * wasting time for our lazy fixup of such faults.  PSL_NT
		 * does nothing in vm86 mode, but vm86 programs can set it
		 * almost legitimately in probes for old cpu types.
		 */
		tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_T | PSL_VIF | PSL_VIP);
	}

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) {
		/*
		 * Something is wrong with the stack pointer.
		 * ...Kill the process.
		 */
		sigexit(p, SIGILL);
	}

	/* redirect the return to user mode at the signal trampoline */
	regs->tf_esp = (int)fp;
	regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_ss = _udatasel;
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 */
int
sigreturn(p, uap)
	struct proc *p;
	struct sigreturn_args /* {
		struct sigcontext *sigcntxp;
	} */ *uap;
{
	register struct sigcontext *scp;
	register struct sigframe *fp;
	register struct trapframe *regs = p->p_md.md_regs;
	int eflags;

	/*
	 * (XXX old comment) regs->tf_esp points to the return address.
	 * The user scp pointer is above that.
	 * The return address is faked in the signal trampoline code
	 * for consistency.
	 */
	scp = uap->sigcntxp;
	/* recover the enclosing sigframe from the user's sigcontext pointer */
	fp = (struct sigframe *)
	    ((caddr_t)scp - offsetof(struct sigframe, sf_sc));

	if (useracc((caddr_t)fp, sizeof (*fp), B_WRITE) == 0)
		return(EFAULT);

	eflags = scp->sc_ps;
	if (eflags & PSL_VM) {
		struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
		struct vm86_kernel *vm86;

		/*
		 * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
		 * set up the vm86 area, and we can't enter vm86 mode.
		 */
		if (p->p_addr->u_pcb.pcb_ext == 0)
			return (EINVAL);
		vm86 = &p->p_addr->u_pcb.pcb_ext->ext_vm86;
		if (vm86->vm86_inited == 0)
			return (EINVAL);

		/* go back to user mode if both flags are set */
		if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
			trapsignal(p, SIGBUS, 0);

		/*
		 * Merge only the user-changeable eflags bits; with VME,
		 * VIF/VIP are handled by hardware, otherwise they are
		 * kept in software state.
		 */
		if (vm86->vm86_has_vme) {
			eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
			    (eflags & VME_USERCHANGE) | PSL_VM;
		} else {
			vm86->vm86_eflags = eflags;	/* save VIF, VIP */
			eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
			    (eflags & VM_USERCHANGE) | PSL_VM;
		}
		tf->tf_vm86_ds = scp->sc_ds;
		tf->tf_vm86_es = scp->sc_es;
		tf->tf_vm86_fs = scp->sc_fs;
		tf->tf_vm86_gs = scp->sc_gs;
		tf->tf_ds = _udatasel;
		tf->tf_es = _udatasel;
		tf->tf_fs = _udatasel;
	} else {
		/*
		 * Don't allow users to change privileged or reserved flags.
		 */
#define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
		/*
		 * XXX do allow users to change the privileged flag PSL_RF.
		 * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
		 * should sometimes set it there too.  tf_eflags is kept in
		 * the signal context during signal handling and there is no
		 * other place to remember it, so the PSL_RF bit may be
		 * corrupted by the signal handler without us knowing.
		 * Corruption of the PSL_RF bit at worst causes one more or
		 * one less debugger trap, so allowing it is fairly harmless.
		 */
		if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
#ifdef DEBUG
			printf("sigreturn: eflags = 0x%x\n", eflags);
#endif
			return(EINVAL);
		}

		/*
		 * Don't allow users to load a valid privileged %cs.  Let the
		 * hardware check for invalid selectors, excess privilege in
		 * other selectors, invalid %eip's and invalid %esp's.
		 */
#define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
		if (!CS_SECURE(scp->sc_cs)) {
#ifdef DEBUG
			printf("sigreturn: cs = 0x%x\n", scp->sc_cs);
#endif
			trapsignal(p, SIGBUS, T_PROTFLT);
			return(EINVAL);
		}
		regs->tf_ds = scp->sc_ds;
		regs->tf_es = scp->sc_es;
		regs->tf_fs = scp->sc_fs;
	}

	/* restore scratch registers */
	regs->tf_eax = scp->sc_eax;
	regs->tf_ebx = scp->sc_ebx;
	regs->tf_ecx = scp->sc_ecx;
	regs->tf_edx = scp->sc_edx;
	regs->tf_esi = scp->sc_esi;
	regs->tf_edi = scp->sc_edi;
	regs->tf_cs = scp->sc_cs;
	regs->tf_ss = scp->sc_ss;
	regs->tf_isp = scp->sc_isp;

	/*
	 * NOTE(review): this access check happens after the trapframe has
	 * already been partially updated above — looks redundant with the
	 * earlier useracc() on fp; confirm against repository history.
	 */
	if (useracc((caddr_t)scp, sizeof (*scp), B_WRITE) == 0)
		return(EINVAL);

	if (scp->sc_onstack & 01)
		p->p_sigacts->ps_sigstk.ss_flags |= SS_ONSTACK;
	else
		p->p_sigacts->ps_sigstk.ss_flags &= ~SS_ONSTACK;
	p->p_sigmask = scp->sc_mask & ~sigcantmask;
	regs->tf_ebp = scp->sc_fp;
	regs->tf_esp = scp->sc_sp;
	regs->tf_eip = scp->sc_pc;
	regs->tf_eflags = eflags;
	return(EJUSTRETURN);
}

/*
 * Machine dependent boot() routine
 *
 * I haven't seen anything to put here yet
 * Possibly some stuff might be grafted back here from boot()
 */
void
cpu_boot(int howto)
{
}

/*
 * Shutdown the CPU as much as possible: spin forever executing HLT.
 */
void
cpu_halt(void)
{
	for (;;)
		__asm__ ("hlt");
}

/*
 * Clear registers on exec
 */
void
setregs(p, entry, stack, ps_strings)
	struct proc *p;
	u_long entry;
	u_long stack;
	u_long ps_strings;
{
	struct trapframe *regs = p->p_md.md_regs;
	struct pcb *pcb = &p->p_addr->u_pcb;

#ifdef USER_LDT
	/* was i386_user_cleanup() in NetBSD */
	if (pcb->pcb_ldt) {
		/* if this process's LDT is live, switch back to the default */
		if (pcb == curpcb) {
			lldt(_default_ldt);
			currentldt = _default_ldt;
		}
		kmem_free(kernel_map, (vm_offset_t)pcb->pcb_ldt,
			pcb->pcb_ldt_len * sizeof(union descriptor));
		pcb->pcb_ldt_len = (int)pcb->pcb_ldt = 0;
	}
#endif

	/* start from a clean trapframe; preserve only the trace flag */
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_eip = entry;
	regs->tf_esp = stack;
	regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
	regs->tf_ss = _udatasel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _udatasel;
	regs->tf_cs = _ucodesel;

	/* PS_STRINGS
value for BSD/OS binaries.  It is 0 for non-BSD/OS. */
	regs->tf_ebx = ps_strings;

	/* reset %gs as well */
	pcb->pcb_gs = _udatasel;
	if (pcb == curpcb) {
		load_gs(_udatasel);
	}

        /*
         * Initialize the math emulator (if any) for the current process.
         * Actually, just clear the bit that says that the emulator has
         * been initialized.  Initialization is delayed until the process
         * traps to the emulator (if it is done at all) mainly because
         * emulators don't provide an entry point for initialization.
         */
	p->p_addr->u_pcb.pcb_flags &= ~FP_SOFTFP;

	/*
	 * Arrange to trap the next npx or `fwait' instruction (see npx.c
	 * for why fwait must be trapped at least if there is an npx or an
	 * emulator).  This is mainly to handle the case where npx0 is not
	 * configured, since the npx routines normally set up the trap
	 * otherwise.  It should be done only at boot time, but doing it
	 * here allows modifying `npx_exists' for testing the emulator on
	 * systems with an npx.
	 */
	load_cr0(rcr0() | CR0_MP | CR0_TS);

#if NNPX > 0
	/* Initialize the npx (if any) for the current process. */
	npxinit(__INITIAL_NPXCW__);
#endif

	/*
	 * XXX - Linux emulator
	 * Make sure sure edx is 0x0 on entry. Linux binaries depend
	 * on it.
	 */
	p->p_retval[1] = 0;
}

/*
 * Sysctl handler for machdep.adjkerntz: store the new value and, if it
 * actually changed via a write, push the adjusted time out to the RTC.
 */
static int
sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS
{
	int error;
	error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
		req);
	if (!error && req->newptr)
		resettodr();
	return (error);
}

SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");

SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");

/*
 * Initialize 386 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */

int _default_ldt;
#ifdef SMP
union descriptor gdt[NGDT * NCPU];	/* global descriptor table */
#else
union descriptor gdt[NGDT];		/* global descriptor table */
#endif
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
union descriptor ldt[NLDT];		/* local descriptor table */
#ifdef SMP
/* table descriptors - used to load tables by microp */
struct region_descriptor r_gdt, r_idt;
#endif

#ifndef SMP
extern struct segment_descriptor common_tssd, *tss_gdt;
#endif
int private_tss;			/* flag indicating private tss */

#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif

/* TSS and stack used to field a double fault on a known-good stack */
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];

extern  struct user *proc0paddr;

/* software prototypes -- in more palatable form */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	0x0,			/* segment base address  */
	0x0,			/* length */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* GCODE_SEL	1 Code Descriptor for kernel */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	0,
/* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPRIV_SEL 3 SMP Per-Processor Private Data Descriptor */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 4 Proc 0 Tss Descriptor */ { 0x0, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 5 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 6 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 7 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment 
descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 8 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 9 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 10 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 11 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* 
segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* Code Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMERA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
/* Null Descriptor - overwritten by call gate */
{	0x0,			/* segment base address  */
	0x0,			/* length - all address space */
	0,			/* segment type */
	0,			/* segment descriptor priority level */
	0,			/* segment descriptor present */
	0, 0,
	0,			/* default 32 vs 16 bit size */
	0  			/* limit granularity (byte/page units)*/ },
/* Data Descriptor for user */
{	0x0,			/* segment base address  */
	0xfffff,		/* length - all address space */
	SDT_MEMRWA,		/* segment type */
	SEL_UPL,		/* segment descriptor priority level */
	1,			/* segment descriptor present */
	0, 0,
	1,			/* default 32 vs 16 bit size */
	1  			/* limit granularity (byte/page units)*/ },
};

/*
 * Install an interrupt gate in IDT slot `idx': handler `func', gate
 * type `typ', privilege level `dpl', code selector `selec'.
 */
void
setidt(idx, func, typ, dpl, selec)
	int idx;
	inthand_t *func;
	int typ;
	int dpl;
	int selec;
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (int)func;
	ip->gd_selector = selec;
	ip->gd_stkcpy = 0;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((int)func)>>16 ;
}

#define	IDTVEC(name)	__CONCAT(X,name)

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu),
	IDTVEC(align), IDTVEC(syscall), IDTVEC(int0x80_syscall);

/*
 * Convert a hardware segment descriptor `sd' into its software
 * (soft_segment_descriptor) form in `ssd', reassembling the split
 * base and limit fields.
 */
void
sdtossd(sd, ssd)
	struct segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{
	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

#define PHYSMAP_SIZE	(2 * 8)

/*
 * Populate the (physmap) array with base/length pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be constrained by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * If we cannot accurately determine the physical memory map, and the
 * value from the RTC seems dubious, trust the value of hw.physmem/MAXMEM
 * instead, but require a speculative probe of memory.
 */
static void
getmemsize(int first)
{
	int i, physmap_idx, pa_indx;
	u_int basemem, extmem;
	int speculative_mprobe = FALSE;
	struct vm86frame vmf;
	struct vm86context vmc;
	vm_offset_t pa, physmap[PHYSMAP_SIZE];
	pt_entry_t pte;
	u_int64_t AllowMem, MaxMem, sanity;
	const char *cp, *ep;
	/* INT 15:E820 "SMAP" buffer layout */
	struct {
		u_int64_t base;
		u_int64_t length;
		u_int32_t type;
	} *smap;

	bzero(&vmf, sizeof(struct vm86frame));
	bzero(physmap, sizeof(physmap));

	/*
	 * hw.maxmem is a size in bytes; we also allow k, m, and g suffixes
	 * for the appropriate modifiers.
	 * After this calculation, AllowMem is either 0 (no memory size cap)
	 * or the maximum memory size desired in bytes.
*/ AllowMem = 0; if ((cp = getenv("hw.physmem")) != NULL) { sanity = AllowMem = strtouq(cp, &ep, 0); if ((ep != cp) && (*ep != 0)) { switch(*ep) { case 'g': case 'G': AllowMem <<= 10; case 'm': case 'M': AllowMem <<= 10; case 'k': case 'K': AllowMem <<= 10; break; default: AllowMem = sanity = 0; } if (AllowMem < sanity) AllowMem = 0; } if (AllowMem == 0) printf("Warning: invalid memory limit '%s' specified\n", cp); } #ifdef MAXMEM if (AllowMem == 0) AllowMem = MAXMEM * (u_int64_t)1024; #endif if ((AllowMem != 0) && (boothowto & RB_VERBOSE)) printf("Physical memory use limited to %uk\n", (u_int)(AllowMem / 1024)); MaxMem = AllowMem; if (AllowMem == 0) AllowMem = (u_int64_t)1 << 32; /* 4GB limit imposed by 32-bit pmap */ /* * Perform "base memory" related probes & setup */ vm86_intcall(0x12, &vmf); basemem = vmf.vmf_ax; if (basemem > 640) { printf("Preposterous BIOS basemem of %uK, truncating to 640K\n", basemem); basemem = 640; } /* * XXX if biosbasemem is now < 640, there is a `hole' * between the end of base memory and the start of * ISA memory. The hole may be empty or it may * contain BIOS code or data. Map it read/write so * that the BIOS can write to it. (Memory from 0 to * the physical end of the kernel is mapped read-only * to begin with and then parts of it are remapped. * The parts that aren't remapped form holes that * remain read-only and are unused by the kernel. * The base memory area is below the physical end of * the kernel and right now forms a read-only hole. * The part of it from PAGE_SIZE to * (trunc_page(biosbasemem * 1024) - 1) will be * remapped and used by the kernel later.) * * This code is similar to the code used in * pmap_mapdev, but since no memory needs to be * allocated we simply change the mapping. 
*/ for (pa = trunc_page(basemem * 1024); pa < ISA_HOLE_START; pa += PAGE_SIZE) { pte = (pt_entry_t)vtopte(pa + KERNBASE); *pte = pa | PG_RW | PG_V; } /* * if basemem != 640, map pages r/w into vm86 page table so * that the bios can scribble on it. */ pte = (pt_entry_t)vm86paddr; for (i = basemem / 4; i < 160; i++) pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U; /* * map page 1 R/W into the kernel page table so we can use it * as a buffer. The kernel will unmap this page later. */ pte = (pt_entry_t)vtopte(KERNBASE + (1 << PAGE_SHIFT)); *pte = (1 << PAGE_SHIFT) | PG_RW | PG_V; /* * get memory map with INT 15:E820 */ #define SMAPSIZ sizeof(*smap) #define SMAP_SIG 0x534D4150 /* 'SMAP' */ vmc.npages = 0; smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT)); vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di); physmap_idx = 0; vmf.vmf_ebx = 0; do { vmf.vmf_eax = 0xE820; vmf.vmf_edx = SMAP_SIG; vmf.vmf_ecx = SMAPSIZ; i = vm86_datacall(0x15, &vmf, &vmc); if (i || vmf.vmf_eax != SMAP_SIG) break; if (boothowto & RB_VERBOSE) printf("SMAP type=%02x base=%08x %08x len=%08x %08x\n", smap->type, *(u_int32_t *)((char *)&smap->base + 4), (u_int32_t)smap->base, *(u_int32_t *)((char *)&smap->length + 4), (u_int32_t)smap->length); if (smap->type != 0x01) goto next_run; if (smap->length == 0) goto next_run; if (smap->base >= AllowMem) { printf("%uk of memory above %uk ignored\n", (u_int)(smap->length / 1024), (u_int)(AllowMem / 1024)); goto next_run; } if ((smap->base + smap->length) >= AllowMem) { printf("%uk region truncated to %uk to fit %uk limit\n", (u_int)(smap->length / 1024), (u_int)((AllowMem - smap->base) / 1024), (u_int)(AllowMem / 1024)); smap->length = AllowMem - smap->base; } for (i = 0; i <= physmap_idx; i += 2) { if (smap->base < physmap[i + 1]) { if (boothowto & RB_VERBOSE) printf( "Overlapping or non-montonic memory region, ignoring second region\n"); goto next_run; } } if (smap->base == physmap[physmap_idx + 1]) { physmap[physmap_idx + 1] 
+= smap->length; goto next_run; } physmap_idx += 2; if (physmap_idx == PHYSMAP_SIZE) { printf( "Too many segments in the physical address map, giving up\n"); break; } physmap[physmap_idx] = smap->base; physmap[physmap_idx + 1] = smap->base + smap->length; next_run: } while (vmf.vmf_ebx != 0); /* * If we failed above, try memory map with INT 15:E801 */ if (physmap[1] == 0) { vmf.vmf_ax = 0xE801; if (vm86_intcall(0x15, &vmf) == 0) { extmem = vmf.vmf_cx + vmf.vmf_dx * 64; } else { #if 0 vmf.vmf_ah = 0x88; vm86_intcall(0x15, &vmf); extmem = vmf.vmf_ax; #else /* * Prefer the RTC value for extended memory, or * hw.physmem/MAXMEM overrides. */ if (MaxMem > (1024 * 1024)) { /* < 1MB is insane */ extmem = (MaxMem / 1024) - 1024; } else { extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8); } /* * If the value from the RTC is >= 16M, there is a good * chance that it's lying. Compaq systems never report * more than 16M, and no system can honestly report more * than 64M. We should end up here only on extremely * old and broken systems. In any case, qualify the value * that we've got here by actually checking for physical * memory later on. */ if (extmem >= 16 * 1024) speculative_mprobe = TRUE; #endif } /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. * * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((extmem > 15 * 1024) && (extmem < 16 * 1024)) extmem = 15 * 1024; physmap[0] = 0; physmap[1] = basemem * 1024; physmap_idx = 2; physmap[physmap_idx] = 0x100000; physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024; } /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. It should be * called something like "Maxphyspage". 
We fiddle it again * later based on the results of the memory test. */ Maxmem = physmap[physmap_idx + 1] / PAGE_SIZE; /* * Now, physmap contains a map of physical memory. */ #ifdef SMP /* make hole for AP bootstrap code */ physmap[1] = mp_bootaddress(physmap[1] / 1024); /* look for the MP hardware - needed for apic addresses */ mp_probe(); #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap(first, 0); /* * Size up each available chunk of physical memory. */ physmap[0] = PAGE_SIZE; /* mask off page 0 */ pa_indx = 0; phys_avail[pa_indx++] = physmap[0]; phys_avail[pa_indx] = physmap[0]; #if 0 pte = (pt_entry_t)vtopte(KERNBASE); #else pte = (pt_entry_t)CMAP1; #endif /* * physmap is in bytes, so when converting to page boundaries, * round up the start address and round down the end address. */ for (i = 0; i <= physmap_idx; i += 2) { vm_offset_t end; if (boothowto & RB_VERBOSE) printf("Testing memory %uk to %uk\n", (u_int)(physmap[i] / 1024), (u_int)((physmap[i] + physmap[i+1]) / 1024)); end = ptoa(Maxmem); if (physmap[i + 1] < end) end = trunc_page(physmap[i + 1]); for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) { int tmp, page_bad; #if 0 int *ptr = 0; #else int *ptr = (int *)CADDR1; #endif /* * block out kernel memory as not available. 
*/ if (pa >= 0x100000 && pa < first) continue; page_bad = FALSE; /* * map page into kernel: valid, read/write,non-cacheable */ *pte = pa | PG_V | PG_RW | PG_N; invltlb(); tmp = *(int *)ptr; /* * Test for alternating 1's and 0's */ *(volatile int *)ptr = 0xaaaaaaaa; if (*(volatile int *)ptr != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)ptr = 0x55555555; if (*(volatile int *)ptr != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)ptr = 0xffffffff; if (*(volatile int *)ptr != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)ptr = 0x0; if (*(volatile int *)ptr != 0x0) { page_bad = TRUE; } /* * Restore original value. */ *(int *)ptr = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == TRUE) { continue; } /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. * If we're also doing a speculative memory * test and we at or past the end, bump up Maxmem * so that we keep going. The first bad page * will terminate the loop. */ if (phys_avail[pa_indx] == pa) { phys_avail[pa_indx] += PAGE_SIZE; if (speculative_mprobe == TRUE && phys_avail[pa_indx] >= (64*1024*1024)) end += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = pa; /* start */ phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */ } physmem++; } } *pte = 0; invltlb(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). 
*/ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(MSGBUF_SIZE) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(MSGBUF_SIZE); avail_end = phys_avail[pa_indx]; } void init386(first) int first; { int x; struct gate_descriptor *gdp; int gsel_tss; #ifndef SMP /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; #endif int off; /* * Prevent lowering of the ipl if we call tsleep() early. */ safepri = cpl; proc0.p_addr = proc0paddr; atdevbase = ISA_HOLE_START + KERNBASE; if (bootinfo.bi_modulep) { preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE; preload_bootstrap_relocate(KERNBASE); } if (bootinfo.bi_envp) kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE; /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was * i386_btop(round_page(etext)) - 1. 
*/ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; #ifdef SMP gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(sizeof(struct privatespace)) - 1; gdt_segs[GPRIV_SEL].ssd_base = (int) &SMP_prvspace[0]; gdt_segs[GPROC0_SEL].ssd_base = (int) &SMP_prvspace[0].globaldata.gd_common_tss; SMP_prvspace[0].globaldata.gd_prvspace = &SMP_prvspace[0]; #else gdt_segs[GPRIV_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GPROC0_SEL].ssd_base = (int) &common_tss; #endif for (x = 0; x < NGDT; x++) { #ifdef BDE_DEBUGGER /* avoid overwriting db entries with APM ones */ if (x >= GAPMCODE32_SEL && x <= GAPMDATA_SEL) continue; #endif ssdtosd(&gdt_segs[x], &gdt[x].sd); } r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. 
*/ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * PAGE_SIZE) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); #ifdef USER_LDT currentldt = _default_ldt; #endif /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(18, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0x80, 
&IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); /* * Initialize the console before we print anything out. */ cninit(); #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif finishidentcpu(); /* Final stage of CPU initialization */ setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); initializecpu(); /* Initialize CPU registers */ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss.tss_esp0 = (int) proc0.p_addr + UPAGES*PAGE_SIZE - 16; common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); private_tss = 0; tss_gdt = &gdt[GPROC0_SEL].sd; common_tssd = *tss_gdt; common_tss.tss_ioopt = (sizeof common_tss) << 16; ltr(gsel_tss); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = (int)IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); vm86_initialize(); getmemsize(first); /* now running on new page tables, configured,and u/iom is accessible */ /* Map the message buffer. 
*/ for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); msgbufinit(msgbufp, MSGBUF_SIZE); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* XXX does this work? */ ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL]; ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL]; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_cr3 = (int)IdlePTD; #ifdef SMP proc0.p_addr->u_pcb.pcb_mpnest = 1; #endif proc0.p_addr->u_pcb.pcb_ext = 0; } #if defined(I586_CPU) && !defined(NO_F00F_HACK) static void f00f_hack(void *unused); SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL); static void f00f_hack(void *unused) { struct gate_descriptor *new_idt; #ifndef SMP struct region_descriptor r_idt; #endif vm_offset_t tmp; if (!has_f00f_bug) return; printf("Intel Pentium detected, installing workaround for F00F bug\n"); r_idt.rd_limit = sizeof(idt0) - 1; tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2); if (tmp == 0) panic("kmem_alloc returned 0"); if (((unsigned int)tmp & (PAGE_SIZE-1)) != 0) panic("kmem_alloc returned non-page-aligned memory"); /* Put the first seven entries in the lower page */ new_idt = (struct gate_descriptor*)(tmp + PAGE_SIZE - (7*8)); bcopy(idt, new_idt, sizeof(idt0)); r_idt.rd_base = (int)new_idt; lidt(&r_idt); idt = new_idt; if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE, VM_PROT_READ, FALSE) != KERN_SUCCESS) panic("vm_map_protect failed"); return; } #endif /* defined(I586_CPU) && !NO_F00F_HACK */ int ptrace_set_pc(p, addr) struct proc *p; unsigned long addr; { p->p_md.md_regs->tf_eip = addr; return (0); 
} int ptrace_single_step(p) struct proc *p; { p->p_md.md_regs->tf_eflags |= PSL_T; return (0); } int ptrace_read_u_check(p, addr, len) struct proc *p; vm_offset_t addr; size_t len; { vm_offset_t gap; if ((vm_offset_t) (addr + len) < addr) return EPERM; if ((vm_offset_t) (addr + len) <= sizeof(struct user)) return 0; gap = (char *) p->p_md.md_regs - (char *) p->p_addr; if ((vm_offset_t) addr < gap) return EPERM; if ((vm_offset_t) (addr + len) <= (vm_offset_t) (gap + sizeof(struct trapframe))) return 0; return EPERM; } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; long data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. */ min = (char *)p->p_md.md_regs - (char *)p->p_addr; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = p->p_md.md_regs; frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; regs->r_fs = tp->tf_fs; regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; pcb = &p->p_addr->u_pcb; regs->r_gs = pcb->pcb_gs; return (0); } int set_regs(p, regs) struct proc *p; 
struct reg *regs; { struct pcb *pcb; struct trapframe *tp; tp = p->p_md.md_regs; if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_fs = regs->r_fs; tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; pcb = &p->p_addr->u_pcb; pcb->pcb_gs = regs->r_gs; return (0); } int fill_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(&p->p_addr->u_pcb.pcb_savefpu, fpregs, sizeof *fpregs); return (0); } int set_fpregs(p, fpregs) struct proc *p; struct fpreg *fpregs; { bcopy(fpregs, &p->p_addr->u_pcb.pcb_savefpu, sizeof *fpregs); return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? 
*/ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? */ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } bp->b_pblkno = bp->b_blkno + p->p_offset; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } #ifdef DDB /* * Provide inb() and outb() as functions. They are normally only * available as macros calling inlined functions, thus cannot be * called inside DDB. * * The actual code is stolen from , and de-inlined. */ #undef inb #undef outb /* silence compiler warnings */ u_char inb(u_int); void outb(u_int, u_char); u_char inb(u_int port) { u_char data; /* * We use %%dx and not %1 here because i/o is done at %dx and not at * %edx, while gcc generates inferior code (movw instead of movl) * if we tell it to load (u_short) port. */ __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port)); return (data); } void outb(u_int port, u_char data) { u_char al; /* * Use an unnecessary assignment to help gcc's register allocator. * This make a large difference for gcc-1.40 and a tiny difference * for gcc-2.6.0. For gcc-1.40, al had to be ``asm("ax")'' for * best results. gcc-2.6.0 can't handle this. */ al = data; __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port)); } #endif /* DDB */ diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 00293bed6505..5c478c6ccb4a 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,2973 +1,3056 @@ /* * Copyright (c) 1994,1997 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * - * $Id: vfs_bio.c,v 1.218 1999/06/28 15:32:10 peter Exp $ + * $Id: vfs_bio.c,v 1.219 1999/06/29 05:59:41 peter Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. * * see man buf(9) for more info. */ #define VMIO #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); struct bio_ops bioops; /* I/O operation notification */ struct buf *buf; /* buffer header pool */ struct swqueue bswlist; static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); static void vfs_vmio_release(struct buf *bp); -static void flushdirtybuffers(int slpflag, int slptimeo); static int flushbufqueues(void); +static int bd_request; + +static void buf_daemon __P((void)); /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. 
It would be better to split the buffer
CTLFLAG_RW, + &getnewbufcalls, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops, CTLFLAG_RW, + &getnewbufloops, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops1, CTLFLAG_RW, + &getnewbufloops1, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops2, CTLFLAG_RW, + &getnewbufloops2, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufloops3, CTLFLAG_RW, + &getnewbufloops3, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, + &getnewbufrestarts, 0, ""); static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; struct bqueues bufqueues[BUFFER_QUEUES] = { { 0 } }; char *buf_wmesg = BUF_WMESG; extern int vm_swap_size; #define BUF_MAXUSE 24 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ #define VFS_BIO_NEED_RESERVED02 0x02 /* unused */ #define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */ #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ #define VFS_BIO_NEED_KVASPACE 0x10 /* wait for buffer_map space, emerg */ /* * kvaspacewakeup: * * Called when kva space is potential available for recovery or when * kva space is recovered in the buffer_map. This function wakes up * anyone waiting for buffer_map kva space. Even though the buffer_map * is larger then maxbufspace, this situation will typically occur * when the buffer_map gets fragmented. */ static __inline void kvaspacewakeup(void) { /* * If someone is waiting for KVA space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_KVASPACE) { needsbuffer &= ~VFS_BIO_NEED_KVASPACE; wakeup(&needsbuffer); } } /* * bufspacewakeup: * * Called when buffer space is potentially available for recovery or when * buffer space is recovered. getnewbuf() will block on this flag when * it is unable to free sufficient buffer space. Buffer space becomes * recoverable when bp's get placed back in the queues. 
*/ static __inline void bufspacewakeup(void) { /* * If someone is waiting for BUF space, wake them up. Even * though we haven't freed the kva space yet, the waiting * process will be able to now. */ if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; wakeup(&needsbuffer); } } /* * bufcountwakeup: * * Called when a buffer has been added to one of the free queues to * account for the buffer and to wakeup anyone waiting for free buffers. * This typically occurs when large amounts of metadata are being handled * by the buffer cache ( else buffer space runs out first, usually ). */ static __inline void bufcountwakeup(void) { ++numfreebuffers; if (needsbuffer) { needsbuffer &= ~VFS_BIO_NEED_ANY; if (numfreebuffers >= hifreebuffers) needsbuffer &= ~VFS_BIO_NEED_FREE; wakeup(&needsbuffer); } } /* * vfs_buf_test_cache: * * Called when a buffer is extended. This function clears the B_CACHE * bit if the newly extended portion of the buffer does not contain * valid data. */ static __inline__ void vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, vm_page_t m) { if (bp->b_flags & B_CACHE) { int base = (foff + off) & PAGE_MASK; if (vm_page_is_valid(m, base, size) == 0) bp->b_flags &= ~B_CACHE; } } +static __inline__ +void +bd_wakeup(int dirtybuflevel) +{ + if (numdirtybuffers >= dirtybuflevel && bd_request == 0) { + bd_request = 1; + wakeup(&bd_request); + } +} + /* * Initialize buffer headers and related structures. 
*/ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); simple_lock_init(&buftimelock); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_xflags = 0; LIST_INIT(&bp->b_dep); BUF_LOCKINIT(bp); TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem * blocks to be 8K. If you happen to use a 16K filesystem, the size * of the buffer cache is still the same as it would be for 8K * filesystems. This keeps the size of the buffer cache "in check" * for big block filesystems. * * maxbufspace is calculated as around 50% of the KVA available in * the buffer_map ( DFLTSIZE vs BKVASIZE ), I presume to reduce the * effect of fragmentation. */ maxbufspace = (nbuf + 8) * DFLTBSIZE; if ((hibufspace = maxbufspace - MAXBSIZE * 5) <= MAXBSIZE) hibufspace = 3 * maxbufspace / 4; +#if 0 /* * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed */ maxvmiobufspace = 2 * hibufspace / 3; +#endif /* * Limit the amount of malloc memory since it is wired permanently into * the kernel space. Even though this is accounted for in the buffer * allocation, we don't want the malloced region to grow uncontrolled. * The malloc scheme improves memory utilization significantly on average * (small) directories. */ maxbufmallocspace = hibufspace / 20; /* * Reduce the chance of a deadlock occuring by limiting the number * of delayed-write dirty buffers we allow to stack up. 
*/ - lodirtybuffers = nbuf / 16 + 10; - hidirtybuffers = nbuf / 8 + 20; + lodirtybuffers = nbuf / 6 + 10; + hidirtybuffers = nbuf / 3 + 20; numdirtybuffers = 0; /* * Try to keep the number of free buffers in the specified range, * and give the syncer access to an emergency reserve. */ lofreebuffers = nbuf / 18 + 5; hifreebuffers = 2 * lofreebuffers; numfreebuffers = nbuf; kvafreespace = 0; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); } /* * Free the kva allocation for a buffer * Must be called only at splbio or higher, * as this is the only locking for buffer_map. */ static void bfreekva(struct buf * bp) { if (bp->b_kvasize) { vm_map_delete(buffer_map, (vm_offset_t) bp->b_kvabase, (vm_offset_t) bp->b_kvabase + bp->b_kvasize ); bp->b_kvasize = 0; kvaspacewakeup(); } } /* * bremfree: * * Remove the buffer from the appropriate free list. */ void bremfree(struct buf * bp) { int s = splbio(); int old_qindex = bp->b_qindex; if (bp->b_qindex != QUEUE_NONE) { - if (bp->b_qindex == QUEUE_EMPTY) { + if (bp->b_qindex == QUEUE_EMPTYKVA) { kvafreespace -= bp->b_kvasize; } if (BUF_REFCNT(bp) == 1) TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); else if (BUF_REFCNT(bp) == 0) panic("bremfree: not locked"); else /* Temporary panic to verify exclusive locking */ /* This panic goes away when we allow shared refs */ panic("bremfree: multiple refs"); bp->b_qindex = QUEUE_NONE; runningbufspace += bp->b_bufsize; } else { #if !defined(MAX_PERF) panic("bremfree: removing a buffer when not on a queue"); #endif } /* * Fixup numfreebuffers count. If the buffer is invalid or not * delayed-write, and it was on the EMPTY, LRU, or AGE queues, * the buffer was free and we must decrement numfreebuffers. 
*/ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) { switch(old_qindex) { + case QUEUE_DIRTY: + case QUEUE_CLEAN: case QUEUE_EMPTY: - case QUEUE_LRU: - case QUEUE_AGE: + case QUEUE_EMPTYKVA: --numfreebuffers; break; default: break; } } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. We * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE * is set, the buffer is valid and we do not have to do anything ( see * getblk() ). */ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; KASSERT(!(bp->b_flags & B_ASYNC), ("bread: illegal async bp %p", bp)); bp->b_flags |= B_READ; bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); return (biowait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. We must clear B_ERROR and B_INVAL prior * to initiating I/O . If B_CACHE is set, the buffer is valid * and we do not have to do anything. 
*/ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(vp, bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); BUF_KERNPROC(rabp); VOP_STRATEGY(vp, rabp); } else { brelse(rabp); } } if (readwait) { rv = biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. 
*/ int bwrite(struct buf * bp) { int oldflags, s; struct vnode *vp; struct mount *mp; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } oldflags = bp->b_flags; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("bwrite: buffer is not busy???"); #endif s = splbio(); bundirty(bp); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR); bp->b_flags |= B_WRITEINPROG | B_CACHE; bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; splx(s); if (oldflags & B_ASYNC) BUF_KERNPROC(bp); VOP_STRATEGY(bp->b_vp, bp); /* * Collect statistics on synchronous and asynchronous writes. * Writes to block devices are charged to their associated * filesystem (if any). */ if ((vp = bp->b_vp) != NULL) { if (vp->v_type == VBLK) mp = vp->v_specmountpoint; else mp = vp->v_mount; if (mp != NULL) { if ((oldflags & B_ASYNC) == 0) mp->mnt_stat.f_syncwrites++; else mp->mnt_stat.f_asyncwrites++; } } if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); brelse(bp); return (rtval); } return (0); } /* * Delayed write. (Buffer is marked dirty). Do not bother writing * anything if the buffer is marked invalid. * * Note that since the buffer must be completely valid, we can safely * set B_CACHE. In fact, we have to set B_CACHE here rather then in * biodone() in order to prevent getblk from writing the buffer * out synchronously. */ void bdwrite(struct buf * bp) { struct vnode *vp; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("bdwrite: buffer is not busy"); #endif if (bp->b_flags & B_INVAL) { brelse(bp); return; } bdirty(bp); /* * Set B_CACHE, indicating that the buffer is fully valid. This is * true even of NFS now. */ bp->b_flags |= B_CACHE; /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. 
Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if (bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. */ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); bqrelse(bp); + /* + * Wakeup the buffer flushing daemon if we have saturated the + * buffer cache. + */ + + bd_wakeup(hidirtybuffers); + /* * XXX The soft dependency code is not prepared to * have I/O done when a bdwrite is requested. For * now we just let the write be delayed if it is * requested by the soft dependency code. */ if ((vp = bp->b_vp) && ((vp->v_type == VBLK && vp->v_specmountpoint && (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) return; - - if (numdirtybuffers >= hidirtybuffers) - flushdirtybuffers(0, 0); } /* * bdirty: * * Turn buffer into delayed write request. We must clear B_READ and * B_RELBUF, and we must set B_DELWRI. We reassign the buffer to * itself to properly update it in the dirty/clean lists. We mark it * B_DONE to ensure that any asynchronization of the buffer properly * clears B_DONE ( else a panic will occur later ). * * bdirty() is kinda like bdwrite() - we have to clear B_INVAL which * might have been set pre-getblk(). Unlike bwrite/bdwrite, bdirty() * should only be called if the buffer is known-good. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. 
*/ void bdirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); ++numdirtybuffers; + bd_wakeup(hidirtybuffers); } } /* * bundirty: * * Clear B_DELWRI for buffer. * * Since the buffer is not on a queue, we do not update the numfreebuffers * count. * * Must be called at splbio(). * The buffer must be on QUEUE_NONE. */ void bundirty(bp) struct buf *bp; { KASSERT(bp->b_qindex == QUEUE_NONE, ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex)); if (bp->b_flags & B_DELWRI) { bp->b_flags &= ~B_DELWRI; reassignbuf(bp, bp->b_vp); --numdirtybuffers; } } /* * bawrite: * * Asynchronous write. Start output on a buffer, but do not wait for * it to complete. The buffer is released when the output completes. * * bwrite() ( or the VOP routine anyway ) is responsible for handling * B_INVAL buffers. Not us. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp->b_vp, bp); } /* * bowrite: * * Ordered write. Start output on a buffer, and flag it so that the * device will write it in the order it was queued. The buffer is * released when the output completes. bwrite() ( or the VOP routine * anyway ) is responsible for handling B_INVAL buffers. */ int bowrite(struct buf * bp) { bp->b_flags |= B_ORDERED | B_ASYNC; return (VOP_BWRITE(bp->b_vp, bp)); } /* * brelse: * * Release a busy buffer and, if requested, free its resources. The * buffer will be stashed in the appropriate bufqueue[] allowing it * to be accessed later as a cache entity or reused for other purposes. 
*/ void brelse(struct buf * bp) { int s; KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if 0 if (bp->b_flags & B_CLUSTER) { relpbuf(bp, NULL); return; } #endif s = splbio(); if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_READ | B_ERROR)) == B_ERROR) { /* * Failed write, redirty. Must clear B_ERROR to prevent * pages from being scrapped. Note: B_INVAL is ignored * here but will presumably be dealt with later. */ bp->b_flags &= ~B_ERROR; bdirty(bp); } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || (bp->b_bufsize <= 0)) { /* * Either a failed I/O or we were asked to free or not * cache the buffer. */ bp->b_flags |= B_INVAL; if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); if (bp->b_flags & B_DELWRI) --numdirtybuffers; bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); if ((bp->b_flags & B_VMIO) == 0) { if (bp->b_bufsize) allocbuf(bp, 0); if (bp->b_vp) brelvp(bp); } } /* * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() * is called with B_DELWRI set, the underlying pages may wind up * getting freed causing a previous write (bdwrite()) to get 'lost' * because pages associated with a B_DELWRI bp are marked clean. * * We still allow the B_INVAL case to call vfs_vmio_release(), even * if B_DELWRI is set. */ if (bp->b_flags & B_DELWRI) bp->b_flags &= ~B_RELBUF; /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, not even NFS buffers now. Two flags effect this. If * B_INVAL, the struct buf is invalidated but the VM object is kept * around ( i.e. so it is trivial to reconstitute the buffer later ). * * If B_ERROR or B_NOCACHE is set, pages in the VM object will be * invalidated. B_ERROR cannot be set for a failed write unless the * buffer is also B_INVAL because it hits the re-dirtying code above. * * Normally we can do this whether a buffer is B_DELWRI or not. 
If * the buffer is an NFS buffer, it is tracking piecemeal writes or * the commit state and we cannot afford to lose the buffer. */ if ((bp->b_flags & B_VMIO) && !(bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK && (bp->b_flags & B_DELWRI)) ) { int i, j, resid; vm_page_t m; off_t foff; vm_pindex_t poff; vm_object_t obj; struct vnode *vp; vp = bp->b_vp; /* * Get the base offset and length of the buffer. Note that * for block sizes that are less then PAGE_SIZE, the b_data * base of the buffer does not represent exactly b_offset and * neither b_offset nor b_size are necessarily page aligned. * Instead, the starting position of b_offset is: * * b_data + (b_offset & PAGE_MASK) * * block sizes less then DEV_BSIZE (usually 512) are not * supported due to the page granularity bits (m->valid, * m->dirty, etc...). * * See man buf(9) for more information */ resid = bp->b_bufsize; foff = bp->b_offset; for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if (m == bogus_page) { obj = (vm_object_t) vp->v_object; poff = OFF_TO_IDX(bp->b_offset); for (j = i; j < bp->b_npages; j++) { m = bp->b_pages[j]; if (m == bogus_page) { m = vm_page_lookup(obj, poff + j); #if !defined(MAX_PERF) if (!m) { panic("brelse: page missing\n"); } #endif bp->b_pages[j] = m; } } if ((bp->b_flags & B_INVAL) == 0) { pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } if (bp->b_flags & (B_NOCACHE|B_ERROR)) { int poffset = foff & PAGE_MASK; int presid = resid > (PAGE_SIZE - poffset) ? 
(PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); vm_page_set_invalid(m, poffset, presid); } resid -= PAGE_SIZE - (foff & PAGE_MASK); foff = (foff + PAGE_SIZE) & ~PAGE_MASK; } if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } else if (bp->b_flags & B_VMIO) { if (bp->b_flags & (B_INVAL | B_RELBUF)) vfs_vmio_release(bp); } #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); #endif if (BUF_REFCNT(bp) > 1) { /* Temporary panic to verify exclusive locking */ /* This panic goes away when we allow shared refs */ panic("brelse: multiple refs"); /* do not release to free list */ BUF_UNLOCK(bp); splx(s); return; } /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_flags |= B_INVAL; - bp->b_qindex = QUEUE_EMPTY; - TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + if (bp->b_kvasize) + bp->b_qindex = QUEUE_EMPTYKVA; + else + bp->b_qindex = QUEUE_EMPTY; + TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; kvafreespace += bp->b_kvasize; if (bp->b_kvasize) kvaspacewakeup(); /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_flags |= B_INVAL; - bp->b_qindex = QUEUE_AGE; - TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); - /* buffers with stale but valid contents */ - } else if (bp->b_flags & B_AGE) { - bp->b_qindex = QUEUE_AGE; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); - - /* buffers with valid and quite potentially reuseable contents */ + /* 
remaining buffers */ } else { - bp->b_qindex = QUEUE_LRU; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + switch(bp->b_flags & (B_DELWRI|B_AGE)) { + case B_DELWRI | B_AGE: + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + break; + case B_DELWRI: + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); + break; + case B_AGE: + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + break; + default: + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); + break; + } } /* - * If B_INVAL, clear B_DELWRI. + * If B_INVAL, clear B_DELWRI. We've already placed the buffer + * on the correct queue. */ if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) { bp->b_flags &= ~B_DELWRI; --numdirtybuffers; } runningbufspace -= bp->b_bufsize; /* * Fixup numfreebuffers count. The bp is on an appropriate queue * unless locked. We then bump numfreebuffers if it is not B_DELWRI. * We've already handled the B_INVAL case ( B_DELWRI will be clear * if B_INVAL is set ). */ if ((bp->b_flags & B_LOCKED) == 0 && !(bp->b_flags & B_DELWRI)) bufcountwakeup(); /* * Something we can maybe free. */ if (bp->b_bufsize) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } /* * Release a buffer back to the appropriate queue but do not try to free * it. * * bqrelse() is used by bdwrite() to requeue a delayed write, and used by * biodone() to requeue an async I/O on completion. It is also used when * known good buffers need to be requeued but we think we may need the data * again soon. 
*/ void bqrelse(struct buf * bp) { int s; s = splbio(); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); #if !defined(MAX_PERF) if (bp->b_qindex != QUEUE_NONE) panic("bqrelse: free buffer onto another queue???"); #endif if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ panic("bqrelse: multiple refs"); BUF_UNLOCK(bp); splx(s); return; } if (bp->b_flags & B_LOCKED) { bp->b_flags &= ~B_ERROR; bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_DELWRI) { + bp->b_qindex = QUEUE_DIRTY; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist); } else { - bp->b_qindex = QUEUE_LRU; - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + bp->b_qindex = QUEUE_CLEAN; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp, b_freelist); } runningbufspace -= bp->b_bufsize; if ((bp->b_flags & B_LOCKED) == 0 && - ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) - ) { + ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))) { bufcountwakeup(); } /* * Something we can maybe wakeup */ if (bp->b_bufsize) bufspacewakeup(); /* unlock */ BUF_UNLOCK(bp); bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } static void vfs_vmio_release(bp) struct buf *bp; { int i, s; vm_page_t m; s = splvm(); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; bp->b_pages[i] = NULL; /* * In order to keep page LRU ordering consistent, put * everything on the inactive queue. */ vm_page_unwire(m, 0); /* * We don't mess with busy pages, it is * the responsibility of the process that * busied the pages to deal with them. */ if ((m->flags & PG_BUSY) || (m->busy != 0)) continue; if (m->wire_count == 0) { vm_page_flag_clear(m, PG_ZERO); /* * Might as well free the page if we can and it has * no valid data. 
*/ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); } } } bufspace -= bp->b_bufsize; vmiospace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; splx(s); pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); if (bp->b_bufsize) bufspacewakeup(); bp->b_npages = 0; bp->b_bufsize = 0; bp->b_flags &= ~B_VMIO; if (bp->b_vp) brelvp(bp); } /* * Check to see if a block is currently memory resident. */ struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } bp = bp->b_hash.le_next; } return (bp); } /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. */ int vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; int size; int maxcl; s = splbio(); /* * right now we support clustered writing only to regular files, and * then only if our I/O system is not saturated. 
*/ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && BUF_REFCNT(bpa) == 0 && ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno, ncl); splx(s); return nwritten; } } BUF_LOCK(bp, LK_EXCLUSIVE); bremfree(bp); bp->b_flags |= B_ASYNC; splx(s); /* * default (old) behavior, writing out only one block * * XXX returns b_bufsize instead of b_bcount for nwritten? */ nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp->b_vp, bp); return nwritten; } /* * getnewbuf: * * Find and initialize a new buffer header, freeing up existing buffers * in the bufqueues as necessary. The new buffer is returned locked. * * Important: B_INVAL is not set. If the caller wishes to throw the * buffer away, the caller must set B_INVAL prior to calling brelse(). * * We block if: * We have insufficient buffer headers * We have insufficient buffer space * buffer_map is too fragmented ( space reservation fails ) + * If we have to flush dirty buffers ( but we try to avoid this ) * - * We do *not* attempt to flush dirty buffers more then one level deep. - * I.e., if P_FLSINPROG is set we do not flush dirty buffers at all. - * - * If P_FLSINPROG is set, we are allowed to dip into our emergency - * reserve. + * To avoid VFS layer recursion we do not flush dirty buffers ourselves. + * Instead we ask the pageout daemon to do it for us. We attempt to + * avoid piecemeal wakeups of the pageout daemon. 
*/ + + /* + * We fully expect to be able to handle any fragmentation and buffer + * space issues by freeing QUEUE_CLEAN buffers. If this fails, we + * have to wakeup the pageout daemon and ask it to flush some of our + * QUEUE_DIRTY buffers. We have to be careful to prevent a deadlock. + * XXX + */ + static struct buf * getnewbuf(struct vnode *vp, daddr_t blkno, int slpflag, int slptimeo, int size, int maxsize) { struct buf *bp; struct buf *nbp; struct buf *dbp; int outofspace; int nqindex; int defrag = 0; static int newbufcnt = 0; int lastnewbuf = newbufcnt; + ++getnewbufcalls; + --getnewbufrestarts; restart: + ++getnewbufrestarts; + /* * Calculate whether we are out of buffer space. This state is * recalculated on every restart. If we are out of space, we - * have to turn off defragmentation. The outofspace code will - * defragment too, but the looping conditionals will be messed up - * if both outofspace and defrag are on. + * have to turn off defragmentation. Setting defrag to -1 when + * outofspace is positive means "defrag while freeing buffers". + * The looping conditional will be muffed up if defrag is left + * positive when outofspace is positive. */ dbp = NULL; outofspace = 0; if (bufspace >= hibufspace) { - if ((curproc->p_flag & P_FLSINPROG) == 0 || - bufspace >= maxbufspace - ) { + if ((curproc->p_flag & P_BUFEXHAUST) == 0 || + bufspace >= maxbufspace) { outofspace = 1; - defrag = 0; + if (defrag > 0) + defrag = -1; } } /* * defrag state is semi-persistant. 1 means we are flagged for * defragging. -1 means we actually defragged something. */ /* nop */ /* * Setup for scan. If we do not have enough free buffers, - * we setup a degenerate case that falls through the while. + * we setup a degenerate case that immediately fails. Note + * that if we are specially marked process, we are allowed to + * dip into our reserves. * - * If we are in the middle of a flush, we can dip into the - * emergency reserve. + * Normally we want to find an EMPTYKVA buffer. 
That is, a + * buffer with kva already allocated. If there are no EMPTYKVA + * buffers we back up to the truely EMPTY buffers. When defragging + * we do not bother backing up since we have to locate buffers with + * kva to defrag. If we are out of space we skip both EMPTY and + * EMPTYKVA and dig right into the CLEAN queue. * - * If we are out of space, we skip trying to scan QUEUE_EMPTY - * because those buffers are, well, empty. + * In this manner we avoid scanning unnecessary buffers. It is very + * important for us to do this because the buffer cache is almost + * constantly out of space or in need of defragmentation. */ - if ((curproc->p_flag & P_FLSINPROG) == 0 && + if ((curproc->p_flag & P_BUFEXHAUST) == 0 && numfreebuffers < lofreebuffers) { - nqindex = QUEUE_LRU; + nqindex = QUEUE_CLEAN; nbp = NULL; } else { - nqindex = QUEUE_EMPTY; - if (outofspace || - (nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) == NULL) { - nqindex = QUEUE_AGE; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); - if (nbp == NULL) { - nqindex = QUEUE_LRU; - nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); + nqindex = QUEUE_EMPTYKVA; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]); + if (nbp == NULL) { + if (defrag <= 0) { + nqindex = QUEUE_EMPTY; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); } } + if (outofspace || nbp == NULL) { + nqindex = QUEUE_CLEAN; + nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]); + } } /* * Run scan, possibly freeing data and/or kva mappings on the fly * depending. */ + if (nbp) + --getnewbufloops; + while ((bp = nbp) != NULL) { int qindex = nqindex; + + ++getnewbufloops; /* * Calculate next bp ( we can only use it if we do not block * or do other fancy things ). 
*/ if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) { switch(qindex) { case QUEUE_EMPTY: - nqindex = QUEUE_AGE; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) + nqindex = QUEUE_EMPTYKVA; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]))) break; /* fall through */ - case QUEUE_AGE: - nqindex = QUEUE_LRU; - if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) + case QUEUE_EMPTYKVA: + nqindex = QUEUE_CLEAN; + if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]))) break; /* fall through */ - case QUEUE_LRU: + case QUEUE_CLEAN: /* * nbp is NULL. */ break; } } /* * Sanity Checks */ KASSERT(BUF_REFCNT(bp) == 0, ("getnewbuf: busy buffer %p on free list", bp)); KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp)); /* - * Here we try to move NON VMIO buffers to the end of the - * LRU queue in order to make VMIO buffers more readily - * freeable. We also try to move buffers with a positive - * usecount to the end. - * - * Note that by moving the bp to the end, we setup a following - * loop. Since we continue to decrement b_usecount this - * is ok and, in fact, desireable. - * - * If we are at the end of the list, we move ourself to the - * same place and need to fixup nbp and nqindex to handle - * the following case. + * Note: we no longer distinguish between VMIO and non-VMIO + * buffers. */ - if ((qindex == QUEUE_LRU) && bp->b_usecount > 0) { - if ((bp->b_flags & B_VMIO) == 0 || - (vmiospace < maxvmiobufspace) - ) { - --bp->b_usecount; - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - if (nbp == NULL) { - nqindex = qindex; - nbp = bp; - } - continue; - } - } + KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); /* - * If we come across a delayed write and numdirtybuffers should - * be flushed, try to write it out. Only if P_FLSINPROG is - * not set. 
We can't afford to recursively stack more then - * one deep due to the possibility of having deep VFS call - * stacks. - * - * Limit the number of dirty buffers we are willing to try - * to recover since it really isn't our job here. + * If we are defragging and the buffer isn't useful for fixing + * that problem we continue. If we are out of space and the + * buffer isn't useful for fixing that problem we continue. */ - if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { - /* - * This is rather complex, but necessary. If we come - * across a B_DELWRI buffer we have to flush it in - * order to use it. We only do this if we absolutely - * need to. We must also protect against too much - * recursion which might run us out of stack due to - * deep VFS call stacks. - * - * In heavy-writing situations, QUEUE_LRU can contain - * a large number of DELWRI buffers at its head. These - * buffers must be moved to the tail if they cannot be - * written async in order to reduce the scanning time - * required to skip past these buffers in later - * getnewbuf() calls. - */ - if ((curproc->p_flag & P_FLSINPROG) || - numdirtybuffers < hidirtybuffers) { - if (qindex == QUEUE_LRU) { - /* - * dbp prevents us from looping forever - * if all bps in QUEUE_LRU are dirty. - */ - if (bp == dbp) { - bp = NULL; - break; - } - if (dbp == NULL) - dbp = TAILQ_LAST(&bufqueues[QUEUE_LRU], bqueues); - TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); - TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); - } - continue; - } - curproc->p_flag |= P_FLSINPROG; - vfs_bio_awrite(bp); - curproc->p_flag &= ~P_FLSINPROG; - goto restart; - } - if (defrag > 0 && bp->b_kvasize == 0) + if (defrag > 0 && bp->b_kvasize == 0) { + ++getnewbufloops1; continue; - if (outofspace > 0 && bp->b_bufsize == 0) + } + if (outofspace > 0 && bp->b_bufsize == 0) { + ++getnewbufloops2; continue; + } /* * Start freeing the bp. This is somewhat involved. nbp - * remains valid only for QUEUE_EMPTY bp's. 
+ * remains valid only for QUEUE_EMPTY[KVA] bp's. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("getnewbuf: locked buf"); bremfree(bp); - if (qindex == QUEUE_LRU || qindex == QUEUE_AGE) { + if (qindex == QUEUE_CLEAN) { if (bp->b_flags & B_VMIO) { bp->b_flags &= ~B_ASYNC; vfs_vmio_release(bp); } if (bp->b_vp) brelvp(bp); } /* * NOTE: nbp is now entirely invalid. We can only restart * the scan from this point on. * * Get the rest of the buffer freed up. b_kva* is still * valid after this operation. */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) (*bioops.io_deallocate)(bp); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); if (bp->b_bufsize) allocbuf(bp, 0); bp->b_flags = 0; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_offset = NOOFFSET; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_usecount = 5; LIST_INIT(&bp->b_dep); /* * Ok, now that we have a free buffer, if we are defragging - * we have to recover the kvaspace. + * we have to recover the kvaspace. If we are out of space + * we have to free the buffer (which we just did), but we + * do not have to recover kva space unless we hit a defrag + * hicup. Being able to avoid freeing the kva space leads + * to a significant reduction in overhead. */ if (defrag > 0) { defrag = -1; bp->b_flags |= B_INVAL; bfreekva(bp); brelse(bp); goto restart; } if (outofspace > 0) { outofspace = -1; bp->b_flags |= B_INVAL; - bfreekva(bp); + if (defrag < 0) + bfreekva(bp); brelse(bp); goto restart; } /* * We are done */ break; } /* - * If we exhausted our list, sleep as appropriate. + * If we exhausted our list, sleep as appropriate. We may have to + * wakeup the pageout daemon to write out some dirty buffers. 
*/ if (bp == NULL) { int flags; dosleep: if (defrag > 0) flags = VFS_BIO_NEED_KVASPACE; else if (outofspace > 0) flags = VFS_BIO_NEED_BUFSPACE; else flags = VFS_BIO_NEED_ANY; + /* XXX */ + (void) speedup_syncer(); needsbuffer |= flags; while (needsbuffer & flags) { if (tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", slptimeo)) return (NULL); } } else { /* * We finally have a valid bp. We aren't quite out of the * woods, we still have to reserve kva space. */ vm_offset_t addr = 0; maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; if (maxsize != bp->b_kvasize) { bfreekva(bp); if (vm_map_findspace(buffer_map, - vm_map_min(buffer_map), maxsize, &addr) - ) { + vm_map_min(buffer_map), maxsize, &addr)) { /* * Uh oh. Buffer map is to fragmented. Try * to defragment. */ if (defrag <= 0) { defrag = 1; bp->b_flags |= B_INVAL; brelse(bp); goto restart; } /* * Uh oh. We couldn't seem to defragment */ bp = NULL; goto dosleep; } } if (addr) { vm_map_insert(buffer_map, NULL, 0, addr, addr + maxsize, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); bp->b_kvabase = (caddr_t) addr; bp->b_kvasize = maxsize; } bp->b_data = bp->b_kvabase; } /* * If we have slept at some point in this process and another * process has managed to allocate a new buffer while we slept, * we have to return NULL so that our caller can recheck to * ensure that the other process did not create an identically * identified buffer to the one we were requesting. We make this * check by incrementing the static int newbufcnt each time we * successfully allocate a new buffer. By saving the value of * newbufcnt in our local lastnewbuf, we can compare newbufcnt * with lastnewbuf to see if any other process managed to * allocate a buffer while we were doing so ourselves. * * Note that bp, if valid, is locked. */ if (lastnewbuf == newbufcnt) { /* * No buffers allocated, so we can return one if we were * successful, or continue trying if we were not successful. 
*/ if (bp != NULL) { newbufcnt += 1; return (bp); } goto restart; } /* * Another process allocated a buffer since we were called, so * we have to free the one we allocated and return NULL to let * our caller recheck to see if a new buffer is still needed. */ if (bp != NULL) { bp->b_flags |= B_INVAL; brelse(bp); } return (NULL); } /* * waitfreebuffers: * - * Wait for sufficient free buffers. This routine is not called if - * curproc is the update process so we do not have to do anything - * fancy. + * Wait for sufficient free buffers. Only called from normal processes. */ static void waitfreebuffers(int slpflag, int slptimeo) { while (numfreebuffers < hifreebuffers) { - flushdirtybuffers(slpflag, slptimeo); + bd_wakeup(0); if (numfreebuffers >= hifreebuffers) break; needsbuffer |= VFS_BIO_NEED_FREE; if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) break; } } /* - * flushdirtybuffers: - * - * This routine is called when we get too many dirty buffers. + * buf_daemon: * - * We have to protect ourselves from recursion, but we also do not want - * other process's flushdirtybuffers() to interfere with the syncer if - * it decides to flushdirtybuffers(). - * - * In order to maximize operations, we allow any process to flush - * dirty buffers and use P_FLSINPROG to prevent recursion. + * buffer flushing daemon. Buffers are normally flushed by the + * update daemon but if it cannot keep up this process starts to + * take the load in an attempt to prevent getnewbuf() from blocking. 
*/ +static struct proc *bufdaemonproc; +static int bd_interval; +static int bd_flushto; + +static struct kproc_desc buf_kp = { + "bufdaemon", + buf_daemon, + &bufdaemonproc +}; +SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp) + static void -flushdirtybuffers(int slpflag, int slptimeo) +buf_daemon() { int s; - + /* + * This process is allowed to take the buffer cache to the limit + */ + curproc->p_flag |= P_BUFEXHAUST; s = splbio(); - if (curproc->p_flag & P_FLSINPROG) { - splx(s); - return; - } - curproc->p_flag |= P_FLSINPROG; + bd_interval = 5 * hz; /* dynamically adjusted */ + bd_flushto = hidirtybuffers; /* dynamically adjusted */ - while (numdirtybuffers > lodirtybuffers) { - if (flushbufqueues() == 0) - break; - } + while (TRUE) { + bd_request = 0; - curproc->p_flag &= ~P_FLSINPROG; + /* + * Do the flush. + */ + { + while (numdirtybuffers > bd_flushto) { + if (flushbufqueues() == 0) + break; + } + } - splx(s); + /* + * Whew. If nobody is requesting anything we sleep until the + * next event. If we sleep and the sleep times out and + * nobody is waiting for interesting things we back-off. + * Otherwise we get more aggressive. + */ + + if (bd_request == 0 && + tsleep(&bd_request, PVM, "psleep", bd_interval) && + needsbuffer == 0) { + /* + * timed out and nothing serious going on, + * increase the flushto high water mark to reduce + * the flush rate. + */ + bd_flushto += 10; + } else { + /* + * We were woken up or hit a serious wall that needs + * to be addressed. 
+ */ + bd_flushto -= 10; + if (needsbuffer) { + int middb = (lodirtybuffers+hidirtybuffers)/2; + bd_interval >>= 1; + if (bd_flushto > middb) + bd_flushto = middb; + } + } + if (bd_flushto < lodirtybuffers) { + bd_flushto = lodirtybuffers; + bd_interval -= hz / 10; + } + if (bd_flushto > hidirtybuffers) { + bd_flushto = hidirtybuffers; + bd_interval += hz / 10; + } + if (bd_interval < hz / 10) + bd_interval = hz / 10; + + if (bd_interval > 5 * hz) + bd_interval = 5 * hz; + } } static int flushbufqueues(void) { struct buf *bp; - int qindex; int r = 0; - qindex = QUEUE_AGE; - bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); - - for (;;) { - if (bp == NULL) { - if (qindex == QUEUE_LRU) - break; - qindex = QUEUE_LRU; - if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU])) == NULL) - break; - } + bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]); + while (bp) { /* * Try to free up B_INVAL delayed-write buffers rather then * writing them out. Note also that NFS is somewhat sensitive * to B_INVAL buffers so it is doubly important that we do * this. + * + * We do not try to sync buffers whos vnodes are locked, we + * cannot afford to block in this process. */ + KASSERT((bp->b_flags & B_DELWRI), ("unexpected clean buffer %p", bp)); if ((bp->b_flags & B_DELWRI) != 0) { if (bp->b_flags & B_INVAL) { if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) panic("flushbufqueues: locked buf"); bremfree(bp); brelse(bp); - } else { + ++r; + break; + } + if (!VOP_ISLOCKED(bp->b_vp)) { vfs_bio_awrite(bp); + ++r; + break; } - ++r; - break; } bp = TAILQ_NEXT(bp, b_freelist); } return(r); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; int s = splbio(); bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. 
*/ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc, size; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) return 0; obj = vp->v_object; size = PAGE_SIZE; if (size > vp->v_mount->mnt_stat.f_iosize) size = vp->v_mount->mnt_stat.f_iosize; off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; tinc = size; if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); if (vm_page_is_valid(m, (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) return 0; } return 1; } /* * vfs_setdirty: * * Sets the dirty range for a buffer based on the status of the dirty * bits in the pages comprising the buffer. * * The range is limited to the size of the buffer. * * This routine is primarily used by NFS, but is generalized for the * B_VMIO case. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; /* * Degenerate case - empty buffer */ if (bp->b_bufsize == 0) return; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) == 0) return; object = bp->b_pages[0]->object; if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p writeable but not mightbedirty\n", object); if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY)) printf("Warning: object %p mightbedirty but not writeable\n", object); if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) { vm_offset_t boffset; vm_offset_t eoffset; /* * test the pages to see if they have been modified directly * by users through the VM system. 
*/ for (i = 0; i < bp->b_npages; i++) { vm_page_flag_clear(bp->b_pages[i], PG_ZERO); vm_page_test_dirty(bp->b_pages[i]); } /* * Calculate the encompassing dirty range, boffset and eoffset, * (eoffset - boffset) bytes. */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) break; } boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); /* * Fit it to the buffer. */ if (eoffset > bp->b_bcount) eoffset = bp->b_bcount; /* * If we have a good dirty range, merge with the existing * dirty range. */ if (boffset < eoffset) { if (bp->b_dirtyoff > boffset) bp->b_dirtyoff = boffset; if (bp->b_dirtyend < eoffset) bp->b_dirtyend = eoffset; } } } /* * getblk: * * Get a block given a specified block and offset into a file/device. * The buffers B_DONE bit will be cleared on return, making it almost * ready for an I/O initiation. B_INVAL may or may not be set on * return. The caller should clear B_INVAL prior to initiating a * READ. * * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for * an existing buffer. * * For a VMIO buffer, B_CACHE is modified according to the backing VM. * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set * and then cleared based on the backing VM. If the previous buffer is * non-0-sized but invalid, B_CACHE will be cleared. * * If getblk() must create a new buffer, the new buffer is returned with * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which * case it is returned with B_INVAL clear and B_CACHE set based on the * backing VM. * * getblk() also forces a VOP_BWRITE() for any B_DELWRI buffer whos * B_CACHE bit is clear. * * What this means, basically, is that the caller should use B_CACHE to * determine whether the buffer is fully valid or not and should clear * B_INVAL prior to issuing a read. 
If the caller intends to validate * the buffer by loading its data area with something, the caller needs * to clear B_INVAL. If the caller does this without issuing an I/O, * the caller should set B_CACHE ( as an optimization ), else the caller * should issue the I/O and biodone() will set B_CACHE if the I/O was * a write attempt or if it was a successfull read. If the caller * intends to issue a READ, the caller must clear B_INVAL and B_ERROR * prior to issuing the READ. biodone() will *not* clear B_INVAL. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; #if !defined(MAX_PERF) if (size > MAXBSIZE) panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); #endif s = splbio(); loop: /* - * Block if we are low on buffers. The syncer is allowed more - * buffers in order to avoid a deadlock. + * Block if we are low on buffers. Certain processes are allowed + * to completely exhaust the buffer cache. */ - if (curproc == updateproc && numfreebuffers == 0) { - needsbuffer |= VFS_BIO_NEED_ANY; - tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", - slptimeo); - } else if (curproc != updateproc && numfreebuffers < lofreebuffers) { + if (curproc->p_flag & P_BUFEXHAUST) { + if (numfreebuffers == 0) { + needsbuffer |= VFS_BIO_NEED_ANY; + tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", + slptimeo); + } + } else if (numfreebuffers < lofreebuffers) { waitfreebuffers(slpflag, slptimeo); } if ((bp = gbincore(vp, blkno))) { /* - * Buffer is in-core + * Buffer is in-core. If the buffer is not busy, it must + * be on a queue. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; if (BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL, "getblk", slpflag, slptimeo) == ENOLCK) goto loop; splx(s); return (struct buf *) NULL; } /* * The buffer is locked. B_CACHE is cleared if the buffer is * invalid. 
Ohterwise, for a non-VMIO buffer, B_CACHE is set * and for a VMIO buffer B_CACHE is adjusted according to the * backing VM cache. */ if (bp->b_flags & B_INVAL) bp->b_flags &= ~B_CACHE; else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0) bp->b_flags |= B_CACHE; bremfree(bp); /* * check for size inconsistancies for non-VMIO case. */ if (bp->b_bcount != size) { if ((bp->b_flags & B_VMIO) == 0 || - (size > bp->b_kvasize) - ) { + (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp->b_vp, bp); } else { if ((bp->b_flags & B_VMIO) && (LIST_FIRST(&bp->b_dep) == NULL)) { bp->b_flags |= B_RELBUF; brelse(bp); } else { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp->b_vp, bp); } } goto loop; } } /* * If the size is inconsistant in the VMIO case, we can resize * the buffer. This might lead to B_CACHE getting set or * cleared. If the size has not changed, B_CACHE remains * unchanged from its previous state. */ if (bp->b_bcount != size) allocbuf(bp, size); KASSERT(bp->b_offset != NOOFFSET, ("getblk: no buffer offset")); /* * A buffer with B_DELWRI set and B_CACHE clear must * be committed before we can return the buffer in * order to prevent the caller from issuing a read * ( due to B_CACHE not being set ) and overwriting * it. * * Most callers, including NFS and FFS, need this to * operate properly either because they assume they * can issue a read if B_CACHE is not set, or because * ( for example ) an uncached B_DELWRI might loop due * to softupdates re-dirtying the buffer. In the latter * case, B_CACHE is set after the first write completes, * preventing further loops. */ if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { VOP_BWRITE(bp->b_vp, bp); goto loop; } if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); bp->b_flags &= ~B_DONE; } else { /* * Buffer is not in-core, create new buffer. The buffer * returned by getnewbuf() is locked. Note that the returned * buffer is also considered valid (not marked B_INVAL). 
*/ int bsize, maxsize, vmio; off_t offset; if (vp->v_type == VBLK) bsize = DEV_BSIZE; else if (vp->v_mountedhere) bsize = vp->v_mountedhere->mnt_stat.f_iosize; else if (vp->v_mount) bsize = vp->v_mount->mnt_stat.f_iosize; else bsize = size; offset = (off_t)blkno * bsize; vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF); maxsize = vmio ? size + (offset & PAGE_MASK) : size; maxsize = imax(maxsize, bsize); if ((bp = getnewbuf(vp, blkno, slpflag, slptimeo, size, maxsize)) == NULL) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * This can be a problem whether the vnode is locked or not. * If the buffer is created out from under us, we have to * throw away the one we just created. There is now window * race because we are safely running at splbio() from the * point of the duplicate buffer creation through to here. */ if (gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bp->b_offset = offset; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); /* * set B_VMIO bit. allocbuf() the buffer bigger. Since the * buffer size starts out as 0, B_CACHE will be set by * allocbuf() for the VMIO case prior to it testing the * backing store for validity. */ if (vmio) { bp->b_flags |= B_VMIO; #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG && vp->v_type != VBLK) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } allocbuf(bp, size); splx(s); bp->b_flags &= ~B_DONE; } return (bp); } /* * Get an empty, disassociated buffer of given size. The buffer is initially * set to B_INVAL. 
*/ struct buf * geteblk(int size) { struct buf *bp; int s; s = splbio(); while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); splx(s); allocbuf(bp, size); bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ return (bp); } /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). This code is able to * resize a buffer up or down. * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * There are B_CACHE and B_DELWRI interactions that must be dealt with by * the caller. Calling this code willy nilly can result in the loss of data. * * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with * B_CACHE for the non-VMIO case. */ int allocbuf(struct buf *bp, int size) { int newbsize, mbsize; int i; #if !defined(MAX_PERF) if (BUF_REFCNT(bp) == 0) panic("allocbuf: buffer not busy"); if (bp->b_kvasize < size) panic("allocbuf: buffer too small"); #endif if ((bp->b_flags & B_VMIO) == 0) { caddr_t origbuf; int origbufsize; /* * Just get anonymous memory from the kernel. Don't * mess with B_CACHE. 
*/ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) newbsize = mbsize; else #endif newbsize = round_page(size); if (newbsize < bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * malloced buffers are not shrunk */ if (bp->b_flags & B_MALLOC) { if (newbsize) { bp->b_bcount = size; } else { free(bp->b_data, M_BIOBUF); bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; if (bp->b_bufsize) bufspacewakeup(); bp->b_data = bp->b_kvabase; bp->b_bufsize = 0; bp->b_bcount = 0; bp->b_flags &= ~B_MALLOC; } return 1; } #endif vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { #if !defined(NO_B_MALLOC) /* * We only use malloced memory on the first allocation. * and revert to page-allocated memory when the buffer grows. */ if ( (bufmallocspace < maxbufmallocspace) && (bp->b_bufsize == 0) && (mbsize <= PAGE_SIZE/2)) { bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); bp->b_bufsize = mbsize; bp->b_bcount = size; bp->b_flags |= B_MALLOC; bufspace += mbsize; bufmallocspace += mbsize; runningbufspace += bp->b_bufsize; return 1; } #endif origbuf = NULL; origbufsize = 0; #if !defined(NO_B_MALLOC) /* * If the buffer is growing on its other-than-first allocation, * then we revert to the page-allocation scheme. 
*/ if (bp->b_flags & B_MALLOC) { origbuf = bp->b_data; origbufsize = bp->b_bufsize; bp->b_data = bp->b_kvabase; bufspace -= bp->b_bufsize; bufmallocspace -= bp->b_bufsize; runningbufspace -= bp->b_bufsize; if (bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = 0; bp->b_flags &= ~B_MALLOC; newbsize = round_page(newbsize); } #endif vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); #if !defined(NO_B_MALLOC) if (origbuf) { bcopy(origbuf, bp->b_data, origbufsize); free(origbuf, M_BIOBUF); } #endif } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (size == 0) ? 0 : num_pages((bp->b_offset & PAGE_MASK) + newbsize); #if !defined(NO_B_MALLOC) if (bp->b_flags & B_MALLOC) panic("allocbuf: VMIO buffer can't be malloced"); #endif /* * Set B_CACHE initially if buffer is 0 length or will become * 0-length. */ if (size == 0 || bp->b_bufsize == 0) bp->b_flags |= B_CACHE; if (newbsize < bp->b_bufsize) { /* * DEV_BSIZE aligned new buffer size is less then the * DEV_BSIZE aligned existing buffer size. Figure out * if we have to remove any pages. */ if (desiredpages < bp->b_npages) { for (i = desiredpages; i < bp->b_npages; i++) { /* * the page is not freed here -- it * is the responsibility of * vnode_pager_setsize */ m = bp->b_pages[i]; KASSERT(m != bogus_page, ("allocbuf: bogus page found")); while (vm_page_sleep_busy(m, TRUE, "biodep")) ; bp->b_pages[i] = NULL; vm_page_unwire(m, 0); } pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (size > bp->b_bcount) { /* * We are growing the buffer, possibly in a * byte-granular fashion. */ struct vnode *vp; vm_object_t obj; vm_offset_t toff; vm_offset_t tinc; /* * Step 1, bring in the VM pages from the object, * allocating them if necessary. 
We must clear * B_CACHE if these pages are not valid for the * range covered by the buffer. */ vp = bp->b_vp; obj = vp->v_object; while (bp->b_npages < desiredpages) { vm_page_t m; vm_pindex_t pi; pi = OFF_TO_IDX(bp->b_offset) + bp->b_npages; if ((m = vm_page_lookup(obj, pi)) == NULL) { m = vm_page_alloc(obj, pi, VM_ALLOC_NORMAL); if (m == NULL) { VM_WAIT; vm_pageout_deficit += desiredpages - bp->b_npages; } else { vm_page_wire(m); vm_page_wakeup(m); bp->b_flags &= ~B_CACHE; bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } continue; } /* * We found a page. If we have to sleep on it, * retry because it might have gotten freed out * from under us. * * We can only test PG_BUSY here. Blocking on * m->busy might lead to a deadlock: * * vm_fault->getpages->cluster_read->allocbuf * */ if (vm_page_sleep_busy(m, FALSE, "pgtblk")) continue; /* * We have a good page. Should we wakeup the * page daemon? */ if ((curproc != pageproc) && ((m->queue - m->pc) == PQ_CACHE) && ((cnt.v_free_count + cnt.v_cache_count) < - (cnt.v_free_min + cnt.v_cache_min)) - ) { + (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } vm_page_flag_clear(m, PG_ZERO); vm_page_wire(m); bp->b_pages[bp->b_npages] = m; ++bp->b_npages; } /* * Step 2. We've loaded the pages into the buffer, * we have to figure out if we can still have B_CACHE * set. Note that B_CACHE is set according to the * byte-granular range ( bcount and size ), new the * aligned range ( newbsize ). * * The VM test is against m->valid, which is DEV_BSIZE * aligned. Needless to say, the validity of the data * needs to also be DEV_BSIZE aligned. Note that this * fails with NFS if the server or some other client * extends the file's EOF. If our buffer is resized, * B_CACHE may remain set! 
XXX */ toff = bp->b_bcount; tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK); while ((bp->b_flags & B_CACHE) && toff < size) { vm_pindex_t pi; if (tinc > (size - toff)) tinc = size - toff; pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT; vfs_buf_test_cache( bp, bp->b_offset, toff, tinc, bp->b_pages[pi] ); toff += tinc; tinc = PAGE_SIZE; } /* * Step 3, fixup the KVM pmap. Remember that * bp->b_data is relative to bp->b_offset, but * bp->b_offset may be offset into the first page. */ bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); pmap_qenter( (vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages ); bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | (vm_offset_t)(bp->b_offset & PAGE_MASK)); } } if (bp->b_flags & B_VMIO) vmiospace += (newbsize - bp->b_bufsize); bufspace += (newbsize - bp->b_bufsize); runningbufspace += (newbsize - bp->b_bufsize); if (newbsize < bp->b_bufsize) bufspacewakeup(); bp->b_bufsize = newbsize; /* actual buffer allocation */ bp->b_bcount = size; /* requested buffer size */ return 1; } /* * biowait: * * Wait for buffer I/O completion, returning error status. The buffer * is left locked and B_DONE on return. B_EINTR is converted into a EINTR * error and cleared. */ int biowait(register struct buf * bp) { int s; s = splbio(); - while ((bp->b_flags & B_DONE) == 0) + while ((bp->b_flags & B_DONE) == 0) { #if defined(NO_SCHEDULE_MODS) tsleep(bp, PRIBIO, "biowait", 0); #else if (bp->b_flags & B_READ) tsleep(bp, PRIBIO, "biord", 0); else tsleep(bp, PRIBIO, "biowr", 0); #endif + } splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * biodone: * * Finish I/O on a buffer, optionally calling a completion function. * This is usually called from an interrupt so process blocking is * not allowed. * * biodone is also responsible for setting B_CACHE in a B_VMIO bp. 
* In a non-VMIO bp, B_CACHE will be set on the next getblk() * assuming B_INVAL is clear. * * For the VMIO case, we set B_CACHE if the op was a read and no * read error occured, or if the op was a write. B_CACHE is never * set if the buffer is invalid or otherwise uncacheable. * * biodone does not mess with B_INVAL, allowing the I/O routine or the * initiator to leave B_INVAL set to brelse the buffer out of existance * in the biodone routine. */ void biodone(register struct buf * bp) { int s; s = splbio(); - KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy", bp)); + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); bp->b_flags |= B_DONE; if (bp->b_flags & B_FREEBUF) { brelse(bp); splx(s); return; } if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) (*bioops.io_complete)(bp); if (bp->b_flags & B_VMIO) { int i, resid; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; obj = vp->v_object; #if defined(VFS_BIO_DEBUG) if (vp->v_usecount == 0) { panic("biodone: zero vnode ref count"); } if (vp->v_object == NULL) { panic("biodone: missing VM object"); } if ((vp->v_flag & VOBJBUF) == 0) { panic("biodone: vnode is not setup for merged cache"); } #endif foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("biodone: no buffer offset")); #if !defined(MAX_PERF) if (!obj) { panic("biodone: no object"); } #endif #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif /* * Set B_CACHE if the op was a normal read and no error * occured. B_CACHE is set for writes in the b*write() * routines. 
*/ iosize = bp->b_bcount; if ((bp->b_flags & (B_READ|B_FREEBUF|B_INVAL|B_NOCACHE|B_ERROR)) == B_READ) { bp->b_flags |= B_CACHE; } for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif vm_object_pip_subtract(obj, 1); bp->b_flags &= ~B_CACHE; continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf( "biodone: foff(%lu)/m->pindex(%d) mismatch\n", (unsigned long)foff, m->pindex); } #endif resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vfs_page_set_valid(bp, foff, i, m); } vm_page_flag_clear(m, PG_ZERO); /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! 
*/ if (m->busy == 0) { #if !defined(MAX_PERF) printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); #endif if (vp->v_type != VBLK) #if !defined(MAX_PERF) printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", m->valid, m->dirty, m->wire_count); #endif panic("biodone: page busy < 0\n"); } vm_page_io_finish(m); vm_object_pip_subtract(obj, 1); foff += resid; iosize -= resid; } if (obj) vm_object_pip_wakeupn(obj, 0); } /* * For asynchronous completions, release the buffer now. The brelse * will do a wakeup there if necessary - so no need to do a wakeup * here in the async case. The sync case always needs to do a wakeup. */ if (bp->b_flags & B_ASYNC) { if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) brelse(bp); else bqrelse(bp); } else { wakeup(bp); } splx(s); } /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); #if !defined(MAX_PERF) if (!m) { panic("vfs_unbusy_pages: page missing\n"); } #endif bp->b_pages[i] = m; pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } vm_object_pip_subtract(obj, 1); vm_page_flag_clear(m, PG_ZERO); vm_page_io_finish(m); } vm_object_pip_wakeupn(obj, 0); } } /* * vfs_page_set_valid: * * Set the valid bits in a page based on the supplied offset. 
The * range is restricted to the buffer's size. * * This routine is typically called after a read completes. */ static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) { vm_ooffset_t soff, eoff; /* * Start and end offsets in buffer. eoff - soff may not cross a * page boundry or cross the end of the buffer. The end of the * buffer, in this case, is our file EOF, not the allocation size * of the buffer. */ soff = off; eoff = (off + PAGE_SIZE) & ~PAGE_MASK; if (eoff > bp->b_offset + bp->b_bcount) eoff = bp->b_offset + bp->b_bcount; /* * Set valid range. This is typically the entire buffer and thus the * entire page. */ if (eoff > soff) { vm_page_set_validclean( m, (vm_offset_t) (soff & PAGE_MASK), (vm_offset_t) (eoff - soff) ); } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. * * Since I/O has not been initiated yet, certain buffer flags * such as B_ERROR or B_INVAL may be in an inconsistant state * and should be ignored. 
*/ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i, bogus; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_busy_pages: no buffer offset")); vfs_setdirty(bp); retry: for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (vm_page_sleep_busy(m, FALSE, "vbpage")) goto retry; } bogus = 0; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_page_flag_clear(m, PG_ZERO); if ((bp->b_flags & B_CLUSTER) == 0) { vm_object_pip_add(obj, 1); vm_page_io_start(m); } /* * When readying a buffer for a read ( i.e * clear_modify == 0 ), it is important to do * bogus_page replacement for valid pages in * partially instantiated buffers. Partially * instantiated buffers can, in turn, occur when * reconstituting a buffer from its VM backing store * base. We only have to do this if B_CACHE is * clear ( which causes the I/O to occur in the * first place ). The replacement prevents the read * I/O from overwriting potentially dirty VM-backed * pages. XXX bogus page replacement is, uh, bogus. * It may not work properly with small-block devices. * We need to find a better way. */ vm_page_protect(m, VM_PROT_NONE); if (clear_modify) vfs_page_set_valid(bp, foff, i, m); else if (m->valid == VM_PAGE_BITS_ALL && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; bogus++; } foff = (foff + PAGE_SIZE) & ~PAGE_MASK; } if (bogus) pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. * * Note that while we only really need to clean through to b_bcount, we * just go ahead and clean through to b_bufsize. 
*/ static void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; foff = bp->b_offset; KASSERT(bp->b_offset != NOOFFSET, ("vfs_clean_pages: no buffer offset")); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; vm_ooffset_t noff = (foff + PAGE_SIZE) & ~PAGE_MASK; vm_ooffset_t eoff = noff; if (eoff > bp->b_offset + bp->b_bufsize) eoff = bp->b_offset + bp->b_bufsize; vfs_page_set_valid(bp, foff, i, m); /* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */ foff = noff; } } } /* * vfs_bio_set_validclean: * * Set the range within the buffer to valid and clean. The range is * relative to the beginning of the buffer, b_offset. Note that b_offset * itself may be offset from the beginning of the first page. */ void vfs_bio_set_validclean(struct buf *bp, int base, int size) { if (bp->b_flags & B_VMIO) { int i; int n; /* * Fixup base to be relative to beginning of first page. * Set initial n to be the maximum number of bytes in the * first page that can be validated. */ base += (bp->b_offset & PAGE_MASK); n = PAGE_SIZE - (base & PAGE_MASK); for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) { vm_page_t m = bp->b_pages[i]; if (n > size) n = size; vm_page_set_validclean(m, base & PAGE_MASK, n); base += n; size -= n; n = PAGE_SIZE; } } } /* * vfs_bio_clrbuf: * * clear a buffer. This routine essentially fakes an I/O, so we need * to clear B_ERROR and B_INVAL. * * Note that while we only theoretically need to clear through b_bcount, * we go ahead and clear through b_bufsize. 
*/ void vfs_bio_clrbuf(struct buf *bp) { int i, mask = 0; caddr_t sa, ea; if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { bp->b_flags &= ~(B_INVAL|B_ERROR); if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && (bp->b_offset & PAGE_MASK) == 0) { mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && ((bp->b_pages[0]->valid & mask) != mask)) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid |= mask; bp->b_resid = 0; return; } ea = sa = bp->b_data; for(i=0;ib_npages;i++,sa=ea) { int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE; ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); ea = (caddr_t)ulmin((u_long)ea, (u_long)bp->b_data + bp->b_bufsize); mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; if ((bp->b_pages[i]->valid & mask) == mask) continue; if ((bp->b_pages[i]->valid & mask) == 0) { if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(sa, ea - sa); } } else { for (; sa < ea; sa += DEV_BSIZE, j++) { if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && (bp->b_pages[i]->valid & (1<b_pages[i]->valid |= mask; vm_page_flag_clear(bp->b_pages[i], PG_ZERO); } bp->b_resid = 0; } else { clrbuf(bp); } } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. 
*/ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; to = round_page(to); from = round_page(from); index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); if (!p) { vm_pageout_deficit += (to - from) >> PAGE_SHIFT; VM_WAIT; goto tryagain; } vm_page_wire(p); p->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(p, PG_ZERO); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); bp->b_pages[index] = p; vm_page_wakeup(p); } bp->b_npages = index; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index, newnpages; from = round_page(from); to = round_page(to); newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; if (p && (index < bp->b_npages)) { #if !defined(MAX_PERF) if (p->busy) { printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", bp->b_blkno, bp->b_lblkno); } #endif bp->b_pages[index] = NULL; pmap_kremove(pg); vm_page_busy(p); vm_page_unwire(p, 0); vm_page_free(p); } } bp->b_npages = newnpages; } #include "opt_ddb.h" #ifdef DDB #include DB_SHOW_COMMAND(buffer, db_show_buffer) { /* get args */ struct buf *bp = (struct buf *)addr; if (!have_addr) { db_printf("usage: show buffer \n"); return; } db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS); db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " "b_resid = %ld\nb_dev = (%d,%d), b_data = %p, " "b_blkno = %d, b_pblkno = %d\n", bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, major(bp->b_dev), minor(bp->b_dev), bp->b_data, bp->b_blkno, bp->b_pblkno); if (bp->b_npages) { int i; db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); for (i = 0; i < bp->b_npages; i++) { vm_page_t m; m = bp->b_pages[i]; 
db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); if ((i + 1) < bp->b_npages) db_printf(","); } db_printf("\n"); } } #endif /* DDB */ diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c index 8357241479e2..efca6c8a1578 100644 --- a/sys/kern/vfs_export.c +++ b/sys/kern/vfs_export.c @@ -1,2945 +1,2977 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.204 1999/07/01 13:21:41 peter Exp $ + * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); static void vfree __P((struct vnode *)); static void vgonel __P((struct vnode *vp, struct proc *p)); static unsigned long numvnodes; SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ struct tobefreelist vnode_tobefree_list; /* vnode free list */ static u_long wantfreevnodes = 25; SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); static u_long freevnodes = 0; SYSCTL_INT(_debug, OID_AUTO, freevnodes, 
CTLFLAG_RD, &freevnodes, 0, ""); +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); +static int reassignbufloops; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); +static int reassignbufsortgood; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); +static int reassignbufsortbad; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); +static int reassignbufmethod = 1; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); + int vfs_ioopt = 0; #ifdef ENABLE_VFS_IOOPT SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; struct simplelock mntvnode_slock; int nfs_mount_type = -1; #ifndef NULL_SIMPLELOCKS static struct simplelock mntid_slock; static struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; /* * The workitem queue. 
*/ #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + cnt.v_page_count / 4; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_tobefree_list); simple_lock_init(&vnode_free_list_slock); CIRCLEQ_INIT(&mountlist); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. 
*/ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct simplelock *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; if (interlkp) { simple_unlock(interlkp); } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); if (interlkp) { simple_lock(interlkp); } return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. 
*/ int vfs_rootmountalloc(fstypename, devname, mpp) char *fstypename; char *devname; struct mount **mpp; { struct proc *p = curproc; /* XXX */ struct vfsconf *vfsp; struct mount *mp; if (fstypename == NULL) return (ENODEV); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstypename)) break; if (vfsp == NULL) return (ENODEV); mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); bzero((char *)mp, (u_long)sizeof(struct mount)); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, p); LIST_INIT(&mp->mnt_vnodelist); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_flag = MNT_RDONLY; mp->mnt_vnodecovered = NULLVP; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); } /* * Find an appropriate filesystem to use for the root. If a filesystem * has not been preselected, walk through the list of known filesystems * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ #ifdef notdef /* XXX JH */ int lite2_vfs_mountroot() { struct vfsconf *vfsp; extern int (*lite2_mountroot) __P((void)); int error; if (lite2_mountroot != NULL) return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; if ((error = (*vfsp->vfc_mountroot)()) == 0) return (0); printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); } return (ENODEV); } #endif /* * Lookup a mount point by filesystem identifier. 
 */
/*
 * Walk the global mount list (under mountlist_slock) and return the
 * mount whose f_fsid matches 'fsid', or NULL if none does.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid for 'mp'.  val[1] is the filesystem type
 * number; val[0] is a fake device number whose minor incorporates a
 * per-call counter, incremented until the candidate fsid is not
 * already in use.  Serialized by mntid_slock.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = umakedev(255, mtype);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = umakedev(255, mtype + (xxxfs_mntid << 16));
	tfsid.val[1] = mtype;
	/* Only probe for collisions if anything is mounted at all. */
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			xxxfs_mntid++;
			tfsid.val[0] = umakedev(255,
			    mtype + (xxxfs_mntid << 16));
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set all fields of a vattr to VNOVAL ("not specified"), so that
 * filesystems can tell which attributes a caller actually set.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
/*
 * Allocate a vnode: either recycle the least recently used free vnode
 * that has no cached pages and no namecache entries, or zalloc() a
 * fresh one.  The result is returned in *vpp with v_usecount == 1,
 * tagged 'tag', attached to mount 'mp' with operations vector 'vops'.
 * Always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	/*
	 * First migrate everything on the to-be-freed list onto the real
	 * free list (VAGE vnodes go to the head so they recycle sooner).
	 */
	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		/* Below the free-vnode target: prefer allocating a new one. */
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		/* Scan the free list for a recyclable candidate. */
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistant state: RPC: %d, RC: %d\n",
					object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	/* Put the skipped (page-caching) vnodes back on the free list. */
	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		/* Recycle the chosen vnode: purge and reset its state. */
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			/* vgonel consumes the interlock. */
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_maxio = 0;
	} else {
		/* No recyclable vnode: allocate and initialize a new one. */
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another (or off any queue when
 * mp is NULL).  Serialized by mntvnode_slock.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
/*
 * Called when a write on buffer 'bp' completes: decrement the vnode's
 * outstanding-write count and wake anyone sleeping in VBWAIT for all
 * writes to drain.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * If V_SAVE is set, dirty data is first written out (VOP_FSYNC);
 * otherwise it is discarded.  'slpflag'/'slptimeo' control the sleeps
 * while waiting for I/O and buffer locks.  Returns 0 or a tsleep/fsync
 * error.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		/* Wait for in-flight writes, then push all dirty buffers. */
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	/* Repeatedly drain the clean then dirty lists until both are empty. */
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				/* Buffer busy: sleep for it, then rescan the list. */
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
				(flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	/* Let any writes started above finish before declaring success. */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length.  This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * Buffers at or beyond the new end ('length' rounded up to a block
 * boundary) are invalidated; when length > 0, remaining dirty
 * indirect-block buffers (negative lblkno) are written out.  Always
 * returns 0.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		/* Pass 1: discard clean buffers past the truncation point. */
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				/* The list may have changed while we slept; rescan. */
				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
					 (nbp->b_vp != vp) ||
					 (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		/* Pass 2: discard dirty buffers past the truncation point. */
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
					 (nbp->b_vp != vp) ||
					 (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
		/* Flush delayed-write metadata buffers (negative lblkno). */
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode: hold the vnode, point the buffer at
 * it (and at its device for VBLK/VCHR), and queue the buffer on the
 * vnode's clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= B_VNCLEAN;
	bp->b_xflags &= ~B_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode: remove it from whichever buffer
 * list it is on, take the vnode off the syncer worklist if it has no
 * more dirty buffers, and drop the hold taken by bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
		if (bp->b_xflags & B_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syncer process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
*/ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; long starttime; int s; struct proc *p = updateproc; + p->p_flag |= P_BUFEXHAUST; + for (;;) { starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ if (bioops.io_sync) (*bioops.io_sync)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. 
A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { int s; s = splhigh(); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); splx(s); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. 
*/ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); #if !defined(MAX_PERF) /* XXX REMOVE ME */ if (bp->b_vnbufs.tqe_next != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } #endif bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { #if !defined(MAX_PERF) if ((bp->b_flags & B_PAGING) == 0) { panic( "pbreassignbuf() on non phys bp %p", bp ); } #endif bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } + ++reassignbufcalls; #if !defined(MAX_PERF) /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); #endif s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. 
*/ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VBLK: if (newvp->v_specmountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= B_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || - (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + bp->b_lblkno == 0 || + (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); - } else { - if (bp->b_lblkno >= 0) { - struct buf *ttbp; - while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && - (ttbp->b_lblkno < bp->b_lblkno)) { - tbp = ttbp; - } + ++reassignbufsortgood; + } else if (bp->b_lblkno < 0) { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (reassignbufmethod == 1) { + /* + * New sorting algorithm, only handle sequential case, + * otherwise guess. + */ + if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && + (tbp->b_xflags & B_VNDIRTY)) { TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortgood; } else { - TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + ++reassignbufsortbad; + } + } else { + /* + * Old sorting algorithm, scan queue and insert + */ + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + ++reassignbufloops; + tbp = ttbp; } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= B_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a block device. * Used for mounting the root file system. 
 */
/*
 * Create a VBLK vnode for device 'dev' (returned in *vpp), merging with
 * any existing alias via checkalias().  Returns 0, ENXIO for NODEV, or
 * a getnewvnode() error.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev2udev(dev), (struct mount *)0)) != NULL) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	udev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;
	dev_t dev;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	dev = udev2dev(nvp_rdev, 2);
	vpp = &speclisth[SPECHASH(dev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * Only alias active device nodes.
		 * Not sure why we don't re-use this like we do below.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			/*
			 * It dissappeared, and we may have slept.
			 * Restart from the beginning
			 */
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	/*
	 * It would be a lot clearer what is going on here if
	 * this had been expressed as:
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * and the clauses had been swapped.
	 */
	if (vp == NULL || vp->v_tag != VT_NON) {
		struct specinfo *sinfo;

		/*
		 * Put the new vnode into the hash chain.
		 * and if there was an alias, connect them.
		 */
		MALLOC(sinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		bzero(sinfo, sizeof(struct specinfo));
		nvp->v_specinfo = sinfo;
		sinfo->si_rdev = dev;
		sinfo->si_hashchain = vpp;
		sinfo->si_specnext = *vpp;
		sinfo->si_bsize_phys = DEV_BSIZE;
		sinfo->si_bsize_best = BLKDEV_IOSIZE;
		sinfo->si_bsize_max = MAXBSIZE;

		/*
		 * Ask the device to fix up specinfo.  Typically the
		 * si_bsize_* parameters may need fixing up.
		 */
		if (nvp->v_type == VBLK) {
			if (bdevsw(dev) && bdevsw(dev)->d_parms)
				(*bdevsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		} else if (nvp->v_type == VCHR) {
			if (devsw(dev) && devsw(dev)->d_parms)
				(*devsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		}
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	/*
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * We have a vnode alias, but it is a trashed.
	 * Make it look like it's newley allocated. (by getnewvnode())
	 * The caller should use this instead.
	 */
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	/* First reference pulls the vnode off the free list. */
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Add a reference to a vnode that is already known to be referenced.
 */
void
vref(struct vnode *vp)
{
	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release an already-locked vnode: drop the reference and unlock, and
 * deactivate it if this was the last reference.
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled: bump the hold count, and
 * take it off the free list if necessary.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode: drop the hold count and put the
 * vnode back on the free list if it is now reclaimable.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Reclaim (or, with FORCECLOSE, forcibly disassociate) all vnodes on
 * mount 'mp' except 'skipvp'.  Flags: SKIPSYSTEM skips VSYSTEM vnodes,
 * WRITECLOSE only touches writable regular files, FORCECLOSE kills
 * even active vnodes.  Returns 0, or EBUSY if busy vnodes remain.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 */
/*
 * Disassociate the underlying filesystem from vnode 'vp'.  Called with
 * the interlock held (consumed here via LK_INTERLOCK).  Flushes
 * buffers, tears down the VM object, closes/inactivates active vnodes
 * (DOCLOSE), reclaims filesystem state, and leaves the vnode dead
 * (dead_vnodeop_p / VT_NON).
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * This is a normal way of shutting down the object/vnode
			 * association.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active)
		vrele(vp);

	cache_purge(vp);
	if (vp->v_vnlock) {
#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		/* Kill every other vnode aliasing the same device. */
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
		if (vp->v_flag & VXWANT) {
			vp->v_flag &= ~VXWANT;
			wakeup(vp);
		}
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		/* Unreferenced: safe to tear it down via vgonel. */
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		vgonel(vp, p);
		return (1);
	}
	/* Still in use; leave it alone (caller's interlock is kept). */
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 * The interlock is released before return.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 * (vclean consumes the interlock, so re-take it after.)
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			/*
			 * Count remaining aliases: if exactly one other
			 * alias (vx) exists, it is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 * Returns 1 and sets *vpp on success, 0 if no match.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device,
 * summing across all of its aliases.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * (Restart the scan: vgone may sleep and the chain
		 * can change under us.)
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
*/ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[96]; if (label != NULL) printf("%s: %p: ", label, (void *)vp); else printf("%p: ", (void *)vp); printf("type %s, usecount %d, writecount %d, refcount %d,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (vp->v_flag & VDOOMED) strcat(buf, "|VDOOMED"); if (vp->v_flag & VFREE) strcat(buf, "|VFREE"); if (vp->v_flag & VOBJBUF) strcat(buf, "|VOBJBUF"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB #include /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *vp; printf("Locked vnodes\n"); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { if (VOP_ISLOCKED(vp)) vprint((char *)0, vp); } simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); } #endif /* * Top level filesystem related information gathering. 
*/ static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); static int vfs_sysctl SYSCTL_HANDLER_ARGS { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf SYSCTL_HANDLER_ARGS { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if 0 #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
*/ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } again: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * Check that the vp is still associated with * this filesystem. RACE: could have been * recycled onto the same filesystem. */ if (vp->v_mount != mp) { simple_unlock(&mntvnode_slock); goto again; } nvp = vp->v_mntvnodes.le_next; simple_unlock(&mntvnode_slock); if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); simple_lock(&mountlist_slock); nmp = mp->mnt_list.cqe_next; vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); return (0); } #endif /* * XXX * Exporting the vnode list on large systems causes them to crash. * Exporting the vnode list on medium systems causes sysctl to coredump. */ #if 0 SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); #endif /* * Check to see if a filesystem is mounted on a block device. 
*/ int vfs_mountedon(vp) struct vnode *vp; { struct vnode *vq; int error = 0; if (vp->v_specmountpoint != NULL) return (EBUSY); if (vp->v_flag & VALIASED) { simple_lock(&spechash_slock); for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vq->v_specmountpoint != NULL) { error = EBUSY; break; } } simple_unlock(&spechash_slock); } return (error); } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall() { struct mount *mp, *nmp; struct proc *p; int error; if (curproc != NULL) p = curproc; else p = initproc; /* XXX XXX should this be proc0? */ /* * Since this only runs when rebooting, it is not interlocked. */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; error = dounmount(mp, MNT_FORCE, p); if (error) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. 
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	/* A zero address length means "export to the world" (default). */
	if (argp->ex_addrlen == 0) {
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	/*
	 * One allocation holds the netcred followed by the address and
	 * the (optional) mask, copied in from userland.
	 */
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
/*
 * Radix-tree walker callback: delete and free one export netcred node.
 */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

/*
 * Update the export information for a mount point: process
 * MNT_DELEXPORT and MNT_EXPORTED/MNT_EXPUBLIC requests from argp.
 */
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS). Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Look up the export credentials that apply to a client address for
 * an exported mount point; falls back to the default export, if any.
 * Returns NULL when the client has no access.
 */
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * perform msync on all vnodes under a mount point
 * the mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		nvp = vp->v_mntvnodes.le_next;

		/* Vnode was recycled onto another mount: restart the scan. */
		if (vp->v_mount != mp) {
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	/* Retry a bounded number of times while progress is being made. */
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support. This
 * is done for all VREG files in the system. Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if ((vp->v_type != VREG) && (vp->v_type != VBLK))
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (bdevsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode. This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created. This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			/* Object is being torn down; wait and retry. */
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

/*
 * Put an unreferenced vnode on the free list (at head if VAGE is set,
 * at tail otherwise), removing it from the to-be-freed list if needed.
 */
static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Take a vnode off the free (or to-be-freed) list because it is
 * coming back into use.
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode. Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions. (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened. This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested. However, it does
		 * mean that only one event can be noticed at
		 * a time. (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
/* Vnode operations for the syncer pseudo-vnode; most are no-ops. */
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
static int	sync_fsync __P((struct vop_fsync_args *));
static int	sync_inactive __P((struct vop_inactive_args *));
static int	sync_reclaim __P((struct vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	/* Temporarily force synchronous semantics for the sweep. */
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
*/ static int sync_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; printf("syncer vnode"); if (vp->v_vnlock != NULL) lockmgr_printinfo(vp->v_vnlock); printf("\n"); return (0); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8357241479e2..efca6c8a1578 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1,2945 +1,2977 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 - * $Id: vfs_subr.c,v 1.204 1999/07/01 13:21:41 peter Exp $ + * $Id: vfs_subr.c,v 1.205 1999/07/02 16:29:14 phk Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); static void insmntque __P((struct vnode *vp, struct mount *mp)); static void vclean __P((struct vnode *vp, int flags, struct proc *p)); static void vfree __P((struct vnode *)); static void vgonel __P((struct vnode *vp, struct proc *p)); static unsigned long numvnodes; SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ struct tobefreelist vnode_tobefree_list; /* vnode free list */ static u_long wantfreevnodes = 25; SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); static u_long freevnodes = 0; SYSCTL_INT(_debug, OID_AUTO, freevnodes, 
CTLFLAG_RD, &freevnodes, 0, ""); +static int reassignbufcalls; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, ""); +static int reassignbufloops; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, ""); +static int reassignbufsortgood; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, ""); +static int reassignbufsortbad; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, ""); +static int reassignbufmethod = 1; +SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, ""); + int vfs_ioopt = 0; #ifdef ENABLE_VFS_IOOPT SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); #endif struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; struct simplelock mntvnode_slock; int nfs_mount_type = -1; #ifndef NULL_SIMPLELOCKS static struct simplelock mntid_slock; static struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; #endif struct nfs_public nfs_pub; /* publicly exported FS */ static vm_zone_t vnode_zone; /* * The workitem queue. 
*/ #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ time_t syncdelay = 30; /* max time to delay syncing data */ time_t filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, ""); time_t dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, ""); time_t metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, ""); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, ""); static int syncer_delayno = 0; static long syncer_mask; LIST_HEAD(synclist, vnode); static struct synclist *syncer_workitem_pending; int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "Maximum number of vnodes"); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + cnt.v_page_count / 4; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); TAILQ_INIT(&vnode_free_list); TAILQ_INIT(&vnode_tobefree_list); simple_lock_init(&vnode_free_list_slock); CIRCLEQ_INIT(&mountlist); vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; } /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Interlock is not released on failure. 
*/ int vfs_busy(mp, flags, interlkp, p) struct mount *mp; int flags; struct simplelock *interlkp; struct proc *p; { int lkflags; if (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_kern_flag |= MNTK_MWAIT; if (interlkp) { simple_unlock(interlkp); } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); if (interlkp) { simple_lock(interlkp); } return (ENOENT); } lkflags = LK_SHARED | LK_NOPAUSE; if (interlkp) lkflags |= LK_INTERLOCK; if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) panic("vfs_busy: unexpected lock failure"); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(mp, p) struct mount *mp; struct proc *p; { lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); } /* * Lookup a filesystem type, and if found allocate and initialize * a mount structure for it. * * Devname is usually updated by mount(8) after booting. 
*/

/*
 * Allocate and initialize a mount structure for filesystem type
 * fstypename, to be used for mounting the root filesystem.
 *
 * The new mount is returned busied (vfs_busy) and marked read-only,
 * with f_mntonname set to "/" and f_mntfromname copied from devname.
 * Returns ENODEV if the filesystem type is unknown or NULL.
 */
int
vfs_rootmountalloc(fstypename, devname, mpp)
	char *fstypename;
	char *devname;
	struct mount **mpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vfsconf *vfsp;
	struct mount *mp;

	if (fstypename == NULL)
		return (ENODEV);
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
		if (!strcmp(vfsp->vfc_name, fstypename))
			break;
	if (vfsp == NULL)
		return (ENODEV);
	mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
	bzero((char *)mp, (u_long)sizeof(struct mount));
	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
	(void)vfs_busy(mp, LK_NOWAIT, 0, p);
	LIST_INIT(&mp->mnt_vnodelist);
	mp->mnt_vfc = vfsp;
	mp->mnt_op = vfsp->vfc_vfsops;
	mp->mnt_flag = MNT_RDONLY;
	mp->mnt_vnodecovered = NULLVP;
	vfsp->vfc_refcount++;
	mp->mnt_stat.f_type = vfsp->vfc_typenum;
	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
	strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
	mp->mnt_stat.f_mntonname[0] = '/';
	mp->mnt_stat.f_mntonname[1] = 0;
	(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
	*mpp = mp;
	return (0);
}

/*
 * Find an appropriate filesystem to use for the root. If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot) __P((void));
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
*/

/*
 * Lookup a mount point by fsid.  Returns the matching mount, or NULL.
 * The mountlist is scanned under mountlist_slock, but the returned
 * mount is not referenced or locked.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid.
 *
 * val[0] encodes a pseudo-device number built from major 255 and the
 * filesystem type number plus a per-call counter (xxxfs_mntid) shifted
 * into the high bits; val[1] is the type number.  The candidate is
 * re-probed with vfs_getvfs() until it is unique among mounted
 * filesystems.  Serialized by mntid_slock.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = umakedev(255, mtype);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = umakedev(255, mtype + (xxxfs_mntid << 16));
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (vfs_getvfs(&tfsid)) {
			xxxfs_mntid++;
			tfsid.val[0] = umakedev(255,
			    mtype + (xxxfs_mntid << 16));
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL (i.e. "not specified"), so callers of
 * VOP_SETATTR/VOP_GETATTR can detect which fields were filled in.
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = VNOVAL;
	vap->va_nlink = VNOVAL;
	vap->va_uid = VNOVAL;
	vap->va_gid = VNOVAL;
	vap->va_fsid = VNOVAL;
	vap->va_fileid = VNOVAL;
	vap->va_blocksize = VNOVAL;
	vap->va_rdev = VNOVAL;
	vap->va_atime.tv_sec = VNOVAL;
	vap->va_atime.tv_nsec = VNOVAL;
	vap->va_mtime.tv_sec = VNOVAL;
	vap->va_mtime.tv_nsec = VNOVAL;
	vap->va_ctime.tv_sec = VNOVAL;
	vap->va_ctime.tv_nsec = VNOVAL;
	vap->va_flags = VNOVAL;
	vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
*/

/*
 * Allocate a vnode, either by recycling the least-recently-used free
 * vnode (if it has no cached pages and no namecache references) or by
 * allocating a fresh one from the vnode zone.
 *
 * The new vnode is returned in *vpp with v_usecount == 1, type VNON,
 * the given tag/ops, and inserted on mp's vnode list.  Always returns 0.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	int s;
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *tvp, *nvp;
	vm_object_t object;
	TAILQ_HEAD(freelst, vnode) vnode_tmp_list;

	/*
	 * We take the least recently used vnode from the freelist
	 * if we can get it and it has no cached pages, and no
	 * namecache entries are relative to it.
	 * Otherwise we allocate a new vnode
	 */
	s = splbio();
	simple_lock(&vnode_free_list_slock);
	TAILQ_INIT(&vnode_tmp_list);

	/*
	 * First migrate everything on the tobefree list onto the real
	 * free list, honoring VAGE (aged vnodes go to the head so they
	 * are recycled first).
	 */
	for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) {
		nvp = TAILQ_NEXT(vp, v_freelist);
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		if (vp->v_flag & VAGE) {
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		} else {
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
		}
		vp->v_flag &= ~(VTBFREE|VAGE);
		vp->v_flag |= VFREE;
		if (vp->v_usecount)
			panic("tobe free vnode isn't");
		freevnodes++;
	}

	if (wantfreevnodes && freevnodes < wantfreevnodes) {
		vp = NULL;
	} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
		/*
		 * XXX: this is only here to be backwards compatible
		 */
		vp = NULL;
	} else {
		/* Scan the free list for a recyclable candidate. */
		for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) {
			nvp = TAILQ_NEXT(vp, v_freelist);
			if (!simple_lock_try(&vp->v_interlock))
				continue;
			if (vp->v_usecount)
				panic("free vnode isn't");

			object = vp->v_object;
			if (object && (object->resident_page_count || object->ref_count)) {
				printf("object inconsistant state: RPC: %d, RC: %d\n",
					object->resident_page_count, object->ref_count);
				/* Don't recycle if it's caching some pages */
				TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
				TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist);
				continue;
			} else if (LIST_FIRST(&vp->v_cache_src)) {
				/* Don't recycle if active in the namecache */
				simple_unlock(&vp->v_interlock);
				continue;
			} else {
				break;
			}
		}
	}

	/*
	 * Return the skipped (page-caching) vnodes to the tail of the
	 * free list and release their interlocks.
	 */
	for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) {
		nvp = TAILQ_NEXT(tvp, v_freelist);
		TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist);
		TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist);
		simple_unlock(&tvp->v_interlock);
	}

	if (vp) {
		/* Recycle: strip the old identity and reset the fields. */
		vp->v_flag |= VDOOMED;
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
		simple_unlock(&vnode_free_list_slock);
		cache_purge(vp);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD) {
			/* vgonel consumes the interlock. */
			vgonel(vp, p);
		} else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef INVARIANTS
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_maxio = 0;
	} else {
		/* No candidate: allocate a brand-new vnode. */
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) zalloc(vnode_zone);
		bzero((char *) vp, sizeof *vp);
		simple_lock_init(&vp->v_interlock);
		vp->v_dd = vp;
		cache_purge(vp);
		LIST_INIT(&vp->v_cache_src);
		TAILQ_INIT(&vp->v_cache_dst);
		numvnodes++;
	}

	TAILQ_INIT(&vp->v_cleanblkhd);
	TAILQ_INIT(&vp->v_dirtyblkhd);
	vp->v_type = VNON;
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	splx(s);

	vfs_object_create(vp, p, p->p_ucred);
	return (0);
}

/*
 * Move a vnode from one mount queue to another (mp may be NULL to
 * remove it from any queue).  Serialized by mntvnode_slock.
 */
static void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
*/

/*
 * A write on bp has completed: decrement the vnode's outstanding-I/O
 * count and wake anyone sleeping in VBWAIT for it to drain.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * With V_SAVE, dirty data is first synced to disk (VOP_FSYNC); without
 * it, dirty buffers are simply discarded.  Also removes the pages of
 * the backing VM object.  Returns 0 or a tsleep/fsync error.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		/* Drain pending writes, then push all dirty buffers. */
		s = splbio();
		while (vp->v_numoutput) {
			vp->v_flag |= VBWAIT;
			error = tsleep((caddr_t)&vp->v_numoutput,
			    slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
			if (error) {
				splx(s);
				return (error);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
			splx(s);
			if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
				return (error);
			s = splbio();
			if (vp->v_numoutput > 0 ||
			    !TAILQ_EMPTY(&vp->v_dirtyblkhd))
				panic("vinvalbuf: dirty bufs");
		}
		splx(s);
	}
	s = splbio();
	for (;;) {
		blist = TAILQ_FIRST(&vp->v_cleanblkhd);
		if (!blist)
			blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
				error = BUF_TIMELOCK(bp,
				    LK_EXCLUSIVE | LK_SLEEPFAIL,
				    "vinvalbuf", slpflag, slptimeo);
				if (error == ENOLCK)
					break;
				splx(s);
				return (error);
			}
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.  Note that vfs_bio_awrite expects
			 * buffers to reside on a queue, while VOP_BWRITE and
			 * brelse do not.
			 */
			if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
				(flags & V_SAVE)) {

				if (bp->b_vp == vp) {
					if (bp->b_flags & B_CLUSTEROK) {
						BUF_UNLOCK(bp);
						vfs_bio_awrite(bp);
					} else {
						bremfree(bp);
						bp->b_flags |= B_ASYNC;
						VOP_BWRITE(bp->b_vp, bp);
					}
				} else {
					bremfree(bp);
					(void) VOP_BWRITE(bp->b_vp, bp);
				}
				break;
			}
			bremfree(bp);
			bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
			bp->b_flags &= ~B_ASYNC;
			brelse(bp);
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	simple_lock(&vp->v_interlock);
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, 0,
			(flags & V_SAVE) ? TRUE : FALSE);
	}
	simple_unlock(&vp->v_interlock);

	if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Truncate a file's buffer and pages to a specified length. This
 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
 * sync activity.
 *
 * Buffers at or beyond the new last logical block are invalidated;
 * when length > 0, negative-lblkno (indirect-block) delayed writes are
 * flushed as well.  Always returns 0.
 */
int
vtruncbuf(vp, cred, p, length, blksize)
	register struct vnode *vp;
	struct ucred *cred;
	struct proc *p;
	off_t length;
	int blksize;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, anyfreed;
	int trunclbn;

	/*
	 * Round up to the *next* lbn.
	 */
	trunclbn = (length + blksize - 1) / blksize;

	s = splbio();
restart:
	anyfreed = 1;
	for (;anyfreed;) {
		anyfreed = 0;
		for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					/* Wait for the lock, then rescan. */
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				/* List may have changed underneath us. */
				if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI))) {
					goto restart;
				}
			}
		}

		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (bp->b_lblkno >= trunclbn) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					bp->b_flags |= (B_INVAL | B_RELBUF);
					bp->b_flags &= ~B_ASYNC;
					brelse(bp);
					anyfreed = 1;
				}
				if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)||
				    (nbp->b_vp != vp) ||
				    (nbp->b_flags & B_DELWRI) == 0)) {
					goto restart;
				}
			}
		}
	}

	if (length > 0) {
restartsync:
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
					BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
					goto restart;
				} else {
					bremfree(bp);
					if (bp->b_vp == vp) {
						bp->b_flags |= B_ASYNC;
					} else {
						bp->b_flags &= ~B_ASYNC;
					}
					VOP_BWRITE(bp->b_vp, bp);
				}
				goto restartsync;
			}
		}
	}

	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
	}

	splx(s);

	vnode_pager_setsize(vp, length);

	return (0);
}

/*
 * Associate a buffer with a vnode: hold the vnode, record it (and its
 * device) in the buffer, and queue the buffer on the clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));

	vhold(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bp->b_xflags |= B_VNCLEAN;
	bp->b_xflags &= ~B_VNDIRTY;
	TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode.  If that leaves the vnode with no
 * dirty buffers, also take it off the syncer worklist.  Drops the hold
 * taken in bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	struct buflists *listheadp;
	int s;

	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));

	/*
	 * Delete from old vnode list, if on one.
	 */
	vp = bp->b_vp;
	s = splbio();
	if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) {
		if (bp->b_xflags & B_VNDIRTY)
			listheadp = &vp->v_dirtyblkhd;
		else
			listheadp = &vp->v_cleanblkhd;
		TAILQ_REMOVE(listheadp, bp, b_vnbufs);
		bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN);
	}
	if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
		vp->v_flag &= ~VONWORKLST;
		LIST_REMOVE(vp, v_synclist);
	}
	splx(s);
	bp->b_vp = (struct vnode *) 0;
	vdrop(vp);
}

/*
 * The workitem queue.
 *
 * It is useful to delay writes of file data and filesystem metadata
 * for tens of seconds so that quickly created and deleted files need
 * not waste disk bandwidth being created and removed. To realize this,
 * we append vnodes to a "workitem" queue. When running with a soft
 * updates implementation, most pending metadata dependencies should
 * not wait for more than a few seconds. Thus, mounted on block devices
 * are delayed only about a half the time that file data is delayed.
 * Similarly, directory updates are more critical, so are only delayed
 * about a third the time that file data is delayed. Thus, there are
 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
 * one each second (driven off the filesystem syner process). The
 * syncer_delayno variable indicates the next queue that is to be processed.
 * Items that need to be processed soon are placed in this queue:
 *
 *	syncer_workitem_pending[syncer_delayno]
 *
 * A delay of fifteen seconds is done by placing the request fifteen
 * entries later in the queue:
 *
 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
 *
 */

/*
 * Add an item to the syncer work queue.
*/ static void vn_syncer_add_to_worklist(struct vnode *vp, int delay) { int s, slot; s = splbio(); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist); vp->v_flag |= VONWORKLST; splx(s); } struct proc *updateproc; static void sched_sync __P((void)); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) /* * System filesystem synchronizer daemon. */ void sched_sync(void) { struct synclist *slp; struct vnode *vp; long starttime; int s; struct proc *p = updateproc; + p->p_flag |= P_BUFEXHAUST; + for (;;) { starttime = time_second; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. */ s = splbio(); slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; splx(s); while ((vp = LIST_FIRST(slp)) != NULL) { if (VOP_ISLOCKED(vp) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); (void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p); VOP_UNLOCK(vp, 0, p); } s = splbio(); if (LIST_FIRST(slp) == vp) { /* * Note: v_tag VT_VFS vps can remain on the * worklist too with no dirty blocks, but * since sync_fsync() moves it to a different * slot we are safe. */ if (TAILQ_EMPTY(&vp->v_dirtyblkhd) && vp->v_type != VBLK) panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag); /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(vp, syncdelay); } splx(s); } /* * Do soft update processing. */ if (bioops.io_sync) (*bioops.io_sync)(NULL); /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. 
A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (time_second == starttime) tsleep(&lbolt, PPAUSE, "syncer", 0); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer() { int s; s = splhigh(); if (updateproc->p_wchan == &lbolt) setrunnable(updateproc); splx(s); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; return (1); } return(0); } /* * Associate a p-buffer with a vnode. * * Also sets B_PAGING flag to indicate that vnode is not fully associated * with the buffer. i.e. the bp has not been linked into the vnode or * ref-counted. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); bp->b_vp = vp; bp->b_flags |= B_PAGING; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. 
*/ void pbrelvp(bp) register struct buf *bp; { KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); #if !defined(MAX_PERF) /* XXX REMOVE ME */ if (bp->b_vnbufs.tqe_next != NULL) { panic( "relpbuf(): b_vp was probably reassignbuf()d %p %x", bp, (int)bp->b_flags ); } #endif bp->b_vp = (struct vnode *) 0; bp->b_flags &= ~B_PAGING; } void pbreassignbuf(bp, newvp) struct buf *bp; struct vnode *newvp; { #if !defined(MAX_PERF) if ((bp->b_flags & B_PAGING) == 0) { panic( "pbreassignbuf() on non phys bp %p", bp ); } #endif bp->b_vp = newvp; } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { struct buflists *listheadp; int delay; int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } + ++reassignbufcalls; #if !defined(MAX_PERF) /* * B_PAGING flagged buffers cannot be reassigned because their vp * is not fully linked in. */ if (bp->b_flags & B_PAGING) panic("cannot reassign paging buffer"); #endif s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { if (bp->b_xflags & B_VNDIRTY) listheadp = &bp->b_vp->v_dirtyblkhd; else listheadp = &bp->b_vp->v_cleanblkhd; TAILQ_REMOVE(listheadp, bp, b_vnbufs); bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); if (bp->b_vp != newvp) { vdrop(bp->b_vp); bp->b_vp = NULL; /* for clarification */ } } /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. 
*/ if (bp->b_flags & B_DELWRI) { struct buf *tbp; listheadp = &newvp->v_dirtyblkhd; if ((newvp->v_flag & VONWORKLST) == 0) { switch (newvp->v_type) { case VDIR: delay = dirdelay; break; case VBLK: if (newvp->v_specmountpoint != NULL) { delay = metadelay; break; } /* fall through */ default: delay = filedelay; } vn_syncer_add_to_worklist(newvp, delay); } bp->b_xflags |= B_VNDIRTY; tbp = TAILQ_FIRST(listheadp); if (tbp == NULL || - (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + bp->b_lblkno == 0 || + (bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) { TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); - } else { - if (bp->b_lblkno >= 0) { - struct buf *ttbp; - while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && - (ttbp->b_lblkno < bp->b_lblkno)) { - tbp = ttbp; - } + ++reassignbufsortgood; + } else if (bp->b_lblkno < 0) { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + ++reassignbufsortgood; + } else if (reassignbufmethod == 1) { + /* + * New sorting algorithm, only handle sequential case, + * otherwise guess. + */ + if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL && + (tbp->b_xflags & B_VNDIRTY)) { TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + ++reassignbufsortgood; } else { - TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + ++reassignbufsortbad; + } + } else { + /* + * Old sorting algorithm, scan queue and insert + */ + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + ++reassignbufloops; + tbp = ttbp; } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); } } else { bp->b_xflags |= B_VNCLEAN; TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); if ((newvp->v_flag & VONWORKLST) && TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { newvp->v_flag &= ~VONWORKLST; LIST_REMOVE(newvp, v_synclist); } } if (bp->b_vp != newvp) { bp->b_vp = newvp; vhold(bp->b_vp); } splx(s); } /* * Create a vnode for a block device. * Used for mounting the root file system. 
*/

/*
 * Create a VBLK vnode for device "dev" and return it in *vpp (possibly
 * an existing alias found by checkalias).  Returns ENXIO for NODEV or
 * the getnewvnode() error.
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV) {
		*vpp = NULLVP;
		return (ENXIO);
	}
	error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = NULLVP;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	if ((nvp = checkalias(vp, dev2udev(dev), (struct mount *)0)) != NULL) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	udev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;
	dev_t	dev;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	dev = udev2dev(nvp_rdev, 2);
	vpp = &speclisth[SPECHASH(dev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 * Only alias active device nodes.
		 * Not sure why we don't re-use this like we do below.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			/*
			 * It dissappeared, and we may have slept.
			 * Restart from the beginning
			 */
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	/*
	 * It would be a lot clearer what is going on here if
	 * this had been expressed as:
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * and the clauses had been swapped.
	 */
	if (vp == NULL || vp->v_tag != VT_NON) {
		struct specinfo *sinfo;

		/*
		 * Put the new vnode into the hash chain.
		 * and if there was an alias, connect them.
		 */
		MALLOC(sinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		bzero(sinfo, sizeof(struct specinfo));
		nvp->v_specinfo = sinfo;
		sinfo->si_rdev = dev;
		sinfo->si_hashchain = vpp;
		sinfo->si_specnext = *vpp;
		sinfo->si_bsize_phys = DEV_BSIZE;
		sinfo->si_bsize_best = BLKDEV_IOSIZE;
		sinfo->si_bsize_max = MAXBSIZE;

		/*
		 * Ask the device to fix up specinfo.  Typically the
		 * si_bsize_* parameters may need fixing up.
		 */
		if (nvp->v_type == VBLK) {
			if (bdevsw(dev) && bdevsw(dev)->d_parms)
				(*bdevsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		} else if (nvp->v_type == VCHR) {
			if (devsw(dev) && devsw(dev)->d_parms)
				(*devsw(dev)->d_parms)(dev, sinfo, DPARM_GET);
		}

		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	/*
	 * if ( vp && (vp->v_tag == VT_NULL))
	 * We have a vnode alias, but it is a trashed.
	 * Make it look like it's newley allocated. (by getnewvnode())
	 * The caller should use this instead.
	 */
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}

	vp->v_usecount++;

	if (VSHOULDBUSY(vp))
		vbusy(vp);
	if (flags & LK_TYPE_MASK) {
		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
			/*
			 * must expand vrele here because we do not want
			 * to call VOP_INACTIVE if the reference count
			 * drops back to zero since it was never really
			 * active. We must remove it from the free list
			 * before sleeping so that multiple processes do
			 * not try to recycle it.
			 */
			simple_lock(&vp->v_interlock);
			vp->v_usecount--;
			if (VSHOULDFREE(vp))
				vfree(vp);
			simple_unlock(&vp->v_interlock);
		}
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Bump the reference count of an already-referenced vnode.
 */
void
vref(struct vnode *vp)
{

	simple_lock(&vp->v_interlock);
	vp->v_usecount++;
	simple_unlock(&vp->v_interlock);
}

/*
 * Vnode put/release.
 * If count drops to zero, call inactive routine and return to freelist.
 */
void
vrele(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vrele: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		simple_unlock(&vp->v_interlock);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
			VOP_INACTIVE(vp, p);
		}

	} else {
#ifdef DIAGNOSTIC
		vprint("vrele: negative ref count", vp);
		simple_unlock(&vp->v_interlock);
#endif
		panic("vrele: negative ref cnt");
	}
}

/*
 * Release a reference on a locked vnode: like vrele(), but the caller
 * already holds the vnode lock, which is dropped on the way out.
 */
void
vput(vp)
	struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	KASSERT(vp != NULL, ("vput: null vp"));

	simple_lock(&vp->v_interlock);

	if (vp->v_usecount > 1) {

		vp->v_usecount--;
		VOP_UNLOCK(vp, LK_INTERLOCK, p);

		return;
	}

	if (vp->v_usecount == 1) {

		vp->v_usecount--;
		if (VSHOULDFREE(vp))
			vfree(vp);
	/*
	 * If we are doing a vput, the node is already locked, and we must
	 * call VOP_INACTIVE with the node locked.  So, in the case of
	 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
	 */
		simple_unlock(&vp->v_interlock);
		VOP_INACTIVE(vp, p);

	} else {
#ifdef DIAGNOSTIC
		vprint("vput: negative ref count", vp);
#endif
		panic("vput: negative ref cnt");
	}
}

/*
 * Somebody doesn't want the vnode recycled.
 */
void
vhold(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	vp->v_holdcnt++;
	if (VSHOULDBUSY(vp))
		vbusy(vp);
	splx(s);
}

/*
 * One less who cares about this vnode.
 */
void
vdrop(vp)
	register struct vnode *vp;
{
	int s;

	s = splbio();
	if (vp->v_holdcnt <= 0)
		panic("vdrop: holdcnt");
	vp->v_holdcnt--;
	if (VSHOULDFREE(vp))
		vfree(vp);
	splx(s);
}

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
*/
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Reclaim every vnode on mount point mp except skipvp.
 *
 * flags: SKIPSYSTEM skips VSYSTEM vnodes, WRITECLOSE restricts the
 * flush to writable regular files, FORCECLOSE forcibly reclaims busy
 * vnodes (devices are reverted to anonymous aliases instead of being
 * killed).  Returns EBUSY if any busy vnode could not be flushed.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
*/

/*
 * Disassociate the underlying filesystem from vnode vp.
 *
 * Called with the vnode interlock held; drains the vnode lock, flushes
 * buffers, tears down the VM object, runs VOP_INACTIVE/VOP_CLOSE as
 * needed (DOCLOSE), reclaims the vnode and points it at the deadfs ops.
 * Clears VXLOCK and wakes VXWANT sleepers when done.
 */
static void
vclean(vp, flags, p)
	struct vnode *vp;
	int flags;
	struct proc *p;
{
	int active;
	vm_object_t obj;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;

	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);

	/*
	 * Clean out any buffers associated with the vnode.
	 */
	vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	if ((obj = vp->v_object) != NULL) {
		if (obj->ref_count == 0) {
			/*
			 * This is a normal way of shutting down the object/vnode
			 * association.
			 */
			vm_object_terminate(obj);
		} else {
			/*
			 * Woe to the process that tries to page now :-).
			 */
			vm_pager_deallocate(obj);
		}
	}

	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");

	if (active)
		vrele(vp);

	cache_purge(vp);
	if (vp->v_vnlock) {
#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */
#ifdef DIAGNOSTIC
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
#endif
#endif
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	if (VSHOULDFREE(vp))
		vfree(vp);

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vn_pollgone(vp);
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
 */
int
vop_revoke(ap)
	struct vop_revoke_args /* {
		struct vnode *a_vp;
		int a_flags;
	} */ *ap;
{
	struct vnode *vp, *vq;
	struct proc *p = curproc;	/* XXX */

	KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));

	vp = ap->a_vp;
	simple_lock(&vp->v_interlock);

	if (vp->v_flag & VALIASED) {
		/*
		 * If a vgone (or vclean) is already in progress,
		 * wait until it is done and return.
		 */
		if (vp->v_flag & VXLOCK) {
			vp->v_flag |= VXWANT;
			simple_unlock(&vp->v_interlock);
			tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
			return (0);
		}
		/*
		 * Ensure that vp will not be vgone'd while we
		 * are eliminating its aliases.
		 */
		vp->v_flag |= VXLOCK;
		simple_unlock(&vp->v_interlock);
		while (vp->v_flag & VALIASED) {
			simple_lock(&spechash_slock);
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type || vp == vq)
					continue;
				simple_unlock(&spechash_slock);
				vgone(vq);
				break;
			}
			if (vq == NULLVP) {
				simple_unlock(&spechash_slock);
			}
		}
		/*
		 * Remove the lock so that vgone below will
		 * really eliminate the vnode after which time
		 * vgone will awaken any sleepers.
		 */
		simple_lock(&vp->v_interlock);
		vp->v_flag &= ~VXLOCK;
		if (vp->v_flag & VXWANT) {
			vp->v_flag &= ~VXWANT;
			wakeup(vp);
		}
	}
	vgonel(vp, p);
	return (0);
}

/*
 * Recycle an unused vnode to the front of the free list.
 * Release the passed interlock if the vnode will be recycled.
 */
/*
 * Returns 1 if the vnode was recycled (and `inter_lkp', if given, was
 * released), 0 if it was still in use.
 */
int
vrecycle(vp, inter_lkp, p)
	struct vnode *vp;
	struct simplelock *inter_lkp;
	struct proc *p;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_usecount == 0) {
		if (inter_lkp) {
			simple_unlock(inter_lkp);
		}
		/* vgonel() consumes the interlock. */
		vgonel(vp, p);
		return (1);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Eliminate all activity associated with a vnode
 * in preparation for reuse.
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 */
static void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int s;
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/*
	 * Clean out the filesystem specific data.
	 * vclean() consumes the interlock; re-take it afterwards.
	 */
	vclean(vp, DOCLOSE, p);
	simple_lock(&vp->v_interlock);

	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			/* Unlink vp from the middle of the hash chain. */
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			/*
			 * Find the remaining alias(es); if exactly one is
			 * left (vq == NULL after finding a single vx), it
			 * is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list. The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 */
	if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
		s = splbio();
		simple_lock(&vnode_free_list_slock);
		if (vp->v_flag & VFREE) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		} else if (vp->v_flag & VTBFREE) {
			TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
			vp->v_flag &= ~VTBFREE;
			freevnodes++;
		} else
			freevnodes++;
		vp->v_flag |= VFREE;
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		splx(s);
	}

	vp->v_type = VBAD;
	simple_unlock(&vp->v_interlock);
}

/*
 * Lookup a vnode by device number.
 * Returns 1 and sets *vpp on a match, 0 otherwise.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device.
 * Unused aliases encountered along the way are flushed out, which
 * forces a rescan of the hash chain from the top.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
/* Indexed by enum vtype; keep in sync with the vtype definition. */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

/*
 * Print a one/two-line description of vnode `vp' to the console,
 * prefixed by `label' (which may be NULL).
 */
void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[96];	/* large enough for all flag names combined */

	if (label != NULL)
		printf("%s: %p: ", label, (void *)vp);
	else
		printf("%p: ", (void *)vp);
	printf("type %s, usecount %d, writecount %d, refcount %d,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (vp->v_flag & VDOOMED)
		strcat(buf, "|VDOOMED");
	if (vp->v_flag & VFREE)
		strcat(buf, "|VFREE");
	if (vp->v_flag & VOBJBUF)
		strcat(buf, "|VOBJBUF");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);	/* skip leading '|' */
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/* NOTE(review): include target lost in extraction; presumably <ddb/ddb.h> -- confirm */
#include
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		/* vfs_busy dropped mountlist_slock; re-take before unbusy. */
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
*/ static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); static int vfs_sysctl SYSCTL_HANDLER_ARGS { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif #ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ if (name[0] != VFS_GENERIC) { for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[0]) break; if (vfsp == NULL) return (EOPNOTSUPP); return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf SYSCTL_HANDLER_ARGS { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error) return error; } return 0; } #endif /* 1 || COMPAT_PRELITE2 */ #if 0 #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
 */
/* ARGSUSED */
/*
 * (Compiled out; see the #if 0 above.)  Copies out, for every vnode in
 * the system, its kernel address followed by the vnode contents.
 */
static int
sysctl_vnode SYSCTL_HANDLER_ARGS
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *nvp, *vp;
	int error;

#define VPTRSZ	sizeof (struct vnode *)
#define VNODESZ	sizeof (struct vnode)

	req->lock = 0;
	if (!req->oldptr) /* Make an estimate */
		return (SYSCTL_OUT(req, 0,
			(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
again:
		simple_lock(&mntvnode_slock);
		for (vp = mp->mnt_vnodelist.lh_first;
		     vp != NULL;
		     vp = nvp) {
			/*
			 * Check that the vp is still associated with
			 * this filesystem.  RACE: could have been
			 * recycled onto the same filesystem.
			 */
			if (vp->v_mount != mp) {
				simple_unlock(&mntvnode_slock);
				goto again;
			}
			nvp = vp->v_mntvnodes.le_next;
			/* Drop the list lock around the copyout. */
			simple_unlock(&mntvnode_slock);
			if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
			    (error = SYSCTL_OUT(req, vp, VNODESZ)))
				return (error);
			simple_lock(&mntvnode_slock);
		}
		simple_unlock(&mntvnode_slock);
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);

	return (0);
}
#endif

/*
 * XXX
 * Exporting the vnode list on large systems causes them to crash.
 * Exporting the vnode list on medium systems causes sysctl to coredump.
 */
#if 0
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
	0, 0, sysctl_vnode, "S,vnode", "");
#endif

/*
 * Check to see if a filesystem is mounted on a block device.
 */
/*
 * Returns EBUSY if `vp' (or any alias of it) has a filesystem mounted
 * on it, 0 otherwise.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specmountpoint != NULL)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specmountpoint != NULL) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p;
	int error;

	if (curproc != NULL)
		p = curproc;
	else
		p = initproc;	/* XXX XXX should this be proc0? */
	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;
		/* Failures are reported but otherwise ignored. */
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
 */
/*
 * Install one export entry from `argp' into the export radix tree of
 * `nep' (or as the default export when ex_addrlen == 0).  Returns 0
 * on success, or EPERM / ENOBUFS / a copyin error on failure; the
 * netcred allocation is freed on every error path.
 */
static int
vfs_hang_addrlist(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	register int i;
	struct radix_node *rn;
	struct sockaddr *saddr, *smask = 0;
	struct domain *dom;
	int error;

	if (argp->ex_addrlen == 0) {
		/* Default export: no address, at most one per mount. */
		if (mp->mnt_flag & MNT_DEFEXPORTED)
			return (EPERM);
		np = &nep->ne_defexported;
		np->netc_exflags = argp->ex_flags;
		np->netc_anon = argp->ex_anon;
		np->netc_anon.cr_ref = 1;
		mp->mnt_flag |= MNT_DEFEXPORTED;
		return (0);
	}
	/* netcred, address and mask live in a single allocation. */
	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
	np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
	bzero((caddr_t) np, i);
	saddr = (struct sockaddr *) (np + 1);
	if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
		goto out;
	if (saddr->sa_len > argp->ex_addrlen)
		saddr->sa_len = argp->ex_addrlen;
	if (argp->ex_masklen) {
		smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
		if (error)
			goto out;
		if (smask->sa_len > argp->ex_masklen)
			smask->sa_len = argp->ex_masklen;
	}
	i = saddr->sa_family;
	if ((rnh = nep->ne_rtable[i]) == 0) {
		/*
		 * Seems silly to initialize every AF when most are not used,
		 * do so on demand here
		 */
		for (dom = domains; dom; dom = dom->dom_next)
			if (dom->dom_family == i && dom->dom_rtattach) {
				dom->dom_rtattach((void **) &nep->ne_rtable[i],
				    dom->dom_rtoffset);
				break;
			}
		if ((rnh = nep->ne_rtable[i]) == 0) {
			error = ENOBUFS;
			goto out;
		}
	}
	rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
	    np->netc_rnodes);
	if (rn == 0 || np != (struct netcred *) rn) {	/* already exists */
		error = EPERM;
		goto out;
	}
	np->netc_exflags = argp->ex_flags;
	np->netc_anon = argp->ex_anon;
	np->netc_anon.cr_ref = 1;
	return (0);
out:
	free(np, M_NETADDR);
	return (error);
}

/* ARGSUSED */
/*
 * rnh_walktree callback: delete one export entry from the radix tree
 * `w' and free it.
 */
static int
vfs_free_netcred(rn, w)
	struct radix_node *rn;
	void *w;
{
	register struct radix_node_head *rnh = (struct radix_node_head *) w;

	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
	free((caddr_t) rn, M_NETADDR);
	return (0);
}

/*
 * Free the net address hash lists that are hanging off the mount points.
 */
static void
vfs_free_addrlist(nep)
	struct netexport *nep;
{
	register int i;
	register struct radix_node_head *rnh;

	for (i = 0; i <= AF_MAX; i++)
		if ((rnh = nep->ne_rtable[i])) {
			(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
			    (caddr_t) rnh);
			free((caddr_t) rnh, M_RTABLE);
			nep->ne_rtable[i] = 0;
		}
}

/*
 * Update the export state of mount `mp' per `argp': MNT_DELEXPORT
 * tears down existing exports (including the WebNFS public fs),
 * MNT_EXPORTED installs a new entry.
 */
int
vfs_export(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;

	if (argp->ex_flags & MNT_DELEXPORT) {
		if (mp->mnt_flag & MNT_EXPUBLIC) {
			vfs_setpublicfs(NULL, NULL, NULL);
			mp->mnt_flag &= ~MNT_EXPUBLIC;
		}
		vfs_free_addrlist(nep);
		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
	}
	if (argp->ex_flags & MNT_EXPORTED) {
		if (argp->ex_flags & MNT_EXPUBLIC) {
			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
				return (error);
			mp->mnt_flag |= MNT_EXPUBLIC;
		}
		if ((error = vfs_hang_addrlist(mp, nep, argp)))
			return (error);
		mp->mnt_flag |= MNT_EXPORTED;
	}
	return (0);
}

/*
 * Set the publicly exported filesystem (WebNFS). Currently, only
 * one public filesystem is possible in the spec (RFC 2054 and 2055)
 */
int
vfs_setpublicfs(mp, nep, argp)
	struct mount *mp;
	struct netexport *nep;
	struct export_args *argp;
{
	int error;
	struct vnode *rvp;
	char *cp;

	/*
	 * mp == NULL -> invalidate the current info, the FS is
	 * no longer exported. May be called from either vfs_export
	 * or unmount, so check if it hasn't already been done.
	 */
	if (mp == NULL) {
		if (nfs_pub.np_valid) {
			nfs_pub.np_valid = 0;
			if (nfs_pub.np_index != NULL) {
				FREE(nfs_pub.np_index, M_TEMP);
				nfs_pub.np_index = NULL;
			}
		}
		return (0);
	}

	/*
	 * Only one allowed at a time.
	 */
	if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
		return (EBUSY);

	/*
	 * Get real filehandle for root of exported FS.
	 */
	bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
	nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;

	if ((error = VFS_ROOT(mp, &rvp)))
		return (error);

	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
		return (error);

	vput(rvp);

	/*
	 * If an indexfile was specified, pull it in.
	 */
	if (argp->ex_indexfile != NULL) {
		MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
		    M_WAITOK);
		error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
		    MAXNAMLEN, (size_t *)0);
		if (!error) {
			/*
			 * Check for illegal filenames.
			 */
			for (cp = nfs_pub.np_index; *cp; cp++) {
				if (*cp == '/') {
					error = EINVAL;
					break;
				}
			}
		}
		if (error) {
			FREE(nfs_pub.np_index, M_TEMP);
			return (error);
		}
	}

	nfs_pub.np_mount = mp;
	nfs_pub.np_valid = 1;
	return (0);
}

/*
 * Look up the export credentials for client address `nam' on mount
 * `mp'.  Falls back to the default export entry; returns NULL if the
 * client has no access.
 */
struct netcred *
vfs_export_lookup(mp, nep, nam)
	register struct mount *mp;
	struct netexport *nep;
	struct sockaddr *nam;
{
	register struct netcred *np;
	register struct radix_node_head *rnh;
	struct sockaddr *saddr;

	np = NULL;
	if (mp->mnt_flag & MNT_EXPORTED) {
		/*
		 * Lookup in the export list first.
		 */
		if (nam != NULL) {
			saddr = nam;
			rnh = nep->ne_rtable[saddr->sa_family];
			if (rnh != NULL) {
				np = (struct netcred *)
					(*rnh->rnh_matchaddr)((caddr_t)saddr,
							      rnh);
				/* The tree root is not a real entry. */
				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
					np = NULL;
			}
		}
		/*
		 * If no address match, use the default if it exists.
		 */
		if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
			np = &nep->ne_defexported;
	}
	return (np);
}

/*
 * perform msync on all vnodes under a mount point
 * the mount point must be locked.
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
	struct vm_object *obj;
	int anyio, tries;

	tries = 5;	/* bounded number of full-list retries */
loop:
	anyio = 0;
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		nvp = vp->v_mntvnodes.le_next;

		if (vp->v_mount != mp) {
			/* vnode was recycled off this mount; rescan */
			goto loop;
		}

		if (vp->v_flag & VXLOCK)	/* XXX: what if MNT_WAIT? */
			continue;

		if (flags != MNT_WAIT) {
			/* Best-effort pass: skip clean or locked vnodes. */
			obj = vp->v_object;
			if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0)
				continue;
			if (VOP_ISLOCKED(vp))
				continue;
		}

		simple_lock(&vp->v_interlock);
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			if (!vget(vp,
				LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
				if (vp->v_object) {
					vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0);
					anyio = 1;
				}
				vput(vp);
			}
		} else {
			simple_unlock(&vp->v_interlock);
		}
	}
	if (anyio && (--tries > 0))
		goto loop;
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

	if ((vp->v_type != VREG) && (vp->v_type != VBLK))
		return 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			object = vnode_pager_alloc(vp, vat.va_size, 0, 0);
		} else if (bdevsw(vp->v_rdev) != NULL) {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0);
		} else {
			goto retn;
		}
		/*
		 * Dereference the reference we just created.  This assumes
		 * that the object is associated with the vp.
		 */
		object->ref_count--;
		vp->v_usecount--;
	} else {
		if (object->flags & OBJ_DEAD) {
			/* Wait for the dying object to go away, then retry. */
			VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
	}

	KASSERT(vp->v_object != NULL, ("vfs_object_create: NULL object"));
	vp->v_flag |= VOBJBUF;

retn:
	return error;
}

/*
 * Move vnode `vp' onto the free list (at the head if it is marked
 * VAGE).  Runs at splbio under vnode_free_list_slock.
 */
static void
vfree(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	}
	if (vp->v_flag & VAGE) {
		TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
	} else {
		TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
	}
	freevnodes++;
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~VAGE;
	vp->v_flag |= VFREE;
	splx(s);
}

/*
 * Take vnode `vp' off the free (or to-be-freed) list because it is
 * coming back into use.  Counterpart of vfree().
 */
void
vbusy(vp)
	struct vnode *vp;
{
	int s;

	s = splbio();
	simple_lock(&vnode_free_list_slock);
	if (vp->v_flag & VTBFREE) {
		TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist);
		vp->v_flag &= ~VTBFREE;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		freevnodes--;
	}
	simple_unlock(&vnode_free_list_slock);
	vp->v_flag &= ~(VFREE|VAGE);
	splx(s);
}

/*
 * Record a process's interest in events which might happen to
 * a vnode.  Because poll uses the historic select-style interface
 * internally, this routine serves as both the ``check for any
 * pending events'' and the ``record my interest in future events''
 * functions.  (These are done together, while the lock is held,
 * to avoid race conditions.)
 */
int
vn_pollrecord(vp, p, events)
	struct vnode *vp;
	struct proc *p;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_revents & events) {
		/*
		 * This leaves events we are not interested
		 * in available for the other process which
		 * presumably had requested them
		 * (otherwise they would never have been
		 * recorded).
		 */
		events &= vp->v_pollinfo.vpi_revents;
		vp->v_pollinfo.vpi_revents &= ~events;

		simple_unlock(&vp->v_pollinfo.vpi_lock);
		return events;
	}
	vp->v_pollinfo.vpi_events |= events;
	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
	simple_unlock(&vp->v_pollinfo.vpi_lock);
	return 0;
}

/*
 * Note the occurrence of an event.  If the VN_POLLEVENT macro is used,
 * it is possible for us to miss an event due to race conditions, but
 * that condition is expected to be rare, so for the moment it is the
 * preferred interface.
 */
void
vn_pollevent(vp, events)
	struct vnode *vp;
	short events;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events & events) {
		/*
		 * We clear vpi_events so that we don't
		 * call selwakeup() twice if two events are
		 * posted before the polling process(es) is
		 * awakened.  This also ensures that we take at
		 * most one selwakeup() if the polling process
		 * is no longer interested.  However, it does
		 * mean that only one event can be noticed at
		 * a time.  (Perhaps we should only clear those
		 * event bits which we note?) XXX
		 */
		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
		vp->v_pollinfo.vpi_revents |= events;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Wake up anyone polling on vp because it is being revoked.
 * This depends on dead_poll() returning POLLHUP for correct
 * behavior.
 */
void
vn_pollgone(vp)
	struct vnode *vp;
{
	simple_lock(&vp->v_pollinfo.vpi_lock);
	if (vp->v_pollinfo.vpi_events) {
		vp->v_pollinfo.vpi_events = 0;
		selwakeup(&vp->v_pollinfo.vpi_selinfo);
	}
	simple_unlock(&vp->v_pollinfo.vpi_lock);
}

/*
 * Routine to create and manage a filesystem syncer vnode.
 */
/* Syncer vnode operations: most vops are stubbed out. */
#define sync_close ((int (*) __P((struct  vop_close_args *)))nullop)
static int	sync_fsync __P((struct  vop_fsync_args *));
static int	sync_inactive __P((struct  vop_inactive_args *));
static int	sync_reclaim  __P((struct  vop_reclaim_args *));
#define sync_lock ((int (*) __P((struct  vop_lock_args *)))vop_nolock)
#define sync_unlock ((int (*) __P((struct  vop_unlock_args *)))vop_nounlock)
static int	sync_print __P((struct vop_print_args *));
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)

static vop_t **sync_vnodeop_p;
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
	{ NULL, NULL }
};
static struct vnodeopv_desc sync_vnodeop_opv_desc =
	{ &sync_vnodeop_p, sync_vnodeop_entries };

VNODEOP_SET(sync_vnodeop_opv_desc);

/*
 * Create a new filesystem syncer vnode for the specified mount point.
 * On success mp->mnt_syncer is set; on failure it is cleared and the
 * getnewvnode() error is returned.
 */
int
vfs_allocate_syncvnode(mp)
	struct mount *mp;
{
	struct vnode *vp;
	static long start, incr, next;	/* worklist slot distribution state */
	int error;

	/* Allocate a new vnode */
	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
		mp->mnt_syncer = NULL;
		return (error);
	}
	vp->v_type = VNON;
	/*
	 * Place the vnode onto the syncer worklist. We attempt to
	 * scatter them about on the list so that they will go off
	 * at evenly distributed times even if all the filesystems
	 * are mounted at once.
	 */
	next += incr;
	if (next == 0 || next > syncer_maxdelay) {
		start /= 2;
		incr /= 2;
		if (start == 0) {
			start = syncer_maxdelay / 2;
			incr = syncer_maxdelay;
		}
		next = start;
	}
	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
	mp->mnt_syncer = vp;
	return (0);
}

/*
 * Do a lazy sync of the filesystem.
 */
static int
sync_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		struct ucred *a_cred;
		int a_waitfor;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *syncvp = ap->a_vp;
	struct mount *mp = syncvp->v_mount;
	struct proc *p = ap->a_p;
	int asyncflag;

	/*
	 * We only need to do something if this is a lazy evaluation.
	 */
	if (ap->a_waitfor != MNT_LAZY)
		return (0);

	/*
	 * Move ourselves to the back of the sync list.
	 */
	vn_syncer_add_to_worklist(syncvp, syncdelay);

	/*
	 * Walk the list of vnodes pushing all that are dirty and
	 * not already on the sync list.
	 */
	simple_lock(&mountlist_slock);
	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
		simple_unlock(&mountlist_slock);
		return (0);
	}
	/* Temporarily disable async writes so the sync is effective. */
	asyncflag = mp->mnt_flag & MNT_ASYNC;
	mp->mnt_flag &= ~MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
	if (asyncflag)
		mp->mnt_flag |= MNT_ASYNC;
	vfs_unbusy(mp, p);
	return (0);
}

/*
 * The syncer vnode is no longer referenced.
 */
static int
sync_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{

	vgone(ap->a_vp);
	return (0);
}

/*
 * The syncer vnode is no longer needed and is being decommissioned.
 *
 * Modifications to the worklist must be protected at splbio().
 */
static int
sync_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int s;

	s = splbio();
	vp->v_mount->mnt_syncer = NULL;
	if (vp->v_flag & VONWORKLST) {
		LIST_REMOVE(vp, v_synclist);
		vp->v_flag &= ~VONWORKLST;
	}
	splx(s);

	return (0);
}

/*
 * Print out a syncer vnode.
 */
static int
sync_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	printf("syncer vnode");
	/* A lock structure only exists if one was attached; show it. */
	if (vp->v_vnlock != NULL)
		lockmgr_printinfo(vp->v_vnlock);
	printf("\n");
	return (0);
}
diff --git a/sys/sys/bio.h b/sys/sys/bio.h
index 42f26e437968..e6d23d86d9cc 100644
--- a/sys/sys/bio.h
+++ b/sys/sys/bio.h
@@ -1,514 +1,514 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.73 1999/06/27 11:40:03 peter Exp $ + * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include struct buf; struct mount; struct vnode; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start) __P((struct buf *)); void (*io_complete) __P((struct buf *)); void (*io_deallocate) __P((struct buf *)); int (*io_fsync) __P((struct vnode *)); int (*io_sync) __P((struct mount *)); } bioops; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. 
Number of bytes remaining in I/O.  After an I/O operation
 *	completes, b_resid is usually 0 indicating 100% success.
 */
struct buf {
	LIST_ENTRY(buf) b_hash;		/* Hash chain. */
	TAILQ_ENTRY(buf) b_vnbufs;	/* Buffer's associated vnode. */
	TAILQ_ENTRY(buf) b_freelist;	/* Free list position if not active. */
	TAILQ_ENTRY(buf) b_act;		/* Device driver queue when active. *new* */
	long	b_flags;		/* B_* flags. */
	unsigned short b_qindex;	/* buffer queue index */
	unsigned char b_usecount;	/* buffer use count */
	unsigned char b_xflags;		/* extra flags */
	struct lock b_lock;		/* Buffer lock */
	int	b_error;		/* Errno value. */
	long	b_bufsize;		/* Allocated buffer size. */
	long	b_bcount;		/* Valid bytes in buffer. */
	long	b_resid;		/* Remaining I/O. */
	dev_t	b_dev;			/* Device associated with buffer. */
	caddr_t	b_data;			/* Memory, superblocks, indirect etc. */
	caddr_t	b_kvabase;		/* base kva for buffer */
	int	b_kvasize;		/* size of kva for buffer */
	daddr_t	b_lblkno;		/* Logical block number. */
	daddr_t	b_blkno;		/* Underlying physical block number. */
	off_t	b_offset;		/* Offset into file */
					/* Function to call upon completion. */
	void	(*b_iodone) __P((struct buf *));
					/* For nested b_iodone's. */
	struct	iodone_chain *b_iodone_chain;
	struct	vnode *b_vp;		/* Device vnode. */
	int	b_dirtyoff;		/* Offset in buffer of dirty region. */
	int	b_dirtyend;		/* Offset of end of dirty region. */
	struct	ucred *b_rcred;		/* Read credentials reference. */
	struct	ucred *b_wcred;		/* Write credentials reference. */
	daddr_t	b_pblkno;		/* physical block number */
	void	*b_saveaddr;		/* Original b_addr for physio. */
	void	*b_driver1;		/* for private use by the driver */
	void	*b_driver2;		/* for private use by the driver */
	void	*b_caller1;		/* for private use by the driver */
	void	*b_caller2;		/* for private use by the driver */
	union	pager_info {
		void	*pg_spc;
		int	pg_reqpage;
	} b_pager;
	union	cluster_info {
		TAILQ_HEAD(cluster_list_head, buf) cluster_head;
		TAILQ_ENTRY(buf) cluster_entry;
	} b_cluster;
	struct	vm_page *b_pages[btoc(MAXPHYS)];
	int		b_npages;
	struct	workhead b_dep;		/* List of filesystem dependencies. */
	struct chain_info {		/* buffer chaining */
		struct buf *parent;
		int count;
	} b_chain;
};

#define b_spc	b_pager.pg_spc

/*
 * These flags are kept in b_flags.
 *
 * Notes:
 *
 *	B_ASYNC		VOP calls on bp's are usually async whether or not
 *			B_ASYNC is set, but some subsystems, such as NFS, like
 *			to know what is best for the caller so they can
 *			optimize the I/O.
 *
 *	B_PAGING	Indicates that bp is being used by the paging system or
 *			some paging system and that the bp is not linked into
 *			the b_vp's clean/dirty linked lists or ref counts.
 *			Buffer vp reassignments are illegal in this case.
 *
 *	B_CACHE		This may only be set if the buffer is entirely valid.
 *			The situation where B_DELWRI is set and B_CACHE is
 *			clear MUST be committed to disk by getblk() so
 *			B_DELWRI can also be cleared.  See the comments for
 *			getblk() in kern/vfs_bio.c.  If B_CACHE is clear,
 *			the caller is expected to clear B_ERROR|B_INVAL,
 *			set B_READ, and initiate an I/O.
 *
 *			The 'entire buffer' is defined to be the range from
 *			0 through b_bcount.
 *
 *	B_MALLOC	Request that the buffer be allocated from the malloc
 *			pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned.
 *
 *	B_VMIO		Indicates that the buffer is tied into an VM object.
 *			The buffer's data is always PAGE_SIZE aligned even
 *			if b_bufsize and b_bcount are not.  ( b_bufsize is
 *			always at least DEV_BSIZE aligned, though ).
 *
 */

#define	B_AGE		0x00000001	/* Move to age queue when I/O done. */
#define	B_NEEDCOMMIT	0x00000002	/* Append-write in progress.
*/ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_UNUSED1 0x00000010 /* Old B_BUSY */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_FREEBUF 0x00000100 /* Instruct driver: free blocks */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_SCANNED 0x00001000 /* VOP_FSYNC funcs mark written bufs */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_DIRTY 0x00200000 /* Needs writing later. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANT 0x00800000 /* Used by vm_pager.c */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. 
*/ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_ORDERED 0x08000000 /* Must guarantee I/O ordering */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_AUTOCHAINDONE 0x80000000 /* Available flag */ #define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define B_VNDIRTY 0x01 /* On vnode dirty list */ #define B_VNCLEAN 0x02 /* On vnode clean list */ #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ #ifdef KERNEL /* * Buffer locking */ struct simplelock buftimelock; /* Interlock on setting prio and timo */ extern char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curproc */ /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ static __inline int BUF_LOCK __P((struct buf *, int)); static __inline int BUF_LOCK(struct buf *bp, int locktype) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = buf_wmesg; bp->b_lock.lk_prio = PRIBIO + 4; bp->b_lock.lk_timo = 0; ret = lockmgr(&(bp)->b_lock, locktype, &buftimelock, curproc); splx(s); return ret; } /* * Get a lock sleeping with specified interruptably and timeout. 
*/ static __inline int BUF_TIMELOCK __P((struct buf *, int, char *, int, int)); static __inline int BUF_TIMELOCK(struct buf *bp, int locktype, char *wmesg, int catch, int timo) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = wmesg; bp->b_lock.lk_prio = (PRIBIO + 4) | catch; bp->b_lock.lk_timo = timo; ret = lockmgr(&(bp)->b_lock, (locktype), &buftimelock, curproc); splx(s); return ret; } /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ static __inline void BUF_UNLOCK __P((struct buf *)); static __inline void BUF_UNLOCK(struct buf *bp) { int s; s = splbio(); lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curproc); splx(s); } /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ if (BUF_REFCNT(bp) > 0) \ panic("free locked buf") /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ static __inline void BUF_KERNPROC __P((struct buf *)); static __inline void BUF_KERNPROC(struct buf *bp) { bp->b_lock.lk_lockholder = LK_KERNPROC; } /* * Find out the number of references to a lock. */ static __inline int BUF_REFCNT __P((struct buf *)); static __inline int BUF_REFCNT(struct buf *bp) { int s, ret; s = splbio(); ret = lockcount(&(bp)->b_lock); splx(s); return ret; } #endif /* KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. 
*/ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef KERNEL static __inline void bufq_init __P((struct buf_queue_head *head)); static __inline void bufq_insert_tail __P((struct buf_queue_head *head, struct buf *bp)); static __inline void bufq_remove __P((struct buf_queue_head *head, struct buf *bp)); static __inline struct buf *bufq_first __P((struct buf_queue_head *head)); static __inline void bufq_init(struct buf_queue_head *head) { TAILQ_INIT(&head->queue); head->last_pblkno = 0; head->insert_point = NULL; head->switch_point = NULL; } static __inline void bufq_insert_tail(struct buf_queue_head *head, struct buf *bp) { if ((bp->b_flags & B_ORDERED) != 0) { head->insert_point = bp; head->switch_point = NULL; } TAILQ_INSERT_TAIL(&head->queue, bp, b_act); } static __inline void bufq_remove(struct buf_queue_head *head, struct buf *bp) { if (bp == head->switch_point) head->switch_point = TAILQ_NEXT(bp, b_act); if (bp == head->insert_point) { head->insert_point = TAILQ_PREV(bp, buf_queue, b_act); if (head->insert_point == NULL) head->last_pblkno = 0; } else if (bp == TAILQ_FIRST(&head->queue)) head->last_pblkno = bp->b_pblkno; TAILQ_REMOVE(&head->queue, bp, b_act); if (TAILQ_FIRST(&head->queue) == head->switch_point) head->switch_point = NULL; } static __inline struct buf * bufq_first(struct buf_queue_head *head) { return (TAILQ_FIRST(&head->queue)); } #endif /* KERNEL */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. 
*/ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ -#define QUEUE_LRU 2 /* useful buffers */ -#define QUEUE_VMIO 3 /* VMIO buffers */ -#define QUEUE_AGE 4 /* not-useful buffers */ -#define QUEUE_EMPTY 5 /* empty buffer headers*/ +#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ +#define QUEUE_DIRTY 3 /* B_DELWRI buffers */ +#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ +#define QUEUE_EMPTY 5 /* empty buffer headers */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. 
*/ extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); int physread __P((dev_t dev, struct uio *uio, int ioflag)); int physwrite __P((dev_t dev, struct uio *uio, int ioflag)); u_int minphys __P((struct buf *)); void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct 
vnode *, struct buf *)); void pbrelvp __P((struct buf *)); int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/buf.h b/sys/sys/buf.h index 42f26e437968..e6d23d86d9cc 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -1,514 +1,514 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.9 (Berkeley) 3/30/95 - * $Id: buf.h,v 1.73 1999/06/27 11:40:03 peter Exp $ + * $Id: buf.h,v 1.74 1999/06/29 05:59:47 peter Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #include struct buf; struct mount; struct vnode; /* * To avoid including */ LIST_HEAD(workhead, worklist); /* * These are currently used only by the soft dependency code, hence * are stored once in a global variable. If other subsystems wanted * to use these hooks, a pointer to a set of bio_ops could be added * to each buffer. */ extern struct bio_ops { void (*io_start) __P((struct buf *)); void (*io_complete) __P((struct buf *)); void (*io_deallocate) __P((struct buf *)); int (*io_fsync) __P((struct vnode *)); int (*io_sync) __P((struct mount *)); } bioops; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; /* * The buffer header describes an I/O operation in the kernel. * * NOTES: * b_bufsize, b_bcount. b_bufsize is the allocation size of the * buffer, either DEV_BSIZE or PAGE_SIZE aligned. b_bcount is the * originally requested buffer size and can serve as a bounds check * against EOF. 
For most, but not all uses, b_bcount == b_bufsize. * * b_dirtyoff, b_dirtyend. Buffers support piecemeal, unaligned * ranges of dirty data that need to be written to backing store. * The range is typically clipped at b_bcount ( not b_bufsize ). * * b_resid. Number of bytes remaining in I/O. After an I/O operation * completes, b_resid is usually 0 indicating 100% success. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ TAILQ_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_usecount; /* buffer use count */ unsigned char b_xflags; /* extra flags */ struct lock b_lock; /* Buffer lock */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ caddr_t b_data; /* Memory, superblocks, indirect etc. */ caddr_t b_kvabase; /* base kva for buffer */ int b_kvasize; /* size of kva for buffer */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ off_t b_offset; /* Offset into file */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); /* For nested b_iodone's. */ struct iodone_chain *b_iodone_chain; struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ daddr_t b_pblkno; /* physical block number */ void *b_saveaddr; /* Original b_addr for physio. 
*/ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_caller1; /* for private use by the driver */ void *b_caller2; /* for private use by the driver */ union pager_info { void *pg_spc; int pg_reqpage; } b_pager; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* List of filesystem dependencies. */ struct chain_info { /* buffer chaining */ struct buf *parent; int count; } b_chain; }; #define b_spc b_pager.pg_spc /* * These flags are kept in b_flags. * * Notes: * * B_ASYNC VOP calls on bp's are usually async whether or not * B_ASYNC is set, but some subsystems, such as NFS, like * to know what is best for the caller so they can * optimize the I/O. * * B_PAGING Indicates that bp is being used by the paging system or * some paging system and that the bp is not linked into * the b_vp's clean/dirty linked lists or ref counts. * Buffer vp reassignments are illegal in this case. * * B_CACHE This may only be set if the buffer is entirely valid. * The situation where B_DELWRI is set and B_CACHE is * clear MUST be committed to disk by getblk() so * B_DELWRI can also be cleared. See the comments for * getblk() in kern/vfs_bio.c. If B_CACHE is clear, * the caller is expected to clear B_ERROR|B_INVAL, * set B_READ, and initiate an I/O. * * The 'entire buffer' is defined to be the range from * 0 through b_bcount. * * B_MALLOC Request that the buffer be allocated from the malloc * pool, DEV_BSIZE aligned instead of PAGE_SIZE aligned. * * B_VMIO Indicates that the buffer is tied into an VM object. * The buffer's data is always PAGE_SIZE aligned even * if b_bufsize and b_bcount are not. ( b_bufsize is * always at least DEV_BSIZE aligned, though ). * */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. 
*/ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_UNUSED1 0x00000010 /* Old B_BUSY */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_FREEBUF 0x00000100 /* Instruct driver: free blocks */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_SCANNED 0x00001000 /* VOP_FSYNC funcs mark written bufs */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_DIRTY 0x00200000 /* Needs writing later. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANT 0x00800000 /* Used by vm_pager.c */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. 
*/ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_ORDERED 0x08000000 /* Must guarantee I/O ordering */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_AUTOCHAINDONE 0x80000000 /* Available flag */ #define PRINT_BUF_FLAGS "\20\40autochain\37cluster\36vmio\35ram\34ordered" \ "\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \ "\25read\24raw\23phys\22clusterok\21malloc\20nocache" \ "\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \ "\10delwri\7call\6cache\4bad\3async\2needcommit\1age" /* * These flags are kept in b_xflags. */ #define B_VNDIRTY 0x01 /* On vnode dirty list */ #define B_VNCLEAN 0x02 /* On vnode clean list */ #define NOOFFSET (-1LL) /* No buffer offset calculated yet */ #ifdef KERNEL /* * Buffer locking */ struct simplelock buftimelock; /* Interlock on setting prio and timo */ extern char *buf_wmesg; /* Default buffer lock message */ #define BUF_WMESG "bufwait" #include /* XXX for curproc */ /* * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ lockinit(&(bp)->b_lock, PRIBIO + 4, buf_wmesg, 0, 0) /* * * Get a lock sleeping non-interruptably until it becomes available. */ static __inline int BUF_LOCK __P((struct buf *, int)); static __inline int BUF_LOCK(struct buf *bp, int locktype) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = buf_wmesg; bp->b_lock.lk_prio = PRIBIO + 4; bp->b_lock.lk_timo = 0; ret = lockmgr(&(bp)->b_lock, locktype, &buftimelock, curproc); splx(s); return ret; } /* * Get a lock sleeping with specified interruptably and timeout. 
*/ static __inline int BUF_TIMELOCK __P((struct buf *, int, char *, int, int)); static __inline int BUF_TIMELOCK(struct buf *bp, int locktype, char *wmesg, int catch, int timo) { int s, ret; s = splbio(); simple_lock(&buftimelock); locktype |= LK_INTERLOCK; bp->b_lock.lk_wmesg = wmesg; bp->b_lock.lk_prio = (PRIBIO + 4) | catch; bp->b_lock.lk_timo = timo; ret = lockmgr(&(bp)->b_lock, (locktype), &buftimelock, curproc); splx(s); return ret; } /* * Release a lock. Only the acquiring process may free the lock unless * it has been handed off to biodone. */ static __inline void BUF_UNLOCK __P((struct buf *)); static __inline void BUF_UNLOCK(struct buf *bp) { int s; s = splbio(); lockmgr(&(bp)->b_lock, LK_RELEASE, NULL, curproc); splx(s); } /* * Free a buffer lock. */ #define BUF_LOCKFREE(bp) \ if (BUF_REFCNT(bp) > 0) \ panic("free locked buf") /* * When initiating asynchronous I/O, change ownership of the lock to the * kernel. Once done, the lock may legally released by biodone. The * original owning process can no longer acquire it recursively, but must * wait until the I/O is completed and the lock has been freed by biodone. */ static __inline void BUF_KERNPROC __P((struct buf *)); static __inline void BUF_KERNPROC(struct buf *bp) { bp->b_lock.lk_lockholder = LK_KERNPROC; } /* * Find out the number of references to a lock. */ static __inline int BUF_REFCNT __P((struct buf *)); static __inline int BUF_REFCNT(struct buf *bp) { int s, ret; s = splbio(); ret = lockcount(&(bp)->b_lock); splx(s); return ret; } #endif /* KERNEL */ struct buf_queue_head { TAILQ_HEAD(buf_queue, buf) queue; daddr_t last_pblkno; struct buf *insert_point; struct buf *switch_point; }; /* * This structure describes a clustered I/O. It is stored in the b_saveaddr * field of the buffer on which I/O is done. At I/O completion, cluster * callback uses the structure to parcel I/O's to individual buffers, and * then free's this structure. */ struct cluster_save { long bs_bcount; /* Saved b_bcount. 
*/ long bs_bufsize; /* Saved b_bufsize. */ void *bs_saveaddr; /* Saved b_addr. */ int bs_nchildren; /* Number of associated buffers. */ struct buf **bs_children; /* List of associated buffers. */ }; #ifdef KERNEL static __inline void bufq_init __P((struct buf_queue_head *head)); static __inline void bufq_insert_tail __P((struct buf_queue_head *head, struct buf *bp)); static __inline void bufq_remove __P((struct buf_queue_head *head, struct buf *bp)); static __inline struct buf *bufq_first __P((struct buf_queue_head *head)); static __inline void bufq_init(struct buf_queue_head *head) { TAILQ_INIT(&head->queue); head->last_pblkno = 0; head->insert_point = NULL; head->switch_point = NULL; } static __inline void bufq_insert_tail(struct buf_queue_head *head, struct buf *bp) { if ((bp->b_flags & B_ORDERED) != 0) { head->insert_point = bp; head->switch_point = NULL; } TAILQ_INSERT_TAIL(&head->queue, bp, b_act); } static __inline void bufq_remove(struct buf_queue_head *head, struct buf *bp) { if (bp == head->switch_point) head->switch_point = TAILQ_NEXT(bp, b_act); if (bp == head->insert_point) { head->insert_point = TAILQ_PREV(bp, buf_queue, b_act); if (head->insert_point == NULL) head->last_pblkno = 0; } else if (bp == TAILQ_FIRST(&head->queue)) head->last_pblkno = bp->b_pblkno; TAILQ_REMOVE(&head->queue, bp, b_act); if (TAILQ_FIRST(&head->queue) == head->switch_point) head->switch_point = NULL; } static __inline struct buf * bufq_first(struct buf_queue_head *head) { return (TAILQ_FIRST(&head->queue)); } #endif /* KERNEL */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((uintptr_t)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. 
*/ #define BUFFER_QUEUES 6 /* number of free buffer queues */ #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ -#define QUEUE_LRU 2 /* useful buffers */ -#define QUEUE_VMIO 3 /* VMIO buffers */ -#define QUEUE_AGE 4 /* not-useful buffers */ -#define QUEUE_EMPTY 5 /* empty buffer headers*/ +#define QUEUE_CLEAN 2 /* non-B_DELWRI buffers */ +#define QUEUE_DIRTY 3 /* B_DELWRI buffers */ +#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */ +#define QUEUE_EMPTY 5 /* empty buffer headers */ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. */ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. 
*/ extern TAILQ_HEAD(swqueue, buf) bswlist; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; struct uio; void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void bdirty __P((struct buf *)); void bundirty __P((struct buf *)); int bowrite __P((struct buf *)); void brelse __P((struct buf *)); void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((int *)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); int physread __P((dev_t dev, struct uio *uio, int ioflag)); int physwrite __P((dev_t dev, struct uio *uio, int ioflag)); u_int minphys __P((struct buf *)); void vfs_bio_set_validclean __P((struct buf *, int base, int size)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages __P((struct buf *)); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *, int *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct 
vnode *, struct buf *)); void pbrelvp __P((struct buf *)); int allocbuf __P((struct buf *bp, int size)); void reassignbuf __P((struct buf *, struct vnode *)); void pbreassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((int *)); #endif /* KERNEL */ #endif /* !_SYS_BUF_H_ */ diff --git a/sys/sys/kernel.h b/sys/sys/kernel.h index e483763917d2..5d41ccfeea9e 100644 --- a/sys/sys/kernel.h +++ b/sys/sys/kernel.h @@ -1,268 +1,269 @@ /*- * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kernel.h 8.3 (Berkeley) 1/21/94 - * $Id: kernel.h,v 1.55 1999/05/06 13:42:25 peter Exp $ + * $Id: kernel.h,v 1.56 1999/07/01 13:21:43 peter Exp $ */ #ifndef _SYS_KERNEL_H_ #define _SYS_KERNEL_H_ #include #ifdef KERNEL /* Global variables for the kernel. */ /* 1.1 */ extern long hostid; extern char hostname[MAXHOSTNAMELEN]; extern int hostnamelen; extern char domainname[MAXHOSTNAMELEN]; extern int domainnamelen; extern char kernelname[MAXPATHLEN]; /* 1.2 */ extern struct timeval boottime; extern struct timezone tz; /* XXX */ extern int tick; /* usec per tick (1000000 / hz) */ extern int tickadj; /* "standard" clock skew, us./tick */ extern int hz; /* system clock's frequency */ extern int psratio; /* ratio: prof / stat */ extern int stathz; /* statistics clock's frequency */ extern int profhz; /* profiling clock's frequency */ extern int ticks; extern int lbolt; /* once a second sleep address */ extern int tickdelta; extern long timedelta; #endif /* KERNEL */ /* * Enumerated types for known system startup interfaces. * * Startup occurs in ascending numeric order; the list entries are * sorted prior to attempting startup to guarantee order. 
Items * of the same level are arbitrated for order based on the 'order' * element. * * These numbers are arbitrary and are chosen ONLY for ordering; the * enumeration values are explicit rather than implicit to provide * for binary compatibility with inserted elements. * * The SI_SUB_RUN_SCHEDULER value must have the highest lexical value. * * The SI_SUB_CONSOLE and SI_SUB_SWAP values represent values used by * the BSD 4.4Lite but not by FreeBSD; they are maintained in dependent * order to support porting. * * The SI_SUB_PROTO_BEGIN and SI_SUB_PROTO_END bracket a range of * initializations to take place at splimp(). This is a historical * wart that should be removed -- probably running everything at * splimp() until the first init that doesn't want it is the correct * fix. They are currently present to ensure historical behavior. */ enum sysinit_sub_id { SI_SUB_DUMMY = 0x0000000, /* not executed; for linker*/ SI_SUB_DONE = 0x0000001, /* processed*/ SI_SUB_CONSOLE = 0x0800000, /* console*/ SI_SUB_COPYRIGHT = 0x0800001, /* first use of console*/ SI_SUB_VM = 0x1000000, /* virtual memory system init*/ SI_SUB_KMEM = 0x1800000, /* kernel memory*/ SI_SUB_CPU = 0x2000000, /* CPU resource(s)*/ SI_SUB_KLD = 0x2100000, /* KLD and module setup */ SI_SUB_INTRINSIC = 0x2200000, /* proc 0*/ SI_SUB_DEVFS = 0x2300000, /* get DEVFS ready */ SI_SUB_DRIVERS = 0x2400000, /* Let Drivers initialize */ SI_SUB_CONFIGURE = 0x2500000, /* Configure devices */ SI_SUB_RUN_QUEUE = 0x3000000, /* the run queue*/ SI_SUB_VM_CONF = 0x3800000, /* config VM, set limits*/ SI_SUB_VFS = 0x4000000, /* virtual file system*/ SI_SUB_CLOCKS = 0x4800000, /* real time and stat clocks*/ SI_SUB_MBUF = 0x5000000, /* mbufs*/ SI_SUB_CLIST = 0x5800000, /* clists*/ SI_SUB_SYSV_SHM = 0x6400000, /* System V shared memory*/ SI_SUB_SYSV_SEM = 0x6800000, /* System V semaphores*/ SI_SUB_SYSV_MSG = 0x6C00000, /* System V message queues*/ SI_SUB_P1003_1B = 0x6E00000, /* P1003.1B realtime */ SI_SUB_PSEUDO = 0x7000000, /* pseudo 
devices*/ SI_SUB_EXEC = 0x7400000, /* execve() handlers */ SI_SUB_PROTO_BEGIN = 0x8000000, /* XXX: set splimp (kludge)*/ SI_SUB_PROTO_IF = 0x8400000, /* interfaces*/ SI_SUB_PROTO_DOMAIN = 0x8800000, /* domains (address families?)*/ SI_SUB_PROTO_END = 0x8ffffff, /* XXX: set splx (kludge)*/ SI_SUB_KPROF = 0x9000000, /* kernel profiling*/ SI_SUB_KICK_SCHEDULER = 0xa000000, /* start the timeout events*/ SI_SUB_INT_CONFIG_HOOKS = 0xa800000, /* Interrupts enabled config */ SI_SUB_ROOT_CONF = 0xb000000, /* Find root devices */ SI_SUB_DUMP_CONF = 0xb200000, /* Find dump devices */ SI_SUB_MOUNT_ROOT = 0xb400000, /* root mount*/ SI_SUB_ROOT_FDTAB = 0xb800000, /* root vnode in fd table...*/ SI_SUB_SWAP = 0xc000000, /* swap*/ SI_SUB_INTRINSIC_POST = 0xd000000, /* proc 0 cleanup*/ SI_SUB_KTHREAD_INIT = 0xe000000, /* init process*/ SI_SUB_KTHREAD_PAGE = 0xe400000, /* pageout daemon*/ SI_SUB_KTHREAD_VM = 0xe800000, /* vm daemon*/ + SI_SUB_KTHREAD_BUF = 0xea00000, /* buffer daemon*/ SI_SUB_KTHREAD_UPDATE = 0xec00000, /* update daemon*/ SI_SUB_KTHREAD_IDLE = 0xee00000, /* idle procs*/ SI_SUB_SMP = 0xf000000, /* idle procs*/ SI_SUB_RUN_SCHEDULER = 0xfffffff /* scheduler: no return*/ }; /* * Some enumerated orders; "ANY" sorts last. */ enum sysinit_elem_order { SI_ORDER_FIRST = 0x0000000, /* first*/ SI_ORDER_SECOND = 0x0000001, /* second*/ SI_ORDER_THIRD = 0x0000002, /* third*/ SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ SI_ORDER_ANY = 0xfffffff /* last*/ }; /* * A system initialization call instance * * At the moment there is one instance of sysinit. We probably do not * want two which is why this code is if'd out, but we definitely want * to discern SYSINIT's which take non-constant data pointers and * SYSINIT's which take constant data pointers, * * The C_* macros take functions expecting const void * arguments * while the non-C_* macros take functions expecting just void * arguments. 
* * With -Wcast-qual on, the compiler issues warnings: * - if we pass non-const data or functions taking non-const data * to a C_* macro. * * - if we pass const data to the normal macros * * However, no warning is issued if we pass a function taking const data * through a normal non-const macro. This is ok because the function is * saying it won't modify the data so we don't care whether the data is * modifiable or not. */ typedef void (*sysinit_nfunc_t) __P((void *)); typedef void (*sysinit_cfunc_t) __P((const void *)); struct sysinit { unsigned int subsystem; /* subsystem identifier*/ unsigned int order; /* init order within subsystem*/ sysinit_cfunc_t func; /* function */ const void *udata; /* multiplexer/argument */ }; /* * Default: no special processing * * The C_ version of SYSINIT is for data pointers to const * data ( and functions taking data pointers to const data ). * At the moment it is no different from SYSINIT and thus * still results in warnings. * * The casts are necessary to have the compiler produce the * correct warnings when -Wcast-qual is used. * */ #define C_SYSINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_init = { \ subsystem, \ order, \ func, \ ident \ }; \ DATA_SET(sysinit_set,uniquifier ## _sys_init); #define SYSINIT(uniquifier, subsystem, order, func, ident) \ C_SYSINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)ident) /* * Called on module unload: no special processing */ #define C_SYSUNINIT(uniquifier, subsystem, order, func, ident) \ static struct sysinit uniquifier ## _sys_uninit = { \ subsystem, \ order, \ func, \ ident \ }; \ DATA_SET(sysuninit_set,uniquifier ## _sys_uninit) #define SYSUNINIT(uniquifier, subsystem, order, func, ident) \ C_SYSUNINIT(uniquifier, subsystem, order, \ (sysinit_cfunc_t)(sysinit_nfunc_t)func, (void *)ident) void sysinit_add __P((struct sysinit **set)); /* * Compatibility. To be deprecated after LKM is removed. 
*/ #include #define PSEUDO_SET(sym, name) \ static int name ## _modevent(module_t mod, int type, void *data) \ { \ void (*initfunc)(void *) = (void (*)(void *))data; \ switch (type) { \ case MOD_LOAD: \ /* printf(#name " module load\n"); */ \ initfunc(NULL); \ break; \ case MOD_UNLOAD: \ printf(#name " module unload - not possible for this module type\n"); \ return EINVAL; \ } \ return 0; \ } \ static moduledata_t name ## _mod = { \ #name, \ name ## _modevent, \ (void *)sym \ }; \ DECLARE_MODULE(name, name ## _mod, SI_SUB_PSEUDO, SI_ORDER_ANY) extern struct linker_set execsw_set; #endif /* !_SYS_KERNEL_H_*/ diff --git a/sys/sys/proc.h b/sys/sys/proc.h index 8bc314792b7e..dcecdc99a352 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -1,410 +1,410 @@ /*- * Copyright (c) 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)proc.h 8.15 (Berkeley) 5/19/95 - * $Id: proc.h,v 1.83 1999/06/30 15:33:41 peter Exp $ + * $Id: proc.h,v 1.84 1999/07/01 13:21:45 peter Exp $ */ #ifndef _SYS_PROC_H_ #define _SYS_PROC_H_ #include /* Machine-dependent proc substruct. */ #include /* For struct callout_handle. */ #include #include #include /* For struct rtprio. */ #include #ifndef KERNEL #include /* For structs itimerval, timeval. */ #endif #include /* * One structure allocated per session. */ struct session { int s_count; /* Ref cnt; pgrps in session. */ struct proc *s_leader; /* Session leader. */ struct vnode *s_ttyvp; /* Vnode of controlling terminal. */ struct tty *s_ttyp; /* Controlling terminal. */ pid_t s_sid; /* Session ID */ char s_login[roundup(MAXLOGNAME, sizeof(long))]; /* Setlogin() name. */ }; /* * One structure allocated per process group. */ struct pgrp { LIST_ENTRY(pgrp) pg_hash; /* Hash chain. */ LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members. */ struct session *pg_session; /* Pointer to session. 
*/ struct sigiolst pg_sigiolst; /* List of sigio sources. */ pid_t pg_id; /* Pgrp id. */ int pg_jobc; /* # procs qualifying pgrp for job control */ }; struct procsig { sigset_t ps_sigignore; /* Signals being ignored. */ sigset_t ps_sigcatch; /* Signals being caught by user. */ int ps_flag; struct sigacts *ps_sigacts; int ps_refcnt; }; /* * pasleep structure, used by asleep() syscall to hold requested priority * and timeout values for await(). */ struct pasleep { int as_priority; /* Async priority. */ int as_timo; /* Async timeout. */ }; /* * Description of a process. * * This structure contains the information needed to manage a thread of * control, known in UN*X as a process; it has references to substructures * containing descriptions of things that the process uses, but may share * with related processes. The process structure and the substructures * are always addressable except for those marked "(PROC ONLY)" below, * which might be addressable only on a processor on which the process * is running. */ struct jail; struct proc { TAILQ_ENTRY(proc) p_procq; /* run/sleep queue. */ LIST_ENTRY(proc) p_list; /* List of all processes. */ /* substructures: */ struct pcred *p_cred; /* Process owner's identity. */ struct filedesc *p_fd; /* Ptr to open files structure. */ struct pstats *p_stats; /* Accounting/statistics (PROC ONLY). */ struct plimit *p_limit; /* Process limits. */ struct vm_object *p_upages_obj;/* Upages object */ struct procsig *p_procsig; #define p_sigacts p_procsig->ps_sigacts #define p_sigignore p_procsig->ps_sigignore #define p_sigcatch p_procsig->ps_sigcatch #define p_ucred p_cred->pc_ucred #define p_rlimit p_limit->pl_rlimit int p_flag; /* P_* flags. */ char p_stat; /* S* process status. */ char p_pad1[3]; pid_t p_pid; /* Process identifier. */ LIST_ENTRY(proc) p_hash; /* Hash chain. */ LIST_ENTRY(proc) p_pglist; /* List of processes in pgrp. */ struct proc *p_pptr; /* Pointer to parent process. 
*/ LIST_ENTRY(proc) p_sibling; /* List of sibling processes. */ LIST_HEAD(, proc) p_children; /* Pointer to list of children. */ struct callout_handle p_ithandle; /* * Callout handle for scheduling * p_realtimer. */ /* The following fields are all zeroed upon creation in fork. */ #define p_startzero p_oppid pid_t p_oppid; /* Save parent pid during ptrace. XXX */ int p_dupfd; /* Sideways return value from fdopen. XXX */ struct vmspace *p_vmspace; /* Address space. */ /* scheduling */ u_int p_estcpu; /* Time averaged value of p_cpticks. */ int p_cpticks; /* Ticks of cpu time. */ fixpt_t p_pctcpu; /* %cpu for this process during p_swtime */ void *p_wchan; /* Sleep address. */ const char *p_wmesg; /* Reason for sleep. */ u_int p_swtime; /* Time swapped in or out. */ u_int p_slptime; /* Time since last blocked. */ struct itimerval p_realtimer; /* Alarm timer. */ u_int64_t p_runtime; /* Real time in microsec. */ u_quad_t p_uticks; /* Statclock hits in user mode. */ u_quad_t p_sticks; /* Statclock hits in system mode. */ u_quad_t p_iticks; /* Statclock hits processing intr. */ int p_traceflag; /* Kernel trace points. */ struct vnode *p_tracep; /* Trace to vnode. */ int p_siglist; /* Signals arrived but not delivered. */ struct vnode *p_textvp; /* Vnode of executable. */ char p_lock; /* Process lock (prevent swap) count. 
*/ u_char p_oncpu; /* Which cpu we are on */ u_char p_lastcpu; /* Last cpu we were on */ char p_pad2; /* alignment */ short p_locks; /* DEBUG: lockmgr count of held locks */ short p_simple_locks; /* DEBUG: count of held simple locks */ unsigned int p_stops; /* procfs event bitmask */ unsigned int p_stype; /* procfs stop event type */ char p_step; /* procfs stop *once* flag */ unsigned char p_pfsflags; /* procfs flags */ char p_pad3[2]; /* padding for alignment */ register_t p_retval[2]; /* syscall aux returns */ struct sigiolst p_sigiolst; /* list of sigio sources */ int p_sigparent; /* signal to parent on exit */ sigset_t p_oldsigmask; /* saved mask from before sigpause */ int p_sig; /* for core dump/debugger XXX */ u_long p_code; /* for core dump/debugger XXX */ /* End area that is zeroed on creation. */ #define p_endzero p_startcopy /* The following fields are all copied upon creation in fork. */ #define p_startcopy p_sigmask sigset_t p_sigmask; /* Current signal mask. */ u_char p_priority; /* Process priority. */ u_char p_usrpri; /* User-priority based on p_cpu and p_nice. */ char p_nice; /* Process "nice" value. */ char p_comm[MAXCOMLEN+1]; struct pgrp *p_pgrp; /* Pointer to process group. */ struct sysentvec *p_sysent; /* System call dispatch information. */ struct rtprio p_rtprio; /* Realtime priority. */ struct prison *p_prison; /* End area that is copied on creation. */ #define p_endcopy p_addr struct user *p_addr; /* Kernel virtual addr of u-area (PROC ONLY). */ struct mdproc p_md; /* Any machine-dependent fields. */ u_short p_xstat; /* Exit status for wait; also stop signal. */ u_short p_acflag; /* Accounting flags. */ struct rusage *p_ru; /* Exit information. XXX */ int p_nthreads; /* number of threads (only in leader) */ void *p_aioinfo; /* ASYNC I/O info */ int p_wakeup; /* thread id */ struct proc *p_peers; struct proc *p_leader; struct pasleep p_asleep; /* Used by asleep()/await(). 
*/ void *p_emuldata; /* process-specific emulator state data */ }; #define p_session p_pgrp->pg_session #define p_pgid p_pgrp->pg_id /* Status values. */ #define SIDL 1 /* Process being created by fork. */ #define SRUN 2 /* Currently runnable. */ #define SSLEEP 3 /* Sleeping on an address. */ #define SSTOP 4 /* Process debugging or suspension. */ #define SZOMB 5 /* Awaiting collection by parent. */ /* These flags are kept in p_flags. */ #define P_ADVLOCK 0x00001 /* Process may hold a POSIX advisory lock. */ #define P_CONTROLT 0x00002 /* Has a controlling terminal. */ #define P_INMEM 0x00004 /* Loaded into memory. */ #define P_NOCLDSTOP 0x00008 /* No SIGCHLD when children stop. */ #define P_PPWAIT 0x00010 /* Parent is waiting for child to exec/exit. */ #define P_PROFIL 0x00020 /* Has started profiling. */ #define P_SELECT 0x00040 /* Selecting; wakeup/waiting danger. */ #define P_SINTR 0x00080 /* Sleep is interruptible. */ #define P_SUGID 0x00100 /* Had set id privileges since last exec. */ #define P_SYSTEM 0x00200 /* System proc: no sigs, stats or swapping. */ #define P_TIMEOUT 0x00400 /* Timing out during sleep. */ #define P_TRACED 0x00800 /* Debugged process being traced. */ #define P_WAITED 0x01000 /* Debugging process has waited for child. */ #define P_WEXIT 0x02000 /* Working on exiting. */ #define P_EXEC 0x04000 /* Process called exec. */ /* Should probably be changed into a hold count. */ /* was P_NOSWAP 0x08000 was: Do not swap upages; p->p_hold */ /* was P_PHYSIO 0x10000 was: Doing physical I/O; use p->p_hold */ /* Should be moved to machine-dependent areas. */ #define P_OWEUPC 0x20000 /* Owe process an addupc() call at next ast. */ #define P_SWAPPING 0x40000 /* Process is being swapped. 
*/ #define P_SWAPINREQ 0x80000 /* Swapin request due to wakeup */ /* Marked a kernel thread */ -#define P_FLSINPROG 0x100000 /* dirty buffers flush is in progress */ +#define P_BUFEXHAUST 0x100000 /* dirty buffers flush is in progress */ #define P_KTHREADP 0x200000 /* Process is really a kernel thread */ #define P_NOCLDWAIT 0x400000 /* No zombies if child dies */ #define P_DEADLKTREAT 0x800000 /* lock aquisition - deadlock treatment */ #define P_JAILED 0x1000000 /* Process is in jail */ /* * MOVE TO ucred.h? * * Shareable process credentials (always resident). This includes a reference * to the current user credentials as well as real and saved ids that may be * used to change ids. */ struct pcred { struct ucred *pc_ucred; /* Current credentials. */ uid_t p_ruid; /* Real user id. */ uid_t p_svuid; /* Saved effective user id. */ gid_t p_rgid; /* Real group id. */ gid_t p_svgid; /* Saved effective group id. */ int p_refcnt; /* Number of references. */ }; struct prochd { struct proc *ph_link; /* Linked list of running processes. */ struct proc *ph_rlink; }; #ifdef KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_SESSION); MALLOC_DECLARE(M_SUBPROC); MALLOC_DECLARE(M_ZOMBIE); #endif /* flags for suser_xxx() */ #define PRISON_ROOT 1 /* Handy macro to determine of p1 can mangle p2 */ #define PRISON_CHECK(p1, p2) \ ((!(p1)->p_prison) || (p1)->p_prison == (p2)->p_prison) /* * We use process IDs <= PID_MAX; PID_MAX + 1 must also fit in a pid_t, * as it is used to represent "no process group". 
*/ #define PID_MAX 99999 #define NO_PID 100000 #define SESS_LEADER(p) ((p)->p_session->s_leader == (p)) #define SESSHOLD(s) ((s)->s_count++) #define SESSRELE(s) { \ if (--(s)->s_count == 0) \ FREE(s, M_SESSION); \ } extern void stopevent(struct proc*, unsigned int, unsigned int); #define STOPEVENT(p,e,v) do { \ if ((p)->p_stops & (e)) stopevent(p,e,v); } while (0) /* hold process U-area in memory, normally for ptrace/procfs work */ #define PHOLD(p) { \ if ((p)->p_lock++ == 0 && ((p)->p_flag & P_INMEM) == 0) \ faultin(p); \ } #define PRELE(p) (--(p)->p_lock) #define PIDHASH(pid) (&pidhashtbl[(pid) & pidhash]) extern LIST_HEAD(pidhashhead, proc) *pidhashtbl; extern u_long pidhash; #define PGRPHASH(pgid) (&pgrphashtbl[(pgid) & pgrphash]) extern LIST_HEAD(pgrphashhead, pgrp) *pgrphashtbl; extern u_long pgrphash; #ifndef SET_CURPROC #define SET_CURPROC(p) (curproc = (p)) #endif #ifndef curproc extern struct proc *curproc; /* Current running proc. */ extern int switchticks; /* `ticks' at last context switch. */ extern struct timeval switchtime; /* Uptime at last context switch */ #endif extern struct proc proc0; /* Process slot for swapper. */ extern int hogticks; /* Limit on kernel cpu hogs. */ extern int nprocs, maxproc; /* Current and max number of procs. */ extern int maxprocperuid; /* Max procs per uid. */ extern int sched_quantum; /* Scheduling quantum in ticks */ LIST_HEAD(proclist, proc); extern struct proclist allproc; /* List of all processes. */ extern struct proclist zombproc; /* List of zombie processes. */ extern struct proc *initproc, *pageproc, *updateproc; /* Process slots for init, pager. */ #define NQS 32 /* 32 run queues. */ extern struct prochd qs[]; extern struct prochd rtqs[]; extern struct prochd idqs[]; extern int whichqs; /* Bit mask summary of non-empty Q's. */ extern int whichrtqs; /* Bit mask summary of non-empty Q's. */ extern int whichidqs; /* Bit mask summary of non-empty Q's. */ struct proc *pfind __P((pid_t)); /* Find process by id. 
*/ struct pgrp *pgfind __P((pid_t)); /* Find process group by id. */ struct vm_zone; extern struct vm_zone *proc_zone; int chgproccnt __P((uid_t uid, int diff)); int enterpgrp __P((struct proc *p, pid_t pgid, int mksess)); void fixjobc __P((struct proc *p, struct pgrp *pgrp, int entering)); int inferior __P((struct proc *p)); int leavepgrp __P((struct proc *p)); void mi_switch __P((void)); void procinit __P((void)); void resetpriority __P((struct proc *)); int roundrobin_interval __P((void)); void setrunnable __P((struct proc *)); void setrunqueue __P((struct proc *)); void sleepinit __P((void)); int suser __P((struct proc *)); int suser_xxx __P((struct ucred *cred, struct proc *proc, int flag)); void remrq __P((struct proc *)); void cpu_switch __P((struct proc *)); void unsleep __P((struct proc *)); void wakeup_one __P((void *chan)); void cpu_exit __P((struct proc *)) __dead2; void exit1 __P((struct proc *, int)) __dead2; void cpu_fork __P((struct proc *, struct proc *)); void cpu_set_fork_handler __P((struct proc *, void (*)(void *), void *)); int fork1 __P((struct proc *, int, struct proc **)); int trace_req __P((struct proc *)); void cpu_wait __P((struct proc *)); int cpu_coredump __P((struct proc *, struct vnode *, struct ucred *)); void setsugid __P((struct proc *p)); void faultin __P((struct proc *p)); #endif /* KERNEL */ #endif /* !_SYS_PROC_H_ */ diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index 414c922d1573..90406fae94c5 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -1,1461 +1,1459 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.142 1999/06/26 14:56:58 peter Exp $ + * $Id: vm_pageout.c,v 1.143 1999/07/01 13:21:46 peter Exp $ */ /* * The proverbial page-out daemon. 
*/ #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout __P((void)); static int vm_pageout_clean __P((vm_page_t)); static int vm_pageout_scan __P((void)); static int vm_pageout_free_page_calc __P((vm_size_t count)); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) #if !defined(NO_SWAPPING) /* the kernel process "vm_daemon"*/ static void vm_daemon __P((void)); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) #endif int vm_pages_needed=0; /* Event on which pageout daemon sleeps */ int vm_pageout_deficit=0; /* Estimated number of pages deficit */ int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */ extern int npendingio; #if !defined(NO_SWAPPING) static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; #endif extern int nswiodone; extern int vm_swap_size; static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; static int vm_pageout_full_stats_interval = 0; static int vm_pageout_stats_free_max=0, vm_pageout_algorithm_lru=0; static int defer_swap_pageouts=0; static int disable_swap_pageouts=0; static int max_page_launder=100; #if defined(NO_SWAPPING) static int vm_swap_enabled=0; static int vm_swap_idle_enabled=0; #else static int vm_swap_enabled=1; static int vm_swap_idle_enabled=0; #endif SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, CTLFLAG_RW, &vm_pageout_algorithm_lru, 0, "LRU page mgmt"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 
SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); #if defined(NO_SWAPPING) SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RD, &vm_swap_enabled, 0, ""); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); #else SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); #endif SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); SYSCTL_INT(_vm, OID_AUTO, max_page_launder, CTLFLAG_RW, &max_page_launder, 0, "Maximum number of pages to clean per pass"); #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ #if !defined(NO_SWAPPING) typedef void freeer_fcn_t __P((vm_map_t, vm_object_t, vm_pindex_t, int)); static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_pindex_t)); static freeer_fcn_t vm_pageout_object_deactivate_pages; static void vm_req_vmdaemon __P((void)); #endif static void vm_pageout_page_stats(void); /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. Note the careful timing, however, the busy bit isn't set till * late and we cannot do anything that will mess with the page. 
*/ static int vm_pageout_clean(m) vm_page_t m; { register vm_object_t object; vm_page_t mc[2*vm_pageout_page_count]; int pageout_count; int i, forward_okay, backward_okay, page_base; vm_pindex_t pindex = m->pindex; object = m->object; /* * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP * with the new swapper, but we could have serious problems paging * out other object types if there is insufficient memory. * * Unfortunately, checking free memory here is far too late, so the * check has been moved up a procedural level. */ #if 0 /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((object->type == OBJT_DEFAULT) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; #endif /* * Don't mess with the page if it's busy. */ if ((m->hold_count != 0) || ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; #if 0 /* * XXX REMOVED XXX. vm_object_collapse() can block, which can * change the page state. Calling vm_object_collapse() might also * destroy or rename the page because we have not busied it yet!!! * So this code segment is removed. */ /* * Try collapsing before it's too late. XXX huh? Why are we doing * this here? */ if (object->backing_object) { vm_object_collapse(object); } #endif mc[vm_pageout_page_count] = m; pageout_count = 1; page_base = vm_pageout_page_count; forward_okay = TRUE; if (pindex != 0) backward_okay = TRUE; else backward_okay = FALSE; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. */ for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) { vm_page_t p; /* * See if forward page is clusterable. */ if (forward_okay) { /* * Stop forward scan at end of object. 
*/ if ((pindex + i) > object->size) { forward_okay = FALSE; goto do_backward; } p = vm_page_lookup(object, pindex + i); if (p) { if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { forward_okay = FALSE; goto do_backward; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && (p->queue == PQ_INACTIVE) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[vm_pageout_page_count + i] = p; pageout_count++; if (pageout_count == vm_pageout_page_count) break; } else { forward_okay = FALSE; } } else { forward_okay = FALSE; } } do_backward: /* * See if backward page is clusterable. */ if (backward_okay) { /* * Stop backward scan at beginning of object. */ if ((pindex - i) == 0) { backward_okay = FALSE; } p = vm_page_lookup(object, pindex - i); if (p) { if (((p->queue - p->pc) == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { backward_okay = FALSE; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && (p->queue == PQ_INACTIVE) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[vm_pageout_page_count - i] = p; pageout_count++; page_base--; if (pageout_count == vm_pageout_page_count) break; } else { backward_okay = FALSE; } } else { backward_okay = FALSE; } } } /* * we allow reads during pageouts... */ return vm_pageout_flush(&mc[page_base], pageout_count, 0); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. */ int vm_pageout_flush(mc, count, flags) vm_page_t *mc; int count; int flags; { register vm_object_t object; int pageout_status[count]; int numpagedout = 0; int i; /* * Initiate I/O. Bump the vm_page_t->busy counter and * mark the pages read-only. * * We do not have to fixup the clean/dirty bits here... 
we can * allow the pager to do it after the I/O completes. */ for (i = 0; i < count; i++) { vm_page_io_start(mc[i]); vm_page_protect(mc[i], VM_PROT_READ); } object = mc[0]->object; vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, (flags | ((object == kernel_object) ? OBJPC_SYNC : 0)), pageout_status); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; switch (pageout_status[i]) { case VM_PAGER_OK: numpagedout++; break; case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ pmap_clear_modify(VM_PAGE_TO_PHYS(mt)); mt->dirty = 0; break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_io_finish(mt); } } return numpagedout; } #if !defined(NO_SWAPPING) /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. 
*/ static void vm_pageout_object_deactivate_pages(map, object, desired, map_remove_only) vm_map_t map; vm_object_t object; vm_pindex_t desired; int map_remove_only; { register vm_page_t p, next; int rcount; int remove_mode; int s; if (object->type == OBJT_DEVICE) return; while (object) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) return; if (object->paging_in_progress) return; remove_mode = map_remove_only; if (object->shadow_count > 1) remove_mode = 1; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = TAILQ_FIRST(&object->memq); while (p && (rcount-- > 0)) { int actcount; if (pmap_resident_count(vm_map_pmap(map)) <= desired) return; next = TAILQ_NEXT(p, listq); cnt.v_pdpages++; if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || (p->flags & PG_BUSY) || !pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { p = next; continue; } actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(p)); if (actcount) { vm_page_flag_set(p, PG_REFERENCED); } else if (p->flags & PG_REFERENCED) { actcount = 1; } if ((p->queue != PQ_ACTIVE) && (p->flags & PG_REFERENCED)) { vm_page_activate(p); p->act_count += actcount; vm_page_flag_clear(p, PG_REFERENCED); } else if (p->queue == PQ_ACTIVE) { if ((p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); if (!remove_mode && (vm_pageout_algorithm_lru || (p->act_count == 0))) { vm_page_protect(p, VM_PROT_NONE); vm_page_deactivate(p); } else { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); splx(s); } } else { vm_page_activate(p); vm_page_flag_clear(p, PG_REFERENCED); if (p->act_count < (ACT_MAX - ACT_ADVANCE)) p->act_count += ACT_ADVANCE; s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); splx(s); } } else if (p->queue == PQ_INACTIVE) { vm_page_protect(p, VM_PROT_NONE); } p = next; } object = object->backing_object; } return; } /* * 
deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. */ static void vm_pageout_map_deactivate_pages(map, desired) vm_map_t map; vm_pindex_t desired; { vm_map_entry_t tmpe; vm_object_t obj, bigobj; if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT, (void *)0, curproc)) { return; } bigobj = NULL; /* * first, search out the biggest object, and try to free pages from * that. */ tmpe = map->header.next; while (tmpe != &map->header) { if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if ((obj != NULL) && (obj->shadow_count <= 1) && ((bigobj == NULL) || (bigobj->resident_page_count < obj->resident_page_count))) { bigobj = obj; } } tmpe = tmpe->next; } if (bigobj) vm_pageout_object_deactivate_pages(map, bigobj, desired, 0); /* * Next, hunt around for other pages to deactivate. We actually * do this search sort of wrong -- .text first is not the best idea. */ tmpe = map->header.next; while (tmpe != &map->header) { if (pmap_resident_count(vm_map_pmap(map)) <= desired) break; if ((tmpe->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) { obj = tmpe->object.vm_object; if (obj) vm_pageout_object_deactivate_pages(map, obj, desired, 0); } tmpe = tmpe->next; }; /* * Remove all mappings if a process is swapped out, this will free page * table pages. */ if (desired == 0) pmap_remove(vm_map_pmap(map), VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); vm_map_unlock(map); return; } #endif /* * Don't try to be fancy - being fancy can lead to VOP_LOCK's and therefore * to vnode deadlocks. We only do it for OBJT_DEFAULT and OBJT_SWAP objects * which we know can be trivially freed. 
*/ void vm_pageout_page_free(vm_page_t m) { vm_object_t object = m->object; int type = object->type; if (type == OBJT_SWAP || type == OBJT_DEFAULT) vm_object_reference(object); vm_page_busy(m); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); if (type == OBJT_SWAP || type == OBJT_DEFAULT) vm_object_deallocate(object); } /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static int vm_pageout_scan() { vm_page_t m, next; int page_shortage, maxscan, pcount; int addl_page_shortage, addl_page_shortage_init; int maxlaunder; int launder_loop = 0; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; int force_wakeup = 0; int actcount; int vnodes_skipped = 0; int s; /* * Do whatever cleanup that the pmap code can. */ pmap_collect(); addl_page_shortage_init = vm_pageout_deficit; vm_pageout_deficit = 0; if (max_page_launder == 0) max_page_launder = 1; /* * Calculate the number of pages we want to either free or move * to the cache. */ page_shortage = (cnt.v_free_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_cache_count); page_shortage += addl_page_shortage_init; /* * Figure out what to do with dirty pages when they are encountered. * Assume that 1/3 of the pages on the inactive list are clean. If * we think we can reach our target, disable laundering (do not * clean any dirty pages). If we miss the target we will loop back * up and do a laundering run. */ if (cnt.v_inactive_count / 3 > page_shortage) { maxlaunder = 0; launder_loop = 0; } else { maxlaunder = (cnt.v_inactive_target > max_page_launder) ? max_page_launder : cnt.v_inactive_target; launder_loop = 1; } /* * Start scanning the inactive queue for pages we can move to the * cache or free. The scan will stop when the target is reached or * we have scanned the entire inactive queue. 
*/ rescan0: addl_page_shortage = addl_page_shortage_init; maxscan = cnt.v_inactive_count; - for ( - m = TAILQ_FIRST(&vm_page_queue_inactive); - m != NULL && maxscan-- > 0 && page_shortage > 0; - m = next - ) { + for (m = TAILQ_FIRST(&vm_page_queue_inactive); + m != NULL && maxscan-- > 0 && page_shortage > 0; + m = next) { cnt.v_pdpages++; if (m->queue != PQ_INACTIVE) { goto rescan0; } next = TAILQ_NEXT(m, pageq); if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); addl_page_shortage++; continue; } /* * Dont mess with busy pages, keep in the front of the * queue, most likely are being paged out. */ if (m->busy || (m->flags & PG_BUSY)) { addl_page_shortage++; continue; } /* * If the object is not being used, we ignore previous * references. */ if (m->object->ref_count == 0) { vm_page_flag_clear(m, PG_REFERENCED); pmap_clear_reference(VM_PAGE_TO_PHYS(m)); /* * Otherwise, if the page has been referenced while in the * inactive queue, we bump the "activation count" upwards, * making it less likely that the page will be added back to * the inactive queue prematurely again. Here we check the * page tables (or emulated bits, if any), given the upper * level VM system not knowing anything about existing * references. */ } else if (((m->flags & PG_REFERENCED) == 0) && (actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)))) { vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE); continue; } /* * If the upper level VM system knows about any page * references, we activate the page. We also set the * "activation count" higher than normal so that we will less * likely place pages back onto the inactive queue again. 
*/ if ((m->flags & PG_REFERENCED) != 0) { vm_page_flag_clear(m, PG_REFERENCED); actcount = pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE + 1); continue; } /* * If the upper level VM system doesn't know anything about * the page being dirty, we have to check for it again. As * far as the VM code knows, any partially dirty pages are * fully dirty. */ if (m->dirty == 0) { vm_page_test_dirty(m); } else { vm_page_dirty(m); } /* * Invalid pages can be easily freed */ if (m->valid == 0) { vm_pageout_page_free(m); cnt.v_dfree++; --page_shortage; /* * Clean pages can be placed onto the cache queue. */ } else if (m->dirty == 0) { vm_page_cache(m); --page_shortage; /* * Dirty pages need to be paged out. Note that we clean * only a limited number of pages per pagedaemon pass. */ } else if (maxlaunder > 0) { int written; int swap_pageouts_ok; struct vnode *vp = NULL; object = m->object; if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { swap_pageouts_ok = 1; } else { swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min); } /* * We don't bother paging objects that are "dead". * Those objects are in a "rundown" state. */ if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); continue; } /* * For now we protect against potential memory * deadlocks by requiring significant memory to be * free if the object is not OBJT_DEFAULT or OBJT_SWAP. * We do not 'trust' any other object type to operate * with low memory, not even OBJT_DEVICE. The VM * allocator will special case allocations done by * the pageout daemon so the check below actually * does have some hysteresis in it. It isn't the best * solution, though. 
*/ - if ( - object->type != OBJT_DEFAULT && + if (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP && - cnt.v_free_count < cnt.v_free_reserved - ) { + cnt.v_free_count < cnt.v_free_reserved) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); - TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); + TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, + pageq); splx(s); continue; } /* * Presumably we have sufficient free memory to do * the more sophisticated checks and locking required * for vnodes. * * The object is already known NOT to be dead. The * vget() may still block, though, because * VOP_ISLOCKED() doesn't check to see if an inode * (v_data) is associated with the vnode. If it isn't, * vget() will load in it from disk. Worse, vget() * may actually get stuck waiting on "inode" if another * process is in the process of bringing the inode in. * This is bad news for us either way. * * So for the moment we check v_data == NULL as a * workaround. This means that vnodes which do not * use v_data in the way we expect probably will not * wind up being paged out by the pager and it will be * up to the syncer to get them. That's better then * us blocking here. * * This whole code section is bogus - we need to fix * the vnode pager to handle vm_page_t's without us * having to do any sophisticated VOP tests. */ if (object->type == OBJT_VNODE) { vp = object->handle; if (VOP_ISLOCKED(vp) || vp->v_data == NULL || vget(vp, LK_EXCLUSIVE|LK_NOOBJ, curproc)) { if ((m->queue == PQ_INACTIVE) && (m->hold_count == 0) && (m->busy == 0) && (m->flags & PG_BUSY) == 0) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); } if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; continue; } /* * The page might have been moved to another queue * during potential blocking in vget() above. 
*/ if (m->queue != PQ_INACTIVE) { if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); continue; } /* * The page may have been busied during the blocking in * vput(); We don't move the page back onto the end of * the queue so that statistics are more correct if we don't. */ if (m->busy || (m->flags & PG_BUSY)) { vput(vp); continue; } /* * If the page has become held, then skip it */ if (m->hold_count) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); splx(s); if (object->flags & OBJ_MIGHTBEDIRTY) vnodes_skipped++; vput(vp); continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ written = vm_pageout_clean(m); if (vp) vput(vp); maxlaunder -= written; } } /* * If we still have a page shortage and we didn't launder anything, * run the inactive scan again and launder something this time. */ if (launder_loop == 0 && page_shortage > 0) { launder_loop = 1; maxlaunder = (cnt.v_inactive_target > max_page_launder) ? max_page_launder : cnt.v_inactive_target; goto rescan0; } /* * Compute the page shortage from the point of view of having to * move pages from the active queue to the inactive queue. */ page_shortage = (cnt.v_inactive_target + cnt.v_cache_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); page_shortage += addl_page_shortage; /* * Scan the active queue for things we can deactivate */ pcount = cnt.v_active_count; m = TAILQ_FIRST(&vm_page_queue_active); while ((m != NULL) && (pcount-- > 0) && (page_shortage > 0)) { /* * This is a consistancy check, and should likely be a panic * or warning. */ if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. 
*/ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); m = next; continue; } /* * The count for pagedaemon pages is done after checking the * page for eligbility... */ cnt.v_pdpages++; /* * Check to see "how much" the page has been used. */ actcount = 0; if (m->object->ref_count != 0) { if (m->flags & PG_REFERENCED) { actcount += 1; } actcount += pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; } } /* * Since we have "tested" this bit, we need to clear it now. */ vm_page_flag_clear(m, PG_REFERENCED); /* * Only if an object is currently being used, do we use the * page activation count stats. */ if (actcount && (m->object->ref_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } else { m->act_count -= min(m->act_count, ACT_DECLINE); if (vm_pageout_algorithm_lru || (m->object->ref_count == 0) || (m->act_count == 0)) { page_shortage--; if (m->object->ref_count == 0) { vm_page_protect(m, VM_PROT_NONE); if (m->dirty == 0) vm_page_cache(m); else vm_page_deactivate(m); } else { vm_page_deactivate(m); } } else { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } } m = next; } s = splvm(); /* * We try to maintain some *really* free pages, this allows interrupt * code to be guaranteed space. Since both cache and free queues * are considered basically 'free', moving pages from cache to free * does not effect other calculations. 
*/ while (cnt.v_free_count < cnt.v_free_reserved) { static int cache_rover = 0; m = vm_page_list_find(PQ_CACHE, cache_rover, FALSE); if (!m) break; if ((m->flags & PG_BUSY) || m->busy || m->hold_count || m->wire_count) { #ifdef INVARIANTS printf("Warning: busy page %p found in cache\n", m); #endif vm_page_deactivate(m); continue; } cache_rover = (cache_rover + PQ_PRIME2) & PQ_L2_MASK; vm_pageout_page_free(m); cnt.v_dfree++; } splx(s); #if !defined(NO_SWAPPING) /* * Idle process swapout -- run once per second. */ if (vm_swap_idle_enabled) { static long lsec; if (time_second != lsec) { vm_pageout_req_swapout |= VM_SWAP_IDLE; vm_req_vmdaemon(); lsec = time_second; } } #endif /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. */ if ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_free_target + cnt.v_cache_min) ) { if (vnodes_skipped && (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) { (void) speedup_syncer(); } #if !defined(NO_SWAPPING) if (vm_swap_enabled && (cnt.v_free_count + cnt.v_cache_count < cnt.v_free_target)) { vm_req_vmdaemon(); vm_pageout_req_swapout |= VM_SWAP_NORMAL; } #endif } /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. */ if ((vm_swap_size == 0 || swap_pager_full) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) { bigproc = NULL; bigsize = 0; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { /* * if this is a system process, skip it */ if ((p->p_flag & P_SYSTEM) || (p->p_lock > 0) || (p->p_pid == 1) || ((p->p_pid < 48) && (vm_swap_size != 0))) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get the process size */ size = vmspace_resident_count(p->p_vmspace); /* * if the this process is bigger than the biggest one * remember it. 
*/ if (size > bigsize) { bigproc = p; bigsize = size; } } if (bigproc != NULL) { killproc(bigproc, "out of swap space"); bigproc->p_estcpu = 0; bigproc->p_nice = PRIO_MIN; resetpriority(bigproc); wakeup(&cnt.v_free_count); } } return force_wakeup; } /* * This routine tries to maintain the pseudo LRU active queue, * so that during long periods of time where there is no paging, * that some statistic accumlation still occurs. This code * helps the situation where paging just starts to occur. */ static void vm_pageout_page_stats() { int s; vm_page_t m,next; int pcount,tpcount; /* Number of pages to check */ static int fullintervalcount = 0; int page_shortage; page_shortage = (cnt.v_inactive_target + cnt.v_cache_max + cnt.v_free_min) - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) return; pcount = cnt.v_active_count; fullintervalcount += vm_pageout_stats_interval; if (fullintervalcount < vm_pageout_full_stats_interval) { tpcount = (vm_pageout_stats_max * cnt.v_active_count) / cnt.v_page_count; if (pcount > tpcount) pcount = tpcount; } m = TAILQ_FIRST(&vm_page_queue_active); while ((m != NULL) && (pcount-- > 0)) { int actcount; if (m->queue != PQ_ACTIVE) { break; } next = TAILQ_NEXT(m, pageq); /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); m = next; continue; } actcount = 0; if (m->flags & PG_REFERENCED) { vm_page_flag_clear(m, PG_REFERENCED); actcount += 1; } actcount += pmap_ts_referenced(VM_PAGE_TO_PHYS(m)); if (actcount) { m->act_count += ACT_ADVANCE + actcount; if (m->act_count > ACT_MAX) m->act_count = ACT_MAX; s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } else { if (m->act_count == 0) { /* * We turn off page access, so that we have more accurate * RSS stats. 
We don't do this in the normal page deactivation * when the system is loaded VM wise, because the cost of * the large number of page protect operations would be higher * than the value of doing the operation. */ vm_page_protect(m, VM_PROT_NONE); vm_page_deactivate(m); } else { m->act_count -= min(m->act_count, ACT_DECLINE); s = splvm(); TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); splx(s); } } m = next; } } static int vm_pageout_free_page_calc(count) vm_size_t count; { if (count < cnt.v_page_count) return 0; /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. */ if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; cnt.v_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE + cnt.v_interrupt_free_min; cnt.v_free_reserved = vm_pageout_page_count + cnt.v_pageout_free_min + (count / 768) + PQ_L2_SIZE; cnt.v_free_min += cnt.v_free_reserved; return 1; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count < 2000) vm_pageout_page_count = 8; vm_pageout_free_page_calc(cnt.v_page_count); /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. 
*/ if (cnt.v_free_count > 6144) cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved; else cnt.v_free_target = 2 * cnt.v_free_min + cnt.v_free_reserved; if (cnt.v_free_count > 2048) { cnt.v_cache_min = cnt.v_free_target; cnt.v_cache_max = 2 * cnt.v_cache_min; cnt.v_inactive_target = (3 * cnt.v_free_target) / 2; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } if (cnt.v_inactive_target > cnt.v_free_count / 3) cnt.v_inactive_target = cnt.v_free_count / 3; /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; if (vm_pageout_stats_max == 0) vm_pageout_stats_max = cnt.v_free_target; /* * Set interval in seconds for stats scan. */ if (vm_pageout_stats_interval == 0) vm_pageout_stats_interval = 5; if (vm_pageout_full_stats_interval == 0) vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; /* * Set maximum free per pass */ if (vm_pageout_stats_free_max == 0) vm_pageout_stats_free_max = 5; max_page_launder = (cnt.v_page_count > 1800 ? 32 : 16); + curproc->p_flag |= P_BUFEXHAUST; swap_pager_swap_init(); /* * The pageout daemon is never done, so loop forever. 
*/ while (TRUE) { int error; int s = splvm(); if (!vm_pages_needed || ((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_min)) { vm_pages_needed = 0; error = tsleep(&vm_pages_needed, PVM, "psleep", vm_pageout_stats_interval * hz); if (error && !vm_pages_needed) { splx(s); vm_pageout_page_stats(); continue; } } else if (vm_pages_needed) { vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", hz/2); } if (vm_pages_needed) cnt.v_pdwakeups++; vm_pages_needed = 0; splx(s); vm_pageout_scan(); vm_pageout_deficit = 0; wakeup(&cnt.v_free_count); } } void pagedaemon_wakeup() { if (!vm_pages_needed && curproc != pageproc) { vm_pages_needed++; wakeup(&vm_pages_needed); } } #if !defined(NO_SWAPPING) static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } static void vm_daemon() { struct proc *p; while (TRUE) { tsleep(&vm_daemon_needed, PPAUSE, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(vm_pageout_req_swapout); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { vm_pindex_t limit, size; /* * if this is a system process or if we have already * looked at this process, skip it. */ if (p->p_flag & (P_SYSTEM | P_WEXIT)) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get a limit */ limit = OFF_TO_IDX( qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max)); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) 
*/ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ size = vmspace_resident_count(p->p_vmspace); if (limit >= 0 && size >= limit) { vm_pageout_map_deactivate_pages( &p->p_vmspace->vm_map, limit); } } } } #endif diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index 2a0dbc8ce87c..1895d4fedac7 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -1,603 +1,605 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pager.c 8.6 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pager.c,v 1.48 1999/06/26 02:46:48 mckusick Exp $ + * $Id: vm_pager.c,v 1.49 1999/06/27 11:44:22 peter Exp $ */ /* * Paging space routine stubs. Emulates a matchmaker-like interface * for builtin pagers. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_VMPGDATA, "VM pgdata", "XXX: VM pager private data"); extern struct pagerops defaultpagerops; extern struct pagerops swappagerops; extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ static int dead_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static vm_object_t dead_pager_alloc __P((void *, vm_ooffset_t, vm_prot_t, vm_ooffset_t)); static void dead_pager_putpages __P((vm_object_t, vm_page_t *, int, int, int *)); static boolean_t dead_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); static void dead_pager_dealloc __P((vm_object_t)); static int dead_pager_getpages(obj, ma, count, req) vm_object_t obj; vm_page_t *ma; int count; int req; { return VM_PAGER_FAIL; } static vm_object_t dead_pager_alloc(handle, size, prot, off) void *handle; vm_ooffset_t size; vm_prot_t prot; vm_ooffset_t off; { return NULL; } static void dead_pager_putpages(object, m, count, flags, rtvals) vm_object_t object; vm_page_t *m; int count; int flags; int *rtvals; { int i; for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_AGAIN; } } static int dead_pager_haspage(object, pindex, prev, next) vm_object_t object; vm_pindex_t pindex; int *prev; int *next; { if (prev) *prev = 0; if (next) *next = 0; return FALSE; } static void dead_pager_dealloc(object) vm_object_t object; { return; } static struct pagerops deadpagerops = { NULL, dead_pager_alloc, dead_pager_dealloc, dead_pager_getpages, dead_pager_putpages, dead_pager_haspage, NULL }; struct pagerops *pagertab[] = { &defaultpagerops, /* OBJT_DEFAULT */ &swappagerops, /* OBJT_SWAP */ &vnodepagerops, /* OBJT_VNODE */ &devicepagerops, /* OBJT_DEVICE */ &deadpagerops /* OBJT_DEAD */ }; int npagers = sizeof(pagertab) / sizeof(pagertab[0]); /* * Kernel address space for mapping pages. 
* Used by pagers where KVAs are needed for IO. * * XXX needs to be large enough to support the number of pending async * cleaning requests (NPENDINGIO == 64) * the maximum swap cluster size * (MAXPHYS == 64k) if you want to get the most efficiency. */ #define PAGER_MAP_SIZE (8 * 1024 * 1024) int pager_map_size = PAGER_MAP_SIZE; vm_map_t pager_map; static int bswneeded; static vm_offset_t swapbkva; /* swap buffers kva */ void vm_pager_init() { struct pagerops **pgops; /* * Initialize known pagers */ for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops && ((*pgops)->pgo_init != NULL)) (*(*pgops)->pgo_init) (); } void vm_pager_bufferinit() { struct buf *bp; int i; bp = swbuf; /* * Now set up swap and physical I/O buffer headers. */ for (i = 0; i < nswbuf; i++, bp++) { TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist); BUF_LOCKINIT(bp); LIST_INIT(&bp->b_dep); bp->b_rcred = bp->b_wcred = NOCRED; bp->b_xflags = 0; } cluster_pbuf_freecnt = nswbuf / 2; swapbkva = kmem_alloc_pageable(pager_map, nswbuf * MAXPHYS); if (!swapbkva) panic("Not enough pager_map VM space for physical buffers"); } /* * Allocate an instance of a pager of the given type. * Size, protection and offset parameters are passed in for pagers that * need to perform page-level validation (e.g. the device pager). */ vm_object_t vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t off) { struct pagerops *ops; ops = pagertab[type]; if (ops) return ((*ops->pgo_alloc) (handle, size, prot, off)); return (NULL); } void vm_pager_deallocate(object) vm_object_t object; { (*pagertab[object->type]->pgo_dealloc) (object); } /* * vm_pager_strategy: * * called with no specific spl * Execute strategy routine directly to pager. 
*/ void vm_pager_strategy(vm_object_t object, struct buf *bp) { if (pagertab[object->type]->pgo_strategy) { (*pagertab[object->type]->pgo_strategy)(object, bp); } else { bp->b_flags |= B_ERROR; bp->b_error = ENXIO; biodone(bp); } } /* * vm_pager_get_pages() - inline, see vm/vm_pager.h * vm_pager_put_pages() - inline, see vm/vm_pager.h * vm_pager_has_page() - inline, see vm/vm_pager.h * vm_pager_page_inserted() - inline, see vm/vm_pager.h * vm_pager_page_removed() - inline, see vm/vm_pager.h */ #if 0 /* * vm_pager_sync: * * Called by pageout daemon before going back to sleep. * Gives pagers a chance to clean up any completed async pageing * operations. */ void vm_pager_sync() { struct pagerops **pgops; for (pgops = pagertab; pgops < &pagertab[npagers]; pgops++) if (pgops && ((*pgops)->pgo_sync != NULL)) (*(*pgops)->pgo_sync) (); } #endif vm_offset_t vm_pager_map_page(m) vm_page_t m; { vm_offset_t kva; kva = kmem_alloc_wait(pager_map, PAGE_SIZE); pmap_kenter(kva, VM_PAGE_TO_PHYS(m)); return (kva); } void vm_pager_unmap_page(kva) vm_offset_t kva; { pmap_kremove(kva); kmem_free_wakeup(pager_map, kva, PAGE_SIZE); } vm_object_t vm_pager_object_lookup(pg_list, handle) register struct pagerlst *pg_list; void *handle; { register vm_object_t object; for (object = TAILQ_FIRST(pg_list); object != NULL; object = TAILQ_NEXT(object,pager_object_list)) if (object->handle == handle) return (object); return (NULL); } /* * initialize a physical buffer */ static void initpbuf(struct buf *bp) { bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_NONE; bp->b_data = (caddr_t) (MAXPHYS * (bp - swbuf)) + swapbkva; bp->b_kvabase = bp->b_data; bp->b_kvasize = MAXPHYS; bp->b_xflags = 0; bp->b_flags = 0; bp->b_error = 0; BUF_LOCK(bp, LK_EXCLUSIVE); } /* * allocate a physical buffer * * There are a limited number (nswbuf) of physical buffers. 
We need * to make sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A * subsystem may initialize the counter to -1 to disable the feature, * but it must still be sure to match up all uses of getpbuf() with * relpbuf() using the same variable. * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. XXX */ struct buf * getpbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); +retry: if (pfreecnt) { while (*pfreecnt == 0) { tsleep(pfreecnt, PVM, "wswbuf0", 0); } } /* get a bp from the swap buffer header pool */ while ((bp = TAILQ_FIRST(&bswlist)) == NULL) { bswneeded = 1; tsleep(&bswneeded, PVM, "wswbuf1", 0); + goto retry; /* loop in case someone else grabbed one */ } TAILQ_REMOVE(&bswlist, bp, b_freelist); if (pfreecnt) --*pfreecnt; splx(s); initpbuf(bp); return bp; } /* * allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this * call understand how to use pfreecnt. */ struct buf * trypbuf(pfreecnt) int *pfreecnt; { int s; struct buf *bp; s = splvm(); if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist)) == NULL) { splx(s); return NULL; } TAILQ_REMOVE(&bswlist, bp, b_freelist); --*pfreecnt; splx(s); initpbuf(bp); return bp; } /* * release a physical buffer * * NOTE: pfreecnt can be NULL, but this 'feature' will be removed * relatively soon when the rest of the subsystems get smart about it. 
XXX
 */
void
relpbuf(bp, pfreecnt)
	struct buf *bp;
	int *pfreecnt;
{
	int s;

	s = splvm();

	/* Drop any credentials attached by getchainbuf()/users. */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	if (bp->b_vp)
		pbrelvp(bp);

	BUF_UNLOCK(bp);

	TAILQ_INSERT_HEAD(&bswlist, bp, b_freelist);

	/* Wake a getpbuf() sleeper waiting for the free list... */
	if (bswneeded) {
		bswneeded = 0;
		wakeup(&bswneeded);
	}
	/* ...and one waiting on the subsystem quota going 0 -> 1. */
	if (pfreecnt) {
		if (++*pfreecnt == 1)
			wakeup(pfreecnt);
	}
	splx(s);
}

/********************************************************
 *		CHAINING FUNCTIONS			*
 ********************************************************
 *
 *	These functions support recursion of I/O operations
 *	on bp's, typically by chaining one or more 'child' bp's
 *	to the parent.  Synchronous, asynchronous, and semi-synchronous
 *	chaining is possible.
 */

/*
 *	vm_pager_chain_iodone:
 *
 *	io completion routine for child bp.  Currently we fudge a bit
 *	on dealing with b_resid.  Since users of these routines may issue
 *	multiple children simultaneously, sequencing of the error can be lost.
 */
static void
vm_pager_chain_iodone(struct buf *nbp)
{
	struct buf *bp;

	if ((bp = nbp->b_chain.parent) != NULL) {
		/* Propagate child error (or short I/O, treated as EINVAL). */
		if (nbp->b_flags & B_ERROR) {
			bp->b_flags |= B_ERROR;
			bp->b_error = nbp->b_error;
		} else if (nbp->b_resid != 0) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EINVAL;
		} else {
			bp->b_resid -= nbp->b_bcount;
		}
		nbp->b_chain.parent = NULL;
		--bp->b_chain.count;
		/* Wake a waitchainbuf() sleeper on the parent. */
		if (bp->b_flags & B_WANT) {
			bp->b_flags &= ~B_WANT;
			wakeup(bp);
		}
		/*
		 * If autochaindone() armed the parent and this was the last
		 * outstanding child, complete the parent here.
		 */
		if (!bp->b_chain.count && (bp->b_flags & B_AUTOCHAINDONE)) {
			bp->b_flags &= ~B_AUTOCHAINDONE;
			if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
				bp->b_flags |= B_ERROR;
				bp->b_error = EINVAL;
			}
			biodone(bp);
		}
	}
	nbp->b_flags |= B_DONE;
	nbp->b_flags &= ~B_ASYNC;
	relpbuf(nbp, NULL);
}

/*
 *	getchainbuf:
 *
 *	Obtain a physical buffer and chain it to its parent buffer.  When
 *	I/O completes, the parent buffer will be B_SIGNAL'd.
Errors are
 *	automatically propagated to the parent
 *
 *	Since these are brand new buffers, we do not have to clear B_INVAL
 *	and B_ERROR because they are already clear.
 */
struct buf *
getchainbuf(struct buf *bp, struct vnode *vp, int flags)
{
	/* May sleep in getpbuf() until a pbuf is free. */
	struct buf *nbp = getpbuf(NULL);

	nbp->b_chain.parent = bp;
	++bp->b_chain.count;

	/* Throttle: never allow more than 4 outstanding children. */
	if (bp->b_chain.count > 4)
		waitchainbuf(bp, 4, 0);

	nbp->b_flags = B_CALL | (bp->b_flags & B_ORDERED) | flags;
	nbp->b_rcred = nbp->b_wcred = proc0.p_ucred;
	nbp->b_iodone = vm_pager_chain_iodone;

	crhold(nbp->b_rcred);
	crhold(nbp->b_wcred);

	if (vp)
		pbgetvp(vp, nbp);
	return(nbp);
}

/*
 * Start I/O on a chained child buffer, or complete it immediately if
 * it carries no data (b_bcount == 0).
 */
void
flushchainbuf(struct buf *nbp)
{
	if (nbp->b_bcount) {
		nbp->b_bufsize = nbp->b_bcount;
		if ((nbp->b_flags & B_READ) == 0)
			nbp->b_dirtyend = nbp->b_bcount;
		VOP_STRATEGY(nbp->b_vp, nbp);
	} else {
		biodone(nbp);
	}
}

/*
 * Sleep until the parent's outstanding-child count drops to 'count'
 * or below.  If 'done' is set, also complete the parent via biodone(),
 * flagging EINVAL for any unexplained residual.
 */
void
waitchainbuf(struct buf *bp, int count, int done)
{
 	int s;

	s = splbio();
	while (bp->b_chain.count > count) {
		/* vm_pager_chain_iodone() clears B_WANT and wakes us. */
		bp->b_flags |= B_WANT;
		tsleep(bp, PRIBIO + 4, "bpchain", 0);
	}
	if (done) {
		if (bp->b_resid != 0 && !(bp->b_flags & B_ERROR)) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EINVAL;
		}
		biodone(bp);
	}
	splx(s);
}

/*
 * Complete the parent now if no children remain; otherwise mark it
 * B_AUTOCHAINDONE so the final vm_pager_chain_iodone() completes it.
 */
void
autochaindone(struct buf *bp)
{
 	int s;

	s = splbio();
	if (bp->b_chain.count == 0)
		biodone(bp);
	else
		bp->b_flags |= B_AUTOCHAINDONE;
	splx(s);
}