Index: head/sys/amd64/amd64/machdep.c =================================================================== --- head/sys/amd64/amd64/machdep.c (revision 13489) +++ head/sys/amd64/amd64/machdep.c (revision 13490) @@ -1,1820 +1,1820 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.168 1996/01/04 21:10:53 wollman Exp $ + * $Id: machdep.c,v 1.169 1996/01/05 20:12:19 wollman Exp $ */ #include "npx.h" #include "isa.h" #include "opt_sysvipc.h" #include "opt_ddb.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void i486_bzero __P((void *, size_t)); extern void i586_bzero __P((void *, size_t)); extern void i686_bzero __P((void *, size_t)); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static void identifycpu(void); char machine[] = "i386"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[128]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, ""); struct kern_devconf kdc_cpu0 = { 0, 0, 0, /* filled in by dev_attach */ "cpu", 0, { MDDT_CPU }, 0, 0, 0, CPU_EXTERNALLEN, 0, /* CPU has no parent */ 0, /* no parentdata */ DC_BUSY, /* the CPU is always busy */ cpu_model, /* no sense in duplication */ DC_CLS_CPU /* class */ }; #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif #ifdef BOUNCE_BUFFERS extern char *bouncememory; extern int maxbkva; #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ extern int freebufspace; int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; int physmem = 0; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int boothowto = 0, bootverbose = 0, Maxmem = 0; static int badpages = 0; long dumplo; extern int bootdev; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) int cpu_class; static void dumpsys __P((void)); static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Initialize 
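A note for readers unfamiliar with the __P(()) wrapper on the prototypes above: it is the classic BSD <sys/cdefs.h> compatibility macro that lets one declaration serve both ANSI and K&R compilers. A minimal sketch of its definition (illustrative, not copied from this tree):

#if defined(__STDC__)
#define	__P(protos)	protos		/* ANSI C: keep the parameter list */
#else
#define	__P(protos)	()		/* K&R C: drop it */
#endif

/* Under an ANSI compiler the first extern above thus expands to: */
extern void init386(int first);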
error message buffer (at end of core). */ /* avail_end was pre-decremented in init_386() to compensate */ for (i = 0; i < btoc(sizeof (struct msgbuf)); i++) pmap_enter(pmap_kernel(), (vm_offset_t)msgbufp, avail_end + i * NBPG, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* * Good {morning,afternoon,evening,night}. */ printf(version); startrtclock(); identifycpu(); printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (badpages != 0) { int indx = 1; /* * XXX skip reporting ISA hole & unmanaged kernel memory */ if (phys_avail[0] == PAGE_SIZE) indx += 2; printf("Physical memory hole(s):\n"); for (; phys_avail[indx + 1] != 0; indx += 2) { int size = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size, size / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* #ifdef ISDN DONET(isdnintr, NETISR_ISDN); #endif */ /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 12, 1024); } nswbuf = min(nbuf, 128); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; } v = (caddr_t)((vm_offset_t)((vm_offset_t)v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*MAXBSIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ mclrefcnt = (char *)malloc(nmbclusters+CLBYTES/MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, nmbclusters+CLBYTES/MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, nmbclusters * MCLBYTES, FALSE); /* * Initialize callouts */ callfree = callout; for (i = 1; i < ncallout; i++) callout[i-1].c_next = &callout[i]; if (boothowto & RB_CONFIG) { userconfig(); cninit(); /* the preferred console may have changed */ } #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif /* * XXX allocate a contiguous area for ISA (non busmaster) DMA * operations. This _should_ only be done if the DMA channels * will actually be used, but for now we do it always. */ #define DMAPAGES 8 isaphysmem = vm_page_alloc_contig(DMAPAGES * PAGE_SIZE, 0, 0xfffffful, 64*1024); printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); /* * In verbose mode, print out the BIOS's idea of the disk geometries. */ if (bootverbose) { printf("BIOS Geometries:\n"); for (i = 0; i < N_BIOS_GEOM; i++) { unsigned long bios_geom; int max_cylinder, max_head, max_sector; bios_geom = bootinfo.bi_bios_geom[i]; /* * XXX the bootstrap punts a 1200K floppy geometry * when the get-disk-geometry interrupt fails. Skip * drives that have this geometry. 
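The valloc()/valloclim() two-pass sizing above is easier to see in isolation. A standalone userland sketch of the same idiom, with a hypothetical table and malloc() standing in for kmem_alloc(); note it leans on the same advance-a-pointer-from-a-NULL-base arithmetic that the kernel uses:

#include <stdlib.h>

#define VALLOC(name, type, num) \
	((name) = (type *)v, v = (char *)((name) + (num)))

int
main(void)
{
	char *v, *firstaddr = NULL;
	size_t size = 0;
	int *table, nentries = 128;		/* hypothetical structure */

again:
	v = firstaddr;
	VALLOC(table, int, nentries);
	if (firstaddr == NULL) {
		/* pass 1: v advanced from NULL, so its offset is the size */
		size = (size_t)(v - firstaddr);
		firstaddr = malloc(size);
		goto again;			/* pass 2: real addresses */
	}
	free(firstaddr);
	return (0);
}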
*/ if (bios_geom == 0x4f010f) continue; printf(" %x:%08lx ", i, bios_geom); max_cylinder = bios_geom >> 16; max_head = (bios_geom >> 8) & 0xff; max_sector = bios_geom & 0xff; printf( "0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n", max_cylinder, max_cylinder + 1, max_head, max_head + 1, max_sector, max_sector); } printf(" %d accounted for\n", bootinfo.bi_n_bios_used); } } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } static struct cpu_nameclass i386_cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "Pentium", CPUCLASS_586 }, /* CPU_586 */ { "Cy486DLC", CPUCLASS_486 }, /* CPU_486DLC */ { "Pentium Pro", CPUCLASS_686 }, /* CPU_686 */ }; static void identifycpu() { printf("CPU: "); if (cpu >= 0 && cpu < (sizeof i386_cpus/sizeof(struct cpu_nameclass))) { cpu_class = i386_cpus[cpu].cpu_class; strncpy(cpu_model, i386_cpus[cpu].cpu_name, sizeof cpu_model); } else { printf("unknown cpu type %d\n", cpu); panic("startup: bad cpu id"); } #if defined(I586_CPU) || defined(I686_CPU) if (cpu_class == CPUCLASS_586 || cpu_class == CPUCLASS_686) { calibrate_cyclecounter(); } #endif #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if (!strcmp(cpu_vendor,"GenuineIntel")) { if ((cpu_id & 0xf00) > 3) { cpu_model[0] = '\0'; switch (cpu_id & 0x3000) { case 0x1000: strcpy(cpu_model, "Overdrive "); break; case 0x2000: strcpy(cpu_model, "Dual "); break; } switch (cpu_id & 0xf00) { case 0x400: strcat(cpu_model, "i486 "); break; case 0x500: strcat(cpu_model, "Pentium"); /* nb no space */ break; case 0x600: strcat(cpu_model, "Pentium Pro"); break; default: strcat(cpu_model, "unknown"); break; } switch (cpu_id & 0xff0) { case 0x400: strcat(cpu_model, "DX"); break; case 0x410: strcat(cpu_model, "DX"); break; case 0x420: strcat(cpu_model, "SX"); break; case 0x430: strcat(cpu_model, "DX2"); break; case 0x440: strcat(cpu_model, "SL"); break; case 0x450: strcat(cpu_model, "SX2"); break; case 0x470: strcat(cpu_model, "DX2 Write-Back Enhanced"); break; case 0x480: strcat(cpu_model, "DX4"); break; break; } } } #endif printf("%s (", cpu_model); switch(cpu_class) { case CPUCLASS_286: printf("286"); break; #if defined(I386_CPU) case CPUCLASS_386: printf("386"); break; #endif #if defined(I486_CPU) case CPUCLASS_486: printf("486"); bzero = i486_bzero; break; #endif #if defined(I586_CPU) case CPUCLASS_586: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("586"); bzero = i586_bzero; break; #endif #if defined(I686_CPU) case CPUCLASS_686: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("686"); bzero = i686_bzero; break; #endif default: printf("unknown"); /* will panic below... 
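For reference, the cpu_id masks used by identifycpu() above slice the i386 ID word into stepping (bits 0-3), model (bits 4-7) and family (bits 8-11). A tiny standalone decoder with an example value, not kernel code:

#include <stdio.h>

int
main(void)
{
	unsigned cpu_id = 0x480;	/* example: family 4, model 8 (DX4) */

	printf("family %u, model %u, stepping %u\n",
	    (cpu_id >> 8) & 0xf, (cpu_id >> 4) & 0xf, cpu_id & 0xf);
	return (0);
}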
*/ } printf("-class CPU)\n"); #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if(*cpu_vendor) printf(" Origin = \"%s\"",cpu_vendor); if(cpu_id) printf(" Id = 0x%lx",cpu_id); if (!strcmp(cpu_vendor, "GenuineIntel")) { printf(" Stepping=%ld", cpu_id & 0xf); if (cpu_high > 0) { #define FEATUREFMT "\020\001FPU\002VME\003PSE\004MCE\005CX8\006APIC" printf("\n Features=0x%b", cpu_feature, FEATUREFMT); } } /* Avoid ugly blank lines: only print newline when we have to. */ if (*cpu_vendor || cpu_id) printf("\n"); #endif /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ #if !defined(I386_CPU) && !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU) #error This kernel is not configured for one of the supported CPUs #endif #if !defined(I386_CPU) case CPUCLASS_386: #endif #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif #if !defined(I686_CPU) case CPUCLASS_686: #endif panic("CPU class not configured"); default: break; } dev_attach(&kdc_cpu0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; unsigned code; { register struct proc *p = curproc; register int *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((psp->ps_flags & SAS_ALTSTACK) && (psp->ps_sigstk.ss_flags & SA_ONSTACK) == 0 && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SA_ONSTACK; } else { fp = (struct sigframe *)(regs[tESP] - sizeof(struct sigframe)); } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof (struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. 
*/ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs[tERR]; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs[tEAX]; sf.sf_sc.sc_ebx = regs[tEBX]; sf.sf_sc.sc_ecx = regs[tECX]; sf.sf_sc.sc_edx = regs[tEDX]; sf.sf_sc.sc_esi = regs[tESI]; sf.sf_sc.sc_edi = regs[tEDI]; sf.sf_sc.sc_cs = regs[tCS]; sf.sf_sc.sc_ds = regs[tDS]; sf.sf_sc.sc_ss = regs[tSS]; sf.sf_sc.sc_es = regs[tES]; sf.sf_sc.sc_isp = regs[tISP]; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs[tESP]; sf.sf_sc.sc_fp = regs[tEBP]; sf.sf_sc.sc_pc = regs[tEIP]; sf.sf_sc.sc_ps = regs[tEFLAGS]; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); }; regs[tESP] = (int)fp; regs[tEIP] = (int)((struct pcb *)kstack)->pcb_sigc; regs[tEFLAGS] &= ~PSL_VM; regs[tCS] = _ucodesel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tSS] = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap, retval) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; int *retval; { register struct sigcontext *scp; register struct sigframe *fp; register int *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs[tESP] points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), 0) == 0) return(EINVAL); /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = scp->sc_ps; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs[tEFLAGS] & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
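The EFLAGS_SECURE() test defined above is an XOR-and-mask check: the new value is accepted only if every bit that changed lies inside the user-changeable set. A self-contained demonstration; the mask below is an illustrative stand-in for PSL_USERCHANGE, not its real value:

#include <stdio.h>

#define USERCHANGE	0x00000cd5u	/* example: CF/PF/AF/ZF/SF/DF/OF */
#define SECURE(ef, oef)	((((ef) ^ (oef)) & ~USERCHANGE) == 0)

int
main(void)
{
	unsigned old = 0x00000202;	/* IF set, reserved bit 1 set */

	printf("%d\n", SECURE(old | 0x0001, old));	/* 1: CF may change */
	printf("%d\n", SECURE(old | 0x3000, old));	/* 0: IOPL may not */
	return (0);
}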
*/ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } /* restore scratch registers */ regs[tEAX] = scp->sc_eax; regs[tEBX] = scp->sc_ebx; regs[tECX] = scp->sc_ecx; regs[tEDX] = scp->sc_edx; regs[tESI] = scp->sc_esi; regs[tEDI] = scp->sc_edi; regs[tCS] = scp->sc_cs; regs[tDS] = scp->sc_ds; regs[tES] = scp->sc_es; regs[tSS] = scp->sc_ss; regs[tISP] = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), 0) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; p->p_sigmask = scp->sc_mask &~ (sigmask(SIGKILL)|sigmask(SIGCONT)|sigmask(SIGSTOP)); regs[tEBP] = scp->sc_fp; regs[tESP] = scp->sc_sp; regs[tEIP] = scp->sc_pc; regs[tEFLAGS] = eflags; return(EJUSTRETURN); } static int waittime = -1; static struct pcb dumppcb; __dead void boot(howto) int howto; { if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { register struct buf *bp; int iter, nbusy; waittime = 0; printf("\nsyncing disks... "); sync(&proc0, NULL, NULL); for (iter = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; } } if (nbusy == 0) break; printf("%d ", nbusy); DELAY(40000 * iter); } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("giving up\n"); #ifdef SHOW_BUSYBUFS nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno); } } DELAY(5000000); /* 5 seconds */ #endif } else { printf("done\n"); /* * Unmount filesystems */ if (panicstr == 0) vfs_unmountall(); } DELAY(100000); /* wait for console output to finish */ dev_shutdownall(FALSE); } splhigh(); if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cngetc(); } else { if (howto & RB_DUMP) { if (!cold) { savectx(&dumppcb, 0); dumppcb.pcb_ptd = rcr3(); dumpsys(); } if (PANIC_REBOOT_WAIT_TIME != 0) { if (PANIC_REBOOT_WAIT_TIME != -1) { int loop; printf("Automatic reboot in %d seconds - press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME); for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ if (cncheckc()) /* Did user type a key? */ break; } if (!loop) goto die; } } else { /* zero time specified - reboot NOW */ goto die; } printf("--> Press a key on the console to reboot <--\n"); cngetc(); } } die: printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ cpu_reset(); for(;;) ; /* NOTREACHED */ } /* * Magic number for savecore * * exported (symorder) and used at least by savecore(8) * */ u_long dumpmag = 0x8fca0101UL; static int dumpsize = 0; /* also for savecore */ static int dodump = 1; SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. 
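The disk-sync loop in boot() above polls a busy-buffer count up to 20 times, backing off a little longer on each pass before giving up. The same shape in a runnable sketch, where count_busy() and delay_us() are stand-ins for the buffer scan and DELAY():

#include <stdio.h>

static int busy = 5;				/* pretend 5 buffers start busy */
static int count_busy(void) { return (busy > 0 ? busy-- : 0); }
static void delay_us(long us) { (void)us; }	/* stand-in for DELAY() */

int
main(void)
{
	int iter, nbusy = 0;

	for (iter = 0; iter < 20; iter++) {
		if ((nbusy = count_busy()) == 0)
			break;
		printf("%d ", nbusy);
		delay_us(40000L * iter);	/* grow the delay each pass */
	}
	printf(nbusy ? "giving up\n" : "done\n");
	return (0);
}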
*/ static void dumpsys() { if (!dodump) return; if (dumpdev == NODEV) return; if ((minor(dumpdev)&07) != 1) return; if (!(bdevsw[major(dumpdev)])) return; if (!(bdevsw[major(dumpdev)]->d_dump)) return; dumpsize = Maxmem; printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo); printf("dump "); switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; default: printf("succeeded\n"); break; } } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { int *regs = p->p_md.md_regs; bzero(regs, sizeof(struct trapframe)); regs[tEIP] = entry; regs[tESP] = stack; regs[tEFLAGS] = PSL_USER | (regs[tEFLAGS] & PSL_T); regs[tSS] = _udatasel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tCS] = _ucodesel; p->p_addr->u_pcb.pcb_flags = 0; /* no fp at all */ load_cr0(rcr0() | CR0_TS); /* start emulating */ #if NNPX > 0 npxinit(__INITIAL_NPXCW__); #endif /* NNPX > 0 */ } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; union descriptor gdt[NGDT]; /* global descriptor table */ struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, 
/* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) kstack, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* 
segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall); #if defined(COMPAT_LINUX) || defined(LINUX) extern inthand_t IDTVEC(linux_syscall); #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; int pagesinbase, pagesinext; int target_page, pa_indx; proc0.p_addr = proc0paddr; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was - * i386_btop(i386_round_page(etext)) - 1. + * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. */ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * NBPG) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; /* Note. 
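sdtossd() above reassembles fields that the 386 descriptor layout splits apart: the 32-bit base into 24+8 bits and the 20-bit limit into 16+4. The same arithmetic in isolation, with made-up field values:

#include <stdio.h>

int
main(void)
{
	unsigned lobase = 0x345678, hibase = 0x12;	/* 24 + 8 bits */
	unsigned lolimit = 0xffff, hilimit = 0xf;	/* 16 + 4 bits */

	printf("base  0x%08x\n", (hibase << 24) | lobase);	/* 0x12345678 */
	printf("limit 0x%05x\n", (hilimit << 16) | lolimit);	/* 0xfffff */
	return (0);
}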
eventually want private ldts per process */ for (x = 0; x < NLDT; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #if defined(COMPAT_LINUX) || defined(LINUX) setidt(0x80, &IDTVEC(linux_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * Print a warning if the official BIOS interface disagrees * with the hackish interface used above. Eventually only * the official interface should be used. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) printf("BIOS basemem (%ldK) != RTC basemem (%dK)\n", bootinfo.bi_basemem, biosbasemem); if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) biosbasemem = 640; /* * Some 386 machines might give us a bogus number for extended * mem. If this happens, stop now. */ #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); /* NOTREACHED */ } #endif pagesinbase = biosbasemem * 1024 / NBPG; pagesinext = biosextmem * 1024 / NBPG; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. 
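The rtcin() reads above combine two byte-wide CMOS registers into a 16-bit count of kilobytes. The same combine as a standalone sketch; rtc_read() is a stand-in for rtcin() and the register contents are made up:

#include <stdio.h>

static unsigned rtc_read(int hi) { return (hi ? 0x02 : 0x80); }

int
main(void)
{
	unsigned basemem = rtc_read(0) + (rtc_read(1) << 8);

	printf("basemem = %uK\n", basemem);	/* 0x280 = 640K */
	return (0);
}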
*/ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of the physical address space. */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; #ifdef MAXMEM Maxmem = MAXMEM/4; #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. */ pa_indx = 0; badpages = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase); /* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_KW | PG_N | target_page; pmap_update(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one page past the last * valid page, making the range >= start and < end. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } else { badpages++; page_bad = FALSE; } } *(int *)CMAP1 = 0; pmap_update(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured, and u/iom is accessible */ /* make an initial tss so microp can get interrupt stack on syscall!
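The probe loop above qualifies each page by writing four patterns and reading them back through CADDR1. The core of that test, runnable against an ordinary word rather than a kernel mapping:

#include <stdio.h>

static int
word_ok(volatile unsigned *p)
{
	static const unsigned pat[] =
	    { 0xaaaaaaaa, 0x55555555, 0xffffffff, 0x00000000 };
	unsigned saved = *p;
	int i, good = 1;

	for (i = 0; i < 4; i++) {
		*p = pat[i];
		if (*p != pat[i])
			good = 0;
	}
	*p = saved;		/* restore, as the kernel loop does */
	return (good);
}

int
main(void)
{
	volatile unsigned word = 0;

	printf("%s\n", word_ok(&word) ? "good" : "bad");
	return (0);
}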
*/ proc0.p_addr->u_pcb.pcb_tss.tss_esp0 = (int) kstack + UPAGES*NBPG; proc0.p_addr->u_pcb.pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); ((struct i386tss *)gdt_segs[GPROC0_SEL].ssd_base)->tss_ioopt = (sizeof(struct i386tss))<<16; ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ bcopy(&sigcode, proc0.p_addr->u_pcb.pcb_sigc, szsigcode); proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_ptd = IdlePTD; } /* * The registers are in the frame; the frame is in the user area of * the process in question; when the process is active, the registers * are in "the kernel stack"; when it's not, they're still there, but * things get flipped around. So, since p->p_md.md_regs is the whole address * of the register set, take its offset from the kernel stack, and * index into the user block. Don't you just *love* virtual memory? * (I'm starting to think seymour is right...) */ #define TF_REGP(p) ((struct trapframe *) \ ((char *)(p)->p_addr \ + ((char *)(p)->p_md.md_regs - kstack))) int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { TF_REGP(p)->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { TF_REGP(p)->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
*/ min = (char *)p->p_md.md_regs - kstack; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = TF_REGP(p); frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include #define b_cylin b_resid /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? 
*/ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } /* calculate cylinder for disksort to order transfers with */ bp->b_pblkno = bp->b_blkno + p->p_offset; bp->b_cylin = bp->b_pblkno / lp->d_secpercyl; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } int disk_externalize(int drive, struct sysctl_req *req) { return SYSCTL_OUT(req, &drive, sizeof drive); } Index: head/sys/amd64/amd64/pmap.c =================================================================== --- head/sys/amd64/amd64/pmap.c (revision 13489) +++ head/sys/amd64/amd64/pmap.c (revision 13490) @@ -1,1954 +1,2167 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.71 1995/12/17 07:19:15 bde Exp $ + * $Id: pmap.c,v 1.72 1995/12/22 18:21:26 bde Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. 
Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#define PMAP_KEEP_PDIRS + +static void init_pv_entries __P((int)); + /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
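The recursive map described in the header comment above is what makes the pmap_pde()/vtopte() lookups cheap: with the page directory installed as its own entry at slot PTDPTDI, the pte for any va sits at a fixed virtual address. A sketch of that arithmetic; the slot number here is illustrative, not this tree's constant:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PTDPTDI		0x3bfUL			/* illustrative recursive slot */
#define PTMAP		(PTDPTDI << 22)		/* 4MB window of ptes */

int
main(void)
{
	unsigned long va = 0xdeadb000UL;

	printf("pte for 0x%08lx lives at 0x%08lx\n",
	    va, PTMAP + (va >> PAGE_SHIFT) * 4);
	return (0);
}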
*/ static vm_offset_t vm_first_phys; static int nkpt; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1; static pt_entry_t *CMAP2, *ptmmap; static pv_entry_t pv_table; caddr_t CADDR1, ptvmmap; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp; static void free_pv_entry __P((pv_entry_t pv)); static pt_entry_t * get_pt_entry __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); -static void init_pv_entries __P((int npg)); static void pmap_alloc_pv_entry __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static void pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa)); static int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static void pmap_remove_entry __P((struct pmap *pmap, pv_entry_t pv, vm_offset_t va)); static vm_page_t pmap_pte_vm_page __P((pmap_t pmap, vm_offset_t pt)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); /* + * The below are finer-grained pmap_update routines. These eliminate + * the gratuitous tlb flushes on non-i386 architectures. + */ +static __inline void +pmap_update_1pg( vm_offset_t va) { +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) + pmap_update(); + else +#endif + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va)); +} + +static __inline void +pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) { +#if defined(I386_CPU) + if (cpu_class == CPUCLASS_386) { + pmap_update(); + } else +#endif + { + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1)); + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2)); + } +} + +/* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ -inline pt_entry_t * __pure +__inline pt_entry_t * __pure pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return ((pt_entry_t *) avtopte(va)); } } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t pa; if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) { pa = *(int *) vtopte(va); /* otherwise, we are alternate address space */ } else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } pa = *(int *) avtopte(va); } return ((pa & PG_FRAME) | (va & ~PG_FRAME)); } return 0; } /* * determine if a page is managed (memory vs.
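The .byte 0xf,0x1,0x38 sequence in the new pmap_update_1pg()/pmap_update_2pg() above hand-assembles invlpg [eax] (opcode 0f 01 /7, with modrm 0x38 selecting [eax]) for assemblers that predate the mnemonic. With a newer GNU assembler the same thing could be written directly; a sketch, noting that invlpg is privileged and so only executes in kernel mode:

static __inline void
invlpg_va(unsigned long va)
{
	/* the very instruction the .byte sequence encodes */
	__asm __volatile("invlpg (%0)" : : "r" (va) : "memory");
}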
device) */ -static inline int +static __inline int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) return 1; } return 0; } /* * find the vm_page_t of a pte (only) given va of pte and pmap */ static __inline vm_page_t pmap_pte_vm_page(pmap, pt) pmap_t pmap; vm_offset_t pt; { vm_page_t m; - pt = i386_trunc_page(pt); - pt = (pt - UPT_MIN_ADDRESS) / NBPG; + pt = trunc_page(pt); + pt = (pt - UPT_MIN_ADDRESS) / PAGE_SIZE; pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME; m = PHYS_TO_VM_PAGE(pt); return m; } /* * Wire a page table page */ __inline void pmap_use_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_hold(pmap_pte_vm_page(pmap, pt)); } /* * Unwire a page table page */ -inline void +__inline void pmap_unuse_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; vm_page_t m; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); m = pmap_pte_vm_page(pmap, pt); vm_page_unhold(m); if (pmap != kernel_pmap && (m->hold_count == 0) && (m->wire_count == 0) && (va < KPT_MIN_ADDRESS)) { +/* + * We don't free page-table-pages anymore because it can have a negative + * impact on perf at times. Now we just deactivate, and it'll get cleaned + * up if needed... Also, if the page ends up getting used, it will fault + * back into the process address space and be reactivated. + */ +#ifdef PMAP_FREE_OLD_PTES pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); +#else + m->dirty = 0; + vm_page_deactivate(m); +#endif } } /* [ macro again?, should I force kstack into user map here? -wfj ] */ void pmap_activate(pmap, pcbp) register pmap_t pmap; struct pcb *pcbp; { PMAP_ACTIVATE(pmap, pcbp); } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = firstaddr; /* - * XXX The calculation of virtual_avail is wrong. It's NKPT*NBPG too + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD); kernel_pmap->pm_count = 1; nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. 
#define SYSMAP(c, p, v, n)	\
-	v = (c)va; va += ((n)*NBPG); p = pte; pte += (n);
+	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_avail;
	pte = pmap_pte(kernel_pmap, va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	SYSMAP(caddr_t, CMAP2, CADDR2, 1)

	/*
	 * ptmmap is used for reading arbitrary physical pages via /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufmap is used to map the system message buffer.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1)

	virtual_avail = va;

	*(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0;
	pmap_update();
}

/*
 * Initialize the pmap module.
 * Called by vm_init, to initialize any structures that the pmap
 * system needs to map virtual memory.
 * pmap_init has been enhanced to support discontiguous physical
 * memory in a fairly consistent way.
 */
void
pmap_init(phys_start, phys_end)
	vm_offset_t phys_start, phys_end;
{
	vm_offset_t addr;
	vm_size_t npg, s;
	int i;

	/*
	 * calculate the number of pv_entries needed
	 */
	vm_first_phys = phys_avail[0];
	for (i = 0; phys_avail[i + 1]; i += 2);
-	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG;
+	npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE;

	/*
	 * Allocate memory for random pmap data structures.  Includes the
	 * pv_head_table.
	 */
	s = (vm_size_t) (sizeof(struct pv_entry) * npg);
-	s = i386_round_page(s);
+	s = round_page(s);
	addr = (vm_offset_t) kmem_alloc(kernel_map, s);
	pv_table = (pv_entry_t) addr;

	/*
	 * init the pv free list
	 */
	init_pv_entries(npg);
	/*
	 * Now it is safe to enable pv_table recording.
	 */
	pmap_initialized = TRUE;
}

/*
 * Used to map a range of physical addresses into kernel
 * virtual address space.
 *
 * For now, VM is already on, we only need to map the
 * specified memory.
 */
vm_offset_t
pmap_map(virt, start, end, prot)
	vm_offset_t virt;
	vm_offset_t start;
	vm_offset_t end;
	int prot;
{
	while (start < end) {
		pmap_enter(kernel_pmap, virt, start, prot, FALSE);
		virt += PAGE_SIZE;
		start += PAGE_SIZE;
	}
	return (virt);
}

+#ifdef PMAP_KEEP_PDIRS
+int nfreepdir;
+caddr_t *pdirlist;
+#define NFREEPDIR 3
+
+static void *
+pmap_getpdir() {
+	caddr_t *pdir;
+	if (pdirlist) {
+		--nfreepdir;
+		pdir = pdirlist;
+		pdirlist = (caddr_t *) *pdir;
+		bzero( (caddr_t) pdir, PAGE_SIZE);
+	} else {
+		pdir = (caddr_t *) kmem_alloc(kernel_map, PAGE_SIZE);
+	}
+
+	return (void *) pdir;
+}
+
+static void
+pmap_freepdir(void *pdir) {
+	if (nfreepdir > NFREEPDIR) {
+		kmem_free(kernel_map, (vm_offset_t) pdir, PAGE_SIZE);
+	} else {
+		* (caddr_t *) pdir = (caddr_t) pdirlist;
+		pdirlist = (caddr_t *) pdir;
+		++nfreepdir;
+	}
+}
+#endif
+
/*
 * Initialize a preallocated and zeroed pmap structure,
 * such as one in a vmspace structure.
 */
void
pmap_pinit(pmap)
	register struct pmap *pmap;
{
	/*
	 * No need to allocate page table space yet but we do need a valid
	 * page directory table.
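pmap_getpdir()/pmap_freepdir() above cache a few page-directory pages by threading the free list through the pages themselves: the first word of each cached page points at the next one, so the cache needs no storage of its own. The same scheme in miniature, as a hypothetical user-space sketch (names and PAGE_SIZE here are illustrative, not the kernel's):

#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define NFREE 3			/* keep at most this many pages cached */

static void **pagelist;		/* head of the cache */
static int nfree;

static void *
getpage(void)
{
	void **p;

	if (pagelist) {
		--nfree;
		p = pagelist;
		pagelist = (void **) *p;	/* unlink via the page's first word */
		memset(p, 0, PAGE_SIZE);	/* hand back zeroed, like the kernel */
	} else
		p = calloc(1, PAGE_SIZE);
	return p;
}

static void
freepage(void *pg)
{
	if (nfree > NFREE)
		free(pg);
	else {
		*(void **) pg = pagelist;	/* link through the page itself */
		pagelist = (void **) pg;
		++nfree;
	}
}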
*/ + +#ifdef PMAP_KEEP_PDIRS + pmap->pm_pdir = pmap_getpdir(); +#else pmap->pm_pdir = (pd_entry_t *) kmem_alloc(kernel_map, PAGE_SIZE); +#endif /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(int *) (pmap->pm_pdir + PTDPTDI) = ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW; pmap->pm_count = 1; } /* * grow the number of kernel page table entries, if needed */ static vm_page_t nkpg; vm_offset_t kernel_vm_end; void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } - addr = (addr + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW); nkpg = NULL; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { +#ifdef PMAP_KEEP_PDIRS + pmap_freepdir( (void *)pmap->pm_pdir); +#else kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE); +#endif } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } -#define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2) +#define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ static int pv_freelistcnt; static pv_entry_t pv_freelist; static vm_offset_t pvva; static int npvvapg; /* * free the pv_entry back to the free list */ -inline static void +static __inline void free_pv_entry(pv) pv_entry_t pv; { if (!pv) return; ++pv_freelistcnt; pv->pv_next = pv_freelist; pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
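The allocator the comment above describes (get_pv_entry()/pmap_alloc_pv_entry() just below) is a classic low-watermark free list: keep at least PV_FREELIST_MIN entries on hand, refill a whole page at a time, and never go through malloc(). Stripped of the pmap specifics, the pattern looks like this hypothetical sketch (LOWAT, PERPAGE and the static arena are stand-ins):

#include <stddef.h>

struct obj { struct obj *next; };
#define LOWAT 32
#define PERPAGE 128

static struct obj *freelist;
static int freecnt;
static struct obj arena[16][PERPAGE];	/* stand-in for freshly wired pages */
static int arenapg;

static void
obj_free(struct obj *o)
{
	o->next = freelist;
	freelist = o;
	++freecnt;
}

static void
refill(void)
{
	int i;

	/* grab one "page" and carve it into objects, like pmap_alloc_pv_entry() */
	for (i = 0; i < PERPAGE; i++)
		obj_free(&arena[arenapg][i]);
	arenapg++;
}

static struct obj *
obj_alloc(void)
{
	struct obj *o;

	if (freecnt < LOWAT || freelist == NULL)
		refill();		/* must leave freelist non-empty */
	--freecnt;
	o = freelist;
	freelist = o->next;
	return o;
}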
*/ -static inline pv_entry_t +static __inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } /* * get a pv_entry off of the free list */ --pv_freelistcnt; tmp = pv_freelist; pv_freelist = tmp->pv_next; return tmp; } /* * this *strange* allocation routine *statistically* eliminates the * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? */ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, OFF_TO_IDX(pvva - vm_map_min(kernel_map)), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; - newentries = (NBPG / sizeof(struct pv_entry)); + newentries = (PAGE_SIZE / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ - pvva += NBPG; + pvva += PAGE_SIZE; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) */ - npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1) / NBPG; - pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); + npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + + PAGE_SIZE - 1) / PAGE_SIZE; + pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { return PTmap; } /* otherwise, we are alternate address space */ if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return APTmap; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static void pmap_remove_entry(pmap, pv, va) struct pmap *pmap; pv_entry_t pv; vm_offset_t va; { pv_entry_t npv; int s; s = splhigh(); if (pmap == pv->pv_pmap && va == pv->pv_va) { npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } else { - for (npv = pv->pv_next; npv; npv = npv->pv_next) { + for (npv = pv->pv_next; npv; (pv = npv, npv = pv->pv_next)) { if (pmap == npv->pv_pmap && va == npv->pv_va) { break; } - pv = npv; } if (npv) { pv->pv_next = npv->pv_next; free_pv_entry(npv); } } splx(s); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
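One idiom worth calling out before pmap_remove(): rounding an address up to the next page-directory boundary recurs throughout this file as (x + N) & ~(N - 1), which works because N is a power of two and always lands strictly beyond x. pmap_growkernel() above uses it with N = PAGE_SIZE * NPTEPG; pmap_remove() below uses the same trick in PTE-index space with N = NPTEPG. A stand-alone sanity check (user-space, illustrative values):

#include <assert.h>

#define PDRSZ (4096UL * 1024UL)		/* PAGE_SIZE * NPTEPG = 4MB */

int
main(void)
{
	/* always advances to the boundary strictly beyond the address */
	assert(((0x00401000UL + PDRSZ) & ~(PDRSZ - 1)) == 0x00800000UL);
	assert(((0x00400000UL + PDRSZ) & ~(PDRSZ - 1)) == 0x00800000UL);
	return 0;
}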
*/ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register pt_entry_t *ptp, *ptq; vm_offset_t pa; register pv_entry_t pv; vm_offset_t va; pt_entry_t oldpte; if (pmap == NULL) return; ptp = get_pt_entry(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ - if ((sva + NBPG) == eva) { + if ((sva + PAGE_SIZE) == eva) { if (*pmap_pde(pmap, sva) == 0) return; ptq = ptp + i386_btop(sva); if (!*ptq) return; /* * Update statistics */ if (pmap_pte_w(ptq)) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; pa = pmap_pte_pa(ptq); oldpte = *ptq; *ptq = 0; if (pmap_is_managed(pa)) { if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, sva); } pmap_unuse_pt(pmap, sva); - pmap_update(); + pmap_update_1pg(sva); return; } sva = i386_btop(sva); eva = i386_btop(eva); while (sva < eva) { /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (*pmap_pde(pmap, i386_ptob(sva)) == 0) { /* We can race ahead here, straight to next pde.. */ sva = ((sva + NPTEPG) & ~(NPTEPG - 1)); continue; } ptq = ptp + sva; /* * search for page table entries, use string operations that * are much faster than explicitly scanning when page tables * are not fully populated. */ if (*ptq == 0) { vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1)); vm_offset_t nscan = pdnxt - sva; int found = 0; if ((nscan + sva) > eva) nscan = eva - sva; asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(ptq), "=a"(found) : "c"(nscan), "0"(ptq) : "cx"); if (!found) { sva = pdnxt; continue; } ptq -= 1; sva = ptq - ptp; } /* * Update statistics */ oldpte = *ptq; if (((int) oldpte) & PG_W) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; /* * Invalidate the PTEs. XXX: should cluster them up and * invalidate as many as possible at once. */ *ptq = 0; va = i386_ptob(sva); /* * Remove from the PV table (raise IPL since we may be called * at interrupt time). */ pa = ((int) oldpte) & PG_FRAME; if (!pmap_is_managed(pa)) { - pmap_unuse_pt(pmap, va); + pmap_unuse_pt(pmap, (vm_offset_t) va); ++sva; continue; } if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, va); pmap_unuse_pt(pmap, va); ++sva; } pmap_update(); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { - register pv_entry_t pv, npv; + register pv_entry_t pv, opv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! 
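The inline assembly scan in pmap_remove() above (and again in pmap_protect() below) leans on repe scasl to skip runs of zero PTEs faster than a C loop of the day would. Its net effect in plain C is the sketch below (illustrative only; the real code backs the pointer up one slot after a hit, which is folded in here):

/*
 * Advance *pptq over zero PTEs, looking at no more than nscan slots.
 * Returns 1 with *pptq left on the first nonzero PTE, else 0 with
 * *pptq past the scanned run.
 */
static int
scan_nonzero(int **pptq, int nscan)
{
	int *p = *pptq;

	while (nscan-- > 0) {
		if (*p != 0) {
			*pptq = p;	/* found: leave pointer on the PTE */
			return 1;
		}
		p++;
	}
	*pptq = p;			/* exhausted: pointer past the run */
	return 0;
}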
 */
	if (!pmap_is_managed(pa))
		return;

-	pa = i386_trunc_page(pa);
-	pv = pa_to_pvh(pa);
-	m = PHYS_TO_VM_PAGE(pa);
+	pa = trunc_page(pa);
+	opv = pa_to_pvh(pa);
+	if (opv->pv_pmap == NULL)
+		return;
+	m = PHYS_TO_VM_PAGE(pa);

	s = splhigh();
-	while (pv->pv_pmap != NULL) {
-		pmap = pv->pv_pmap;
+	pv = opv;
+	while (pv && ((pmap = pv->pv_pmap) != NULL)) {
		ptp = get_pt_entry(pmap);
		va = pv->pv_va;
		pte = ptp + i386_btop(va);
		if (pmap_pte_w(pte))
			pmap->pm_stats.wired_count--;
		if (*pte) {
			pmap->pm_stats.resident_count--;
-			anyvalid++;
+			if (curproc != pageproc)
+				anyvalid++;

			/*
			 * Update the vm_page_t clean and reference bits.
			 */
			if ((int) *pte & PG_M) {
-				if (va < USRSTACK + (UPAGES * NBPG) ||
+				if (va < USRSTACK + (UPAGES * PAGE_SIZE) ||
				    (va >= KERNBASE && (va < clean_sva || va >= clean_eva))) {
					PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL;
				}
			}
			*pte = 0;
			pmap_unuse_pt(pmap, va);
		}
+		pv = pv->pv_next;
+	}
+
+	for (pv = opv->pv_next; pv; pv = npv) {
		npv = pv->pv_next;
-		if (npv) {
-			*pv = *npv;
-			free_pv_entry(npv);
-		} else {
-			pv->pv_pmap = NULL;
-		}
+		free_pv_entry(pv);
	}
+
+	opv->pv_pmap = NULL;
+	opv->pv_next = NULL;
+
	splx(s);

	if (anyvalid)
		pmap_update();
}

/*
 * Set the physical protection on the
 * specified range of this map as requested.
 */
void
pmap_protect(pmap, sva, eva, prot)
	register pmap_t pmap;
	vm_offset_t sva, eva;
	vm_prot_t prot;
{
	register pt_entry_t *pte;
	register vm_offset_t va;
	int i386prot;
	register pt_entry_t *ptp;
	int evap = i386_btop(eva);
	int anyvalid = 0;

	if (pmap == NULL)
		return;

	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
		pmap_remove(pmap, sva, eva);
		return;
	}
	if (prot & VM_PROT_WRITE)
		return;

	ptp = get_pt_entry(pmap);

	va = sva;
	while (va < eva) {
		int found = 0;
		int svap;
		vm_offset_t nscan;

		/*
		 * Page table page is not allocated.  Skip it, we don't want to
		 * force allocation of unnecessary PTE pages just to set the
		 * protection.
		 */
		if (!*pmap_pde(pmap, va)) {
			/* XXX: avoid address wrap around */
nextpde:
			if (va >= i386_trunc_pdr((vm_offset_t) - 1))
				break;
			va = i386_round_pdr(va + PAGE_SIZE);
			continue;
		}
		pte = ptp + i386_btop(va);

		if (*pte == 0) {
			/*
			 * scan for a non-empty pte
			 */
			svap = pte - ptp;
			nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap;

			if (nscan + svap > evap)
				nscan = evap - svap;

			found = 0;
			if (nscan)
				asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;"
				    : "=D"(pte), "=a"(found)
				    : "c"(nscan), "0"(pte) : "cx");

			if (!found)
				goto nextpde;

			pte -= 1;
			svap = pte - ptp;

			va = i386_ptob(svap);
		}

		anyvalid++;

		i386prot = pte_prot(pmap, prot);
		if (va < UPT_MAX_ADDRESS) {
			i386prot |= PG_u;
			if (va >= UPT_MIN_ADDRESS)
				i386prot |= PG_RW;
		}
		pmap_pte_set_prot(pte, i386prot);
		va += PAGE_SIZE;
	}
	if (anyvalid)
		pmap_update();
}

/*
 * Insert the given physical page (p) at
 * the specified virtual address (v) in the
 * target physical map with the protection requested.
 *
 * If specified, the page will be wired down, meaning
 * that the related pte can not be reclaimed.
 *
 * NB:  This is the only routine which MAY NOT lazy-evaluate
 * or lose information.  That is, this routine must actually
 * insert this page into the given map NOW.
*/ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int ptevalid = 0; if (pmap == NULL) return; - va = i386_trunc_page(va); - pa = i386_trunc_page(pa); + va = trunc_page(va); + pa = trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if (*pmap_pde(pmap, va) == 0) { printf("kernel page directory invalid pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); panic("invalid kernel page directory"); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place * this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V)); /* * When forking (copy-on-write, etc): A process will turn off write * permissions for any of its writable pages. If the data (object) is * only referred to by one process, the processes map is modified * directly as opposed to using the object manipulation routine. When * using pmap_protect, the modified bits are not kept in the vm_page_t * data structure. Therefore, when using pmap_enter in vm_fault to * bring back writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. this clause * supports the carryover of the modified and used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M | PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if (*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) { - pmap_update(); + pmap_update_1pg(va); } else { pmap_use_pt(pmap, va); } } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
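pmap_qenter()/pmap_qremove() below (and pmap_kenter()/pmap_kremove() after them) now flush only the TLB entries they actually touch instead of the whole TLB. A typical temporary-window use, sketched as a fragment with hypothetical variables (scratch_va, m and buf are made up for illustration):

	/* map a wired page at a spare kernel VA, copy out of it, unmap */
	pmap_kenter(scratch_va, VM_PAGE_TO_PHYS(m));	/* one invlpg if a mapping was replaced */
	bcopy((caddr_t) scratch_va, buf, PAGE_SIZE);
	pmap_kremove(scratch_va);			/* one invlpg, not a full flush */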
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); - if (*pte) - anyvalid++; - *pte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + vm_offset_t tva = va + i * PAGE_SIZE; + pt_entry_t npte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + pte = vtopte(tva); + if (*pte && (*pte != npte)) + pmap_update_1pg(tva); + *pte = npte; } - if (anyvalid) - pmap_update(); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); + vm_offset_t tva = va + i * PAGE_SIZE; + pte = vtopte(tva); *pte = 0; + pmap_update_1pg(tva); } - pmap_update(); } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; int wasvalid = 0; pte = vtopte(va); if (*pte) wasvalid++; *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V)); if (wasvalid) - pmap_update(); + pmap_update_1pg(va); } /* * remove a page from the kernel pagetables */ void pmap_kremove(va) vm_offset_t va; { register pt_entry_t *pte; pte = vtopte(va); *pte = (pt_entry_t) 0; - pmap_update(); + pmap_update_1pg(va); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ -static inline void +static __inline void pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pte = vtopte(va); /* a fault on the page table might occur here */ if (*pte) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_pmap = pmap; pv->pv_va = va; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place this entry * after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with desired protection/wiring. */ *pte = (pt_entry_t) ((int) (pa | PG_V | PG_u)); pmap_use_pt(pmap, va); return; } -#define MAX_INIT_PT (512 * 4096) +#define MAX_INIT_PT (512) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
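Note the unit change in MAX_INIT_PT just above: it used to be a byte count (512 * 4096) compared against size, and is now a page count compared against psize = size >> PAGE_SHIFT, the same 2MB ceiling either way. A trivial check of that equivalence (user-space, illustrative):

#include <assert.h>

#define PAGE_SHIFT 12

int
main(void)
{
	unsigned long old_limit_bytes = 512UL * 4096UL;
	unsigned long new_limit_pages = 512UL;

	assert((old_limit_bytes >> PAGE_SHIFT) == new_limit_pages);
	return 0;
}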
 */
void
pmap_object_init_pt(pmap, addr, object, pindex, size)
	pmap_t pmap;
	vm_offset_t addr;
	vm_object_t object;
	vm_pindex_t pindex;
	vm_size_t size;
{
	vm_offset_t tmpidx;
	int psize;
	vm_page_t p;
	int objpgs;

-	if (!pmap || ((size > MAX_INIT_PT) &&
-		(object->resident_page_count > MAX_INIT_PT / PAGE_SIZE))) {
+	psize = (size >> PAGE_SHIFT);
+
+	if (!pmap || ((psize > MAX_INIT_PT) &&
+		(object->resident_page_count > MAX_INIT_PT))) {
		return;
	}

-	psize = (size >> PAGE_SHIFT);
	/*
	 * if we are processing a major portion of the object, then scan the
	 * entire thing.
	 */
	if (psize > (object->size >> 2)) {
		objpgs = psize;

		for (p = object->memq.tqh_first;
		    ((objpgs > 0) && (p != NULL));
		    p = p->listq.tqe_next) {
			tmpidx = p->pindex;
			if (tmpidx < pindex) {
				continue;
			}
			tmpidx -= pindex;
			if (tmpidx >= psize) {
				continue;
			}
-			if (((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) &&
-			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
-			    (p->bmapped == 0) &&
+			if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
			    (p->busy == 0) &&
			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
-				if (p->flags & PG_CACHE)
+				if (p->queue == PQ_CACHE)
					vm_page_deactivate(p);
				vm_page_hold(p);
				p->flags |= PG_MAPPED;
				pmap_enter_quick(pmap,
					addr + (tmpidx << PAGE_SHIFT),
					VM_PAGE_TO_PHYS(p));
				vm_page_unhold(p);
			}
			objpgs -= 1;
		}
	} else {
		/*
		 * else lookup the pages one-by-one.
		 */
		for (tmpidx = 0; tmpidx < psize; tmpidx += 1) {
			p = vm_page_lookup(object, tmpidx + pindex);
-			if (p &&
-			    ((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) &&
-			    (p->bmapped == 0) &&
-			    (p->busy == 0) &&
+			if (p && (p->busy == 0) &&
			    ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
			    (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
-				if (p->flags & PG_CACHE)
+				if (p->queue == PQ_CACHE)
					vm_page_deactivate(p);
				vm_page_hold(p);
				p->flags |= PG_MAPPED;
				pmap_enter_quick(pmap,
					addr + (tmpidx << PAGE_SHIFT),
					VM_PAGE_TO_PHYS(p));
				vm_page_unhold(p);
			}
		}
	}
}

/*
+ * pmap_prefault provides a quick way of clustering
+ * pagefaults into a process's address space.  It is a "cousin"
+ * of pmap_object_init_pt, except it runs at page fault time instead
+ * of mmap time.
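pmap_prefault() below probes a fixed neighborhood around the faulting address, nearest pages first, via pmap_prefault_pageorder[]. Concretely, for a fault at 0x8000 with 4K pages it considers 0x7000, 0x9000, 0x6000 and 0xa000, in that order (user-space illustration):

#include <stdio.h>

#define NBPG 4096

static int pageorder[] = { -NBPG, NBPG, -2 * NBPG, 2 * NBPG };

int
main(void)
{
	unsigned addra = 0x8000;
	int i;

	for (i = 0; i < 4; i++)
		printf("probe 0x%x\n", addra + pageorder[i]);
	return 0;
}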
+ */ +#define PFBAK 2 +#define PFFOR 2 +#define PAGEORDER_SIZE (PFBAK+PFFOR) + +static int pmap_prefault_pageorder[] = { + -NBPG, NBPG, -2 * NBPG, 2 * NBPG +}; + +void +pmap_prefault(pmap, addra, entry, object) + pmap_t pmap; + vm_offset_t addra; + vm_map_entry_t entry; + vm_object_t object; +{ + int i; + vm_offset_t starta; + vm_offset_t addr; + vm_pindex_t pindex; + vm_page_t m; + int pageorder_index; + + if (entry->object.vm_object != object) + return; + + if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) + return; + + starta = addra - PFBAK * PAGE_SIZE; + if (starta < entry->start) { + starta = entry->start; + } else if (starta > addra) { + starta = 0; + } + + for (i = 0; i < PAGEORDER_SIZE; i++) { + vm_object_t lobject; + pt_entry_t *pte; + + addr = addra + pmap_prefault_pageorder[i]; + if (addr < starta || addr >= entry->end) + continue; + + pte = vtopte(addr); + if (*pte) + continue; + + pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; + lobject = object; + for (m = vm_page_lookup(lobject, pindex); + (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); + lobject = lobject->backing_object) { + if (lobject->backing_object_offset & (PAGE_MASK-1)) + break; + pindex += (lobject->backing_object_offset >> PAGE_SHIFT); + m = vm_page_lookup(lobject->backing_object, pindex); + } + + /* + * give-up when a page is not in memory + */ + if (m == NULL) + break; + + if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && + (m->busy == 0) && + (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { + + if (m->queue == PQ_CACHE) { + if (cnt.v_free_count + cnt.v_cache_count < + cnt.v_free_min) + break; + vm_page_deactivate(m); + } + vm_page_hold(m); + m->flags |= PG_MAPPED; + pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); + vm_page_unhold(m); + } + } +} + +/* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); - /* - * When unwiring, set the modified bit in the pte -- could have been - * changed by the kernel - */ - if (!wired) - (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. 
*/ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); - bzero(CADDR2, NBPG); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(phys); + bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_1pg((vm_offset_t) CADDR2); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); - *(int *) CMAP1 = PG_V | PG_KW | i386_trunc_page(src); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); + *(int *) CMAP1 = PG_V | PG_KW | trunc_page(src); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(dst); #if __GNUC__ > 1 - memcpy(CADDR2, CADDR1, NBPG); + memcpy(CADDR2, CADDR1, PAGE_SIZE); #else - bcopy(CADDR1, CADDR2, NBPG); + bcopy(CADDR1, CADDR2, PAGE_SIZE); #endif *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static __inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark UPAGES as always modified, and ptes as never * modified. 
*/ - if (bit & PG_U) { + if (bit & (PG_U|PG_M)) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } - if (bit & PG_M) { - if (pv->pv_va >= USRSTACK) { - if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { - continue; - } - if (pv->pv_va < USRSTACK + (UPAGES * NBPG)) { - splx(s); - return TRUE; - } else if (pv->pv_va < KERNBASE) { - splx(s); - return FALSE; - } - } - } if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; + int changed; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", va); continue; } pte = pmap_pte(pv->pv_pmap, va); - if (setem) + if (setem) { (int) npte = (int) *pte | bit; - else + } else { (int) npte = (int) *pte & ~bit; + } *pte = npte; } } splx(s); - pmap_update(); + if (curproc != pageproc) + pmap_update(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * by any physical maps. */ boolean_t pmap_is_referenced(vm_offset_t pa) { return pmap_testbit((pa), PG_U); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_U, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
The non-cacheable bits are set on each * mapped page. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; pt_entry_t *pte; pa = trunc_page(pa); size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pte = vtopte(tmpva); *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N)); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } + +#ifdef PMAP_DEBUG +pmap_pid_dump(int pid) { + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = &p->p_vmspace->vm_pmap; + for(i=0;i<1024;i++) { + pd_entry_t *pde; + pt_entry_t *pte; + unsigned base = i << PD_SHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for(j=0;j<1024;j++) { + unsigned va = base + (j << PG_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + return npte; + } + pte = pmap_pte( pmap, va); + if (pte && pmap_pte_v(pte)) { + vm_offset_t pa; + vm_page_t m; + pa = *(int *)pte; + m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + return npte; +} +#endif #ifdef DEBUG static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PD_SHIFT) + (j << PG_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/amd64/amd64/trap.c =================================================================== --- head/sys/amd64/amd64/trap.c (revision 13489) +++ head/sys/amd64/amd64/trap.c (revision 13490) @@ -1,1061 +1,1062 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.69 1996/01/03 21:41:36 wollman Exp $ + * $Id: trap.c,v 1.70 1996/01/04 21:11:03 wollman Exp $ */ /* * 386 Trap and System call handling */ #include "opt_ktrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERFAIL_NMI # include # include #endif #include "isa.h" #include "npx.h" int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void linux_syscall __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int)); static void trap_fatal __P((struct trapframe *)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 27 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ }; static void userret __P((struct proc *p, struct trapframe *frame, u_quad_t oticks)); static inline void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig, s; while ((sig = CURSIG(p)) != 0) postsig(sig); p->p_priority = p->p_usrpri; if (want_resched) { /* * Since we are curproc, clock will normally just change * our priority 
without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ s = splclock(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); splx(s); while ((sig = CURSIG(p)) != 0) postsig(sig); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { u_quad_t ticks = p->p_sticks - oticks; if (ticks) { #ifdef PROFTIMER extern int profscale; addupc(frame->tf_eip, &p->p_stats->p_prof, ticks * profscale); #else addupc(frame->tf_eip, &p->p_stats->p_prof, ticks); #endif } } curpriority = p->p_priority; } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; #ifdef DEBUG u_long eva; #endif type = frame.tf_trapno; code = frame.tf_err; if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = p->p_sticks; p->p_md.md_regs = (int *)&frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; case T_ASTFLT: /* Allow process switch */ astoff(); cnt.v_soft++; if (p->p_flag & P_OWEUPC) { addupc(frame.tf_eip, &p->p_stats->p_prof, 1); p->p_flag &= ~P_OWEUPC; } goto out; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ case T_STKFLT: /* stack fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) return; if (i == 0) goto out; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV_TRAP; i = SIGFPE; break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI goto handle_powerfail; #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; panic("NMI indicates hardware failure"); #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF_TRAP; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_SUBRNG_TRAP; i = SIGFPE; break; case T_DNA: #if NNPX > 0 /* if a transparent fault (due to context switch "late") */ if (npxdna()) return; #endif /* NNPX > 0 */ if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) return; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); return; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ #define MAYBE_DORETI_FAULT(where, whereto) \ do { \ if (frame.tf_eip == (int)where) { \ frame.tf_eip = (int)whereto; \ return; \ } \ } while (0) if (intr_nesting_level == 0) { MAYBE_DORETI_FAULT(doreti_iret, doreti_iret_fault); MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault); MAYBE_DORETI_FAULT(doreti_popl_es, doreti_popl_es_fault); } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ return; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; return; } /* * Fall through. */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB if (kdb_trap (type, 0, &frame)) return; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif handle_powerfail: { static unsigned lastalert = 0; if(time.tv_sec - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time.tv_sec; } return; } #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame); return; } trapsignal(p, i, ucode); #ifdef DEBUG eva = rcr2(); if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%x", eva); uprintf("\n"); } #endif out: userret(p, &frame, sticks); } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. */ static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t ptepg; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (curpcb == NULL || curpcb->pcb_onfault == NULL))) { trap_fatal(frame); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(kernel_map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
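Both trap_pfault() here (via PGEX_W) and trap_fatal() below decode the hardware page-fault error code: its low bits are PGEX_P (protection violation vs. page not present), PGEX_W (write access) and PGEX_U (fault taken in user mode). A user-space illustration of the decoding trap_fatal() prints (bit values are the architectural ones):

#include <stdio.h>

#define PGEX_P 0x01
#define PGEX_W 0x02
#define PGEX_U 0x04

int
main(void)
{
	int code = PGEX_W | PGEX_U;	/* user-mode write to a missing page */

	printf("%s %s, %s\n",
	    code & PGEX_U ? "user" : "supervisor",
	    code & PGEX_W ? "write" : "read",
	    code & PGEX_P ? "protection violation" : "page not present");
	return 0;
}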
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { vm_offset_t v; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) - (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); + (void) vm_fault(map, + trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame) struct trapframe *frame; { int code, type, eva; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; eva = rcr2(); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace/trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } printf("interrupt mask = "); if ((cpl & net_imask) == net_imask) printf("net "); if ((cpl & tty_imask) == tty_imask) printf("tty "); if ((cpl & bio_imask) == bio_imask) printf("bio "); if (cpl == 0) printf("none"); printf("\n"); #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if (kdb_trap (type, 0, frame)) return; #endif if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
 *
 * XXX Note that the current PTD gets replaced by IdlePTD when the
 * task switch occurs. This means that the stack that was active at
 * the time of the double fault is not available unless
 * the machine was idle when the double fault occurred. The downside
 * of this is that "trace" in ddb won't work.
 */
void
dblfault_handler()
{
	struct pcb *pcb = curpcb;

	if (pcb != NULL) {
		printf("\nFatal double fault:\n");
		printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip);
		printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp);
		printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp);
	}
	panic("double fault");
}

/*
 * Compensate for 386 brain damage (missing URKR).
 * This is a little simpler than the pagefault handler in trap() because
 * the page tables have already been faulted in and high addresses
 * are thrown out early for other reasons.
 */
int
trapwrite(addr)
	unsigned addr;
{
	struct proc *p;
	vm_offset_t va, v;
	struct vmspace *vm;
	int rv;

	va = trunc_page((vm_offset_t)addr);
	/*
	 * XXX - MAX is END.  Changed > to >= for temp. fix.
	 */
	if (va >= VM_MAXUSER_ADDRESS)
		return (1);

	p = curproc;
	vm = p->p_vmspace;

	++p->p_lock;

	if ((caddr_t)va >= vm->vm_maxsaddr
	    && (caddr_t)va < (caddr_t)USRSTACK) {
		if (!grow(p, va)) {
			--p->p_lock;
			return (1);
		}
	}

	v = trunc_page(vtopte(va));

	/*
	 * wire the pte page
	 */
	if (va < USRSTACK) {
		vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE);
	}

	/*
	 * fault the data page
	 */
	rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE);

	/*
	 * unwire the pte page
	 */
	if (va < USRSTACK) {
		vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE);
	}

	--p->p_lock;

	if (rv != KERN_SUCCESS)
		return 1;

	return (0);
}

/*
 * System call request from POSIX system call gate interface to kernel.
 * Like trap(), argument is call by reference.
 */
void
syscall(frame)
	struct trapframe frame;
{
	caddr_t params;
	int i;
	struct sysent *callp;
	struct proc *p = curproc;
	u_quad_t sticks;
	int error;
	int args[8], rval[2];
	u_int code;

	sticks = p->p_sticks;
	if (ISPL(frame.tf_cs) != SEL_UPL)
		panic("syscall");

	p->p_md.md_regs = (int *)&frame;
	params = (caddr_t)frame.tf_esp + sizeof(int);
	code = frame.tf_eax;
	/*
	 * Need to check if this is a 32 bit or 64 bit syscall.
	 */
	if (code == SYS_syscall) {
		/*
		 * Code is first argument, followed by actual args.
		 */
		code = fuword(params);
		params += sizeof(int);
	} else if (code == SYS___syscall) {
		/*
		 * Like syscall, but code is a quad, so as to maintain
		 * quad alignment for the rest of the arguments.
		 */
		code = fuword(params);
		params += sizeof(quad_t);
	}

	if (p->p_sysent->sv_mask)
		code &= p->p_sysent->sv_mask;

	if (code >= p->p_sysent->sv_size)
		callp = &p->p_sysent->sv_table[0];
	else
		callp = &p->p_sysent->sv_table[code];

	if ((i = callp->sy_narg * sizeof(int)) &&
	    (error = copyin(params, (caddr_t)args, (u_int)i))) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_SYSCALL))
			ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
		goto bad;
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_SYSCALL))
		ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
#endif
	rval[0] = 0;
	rval[1] = frame.tf_edx;

	error = (*callp->sy_call)(p, args, rval);

	switch (error) {

	case 0:
		/*
		 * Reinitialize proc pointer `p' as it may be different
		 * if this is a child returning from fork syscall.
		 */
		p = curproc;
		frame.tf_eax = rval[0];
		frame.tf_edx = rval[1];
		frame.tf_eflags &= ~PSL_C;
		break;

	case ERESTART:
		/*
		 * Reconstruct pc, assuming lcall $X,y is 7 bytes.
*/ frame.tf_eip -= 7; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #if defined(COMPAT_LINUX) || defined(LINUX) void linux_syscall(frame) struct trapframe frame; { struct proc *p = curproc; struct sysent *callp; u_quad_t sticks; int error; int rval[2]; u_int code; struct linux_syscall_args { int arg1; int arg2; int arg3; int arg4; int arg5; } args; args.arg1 = frame.tf_ebx; args.arg2 = frame.tf_ecx; args.arg3 = frame.tf_edx; args.arg4 = frame.tf_esi; args.arg5 = frame.tf_edi; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("linux syscall"); p->p_md.md_regs = (int *)&frame; code = frame.tf_eax; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, (int *)&args); #endif rval[0] = 0; error = (*callp->sy_call)(p, &args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* Reconstruct pc, subtract size of int 0x80 */ frame.tf_eip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = -error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #endif /* COMPAT_LINUX || LINUX */ Index: head/sys/amd64/amd64/vm_machdep.c =================================================================== --- head/sys/amd64/amd64/vm_machdep.c (revision 13489) +++ head/sys/amd64/amd64/vm_machdep.c (revision 13490) @@ -1,871 +1,871 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ - * $Id: vm_machdep.c,v 1.49 1995/12/14 08:31:01 phk Exp $ + * $Id: vm_machdep.c,v 1.50 1996/01/05 20:12:23 wollman Exp $ */ #include "npx.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vm_fault_quick __P((caddr_t v, int prot)); #ifdef BOUNCE_BUFFERS static vm_offset_t vm_bounce_kva __P((int size, int waitok)); static void vm_bounce_kva_free __P((vm_offset_t addr, vm_offset_t size, int now)); static vm_offset_t vm_bounce_page_find __P((int count)); static void vm_bounce_page_free __P((vm_offset_t pa, int count)); static volatile int kvasfreecnt; caddr_t bouncememory; int bouncepages; static int bpwait; static vm_offset_t *bouncepa; static int bmwait, bmfreeing; #define BITS_IN_UNSIGNED (8*sizeof(unsigned)) static int bounceallocarraysize; static unsigned *bounceallocarray; static int bouncefree; #define SIXTEENMEG (4096*4096) #define MAXBKVA 1024 int maxbkva = MAXBKVA*NBPG; /* special list that can be used at interrupt time for eventual kva free */ static struct kvasfree { vm_offset_t addr; vm_offset_t size; } kvaf[MAXBKVA]; /* * get bounce buffer pages (count physically contiguous) * (only 1 inplemented now) */ static vm_offset_t vm_bounce_page_find(count) int count; { int bit; int s,i; if (count != 1) panic("vm_bounce_page_find -- no support for > 1 page yet!!!"); s = splbio(); retry: for (i = 0; i < bounceallocarraysize; i++) { if (bounceallocarray[i] != 0xffffffff) { bit = ffs(~bounceallocarray[i]); if (bit) { bounceallocarray[i] |= 1 << (bit - 1) ; bouncefree -= count; splx(s); return bouncepa[(i * BITS_IN_UNSIGNED + (bit - 1))]; } } } bpwait = 1; tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0); goto retry; } static void vm_bounce_kva_free(addr, size, now) vm_offset_t addr; vm_offset_t size; int now; { int s = splbio(); kvaf[kvasfreecnt].addr = addr; kvaf[kvasfreecnt].size = size; ++kvasfreecnt; if( now) { /* * this will do wakeups */ vm_bounce_kva(0,0); } else { if (bmwait) { /* * if anyone is waiting on the bounce-map, then wakeup */ wakeup((caddr_t) io_map); bmwait = 0; } } splx(s); } /* * free count bounce buffer pages */ static void vm_bounce_page_free(pa, count) vm_offset_t pa; int count; { int allocindex; int index; int bit; if (count != 1) panic("vm_bounce_page_free -- no support for > 1 page yet!!!"); for(index=0;indexb_flags & B_BOUNCE) { printf("vm_bounce_alloc: called 
recursively???\n"); return; } if (bp->b_bufsize < bp->b_bcount) { printf( "vm_bounce_alloc: b_bufsize(0x%lx) < b_bcount(0x%lx) !!\n", bp->b_bufsize, bp->b_bcount); panic("vm_bounce_alloc"); } /* * This is not really necessary * if( bp->b_bufsize != bp->b_bcount) { * printf("size: %d, count: %d\n", bp->b_bufsize, bp->b_bcount); * } */ vastart = (vm_offset_t) bp->b_data; vaend = (vm_offset_t) bp->b_data + bp->b_bufsize; - vapstart = i386_trunc_page(vastart); - vapend = i386_round_page(vaend); + vapstart = trunc_page(vastart); + vapend = round_page(vaend); countvmpg = (vapend - vapstart) / NBPG; /* * if any page is above 16MB, then go into bounce-buffer mode */ va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) ++dobounceflag; if( pa == 0) panic("vm_bounce_alloc: Unmapped page"); va += NBPG; } if (dobounceflag == 0) return; if (bouncepages < dobounceflag) panic("Not enough bounce buffers!!!"); /* * allocate a replacement kva for b_addr */ kva = vm_bounce_kva(countvmpg*NBPG, 1); #if 0 printf("%s: vapstart: %x, vapend: %x, countvmpg: %d, kva: %x ", (bp->b_flags & B_READ) ? "read":"write", vapstart, vapend, countvmpg, kva); #endif va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) { /* * allocate a replacement page */ vm_offset_t bpa = vm_bounce_page_find(1); pmap_kenter(kva + (NBPG * i), bpa); #if 0 printf("r(%d): (%x,%x,%x) ", i, va, pa, bpa); #endif /* * if we are writing, the copy the data into the page */ if ((bp->b_flags & B_READ) == 0) { bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG); } } else { /* * use original page */ pmap_kenter(kva + (NBPG * i), pa); } va += NBPG; } /* * flag the buffer as being bounced */ bp->b_flags |= B_BOUNCE; /* * save the original buffer kva */ bp->b_savekva = bp->b_data; /* * put our new kva into the buffer (offset by original offset) */ bp->b_data = (caddr_t) (((vm_offset_t) kva) | ((vm_offset_t) bp->b_savekva & (NBPG - 1))); #if 0 printf("b_savekva: %x, newva: %x\n", bp->b_savekva, bp->b_data); #endif return; } /* * hook into biodone to free bounce buffer */ void vm_bounce_free(bp) struct buf *bp; { int i; vm_offset_t origkva, bouncekva, bouncekvaend; /* * if this isn't a bounced buffer, then just return */ if ((bp->b_flags & B_BOUNCE) == 0) return; /* * This check is not necessary * if (bp->b_bufsize != bp->b_bcount) { * printf("vm_bounce_free: b_bufsize=%d, b_bcount=%d\n", * bp->b_bufsize, bp->b_bcount); * } */ origkva = (vm_offset_t) bp->b_savekva; bouncekva = (vm_offset_t) bp->b_data; /* printf("free: %d ", bp->b_bufsize); */ /* * check every page in the kva space for b_addr */ for (i = 0; i < bp->b_bufsize; ) { vm_offset_t mybouncepa; vm_offset_t copycount; - copycount = i386_round_page(bouncekva + 1) - bouncekva; - mybouncepa = pmap_kextract(i386_trunc_page(bouncekva)); + copycount = round_page(bouncekva + 1) - bouncekva; + mybouncepa = pmap_kextract(trunc_page(bouncekva)); /* * if this is a bounced pa, then process as one */ - if ( mybouncepa != pmap_kextract( i386_trunc_page( origkva))) { + if ( mybouncepa != pmap_kextract( trunc_page( origkva))) { vm_offset_t tocopy = copycount; if (i + tocopy > bp->b_bufsize) tocopy = bp->b_bufsize - i; /* * if this is a read, then copy from bounce buffer into original buffer */ if (bp->b_flags & B_READ) bcopy((caddr_t) bouncekva, (caddr_t) origkva, tocopy); /* * free the bounce allocation */ /* printf("(kva: %x, pa: %x)", bouncekva, mybouncepa); */ vm_bounce_page_free(mybouncepa, 1); } origkva += copycount; bouncekva += 
copycount; i += copycount; } /* printf("\n"); */ /* * add the old kva into the "to free" list */ - bouncekva= i386_trunc_page((vm_offset_t) bp->b_data); - bouncekvaend= i386_round_page((vm_offset_t)bp->b_data + bp->b_bufsize); + bouncekva= trunc_page((vm_offset_t) bp->b_data); + bouncekvaend= round_page((vm_offset_t)bp->b_data + bp->b_bufsize); /* printf("freeva: %d\n", (bouncekvaend - bouncekva) / NBPG); */ vm_bounce_kva_free( bouncekva, (bouncekvaend - bouncekva), 0); bp->b_data = bp->b_savekva; bp->b_savekva = 0; bp->b_flags &= ~B_BOUNCE; return; } /* * init the bounce buffer system */ void vm_bounce_init() { int i; kvasfreecnt = 0; if (bouncepages == 0) return; bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED; bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT); if (!bounceallocarray) panic("Cannot allocate bounce resource array"); bouncepa = malloc(bouncepages * sizeof(vm_offset_t), M_TEMP, M_NOWAIT); if (!bouncepa) panic("Cannot allocate physical memory array"); for(i=0;i= SIXTEENMEG) panic("bounce memory out of range"); if( pa == 0) panic("bounce memory not resident"); bouncepa[i] = pa; bounceallocarray[i/(8*sizeof(int))] &= ~(1<<(i%(8*sizeof(int)))); } bouncefree = bouncepages; } #endif /* BOUNCE_BUFFERS */ /* * quick version of vm_fault */ static void vm_fault_quick(v, prot) caddr_t v; int prot; { if (prot & VM_PROT_WRITE) subyte(v, fubyte(v)); else fubyte(v); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the kernel stack and pcb, making the child * ready to run, and marking it so that it can return differently * than the parent. Returns 1 in the child process, 0 in the parent. * We currently double-map the user area so that the stack is at the same * address in each process; in the future we will probably relocate * the frame pointers on the stack after copying. */ int cpu_fork(p1, p2) register struct proc *p1, *p2; { register struct user *up = p2->p_addr; int offset; /* * Copy pcb and stack from proc p1 to p2. * We do this as cheaply as possible, copying only the active * part of the stack. The stack and pcb need to agree; * this is tricky, as the final pcb is constructed by savectx, * but its frame isn't yet on the stack when the stack is copied. * swtch compensates for this when the child eventually runs. * This should be done differently, with a single call * that copies and updates the pcb+stack, * replacing the bcopy and savectx. */ p2->p_addr->u_pcb = p1->p_addr->u_pcb; offset = mvesp() - (int)kstack; bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset, (unsigned) ctob(UPAGES) - offset); p2->p_md.md_regs = p1->p_md.md_regs; pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb); /* * * Arrange for a non-local goto when the new process * is started, to resume here, returning nonzero from setjmp. */ if (savectx(&up->u_pcb, 1)) { /* * Return 1 in child. */ return (1); } return (0); } void cpu_exit(p) register struct proc *p; { #if NNPX > 0 npxexit(p); #endif /* NNPX */ cnt.v_swtch++; cpu_switch(p); panic("cpu_exit"); } void -cpu_wait(p) struct proc *p; { -/* extern vm_map_t upages_map; */ - +cpu_wait(p) + struct proc *p; +{ /* drop per-process resources */ - pmap_remove(vm_map_pmap(u_map), (vm_offset_t) p->p_addr, - ((vm_offset_t) p->p_addr) + ctob(UPAGES)); + pmap_qremove((vm_offset_t) p->p_addr, UPAGES); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); vmspace_free(p->p_vmspace); } /* * Dump the machine specific header information at the start of a core dump. 
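vm_bounce_page_find() above treats the bounce pool as a bitmap of unsigned words and locates a free page with ffs() on the complemented word. A user-space sketch of the same single-page bitmap discipline; the pool size and the -1 return (where the kernel would tsleep() and retry) are illustrative:

    #include <stdio.h>
    #include <strings.h>                     /* ffs() */

    #define BITS_IN_UNSIGNED (8 * sizeof(unsigned))
    #define NWORDS 2                         /* 64 "pages" for the demo */

    static unsigned allocmap[NWORDS];        /* one bit per page, 1 = in use */

    static int
    page_find(void)
    {
        int i, bit;

        for (i = 0; i < NWORDS; i++) {
            if (allocmap[i] != 0xffffffff) {
                bit = ffs(~allocmap[i]);     /* first clear bit, 1-based */
                if (bit) {
                    allocmap[i] |= 1u << (bit - 1);
                    return (i * BITS_IN_UNSIGNED + (bit - 1));
                }
            }
        }
        return (-1);                         /* caller would sleep and retry */
    }

    static void
    page_free(int idx)
    {
        allocmap[idx / BITS_IN_UNSIGNED] &= ~(1u << (idx % BITS_IN_UNSIGNED));
    }

    int
    main(void)
    {
        int a = page_find(), b = page_find();

        printf("got pages %d and %d\n", a, b);
        page_free(a);
        printf("after free, next is %d\n", page_find());
        return (0);
    }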
*/ int cpu_coredump(p, vp, cred) struct proc *p; struct vnode *vp; struct ucred *cred; { return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p)); } #ifdef notyet static void setredzone(pte, vaddr) u_short *pte; caddr_t vaddr; { /* eventually do this by setting up an expand-down stack segment for ss0: selector, allowing stack access down to top of u. this means though that protection violations need to be handled thru a double fault exception that must do an integral task switch to a known good context, within which a dump can be taken. a sensible scheme might be to save the initial context used by sched (that has physical memory mapped 1:1 at bottom) and take the dump while still in mapped mode */ } #endif /* * Convert kernel VA to physical address */ u_long kvtop(void *addr) { vm_offset_t va; va = pmap_kextract((vm_offset_t)addr); if (va == 0) panic("kvtop: zero page frame"); return((int)va); } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. */ void vmapbuf(bp) register struct buf *bp; { register int npf; register caddr_t addr; int off; vm_offset_t kva; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vmapbuf"); /* * this is the kva that is to be used for * the temporary kernel mapping */ kva = (vm_offset_t) bp->b_saveaddr; for (addr = (caddr_t)trunc_page(bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE) { /* * do the vm_fault if needed, do the copy-on-write thing when * reading stuff off device into memory. */ vm_fault_quick(addr, (bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); pa = pmap_kextract((vm_offset_t) addr); if (pa == 0) panic("vmapbuf: page not present"); /* * hold the data page */ #ifdef DIAGNOSTIC if( VM_PAGE_TO_PHYS(PHYS_TO_VM_PAGE(pa)) != pa) panic("vmapbuf: confused PHYS_TO_VM_PAGE mapping"); #endif vm_page_hold(PHYS_TO_VM_PAGE(pa)); } addr = bp->b_saveaddr = bp->b_data; off = (int)addr & PGOFSET; npf = btoc(round_page(bp->b_bufsize + off)); bp->b_data = (caddr_t) (kva + off); while (npf--) { pa = pmap_kextract((vm_offset_t)addr); if (pa == 0) panic("vmapbuf: null page frame"); pmap_kenter(kva, trunc_page(pa)); addr += PAGE_SIZE; kva += PAGE_SIZE; } } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(bp) register struct buf *bp; { register caddr_t addr; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vunmapbuf"); for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) pmap_kremove((vm_offset_t) addr); bp->b_data = bp->b_saveaddr; bp->b_saveaddr = NULL; /* * unhold the pde, and data pages */ for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) { /* * release the data page */ pa = pmap_kextract((vm_offset_t) addr); vm_page_unhold(PHYS_TO_VM_PAGE(pa)); } } /* * Force reset the processor by invalidating the entire address space! */ void cpu_reset() { /* * Attempt to do a CPU reset via the keyboard controller, * do not turn of the GateA20, as any machine that fails * to do the reset here would then end up in no man's land. 
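grow() further below extends the user stack in SGROWSIZ chunks rather than page by page, so a run of faults just under the current limit does not resize the map every time. A stand-alone illustration of the rounding arithmetic, with made-up values for PAGE_SIZE, SGROWSIZ and USRSTACK:

    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define SGROWSIZ  (128 * 1024)      /* growth granularity, as an example */
    #define USRSTACK  0xefbfe000u       /* made-up top of stack for the demo */

    #define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

    int
    main(void)
    {
        unsigned sp = USRSTACK - 200000;     /* faulting stack pointer */
        unsigned nss;

        /* Bytes of stack needed to cover sp, rounded to whole pages. */
        nss = roundup(USRSTACK - sp, PAGE_SIZE);
        /* Grow in SGROWSIZ chunks for hysteresis, as the comment says. */
        printf("need %u bytes, would grow to %u\n", nss, roundup(nss, SGROWSIZ));
        return (0);
    }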
*/ #ifndef BROKEN_KEYBOARD_RESET outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ printf("Keyboard reset did not work, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ #endif /* force a shutdown by unmapping entire address space ! */ bzero((caddr_t) PTD, NBPG); /* "good night, sweet prince .... " */ pmap_update(); /* NOTREACHED */ while(1); } /* * Grow the user stack to allow for 'sp'. This version grows the stack in * chunks of SGROWSIZ. */ int grow(p, sp) struct proc *p; u_int sp; { unsigned int nss; caddr_t v; struct vmspace *vm = p->p_vmspace; if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK) return (1); nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE); if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur) return (0); if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) < nss) { int grow_amount; /* * If necessary, grow the VM that the stack occupies * to allow for the rlimit. This allows us to not have * to allocate all of the VM up-front in execve (which * is expensive). * Grow the VM by the amount requested rounded up to * the nearest SGROWSIZ to provide for some hysteresis. */ grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ); v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) - grow_amount; /* * If there isn't enough room to extend by SGROWSIZ, then * just extend to the maximum size */ if (v < vm->vm_maxsaddr) { v = vm->vm_maxsaddr; grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT); } if ((grow_amount == 0) || (vm_map_find(&vm->vm_map, NULL, 0, (vm_offset_t *)&v, - grow_amount, FALSE) != KERN_SUCCESS)) { + grow_amount, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)) { return (0); } vm->vm_ssize += grow_amount >> PAGE_SHIFT; } return (1); } /* * prototype routine to implement the pre-zeroed page mechanism * this routine is called from the idle loop. */ int vm_page_zero_idle() { vm_page_t m; if ((cnt.v_free_count > cnt.v_interrupt_free_min) && (m = vm_page_queue_free.tqh_first)) { TAILQ_REMOVE(&vm_page_queue_free, m, pageq); enable_intr(); pmap_zero_page(VM_PAGE_TO_PHYS(m)); disable_intr(); TAILQ_INSERT_HEAD(&vm_page_queue_zero, m, pageq); + m->queue = PQ_ZERO; ++vm_page_zero_count; return 1; } return 0; } Index: head/sys/fs/msdosfs/msdosfs_denode.c =================================================================== --- head/sys/fs/msdosfs/msdosfs_denode.c (revision 13489) +++ head/sys/fs/msdosfs/msdosfs_denode.c (revision 13490) @@ -1,728 +1,730 @@ -/* $Id: msdosfs_denode.c,v 1.14 1995/12/03 16:41:53 bde Exp $ */ +/* $Id: msdosfs_denode.c,v 1.15 1995/12/07 12:47:19 davidg Exp $ */ /* $NetBSD: msdosfs_denode.c,v 1.9 1994/08/21 18:44:00 ws Exp $ */ /*- * Copyright (C) 1994 Wolfgang Solfrank. * Copyright (C) 1994 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include /* defines "time" */ #include #include #include #include #include #include #include #include struct denode **dehashtbl; u_long dehash; /* size of hash table - 1 */ #define DEHASH(dev, deno) (((dev) + (deno)) & dehash) union _qcvt { quad_t qcvt; long val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } static struct denode * msdosfs_hashget __P((dev_t dev, u_long dirclust, u_long diroff)); static void msdosfs_hashins __P((struct denode *dep)); static void msdosfs_hashrem __P((struct denode *dep)); int msdosfs_init() { dehashtbl = hashinit(desiredvnodes/2, M_MSDOSFSMNT, &dehash); return 0; } static struct denode * msdosfs_hashget(dev, dirclust, diroff) dev_t dev; u_long dirclust; u_long diroff; { struct denode *dep; for (;;) for (dep = dehashtbl[DEHASH(dev, dirclust + diroff)];; dep = dep->de_next) { if (dep == NULL) return NULL; if (dirclust != dep->de_dirclust || diroff != dep->de_diroffset || dev != dep->de_dev || dep->de_refcnt == 0) continue; if (dep->de_flag & DE_LOCKED) { dep->de_flag |= DE_WANTED; (void) tsleep((caddr_t)dep, PINOD, "msdhgt", 0); break; } if (!vget(DETOV(dep), 1)) return dep; break; } /* NOTREACHED */ } static void msdosfs_hashins(dep) struct denode *dep; { struct denode **depp, *deq; depp = &dehashtbl[DEHASH(dep->de_dev, dep->de_dirclust + dep->de_diroffset)]; deq = *depp; if (deq) deq->de_prev = &dep->de_next; dep->de_next = deq; dep->de_prev = depp; *depp = dep; } static void msdosfs_hashrem(dep) struct denode *dep; { struct denode *deq; deq = dep->de_next; if (deq) deq->de_prev = dep->de_prev; *dep->de_prev = deq; #ifdef DIAGNOSTIC dep->de_next = NULL; dep->de_prev = NULL; #endif } /* * If deget() succeeds it returns with the gotten denode locked(). 
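msdosfs_hashins() and msdosfs_hashrem() above keep each denode on its chain with de_prev pointing at whatever pointer currently points to the denode (a hash slot or a neighbour's de_next), so removal is O(1) and needs no list head or search. A generic sketch of that pointer-to-pointer idiom:

    #include <stdio.h>

    struct node {
        int          key;
        struct node *next;
        struct node **prev;     /* points at whatever points at us */
    };

    static struct node *head;

    static void
    ins(struct node *n)
    {
        if ((n->next = head) != NULL)
            head->prev = &n->next;
        n->prev = &head;
        head = n;
    }

    static void
    rem(struct node *n)
    {
        if (n->next != NULL)
            n->next->prev = n->prev;
        *n->prev = n->next;
    }

    int
    main(void)
    {
        struct node a = { 1, NULL, NULL }, b = { 2, NULL, NULL };

        ins(&a);
        ins(&b);
        rem(&a);                /* O(1), no walk from the head needed */
        printf("head key = %d\n", head->key);
        return (0);
    }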
* * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The pm_dev field and the address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * direntptr - address of the direntry structure of interest. If direntptr is * NULL, the block is read if necessary. * depp - returns the address of the gotten denode. */ int deget(pmp, dirclust, diroffset, direntptr, depp) struct msdosfsmount *pmp; /* so we know the maj/min number */ u_long dirclust; /* cluster this dir entry came from */ u_long diroffset; /* index of entry within the cluster */ struct direntry *direntptr; struct denode **depp; /* returns the addr of the gotten denode */ { int error; dev_t dev = pmp->pm_dev; struct mount *mntp = pmp->pm_mountp; struct denode *ldep; struct vnode *nvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %ld, diroffset %x, direntptr %p, depp %p)\n", pmp, dirclust, diroffset, direntptr, depp); #endif /* * If dir entry is given and refers to a directory, convert to * canonical form */ if (direntptr && (direntptr->deAttributes & ATTR_DIRECTORY)) { dirclust = getushort(direntptr->deStartCluster); if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } /* * See if the denode is in the denode cache. Use the location of * the directory entry to compute the hash value. For subdir use * address of "." entry. for root dir use cluster MSDOSFSROOT, * offset MSDOSFSROOT_OFS * * NOTE: The check for de_refcnt > 0 below insures the denode being * examined does not represent an unlinked but still open file. * These files are not to be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. */ ldep = msdosfs_hashget(dev, dirclust, diroffset); if (ldep) { *depp = ldep; return 0; } /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. */ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode(VT_MSDOSFS, mntp, msdosfs_vnodeop_p, &nvp); if (error) { *depp = 0; return error; } MALLOC(ldep, struct denode *, sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK); bzero((caddr_t)ldep, sizeof *ldep); nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_devvp = 0; ldep->de_lockf = 0; ldep->de_dev = dev; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; fc_purge(ldep, 0); /* init the fat cache for this denode */ /* * Insert the denode into the hash queue and lock the denode so it * can't be accessed until we've read it in and have done what we * need to it. */ VOP_LOCK(nvp); msdosfs_hashins(ldep); /* * Copy the directory entry into the denode area of the vnode. */ if (dirclust == MSDOSFSROOT && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. 
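The manufactured root-directory timestamp just below packs Jan 1, 1980 into the 16-bit DOS date format through the DD_*_SHIFT constants. A sketch of that packing, assuming the standard FAT layout (day in bits 0-4, month in bits 5-8, years since 1980 in bits 9-15):

    #include <stdio.h>

    #define DD_DAY_SHIFT   0    /* bits 0-4:  day of month, 1-31 */
    #define DD_MONTH_SHIFT 5    /* bits 5-8:  month, 1-12 */
    #define DD_YEAR_SHIFT  9    /* bits 9-15: years since 1980 */

    static unsigned short
    dos_date(int year, int month, int day)
    {
        return ((year - 1980) << DD_YEAR_SHIFT) |
            (month << DD_MONTH_SHIFT) | (day << DD_DAY_SHIFT);
    }

    int
    main(void)
    {
        unsigned short d = dos_date(1980, 1, 1);    /* the epoch deget() uses */

        printf("packed: 0x%04x, day %d, month %d, year %d\n", (unsigned)d,
            d & 0x1f, (d >> DD_MONTH_SHIFT) & 0x0f,
            1980 + (d >> DD_YEAR_SHIFT));
        return (0);
    }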
*/ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * pmp->pm_BytesPerSec; /* * fill in time and date so that dos2unixtime() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_Time = 0x0000; /* 00:00:00 */ ldep->de_Date = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ /* leave the other fields as garbage */ } else { bp = NULL; if (!direntptr) { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) return error; } DE_INTERNALIZE(ldep, direntptr); if (bp) brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ ldep->de_pmp = pmp; ldep->de_devvp = pmp->pm_devvp; ldep->de_refcnt = 1; if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure. */ u_long size; nvp->v_type = VDIR; if (ldep->de_StartCluster == MSDOSFSROOT) nvp->v_flag |= VROOT; else { error = pcbmap(ldep, 0xffff, 0, &size); if (error == E2BIG) { ldep->de_FileSize = size << pmp->pm_cnshift; error = 0; } else printf("deget(): pcbmap returned %d\n", error); } } else nvp->v_type = VREG; SETHIGH(ldep->de_modrev, mono_time.tv_sec); SETLOW(ldep->de_modrev, mono_time.tv_usec * 4294); VREF(ldep->de_devvp); *depp = ldep; return 0; } int deupdat(dep, tp, waitfor) struct denode *dep; struct timespec *tp; int waitfor; { int error; struct buf *bp; struct direntry *dirp; struct vnode *vp = DETOV(dep); #ifdef MSDOSFS_DEBUG printf("deupdat(): dep %p\n", dep); #endif /* * If the denode-modified and update-mtime bits are off, * or this denode is from a readonly filesystem, * or this denode is for a directory, * or the denode represents an open but unlinked file, * then don't do anything. DOS directory * entries that describe a directory do not ever get * updated. This is the way DOS treats them. */ if ((dep->de_flag & (DE_MODIFIED | DE_UPDATE)) == 0 || vp->v_mount->mnt_flag & MNT_RDONLY || dep->de_Attributes & ATTR_DIRECTORY || dep->de_refcnt <= 0) return 0; /* * Read in the cluster containing the directory entry we want to * update. */ error = readde(dep, &bp, &dirp); if (error) return error; /* * If the mtime is to be updated, put the passed in time into the * directory entry. */ if (dep->de_flag & DE_UPDATE) { dep->de_Attributes |= ATTR_ARCHIVE; unix2dostime(tp, &dep->de_Date, &dep->de_Time); } /* * The mtime is now up to date. The denode will be unmodifed soon. */ dep->de_flag &= ~(DE_MODIFIED | DE_UPDATE); /* * Copy the directory entry out of the denode into the cluster it * came from. */ DE_EXTERNALIZE(dirp, dep); /* * Write the cluster back to disk. If they asked for us to wait * for the write to complete, then use bwrite() otherwise use * bdwrite(). */ error = 0; /* note that error is 0 from above, but ... */ if (waitfor) error = bwrite(bp); else bdwrite(bp); return error; } /* * Truncate the file described by dep to the length specified by length. 
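detrunc() below isolates the byte offset into the last cluster with length & pm_crbomask and zeroes the tail so stale data cannot reappear if a later seek extends the file. A stand-alone sketch of the mask arithmetic, assuming a power-of-two cluster size; BPCLUSTER and the 5000-byte length are example values:

    #include <stdio.h>

    #define BPCLUSTER 2048u                 /* example cluster size (power of 2) */
    #define CRBOMASK  (BPCLUSTER - 1)       /* byte offset within a cluster */

    int
    main(void)
    {
        unsigned long length = 5000;
        unsigned long boff = length & CRBOMASK; /* tail bytes in last cluster */

        if (boff != 0)
            printf("zero %lu trailing bytes of the last cluster\n",
                BPCLUSTER - boff);
        /* clusters needed to hold `length' bytes, cf. de_clcount() */
        printf("clusters = %lu\n", (length + CRBOMASK) / BPCLUSTER);
        return (0);
    }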
*/ int detrunc(dep, length, flags, cred, p) struct denode *dep; u_long length; int flags; struct ucred *cred; struct proc *p; { int error; int allerror; int vflags; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %d, flags %d\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if (DETOV(dep)->v_flag & VROOT) { printf( "detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); return EINVAL; } - vnode_pager_setsize(DETOV(dep), length); - if (dep->de_FileSize < length) + if (dep->de_FileSize < length) { + vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); + } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. */ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return error; } } fc_purge(dep, (length + pmp->pm_crbomask) >> pmp->pm_cnshift); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { /* * should read from file vnode or filesystem vnode * depending on if file or dir */ if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { bn = de_blk(pmp, length); error = bread(DETOV(dep), bn, pmp->pm_bpcluster, NOCRED, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return error; } /* * is this the right place for it? */ bzero(bp->b_data + boff, pmp->pm_bpcluster - boff); if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; dep->de_flag |= DE_UPDATE; vflags = (length > 0 ? V_SAVE : 0) | V_SAVEMETA; vinvalbuf(DETOV(dep), vflags, cred, p, 0, 0); + vnode_pager_setsize(DETOV(dep), length); TIMEVAL_TO_TIMESPEC(&time, &ts); allerror = deupdat(dep, &ts, 1); #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %d\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. 
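The code below caps the shortened file with CLUST_EOFE and hands the orphaned tail to freeclusterchain(), which follows next-cluster entries until it hits the end marker. A toy in-memory FAT showing that walk; the fat[] array, the cluster numbers and the FAT16-style 0xffff end marker are all illustrative:

    #include <stdio.h>

    #define CLUST_FREE 0x0000
    #define CLUST_EOFE 0xffff       /* end-of-file marker, as in FAT16 */
    #define NCLUST     16

    static unsigned fat[NCLUST];

    static void
    freeclusterchain(unsigned cn)
    {
        unsigned next;

        while (cn != CLUST_EOFE) {
            next = fat[cn];         /* remember the link before clearing it */
            fat[cn] = CLUST_FREE;
            printf("freed cluster %u\n", cn);
            cn = next;
        }
    }

    int
    main(void)
    {
        /* a three-cluster file: 3 -> 7 -> 9 -> EOF */
        fat[3] = 7; fat[7] = 9; fat[9] = CLUST_EOFE;
        freeclusterchain(3);
        return (0);
    }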
*/ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return error; } fc_setcache(dep, FC_LASTFC, (length - 1) >> pmp->pm_cnshift, eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(chaintofree)) freeclusterchain(pmp, chaintofree); return allerror; } /* * Extend the file described by dep to length specified by length. */ int deextend(dep, length, cred) struct denode *dep; off_t length; struct ucred *cred; { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; struct timespec ts; /* * The root of a DOS filesystem cannot be extended. */ if (DETOV(dep)->v_flag & VROOT) return EINVAL; /* * Directories can only be extended by the superuser. * Is this really important? */ if (dep->de_Attributes & ATTR_DIRECTORY) { error = suser(cred, NULL); if (error) return error; } if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return ENOSPC; error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred, NULL); return error; } } dep->de_flag |= DE_UPDATE; dep->de_FileSize = length; TIMEVAL_TO_TIMESPEC(&time, &ts); return deupdat(dep, &ts, 1); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. */ int reinsert(dep) struct denode *dep; { /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) { msdosfs_hashrem(dep); msdosfs_hashins(dep); } return 0; } int msdosfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_reclaim(): pushing active", vp); /* * Remove the denode from the denode hash chain we are in. */ msdosfs_hashrem(dep); cache_purge(vp); /* * Indicate that one less file on the filesystem is open. */ if (dep->de_devvp) { vrele(dep->de_devvp); dep->de_devvp = 0; } dep->de_flag = 0; FREE(dep, M_MSDOSFSNODE); vp->v_data = NULL; return 0; } int msdosfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_inactive(): pushing active", vp); /* * Get rid of denodes related to stale file handles. Hmmm, what * does this really do? 
*/ if (dep->de_Name[0] == SLOT_DELETED) { if ((vp->v_flag & VXLOCK) == 0) vgone(vp); return 0; } /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %x, MNT_RDONLY %x\n", dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY); #endif VOP_LOCK(vp); if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED, NULL); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } if (dep->de_flag & (DE_MODIFIED | DE_UPDATE)) { TIMEVAL_TO_TIMESPEC(&time, &ts); deupdat(dep, &ts, 0); } VOP_UNLOCK(vp); dep->de_flag = 0; /* * If we are done with the denode, then reclaim it so that it can * be reused now. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vp->v_usecount, dep->de_Name[0]); #endif if (vp->v_usecount == 0 && dep->de_Name[0] == SLOT_DELETED) vgone(vp); return error; } Index: head/sys/fs/procfs/procfs_mem.c =================================================================== --- head/sys/fs/procfs/procfs_mem.c (revision 13489) +++ head/sys/fs/procfs/procfs_mem.c (revision 13490) @@ -1,246 +1,247 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 Sean Eric Fagan * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry and Sean Eric Fagan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)procfs_mem.c 8.4 (Berkeley) 1/21/94 * - * $Id: procfs_mem.c,v 1.13 1995/12/11 04:56:31 dyson Exp $ + * $Id: procfs_mem.c,v 1.14 1995/12/17 07:19:24 bde Exp $ */ /* * This is a lightly hacked and merged version * of sef's pread/pwrite functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int procfs_rwmem __P((struct proc *p, struct uio *uio)); static int procfs_rwmem(p, uio) struct proc *p; struct uio *uio; { int error; int writing; writing = uio->uio_rw == UIO_WRITE; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? */ do { vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; vm_offset_t uva; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; vm_page_t m; boolean_t wired, single_use; vm_pindex_t pindex; u_int len; int fix_prot; uva = (vm_offset_t) uio->uio_offset; if (uva >= VM_MAXUSER_ADDRESS) { if (writing || (uva >= (VM_MAXUSER_ADDRESS + UPAGES * PAGE_SIZE))) { error = 0; break; } } /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * Check the permissions for the area we're interested * in. */ fix_prot = 0; if (writing) fix_prot = !vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_WRITE); if (fix_prot) { /* * If the page is not writable, we make it so. * XXX It is possible that a page may *not* be * read/executable, if a process changes that! * We will assume, for now, that a page is either * VM_PROT_ALL, or VM_PROT_READ|VM_PROT_EXECUTE. */ error = vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_ALL, 0); if (error) break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, writing ? VM_PROT_WRITE : VM_PROT_READ, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); /* * We're done with tmap now. */ if (!error) vm_map_lookup_done(tmap, out_entry); /* * Fault the page in... */ if (!error && writing && object->backing_object) { m = vm_page_lookup(object, pindex); if (m == 0) error = vm_fault(map, pageno, VM_PROT_WRITE, FALSE); } /* Find space in kernel_map for the page we're interested in */ if (!error) error = vm_map_find(kernel_map, object, - IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1); + IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); if (!error) { /* * Neither vm_map_lookup() nor vm_map_find() appear * to add a reference count to the object, so we do * that here and now. */ vm_object_reference(object); /* * Mark the page we just found as pageable. */ error = vm_map_pageable(kernel_map, kva, kva + PAGE_SIZE, 0); /* * Now do the i/o move. */ if (!error) error = uiomove((caddr_t)(kva + page_offset), len, uio); vm_map_remove(kernel_map, kva, kva + PAGE_SIZE); } if (fix_prot) vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_READ|VM_PROT_EXECUTE, 0); } while (error == 0 && uio->uio_resid > 0); return (error); } /* * Copy data in and out of the target process. 
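procfs_rwmem() above deliberately moves at most one page per loop iteration, clipping each transfer to min(PAGE_SIZE - page_offset, resid). A sketch of just that chunking over an ordinary buffer, with memcpy() standing in for the map-and-uiomove step:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096ul

    /* Copy `resid' bytes starting at offset `uva', one page per pass. */
    static void
    copy_paged(const char *src, char *dst, unsigned long uva,
        unsigned long resid)
    {
        unsigned long pageno, page_offset, len;

        while (resid > 0) {
            pageno = uva & ~(PAGE_SIZE - 1);    /* trunc_page() */
            page_offset = uva - pageno;
            len = PAGE_SIZE - page_offset;      /* rest of this page ... */
            if (len > resid)
                len = resid;                    /* ... or what remains */
            /* stands in for mapping the page and uiomove()ing it */
            memcpy(dst, src + uva, len);
            dst += len;
            uva += len;
            resid -= len;
        }
    }

    int
    main(void)
    {
        static char src[3 * 4096], dst[3 * 4096];

        memset(src, 'x', sizeof(src));
        copy_paged(src, dst, 100, 8000);        /* spans three pages */
        printf("%c%c\n", dst[0], dst[7999]);
        return (0);
    }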
* We do this by mapping the process's page into * the kernel and then doing a uiomove direct * from the kernel address space. */ int procfs_domem(curp, p, pfs, uio) struct proc *curp; struct proc *p; struct pfsnode *pfs; struct uio *uio; { int error; if (uio->uio_resid == 0) return (0); error = procfs_rwmem(p, uio); return (error); } /* * Given process (p), find the vnode from which * its text segment is being executed. * * It would be nice to grab this information from * the VM system, however, there is no sure-fire * way of doing that. Instead, fork(), exec() and * wait() all maintain the p_textvp field in the * process proc structure which contains a held * reference to the exec'ed vnode. */ struct vnode * procfs_findtextvp(p) struct proc *p; { return (p->p_textvp); } Index: head/sys/gnu/ext2fs/ext2_bmap.c =================================================================== --- head/sys/gnu/ext2fs/ext2_bmap.c (revision 13489) +++ head/sys/gnu/ext2fs/ext2_bmap.c (revision 13490) @@ -1,317 +1,317 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 - * $Id: ufs_bmap.c,v 1.9 1995/09/04 00:21:09 dyson Exp $ + * $Id: ufs_bmap.c,v 1.10 1995/11/05 23:07:37 dyson Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Bmap converts the logical block number of a file to its physical block * number on the disk.
The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned). */ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; register daddr_t bn; daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ *runp = 0; maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; } if (runb) { *runb = 0; } xap = ap == NULL ? a : ap; if (!nump) nump = &num; error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it.
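ufs_getlbns() further below decides how many levels of indirection a logical block needs by multiplying blockcnt by the per-block fanout until the remaining block number fits. A stand-alone version of that loop; NDADDR and NIADDR match the comments above, and the 2048-entry fanout is an example value for MNINDIR(ump):

    #include <stdio.h>

    #define NDADDR  12      /* direct blocks in the inode */
    #define NIADDR  3       /* single, double, triple indirect */
    #define MNINDIR 2048    /* block pointers per indirect block (example) */

    static int
    levels(long bn)
    {
        long long blockcnt;
        int i;

        if (bn < NDADDR)
            return (0);     /* direct block, no indirection */
        bn -= NDADDR;
        for (blockcnt = 1, i = NIADDR;; i--, bn -= blockcnt) {
            if (i == 0)
                return (-1);        /* EFBIG: beyond triple indirect */
            blockcnt *= MNINDIR;    /* data blocks reachable at this level */
            if (bn < blockcnt)
                break;
        }
        return (NIADDR - i + 1);    /* 1 = single, 2 = double, 3 = triple */
    }

    int
    main(void)
    {
        printf("%d %d %d\n", levels(5), levels(NDADDR),
            levels(NDADDR + 3000));
        return (0);
    }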
*/ if (bp) - brelse(bp); + bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparry: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], ((daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) - brelse(bp); + bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. */ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; register daddr_t bn; struct indir *ap; int *nump; { long metalbn, realbn; struct ufsmount *ump; int blockcnt, i, numlevels, off; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the given level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); blockcnt *= MNINDIR(ump); if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; blockcnt /= MNINDIR(ump); off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; } if (nump) *nump = numlevels; return (0); } Index: head/sys/gnu/ext2fs/ext2_inode.c =================================================================== --- head/sys/gnu/ext2fs/ext2_inode.c (revision 13489) +++ head/sys/gnu/ext2fs/ext2_inode.c (revision 13490) @@ -1,550 +1,551 @@ /* * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ext2_inode.c 8.5 (Berkeley) 12/30/93 */ #if !defined(__FreeBSD__) #include "quota.h" #include "diagnostic.h" #else #include "opt_quota.h" #endif #include #include #include #include #include #include #include #include #include #if !defined(__FreeBSD__) #include #endif #include #include #include #include #include #include #include #include #include #include #include static int ext2_indirtrunc __P((struct inode *, daddr_t, daddr_t, daddr_t, int, long *)); int ext2_init() { return (ufs_init()); } /* * Update the access, modified, and inode change times as specified by the * IACCESS, IUPDATE, and ICHANGE flags respectively. The IMODIFIED flag is * used to specify that the inode needs to be updated but that the times have * already been set. The access and modified times are taken from the second * and third parameters; the inode change time is always taken from the current * time. If waitfor is set, then wait for the disk write of the inode to * complete. 
*/ int ext2_update(ap) struct vop_update_args /* { struct vnode *a_vp; struct timeval *a_access; struct timeval *a_modify; int a_waitfor; } */ *ap; { register struct ext2_sb_info *fs; struct buf *bp; struct inode *ip; int error; #if !defined(__FreeBSD__) struct timeval time; #endif ip = VTOI(ap->a_vp); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); return (0); } if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) return (0); if (ip->i_flag & IN_ACCESS) ip->i_atime.ts_sec = ap->a_access->tv_sec; if (ip->i_flag & IN_UPDATE) { ip->i_mtime.ts_sec = ap->a_modify->tv_sec; ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) { #if !defined(__FreeBSD__) get_time(&time); #endif ip->i_ctime.ts_sec = time.tv_sec; } ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); fs = ip->i_e2fs; if (error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->s_blocksize, NOCRED, &bp)) { brelse(bp); return (error); } ext2_di2ei( &ip->i_din, (struct ext2_inode *) ((char *)bp->b_data + EXT2_INODE_SIZE * ino_to_fsbo(fs, ip->i_number))); /* if (ap->a_waitfor && (ap->a_vp->v_mount->mnt_flag & MNT_ASYNC) == 0) return (bwrite(bp)); else { */ bdwrite(bp); return (0); /* } */ } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ int ext2_truncate(ap) struct vop_truncate_args /* { struct vnode *a_vp; off_t a_length; int a_flags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *ovp = ap->a_vp; register daddr_t lastblock; register struct inode *oip; daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; off_t length = ap->a_length; register struct ext2_sb_info *fs; struct buf *bp; int offset, size, level; long count, nblocks, vflags, blocksreleased = 0; struct timeval tv; register int i; int aflags, error, allerror; off_t osize; /* printf("ext2_truncate called %d to %d\n", VTOI(ovp)->i_number, ap->a_length); */ /* * negative file sizes will totally break the code below and * are not meaningful anyway. */ if (length < 0) return EFBIG; oip = VTOI(ovp); #if defined(__FreeBSD__) tv = time; #else get_time(&tv); #endif if (ovp->v_type == VLNK && oip->i_size < ovp->v_mount->mnt_maxsymlinklen) { #if DIAGNOSTIC if (length != 0) panic("ext2_truncate: partial truncate of symlink"); #endif bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); oip->i_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 0)); } #if QUOTA if (error = getinoquota(oip)) return (error); #endif - vnode_pager_setsize(ovp, (u_long)length); fs = oip->i_e2fs; osize = oip->i_size; ext2_discard_prealloc(oip); /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1.
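 * For example, with 1k blocks, growing a file to length 2500 gives
 * lbn = lblkno(fs, 2499) == 2 and offset = blkoff(fs, 2499) == 451, so
 * ext2_balloc() below is asked to make offset + 1 == 452 bytes of logical
 * block 2 valid, which covers the new last byte of the file.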
*/ if (osize < length) { offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; + vnode_pager_setsize(ovp, length); if (error = ext2_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, aflags)) return (error); oip->i_size = length; #if !defined(__FreeBSD__) (void) vnode_pager_uncache(ovp); #endif if (aflags & IO_SYNC) bwrite(bp); else bawrite(bp); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zero'ed in case it ever becomes accessible again because * of subsequent file growth. */ /* I don't understand the comment above */ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; if (error = ext2_balloc(oip, lbn, offset, ap->a_cred, &bp, aflags)) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); #if !defined(__FreeBSD__) (void) vnode_pager_uncache(ovp); #endif bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (aflags & IO_SYNC) bwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->s_blocksize - 1) - 1; lastiblock[SINGLE] = lastblock - NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->s_blocksize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ext2_indirtrunc below. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks); for (level = TRIPLE; level >= SINGLE; level--) if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } for (i = NDADDR - 1; i > lastblock; i--) oip->i_db[i] = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; if (error = VOP_UPDATE(ovp, &tv, &tv, MNT_WAIT)) allerror = error; /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); oip->i_size = osize; vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ext2_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ext2_blkfree(oip, bn, fs->s_frag_size); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags.
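 * For example, truncating a file that used all NDADDR (12) direct blocks
 * down to two blocks leaves lastblock == 1, so the loop below clears and
 * frees i_db[11] through i_db[2]; truncating to 0 gives lastblock == -1
 * and releases every direct block.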
*/ for (i = NDADDR - 1; i > lastblock; i--) { register long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ext2_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("itrunc: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. */ bn += numfrags(fs, newspace); ext2_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #if DIAGNOSTIC for (level = SINGLE; level <= TRIPLE; level++) if (newblks[NDADDR + level] != oip->i_ib[level]) panic("itrunc1"); for (i = 0; i < NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("itrunc2"); if (length == 0 && (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) panic("itrunc3"); #endif /* DIAGNOSTIC */ /* * Put back the real size. */ oip->i_size = length; oip->i_blocks -= blocksreleased; if (oip->i_blocks < 0) /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; + vnode_pager_setsize(ovp, length); #if QUOTA (void) chkdq(oip, -blocksreleased, NOCRED, 0); #endif return (allerror); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ext2_indirtrunc(ip, lbn, dbn, lastbn, level, countp) register struct inode *ip; daddr_t lbn, lastbn; daddr_t dbn; int level; long *countp; { register int i; struct buf *bp; register struct ext2_sb_info *fs = ip->i_e2fs; register daddr_t *bap; struct vnode *vp; daddr_t *copy, nb, nlbn, last; long blkcount, factor; int nblocks, blocksreleased = 0; int error = 0, allerror = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->s_blocksize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->s_blocksize, 0, 0); if (bp->b_flags & (B_DONE | B_DELWRI)) { /* Braces must be here in case trace evaluates to nothing. 
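 * A short illustration of the pitfall: on FreeBSD the #if !defined
 * block below compiles away entirely, so without the braces the
 * conditional would reduce to
 *
 *	if (bp->b_flags & (B_DONE | B_DELWRI))
 *	else { ... }
 *
 * which is a syntax error. The empty braced block keeps a valid
 * statement in place under either compilation.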
*/ #if !defined(__FreeBSD__) trace(TR_BREADHIT, pack(vp, fs->s_blocksize), lbn); #endif } else { #if !defined(__FreeBSD__) trace(TR_BREADMISS, pack(vp, fs->s_blocksize), lbn); get_proc()->p_stats->p_ru.ru_inblock++; /* pay for read */ #endif bp->b_flags |= B_READ; if (bp->b_bcount > bp->b_bufsize) panic("ext2_indirtrunc: bad buffer size"); bp->b_blkno = dbn; #if defined(__FreeBSD__) vfs_busy_pages(bp, 0); #endif VOP_STRATEGY(bp); error = biowait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (daddr_t *)bp->b_data; MALLOC(copy, daddr_t *, fs->s_blocksize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->s_blocksize); bzero((caddr_t)&bap[last + 1], (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; error = bwrite(bp); if (error) allerror = error; bap = copy; /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = bap[i]; if (nb == 0) continue; if (level > SINGLE) { if (error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (daddr_t)-1, level - 1, &blkcount)) allerror = error; blocksreleased += blkcount; } ext2_blkfree(ip, nb, fs->s_blocksize); blocksreleased += nblocks; } /* * Recursively free last partial block. */ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = bap[i]; if (nb != 0) { if (error = ext2_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount)) allerror = error; blocksreleased += blkcount; } } FREE(copy, M_TEMP); *countp = blocksreleased; return (allerror); } /* * discard preallocated blocks */ int ext2_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { ext2_discard_prealloc(VTOI(ap->a_vp)); return ufs_inactive(ap); } Index: head/sys/gnu/ext2fs/ext2_readwrite.c =================================================================== --- head/sys/gnu/ext2fs/ext2_readwrite.c (revision 13489) +++ head/sys/gnu/ext2fs/ext2_readwrite.c (revision 13490) @@ -1,323 +1,326 @@ /* * modified for Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /* * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94 */ #if !defined(__FreeBSD__) #include "diagnostic.h" #endif #define BLKSIZE(a, b, c) blksize(a, b, c) #define FS struct ext2_sb_info #define I_FS i_e2fs #define READ ext2_read #define READ_S "ext2_read" #define WRITE ext2_write #define WRITE_S "ext2_write" /* * Vnode op for reading. */ /* ARGSUSED */ static int READ(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct inode *ip; register struct uio *uio; register FS *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; u_short mode; vp = ap->a_vp; ip = VTOI(vp); mode = ip->i_mode; uio = ap->a_uio; #if DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("%s: mode", READ_S); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("%s: short symlink", READ_S); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", READ_S, vp->v_type); #endif fs = ip->I_FS; #if 0 if ((u_quad_t)uio->uio_offset > fs->fs_maxfilesize) return (EFBIG); #endif for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = BLKSIZE(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->s_frag_size - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; if (lblktosize(fs, nextlbn) > ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if (doclusterread) error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); else if (lbn - 1 == vp->v_lastr) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); - if (error) + if (error) { + brelse(bp); + bp = NULL; break; + } vp->v_lastr = lbn; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. */ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; #if !defined(__FreeBSD__) if (S_ISREG(mode) && (xfersize + blkoffset == fs->s_frag_size || uio->uio_offset == ip->i_size)) bp->b_flags |= B_AGE; #endif - brelse(bp); + bqrelse(bp); } if (bp != NULL) - brelse(bp); + bqrelse(bp); ip->i_flag |= IN_ACCESS; return (error); } /* * Vnode op for writing. 
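 * (A note on the release discipline in the read path above, which now
 * returns error-free buffers with bqrelse() instead of brelse(); a
 * minimal sketch, assuming the 4.4BSD-style buffer-cache interfaces:
 *
 *	error = bread(vp, lbn, size, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);	(I/O failed: discard the buffer)
 *		return (error);
 *	}
 *	uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 *	bqrelse(bp);	(data still valid: requeue it cheaply)
 *
 * so that sequentially re-read blocks stay in the cache.)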
*/ static int WRITE(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct uio *uio; register struct inode *ip; register FS *fs; struct buf *bp; struct proc *p; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, xfersize; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); #if DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("%s: mode", WRITE_S); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: if ((ioflag & IO_SYNC) == 0) panic("%s: nonsync dir write", WRITE_S); break; default: panic("%s: type", WRITE_S); } fs = ip->I_FS; #if 0 if (uio->uio_offset < 0 || (u_quad_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); #endif /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ p = uio->uio_procp; if (vp->v_type == VREG && p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); return (EFBIG); } resid = uio->uio_resid; osize = ip->i_size; flags = ioflag & IO_SYNC ? B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->s_frag_size - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; #if defined(__FreeBSD__) if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, (u_long)uio->uio_offset + xfersize); #endif if (fs->s_frag_size > xfersize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; error = ext2_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); if (error) break; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; #if !defined(__FreeBSD__) vnode_pager_setsize(vp, (u_long)ip->i_size); #endif } #if !defined(__FreeBSD__) (void)vnode_pager_uncache(vp); #endif size = BLKSIZE(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; if (ioflag & IO_SYNC) (void)bwrite(bp); else if (xfersize + blkoffset == fs->s_frag_size) { if (doclusterwrite) { #if defined(__FreeBSD__) bp->b_flags |= B_CLUSTEROK; #endif cluster_write(bp, ip->i_size); } else { #if !defined(__FreeBSD__) bp->b_flags |= B_AGE; #endif bawrite(bp); } } else { #if defined(__FreeBSD__) if (doclusterwrite) bp->b_flags |= B_CLUSTEROK; #endif bdwrite(bp); } if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
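 * Concretely, resid holds the originally requested byte count and
 * uio->uio_resid what is still unwritten, so resid > uio->uio_resid holds
 * exactly when at least one byte reached the file. For a hypothetical
 * 3000-byte write that fails after 1536 bytes, uio_resid ends at 1464;
 * the IO_UNIT branch below then truncates back to osize, rewinds
 * uio_offset by 3000 - 1464 == 1536, and restores uio_resid to 3000.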
*/ if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_mode &= ~(ISUID | ISGID); if (error) { if (ioflag & IO_UNIT) { (void)VOP_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { struct timeval tv; #if !defined(__FreeBSD__) get_time(&tv); #else tv = time; #endif error = VOP_UPDATE(vp, &tv, &tv, 1); } return (error); } Index: head/sys/i386/i386/machdep.c =================================================================== --- head/sys/i386/i386/machdep.c (revision 13489) +++ head/sys/i386/i386/machdep.c (revision 13490) @@ -1,1820 +1,1820 @@ /*- * Copyright (c) 1992 Terrence R. Lambert. * Copyright (c) 1982, 1987, 1990 The Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE.
* * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91 - * $Id: machdep.c,v 1.168 1996/01/04 21:10:53 wollman Exp $ + * $Id: machdep.c,v 1.169 1996/01/05 20:12:19 wollman Exp $ */ #include "npx.h" #include "isa.h" #include "opt_sysvipc.h" #include "opt_ddb.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SYSVSHM #include #endif #ifdef SYSVMSG #include #endif #ifdef SYSVSEM #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern void init386 __P((int first)); extern int ptrace_set_pc __P((struct proc *p, unsigned int addr)); extern int ptrace_single_step __P((struct proc *p)); extern int ptrace_write_u __P((struct proc *p, vm_offset_t off, int data)); extern void dblfault_handler __P((void)); extern void i486_bzero __P((void *, size_t)); extern void i586_bzero __P((void *, size_t)); extern void i686_bzero __P((void *, size_t)); static void cpu_startup __P((void *)); SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL) static void identifycpu(void); char machine[] = "i386"; SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, ""); static char cpu_model[128]; SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0, ""); struct kern_devconf kdc_cpu0 = { 0, 0, 0, /* filled in by dev_attach */ "cpu", 0, { MDDT_CPU }, 0, 0, 0, CPU_EXTERNALLEN, 0, /* CPU has no parent */ 0, /* no parentdata */ DC_BUSY, /* the CPU is always busy */ cpu_model, /* no sense in duplication */ DC_CLS_CPU /* class */ }; #ifndef PANIC_REBOOT_WAIT_TIME #define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ #endif #ifdef BOUNCE_BUFFERS extern char *bouncememory; extern int maxbkva; #ifdef BOUNCEPAGES int bouncepages = BOUNCEPAGES; #else int bouncepages = 0; #endif #endif /* BOUNCE_BUFFERS */ extern int freebufspace; int msgbufmapped = 0; /* set when safe to use msgbuf */ int _udatasel, _ucodesel; int physmem = 0; static int sysctl_hw_physmem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem), req); return (error); } SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_physmem, "I", ""); static int sysctl_hw_usermem SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, 0, ctob(physmem - cnt.v_wire_count), req); return (error); } SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_hw_usermem, "I", ""); int boothowto = 0, bootverbose = 0, Maxmem = 0; static int badpages = 0; long dumplo; extern int bootdev; vm_offset_t phys_avail[10]; /* must be 2 less so 0 0 can signal end of chunks */ #define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2) int cpu_class; static void dumpsys __P((void)); static void setup_netisrs __P((struct linker_set *)); /* XXX declare elsewhere */ static vm_offset_t buffer_sva, buffer_eva; vm_offset_t clean_sva, clean_eva; static vm_offset_t pager_sva, pager_eva; extern struct linker_set netisr_set; #define offsetof(type, member) ((size_t)(&((type *)0)->member)) static void cpu_startup(dummy) void *dummy; { register unsigned i; register caddr_t v; vm_offset_t maxaddr; vm_size_t size = 0; int firstaddr; vm_offset_t minaddr; if (boothowto & RB_VERBOSE) bootverbose++; /* * Initialize 
error message buffer (at end of core). */ /* avail_end was pre-decremented in init_386() to compensate */ for (i = 0; i < btoc(sizeof (struct msgbuf)); i++) pmap_enter(pmap_kernel(), (vm_offset_t)msgbufp, avail_end + i * NBPG, VM_PROT_ALL, TRUE); msgbufmapped = 1; /* * Good {morning,afternoon,evening,night}. */ printf(version); startrtclock(); identifycpu(); printf("real memory = %d (%dK bytes)\n", ptoa(Maxmem), ptoa(Maxmem) / 1024); /* * Display any holes after the first chunk of extended memory. */ if (badpages != 0) { int indx = 1; /* * XXX skip reporting ISA hole & unmanaged kernel memory */ if (phys_avail[0] == PAGE_SIZE) indx += 2; printf("Physical memory hole(s):\n"); for (; phys_avail[indx + 1] != 0; indx += 2) { int size = phys_avail[indx + 1] - phys_avail[indx]; printf("0x%08lx - 0x%08lx, %d bytes (%d pages)\n", phys_avail[indx], phys_avail[indx + 1] - 1, size, size / PAGE_SIZE); } } /* * Quickly wire in netisrs. */ setup_netisrs(&netisr_set); /* #ifdef ISDN DONET(isdnintr, NETISR_ISDN); #endif */ /* * Allocate space for system data structures. * The first available kernel virtual address is in "v". * As pages of kernel virtual memory are allocated, "v" is incremented. * As pages of memory are allocated and cleared, * "firstaddr" is incremented. * An index into the kernel page table corresponding to the * virtual memory address maintained in "v" is kept in "mapaddr". */ /* * Make two passes. The first pass calculates how much memory is * needed and allocates it. The second pass assigns virtual * addresses to the various data structures. */ firstaddr = 0; again: v = (caddr_t)firstaddr; #define valloc(name, type, num) \ (name) = (type *)v; v = (caddr_t)((name)+(num)) #define valloclim(name, type, num, lim) \ (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num))) valloc(callout, struct callout, ncallout); #ifdef SYSVSHM valloc(shmsegs, struct shmid_ds, shminfo.shmmni); #endif #ifdef SYSVSEM valloc(sema, struct semid_ds, seminfo.semmni); valloc(sem, struct sem, seminfo.semmns); /* This is pretty disgusting! 
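The valloc()/valloclim() macros above only bump a cursor; the sizing trick is the two-pass loop they sit in. A condensed sketch of the idiom (same names as cpu_startup(); any table added between the passes must appear identically in both, or the consistency check fires):

	/*
	 * Pass 1: firstaddr == 0, so the valloc()s merely measure the
	 * total.  Pass 2: rerun against real memory to hand out the
	 * final addresses.
	 */
	firstaddr = 0;
again:
	v = (caddr_t)firstaddr;
	valloc(callout, struct callout, ncallout);
	/* ... every other valloc() goes here, in both passes ... */
	if (firstaddr == 0) {
		size = (vm_size_t)(v - firstaddr);
		firstaddr = (int)kmem_alloc(kernel_map, round_page(size));
		if (firstaddr == 0)
			panic("startup: no room for tables");
		goto again;
	}
	if ((vm_size_t)(v - firstaddr) != size)
		panic("startup: table size inconsistency");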
*/ valloc(semu, int, (seminfo.semmnu * seminfo.semusz) / sizeof(int)); #endif #ifdef SYSVMSG valloc(msgpool, char, msginfo.msgmax); valloc(msgmaps, struct msgmap, msginfo.msgseg); valloc(msghdrs, struct msg, msginfo.msgtql); valloc(msqids, struct msqid_ds, msginfo.msgmni); #endif if (nbuf == 0) { nbuf = 30; if( physmem > 1024) nbuf += min((physmem - 1024) / 12, 1024); } nswbuf = min(nbuf, 128); valloc(swbuf, struct buf, nswbuf); valloc(buf, struct buf, nbuf); #ifdef BOUNCE_BUFFERS /* * If there is more than 16MB of memory, allocate some bounce buffers */ if (Maxmem > 4096) { if (bouncepages == 0) { bouncepages = 64; bouncepages += ((Maxmem - 4096) / 2048) * 32; } v = (caddr_t)((vm_offset_t)((vm_offset_t)v + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)); valloc(bouncememory, char, bouncepages * PAGE_SIZE); } #endif /* * End of first pass, size has been calculated so allocate memory */ if (firstaddr == 0) { size = (vm_size_t)(v - firstaddr); firstaddr = (int)kmem_alloc(kernel_map, round_page(size)); if (firstaddr == 0) panic("startup: no room for tables"); goto again; } /* * End of second pass, addresses have been assigned */ if ((vm_size_t)(v - firstaddr) != size) panic("startup: table size inconsistency"); #ifdef BOUNCE_BUFFERS clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + maxbkva + pager_map_size, TRUE); io_map = kmem_suballoc(clean_map, &minaddr, &maxaddr, maxbkva, FALSE); #else clean_map = kmem_suballoc(kernel_map, &clean_sva, &clean_eva, (nbuf*MAXBSIZE) + (nswbuf*MAXPHYS) + pager_map_size, TRUE); #endif buffer_map = kmem_suballoc(clean_map, &buffer_sva, &buffer_eva, (nbuf*MAXBSIZE), TRUE); pager_map = kmem_suballoc(clean_map, &pager_sva, &pager_eva, (nswbuf*MAXPHYS) + pager_map_size, TRUE); exec_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (16*ARG_MAX), TRUE); u_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, (maxproc*UPAGES*PAGE_SIZE), FALSE); /* * Finally, allocate mbuf pool. Since mclrefcnt is an off-size * we use the more space efficient malloc in place of kmem_alloc. */ mclrefcnt = (char *)malloc(nmbclusters+CLBYTES/MCLBYTES, M_MBUF, M_NOWAIT); bzero(mclrefcnt, nmbclusters+CLBYTES/MCLBYTES); mb_map = kmem_suballoc(kmem_map, (vm_offset_t *)&mbutl, &maxaddr, nmbclusters * MCLBYTES, FALSE); /* * Initialize callouts */ callfree = callout; for (i = 1; i < ncallout; i++) callout[i-1].c_next = &callout[i]; if (boothowto & RB_CONFIG) { userconfig(); cninit(); /* the preferred console may have changed */ } #ifdef BOUNCE_BUFFERS /* * init bounce buffers */ vm_bounce_init(); #endif /* * XXX allocate a contiguous area for ISA (non busmaster) DMA * operations. This _should_ only be done if the DMA channels * will actually be used, but for now we do it always. */ #define DMAPAGES 8 isaphysmem = vm_page_alloc_contig(DMAPAGES * PAGE_SIZE, 0, 0xfffffful, 64*1024); printf("avail memory = %d (%dK bytes)\n", ptoa(cnt.v_free_count), ptoa(cnt.v_free_count) / 1024); /* * Set up buffers, so they can be used to read disk labels. */ bufinit(); vm_pager_bufferinit(); /* * In verbose mode, print out the BIOS's idea of the disk geometries. */ if (bootverbose) { printf("BIOS Geometries:\n"); for (i = 0; i < N_BIOS_GEOM; i++) { unsigned long bios_geom; int max_cylinder, max_head, max_sector; bios_geom = bootinfo.bi_bios_geom[i]; /* * XXX the bootstrap punts a 1200K floppy geometry * when the get-disk-geometry interrupt fails. Skip * drives that have this geometry. 
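Each bootinfo.bi_bios_geom[] word decoded below packs an INT 13h maximum C/H/S triple into byte-sized fields; the layout the shifts assume, restated:

	/*
	 * bios_geom field layout (maximum values, not counts):
	 *   bits 16..31  max cylinder   (cylinders number from 0)
	 *   bits  8..15  max head       (heads number from 0)
	 *   bits  0..7   max sector     (sectors number from 1)
	 * So 0x4f010f decodes to 79/1/15: the 80-cylinder, 2-head,
	 * 15-sector 1200K floppy geometry the bootstrap fakes when the
	 * BIOS get-disk-geometry call fails, which is why it is skipped.
	 */
	max_cylinder = bios_geom >> 16;		/* count = max + 1 */
	max_head = (bios_geom >> 8) & 0xff;	/* count = max + 1 */
	max_sector = bios_geom & 0xff;		/* count = max     */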
*/ if (bios_geom == 0x4f010f) continue; printf(" %x:%08lx ", i, bios_geom); max_cylinder = bios_geom >> 16; max_head = (bios_geom >> 8) & 0xff; max_sector = bios_geom & 0xff; printf( "0..%d=%d cylinders, 0..%d=%d heads, 1..%d=%d sectors\n", max_cylinder, max_cylinder + 1, max_head, max_head + 1, max_sector, max_sector); } printf(" %d accounted for\n", bootinfo.bi_n_bios_used); } } int register_netisr(num, handler) int num; netisr_t *handler; { if (num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs)) ) { printf("register_netisr: bad isr number: %d\n", num); return (EINVAL); } netisrs[num] = handler; return (0); } static void setup_netisrs(ls) struct linker_set *ls; { int i; const struct netisrtab *nit; for(i = 0; ls->ls_items[i]; i++) { nit = (const struct netisrtab *)ls->ls_items[i]; register_netisr(nit->nit_num, nit->nit_isr); } } static struct cpu_nameclass i386_cpus[] = { { "Intel 80286", CPUCLASS_286 }, /* CPU_286 */ { "i386SX", CPUCLASS_386 }, /* CPU_386SX */ { "i386DX", CPUCLASS_386 }, /* CPU_386 */ { "i486SX", CPUCLASS_486 }, /* CPU_486SX */ { "i486DX", CPUCLASS_486 }, /* CPU_486 */ { "Pentium", CPUCLASS_586 }, /* CPU_586 */ { "Cy486DLC", CPUCLASS_486 }, /* CPU_486DLC */ { "Pentium Pro", CPUCLASS_686 }, /* CPU_686 */ }; static void identifycpu() { printf("CPU: "); if (cpu >= 0 && cpu < (sizeof i386_cpus/sizeof(struct cpu_nameclass))) { cpu_class = i386_cpus[cpu].cpu_class; strncpy(cpu_model, i386_cpus[cpu].cpu_name, sizeof cpu_model); } else { printf("unknown cpu type %d\n", cpu); panic("startup: bad cpu id"); } #if defined(I586_CPU) || defined(I686_CPU) if (cpu_class == CPUCLASS_586 || cpu_class == CPUCLASS_686) { calibrate_cyclecounter(); } #endif #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if (!strcmp(cpu_vendor,"GenuineIntel")) { if ((cpu_id & 0xf00) > 3) { cpu_model[0] = '\0'; switch (cpu_id & 0x3000) { case 0x1000: strcpy(cpu_model, "Overdrive "); break; case 0x2000: strcpy(cpu_model, "Dual "); break; } switch (cpu_id & 0xf00) { case 0x400: strcat(cpu_model, "i486 "); break; case 0x500: strcat(cpu_model, "Pentium"); /* nb no space */ break; case 0x600: strcat(cpu_model, "Pentium Pro"); break; default: strcat(cpu_model, "unknown"); break; } switch (cpu_id & 0xff0) { case 0x400: strcat(cpu_model, "DX"); break; case 0x410: strcat(cpu_model, "DX"); break; case 0x420: strcat(cpu_model, "SX"); break; case 0x430: strcat(cpu_model, "DX2"); break; case 0x440: strcat(cpu_model, "SL"); break; case 0x450: strcat(cpu_model, "SX2"); break; case 0x470: strcat(cpu_model, "DX2 Write-Back Enhanced"); break; case 0x480: strcat(cpu_model, "DX4"); break; break; } } } #endif printf("%s (", cpu_model); switch(cpu_class) { case CPUCLASS_286: printf("286"); break; #if defined(I386_CPU) case CPUCLASS_386: printf("386"); break; #endif #if defined(I486_CPU) case CPUCLASS_486: printf("486"); bzero = i486_bzero; break; #endif #if defined(I586_CPU) case CPUCLASS_586: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("586"); bzero = i586_bzero; break; #endif #if defined(I686_CPU) case CPUCLASS_686: printf("%d.%02d-MHz ", ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) / 100, ((100 * i586_ctr_rate) >> I586_CTR_RATE_SHIFT) % 100); printf("686"); bzero = i686_bzero; break; #endif default: printf("unknown"); /* will panic below... 
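The masks identifycpu() applies to cpu_id correspond to the standard CPUID signature fields; for reference, the layout those masks assume (stepping/model/family are illustrative locals, not variables from the source):

	/*
	 * cpu_id bit fields as used above:
	 *   bits  0..3   stepping  (cpu_id & 0xf)
	 *   bits  4..7   model     (cpu_id & 0xff0 selects family+model)
	 *   bits  8..11  family    (4 = i486, 5 = Pentium, 6 = Pentium Pro)
	 *   bits 12..13  type      (0x1000 = OverDrive, 0x2000 = dual)
	 */
	stepping = cpu_id & 0xf;
	model = (cpu_id >> 4) & 0xf;
	family = (cpu_id >> 8) & 0xf;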
*/ } printf("-class CPU)\n"); #if defined(I486_CPU) || defined(I586_CPU) || defined(I686_CPU) if(*cpu_vendor) printf(" Origin = \"%s\"",cpu_vendor); if(cpu_id) printf(" Id = 0x%lx",cpu_id); if (!strcmp(cpu_vendor, "GenuineIntel")) { printf(" Stepping=%ld", cpu_id & 0xf); if (cpu_high > 0) { #define FEATUREFMT "\020\001FPU\002VME\003PSE\004MCE\005CX8\006APIC" printf("\n Features=0x%b", cpu_feature, FEATUREFMT); } } /* Avoid ugly blank lines: only print newline when we have to. */ if (*cpu_vendor || cpu_id) printf("\n"); #endif /* * Now that we have told the user what they have, * let them know if that machine type isn't configured. */ switch (cpu_class) { case CPUCLASS_286: /* a 286 should not make it this far, anyway */ #if !defined(I386_CPU) && !defined(I486_CPU) && !defined(I586_CPU) && !defined(I686_CPU) #error This kernel is not configured for one of the supported CPUs #endif #if !defined(I386_CPU) case CPUCLASS_386: #endif #if !defined(I486_CPU) case CPUCLASS_486: #endif #if !defined(I586_CPU) case CPUCLASS_586: #endif #if !defined(I686_CPU) case CPUCLASS_686: #endif panic("CPU class not configured"); default: break; } dev_attach(&kdc_cpu0); } /* * Send an interrupt to process. * * Stack is set up to allow sigcode stored * in u. to call routine, followed by kcall * to sigreturn routine below. After sigreturn * resets the signal mask, the stack, and the * frame pointer, it returns to the user * specified pc, psl. */ void sendsig(catcher, sig, mask, code) sig_t catcher; int sig, mask; unsigned code; { register struct proc *p = curproc; register int *regs; register struct sigframe *fp; struct sigframe sf; struct sigacts *psp = p->p_sigacts; int oonstack; regs = p->p_md.md_regs; oonstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; /* * Allocate and validate space for the signal handler * context. Note that if the stack is in P0 space, the * call to grow() is a nop, and the useracc() check * will fail if the process has not already allocated * the space with a `brk'. */ if ((psp->ps_flags & SAS_ALTSTACK) && (psp->ps_sigstk.ss_flags & SA_ONSTACK) == 0 && (psp->ps_sigonstack & sigmask(sig))) { fp = (struct sigframe *)(psp->ps_sigstk.ss_sp + psp->ps_sigstk.ss_size - sizeof(struct sigframe)); psp->ps_sigstk.ss_flags |= SA_ONSTACK; } else { fp = (struct sigframe *)(regs[tESP] - sizeof(struct sigframe)); } /* * grow() will return FALSE if the fp will not fit inside the stack * and the stack can not be grown. useracc will return FALSE * if access is denied. */ if ((grow(p, (int)fp) == FALSE) || (useracc((caddr_t)fp, sizeof (struct sigframe), B_WRITE) == FALSE)) { /* * Process has trashed its stack; give it an illegal * instruction to halt it in its tracks. */ SIGACTION(p, SIGILL) = SIG_DFL; sig = sigmask(SIGILL); p->p_sigignore &= ~sig; p->p_sigcatch &= ~sig; p->p_sigmask &= ~sig; psignal(p, SIGILL); return; } /* * Build the argument list for the signal handler. 
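The frame assembled below defines what the process sees when the trampoline (pcb_sigc, copied from sigcode at init386() time) dispatches the signal. A sketch of the handler-side view; the name handler is illustrative, the first three arguments follow sf_signum/sf_code/sf_scp, and sf_addr rides along in the frame as a fault-address slot:

	/*
	 * Effect of the trampoline, roughly: call
	 *	(*sf_handler)(sf_signum, sf_code, sf_scp);
	 * then fall into sigreturn(sf_scp) to restore the interrupted
	 * context.  A matching userland handler:
	 */
	void
	handler(sig, code, scp)
		int sig, code;
		struct sigcontext *scp;
	{
		/* scp->sc_pc, scp->sc_sp, scp->sc_mask: interrupted state */
	}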
*/ if (p->p_sysent->sv_sigtbl) { if (sig < p->p_sysent->sv_sigsize) sig = p->p_sysent->sv_sigtbl[sig]; else sig = p->p_sysent->sv_sigsize + 1; } sf.sf_signum = sig; sf.sf_code = code; sf.sf_scp = &fp->sf_sc; sf.sf_addr = (char *) regs[tERR]; sf.sf_handler = catcher; /* save scratch registers */ sf.sf_sc.sc_eax = regs[tEAX]; sf.sf_sc.sc_ebx = regs[tEBX]; sf.sf_sc.sc_ecx = regs[tECX]; sf.sf_sc.sc_edx = regs[tEDX]; sf.sf_sc.sc_esi = regs[tESI]; sf.sf_sc.sc_edi = regs[tEDI]; sf.sf_sc.sc_cs = regs[tCS]; sf.sf_sc.sc_ds = regs[tDS]; sf.sf_sc.sc_ss = regs[tSS]; sf.sf_sc.sc_es = regs[tES]; sf.sf_sc.sc_isp = regs[tISP]; /* * Build the signal context to be used by sigreturn. */ sf.sf_sc.sc_onstack = oonstack; sf.sf_sc.sc_mask = mask; sf.sf_sc.sc_sp = regs[tESP]; sf.sf_sc.sc_fp = regs[tEBP]; sf.sf_sc.sc_pc = regs[tEIP]; sf.sf_sc.sc_ps = regs[tEFLAGS]; /* * Copy the sigframe out to the user's stack. */ if (copyout(&sf, fp, sizeof(struct sigframe)) != 0) { /* * Something is wrong with the stack pointer. * ...Kill the process. */ sigexit(p, SIGILL); }; regs[tESP] = (int)fp; regs[tEIP] = (int)((struct pcb *)kstack)->pcb_sigc; regs[tEFLAGS] &= ~PSL_VM; regs[tCS] = _ucodesel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tSS] = _udatasel; } /* * System call to cleanup state after a signal * has been taken. Reset signal mask and * stack state from context left by sendsig (above). * Return to previous pc and psl as specified by * context left by sendsig. Check carefully to * make sure that the user has not modified the * state to gain improper privileges. */ int sigreturn(p, uap, retval) struct proc *p; struct sigreturn_args /* { struct sigcontext *sigcntxp; } */ *uap; int *retval; { register struct sigcontext *scp; register struct sigframe *fp; register int *regs = p->p_md.md_regs; int eflags; /* * (XXX old comment) regs[tESP] points to the return address. * The user scp pointer is above that. * The return address is faked in the signal trampoline code * for consistency. */ scp = uap->sigcntxp; fp = (struct sigframe *) ((caddr_t)scp - offsetof(struct sigframe, sf_sc)); if (useracc((caddr_t)fp, sizeof (*fp), 0) == 0) return(EINVAL); /* * Don't allow users to change privileged or reserved flags. */ #define EFLAGS_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) eflags = scp->sc_ps; /* * XXX do allow users to change the privileged flag PSL_RF. The * cpu sets PSL_RF in tf_eflags for faults. Debuggers should * sometimes set it there too. tf_eflags is kept in the signal * context during signal handling and there is no other place * to remember it, so the PSL_RF bit may be corrupted by the * signal handler without us knowing. Corruption of the PSL_RF * bit at worst causes one more or one less debugger trap, so * allowing it is fairly harmless. */ if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs[tEFLAGS] & ~PSL_RF)) { #ifdef DEBUG printf("sigreturn: eflags = 0x%x\n", eflags); #endif return(EINVAL); } /* * Don't allow users to load a valid privileged %cs. Let the * hardware check for invalid selectors, excess privilege in * other selectors, invalid %eip's and invalid %esp's. 
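The XOR test in EFLAGS_SECURE() is terse; the same check spelled out as a function (eflags_change_ok is an illustrative name, not part of the source):

	/*
	 * new ^ old has a 1 bit wherever the two flag words differ;
	 * clearing the PSL_USERCHANGE positions (the arithmetic flags
	 * and the other bits a user may change freely) leaves non-zero
	 * exactly when a privileged or reserved bit was touched.
	 * PSL_RF is stripped from both sides first, per the XXX comment
	 * above, so resume-flag noise is tolerated.
	 */
	static int
	eflags_change_ok(new_ef, old_ef)
		int new_ef, old_ef;
	{
		int changed = (new_ef ^ old_ef) & ~PSL_RF;

		return ((changed & ~PSL_USERCHANGE) == 0);
	}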
*/ #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) if (!CS_SECURE(scp->sc_cs)) { #ifdef DEBUG printf("sigreturn: cs = 0x%x\n", scp->sc_cs); #endif trapsignal(p, SIGBUS, T_PROTFLT); return(EINVAL); } /* restore scratch registers */ regs[tEAX] = scp->sc_eax; regs[tEBX] = scp->sc_ebx; regs[tECX] = scp->sc_ecx; regs[tEDX] = scp->sc_edx; regs[tESI] = scp->sc_esi; regs[tEDI] = scp->sc_edi; regs[tCS] = scp->sc_cs; regs[tDS] = scp->sc_ds; regs[tES] = scp->sc_es; regs[tSS] = scp->sc_ss; regs[tISP] = scp->sc_isp; if (useracc((caddr_t)scp, sizeof (*scp), 0) == 0) return(EINVAL); if (scp->sc_onstack & 01) p->p_sigacts->ps_sigstk.ss_flags |= SA_ONSTACK; else p->p_sigacts->ps_sigstk.ss_flags &= ~SA_ONSTACK; p->p_sigmask = scp->sc_mask &~ (sigmask(SIGKILL)|sigmask(SIGCONT)|sigmask(SIGSTOP)); regs[tEBP] = scp->sc_fp; regs[tESP] = scp->sc_sp; regs[tEIP] = scp->sc_pc; regs[tEFLAGS] = eflags; return(EJUSTRETURN); } static int waittime = -1; static struct pcb dumppcb; __dead void boot(howto) int howto; { if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { register struct buf *bp; int iter, nbusy; waittime = 0; printf("\nsyncing disks... "); sync(&proc0, NULL, NULL); for (iter = 0; iter < 20; iter++) { nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; } } if (nbusy == 0) break; printf("%d ", nbusy); DELAY(40000 * iter); } if (nbusy) { /* * Failed to sync all blocks. Indicate this and don't * unmount filesystems (thus forcing an fsck on reboot). */ printf("giving up\n"); #ifdef SHOW_BUSYBUFS nbusy = 0; for (bp = &buf[nbuf]; --bp >= buf; ) { if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { nbusy++; printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno); } } DELAY(5000000); /* 5 seconds */ #endif } else { printf("done\n"); /* * Unmount filesystems */ if (panicstr == 0) vfs_unmountall(); } DELAY(100000); /* wait for console output to finish */ dev_shutdownall(FALSE); } splhigh(); if (howto & RB_HALT) { printf("\n"); printf("The operating system has halted.\n"); printf("Please press any key to reboot.\n\n"); cngetc(); } else { if (howto & RB_DUMP) { if (!cold) { savectx(&dumppcb, 0); dumppcb.pcb_ptd = rcr3(); dumpsys(); } if (PANIC_REBOOT_WAIT_TIME != 0) { if (PANIC_REBOOT_WAIT_TIME != -1) { int loop; printf("Automatic reboot in %d seconds - press a key on the console to abort\n", PANIC_REBOOT_WAIT_TIME); for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { DELAY(1000 * 100); /* 1/10th second */ if (cncheckc()) /* Did user type a key? */ break; } if (!loop) goto die; } } else { /* zero time specified - reboot NOW */ goto die; } printf("--> Press a key on the console to reboot <--\n"); cngetc(); } } die: printf("Rebooting...\n"); DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ cpu_reset(); for(;;) ; /* NOTREACHED */ } /* * Magic number for savecore * * exported (symorder) and used at least by savecore(8) * */ u_long dumpmag = 0x8fca0101UL; static int dumpsize = 0; /* also for savecore */ static int dodump = 1; SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); /* * Doadump comes here after turning off memory management and * getting on the dump stack, either when called above, or by * the auto-restart code. 
*/ static void dumpsys() { if (!dodump) return; if (dumpdev == NODEV) return; if ((minor(dumpdev)&07) != 1) return; if (!(bdevsw[major(dumpdev)])) return; if (!(bdevsw[major(dumpdev)]->d_dump)) return; dumpsize = Maxmem; printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo); printf("dump "); switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { case ENXIO: printf("device bad\n"); break; case EFAULT: printf("device not ready\n"); break; case EINVAL: printf("area improper\n"); break; case EIO: printf("i/o error\n"); break; case EINTR: printf("aborted from console\n"); break; default: printf("succeeded\n"); break; } } /* * Clear registers on exec */ void setregs(p, entry, stack) struct proc *p; u_long entry; u_long stack; { int *regs = p->p_md.md_regs; bzero(regs, sizeof(struct trapframe)); regs[tEIP] = entry; regs[tESP] = stack; regs[tEFLAGS] = PSL_USER | (regs[tEFLAGS] & PSL_T); regs[tSS] = _udatasel; regs[tDS] = _udatasel; regs[tES] = _udatasel; regs[tCS] = _ucodesel; p->p_addr->u_pcb.pcb_flags = 0; /* no fp at all */ load_cr0(rcr0() | CR0_TS); /* start emulating */ #if NNPX > 0 npxinit(__INITIAL_NPXCW__); #endif /* NNPX > 0 */ } static int sysctl_machdep_adjkerntz SYSCTL_HANDLER_ARGS { int error; error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error && req->newptr) resettodr(); return (error); } SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW, &adjkerntz, 0, sysctl_machdep_adjkerntz, "I", ""); SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, ""); SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo, CTLFLAG_RD, &bootinfo, bootinfo, ""); /* * Initialize 386 and configure to run kernel */ /* * Initialize segments & interrupt table */ int currentldt; int _default_ldt; union descriptor gdt[NGDT]; /* global descriptor table */ struct gate_descriptor idt[NIDT]; /* interrupt descriptor table */ union descriptor ldt[NLDT]; /* local descriptor table */ static struct i386tss dblfault_tss; static char dblfault_stack[PAGE_SIZE]; extern struct user *proc0paddr; /* software prototypes -- in more palatable form */ struct soft_segment_descriptor gdt_segs[] = { /* GNULL_SEL 0 Null Descriptor */ { 0x0, /* segment base address */ 0x0, /* length */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GCODE_SEL 1 Code Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GDATA_SEL 2 Data Descriptor for kernel */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GLDT_SEL 3 LDT Descriptor */ { (int) ldt, /* segment base address */ sizeof(ldt)-1, /* length - all address space */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GTGATE_SEL 4 Null Descriptor - Placeholder */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, 
/* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPANIC_SEL 5 Panic Tss Descriptor */ { (int) &dblfault_tss, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GPROC0_SEL 6 Proc 0 Tss Descriptor */ { (int) kstack, /* segment base address */ sizeof(struct i386tss)-1,/* length - all address space */ SDT_SYS386TSS, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GUSERLDT_SEL 7 User LDT Descriptor per process */ { (int) ldt, /* segment base address */ (512 * sizeof(union descriptor)-1), /* length */ SDT_SYSLDT, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* unused - default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* GAPMCODE32_SEL 8 APM BIOS 32-bit interface (32bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMCODE16_SEL 9 APM BIOS 32-bit interface (16bit Code) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMERA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* GAPMDATA_SEL 10 APM BIOS 32-bit interface (Data) */ { 0, /* segment base address (overwritten by APM) */ 0xfffff, /* length */ SDT_MEMRWA, /* segment type */ 0, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; static struct soft_segment_descriptor ldt_segs[] = { /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Null Descriptor - overwritten by call gate */ { 0x0, /* segment base address */ 0x0, /* length - all address space */ 0, /* segment type */ 0, /* segment descriptor priority level */ 0, /* segment descriptor present */ 0, 0, 0, /* default 32 vs 16 bit size */ 0 /* limit granularity (byte/page units)*/ }, /* Code Descriptor for user */ { 0x0, /* segment base address */ 0xfffff, /* length - all address space */ SDT_MEMERA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, /* Data Descriptor for user */ { 0x0, /* 
segment base address */ 0xfffff, /* length - all address space */ SDT_MEMRWA, /* segment type */ SEL_UPL, /* segment descriptor priority level */ 1, /* segment descriptor present */ 0, 0, 1, /* default 32 vs 16 bit size */ 1 /* limit granularity (byte/page units)*/ }, }; void setidt(idx, func, typ, dpl, selec) int idx; inthand_t *func; int typ; int dpl; int selec; { struct gate_descriptor *ip = idt + idx; ip->gd_looffset = (int)func; ip->gd_selector = selec; ip->gd_stkcpy = 0; ip->gd_xx = 0; ip->gd_type = typ; ip->gd_dpl = dpl; ip->gd_p = 1; ip->gd_hioffset = ((int)func)>>16 ; } #define IDTVEC(name) __CONCAT(X,name) extern inthand_t IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl), IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), IDTVEC(syscall); #if defined(COMPAT_LINUX) || defined(LINUX) extern inthand_t IDTVEC(linux_syscall); #endif void sdtossd(sd, ssd) struct segment_descriptor *sd; struct soft_segment_descriptor *ssd; { ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase; ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit; ssd->ssd_type = sd->sd_type; ssd->ssd_dpl = sd->sd_dpl; ssd->ssd_p = sd->sd_p; ssd->ssd_def32 = sd->sd_def32; ssd->ssd_gran = sd->sd_gran; } void init386(first) int first; { int x; unsigned biosbasemem, biosextmem; struct gate_descriptor *gdp; int gsel_tss; /* table descriptors - used to load tables by microp */ struct region_descriptor r_gdt, r_idt; int pagesinbase, pagesinext; int target_page, pa_indx; proc0.p_addr = proc0paddr; /* * Initialize the console before we print anything out. */ cninit(); /* * make gdt memory segments, the code segment goes up to end of the * page with etext in it, the data segment goes to the end of * the address space */ /* * XXX text protection is temporarily (?) disabled. The limit was - * i386_btop(i386_round_page(etext)) - 1. + * i386_btop(round_page(etext)) - 1. */ gdt_segs[GCODE_SEL].ssd_limit = i386_btop(0) - 1; gdt_segs[GDATA_SEL].ssd_limit = i386_btop(0) - 1; for (x = 0; x < NGDT; x++) ssdtosd(&gdt_segs[x], &gdt[x].sd); /* make ldt memory segments */ /* * The data segment limit must not cover the user area because we * don't want the user area to be writable in copyout() etc. (page * level protection is lost in kernel mode on 386's). Also, we * don't want the user area to be writable directly (page level * protection of the user area is not available on 486's with * CR0_WP set, because there is no user-read/kernel-write mode). * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. And it * should be spelled ...MAX_USER... */ #define VM_END_USER_RW_ADDRESS VM_MAXUSER_ADDRESS /* * The code segment limit has to cover the user area until we move * the signal trampoline out of the user area. This is safe because * the code segment cannot be written to directly. */ #define VM_END_USER_R_ADDRESS (VM_END_USER_RW_ADDRESS + UPAGES * NBPG) ldt_segs[LUCODE_SEL].ssd_limit = i386_btop(VM_END_USER_R_ADDRESS) - 1; ldt_segs[LUDATA_SEL].ssd_limit = i386_btop(VM_END_USER_RW_ADDRESS) - 1; /* Note. 
eventually want private ldts per process */ for (x = 0; x < NLDT; x++) ssdtosd(&ldt_segs[x], &ldt[x].sd); /* exceptions */ for (x = 0; x < NIDT; x++) setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(0, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(1, &IDTVEC(dbg), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(2, &IDTVEC(nmi), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(3, &IDTVEC(bpt), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(4, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(5, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(6, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(7, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(8, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL)); setidt(9, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(10, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(11, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(12, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(13, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(14, &IDTVEC(page), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(15, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(16, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); setidt(17, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); #if defined(COMPAT_LINUX) || defined(LINUX) setidt(0x80, &IDTVEC(linux_syscall), SDT_SYS386TGT, SEL_UPL, GSEL(GCODE_SEL, SEL_KPL)); #endif #include "isa.h" #if NISA >0 isa_defaultirq(); #endif rand_initialize(); r_gdt.rd_limit = sizeof(gdt) - 1; r_gdt.rd_base = (int) gdt; lgdt(&r_gdt); r_idt.rd_limit = sizeof(idt) - 1; r_idt.rd_base = (int) idt; lidt(&r_idt); _default_ldt = GSEL(GLDT_SEL, SEL_KPL); lldt(_default_ldt); currentldt = _default_ldt; #ifdef DDB kdb_init(); if (boothowto & RB_KDB) Debugger("Boot flags requested debugger"); #endif /* Use BIOS values stored in RTC CMOS RAM, since probing * breaks certain 386 AT relics. */ biosbasemem = rtcin(RTC_BASELO)+ (rtcin(RTC_BASEHI)<<8); biosextmem = rtcin(RTC_EXTLO)+ (rtcin(RTC_EXTHI)<<8); /* * Print a warning if the official BIOS interface disagrees * with the hackish interface used above. Eventually only * the official interface should be used. */ if (bootinfo.bi_memsizes_valid) { if (bootinfo.bi_basemem != biosbasemem) printf("BIOS basemem (%ldK) != RTC basemem (%dK)\n", bootinfo.bi_basemem, biosbasemem); if (bootinfo.bi_extmem != biosextmem) printf("BIOS extmem (%ldK) != RTC extmem (%dK)\n", bootinfo.bi_extmem, biosextmem); } /* * If BIOS tells us that it has more than 640k in the basemem, * don't believe it - set it to 640k. */ if (biosbasemem > 640) biosbasemem = 640; /* * Some 386 machines might give us a bogus number for extended * mem. If this happens, stop now. */ #ifndef LARGEMEM if (biosextmem > 65536) { panic("extended memory beyond limit of 64MB"); /* NOTREACHED */ } #endif pagesinbase = biosbasemem * 1024 / NBPG; pagesinext = biosextmem * 1024 / NBPG; /* * Special hack for chipsets that still remap the 384k hole when * there's 16MB of memory - this really confuses people that * are trying to use bus mastering ISA controllers with the * "16MB limit"; they only have 16MB, but the remapping puts * them beyond the limit. 
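The RTC sizing above is plain CMOS byte assembly; restated with the unit conversions that follow from it (the sizes are kilobytes, two bytes each, low byte first):

	/*
	 * CMOS memory sizes are 16-bit values in K, split across two
	 * one-byte registers.  640K base reads back as 0x80/0x02 and
	 * 15360K extended as 0x00/0x3c.  basemem is then clamped to
	 * 640K, since some BIOSes claim more than can exist below the
	 * ISA hole.
	 */
	biosbasemem = rtcin(RTC_BASELO) + (rtcin(RTC_BASEHI) << 8);
	biosextmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
	pagesinbase = biosbasemem * 1024 / NBPG;	/* K -> pages */
	pagesinext = biosextmem * 1024 / NBPG;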
*/ /* * If extended memory is between 15-16MB (16-17MB phys address range), * chop it to 15MB. */ if ((pagesinext > 3840) && (pagesinext < 4096)) pagesinext = 3840; /* * Maxmem isn't the "maximum memory", it's one larger than the * highest page of of the physical address space. It */ Maxmem = pagesinext + 0x100000/PAGE_SIZE; #ifdef MAXMEM Maxmem = MAXMEM/4; #endif /* call pmap initialization to make new kernel address space */ pmap_bootstrap (first, 0); /* * Size up each available chunk of physical memory. */ /* * We currently don't bother testing base memory. * XXX ...but we probably should. */ pa_indx = 0; badpages = 0; if (pagesinbase > 1) { phys_avail[pa_indx++] = PAGE_SIZE; /* skip first page of memory */ phys_avail[pa_indx] = ptoa(pagesinbase);/* memory up to the ISA hole */ physmem = pagesinbase - 1; } else { /* point at first chunk end */ pa_indx++; } for (target_page = avail_start; target_page < ptoa(Maxmem); target_page += PAGE_SIZE) { int tmp, page_bad = FALSE; /* * map page into kernel: valid, read/write, non-cacheable */ *(int *)CMAP1 = PG_V | PG_KW | PG_N | target_page; pmap_update(); tmp = *(int *)CADDR1; /* * Test for alternating 1's and 0's */ *(volatile int *)CADDR1 = 0xaaaaaaaa; if (*(volatile int *)CADDR1 != 0xaaaaaaaa) { page_bad = TRUE; } /* * Test for alternating 0's and 1's */ *(volatile int *)CADDR1 = 0x55555555; if (*(volatile int *)CADDR1 != 0x55555555) { page_bad = TRUE; } /* * Test for all 1's */ *(volatile int *)CADDR1 = 0xffffffff; if (*(volatile int *)CADDR1 != 0xffffffff) { page_bad = TRUE; } /* * Test for all 0's */ *(volatile int *)CADDR1 = 0x0; if (*(volatile int *)CADDR1 != 0x0) { /* * test of page failed */ page_bad = TRUE; } /* * Restore original value. */ *(int *)CADDR1 = tmp; /* * Adjust array of valid/good pages. */ if (page_bad == FALSE) { /* * If this good page is a continuation of the * previous set of good pages, then just increase * the end pointer. Otherwise start a new chunk. * Note that "end" points one higher than end, * making the range >= start and < end. */ if (phys_avail[pa_indx] == target_page) { phys_avail[pa_indx] += PAGE_SIZE; } else { pa_indx++; if (pa_indx == PHYS_AVAIL_ARRAY_END) { printf("Too many holes in the physical address space, giving up\n"); pa_indx--; break; } phys_avail[pa_indx++] = target_page; /* start */ phys_avail[pa_indx] = target_page + PAGE_SIZE; /* end */ } physmem++; } else { badpages++; page_bad = FALSE; } } *(int *)CMAP1 = 0; pmap_update(); /* * XXX * The last chunk must contain at least one page plus the message * buffer to avoid complicating other code (message buffer address * calculation, etc.). */ while (phys_avail[pa_indx - 1] + PAGE_SIZE + round_page(sizeof(struct msgbuf)) >= phys_avail[pa_indx]) { physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]); phys_avail[pa_indx--] = 0; phys_avail[pa_indx--] = 0; } Maxmem = atop(phys_avail[pa_indx]); /* Trim off space for the message buffer. */ phys_avail[pa_indx] -= round_page(sizeof(struct msgbuf)); avail_end = phys_avail[pa_indx]; /* now running on new page tables, configured,and u/iom is accessible */ /* make a initial tss so microp can get interrupt stack on syscall! 
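The array built by the page-test loop above is consumed throughout the pmap code; its walking convention, as a sketch:

	/*
	 * phys_avail[] layout: even slots hold a chunk start, odd slots
	 * hold the address one byte past the chunk's last page, and a
	 * zero end slot terminates the list (hence the array keeping a
	 * spare pair, per PHYS_AVAIL_ARRAY_END).  Every page inside a
	 * chunk passed the pattern test.
	 */
	int i;
	vm_offset_t pa;

	for (i = 0; phys_avail[i + 1] != 0; i += 2)
		for (pa = phys_avail[i]; pa < phys_avail[i + 1];
		    pa += PAGE_SIZE)
			;	/* pa is a tested, usable physical page */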
*/ proc0.p_addr->u_pcb.pcb_tss.tss_esp0 = (int) kstack + UPAGES*NBPG; proc0.p_addr->u_pcb.pcb_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL) ; gsel_tss = GSEL(GPROC0_SEL, SEL_KPL); dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 = dblfault_tss.tss_esp2 = (int) &dblfault_stack[sizeof(dblfault_stack)]; dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 = dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cr3 = IdlePTD; dblfault_tss.tss_eip = (int) dblfault_handler; dblfault_tss.tss_eflags = PSL_KERNEL; dblfault_tss.tss_ds = dblfault_tss.tss_es = dblfault_tss.tss_fs = dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL); dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL); dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL); ((struct i386tss *)gdt_segs[GPROC0_SEL].ssd_base)->tss_ioopt = (sizeof(struct i386tss))<<16; ltr(gsel_tss); /* make a call gate to reenter kernel with */ gdp = &ldt[LSYS5CALLS_SEL].gd; x = (int) &IDTVEC(syscall); gdp->gd_looffset = x++; gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL); gdp->gd_stkcpy = 1; gdp->gd_type = SDT_SYS386CGT; gdp->gd_dpl = SEL_UPL; gdp->gd_p = 1; gdp->gd_hioffset = ((int) &IDTVEC(syscall)) >>16; /* transfer to user mode */ _ucodesel = LSEL(LUCODE_SEL, SEL_UPL); _udatasel = LSEL(LUDATA_SEL, SEL_UPL); /* setup proc 0's pcb */ bcopy(&sigcode, proc0.p_addr->u_pcb.pcb_sigc, szsigcode); proc0.p_addr->u_pcb.pcb_flags = 0; proc0.p_addr->u_pcb.pcb_ptd = IdlePTD; } /* * The registers are in the frame; the frame is in the user area of * the process in question; when the process is active, the registers * are in "the kernel stack"; when it's not, they're still there, but * things get flipped around. So, since p->p_md.md_regs is the whole address * of the register set, take its offset from the kernel stack, and * index into the user block. Don't you just *love* virtual memory? * (I'm starting to think seymour is right...) */ #define TF_REGP(p) ((struct trapframe *) \ ((char *)(p)->p_addr \ + ((char *)(p)->p_md.md_regs - kstack))) int ptrace_set_pc(p, addr) struct proc *p; unsigned int addr; { TF_REGP(p)->tf_eip = addr; return (0); } int ptrace_single_step(p) struct proc *p; { TF_REGP(p)->tf_eflags |= PSL_T; return (0); } int ptrace_write_u(p, off, data) struct proc *p; vm_offset_t off; int data; { struct trapframe frame_copy; vm_offset_t min; struct trapframe *tp; /* * Privileged kernel state is scattered all over the user area. * Only allow write access to parts of regs and to fpregs. 
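ptrace_write_u() below accepts an offset only if a whole int fits inside one of two windows of the user area; the check, factored into a helper for clarity (off_in_window is illustrative, not in the source):

	/*
	 * Writable windows in struct user: the trapframe (general
	 * registers, further vetted by EFLAGS_SECURE/CS_SECURE) and
	 * pcb_savefpu (FP state).  Everything else in the U area is
	 * privileged kernel state and gets EFAULT.
	 */
	static int
	off_in_window(off, start, len)
		vm_offset_t off, start, len;
	{
		return (off >= start && off <= start + len - sizeof(int));
	}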
*/ min = (char *)p->p_md.md_regs - kstack; if (off >= min && off <= min + sizeof(struct trapframe) - sizeof(int)) { tp = TF_REGP(p); frame_copy = *tp; *(int *)((char *)&frame_copy + (off - min)) = data; if (!EFLAGS_SECURE(frame_copy.tf_eflags, tp->tf_eflags) || !CS_SECURE(frame_copy.tf_cs)) return (EINVAL); *(int*)((char *)p->p_addr + off) = data; return (0); } min = offsetof(struct user, u_pcb) + offsetof(struct pcb, pcb_savefpu); if (off >= min && off <= min + sizeof(struct save87) - sizeof(int)) { *(int*)((char *)p->p_addr + off) = data; return (0); } return (EFAULT); } int fill_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); regs->r_es = tp->tf_es; regs->r_ds = tp->tf_ds; regs->r_edi = tp->tf_edi; regs->r_esi = tp->tf_esi; regs->r_ebp = tp->tf_ebp; regs->r_ebx = tp->tf_ebx; regs->r_edx = tp->tf_edx; regs->r_ecx = tp->tf_ecx; regs->r_eax = tp->tf_eax; regs->r_eip = tp->tf_eip; regs->r_cs = tp->tf_cs; regs->r_eflags = tp->tf_eflags; regs->r_esp = tp->tf_esp; regs->r_ss = tp->tf_ss; return (0); } int set_regs(p, regs) struct proc *p; struct reg *regs; { struct trapframe *tp; tp = TF_REGP(p); if (!EFLAGS_SECURE(regs->r_eflags, tp->tf_eflags) || !CS_SECURE(regs->r_cs)) return (EINVAL); tp->tf_es = regs->r_es; tp->tf_ds = regs->r_ds; tp->tf_edi = regs->r_edi; tp->tf_esi = regs->r_esi; tp->tf_ebp = regs->r_ebp; tp->tf_ebx = regs->r_ebx; tp->tf_edx = regs->r_edx; tp->tf_ecx = regs->r_ecx; tp->tf_eax = regs->r_eax; tp->tf_eip = regs->r_eip; tp->tf_cs = regs->r_cs; tp->tf_eflags = regs->r_eflags; tp->tf_esp = regs->r_esp; tp->tf_ss = regs->r_ss; return (0); } #ifndef DDB void Debugger(const char *msg) { printf("Debugger(\"%s\") called.\n", msg); } #endif /* no DDB */ #include #define b_cylin b_resid /* * Determine the size of the transfer, and make sure it is * within the boundaries of the partition. Adjust transfer * if needed, and signal errors or early completion. */ int bounds_check_with_label(struct buf *bp, struct disklabel *lp, int wlabel) { struct partition *p = lp->d_partitions + dkpart(bp->b_dev); int labelsect = lp->d_partitions[0].p_offset; int maxsz = p->p_size, sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT; /* overwriting disk label ? */ /* XXX should also protect bootstrap in first 8K */ if (bp->b_blkno + p->p_offset <= LABELSECTOR + labelsect && #if LABELSECTOR != 0 bp->b_blkno + p->p_offset + sz > LABELSECTOR + labelsect && #endif (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #if defined(DOSBBSECTOR) && defined(notyet) /* overwriting master boot record? */ if (bp->b_blkno + p->p_offset <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && wlabel == 0) { bp->b_error = EROFS; goto bad; } #endif /* beyond partition? 
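For orientation, a summary of the outcomes of the partition clipping that follows, including the sector rounding it starts from (a restatement of the code, not a change to it):

	/*
	 * sz = transfer length in DEV_BSIZE sectors, rounded up:
	 *	sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT;
	 *
	 * outcomes:
	 *	label (or boot block) write without wlabel -> EROFS, -1
	 *	b_blkno exactly at partition end -> b_resid = b_bcount,
	 *	    return 0 (EOF)
	 *	transfer partly past the end -> b_bcount truncated to the
	 *	    part that fits, continue
	 *	b_blkno < 0 or nothing fits -> EINVAL, B_ERROR, -1
	 *	otherwise -> b_pblkno/b_cylin set for disksort, return 1
	 */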
*/ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) { /* if exactly at end of disk, return an EOF */ if (bp->b_blkno == maxsz) { bp->b_resid = bp->b_bcount; return(0); } /* or truncate if part of it fits */ sz = maxsz - bp->b_blkno; if (sz <= 0) { bp->b_error = EINVAL; goto bad; } bp->b_bcount = sz << DEV_BSHIFT; } /* calculate cylinder for disksort to order transfers with */ bp->b_pblkno = bp->b_blkno + p->p_offset; bp->b_cylin = bp->b_pblkno / lp->d_secpercyl; return(1); bad: bp->b_flags |= B_ERROR; return(-1); } int disk_externalize(int drive, struct sysctl_req *req) { return SYSCTL_OUT(req, &drive, sizeof drive); } Index: head/sys/i386/i386/pmap.c =================================================================== --- head/sys/i386/i386/pmap.c (revision 13489) +++ head/sys/i386/i386/pmap.c (revision 13490) @@ -1,1954 +1,2167 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department and William Jolitz of UUNET Technologies Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 - * $Id: pmap.c,v 1.71 1995/12/17 07:19:15 bde Exp $ + * $Id: pmap.c,v 1.72 1995/12/22 18:21:26 bde Exp $ */ /* * Derived from hp300 version by Mike Hibler, this version by William * Jolitz uses a recursive map [a pde points to the page directory] to * map the page tables using the pagetables themselves. This is done to * reduce the impact on kernel virtual memory for lots of sparse address * space, and to reduce the cost of memory to each process. * * Derived from: hp300/@(#)pmap.c 7.1 (Berkeley) 12/5/90 */ /* * Major modifications by John S. 
Dyson primarily to support * pageable page tables, eliminating pmap_attributes, * discontiguous memory pages, and using more efficient string * instructions. Jan 13, 1994. Further modifications on Mar 2, 1994, * general clean-up and efficiency mods. */ /* * Manages physical address maps. * * In addition to hardware address maps, this * module is called upon to provide software-use-only * maps which may or may not be stored in the same * form as hardware maps. These pseudo-maps are * used to store intermediate results from copy * operations to and from address spaces. * * Since the information managed by this module is * also stored by the logical address mapping module, * this module may throw away valid virtual-to-physical * mappings at almost any time. However, invalidations * of virtual-to-physical mappings must be done as * requested. * * In order to cope with hardware architectures which * make virtual-to-physical map invalidates expensive, * this module may delay invalidate or reduced protection * operations until such time as they are actually * necessary. This module is given full information as * to which processors are currently using which maps, * and to when physical maps must be made correct. */ #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#define PMAP_KEEP_PDIRS + +static void init_pv_entries __P((int)); + /* * Get PDEs and PTEs for user/kernel address space */ #define pmap_pde(m, v) (&((m)->pm_pdir[((vm_offset_t)(v) >> PD_SHIFT)&1023])) #define pdir_pde(m, v) (m[((vm_offset_t)(v) >> PD_SHIFT)&1023]) #define pmap_pte_pa(pte) (*(int *)(pte) & PG_FRAME) #define pmap_pde_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_w(pte) ((*(int *)pte & PG_W) != 0) #define pmap_pte_m(pte) ((*(int *)pte & PG_M) != 0) #define pmap_pte_u(pte) ((*(int *)pte & PG_U) != 0) #define pmap_pte_v(pte) ((*(int *)pte & PG_V) != 0) #define pmap_pte_set_w(pte, v) ((v)?(*(int *)pte |= PG_W):(*(int *)pte &= ~PG_W)) #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v))) /* * Given a map and a machine independent protection code, * convert to a vax protection code. */ #define pte_prot(m, p) (protection_codes[p]) static int protection_codes[8]; static struct pmap kernel_pmap_store; pmap_t kernel_pmap; vm_offset_t avail_start; /* PA of first available physical page */ vm_offset_t avail_end; /* PA of last available physical page */ vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */ vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */ static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? 
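The macros above lean on the recursive map named in the header comment; a sketch of the address arithmetic, assuming the usual i386 10/10/12 split (PD_SHIFT = 22):

	/*
	 * Slot PTDPTDI of the page directory points at the directory
	 * page itself.  Walked through that slot, the MMU serves up all
	 * page tables as one linear PTE array at PTmap and the
	 * directory as PTD, which is what vtopte() and pmap_pde()
	 * compute; APTmap/APTDpde are the same trick through an
	 * alternate slot, used when examining a foreign pmap.
	 */
	pt_entry_t *pte = PTmap + i386_btop(va);	   /* top 20 bits */
	pd_entry_t *pde = &PTD[(vm_offset_t)va >> PD_SHIFT]; /* top 10 */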
*/ static vm_offset_t vm_first_phys; static int nkpt; extern vm_offset_t clean_sva, clean_eva; extern int cpu_class; /* * All those kernel PT submaps that BSD is so fond of */ pt_entry_t *CMAP1; static pt_entry_t *CMAP2, *ptmmap; static pv_entry_t pv_table; caddr_t CADDR1, ptvmmap; static caddr_t CADDR2; static pt_entry_t *msgbufmap; struct msgbuf *msgbufp; static void free_pv_entry __P((pv_entry_t pv)); static pt_entry_t * get_pt_entry __P((pmap_t pmap)); static pv_entry_t get_pv_entry __P((void)); static void i386_protection_init __P((void)); -static void init_pv_entries __P((int npg)); static void pmap_alloc_pv_entry __P((void)); static void pmap_changebit __P((vm_offset_t pa, int bit, boolean_t setem)); static void pmap_enter_quick __P((pmap_t pmap, vm_offset_t va, vm_offset_t pa)); static int pmap_is_managed __P((vm_offset_t pa)); static void pmap_remove_all __P((vm_offset_t pa)); static void pmap_remove_entry __P((struct pmap *pmap, pv_entry_t pv, vm_offset_t va)); static vm_page_t pmap_pte_vm_page __P((pmap_t pmap, vm_offset_t pt)); static boolean_t pmap_testbit __P((vm_offset_t pa, int bit)); /* + * The below are finer grained pmap_update routines. These eliminate + * the gratuitious tlb flushes on non-i386 architectures. + */ +static __inline void +pmap_update_1pg( vm_offset_t va) { +#if defined(I386_CPU) + if (cpuclass == CPUCLASS_I386) + pmap_update(); + else +#endif + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va)); +} + +static __inline void +pmap_update_2pg( vm_offset_t va1, vm_offset_t va2) { +#if defined(I386_CPU) + if (cpuclass == CPUCLASS_I386) { + pmap_update(); + } else +#endif + { + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va1)); + __asm __volatile(".byte 0xf,0x1,0x38": :"a" (va2)); + } +} + +/* * Routine: pmap_pte * Function: * Extract the page table entry associated * with the given map/virtual_address pair. * [ what about induced faults -wfj] */ -inline pt_entry_t * __pure +__inline pt_entry_t * __pure pmap_pte(pmap, va) register pmap_t pmap; vm_offset_t va; { if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) return ((pt_entry_t *) vtopte(va)); /* otherwise, we are alternate address space */ else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return ((pt_entry_t *) avtopte(va)); } } return (0); } /* * Routine: pmap_extract * Function: * Extract the physical page address associated * with the given map/virtual_address pair. */ vm_offset_t pmap_extract(pmap, va) register pmap_t pmap; vm_offset_t va; { vm_offset_t pa; if (pmap && *pmap_pde(pmap, va)) { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if ((pmap == kernel_pmap) || (frame == ((int) PTDpde & PG_FRAME))) { pa = *(int *) vtopte(va); /* otherwise, we are alternate address space */ } else { if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } pa = *(int *) avtopte(va); } return ((pa & PG_FRAME) | (va & ~PG_FRAME)); } return 0; } /* * determine if a page is managed (memory vs. 
device) */ -static inline int +static __inline int pmap_is_managed(pa) vm_offset_t pa; { int i; if (!pmap_initialized) return 0; for (i = 0; phys_avail[i + 1]; i += 2) { if (pa >= phys_avail[i] && pa < phys_avail[i + 1]) return 1; } return 0; } /* * find the vm_page_t of a pte (only) given va of pte and pmap */ static __inline vm_page_t pmap_pte_vm_page(pmap, pt) pmap_t pmap; vm_offset_t pt; { vm_page_t m; - pt = i386_trunc_page(pt); - pt = (pt - UPT_MIN_ADDRESS) / NBPG; + pt = trunc_page(pt); + pt = (pt - UPT_MIN_ADDRESS) / PAGE_SIZE; pt = ((vm_offset_t) pmap->pm_pdir[pt]) & PG_FRAME; m = PHYS_TO_VM_PAGE(pt); return m; } /* * Wire a page table page */ __inline void pmap_use_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); vm_page_hold(pmap_pte_vm_page(pmap, pt)); } /* * Unwire a page table page */ -inline void +__inline void pmap_unuse_pt(pmap, va) pmap_t pmap; vm_offset_t va; { vm_offset_t pt; vm_page_t m; if ((va >= UPT_MIN_ADDRESS) || !pmap_initialized) return; pt = (vm_offset_t) vtopte(va); m = pmap_pte_vm_page(pmap, pt); vm_page_unhold(m); if (pmap != kernel_pmap && (m->hold_count == 0) && (m->wire_count == 0) && (va < KPT_MIN_ADDRESS)) { +/* + * We don't free page-table-pages anymore because it can have a negative + * impact on perf at times. Now we just deactivate, and it'll get cleaned + * up if needed... Also, if the page ends up getting used, it will fault + * back into the process address space and be reactivated. + */ +#ifdef PMAP_FREE_OLD_PTES pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); vm_page_free(m); +#else + m->dirty = 0; + vm_page_deactivate(m); +#endif } } /* [ macro again?, should I force kstack into user map here? -wfj ] */ void pmap_activate(pmap, pcbp) register pmap_t pmap; struct pcb *pcbp; { PMAP_ACTIVATE(pmap, pcbp); } /* * Bootstrap the system enough to run with virtual memory. * * On the i386 this is called after mapping has already been enabled * and just syncs the pmap module with what has already been done. * [We can't call it easily with mapping off since the kernel is not * mapped with PA == VA, hence we would have to relocate every address * from the linked base (virtual) address "KERNBASE" to the actual * (physical) address starting relative to 0] */ void pmap_bootstrap(firstaddr, loadaddr) vm_offset_t firstaddr; vm_offset_t loadaddr; { vm_offset_t va; pt_entry_t *pte; avail_start = firstaddr; /* - * XXX The calculation of virtual_avail is wrong. It's NKPT*NBPG too + * XXX The calculation of virtual_avail is wrong. It's NKPT*PAGE_SIZE too * large. It should instead be correctly calculated in locore.s and * not based on 'first' (which is a physical address, not a virtual * address, for the start of unused physical memory). The kernel * page tables are NOT double mapped and thus should not be included * in this calculation. */ virtual_avail = (vm_offset_t) KERNBASE + firstaddr; virtual_end = VM_MAX_KERNEL_ADDRESS; /* * Initialize protection array. */ i386_protection_init(); /* * The kernel's pmap is statically allocated so we don't have to use * pmap_create, which is unlikely to work correctly at this part of * the boot sequence (XXX and which no longer exists). */ kernel_pmap = &kernel_pmap_store; kernel_pmap->pm_pdir = (pd_entry_t *) (KERNBASE + IdlePTD); kernel_pmap->pm_count = 1; nkpt = NKPT; /* * Reserve some special page table entries/VA space for temporary * mapping of pages. 
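A note on the pmap_update_1pg()/pmap_update_2pg() helpers added above: INVLPG (opcode 0F 01 /7) only exists from the i486 on, and assemblers of the day had no mnemonic for it, hence the .byte encoding with the address in %eax. Their i386 fallback is spelled cpuclass/CPUCLASS_I386, while this file declares extern int cpu_class and the rest of the tree uses CPUCLASS_386; a sketch using those existing spellings (invalidate_1pg is an illustrative name):

	static __inline void
	invalidate_1pg(vm_offset_t va)
	{
	#if defined(I386_CPU)
		if (cpu_class == CPUCLASS_386)
			pmap_update();	/* no invlpg on a 386: flush all */
		else
	#endif
			/* invlpg (%eax): 0F 01 /7, mod=00, r/m=eax */
			__asm __volatile(".byte 0xf,0x1,0x38" : : "a" (va));
	}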
*/ #define SYSMAP(c, p, v, n) \ - v = (c)va; va += ((n)*NBPG); p = pte; pte += (n); + v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n); va = virtual_avail; pte = pmap_pte(kernel_pmap, va); /* * CMAP1/CMAP2 are used for zeroing and copying pages. */ SYSMAP(caddr_t, CMAP1, CADDR1, 1) SYSMAP(caddr_t, CMAP2, CADDR2, 1) /* * ptmmap is used for reading arbitrary physical pages via /dev/mem. */ SYSMAP(caddr_t, ptmmap, ptvmmap, 1) /* * msgbufmap is used to map the system message buffer. */ SYSMAP(struct msgbuf *, msgbufmap, msgbufp, 1) virtual_avail = va; *(int *) CMAP1 = *(int *) CMAP2 = *(int *) PTD = 0; pmap_update(); } /* * Initialize the pmap module. * Called by vm_init, to initialize any structures that the pmap * system needs to map virtual memory. * pmap_init has been enhanced to support in a fairly consistant * way, discontiguous physical memory. */ void pmap_init(phys_start, phys_end) vm_offset_t phys_start, phys_end; { vm_offset_t addr; vm_size_t npg, s; int i; /* * calculate the number of pv_entries needed */ vm_first_phys = phys_avail[0]; for (i = 0; phys_avail[i + 1]; i += 2); - npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / NBPG; + npg = (phys_avail[(i - 2) + 1] - vm_first_phys) / PAGE_SIZE; /* * Allocate memory for random pmap data structures. Includes the * pv_head_table. */ s = (vm_size_t) (sizeof(struct pv_entry) * npg); - s = i386_round_page(s); + s = round_page(s); addr = (vm_offset_t) kmem_alloc(kernel_map, s); pv_table = (pv_entry_t) addr; /* * init the pv free list */ init_pv_entries(npg); /* * Now it is safe to enable pv_table recording. */ pmap_initialized = TRUE; } /* * Used to map a range of physical addresses into kernel * virtual address space. * * For now, VM is already on, we only need to map the * specified memory. */ vm_offset_t pmap_map(virt, start, end, prot) vm_offset_t virt; vm_offset_t start; vm_offset_t end; int prot; { while (start < end) { pmap_enter(kernel_pmap, virt, start, prot, FALSE); virt += PAGE_SIZE; start += PAGE_SIZE; } return (virt); } +#ifdef PMAP_KEEP_PDIRS +int nfreepdir; +caddr_t *pdirlist; +#define NFREEPDIR 3 + +static void * +pmap_getpdir() { + caddr_t *pdir; + if (pdirlist) { + --nfreepdir; + pdir = pdirlist; + pdirlist = (caddr_t *) *pdir; + bzero( (caddr_t) pdir, PAGE_SIZE); + } else { + pdir = (caddr_t *) kmem_alloc(kernel_map, PAGE_SIZE); + } + + return (void *) pdir; +} + +static void +pmap_freepdir(void *pdir) { + if (nfreepdir > NFREEPDIR) { + kmem_free(kernel_map, (vm_offset_t) pdir, PAGE_SIZE); + } else { + * (caddr_t *) pdir = (caddr_t) pdirlist; + pdirlist = (caddr_t *) pdir; + ++nfreepdir; + } +} +#endif + /* * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ void pmap_pinit(pmap) register struct pmap *pmap; { /* * No need to allocate page table space yet but we do need a valid * page directory table. 
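pmap_pinit() is about to draw its directory from the PMAP_KEEP_PDIRS cache above. The cache is an intrusive freelist (the first word of each cached page doubles as the link), so it costs no storage beyond the pages themselves: pmap_getpdir() pops and re-zeroes, pmap_freepdir() pushes until more than NFREEPDIR pages are held and then falls back to kmem_free(). A minimal usage sketch:

	pd_entry_t *pdir;

	pdir = (pd_entry_t *)pmap_getpdir();	/* zeroed, maybe recycled */
	/* ... install kernel PDEs, use as a page directory ... */
	pmap_freepdir((void *)pdir);	/* cached for the next pmap_pinit() */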
*/ + +#ifdef PMAP_KEEP_PDIRS + pmap->pm_pdir = pmap_getpdir(); +#else pmap->pm_pdir = (pd_entry_t *) kmem_alloc(kernel_map, PAGE_SIZE); +#endif /* wire in kernel global address entries */ bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE); /* install self-referential address mapping entry */ *(int *) (pmap->pm_pdir + PTDPTDI) = ((int) pmap_kextract((vm_offset_t) pmap->pm_pdir)) | PG_V | PG_KW; pmap->pm_count = 1; } /* * grow the number of kernel page table entries, if needed */ static vm_page_t nkpg; vm_offset_t kernel_vm_end; void pmap_growkernel(vm_offset_t addr) { struct proc *p; struct pmap *pmap; int s; s = splhigh(); if (kernel_vm_end == 0) { kernel_vm_end = KERNBASE; nkpt = 0; while (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); ++nkpt; } } - addr = (addr + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); while (kernel_vm_end < addr) { if (pdir_pde(PTD, kernel_vm_end)) { - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); continue; } ++nkpt; if (!nkpg) { nkpg = vm_page_alloc(kernel_object, 0, VM_ALLOC_SYSTEM); if (!nkpg) panic("pmap_growkernel: no memory to grow kernel"); vm_page_wire(nkpg); vm_page_remove(nkpg); pmap_zero_page(VM_PAGE_TO_PHYS(nkpg)); } pdir_pde(PTD, kernel_vm_end) = (pd_entry_t) (VM_PAGE_TO_PHYS(nkpg) | PG_V | PG_KW); nkpg = NULL; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (p->p_vmspace) { pmap = &p->p_vmspace->vm_pmap; *pmap_pde(pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); } } *pmap_pde(kernel_pmap, kernel_vm_end) = pdir_pde(PTD, kernel_vm_end); - kernel_vm_end = (kernel_vm_end + NBPG * NPTEPG) & ~(NBPG * NPTEPG - 1); + kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1); } splx(s); } /* * Retire the given physical map from service. * Should only be called if the map contains * no valid mappings. */ void pmap_destroy(pmap) register pmap_t pmap; { int count; if (pmap == NULL) return; count = --pmap->pm_count; if (count == 0) { pmap_release(pmap); free((caddr_t) pmap, M_VMPMAP); } } /* * Release any resources held by the given physical map. * Called when a pmap initialized by pmap_pinit is being released. * Should only be called if the map contains no valid mappings. */ void pmap_release(pmap) register struct pmap *pmap; { +#ifdef PMAP_KEEP_PDIRS + pmap_freepdir( (void *)pmap->pm_pdir); +#else kmem_free(kernel_map, (vm_offset_t) pmap->pm_pdir, PAGE_SIZE); +#endif } /* * Add a reference to the specified pmap. */ void pmap_reference(pmap) pmap_t pmap; { if (pmap != NULL) { pmap->pm_count++; } } -#define PV_FREELIST_MIN ((NBPG / sizeof (struct pv_entry)) / 2) +#define PV_FREELIST_MIN ((PAGE_SIZE / sizeof (struct pv_entry)) / 2) /* * Data for the pv entry allocation mechanism */ static int pv_freelistcnt; static pv_entry_t pv_freelist; static vm_offset_t pvva; static int npvvapg; /* * free the pv_entry back to the free list */ -inline static void +static __inline void free_pv_entry(pv) pv_entry_t pv; { if (!pv) return; ++pv_freelistcnt; pv->pv_next = pv_freelist; pv_freelist = pv; } /* * get a new pv_entry, allocating a block from the system * when needed. * the memory allocation is performed bypassing the malloc code * because of the possibility of allocations at interrupt time. 
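Because a pv_entry can be needed at interrupt time, this allocator never touches malloc(): KVA is reserved up front and backed one page at a time. The reservation arithmetic, restated from init_pv_entries() and pmap_alloc_pv_entry():

	/*
	 * Reserve pageable KVA for up to PVSPERPAGE (64) entries per
	 * managed physical page; each refill wires one page with
	 * VM_ALLOC_INTERRUPT, maps it at pvva via pmap_kenter(), and
	 * carves it into PAGE_SIZE / sizeof(struct pv_entry) free-list
	 * entries.  The PV_FREELIST_MIN float keeps the refill itself
	 * from running the list dry.
	 */
	npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) +
	    PAGE_SIZE - 1) / PAGE_SIZE;
	pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE);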
*/ -static inline pv_entry_t +static __inline pv_entry_t get_pv_entry() { pv_entry_t tmp; /* * get more pv_entry pages if needed */ if (pv_freelistcnt < PV_FREELIST_MIN || pv_freelist == 0) { pmap_alloc_pv_entry(); } /* * get a pv_entry off of the free list */ --pv_freelistcnt; tmp = pv_freelist; pv_freelist = tmp->pv_next; return tmp; } /* * this *strange* allocation routine *statistically* eliminates the * *possibility* of a malloc failure (*FATAL*) for a pv_entry_t data structure. * also -- this code is MUCH MUCH faster than the malloc equiv... */ static void pmap_alloc_pv_entry() { /* * do we have any pre-allocated map-pages left? */ if (npvvapg) { vm_page_t m; /* * we do this to keep recursion away */ pv_freelistcnt += PV_FREELIST_MIN; /* * allocate a physical page out of the vm system */ m = vm_page_alloc(kernel_object, OFF_TO_IDX(pvva - vm_map_min(kernel_map)), VM_ALLOC_INTERRUPT); if (m) { int newentries; int i; pv_entry_t entry; - newentries = (NBPG / sizeof(struct pv_entry)); + newentries = (PAGE_SIZE / sizeof(struct pv_entry)); /* * wire the page */ vm_page_wire(m); m->flags &= ~PG_BUSY; /* * let the kernel see it */ pmap_kenter(pvva, VM_PAGE_TO_PHYS(m)); entry = (pv_entry_t) pvva; /* * update the allocation pointers */ - pvva += NBPG; + pvva += PAGE_SIZE; --npvvapg; /* * free the entries into the free list */ for (i = 0; i < newentries; i++) { free_pv_entry(entry); entry++; } } pv_freelistcnt -= PV_FREELIST_MIN; } if (!pv_freelist) panic("get_pv_entry: cannot get a pv_entry_t"); } /* * init the pv_entry allocation system */ #define PVSPERPAGE 64 void init_pv_entries(npg) int npg; { /* * allocate enough kvm space for PVSPERPAGE entries per page (lots) * kvm space is fairly cheap, be generous!!! (the system can panic if * this is too small.) */ - npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + NBPG - 1) / NBPG; - pvva = kmem_alloc_pageable(kernel_map, npvvapg * NBPG); + npvvapg = ((npg * PVSPERPAGE) * sizeof(struct pv_entry) + + PAGE_SIZE - 1) / PAGE_SIZE; + pvva = kmem_alloc_pageable(kernel_map, npvvapg * PAGE_SIZE); /* * get the first batch of entries */ free_pv_entry(get_pv_entry()); } static pt_entry_t * get_pt_entry(pmap) pmap_t pmap; { vm_offset_t frame = (int) pmap->pm_pdir[PTDPTDI] & PG_FRAME; /* are we current address space or kernel? */ if (pmap == kernel_pmap || frame == ((int) PTDpde & PG_FRAME)) { return PTmap; } /* otherwise, we are alternate address space */ if (frame != ((int) APTDpde & PG_FRAME)) { APTDpde = pmap->pm_pdir[PTDPTDI]; pmap_update(); } return APTmap; } /* * If it is the first entry on the list, it is actually * in the header and we must copy the following entry up * to the header. Otherwise we must search the list for * the entry. In either case we free the now unused entry. */ static void pmap_remove_entry(pmap, pv, va) struct pmap *pmap; pv_entry_t pv; vm_offset_t va; { pv_entry_t npv; int s; s = splhigh(); if (pmap == pv->pv_pmap && va == pv->pv_va) { npv = pv->pv_next; if (npv) { *pv = *npv; free_pv_entry(npv); } else { pv->pv_pmap = NULL; } } else { - for (npv = pv->pv_next; npv; npv = npv->pv_next) { + for (npv = pv->pv_next; npv; (pv = npv, npv = pv->pv_next)) { if (pmap == npv->pv_pmap && va == npv->pv_va) { break; } - pv = npv; } if (npv) { pv->pv_next = npv->pv_next; free_pv_entry(npv); } } splx(s); } /* * Remove the given range of addresses from the specified map. * * It is assumed that the start and end are properly * rounded to the page size. 
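get_pv_entry() and pmap_alloc_pv_entry() above implement an allocator usable where malloc() is not: entries come off a private free list, the list is refilled a whole vm page at a time, and a low-water reserve (PV_FREELIST_MIN) plus the temporary bump of pv_freelistcnt keeps a refill from re-triggering itself. A user-space model of the pattern -- it mallocs a block where the kernel carves up a vm page, it omits the recursion-guard bump since nothing here can recurse, and every name is illustrative:

#include <stdlib.h>

struct entry { struct entry *next; };

#define BLOCK_ENTRIES 128
#define FREELIST_MIN  (BLOCK_ENTRIES / 2)

static struct entry *freehead;
static int freecnt;

static void
entry_free(struct entry *e)
{
        e->next = freehead;
        freehead = e;
        ++freecnt;
}

static void
refill(void)
{
        struct entry *block = malloc(sizeof(struct entry) * BLOCK_ENTRIES);
        int i;

        if (block == NULL)
                return;                 /* kernel panics if the list stays empty */
        for (i = 0; i < BLOCK_ENTRIES; i++)
                entry_free(&block[i]);
}

static struct entry *
entry_get(void)
{
        struct entry *e;

        if (freecnt < FREELIST_MIN || freehead == NULL)
                refill();               /* refill before we run dry */
        if (freehead == NULL)
                abort();                /* matches the kernel's panic() */
        e = freehead;
        freehead = e->next;
        --freecnt;
        return (e);
}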
*/ void pmap_remove(pmap, sva, eva) struct pmap *pmap; register vm_offset_t sva; register vm_offset_t eva; { register pt_entry_t *ptp, *ptq; vm_offset_t pa; register pv_entry_t pv; vm_offset_t va; pt_entry_t oldpte; if (pmap == NULL) return; ptp = get_pt_entry(pmap); /* * special handling of removing one page. a very * common operation and easy to short circuit some * code. */ - if ((sva + NBPG) == eva) { + if ((sva + PAGE_SIZE) == eva) { if (*pmap_pde(pmap, sva) == 0) return; ptq = ptp + i386_btop(sva); if (!*ptq) return; /* * Update statistics */ if (pmap_pte_w(ptq)) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; pa = pmap_pte_pa(ptq); oldpte = *ptq; *ptq = 0; if (pmap_is_managed(pa)) { if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, sva); } pmap_unuse_pt(pmap, sva); - pmap_update(); + pmap_update_1pg(sva); return; } sva = i386_btop(sva); eva = i386_btop(eva); while (sva < eva) { /* * Weed out invalid mappings. Note: we assume that the page * directory table is always allocated, and in kernel virtual. */ if (*pmap_pde(pmap, i386_ptob(sva)) == 0) { /* We can race ahead here, straight to next pde.. */ sva = ((sva + NPTEPG) & ~(NPTEPG - 1)); continue; } ptq = ptp + sva; /* * search for page table entries, use string operations that * are much faster than explicitly scanning when page tables * are not fully populated. */ if (*ptq == 0) { vm_offset_t pdnxt = ((sva + NPTEPG) & ~(NPTEPG - 1)); vm_offset_t nscan = pdnxt - sva; int found = 0; if ((nscan + sva) > eva) nscan = eva - sva; asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(ptq), "=a"(found) : "c"(nscan), "0"(ptq) : "cx"); if (!found) { sva = pdnxt; continue; } ptq -= 1; sva = ptq - ptp; } /* * Update statistics */ oldpte = *ptq; if (((int) oldpte) & PG_W) pmap->pm_stats.wired_count--; pmap->pm_stats.resident_count--; /* * Invalidate the PTEs. XXX: should cluster them up and * invalidate as many as possible at once. */ *ptq = 0; va = i386_ptob(sva); /* * Remove from the PV table (raise IPL since we may be called * at interrupt time). */ pa = ((int) oldpte) & PG_FRAME; if (!pmap_is_managed(pa)) { - pmap_unuse_pt(pmap, va); + pmap_unuse_pt(pmap, (vm_offset_t) va); ++sva; continue; } if ((int) oldpte & PG_M) { - if (sva < USRSTACK + (UPAGES * NBPG) || + if (sva < USRSTACK + (UPAGES * PAGE_SIZE) || (sva >= KERNBASE && (sva < clean_sva || sva >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } pv = pa_to_pvh(pa); pmap_remove_entry(pmap, pv, va); pmap_unuse_pt(pmap, va); ++sva; } pmap_update(); } /* * Routine: pmap_remove_all * Function: * Removes this physical page from * all physical maps in which it resides. * Reflects back modify bits to the pager. * * Notes: * Original versions of this routine were very * inefficient because they iteratively called * pmap_remove (slow...) */ static void pmap_remove_all(pa) vm_offset_t pa; { - register pv_entry_t pv, npv; + register pv_entry_t pv, opv, npv; register pt_entry_t *pte, *ptp; vm_offset_t va; struct pmap *pmap; vm_page_t m; int s; int anyvalid = 0; /* * Not one of ours */ /* * XXX this makes pmap_page_protect(NONE) illegal for non-managed * pages! 
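The inline assembly in pmap_remove() above ("xorl %eax,%eax; cld; repe; scasl; ...") is just a fast scan for the first nonzero 32-bit PTE in a run; because scasl advances %edi past the element it tests, the caller backs up with ptq -= 1 on a hit. The same computation in plain C, as a readable stand-in:

/* Scan nscan 32-bit words for the first nonzero one.  On a hit the
 * pointer is left one past the hit, mimicking scasl, and 1 is
 * returned; otherwise the pointer is left one past the run. */
static int
scan_nonzero(unsigned **pp, unsigned nscan)
{
        unsigned *p = *pp;

        while (nscan != 0 && *p == 0) {
                ++p;
                --nscan;
        }
        if (nscan == 0) {
                *pp = p;        /* ran off the end: nothing found */
                return (0);
        }
        *pp = p + 1;            /* like scasl: pointer is one past the hit */
        return (1);
}

The string instruction wins when page tables are sparsely populated, which is the common case the comment in pmap_remove() calls out.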
*/ if (!pmap_is_managed(pa)) return; - pa = i386_trunc_page(pa); - pv = pa_to_pvh(pa); - m = PHYS_TO_VM_PAGE(pa); + pa = trunc_page(pa); + opv = pa_to_pvh(pa); + if (opv->pv_pmap == NULL) + return; + m = PHYS_TO_VM_PAGE(pa); s = splhigh(); - while (pv->pv_pmap != NULL) { - pmap = pv->pv_pmap; + pv = opv; + while (pv && ((pmap = pv->pv_pmap) != NULL)) { ptp = get_pt_entry(pmap); va = pv->pv_va; pte = ptp + i386_btop(va); if (pmap_pte_w(pte)) pmap->pm_stats.wired_count--; if (*pte) { pmap->pm_stats.resident_count--; - anyvalid++; + if (curproc != pageproc) + anyvalid++; /* * Update the vm_page_t clean and reference bits. */ if ((int) *pte & PG_M) { - if (va < USRSTACK + (UPAGES * NBPG) || + if (va < USRSTACK + (UPAGES * PAGE_SIZE) || (va >= KERNBASE && (va < clean_sva || va >= clean_eva))) { PHYS_TO_VM_PAGE(pa)->dirty |= VM_PAGE_BITS_ALL; } } *pte = 0; pmap_unuse_pt(pmap, va); } + pv = pv->pv_next; + } + + for (pv = opv->pv_next; pv; pv = npv) { npv = pv->pv_next; - if (npv) { - *pv = *npv; - free_pv_entry(npv); - } else { - pv->pv_pmap = NULL; - } + free_pv_entry(pv); } + + opv->pv_pmap = NULL; + opv->pv_next = NULL; + splx(s); if (anyvalid) pmap_update(); } /* * Set the physical protection on the * specified range of this map as requested. */ void pmap_protect(pmap, sva, eva, prot) register pmap_t pmap; vm_offset_t sva, eva; vm_prot_t prot; { register pt_entry_t *pte; register vm_offset_t va; int i386prot; register pt_entry_t *ptp; int evap = i386_btop(eva); int anyvalid = 0; if (pmap == NULL) return; if ((prot & VM_PROT_READ) == VM_PROT_NONE) { pmap_remove(pmap, sva, eva); return; } if (prot & VM_PROT_WRITE) return; ptp = get_pt_entry(pmap); va = sva; while (va < eva) { int found = 0; int svap; vm_offset_t nscan; /* * Page table page is not allocated. Skip it, we don't want to * force allocation of unnecessary PTE pages just to set the * protection. */ if (!*pmap_pde(pmap, va)) { /* XXX: avoid address wrap around */ nextpde: if (va >= i386_trunc_pdr((vm_offset_t) - 1)) break; va = i386_round_pdr(va + PAGE_SIZE); continue; } pte = ptp + i386_btop(va); if (*pte == 0) { /* * scan for a non-empty pte */ svap = pte - ptp; nscan = ((svap + NPTEPG) & ~(NPTEPG - 1)) - svap; if (nscan + svap > evap) nscan = evap - svap; found = 0; if (nscan) asm("xorl %%eax,%%eax;cld;repe;scasl;jz 1f;incl %%eax;1:;" : "=D"(pte), "=a"(found) : "c"(nscan), "0"(pte) : "cx"); if (!found) goto nextpde; pte -= 1; svap = pte - ptp; va = i386_ptob(svap); } anyvalid++; i386prot = pte_prot(pmap, prot); if (va < UPT_MAX_ADDRESS) { i386prot |= PG_u; if (va >= UPT_MIN_ADDRESS) i386prot |= PG_RW; } pmap_pte_set_prot(pte, i386prot); va += PAGE_SIZE; } if (anyvalid) pmap_update(); } /* * Insert the given physical page (p) at * the specified virtual address (v) in the * target physical map with the protection requested. * * If specified, the page will be wired down, meaning * that the related pte can not be reclaimed. * * NB: This is the only routine which MAY NOT lazy-evaluate * or lose information. That is, this routine must actually * insert this page into the given map NOW.
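The pv_table manipulated throughout this file uses a space-saving convention worth spelling out: the first mapping of a physical page lives directly in the page's pv_table slot (the "header"), additional mappings are chained behind it with get_pv_entry(), and an unmapped page is marked by pv_pmap == NULL in the header. That is why pmap_remove_entry() earlier promotes the first chained entry into the header (*pv = *npv) instead of unlinking it, and why the reworked pmap_remove_all() above finishes by resetting pv_pmap and pv_next; pmap_enter() below inserts into the same structure. A user-space model of removal under this convention (illustrative names, with malloc/free standing in for the pv_entry allocator):

#include <stdlib.h>

struct pv {
        struct pv *next;
        void      *pmap;        /* NULL in the header means "no mappings" */
        unsigned   va;
};

static void
pv_remove(struct pv *head, void *pmap, unsigned va)
{
        struct pv *pv, *npv;

        if (head->pmap == pmap && head->va == va) {
                npv = head->next;
                if (npv != NULL) {
                        *head = *npv;   /* promote first chained entry */
                        free(npv);
                } else {
                        head->pmap = NULL;  /* header now marks "empty" */
                }
                return;
        }
        for (pv = head; (npv = pv->next) != NULL; pv = npv) {
                if (npv->pmap == pmap && npv->va == va) {
                        pv->next = npv->next;   /* ordinary unlink */
                        free(npv);
                        return;
                }
        }
}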
*/ void pmap_enter(pmap, va, pa, prot, wired) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; vm_prot_t prot; boolean_t wired; { register pt_entry_t *pte; register pt_entry_t npte; vm_offset_t opa; int ptevalid = 0; if (pmap == NULL) return; - va = i386_trunc_page(va); - pa = i386_trunc_page(pa); + va = trunc_page(va); + pa = trunc_page(pa); if (va > VM_MAX_KERNEL_ADDRESS) panic("pmap_enter: toobig"); /* * Page Directory table entry not valid, we need a new PT page */ if (*pmap_pde(pmap, va) == 0) { printf("kernel page directory invalid pdir=%p, va=0x%lx\n", pmap->pm_pdir[PTDPTDI], va); panic("invalid kernel page directory"); } pte = pmap_pte(pmap, va); opa = pmap_pte_pa(pte); /* * Mapping has not changed, must be protection or wiring change. */ if (opa == pa) { /* * Wiring change, just update stats. We don't worry about * wiring PT pages as they remain resident as long as there * are valid mappings in them. Hence, if a user page is wired, * the PT page will be also. */ if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; goto validate; } /* * Mapping has changed, invalidate old range and fall through to * handle validating new mapping. */ if (opa) { pmap_remove(pmap, va, va + PAGE_SIZE); } /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ if (pmap_is_managed(pa)) { register pv_entry_t pv, npv; int s; pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_va = va; pv->pv_pmap = pmap; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place * this entry after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); } /* * Increment counters */ pmap->pm_stats.resident_count++; if (wired) pmap->pm_stats.wired_count++; validate: /* * Now validate mapping with desired protection/wiring. */ npte = (pt_entry_t) ((int) (pa | pte_prot(pmap, prot) | PG_V)); /* * When forking (copy-on-write, etc): A process will turn off write * permissions for any of its writable pages. If the data (object) is * only referred to by one process, the processes map is modified * directly as opposed to using the object manipulation routine. When * using pmap_protect, the modified bits are not kept in the vm_page_t * data structure. Therefore, when using pmap_enter in vm_fault to * bring back writability of a page, there has been no memory of the * modified or referenced bits except at the pte level. this clause * supports the carryover of the modified and used (referenced) bits. */ if (pa == opa) (int) npte |= (int) *pte & (PG_M | PG_U); if (wired) (int) npte |= PG_W; if (va < UPT_MIN_ADDRESS) (int) npte |= PG_u; else if (va < UPT_MAX_ADDRESS) (int) npte |= PG_u | PG_RW; if (*pte != npte) { if (*pte) ptevalid++; *pte = npte; } if (ptevalid) { - pmap_update(); + pmap_update_1pg(va); } else { pmap_use_pt(pmap, va); } } /* * Add a list of wired pages to the kva * this routine is only used for temporary * kernel mappings that do not need to have * page modification or references recorded. * Note that old mappings are simply written * over. The page *must* be wired. 
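A central change in this diff is replacing whole-TLB flushes (pmap_update(), a CR3 reload) with single-page invalidations, pmap_update_1pg() and pmap_update_2pg(), at sites like pmap_enter() above and pmap_qenter() below that touch only one or two mappings. Their definitions are not part of this diff; on a 486 or later they presumably come down to the invlpg instruction, roughly as in this sketch (an assumption about code that lives in a pmap header, not the committed text -- a plain 386 lacks invlpg and would still need the full reload):

/* Plausible shape of the single-page TLB invalidation primitives.
 * invlpg drops the TLB entry for one virtual page instead of
 * flushing the entire TLB the way a CR3 reload does. */
static __inline void
pmap_update_1pg(vm_offset_t va)
{
        __asm __volatile("invlpg %0" : : "m" (*(char *)va) : "memory");
}

static __inline void
pmap_update_2pg(vm_offset_t va1, vm_offset_t va2)
{
        pmap_update_1pg(va1);
        pmap_update_1pg(va2);
}

The payoff is that unrelated hot TLB entries survive a single-page unmap, which matters most in the fast paths this commit touches.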
*/ void pmap_qenter(va, m, count) vm_offset_t va; vm_page_t *m; int count; { int i; int anyvalid = 0; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); - if (*pte) - anyvalid++; - *pte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + vm_offset_t tva = va + i * PAGE_SIZE; + pt_entry_t npte = (pt_entry_t) ((int) (VM_PAGE_TO_PHYS(m[i]) | PG_RW | PG_V)); + pte = vtopte(tva); + if (*pte && (*pte != npte)) + pmap_update_1pg(tva); + *pte = npte; } - if (anyvalid) - pmap_update(); } /* * this routine jerks page mappings from the * kernel -- it is meant only for temporary mappings. */ void pmap_qremove(va, count) vm_offset_t va; int count; { int i; register pt_entry_t *pte; for (i = 0; i < count; i++) { - pte = vtopte(va + i * NBPG); + vm_offset_t tva = va + i * PAGE_SIZE; + pte = vtopte(tva); *pte = 0; + pmap_update_1pg(tva); } - pmap_update(); } /* * add a wired page to the kva * note that in order for the mapping to take effect -- you * should do a pmap_update after doing the pmap_kenter... */ void pmap_kenter(va, pa) vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; int wasvalid = 0; pte = vtopte(va); if (*pte) wasvalid++; *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V)); if (wasvalid) - pmap_update(); + pmap_update_1pg(va); } /* * remove a page from the kernel pagetables */ void pmap_kremove(va) vm_offset_t va; { register pt_entry_t *pte; pte = vtopte(va); *pte = (pt_entry_t) 0; - pmap_update(); + pmap_update_1pg(va); } /* * this code makes some *MAJOR* assumptions: * 1. Current pmap & pmap exists. * 2. Not wired. * 3. Read access. * 4. No page table pages. * 5. Tlbflush is deferred to calling procedure. * 6. Page IS managed. * but is *MUCH* faster than pmap_enter... */ -static inline void +static __inline void pmap_enter_quick(pmap, va, pa) register pmap_t pmap; vm_offset_t va; register vm_offset_t pa; { register pt_entry_t *pte; register pv_entry_t pv, npv; int s; /* * Enter on the PV list if part of our managed memory Note that we * raise IPL while manipulating pv_table since pmap_enter can be * called at interrupt time. */ pte = vtopte(va); /* a fault on the page table might occur here */ if (*pte) { pmap_remove(pmap, va, va + PAGE_SIZE); } pv = pa_to_pvh(pa); s = splhigh(); /* * No entries yet, use header as the first entry */ if (pv->pv_pmap == NULL) { pv->pv_pmap = pmap; pv->pv_va = va; pv->pv_next = NULL; } /* * There is at least one other VA mapping this page. Place this entry * after the header. */ else { npv = get_pv_entry(); npv->pv_va = va; npv->pv_pmap = pmap; npv->pv_next = pv->pv_next; pv->pv_next = npv; } splx(s); /* * Increment counters */ pmap->pm_stats.resident_count++; /* * Now validate mapping with desired protection/wiring. */ *pte = (pt_entry_t) ((int) (pa | PG_V | PG_u)); pmap_use_pt(pmap, va); return; } -#define MAX_INIT_PT (512 * 4096) +#define MAX_INIT_PT (512) /* * pmap_object_init_pt preloads the ptes for a given object * into the specified pmap. This eliminates the blast of soft * faults on process startup and immediately after an mmap. 
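One detail of the pmap_qenter() change above deserves a note before the preload code: the old version counted any previously-valid slot and issued one global pmap_update(); the new version invalidates per page, and only when the slot held a different mapping, so re-entering an identical pte costs no flush at all. A tiny standalone model of that decision (plain C, illustrative names and pte values):

#include <stdio.h>

/* One pte slot per page; nonzero means a valid mapping. */
static unsigned slots[4] = { 0, 0x1007, 0x2007, 0x3007 };

static int
qenter_one(int i, unsigned npte)
{
        int flushed = 0;

        if (slots[i] != 0 && slots[i] != npte)
                flushed = 1;    /* kernel: pmap_update_1pg(tva) */
        slots[i] = npte;
        return (flushed);
}

int
main(void)
{
        printf("empty slot:     flush=%d\n", qenter_one(0, 0x4007));
        printf("same mapping:   flush=%d\n", qenter_one(1, 0x1007));
        printf("different map:  flush=%d\n", qenter_one(2, 0x4007));
        return (0);
}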
*/ void pmap_object_init_pt(pmap, addr, object, pindex, size) pmap_t pmap; vm_offset_t addr; vm_object_t object; vm_pindex_t pindex; vm_size_t size; { vm_offset_t tmpidx; int psize; vm_page_t p; int objpgs; - if (!pmap || ((size > MAX_INIT_PT) && - (object->resident_page_count > MAX_INIT_PT / PAGE_SIZE))) { + psize = (size >> PAGE_SHIFT); + + if (!pmap || ((psize > MAX_INIT_PT) && + (object->resident_page_count > MAX_INIT_PT))) { return; } - psize = (size >> PAGE_SHIFT); /* * if we are processing a major portion of the object, then scan the * entire thing. */ if (psize > (object->size >> 2)) { objpgs = psize; for (p = object->memq.tqh_first; ((objpgs > 0) && (p != NULL)); p = p->listq.tqe_next) { tmpidx = p->pindex; if (tmpidx < pindex) { continue; } tmpidx -= pindex; if (tmpidx >= psize) { continue; } - if (((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) && - ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && - (p->bmapped == 0) && + if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { - if (p->flags & PG_CACHE) + if (p->queue == PQ_CACHE) vm_page_deactivate(p); vm_page_hold(p); p->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr + (tmpidx << PAGE_SHIFT), VM_PAGE_TO_PHYS(p)); vm_page_unhold(p); } objpgs -= 1; } } else { /* * else lookup the pages one-by-one. */ for (tmpidx = 0; tmpidx < psize; tmpidx += 1) { p = vm_page_lookup(object, tmpidx + pindex); - if (p && - ((p->flags & (PG_ACTIVE | PG_INACTIVE | PG_CACHE)) != 0) && - (p->bmapped == 0) && - (p->busy == 0) && + if (p && (p->busy == 0) && ((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { - if (p->flags & PG_CACHE) + if (p->queue == PQ_CACHE) vm_page_deactivate(p); vm_page_hold(p); p->flags |= PG_MAPPED; pmap_enter_quick(pmap, addr + (tmpidx << PAGE_SHIFT), VM_PAGE_TO_PHYS(p)); vm_page_unhold(p); } } } } /* + * pmap_prefault provides a quick way of clustering + * pagefaults into a process's address space. It is a "cousin" + * of pmap_object_init_pt, except it runs at page fault time instead + * of mmap time.
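Two gates control pmap_object_init_pt() above, and both change in this diff: MAX_INIT_PT is now counted in pages (512 pages, i.e. 2 MB of 4 KB pages) rather than bytes, and the full-object scan is chosen once the request covers more than a quarter of the object. The decision logic as plain arithmetic, in a hedged standalone sketch (function and parameter names are made up):

#include <stdio.h>

#define PAGE_SHIFT  12
#define MAX_INIT_PT 512         /* pages, after this change */

static const char *
preload_plan(unsigned long size_bytes, unsigned long obj_pages,
    unsigned long resident)
{
        unsigned long psize = size_bytes >> PAGE_SHIFT;

        if (psize > MAX_INIT_PT && resident > MAX_INIT_PT)
                return ("skip preload");
        if (psize > obj_pages / 4)
                return ("scan object's resident list");
        return ("lookup pages one by one");
}

int
main(void)
{
        printf("%s\n", preload_plan(8ul << 20, 4096, 1024)); /* 8 MB mmap */
        printf("%s\n", preload_plan(64ul << 10, 4096, 1024)); /* 64 KB mmap */
        return (0);
}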
+ */ +#define PFBAK 2 +#define PFFOR 2 +#define PAGEORDER_SIZE (PFBAK+PFFOR) + +static int pmap_prefault_pageorder[] = { + -NBPG, NBPG, -2 * NBPG, 2 * NBPG +}; + +void +pmap_prefault(pmap, addra, entry, object) + pmap_t pmap; + vm_offset_t addra; + vm_map_entry_t entry; + vm_object_t object; +{ + int i; + vm_offset_t starta; + vm_offset_t addr; + vm_pindex_t pindex; + vm_page_t m; + int pageorder_index; + + if (entry->object.vm_object != object) + return; + + if (!curproc || (pmap != &curproc->p_vmspace->vm_pmap)) + return; + + starta = addra - PFBAK * PAGE_SIZE; + if (starta < entry->start) { + starta = entry->start; + } else if (starta > addra) { + starta = 0; + } + + for (i = 0; i < PAGEORDER_SIZE; i++) { + vm_object_t lobject; + pt_entry_t *pte; + + addr = addra + pmap_prefault_pageorder[i]; + if (addr < starta || addr >= entry->end) + continue; + + pte = vtopte(addr); + if (*pte) + continue; + + pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; + lobject = object; + for (m = vm_page_lookup(lobject, pindex); + (!m && (lobject->type == OBJT_DEFAULT) && (lobject->backing_object)); + lobject = lobject->backing_object) { + if (lobject->backing_object_offset & (PAGE_MASK-1)) + break; + pindex += (lobject->backing_object_offset >> PAGE_SHIFT); + m = vm_page_lookup(lobject->backing_object, pindex); + } + + /* + * give-up when a page is not in memory + */ + if (m == NULL) + break; + + if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && + (m->busy == 0) && + (m->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) { + + if (m->queue == PQ_CACHE) { + if (cnt.v_free_count + cnt.v_cache_count < + cnt.v_free_min) + break; + vm_page_deactivate(m); + } + vm_page_hold(m); + m->flags |= PG_MAPPED; + pmap_enter_quick(pmap, addr, VM_PAGE_TO_PHYS(m)); + vm_page_unhold(m); + } + } +} + +/* * Routine: pmap_change_wiring * Function: Change the wiring attribute for a map/virtual-address * pair. * In/out conditions: * The mapping must already exist in the pmap. */ void pmap_change_wiring(pmap, va, wired) register pmap_t pmap; vm_offset_t va; boolean_t wired; { register pt_entry_t *pte; if (pmap == NULL) return; pte = pmap_pte(pmap, va); if (wired && !pmap_pte_w(pte)) pmap->pm_stats.wired_count++; else if (!wired && pmap_pte_w(pte)) pmap->pm_stats.wired_count--; /* * Wiring is not a hardware characteristic so there is no need to * invalidate TLB. */ pmap_pte_set_w(pte, wired); - /* - * When unwiring, set the modified bit in the pte -- could have been - * changed by the kernel - */ - if (!wired) - (int) *pte |= PG_M; } /* * Copy the range specified by src_addr/len * from the source map to the range dst_addr/len * in the destination map. * * This routine is only advisory and need not do anything. */ void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr) pmap_t dst_pmap, src_pmap; vm_offset_t dst_addr; vm_size_t len; vm_offset_t src_addr; { } /* * Routine: pmap_kernel * Function: * Returns the physical map handle for the kernel. */ pmap_t pmap_kernel() { return (kernel_pmap); } /* * pmap_zero_page zeros the specified (machine independent) * page by mapping the page into virtual memory and using * bzero to clear its contents, one machine dependent page * at a time. 
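The new pmap_prefault() above probes a fixed neighborhood around the faulting address, nearest pages first and alternating behind/ahead (PFBAK and PFFOR are both 2), and gives up as soon as a candidate page is not already resident. A standalone demo of the probe order it generates (the fault address is made up):

#include <stdio.h>

#define PGSZ 4096

/* mirrors pmap_prefault_pageorder[]: -1, +1, -2, +2 pages */
static const long pageorder[] = { -PGSZ, PGSZ, -2 * PGSZ, 2 * PGSZ };

int
main(void)
{
        unsigned long addra = 0x40005000ul;     /* hypothetical fault va */
        size_t i;

        for (i = 0; i < sizeof(pageorder) / sizeof(pageorder[0]); i++)
                printf("probe %#lx\n", addra + pageorder[i]);
        return (0);
}

Stopping at the first non-resident page keeps the prefault from forcing I/O; it only maps what is already in memory.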
*/ void pmap_zero_page(phys) vm_offset_t phys; { if (*(int *) CMAP2) panic("pmap_zero_page: CMAP busy"); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(phys); - bzero(CADDR2, NBPG); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(phys); + bzero(CADDR2, PAGE_SIZE); *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_1pg((vm_offset_t) CADDR2); } /* * pmap_copy_page copies the specified (machine independent) * page by mapping the page into virtual memory and using * bcopy to copy the page, one machine dependent page at a * time. */ void pmap_copy_page(src, dst) vm_offset_t src; vm_offset_t dst; { if (*(int *) CMAP1 || *(int *) CMAP2) panic("pmap_copy_page: CMAP busy"); - *(int *) CMAP1 = PG_V | PG_KW | i386_trunc_page(src); - *(int *) CMAP2 = PG_V | PG_KW | i386_trunc_page(dst); + *(int *) CMAP1 = PG_V | PG_KW | trunc_page(src); + *(int *) CMAP2 = PG_V | PG_KW | trunc_page(dst); #if __GNUC__ > 1 - memcpy(CADDR2, CADDR1, NBPG); + memcpy(CADDR2, CADDR1, PAGE_SIZE); #else - bcopy(CADDR1, CADDR2, NBPG); + bcopy(CADDR1, CADDR2, PAGE_SIZE); #endif *(int *) CMAP1 = 0; *(int *) CMAP2 = 0; - pmap_update(); + pmap_update_2pg( (vm_offset_t) CADDR1, (vm_offset_t) CADDR2); } /* * Routine: pmap_pageable * Function: * Make the specified pages (by pmap, offset) * pageable (or not) as requested. * * A page which is not pageable may not take * a fault; therefore, its page table entry * must remain valid for the duration. * * This routine is merely advisory; pmap_enter * will specify that these pages are to be wired * down (or not) as appropriate. */ void pmap_pageable(pmap, sva, eva, pageable) pmap_t pmap; vm_offset_t sva, eva; boolean_t pageable; { } /* * this routine returns true if a physical page resides * in the given pmap. */ boolean_t pmap_page_exists(pmap, pa) pmap_t pmap; vm_offset_t pa; { register pv_entry_t pv; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { if (pv->pv_pmap == pmap) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * pmap_testbit tests bits in pte's * note that the testbit/changebit routines are inline, * and a lot of things compile-time evaluate. */ static __inline boolean_t pmap_testbit(pa, bit) register vm_offset_t pa; int bit; { register pv_entry_t pv; pt_entry_t *pte; int s; if (!pmap_is_managed(pa)) return FALSE; pv = pa_to_pvh(pa); s = splhigh(); /* * Not found, check current mappings returning immediately if found. */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { /* * if the bit being tested is the modified bit, then * mark UPAGES as always modified, and ptes as never * modified. 
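Returning for a moment to pmap_zero_page() and pmap_copy_page() above: they use a classic reserved-window trick. CADDR1/CADDR2 are kernel virtual pages set aside by SYSMAP earlier, and their ptes, CMAP1/CMAP2, are simply pointed at whatever physical page needs zeroing or copying; afterwards only those one or two virtual pages need TLB invalidation, which is why the new pmap_update_1pg()/pmap_update_2pg() calls fit so well here. A rough user-space analogue of the idea, remapping over a fixed reservation with mmap in place of a pte store (POSIX, a loose analogy rather than the kernel mechanism):

#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
        size_t pg = (size_t)sysconf(_SC_PAGESIZE);
        /* the reserved window (kernel: CADDR2, backed by pte CMAP2) */
        void *win = mmap(NULL, pg, PROT_NONE,
            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (win == MAP_FAILED)
                return (1);
        /* "point the window at a page", zero it, then "unmap" it */
        if (mmap(win, pg, PROT_READ | PROT_WRITE,
            MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == MAP_FAILED)
                return (1);
        memset(win, 0, pg);
        mprotect(win, pg, PROT_NONE);
        return (0);
}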
*/ - if (bit & PG_U) { + if (bit & (PG_U|PG_M)) { if ((pv->pv_va >= clean_sva) && (pv->pv_va < clean_eva)) { continue; } } - if (bit & PG_M) { - if (pv->pv_va >= USRSTACK) { - if (pv->pv_va >= clean_sva && pv->pv_va < clean_eva) { - continue; - } - if (pv->pv_va < USRSTACK + (UPAGES * NBPG)) { - splx(s); - return TRUE; - } else if (pv->pv_va < KERNBASE) { - splx(s); - return FALSE; - } - } - } if (!pv->pv_pmap) { printf("Null pmap (tb) at va: 0x%lx\n", pv->pv_va); continue; } pte = pmap_pte(pv->pv_pmap, pv->pv_va); if ((int) *pte & bit) { splx(s); return TRUE; } } } splx(s); return (FALSE); } /* * this routine is used to modify bits in ptes */ static __inline void pmap_changebit(pa, bit, setem) vm_offset_t pa; int bit; boolean_t setem; { register pv_entry_t pv; register pt_entry_t *pte, npte; vm_offset_t va; + int changed; int s; if (!pmap_is_managed(pa)) return; pv = pa_to_pvh(pa); s = splhigh(); /* * Loop over all current mappings setting/clearing as appropos If * setting RO do we need to clear the VAC? */ if (pv->pv_pmap != NULL) { for (; pv; pv = pv->pv_next) { va = pv->pv_va; /* * don't write protect pager mappings */ if (!setem && (bit == PG_RW)) { if (va >= clean_sva && va < clean_eva) continue; } if (!pv->pv_pmap) { printf("Null pmap (cb) at va: 0x%lx\n", va); continue; } pte = pmap_pte(pv->pv_pmap, va); - if (setem) + if (setem) { (int) npte = (int) *pte | bit; - else + } else { (int) npte = (int) *pte & ~bit; + } *pte = npte; } } splx(s); - pmap_update(); + if (curproc != pageproc) + pmap_update(); } /* * pmap_page_protect: * * Lower the permission for all mappings to a given page. */ void pmap_page_protect(phys, prot) vm_offset_t phys; vm_prot_t prot; { if ((prot & VM_PROT_WRITE) == 0) { if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) pmap_changebit(phys, PG_RW, FALSE); else pmap_remove_all(phys); } } vm_offset_t pmap_phys_address(ppn) int ppn; { return (i386_ptob(ppn)); } /* * pmap_is_referenced: * * Return whether or not the specified physical page was referenced * by any physical maps. */ boolean_t pmap_is_referenced(vm_offset_t pa) { return pmap_testbit((pa), PG_U); } /* * pmap_is_modified: * * Return whether or not the specified physical page was modified * in any physical maps. */ boolean_t pmap_is_modified(vm_offset_t pa) { return pmap_testbit((pa), PG_M); } /* * Clear the modify bits on the specified physical page. */ void pmap_clear_modify(vm_offset_t pa) { pmap_changebit((pa), PG_M, FALSE); } /* * pmap_clear_reference: * * Clear the reference bit on the specified physical page. */ void pmap_clear_reference(vm_offset_t pa) { pmap_changebit((pa), PG_U, FALSE); } /* * Miscellaneous support routines follow */ static void i386_protection_init() { register int *kp, prot; kp = protection_codes; for (prot = 0; prot < 8; prot++) { switch (prot) { case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: /* * Read access is also 0. There isn't any execute bit, * so just make it readable. */ case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: *kp++ = 0; break; case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: *kp++ = PG_RW; break; } } } /* * Map a set of physical memory pages into the kernel virtual * address space. Return a pointer to where it is mapped. This * routine is intended to be used for mapping device memory, * NOT real memory. 
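Stepping back to i386_protection_init() above: the switch looks long, but because the i386 pte has no execute bit, the eight VM_PROT_{READ,WRITE,EXECUTE} combinations collapse to a single question, "is write permitted?". The table it builds is equivalent to this little demo (the constants are the standard VM_PROT_* values and the i386 PG_RW pte bit; the rest is illustrative):

#include <stdio.h>

#define VM_PROT_READ    0x01
#define VM_PROT_WRITE   0x02
#define VM_PROT_EXECUTE 0x04
#define PG_RW           0x002   /* i386 pte writable bit */

int
main(void)
{
        int prot;

        /* same result as walking i386_protection_init()'s switch */
        for (prot = 0; prot < 8; prot++)
                printf("prot %d -> pte bits %#x\n", prot,
                    (prot & VM_PROT_WRITE) ? PG_RW : 0);
        return (0);
}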
The non-cacheable bits are set on each * mapped page. */ void * pmap_mapdev(pa, size) vm_offset_t pa; vm_size_t size; { vm_offset_t va, tmpva; pt_entry_t *pte; pa = trunc_page(pa); size = roundup(size, PAGE_SIZE); va = kmem_alloc_pageable(kernel_map, size); if (!va) panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); for (tmpva = va; size > 0;) { pte = vtopte(tmpva); *pte = (pt_entry_t) ((int) (pa | PG_RW | PG_V | PG_N)); size -= PAGE_SIZE; tmpva += PAGE_SIZE; pa += PAGE_SIZE; } pmap_update(); return ((void *) va); } + +#ifdef PMAP_DEBUG +pmap_pid_dump(int pid) { + pmap_t pmap; + struct proc *p; + int npte = 0; + int index; + for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { + if (p->p_pid != pid) + continue; + + if (p->p_vmspace) { + int i,j; + index = 0; + pmap = &p->p_vmspace->vm_pmap; + for(i=0;i<1024;i++) { + pd_entry_t *pde; + pt_entry_t *pte; + unsigned base = i << PD_SHIFT; + + pde = &pmap->pm_pdir[i]; + if (pde && pmap_pde_v(pde)) { + for(j=0;j<1024;j++) { + unsigned va = base + (j << PG_SHIFT); + if (va >= (vm_offset_t) VM_MIN_KERNEL_ADDRESS) { + if (index) { + index = 0; + printf("\n"); + } + return npte; + } + pte = pmap_pte( pmap, va); + if (pte && pmap_pte_v(pte)) { + vm_offset_t pa; + vm_page_t m; + pa = *(int *)pte; + m = PHYS_TO_VM_PAGE((pa & PG_FRAME)); + printf("va: 0x%x, pt: 0x%x, h: %d, w: %d, f: 0x%x", + va, pa, m->hold_count, m->wire_count, m->flags); + npte++; + index++; + if (index >= 2) { + index = 0; + printf("\n"); + } else { + printf(" "); + } + } + } + } + } + } + } + return npte; +} +#endif #ifdef DEBUG static void pads __P((pmap_t pm)); static void pmap_pvdump __P((vm_offset_t pa)); /* print address space of pmap*/ static void pads(pm) pmap_t pm; { unsigned va, i, j; pt_entry_t *ptep; if (pm == kernel_pmap) return; for (i = 0; i < 1024; i++) if (pm->pm_pdir[i]) for (j = 0; j < 1024; j++) { va = (i << PD_SHIFT) + (j << PG_SHIFT); if (pm == kernel_pmap && va < KERNBASE) continue; if (pm != kernel_pmap && va > UPT_MAX_ADDRESS) continue; ptep = pmap_pte(pm, va); if (pmap_pte_v(ptep)) printf("%x:%x ", va, *(int *) ptep); }; } static void pmap_pvdump(pa) vm_offset_t pa; { register pv_entry_t pv; printf("pa %x", pa); for (pv = pa_to_pvh(pa); pv; pv = pv->pv_next) { #ifdef used_to_be printf(" -> pmap %x, va %x, flags %x", pv->pv_pmap, pv->pv_va, pv->pv_flags); #endif printf(" -> pmap %x, va %x", pv->pv_pmap, pv->pv_va); pads(pv->pv_pmap); } printf(" "); } #endif Index: head/sys/i386/i386/trap.c =================================================================== --- head/sys/i386/i386/trap.c (revision 13489) +++ head/sys/i386/i386/trap.c (revision 13490) @@ -1,1061 +1,1062 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.69 1996/01/03 21:41:36 wollman Exp $ + * $Id: trap.c,v 1.70 1996/01/04 21:11:03 wollman Exp $ */ /* * 386 Trap and System call handling */ #include "opt_ktrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERFAIL_NMI # include # include #endif #include "isa.h" #include "npx.h" int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void linux_syscall __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int)); static void trap_fatal __P((struct trapframe *)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 27 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ }; static void userret __P((struct proc *p, struct trapframe *frame, u_quad_t oticks)); static inline void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig, s; while ((sig = CURSIG(p)) != 0) postsig(sig); p->p_priority = p->p_usrpri; if (want_resched) { /* * Since we are curproc, clock will normally just change * our priority 
without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ s = splclock(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); splx(s); while ((sig = CURSIG(p)) != 0) postsig(sig); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { u_quad_t ticks = p->p_sticks - oticks; if (ticks) { #ifdef PROFTIMER extern int profscale; addupc(frame->tf_eip, &p->p_stats->p_prof, ticks * profscale); #else addupc(frame->tf_eip, &p->p_stats->p_prof, ticks); #endif } } curpriority = p->p_priority; } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. */ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; #ifdef DEBUG u_long eva; #endif type = frame.tf_trapno; code = frame.tf_err; if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = p->p_sticks; p->p_md.md_regs = (int *)&frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; case T_ASTFLT: /* Allow process switch */ astoff(); cnt.v_soft++; if (p->p_flag & P_OWEUPC) { addupc(frame.tf_eip, &p->p_stats->p_prof, 1); p->p_flag &= ~P_OWEUPC; } goto out; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ case T_STKFLT: /* stack fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) return; if (i == 0) goto out; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV_TRAP; i = SIGFPE; break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI goto handle_powerfail; #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; panic("NMI indicates hardware failure"); #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF_TRAP; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_SUBRNG_TRAP; i = SIGFPE; break; case T_DNA: #if NNPX > 0 /* if a transparent fault (due to context switch "late") */ if (npxdna()) return; #endif /* NNPX > 0 */ if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) return; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); return; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ #define MAYBE_DORETI_FAULT(where, whereto) \ do { \ if (frame.tf_eip == (int)where) { \ frame.tf_eip = (int)whereto; \ return; \ } \ } while (0) if (intr_nesting_level == 0) { MAYBE_DORETI_FAULT(doreti_iret, doreti_iret_fault); MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault); MAYBE_DORETI_FAULT(doreti_popl_es, doreti_popl_es_fault); } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ return; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; return; } /* * Fall through. */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB if (kdb_trap (type, 0, &frame)) return; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif handle_powerfail: { static unsigned lastalert = 0; if(time.tv_sec - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time.tv_sec; } return; } #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... 
going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame); return; } trapsignal(p, i, ucode); #ifdef DEBUG eva = rcr2(); if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%x", eva); uprintf("\n"); } #endif out: userret(p, &frame, sticks); } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. */ static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t ptepg; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (curpcb == NULL || curpcb->pcb_onfault == NULL))) { trap_fatal(frame); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(kernel_map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. 
*/ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { vm_offset_t v; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) - (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); + (void) vm_fault(map, + trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Since we know that kernel virtual address addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame) struct trapframe *frame; { int code, type, eva; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; eva = rcr2(); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ? "protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace/trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } printf("interrupt mask = "); if ((cpl & net_imask) == net_imask) printf("net "); if ((cpl & tty_imask) == tty_imask) printf("tty "); if ((cpl & bio_imask) == bio_imask) printf("bio "); if (cpl == 0) printf("none"); printf("\n"); #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if (kdb_trap (type, 0, frame)) return; #endif if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). 
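Before the double-fault handler, one pattern in trap_pfault() above deserves a concrete analogue: when a kernel-mode fault cannot be resolved, the handler does not panic if curpcb->pcb_onfault is set -- it rewrites the saved %eip so the faulting copyin()/copyout() resumes at its registered error exit. In user space the same control structure is a signal handler plus sigsetjmp/siglongjmp (a loose analogy, not the kernel mechanism; deliberately dereferencing a bad address like this is formally undefined and used here only to trigger the handler):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf onfault;

static void
segv(int sig)
{
        (void)sig;
        siglongjmp(onfault, 1);   /* kernel: frame->tf_eip = pcb_onfault */
}

int
main(void)
{
        signal(SIGSEGV, segv);
        if (sigsetjmp(onfault, 1) == 0) {
                volatile char *p = (char *)0x10;   /* bad address */
                *p = 0;                            /* faults */
                puts("no fault");
        } else {
                puts("fault recovered");
        }
        return (0);
}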
* * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available unless * the machine was idle when the double fault occurred. The downside * of this is that "trace" in ddb won't work. */ void dblfault_handler() { struct pcb *pcb = curpcb; if (pcb != NULL) { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip); printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp); printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp); } panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va, v; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; ++p->p_lock; if ((caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { --p->p_lock; return (1); } } v = trunc_page(vtopte(va)); /* * wire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); /* * unwire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE); } --p->p_lock; if (rv != KERN_SUCCESS) return 1; return (0); } /* * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. */ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int args[8], rval[2]; u_int code; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("syscall"); p->p_md.md_regs = (int *)&frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; if ((i = callp->sy_narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif goto bad; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif rval[0] = 0; rval[1] = frame.tf_edx; error = (*callp->sy_call)(p, args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_edx = rval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes.
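This ERESTART convention, together with its Linux counterpart below (frame.tf_eip -= 2 for int 0x80), is the entire restart mechanism: rewind the saved user pc by the size of the trapping instruction, and the same system call re-executes when the process returns to user mode. The arithmetic, spelled out with a made-up address:

#include <stdio.h>

int
main(void)
{
        unsigned eip = 0x08048123;      /* saved pc, after the trap insn */

        printf("lcall gate restart: %#x\n", eip - 7); /* lcall is 7 bytes */
        printf("int 0x80 restart:   %#x\n", eip - 2); /* int imm8 is 2 bytes */
        return (0);
}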
*/ frame.tf_eip -= 7; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #if defined(COMPAT_LINUX) || defined(LINUX) void linux_syscall(frame) struct trapframe frame; { struct proc *p = curproc; struct sysent *callp; u_quad_t sticks; int error; int rval[2]; u_int code; struct linux_syscall_args { int arg1; int arg2; int arg3; int arg4; int arg5; } args; args.arg1 = frame.tf_ebx; args.arg2 = frame.tf_ecx; args.arg3 = frame.tf_edx; args.arg4 = frame.tf_esi; args.arg5 = frame.tf_edi; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("linux syscall"); p->p_md.md_regs = (int *)&frame; code = frame.tf_eax; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, (int *)&args); #endif rval[0] = 0; error = (*callp->sy_call)(p, &args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* Reconstruct pc, subtract size of int 0x80 */ frame.tf_eip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = -error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #endif /* COMPAT_LINUX || LINUX */ Index: head/sys/i386/i386/vm_machdep.c =================================================================== --- head/sys/i386/i386/vm_machdep.c (revision 13489) +++ head/sys/i386/i386/vm_machdep.c (revision 13490) @@ -1,871 +1,871 @@ /*- * Copyright (c) 1982, 1986 The Regents of the University of California. * Copyright (c) 1989, 1990 William Jolitz * Copyright (c) 1994 John Dyson * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ - * $Id: vm_machdep.c,v 1.49 1995/12/14 08:31:01 phk Exp $ + * $Id: vm_machdep.c,v 1.50 1996/01/05 20:12:23 wollman Exp $ */ #include "npx.h" #include "opt_bounce.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vm_fault_quick __P((caddr_t v, int prot)); #ifdef BOUNCE_BUFFERS static vm_offset_t vm_bounce_kva __P((int size, int waitok)); static void vm_bounce_kva_free __P((vm_offset_t addr, vm_offset_t size, int now)); static vm_offset_t vm_bounce_page_find __P((int count)); static void vm_bounce_page_free __P((vm_offset_t pa, int count)); static volatile int kvasfreecnt; caddr_t bouncememory; int bouncepages; static int bpwait; static vm_offset_t *bouncepa; static int bmwait, bmfreeing; #define BITS_IN_UNSIGNED (8*sizeof(unsigned)) static int bounceallocarraysize; static unsigned *bounceallocarray; static int bouncefree; #define SIXTEENMEG (4096*4096) #define MAXBKVA 1024 int maxbkva = MAXBKVA*NBPG; /* special list that can be used at interrupt time for eventual kva free */ static struct kvasfree { vm_offset_t addr; vm_offset_t size; } kvaf[MAXBKVA]; /* * get bounce buffer pages (count physically contiguous) * (only 1 inplemented now) */ static vm_offset_t vm_bounce_page_find(count) int count; { int bit; int s,i; if (count != 1) panic("vm_bounce_page_find -- no support for > 1 page yet!!!"); s = splbio(); retry: for (i = 0; i < bounceallocarraysize; i++) { if (bounceallocarray[i] != 0xffffffff) { bit = ffs(~bounceallocarray[i]); if (bit) { bounceallocarray[i] |= 1 << (bit - 1) ; bouncefree -= count; splx(s); return bouncepa[(i * BITS_IN_UNSIGNED + (bit - 1))]; } } } bpwait = 1; tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0); goto retry; } static void vm_bounce_kva_free(addr, size, now) vm_offset_t addr; vm_offset_t size; int now; { int s = splbio(); kvaf[kvasfreecnt].addr = addr; kvaf[kvasfreecnt].size = size; ++kvasfreecnt; if( now) { /* * this will do wakeups */ vm_bounce_kva(0,0); } else { if (bmwait) { /* * if anyone is waiting on the bounce-map, then wakeup */ wakeup((caddr_t) io_map); bmwait = 0; } } splx(s); } /* * free count bounce buffer pages */ static void vm_bounce_page_free(pa, count) vm_offset_t pa; int count; { int allocindex; int index; int bit; if (count != 1) panic("vm_bounce_page_free -- no support for > 1 page yet!!!"); for(index=0;indexb_flags & B_BOUNCE) { printf("vm_bounce_alloc: called 
recursively???\n"); return; } if (bp->b_bufsize < bp->b_bcount) { printf( "vm_bounce_alloc: b_bufsize(0x%lx) < b_bcount(0x%lx) !!\n", bp->b_bufsize, bp->b_bcount); panic("vm_bounce_alloc"); } /* * This is not really necessary * if( bp->b_bufsize != bp->b_bcount) { * printf("size: %d, count: %d\n", bp->b_bufsize, bp->b_bcount); * } */ vastart = (vm_offset_t) bp->b_data; vaend = (vm_offset_t) bp->b_data + bp->b_bufsize; - vapstart = i386_trunc_page(vastart); - vapend = i386_round_page(vaend); + vapstart = trunc_page(vastart); + vapend = round_page(vaend); countvmpg = (vapend - vapstart) / NBPG; /* * if any page is above 16MB, then go into bounce-buffer mode */ va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) ++dobounceflag; if( pa == 0) panic("vm_bounce_alloc: Unmapped page"); va += NBPG; } if (dobounceflag == 0) return; if (bouncepages < dobounceflag) panic("Not enough bounce buffers!!!"); /* * allocate a replacement kva for b_addr */ kva = vm_bounce_kva(countvmpg*NBPG, 1); #if 0 printf("%s: vapstart: %x, vapend: %x, countvmpg: %d, kva: %x ", (bp->b_flags & B_READ) ? "read":"write", vapstart, vapend, countvmpg, kva); #endif va = vapstart; for (i = 0; i < countvmpg; i++) { pa = pmap_kextract(va); if (pa >= SIXTEENMEG) { /* * allocate a replacement page */ vm_offset_t bpa = vm_bounce_page_find(1); pmap_kenter(kva + (NBPG * i), bpa); #if 0 printf("r(%d): (%x,%x,%x) ", i, va, pa, bpa); #endif /* * if we are writing, the copy the data into the page */ if ((bp->b_flags & B_READ) == 0) { bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG); } } else { /* * use original page */ pmap_kenter(kva + (NBPG * i), pa); } va += NBPG; } /* * flag the buffer as being bounced */ bp->b_flags |= B_BOUNCE; /* * save the original buffer kva */ bp->b_savekva = bp->b_data; /* * put our new kva into the buffer (offset by original offset) */ bp->b_data = (caddr_t) (((vm_offset_t) kva) | ((vm_offset_t) bp->b_savekva & (NBPG - 1))); #if 0 printf("b_savekva: %x, newva: %x\n", bp->b_savekva, bp->b_data); #endif return; } /* * hook into biodone to free bounce buffer */ void vm_bounce_free(bp) struct buf *bp; { int i; vm_offset_t origkva, bouncekva, bouncekvaend; /* * if this isn't a bounced buffer, then just return */ if ((bp->b_flags & B_BOUNCE) == 0) return; /* * This check is not necessary * if (bp->b_bufsize != bp->b_bcount) { * printf("vm_bounce_free: b_bufsize=%d, b_bcount=%d\n", * bp->b_bufsize, bp->b_bcount); * } */ origkva = (vm_offset_t) bp->b_savekva; bouncekva = (vm_offset_t) bp->b_data; /* printf("free: %d ", bp->b_bufsize); */ /* * check every page in the kva space for b_addr */ for (i = 0; i < bp->b_bufsize; ) { vm_offset_t mybouncepa; vm_offset_t copycount; - copycount = i386_round_page(bouncekva + 1) - bouncekva; - mybouncepa = pmap_kextract(i386_trunc_page(bouncekva)); + copycount = round_page(bouncekva + 1) - bouncekva; + mybouncepa = pmap_kextract(trunc_page(bouncekva)); /* * if this is a bounced pa, then process as one */ - if ( mybouncepa != pmap_kextract( i386_trunc_page( origkva))) { + if ( mybouncepa != pmap_kextract( trunc_page( origkva))) { vm_offset_t tocopy = copycount; if (i + tocopy > bp->b_bufsize) tocopy = bp->b_bufsize - i; /* * if this is a read, then copy from bounce buffer into original buffer */ if (bp->b_flags & B_READ) bcopy((caddr_t) bouncekva, (caddr_t) origkva, tocopy); /* * free the bounce allocation */ /* printf("(kva: %x, pa: %x)", bouncekva, mybouncepa); */ vm_bounce_page_free(mybouncepa, 1); } origkva += copycount; bouncekva += 
copycount; i += copycount; } /* printf("\n"); */ /* * add the old kva into the "to free" list */ - bouncekva= i386_trunc_page((vm_offset_t) bp->b_data); - bouncekvaend= i386_round_page((vm_offset_t)bp->b_data + bp->b_bufsize); + bouncekva= trunc_page((vm_offset_t) bp->b_data); + bouncekvaend= round_page((vm_offset_t)bp->b_data + bp->b_bufsize); /* printf("freeva: %d\n", (bouncekvaend - bouncekva) / NBPG); */ vm_bounce_kva_free( bouncekva, (bouncekvaend - bouncekva), 0); bp->b_data = bp->b_savekva; bp->b_savekva = 0; bp->b_flags &= ~B_BOUNCE; return; } /* * init the bounce buffer system */ void vm_bounce_init() { int i; kvasfreecnt = 0; if (bouncepages == 0) return; bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED; bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT); if (!bounceallocarray) panic("Cannot allocate bounce resource array"); bouncepa = malloc(bouncepages * sizeof(vm_offset_t), M_TEMP, M_NOWAIT); if (!bouncepa) panic("Cannot allocate physical memory array"); for(i=0;i= SIXTEENMEG) panic("bounce memory out of range"); if( pa == 0) panic("bounce memory not resident"); bouncepa[i] = pa; bounceallocarray[i/(8*sizeof(int))] &= ~(1<<(i%(8*sizeof(int)))); } bouncefree = bouncepages; } #endif /* BOUNCE_BUFFERS */ /* * quick version of vm_fault */ static void vm_fault_quick(v, prot) caddr_t v; int prot; { if (prot & VM_PROT_WRITE) subyte(v, fubyte(v)); else fubyte(v); } /* * Finish a fork operation, with process p2 nearly set up. * Copy and update the kernel stack and pcb, making the child * ready to run, and marking it so that it can return differently * than the parent. Returns 1 in the child process, 0 in the parent. * We currently double-map the user area so that the stack is at the same * address in each process; in the future we will probably relocate * the frame pointers on the stack after copying. */ int cpu_fork(p1, p2) register struct proc *p1, *p2; { register struct user *up = p2->p_addr; int offset; /* * Copy pcb and stack from proc p1 to p2. * We do this as cheaply as possible, copying only the active * part of the stack. The stack and pcb need to agree; * this is tricky, as the final pcb is constructed by savectx, * but its frame isn't yet on the stack when the stack is copied. * swtch compensates for this when the child eventually runs. * This should be done differently, with a single call * that copies and updates the pcb+stack, * replacing the bcopy and savectx. */ p2->p_addr->u_pcb = p1->p_addr->u_pcb; offset = mvesp() - (int)kstack; bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset, (unsigned) ctob(UPAGES) - offset); p2->p_md.md_regs = p1->p_md.md_regs; pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb); /* * * Arrange for a non-local goto when the new process * is started, to resume here, returning nonzero from setjmp. */ if (savectx(&up->u_pcb, 1)) { /* * Return 1 in child. */ return (1); } return (0); } void cpu_exit(p) register struct proc *p; { #if NNPX > 0 npxexit(p); #endif /* NNPX */ cnt.v_swtch++; cpu_switch(p); panic("cpu_exit"); } void -cpu_wait(p) struct proc *p; { -/* extern vm_map_t upages_map; */ - +cpu_wait(p) + struct proc *p; +{ /* drop per-process resources */ - pmap_remove(vm_map_pmap(u_map), (vm_offset_t) p->p_addr, - ((vm_offset_t) p->p_addr) + ctob(UPAGES)); + pmap_qremove((vm_offset_t) p->p_addr, UPAGES); kmem_free(u_map, (vm_offset_t)p->p_addr, ctob(UPAGES)); vmspace_free(p->p_vmspace); } /* * Dump the machine specific header information at the start of a core dump. 
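The bounce-page allocator above keeps one bit per page in an array of unsigned words and locates a free page with ffs(). A self-contained sketch of that idiom follows; the names are hypothetical, the tsleep()/retry path of vm_bounce_page_find() is omitted, a 32-bit unsigned is assumed (as BITS_IN_UNSIGNED works out to on i386), and ffs(3) is taken to return the 1-based index of the least significant set bit, or 0 if none:

        #include <strings.h>

        /* Find a clear bit, set it, and return its index; -1 if all taken. */
        static int
        bitmap_alloc(unsigned *map, int nwords)
        {
                int i, bit;

                for (i = 0; i < nwords; i++) {
                        if (map[i] == 0xffffffffU)
                                continue;               /* this word is full */
                        bit = ffs(~map[i]);             /* first clear bit, 1-based */
                        map[i] |= 1U << (bit - 1);
                        return (i * 32 + bit - 1);
                }
                return (-1);
        }

Freeing is the mirror image: clear the bit, as vm_bounce_page_free() and the initialization loop in vm_bounce_init() do above.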
*/ int cpu_coredump(p, vp, cred) struct proc *p; struct vnode *vp; struct ucred *cred; { return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p)); } #ifdef notyet static void setredzone(pte, vaddr) u_short *pte; caddr_t vaddr; { /* eventually do this by setting up an expand-down stack segment for ss0: selector, allowing stack access down to top of u. this means though that protection violations need to be handled thru a double fault exception that must do an integral task switch to a known good context, within which a dump can be taken. a sensible scheme might be to save the initial context used by sched (that has physical memory mapped 1:1 at bottom) and take the dump while still in mapped mode */ } #endif /* * Convert kernel VA to physical address */ u_long kvtop(void *addr) { vm_offset_t va; va = pmap_kextract((vm_offset_t)addr); if (va == 0) panic("kvtop: zero page frame"); return((int)va); } /* * Map an IO request into kernel virtual address space. * * All requests are (re)mapped into kernel VA space. * Notice that we use b_bufsize for the size of the buffer * to be mapped. b_bcount might be modified by the driver. */ void vmapbuf(bp) register struct buf *bp; { register int npf; register caddr_t addr; int off; vm_offset_t kva; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vmapbuf"); /* * this is the kva that is to be used for * the temporary kernel mapping */ kva = (vm_offset_t) bp->b_saveaddr; for (addr = (caddr_t)trunc_page(bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += PAGE_SIZE) { /* * do the vm_fault if needed, do the copy-on-write thing when * reading stuff off device into memory. */ vm_fault_quick(addr, (bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ); pa = pmap_kextract((vm_offset_t) addr); if (pa == 0) panic("vmapbuf: page not present"); /* * hold the data page */ #ifdef DIAGNOSTIC if( VM_PAGE_TO_PHYS(PHYS_TO_VM_PAGE(pa)) != pa) panic("vmapbuf: confused PHYS_TO_VM_PAGE mapping"); #endif vm_page_hold(PHYS_TO_VM_PAGE(pa)); } addr = bp->b_saveaddr = bp->b_data; off = (int)addr & PGOFSET; npf = btoc(round_page(bp->b_bufsize + off)); bp->b_data = (caddr_t) (kva + off); while (npf--) { pa = pmap_kextract((vm_offset_t)addr); if (pa == 0) panic("vmapbuf: null page frame"); pmap_kenter(kva, trunc_page(pa)); addr += PAGE_SIZE; kva += PAGE_SIZE; } } /* * Free the io map PTEs associated with this IO operation. * We also invalidate the TLB entries and restore the original b_addr. */ void vunmapbuf(bp) register struct buf *bp; { register caddr_t addr; vm_offset_t pa; if ((bp->b_flags & B_PHYS) == 0) panic("vunmapbuf"); for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) pmap_kremove((vm_offset_t) addr); bp->b_data = bp->b_saveaddr; bp->b_saveaddr = NULL; /* * unhold the pde, and data pages */ for (addr = (caddr_t)trunc_page((vm_offset_t) bp->b_data); addr < bp->b_data + bp->b_bufsize; addr += NBPG) { /* * release the data page */ pa = pmap_kextract((vm_offset_t) addr); vm_page_unhold(PHYS_TO_VM_PAGE(pa)); } } /* * Force reset the processor by invalidating the entire address space! */ void cpu_reset() { /* * Attempt to do a CPU reset via the keyboard controller, * do not turn of the GateA20, as any machine that fails * to do the reset here would then end up in no man's land. 
*/ #ifndef BROKEN_KEYBOARD_RESET outb(IO_KBD + 4, 0xFE); DELAY(500000); /* wait 0.5 sec to see if that did it */ printf("Keyboard reset did not work, attempting CPU shutdown\n"); DELAY(1000000); /* wait 1 sec for printf to complete */ #endif /* force a shutdown by unmapping entire address space ! */ bzero((caddr_t) PTD, NBPG); /* "good night, sweet prince .... " */ pmap_update(); /* NOTREACHED */ while(1); } /* * Grow the user stack to allow for 'sp'. This version grows the stack in * chunks of SGROWSIZ. */ int grow(p, sp) struct proc *p; u_int sp; { unsigned int nss; caddr_t v; struct vmspace *vm = p->p_vmspace; if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK) return (1); nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE); if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur) return (0); if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) < nss) { int grow_amount; /* * If necessary, grow the VM that the stack occupies * to allow for the rlimit. This allows us to not have * to allocate all of the VM up-front in execve (which * is expensive). * Grow the VM by the amount requested rounded up to * the nearest SGROWSIZ to provide for some hysteresis. */ grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ); v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT, SGROWSIZ) - grow_amount; /* * If there isn't enough room to extend by SGROWSIZ, then * just extend to the maximum size */ if (v < vm->vm_maxsaddr) { v = vm->vm_maxsaddr; grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT); } if ((grow_amount == 0) || (vm_map_find(&vm->vm_map, NULL, 0, (vm_offset_t *)&v, - grow_amount, FALSE) != KERN_SUCCESS)) { + grow_amount, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != KERN_SUCCESS)) { return (0); } vm->vm_ssize += grow_amount >> PAGE_SHIFT; } return (1); } /* * prototype routine to implement the pre-zeroed page mechanism * this routine is called from the idle loop. */ int vm_page_zero_idle() { vm_page_t m; if ((cnt.v_free_count > cnt.v_interrupt_free_min) && (m = vm_page_queue_free.tqh_first)) { TAILQ_REMOVE(&vm_page_queue_free, m, pageq); enable_intr(); pmap_zero_page(VM_PAGE_TO_PHYS(m)); disable_intr(); TAILQ_INSERT_HEAD(&vm_page_queue_zero, m, pageq); + m->queue = PQ_ZERO; ++vm_page_zero_count; return 1; } return 0; } Index: head/sys/kern/imgact_aout.c =================================================================== --- head/sys/kern/imgact_aout.c (revision 13489) +++ head/sys/kern/imgact_aout.c (revision 13490) @@ -1,218 +1,211 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman * 4. The name of the developer may be used to endorse or promote products * derived from this software without specific prior written permission. 
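The chunked growth in grow() above reduces to rounding arithmetic: the deficit between the current stack size and the size the fault needs is rounded up to an SGROWSIZ boundary, so a run of small faults costs one map operation rather than many (the hysteresis the comment mentions). A sketch of just that arithmetic; the 128K SGROWSIZ value is an assumption for illustration, not taken from this file:

        #define SGROWSIZ        (128UL * 1024)          /* assumed chunk size */

        /* Bytes to add so a stack of cur_size covers `needed', in chunks. */
        static unsigned long
        stack_grow_amount(unsigned long cur_size, unsigned long needed)
        {
                unsigned long deficit = needed - cur_size;

                /* Generic roundup; works whether or not SGROWSIZ is 2^n. */
                return (((deficit + SGROWSIZ - 1) / SGROWSIZ) * SGROWSIZ);
        }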
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: imgact_aout.c,v 1.20 1995/12/11 04:56:00 dyson Exp $ + * $Id: imgact_aout.c,v 1.21 1995/12/15 02:57:40 peter Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int exec_aout_imgact __P((struct image_params *imgp)); static int exec_aout_imgact(imgp) struct image_params *imgp; { struct exec *a_out = (struct exec *) imgp->image_header; struct vmspace *vmspace = imgp->proc->p_vmspace; unsigned long vmaddr, virtual_offset; unsigned long file_offset; unsigned long bss_size; int error; #if defined(COMPAT_LINUX) || defined(LINUX) /* * Linux and *BSD binaries look very much alike, * only the machine id is different: * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. */ if (((a_out->a_magic >> 16) & 0xff) != 0x86 && ((a_out->a_magic >> 16) & 0xff) != 0) return -1; #endif /* COMPAT_LINUX || defined(LINUX) */ /* * Set file/virtual offset based on a.out variant. * We do two cases: host byte order and network byte order * (for NetBSD compatibility) */ switch ((int)(a_out->a_magic & 0xffff)) { case ZMAGIC: virtual_offset = 0; if (a_out->a_text) { file_offset = NBPG; } else { /* Bill's "screwball mode" */ file_offset = 0; } break; case QMAGIC: virtual_offset = NBPG; file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: virtual_offset = NBPG; file_offset = 0; break; default: return (-1); } } bss_size = roundup(a_out->a_bss, NBPG); /* * Check various fields in header for validity/bounds. 
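The variant dispatch above condenses to the following standalone sketch: the low 16 bits of a_magic select ZMAGIC or QMAGIC, and a NetBSD-style image stores its magic in network byte order, so the test is retried after ntohl(). The octal magic values and the 4096-byte page are the usual i386 a.out constants, assumed here rather than pulled from the kernel headers:

        #include <arpa/inet.h>

        #define ZMAGIC  0413            /* demand-paged */
        #define QMAGIC  0314            /* demand-paged, header in text */

        /* Returns 0 and fills the offsets, or -1 for an unknown variant. */
        static int
        aout_offsets(unsigned long magic, unsigned long text,
            unsigned long *voff, unsigned long *foff)
        {
                switch ((int)(magic & 0xffff)) {
                case ZMAGIC:
                        *voff = 0;
                        *foff = text ? 4096 : 0;        /* 0: "screwball mode" */
                        return (0);
                case QMAGIC:
                        *voff = 4096;
                        *foff = 0;
                        return (0);
                default:
                        /* NetBSD compatibility: magic kept in network order. */
                        switch ((int)(ntohl(magic) & 0xffff)) {
                        case ZMAGIC:
                        case QMAGIC:
                                *voff = 4096;
                                *foff = 0;
                                return (0);
                        }
                        return (-1);
                }
        }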
*/ if (/* entry point must lay with text region */ a_out->a_entry < virtual_offset || a_out->a_entry >= virtual_offset + a_out->a_text || /* text and data size must each be page rounded */ a_out->a_text % NBPG || a_out->a_data % NBPG) return (-1); /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > imgp->attr->va_size) return (EFAULT); /* * text/data/bss must not exceed limits */ if (/* text can't exceed maximum text size */ a_out->a_text > MAXTSIZ || /* data + bss can't exceed maximum data size */ a_out->a_data + bss_size > MAXDSIZ || /* data + bss can't exceed rlimit */ a_out->a_data + bss_size > imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) return (ENOMEM); /* copy in arguments and/or environment from old process */ error = exec_extract_strings(imgp); if (error) return (error); /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(imgp); /* - * Map text read/execute + * Map text/data read/execute */ vmaddr = virtual_offset; error = vm_mmap(&vmspace->vm_map, /* map */ &vmaddr, /* address */ - a_out->a_text, /* size */ + a_out->a_text + a_out->a_data, /* size */ VM_PROT_READ | VM_PROT_EXECUTE, /* protection */ - VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_WRITE, /* max protection */ + VM_PROT_ALL, /* max protection */ MAP_PRIVATE | MAP_FIXED, /* flags */ (caddr_t)imgp->vp, /* vnode */ file_offset); /* offset */ if (error) return (error); /* - * Map data read/write (if text is 0, assume text is in data area - * [Bill's screwball mode]) + * allow writing of data */ - vmaddr = virtual_offset + a_out->a_text; - error = - vm_mmap(&vmspace->vm_map, - &vmaddr, - a_out->a_data, - VM_PROT_READ | VM_PROT_WRITE | (a_out->a_text ? 0 : VM_PROT_EXECUTE), - VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, - (caddr_t) imgp->vp, - file_offset + a_out->a_text); - if (error) - return (error); + vm_map_protect(&vmspace->vm_map, + vmaddr + a_out->a_text, + vmaddr + a_out->a_text + a_out->a_data, + VM_PROT_ALL, + FALSE); if (bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data * "bss" = 'block started by symbol' - named after the IBM 7090 * instruction of the same name. */ vmaddr = virtual_offset + a_out->a_text + a_out->a_data; - error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE); + error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return (error); } /* Fill in process VM information */ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) virtual_offset; vmspace->vm_daddr = (caddr_t) virtual_offset + a_out->a_text; /* Fill in image_params */ imgp->interpreted = 0; imgp->entry_addr = a_out->a_entry; imgp->proc->p_sysent = &aout_sysvec; /* Indicate that this file should not be modified */ imgp->vp->v_flag |= VTEXT; return (0); } /* * Tell kern_execve.c about it, with a little help from the linker. * Since `const' objects end up in the text segment, TEXT_SET is the * correct directive to use. */ static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; TEXT_SET(execsw_set, aout_execsw); Index: head/sys/kern/imgact_gzip.c =================================================================== --- head/sys/kern/imgact_gzip.c (revision 13489) +++ head/sys/kern/imgact_gzip.c (revision 13490) @@ -1,379 +1,379 @@ /* * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. 
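The hunk above is the substance of this change for a.out: instead of two vm_mmap() calls (text, then data), the file is mapped once read/execute across text+data, and the data span is then opened up with vm_map_protect(). A userland analogy with plain POSIX calls standing in for the kernel primitives; it assumes `text' is page aligned, which the header checks above guarantee:

        #include <stddef.h>
        #include <sys/mman.h>

        /* Map text+data from `fd' in one shot, then make data writable. */
        static void *
        map_aout_image(int fd, size_t text, size_t data, long file_offset)
        {
                void *base = mmap(NULL, text + data, PROT_READ | PROT_EXEC,
                    MAP_PRIVATE, fd, file_offset);

                if (base == MAP_FAILED)
                        return (NULL);
                /* Data stays file-backed; it simply becomes writable too. */
                if (mprotect((char *)base + text, data,
                    PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
                        return (NULL);
                return (base);
        }

Note how the old "screwball mode" (a_text of zero) falls out naturally: the protection change then covers the whole mapping.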
As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * - * $Id: imgact_gzip.c,v 1.16 1995/12/02 16:32:01 bde Exp $ + * $Id: imgact_gzip.c,v 1.17 1995/12/07 12:46:35 davidg Exp $ * * This module handles execution of a.out files which have been run through * "gzip". This saves diskspace, but wastes cpu-cycles and VM. * * TODO: * text-segments should be made R/O after being filled * is the vm-stuff safe ? * should handle the entire header of gzip'ed stuff. * inflate isn't quite reentrant yet... * error-handling is a mess... * so is the rest... * tidy up unnecesary includes */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct imgact_gzip { struct image_params *ip; struct exec a_out; int error; int where; u_char *inbuf; u_long offset; u_long output; u_long len; int idx; u_long virtual_offset, file_offset, file_end, bss_size; }; static int exec_gzip_imgact __P((struct image_params *imgp)); static int NextByte __P((void *vp)); static int do_aout_hdr __P((struct imgact_gzip *)); static int Flush __P((void *vp, u_char *, u_long siz)); static int exec_gzip_imgact(imgp) struct image_params *imgp; { int error, error2 = 0; u_char *p = (u_char *) imgp->image_header; struct imgact_gzip igz; struct inflate infl; /* If these four are not OK, it isn't a gzip file */ if (p[0] != 0x1f) return -1; /* 0 Simply magic */ if (p[1] != 0x8b) return -1; /* 1 Simply magic */ if (p[2] != 0x08) return -1; /* 2 Compression method */ if (p[9] != 0x03) return -1; /* 9 OS compressed on */ /* * If this one contains anything but a comment or a filename marker, * we don't want to chew on it */ if (p[3] & ~(0x18)) return ENOEXEC; /* 3 Flags */ /* These are of no use to us */ /* 4-7 Timestamp */ /* 8 Extra flags */ bzero(&igz, sizeof igz); bzero(&infl, sizeof infl); infl.gz_private = (void *) &igz; infl.gz_input = NextByte; infl.gz_output = Flush; igz.ip = imgp; igz.idx = 10; if (p[3] & 0x08) { /* skip a filename */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } if (p[3] & 0x10) { /* skip a comment */ while (p[igz.idx++]) if (igz.idx >= PAGE_SIZE) return ENOEXEC; } igz.len = igz.ip->attr->va_size; error = inflate(&infl); if (igz.inbuf) { error2 = vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, (vm_offset_t) igz.inbuf + PAGE_SIZE); } if (igz.error || error || error2) { printf("Output=%lu ", igz.output); printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", error, igz.error, error2, igz.where); } if (igz.error) return igz.error; if (error) return ENOEXEC; if (error2) return error2; return 0; } static int do_aout_hdr(struct imgact_gzip * gz) { int error; struct vmspace *vmspace = gz->ip->proc->p_vmspace; u_long vmaddr; /* * Set file/virtual offset based on a.out variant. 
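The byte tests at the top of exec_gzip_imgact() above are the standard gzip header checks. Restated as a single predicate, with the same offsets and a hypothetical function name:

        /* 1 if `p' points at a gzip header this image activator accepts. */
        static int
        gzip_header_ok(const unsigned char *p)
        {
                if (p[0] != 0x1f || p[1] != 0x8b)       /* magic */
                        return (0);
                if (p[2] != 0x08)                       /* deflate method */
                        return (0);
                if (p[9] != 0x03)                       /* OS: Unix */
                        return (0);
                if (p[3] & ~0x18)                       /* only FNAME/FCOMMENT */
                        return (0);
                return (1);
        }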
We do two cases: * host byte order and network byte order (for NetBSD compatibility) */ switch ((int) (gz->a_out.a_magic & 0xffff)) { case ZMAGIC: gz->virtual_offset = 0; if (gz->a_out.a_text) { gz->file_offset = NBPG; } else { /* Bill's "screwball mode" */ gz->file_offset = 0; } break; case QMAGIC: gz->virtual_offset = NBPG; gz->file_offset = 0; break; default: /* NetBSD compatibility */ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { case ZMAGIC: case QMAGIC: gz->virtual_offset = NBPG; gz->file_offset = 0; break; default: gz->where = __LINE__; return (-1); } } gz->bss_size = roundup(gz->a_out.a_bss, NBPG); /* * Check various fields in header for validity/bounds. */ if ( /* entry point must lay with text region */ gz->a_out.a_entry < gz->virtual_offset || gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || /* text and data size must each be page rounded */ gz->a_out.a_text % NBPG || gz->a_out.a_data % NBPG) { gz->where = __LINE__; return (-1); } /* * text/data/bss must not exceed limits */ if ( /* text can't exceed maximum text size */ gz->a_out.a_text > MAXTSIZ || /* data + bss can't exceed maximum data size */ gz->a_out.a_data + gz->bss_size > MAXDSIZ || /* data + bss can't exceed rlimit */ gz->a_out.a_data + gz->bss_size > gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { gz->where = __LINE__; return (ENOMEM); } /* Find out how far we should go */ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; /* copy in arguments and/or environment from old process */ error = exec_extract_strings(gz->ip); if (error) { gz->where = __LINE__; return (error); } /* * Destroy old process VM and create a new one (with a new stack) */ exec_new_vmspace(gz->ip); vmaddr = gz->virtual_offset; error = vm_mmap(&vmspace->vm_map, /* map */ &vmaddr,/* address */ gz->a_out.a_text, /* size */ VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_WRITE, /* protection */ VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_WRITE, MAP_ANON | MAP_FIXED, /* flags */ 0, /* vnode */ 0); /* offset */ if (error) { gz->where = __LINE__; return (error); } vmaddr = gz->virtual_offset + gz->a_out.a_text; /* * Map data read/write (if text is 0, assume text is in data area * [Bill's screwball mode]) */ error = vm_mmap(&vmspace->vm_map, &vmaddr, gz->a_out.a_data, VM_PROT_READ | VM_PROT_WRITE | (gz->a_out.a_text ? 0 : VM_PROT_EXECUTE), VM_PROT_ALL, MAP_ANON | MAP_FIXED, 0, 0); if (error) { gz->where = __LINE__; return (error); } if (gz->bss_size != 0) { /* * Allocate demand-zeroed area for uninitialized data "bss" = 'block * started by symbol' - named after the IBM 7090 instruction of the * same name. 
*/ vmaddr = gz->virtual_offset + gz->a_out.a_text + gz->a_out.a_data; - error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE); + error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, gz->bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) { gz->where = __LINE__; return (error); } } /* Fill in process VM information */ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; vmspace->vm_taddr = (caddr_t) gz->virtual_offset; vmspace->vm_daddr = (caddr_t) gz->virtual_offset + gz->a_out.a_text; /* Fill in image_params */ gz->ip->interpreted = 0; gz->ip->entry_addr = gz->a_out.a_entry; gz->ip->proc->p_sysent = &aout_sysvec; return 0; } static int NextByte(void *vp) { int error; struct imgact_gzip *igz = (struct imgact_gzip *) vp; if (igz->idx >= igz->len) { igz->where = __LINE__; return GZ_EOF; } if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { return igz->inbuf[(igz->idx++) - igz->offset]; } if (igz->inbuf) { error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, (vm_offset_t) igz->inbuf + PAGE_SIZE); if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } } igz->offset = igz->idx & ~PAGE_MASK; error = vm_mmap(kernel_map, /* map */ (vm_offset_t *) & igz->inbuf, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ (caddr_t) igz->ip->vp, /* vnode */ igz->offset); /* offset */ if (error) { igz->where = __LINE__; igz->error = error; return GZ_EOF; } return igz->inbuf[(igz->idx++) - igz->offset]; } static int Flush(void *vp, u_char * ptr, u_long siz) { struct imgact_gzip *gz = (struct imgact_gzip *) vp; u_char *p = ptr, *q; int i; /* First, find a a.out-header */ if (gz->output < sizeof gz->a_out) { q = (u_char *) & gz->a_out; i = min(siz, sizeof gz->a_out - gz->output); bcopy(p, q + gz->output, i); gz->output += i; p += i; siz -= i; if (gz->output == sizeof gz->a_out) { i = do_aout_hdr(gz); if (i == -1) { if (!gz->where) gz->where = __LINE__; gz->error = ENOEXEC; return ENOEXEC; } else if (i) { gz->where = __LINE__; gz->error = i; return ENOEXEC; } if (gz->file_offset < sizeof gz->a_out) { q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; bcopy(&gz->a_out, q, sizeof gz->a_out - gz->file_offset); } } } /* Skip over zero-padded first PAGE if needed */ if (gz->output < gz->file_offset && (gz->output + siz) > gz->file_offset) { i = min(siz, gz->file_offset - gz->output); gz->output += i; p += i; siz -= i; } if (gz->output >= gz->file_offset && gz->output < gz->file_end) { i = min(siz, gz->file_end - gz->output); q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; bcopy(p, q, i); gz->output += i; p += i; siz -= i; } gz->output += siz; return 0; } /* * Tell kern_execve.c about it, with a little help from the linker. * Since `const' objects end up in the text segment, TEXT_SET is the * correct directive to use. */ static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; TEXT_SET(execsw_set, gzip_execsw); Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 13489) +++ head/sys/kern/init_main.c (revision 13490) @@ -1,629 +1,629 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
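Flush() above has to parse the a.out header out of a decompressed byte stream that arrives in arbitrarily sized chunks, so it accumulates the first sizeof(struct exec) bytes in a side buffer before calling do_aout_hdr(). A reduced sketch of that gathering step, with a hypothetical stream type and a 32-byte array standing in for struct exec:

        #include <string.h>

        struct hdr_stream {
                unsigned char   hdr[32];        /* stand-in for struct exec */
                unsigned long   seen;           /* header bytes seen so far */
        };

        /* Consume header bytes from *p/*len; returns 1 once complete. */
        static int
        gather_header(struct hdr_stream *s, const unsigned char **p,
            unsigned long *len)
        {
                unsigned long want = sizeof(s->hdr) - s->seen;
                unsigned long take = *len < want ? *len : want;

                memcpy(s->hdr + s->seen, *p, take);
                s->seen += take;
                *p += take;
                *len -= take;
                return (s->seen == sizeof(s->hdr));
        }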
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 - * $Id: init_main.c,v 1.35 1995/12/07 12:46:36 davidg Exp $ + * $Id: init_main.c,v 1.36 1995/12/10 13:45:11 phk Exp $ */ #include #include #include #include #include #include #ifdef GPROF #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include extern struct linker_set sysinit_set; /* XXX */ extern void __main __P((void)); extern void main __P((void *framep)); /* Components of the first process -- never freed. */ static struct session session0; static struct pgrp pgrp0; struct proc proc0; static struct pcred cred0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; struct proc *curproc = &proc0; struct proc *initproc; static int cmask = CMASK; extern struct user *proc0paddr; struct vnode *rootvp; int boothowto; struct timeval boottime; SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, CTLFLAG_RW, &boottime, timeval, ""); struct timeval runtime; /* * Promiscuous argument pass for start_init() * * This is a kludge because we use a return from main() rather than a call * to a new reoutine in locore.s to kick the kernel alive from locore.s. */ static void *init_framep; #if __GNUC__ >= 2 void __main() {} #endif /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. 
*/ SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL) /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads", like an LFS * cleaner. */ void main(framep) void *framep; { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ int rval[2]; /* SI_TYPE_KTHREAD support*/ /* * Save the locore.s frame pointer for start_init(). */ init_framep = framep; /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). * * Since some things care about execution order, this is the * operation which ensures continued function. */ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { for( xipp = sipp + 1; *xipp; xipp++) { if( (*sipp)->subsystem < (*xipp)->subsystem || ( (*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order < (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { if( (*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ switch( (*sipp)->type) { case SI_TYPE_DEFAULT: /* no special processing*/ (*((*sipp)->func))( (*sipp)->udata); break; case SI_TYPE_KTHREAD: /* kernel thread*/ if (fork(&proc0, NULL, rval)) panic("fork kernel process"); if (rval[1]) { (*((*sipp)->func))( (*sipp)->udata); /* * The call to start "init" returns * here after the scheduler has been * started, and returns to the caller * in i386/i386/locore.s. This is a * necessary part of initialization * and is rather non-obvious. * * No other "kernel threads" should * return here. Call panic() instead. */ return; } break; default: panic( "init_main: unrecognized init type"); } } /* NOTREACHED*/ } /* * Start a kernel process. This is called after a fork() call in * main() in the file kern/init_main.c. * * This function is used to start "internal" daemons. */ /* ARGSUSED*/ void kproc_start(udata) void *udata; { struct kproc_desc *kp = udata; struct proc *p = curproc; /* save a global descriptor, if desired*/ if( kp->global_procpp != NULL) *kp->global_procpp = p; /* this is a non-swapped system process*/ p->p_flag |= P_INMEM | P_SYSTEM; /* set up arg0 for 'ps', et al*/ strcpy( p->p_comm, kp->arg0); /* call the processes' main()...*/ (*kp->func)(); /* NOTREACHED */ panic("kproc_start: %s", kp->arg0); } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ #ifdef OMIT /* * Handled by vfs_mountroot (bad idea) at this time... should be * done the same as 4.4Lite2. 
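The sort main() performs above is a plain bubble sort over the linker set with two keys: subsystem first, order second. The same comparison, reduced to a standalone sketch over a NULL-terminated vector, with a two-field struct standing in for struct sysinit:

        struct si {
                int     subsystem;      /* primary key */
                int     order;          /* secondary key */
        };

        static void
        sysinit_sort(struct si **v)
        {
                struct si **sipp, **xipp, *save;

                for (sipp = v; *sipp; sipp++)
                        for (xipp = sipp + 1; *xipp; xipp++) {
                                if ((*sipp)->subsystem < (*xipp)->subsystem ||
                                    ((*sipp)->subsystem == (*xipp)->subsystem &&
                                    (*sipp)->order < (*xipp)->order))
                                        continue;       /* already ordered */
                                save = *sipp;           /* swap out of order */
                                *sipp = *xipp;
                                *xipp = save;
                        }
        }

An O(n^2) sort is fine here: it runs once at boot over a modest, link-time-fixed set.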
*/ SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) #endif /* OMIT*/ /* * Should get its own file... */ #ifdef HPFPLIB char copyright[] = "Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n"; #else char copyright[] = "Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; #endif static void print_caddr_t __P((void *data)); static void print_caddr_t(data) void *data; { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init __P((void *dummy)); static void proc0_init(dummy) void *dummy; { register struct proc *p; register struct filedesc0 *fdp; register int i; /* * Initialize the current process pointer (curproc) before * any possible traps/probes to simplify trap processing. */ p = &proc0; curproc = p; /* XXX redundant*/ /* * Create process 0 (the swapper). */ allproc = (volatile struct proc *)p; p->p_prev = (struct proc **)&allproc; p->p_pgrp = &pgrp0; pgrphash[0] = &pgrp0; pgrp0.pg_mem = p; pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; p->p_sysent = &aout_sysvec; p->p_flag = P_INMEM | P_SYSTEM; p->p_stat = SRUN; p->p_nice = NZERO; p->p_rtprio.type = RTP_PRIO_NORMAL; p->p_rtprio.prio = 0; bcopy("swapper", p->p_comm, sizeof ("swapper")); /* Create credentials. */ cred0.p_refcnt = 1; p->p_cred = &cred0; p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit(&vmspace0.vm_pmap); vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAX_ADDRESS), TRUE); vmspace0.vm_map.pmap = &vmspace0.vm_pmap; p->p_addr = proc0paddr; /* XXX */ #define INCOMPAT_LITES2 #ifdef INCOMPAT_LITES2 /* * proc0 needs to have a coherent frame base, too. * This probably makes the identical call for the init proc * that happens later unnecessary since it should inherit * it during the fork. */ cpu_set_init_frame(p, init_framep); /* XXX! 
 */
#endif	/* INCOMPAT_LITES2*/

	/*
	 * We continue to place resource usage info and signal
	 * actions in the user struct so they're pageable.
	 */
	p->p_stats = &p->p_addr->u_stats;
	p->p_sigacts = &p->p_addr->u_sigacts;

	/*
	 * Initialize per uid information structure and charge
	 * root for one process.
	 */
	usrinfoinit();
	(void)chgproccnt(0, 1);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)

/* ARGSUSED*/
static void proc0_post __P((void *dummy));
static void
proc0_post(dummy)
	void *dummy;
{
	/*
	 * Now can look at time, having had a chance to verify the time
	 * from the file system.  Reset p->p_rtime as it may have been
	 * munched in mi_switch() after the time got set.
	 */
	proc0.p_stats->p_start = runtime = mono_time = boottime = time;
	proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0;

	/* Initialize signal state for process 0. */
	siginit(&proc0);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)

/*
 ***************************************************************************
 ****
 **** The following SYSINIT's and glue code should be moved to the
 **** respective files on a per subsystem basis.
 ****
 ***************************************************************************
 */

/* ARGSUSED*/
static void sched_setup __P((void *dummy));
static void
sched_setup(dummy)
	void *dummy;
{
	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
}
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

/* ARGSUSED*/
static void xxx_vfs_mountroot __P((void *dummy));
static void
xxx_vfs_mountroot(dummy)
	void *dummy;
{
	/* Mount the root file system. */
	if ((*mountroot)(mountrootvfsops))
		panic("cannot mount root");
}
SYSINIT(mountroot, SI_SUB_ROOT, SI_ORDER_FIRST, xxx_vfs_mountroot, NULL)

/* ARGSUSED*/
static void xxx_vfs_root_fdtab __P((void *dummy));
static void
xxx_vfs_root_fdtab(dummy)
	void *dummy;
{
	register struct filedesc0 *fdp = &filedesc0;

	/* Get the vnode for '/'.  Set fdp->fd_fd.fd_cdir to reference it. */
	if (VFS_ROOT(mountlist.cqh_first, &rootvnode))
		panic("cannot find root vnode");
	fdp->fd_fd.fd_cdir = rootvnode;
	VREF(fdp->fd_fd.fd_cdir);
	VOP_UNLOCK(rootvnode);
	fdp->fd_fd.fd_rdir = NULL;
}
SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL)

/*
 ***************************************************************************
 ****
 **** The following code probably belongs in another file, like
 **** kern/init_init.c.  It is here for two reasons only:
 ****
 ****	1) This code returns to start up the system; this is
 ****	   abnormal for a kernel thread.
 ****	2) This code promiscuously uses init_frame
 ****
 ***************************************************************************
 */

static void kthread_init __P((void *dummy));
SYSINIT_KT(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL)

static void start_init __P((struct proc *p, void *framep));

/* ARGSUSED*/
static void
kthread_init(dummy)
	void *dummy;
{
	/* Create process 1 (init(8)). */
	start_init(curproc, init_framep);

	/*
	 * This is the only kernel thread allowed to return to the
	 * caller!!!
	 */
	return;
}

/*
 * List of paths to try when searching for "init".
 */
static char *initpaths[] = {
	"/sbin/init",
	"/sbin/oinit",
	"/sbin/init.bak",
	"/stand/sysinstall",
	NULL,
};

/*
 * Start the initial user process; try exec'ing each pathname in "initpaths".
 * The program is invoked with one argument containing the boot flags.
*/ static void start_init(p, framep) struct proc *p; void *framep; { vm_offset_t addr; struct execve_args args; int options, i, retval[2], error; char **pathp, *path, *ucp, **uap, *arg0, *arg1; initproc = p; /* * We need to set the system call frame as if we were entered through * a syscall() so that when we call execve() below, it will be able * to set the entry point (see setregs) when it tries to exec. The * startup code in "locore.s" has allocated space for the frame and * passed a pointer to that space as main's argument. */ cpu_set_init_frame(p, framep); /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE); - if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE) != 0) + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { /* * Move out the boot flag argument. */ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ for (i = strlen(path) + 1; i >= 0; i--) (void)subyte(--ucp, path[i]); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((int)ucp & ~(NBPW-1)); (void)suword((caddr_t)--uap, 0); /* terminator */ (void)suword((caddr_t)--uap, (int)arg1); (void)suword((caddr_t)--uap, (int)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise return to main() which returns to btext * which completes the system startup. */ if ((error = execve(p, &args, &retval[0])) == 0) return; if (error != ENOENT) printf("exec %s: error %d\n", path, error); } printf("init: not found\n"); panic("no init"); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 13489) +++ head/sys/kern/kern_exec.c (revision 13490) @@ -1,584 +1,584 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman * 4. The name of the developer may not be used to endorse or promote products * derived from this software without specific prior written permission. 
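start_init() above fakes up an execve() image by writing bytes and words straight onto the new process stack with subyte()/suword(), growing downward: a trailing NUL, the flag characters, the leading hyphen, the path, then word-aligned argv pointers. The same downward layout against an ordinary buffer, as an illustrative helper rather than kernel code:

        #include <stddef.h>
        #include <string.h>

        /* Copy string `s' (with its NUL) just below `sp'; return new top. */
        static char *
        push_string(char *sp, const char *s)
        {
                size_t n = strlen(s) + 1;

                memcpy(sp - n, s, n);
                return (sp - n);
        }

        /* Example: lay out path + flags the way start_init() does. */
        static char **
        build_args(char *stack_top, const char *path, const char *flags,
            char **argvec /* at least 3 slots */)
        {
                char *arg1 = push_string(stack_top, flags);     /* e.g. "-s" */
                char *arg0 = push_string(arg1, path);   /* e.g. "/sbin/init" */

                argvec[0] = arg0;
                argvec[1] = arg1;
                argvec[2] = NULL;       /* terminator, the suword(..., 0) above */
                return (argvec);
        }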
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: kern_exec.c,v 1.31 1996/01/04 20:28:45 wollman Exp $ + * $Id: kern_exec.c,v 1.32 1996/01/08 04:30:41 peter Exp $ */ #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int *exec_copyout_strings __P((struct image_params *)); static int exec_check_permissions(struct image_params *); /* * execsw_set is constructed for us by the linker. Each of the items * is a pointer to a `const struct execsw', hence the double pointer here. */ static const struct execsw **execsw = (const struct execsw **)&execsw_set.ls_items[0]; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * execve() system call. */ int execve(p, uap, retval) struct proc *p; register struct execve_args *uap; int *retval; { struct nameidata nd, *ndp; int *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; imgp = &image_params; /* * Initialize part of the common data */ imgp->proc = p; imgp->uap = uap; imgp->attr = &attr; imgp->image_header = NULL; imgp->argc = imgp->envc = 0; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_pageable(exec_map, ARG_MAX); if (imgp->stringbase == NULL) { error = ENOMEM; goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, uap->fname, p); interpret: error = namei(ndp); if (error) { kmem_free(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); goto exec_fail; } imgp->vp = ndp->ni_vp; if (imgp->vp == NULL) { error = ENOEXEC; goto exec_fail_dealloc; } /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); /* * Lose the lock on the vnode. It's no longer needed, and must not * exist for the pagefault paging to work below. */ VOP_UNLOCK(imgp->vp); if (error) goto exec_fail_dealloc; /* * Map the image header (first page) of the file into * kernel address space */ error = vm_mmap(kernel_map, /* map */ (vm_offset_t *)&imgp->image_header, /* address */ PAGE_SIZE, /* size */ VM_PROT_READ, /* protection */ VM_PROT_READ, /* max protection */ 0, /* flags */ (caddr_t)imgp->vp, /* vnode */ 0); /* offset */ if (error) { uprintf("mmap failed: %d\n",error); goto exec_fail_dealloc; } /* * Loop through list of image activators, calling each one. 
 * If there is no match, the activator returns -1.  If there
 * is a match, but there was an error during the activation,
 * the error is returned.  Otherwise 0 means success.  If the
 * image is interpreted, loop back up and try activating
 * the interpreter.
 */
	for (i = 0; execsw[i]; ++i) {
		if (execsw[i]->ex_imgact)
			error = (*execsw[i]->ex_imgact)(imgp);
		else
			continue;

		if (error == -1)
			continue;
		if (error)
			goto exec_fail_dealloc;
		if (imgp->interpreted) {
			/* free old vnode and name buffer */
			vrele(ndp->ni_vp);
			FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
			if (vm_map_remove(kernel_map, (vm_offset_t)imgp->image_header,
			    (vm_offset_t)imgp->image_header + PAGE_SIZE))
				panic("execve: header dealloc failed (1)");

			/* set new name to that of the interpreter */
			NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
			    UIO_SYSSPACE, imgp->interpreter_name, p);
			goto interpret;
		}
		break;
	}
	/* If we made it through all the activators and none matched, exit. */
	if (error == -1) {
		error = ENOEXEC;
		goto exec_fail_dealloc;
	}

	/*
	 * Copy out strings (args and env) and initialize stack base
	 */
	stack_base = exec_copyout_strings(imgp);
	p->p_vmspace->vm_minsaddr = (char *)stack_base;

	/*
	 * If a custom stack fixup routine is present for this process,
	 * let it do the stack setup; else stuff the argument count in
	 * as the first item on the stack.
	 */
	if (p->p_sysent->sv_fixup)
		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	else
		suword(--stack_base, imgp->argc);

	/* close files on exec */
	fdcloseexec(p);

	/* reset caught signals */
	execsigs(p);

	/* name this process - nameiexec(p, ndp) */
	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
	p->p_comm[len] = 0;

	/*
	 * mark as executable, wake up any process that was vforked and tell
	 * it that it now has its own resources back
	 */
	p->p_flag |= P_EXEC;
	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
		p->p_flag &= ~P_PPWAIT;
		wakeup((caddr_t)p->p_pptr);
	}

	/*
	 * Implement image setuid/setgid.  Disallow if the process is
	 * being traced.
	 */
	if ((attr.va_mode & (VSUID | VSGID)) &&
	    (p->p_flag & P_TRACED) == 0) {
		/*
		 * Turn off syscall tracing for set-id programs, except for
		 * root.
		 */
		if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) {
			p->p_traceflag = 0;
			vrele(p->p_tracep);
			p->p_tracep = NULL;
		}
		/*
		 * Set the new credentials.
		 */
		p->p_ucred = crcopy(p->p_ucred);
		if (attr.va_mode & VSUID)
			p->p_ucred->cr_uid = attr.va_uid;
		if (attr.va_mode & VSGID)
			p->p_ucred->cr_groups[0] = attr.va_gid;
		p->p_flag |= P_SUGID;
	} else {
		p->p_flag &= ~P_SUGID;
	}

	/*
	 * Implement correct POSIX saved-id behavior.
	 */
	p->p_cred->p_svuid = p->p_ucred->cr_uid;
	p->p_cred->p_svgid = p->p_ucred->cr_gid;

	/*
	 * Store the vp for use in procfs
	 */
	if (p->p_textvp)		/* release old reference */
		vrele(p->p_textvp);
	VREF(ndp->ni_vp);
	p->p_textvp = ndp->ni_vp;

	/*
	 * If tracing the process, trap to debugger so breakpoints
	 * can be set before the program executes.
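The activator protocol in the loop above is simple: each entry returns -1 for "not my format", 0 on success, or a real errno, and an interpreted image restarts the whole lookup on the interpreter's name. Reduced to a sketch, with imgact_fn and the table as illustrative stand-ins for the execsw entries:

        #include <errno.h>
        #include <stddef.h>

        typedef int (*imgact_fn)(void *img);

        /* Try each activator in turn; -1 from all of them means ENOEXEC. */
        static int
        run_activators(imgact_fn *tab, void *img)
        {
                int error = -1;
                int i;

                for (i = 0; tab[i] != NULL; i++) {
                        error = tab[i](img);
                        if (error != -1)
                                break;          /* matched (0) or hard error */
                }
                return (error == -1 ? ENOEXEC : error);
        }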
*/ if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Set entry address */ setregs(p, imgp->entry_addr, (u_long)stack_base); /* * free various allocated resources */ kmem_free(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (vm_map_remove(kernel_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (2)"); vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); return (0); exec_fail_dealloc: if (imgp->stringbase != NULL) kmem_free(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); if (imgp->image_header && imgp->image_header != (char *)-1) if (vm_map_remove(kernel_map, (vm_offset_t)imgp->image_header, (vm_offset_t)imgp->image_header + PAGE_SIZE)) panic("execve: header dealloc failed (3)"); if (ndp->ni_vp) vrele(ndp->ni_vp); FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); exec_fail: if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. exit gracefully */ exit1(p, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ return(0); } else { return(error); } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. */ int exec_new_vmspace(imgp) struct image_params *imgp; { int error; struct vmspace *vmspace = imgp->proc->p_vmspace; caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); imgp->vmspace_destroyed = 1; /* Blow away entire process VM */ if (vmspace->vm_shm) shmexit(imgp->proc); vm_map_remove(&vmspace->vm_map, 0, USRSTACK); /* Allocate a new stack */ error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *)&stack_addr, - SGROWSIZ, FALSE); + SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) return(error); vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; /* Initialize maximum stack address */ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; return(0); } /* * Copy out argument and environment strings from the old process * address space into the temporary string buffer. */ int exec_extract_strings(imgp) struct image_params *imgp; { char **argv, **envv; char *argp, *envp; int error, length; /* * extract arguments first */ argv = imgp->uap->argv; if (argv) { while ((argp = (caddr_t) fuword(argv++))) { if (argp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(argp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->argc++; } } /* * extract environment strings */ envv = imgp->uap->envv; if (envv) { while ((envp = (caddr_t) fuword(envv++))) { if (envp == (caddr_t) -1) return (EFAULT); if ((error = copyinstr(envp, imgp->stringp, imgp->stringspace, &length))) { if (error == ENAMETOOLONG) return(E2BIG); return (error); } imgp->stringspace -= length; imgp->stringp += length; imgp->envc++; } } return (0); } /* * Copy strings out to the new process address space, constructing * new arg and env vector tables. Return a pointer to the base * so that it can be used as the initial stack pointer. */ int * exec_copyout_strings(imgp) struct image_params *imgp; { int argc, envc; char **vectp; char *stringp, *destp; int *stack_base; struct ps_strings *arginfo; /* * Calculate string base and vector table pointers. 
 */
	arginfo = PS_STRINGS;
	destp = (caddr_t)arginfo - SPARE_USRSPACE -
	    roundup((ARG_MAX - imgp->stringspace), sizeof(char *));

	/*
	 * The '+ 2' is for the null pointers at the end of each of the
	 * arg and env vector sets
	 */
	vectp = (char **) (destp -
	    (imgp->argc + imgp->envc + 2) * sizeof(char *));

	/*
	 * vectp also becomes our initial stack base
	 */
	stack_base = (int *)vectp;

	stringp = imgp->stringbase;
	argc = imgp->argc;
	envc = imgp->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(stringp, destp, ARG_MAX - imgp->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword(&arginfo->ps_argvstr, (int)vectp);
	suword(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.
	 */
	for (; argc > 0; --argc) {
		suword(vectp++, (int)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword(vectp++, NULL);

	suword(&arginfo->ps_envstr, (int)vectp);
	suword(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		suword(vectp++, (int)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* end of vector table is a null pointer */
	suword(vectp, NULL);

	return (stack_base);
}

/*
 * Check permissions of file to execute.
 * Return 0 for success or error code on failure.
 */
static int
exec_check_permissions(imgp)
	struct image_params *imgp;
{
	struct proc *p = imgp->proc;
	struct vnode *vp = imgp->vp;
	struct vattr *attr = imgp->attr;
	int error;

	/*
	 * Check number of open-for-writes on the file and deny execution
	 * if there are any.
	 */
	if (vp->v_writecount) {
		return (ETXTBSY);
	}

	/* Get file attributes */
	error = VOP_GETATTR(vp, attr, p->p_ucred, p);
	if (error)
		return (error);

	/*
	 * 1) Check if file execution is disabled for the filesystem that this
	 *	file resides on.
	 * 2) Ensure that at least one execute bit is on - otherwise root
	 *	will always succeed, and we don't want that to happen unless
	 *	the file really is executable.
	 * 3) Ensure that the file is a regular file.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
	    ((attr->va_mode & 0111) == 0) ||
	    (attr->va_type != VREG)) {
		return (EACCES);
	}

	/*
	 * Zero length files can't be exec'd
	 */
	if (attr->va_size == 0)
		return (ENOEXEC);

	/*
	 * Disable setuid/setgid if the filesystem prohibits it or if
	 * the process is being traced.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED))
		attr->va_mode &= ~(VSUID | VSGID);

	/*
	 * Check for execute permission to file based on current credentials.
	 * Then call filesystem specific open routine (which does nothing
	 * in the general case).
	 */
	error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
	if (error)
		return (error);

	error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
	if (error)
		return (error);

	return (0);
}
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c	(revision 13489)
+++ head/sys/kern/kern_exit.c	(revision 13490)
@@ -1,516 +1,516 @@
/*-
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 - * $Id: kern_exit.c,v 1.25 1996/01/04 20:28:46 wollman Exp $ + * $Id: kern_exit.c,v 1.26 1996/01/08 04:30:44 peter Exp $ */ #include "opt_ktrace.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_43 #include #include #endif #include #include #include #include #include #include #include static int wait1 __P((struct proc *, struct wait_args *, int [], int)); /* * exit -- * Death of process. */ __dead void exit(p, uap, retval) struct proc *p; struct rexit_args /* { int rval; } */ *uap; int *retval; { exit1(p, W_EXITCODE(uap->rval, 0)); /* NOTREACHED */ } /* * Exit: deallocate address space and other resources, change proc state * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ __dead void exit1(p, rv) register struct proc *p; int rv; { register struct proc *q, *nq; register struct proc **pp; register struct vmspace *vm; if (p->p_pid == 1) { printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); panic("Going nowhere without my init!"); } #ifdef PGINPROF vmsizmon(); #endif if (p->p_flag & P_PROFIL) stopprofclock(p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), M_ZOMBIE, M_WAITOK); /* * If parent is waiting for us to exit or exec, * P_PPWAIT is set; we will wakeup the parent below. */ p->p_flag &= ~(P_TRACED | P_PPWAIT); p->p_flag |= P_WEXIT; p->p_sigignore = ~0; p->p_siglist = 0; untimeout(realitexpire, (caddr_t)p); /* * Close open files and release open-file table. * This may block! */ fdfree(p); /* * XXX Shutdown SYSV semaphores */ semexit(p); /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; if (vm->vm_shm) shmexit(p); /* * Release user portion of address space. 
* This releases references to vnodes, * which could cause I/O if the file has been unlinked. * Need to do this early enough that we can still sleep. * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. */ if (vm->vm_refcnt == 1) (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); if (SESS_LEADER(p)) { register struct session *sp = p->p_session; if (sp->s_ttyvp) { /* * Controlling process. * Signal foreground pgrp, * drain controlling terminal * and revoke access to controlling terminal. */ if (sp->s_ttyp->t_session == sp) { if (sp->s_ttyp->t_pgrp) pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); (void) ttywait(sp->s_ttyp); /* * The tty could have been revoked * if we blocked. */ if (sp->s_ttyvp) vgoneall(sp->s_ttyvp); } if (sp->s_ttyvp) vrele(sp->s_ttyvp); sp->s_ttyvp = NULL; /* * s_ttyp is not zero'd; we use this to indicate * that the session once had a controlling terminal. * (for logging and informational purposes) */ } sp->s_leader = NULL; } fixjobc(p, p->p_pgrp, 0); p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; (void)acct_process(p); #ifdef KTRACE /* * release trace file */ p->p_traceflag = 0; /* don't trace the vrele() */ if (p->p_tracep) vrele(p->p_tracep); #endif /* * Remove proc from allproc queue and pidhash chain. * Place onto zombproc. Unlink from parent's child list. */ if ((*p->p_prev = p->p_next)) p->p_next->p_prev = p->p_prev; if ((p->p_next = zombproc)) p->p_next->p_prev = &p->p_next; p->p_prev = &zombproc; zombproc = p; p->p_stat = SZOMB; for (pp = &pidhash[PIDHASH(p->p_pid)]; *pp; pp = &(*pp)->p_hash) if (*pp == p) { *pp = p->p_hash; goto done; } panic("exit"); done: if (p->p_cptr) /* only need this if any child is S_ZOMB */ wakeup((caddr_t) initproc); for (q = p->p_cptr; q != NULL; q = nq) { nq = q->p_osptr; if (nq != NULL) nq->p_ysptr = NULL; if (initproc->p_cptr) initproc->p_cptr->p_ysptr = q; q->p_osptr = initproc->p_cptr; q->p_ysptr = NULL; initproc->p_cptr = q; q->p_pptr = initproc; /* * Traced processes are killed * since their existence means someone is screwing up. */ if (q->p_flag & P_TRACED) { q->p_flag &= ~P_TRACED; psignal(q, SIGKILL); } } p->p_cptr = NULL; /* * Save exit status and final rusage info, adding in child rusage * info and self times. */ p->p_xstat = rv; *p->p_ru = p->p_stats->p_ru; calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); ruadd(p->p_ru, &p->p_stats->p_cru); /* * Notify parent that we're gone. */ psignal(p->p_pptr, SIGCHLD); wakeup((caddr_t)p->p_pptr); #if defined(tahoe) /* move this to cpu_exit */ p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL; #endif /* * Clear curproc after we've done all operations * that could block, and before tearing down the rest * of the process state that might be used from clock, etc. * Also, can't clear curproc while we're still runnable, * as we're not on a run queue (we are current, just not * a proper proc any longer!). * * Other substructures are freed from wait(). */ curproc = NULL; if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); p->p_limit = NULL; } /* * Finally, call machine-dependent code to release the remaining * resources including address space, the kernel stack and pcb. * The address space is released by "vmspace_free(p->p_vmspace)"; * This is machine-dependent, as we may have to change stacks * or ensure that the current one isn't reallocated before we * finish. cpu_exit will end with a call to cpu_swtch(), finishing * our execution (pun intended). 
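 */

/*
 * Illustration (not part of this commit): the status word built with
 * W_EXITCODE() and saved in p_xstat above is what a parent eventually
 * decodes with the standard wait macros.  A minimal userland sketch of
 * the receiving side:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>

static void
reap(pid_t pid)
{
	int status;

	if (waitpid(pid, &status, 0) == -1)
		return;
	if (WIFEXITED(status))		/* normal exit: signal bits are 0 */
		printf("exit code %d\n", WEXITSTATUS(status));
	else if (WIFSIGNALED(status))	/* killed: low bits hold the signal */
		printf("signal %d\n", WTERMSIG(status));
}
#endif

/*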
*/ cpu_exit(p); } #ifdef COMPAT_43 #if defined(hp300) || defined(luna68k) #include #define GETPS(rp) ((struct frame *)(rp))->f_sr #else #define GETPS(rp) (rp)[PS] #endif int owait(p, uap, retval) struct proc *p; register struct owait_args /* { int dummy; } */ *uap; int *retval; { struct wait_args w; #ifdef PSL_ALLCC if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { w.options = 0; w.rusage = NULL; } else { w.options = p->p_md.md_regs[R0]; w.rusage = (struct rusage *)p->p_md.md_regs[R1]; } #else w.options = 0; w.rusage = NULL; #endif w.pid = WAIT_ANY; w.status = NULL; return (wait1(p, &w, retval, 1)); } #endif /* COMPAT_43 */ int wait4(p, uap, retval) struct proc *p; struct wait_args *uap; int *retval; { return (wait1(p, uap, retval, 0)); } static int wait1(q, uap, retval, compat) register struct proc *q; register struct wait_args /* { int pid; int *status; int options; struct rusage *rusage; } */ *uap; int retval[]; int compat; { register int nfound; register struct proc *p, *t; int status, error; if (uap->pid == 0) uap->pid = -q->p_pgid; #ifdef notyet if (uap->options &~ (WUNTRACED|WNOHANG)) return (EINVAL); #endif loop: nfound = 0; for (p = q->p_cptr; p; p = p->p_osptr) { if (uap->pid != WAIT_ANY && p->p_pid != uap->pid && p->p_pgid != -uap->pid) continue; nfound++; if (p->p_stat == SZOMB) { /* charge child's scheduling cpu usage to parent */ if (curproc->p_pid != 1) { curproc->p_estcpu = min(curproc->p_estcpu + p->p_estcpu, UCHAR_MAX); } retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ if ((error = copyout((caddr_t)&status, (caddr_t)uap->status, sizeof(status)))) return (error); } if (uap->rusage && (error = copyout((caddr_t)p->p_ru, (caddr_t)uap->rusage, sizeof (struct rusage)))) return (error); /* * If we got the child via a ptrace 'attach', * we need to give it back to the old parent. */ if (p->p_oppid && (t = pfind(p->p_oppid))) { p->p_oppid = 0; proc_reparent(p, t); psignal(t, SIGCHLD); wakeup((caddr_t)t); return (0); } p->p_xstat = 0; ruadd(&q->p_stats->p_cru, p->p_ru); FREE(p->p_ru, M_ZOMBIE); p->p_ru = NULL; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(p->p_cred->p_ruid, -1); /* + * Release reference to text vnode + */ + if (p->p_textvp) + vrele(p->p_textvp); + + /* * Free up credentials. */ if (--p->p_cred->p_refcnt == 0) { crfree(p->p_cred->pc_ucred); FREE(p->p_cred, M_SUBPROC); p->p_cred = NULL; } - - /* - * Release reference to text vnode - */ - if (p->p_textvp) - vrele(p->p_textvp); /* * Finally finished with old proc entry. * Unlink it from its process group and free it. */ leavepgrp(p); if ((*p->p_prev = p->p_next)) /* off zombproc */ p->p_next->p_prev = p->p_prev; if ((q = p->p_ysptr)) q->p_osptr = p->p_osptr; if ((q = p->p_osptr)) q->p_ysptr = p->p_ysptr; if ((q = p->p_pptr)->p_cptr == p) q->p_cptr = p->p_osptr; /* * Give machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context.
*/ cpu_wait(p); FREE(p, M_PROC); nprocs--; return (0); } if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { p->p_flag |= P_WAITED; retval[0] = p->p_pid; #ifdef COMPAT_43 if (compat) { retval[1] = W_STOPCODE(p->p_xstat); error = 0; } else #endif if (uap->status) { status = W_STOPCODE(p->p_xstat); error = copyout((caddr_t)&status, (caddr_t)uap->status, sizeof(status)); } else error = 0; return (error); } } if (nfound == 0) return (ECHILD); if (uap->options & WNOHANG) { retval[0] = 0; return (0); } if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) return (error); goto loop; } /* * make process 'parent' the new parent of process 'child'. */ void proc_reparent(child, parent) register struct proc *child; register struct proc *parent; { register struct proc *o; register struct proc *y; if (child->p_pptr == parent) return; /* fix up the child linkage for the old parent */ o = child->p_osptr; y = child->p_ysptr; if (y) y->p_osptr = o; if (o) o->p_ysptr = y; if (child->p_pptr->p_cptr == child) child->p_pptr->p_cptr = o; /* fix up child linkage for new parent */ o = parent->p_cptr; if (o) o->p_ysptr = child; child->p_osptr = o; child->p_ysptr = NULL; parent->p_cptr = child; child->p_pptr = parent; } Index: head/sys/kern/subr_trap.c =================================================================== --- head/sys/kern/subr_trap.c (revision 13489) +++ head/sys/kern/subr_trap.c (revision 13490) @@ -1,1061 +1,1062 @@ /*- * Copyright (C) 1994, David Greenman * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 - * $Id: trap.c,v 1.69 1996/01/03 21:41:36 wollman Exp $ + * $Id: trap.c,v 1.70 1996/01/04 21:11:03 wollman Exp $ */ /* * 386 Trap and System call handling */ #include "opt_ktrace.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef POWERFAIL_NMI # include # include #endif #include "isa.h" #include "npx.h" int (*pmath_emulate) __P((struct trapframe *)); extern void trap __P((struct trapframe frame)); extern int trapwrite __P((unsigned addr)); extern void syscall __P((struct trapframe frame)); extern void linux_syscall __P((struct trapframe frame)); static int trap_pfault __P((struct trapframe *, int)); static void trap_fatal __P((struct trapframe *)); void dblfault_handler __P((void)); extern inthand_t IDTVEC(syscall); #define MAX_TRAP_MSG 27 static char *trap_msg[] = { "", /* 0 unused */ "privileged instruction fault", /* 1 T_PRIVINFLT */ "", /* 2 unused */ "breakpoint instruction fault", /* 3 T_BPTFLT */ "", /* 4 unused */ "", /* 5 unused */ "arithmetic trap", /* 6 T_ARITHTRAP */ "system forced exception", /* 7 T_ASTFLT */ "", /* 8 unused */ "general protection fault", /* 9 T_PROTFLT */ "trace trap", /* 10 T_TRCTRAP */ "", /* 11 unused */ "page fault", /* 12 T_PAGEFLT */ "", /* 13 unused */ "alignment fault", /* 14 T_ALIGNFLT */ "", /* 15 unused */ "", /* 16 unused */ "", /* 17 unused */ "integer divide fault", /* 18 T_DIVIDE */ "non-maskable interrupt trap", /* 19 T_NMI */ "overflow trap", /* 20 T_OFLOW */ "FPU bounds check fault", /* 21 T_BOUND */ "FPU device not available", /* 22 T_DNA */ "double fault", /* 23 T_DOUBLEFLT */ "FPU operand fetch fault", /* 24 T_FPOPFLT */ "invalid TSS fault", /* 25 T_TSSFLT */ "segment not present fault", /* 26 T_SEGNPFLT */ "stack fault", /* 27 T_STKFLT */ }; static void userret __P((struct proc *p, struct trapframe *frame, u_quad_t oticks)); static inline void userret(p, frame, oticks) struct proc *p; struct trapframe *frame; u_quad_t oticks; { int sig, s; while ((sig = CURSIG(p)) != 0) postsig(sig); p->p_priority = p->p_usrpri; if (want_resched) { /* * Since we are curproc, clock will normally just change * our priority without moving us from one queue to another * (since the running process is not on a queue.) * If that happened after we setrunqueue ourselves but before we * mi_switch()'ed, we might not be on the queue indicated by * our priority. */ s = splclock(); setrunqueue(p); p->p_stats->p_ru.ru_nivcsw++; mi_switch(); splx(s); while ((sig = CURSIG(p)) != 0) postsig(sig); } /* * Charge system time if profiling. */ if (p->p_flag & P_PROFIL) { u_quad_t ticks = p->p_sticks - oticks; if (ticks) { #ifdef PROFTIMER extern int profscale; addupc(frame->tf_eip, &p->p_stats->p_prof, ticks * profscale); #else addupc(frame->tf_eip, &p->p_stats->p_prof, ticks); #endif } } curpriority = p->p_priority; } /* * Exception, fault, and trap interface to the FreeBSD kernel. * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this * frame after the exception has been processed. 
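 */

/*
 * Illustration (not part of this commit): a user-mode integer divide
 * fault (T_DIVIDE in the switch below) is delivered as SIGFPE with code
 * FPE_INTDIV_TRAP.  A minimal userland sketch of the receiving end; the
 * deliberate divide by zero is, strictly speaking, undefined behavior in
 * C and simply relies on the i386 #DE trap:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void
fpe_handler(int sig)
{
	printf("caught SIGFPE\n");
	_exit(0);
}

int
main(void)
{
	volatile int zero = 0;

	signal(SIGFPE, fpe_handler);
	return (1 / zero);	/* traps; trap() posts SIGFPE */
}
#endif

/*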
*/ void trap(frame) struct trapframe frame; { struct proc *p = curproc; u_quad_t sticks = 0; int i = 0, ucode = 0, type, code; #ifdef DEBUG u_long eva; #endif type = frame.tf_trapno; code = frame.tf_err; if (ISPL(frame.tf_cs) == SEL_UPL) { /* user trap */ sticks = p->p_sticks; p->p_md.md_regs = (int *)&frame; switch (type) { case T_PRIVINFLT: /* privileged instruction fault */ ucode = type; i = SIGILL; break; case T_BPTFLT: /* bpt instruction fault */ case T_TRCTRAP: /* trace trap */ frame.tf_eflags &= ~PSL_T; i = SIGTRAP; break; case T_ARITHTRAP: /* arithmetic trap */ ucode = code; i = SIGFPE; break; case T_ASTFLT: /* Allow process switch */ astoff(); cnt.v_soft++; if (p->p_flag & P_OWEUPC) { addupc(frame.tf_eip, &p->p_stats->p_prof, 1); p->p_flag &= ~P_OWEUPC; } goto out; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ case T_STKFLT: /* stack fault */ case T_TSSFLT: /* invalid TSS fault */ case T_DOUBLEFLT: /* double fault */ default: ucode = code + BUS_SEGM_FAULT ; i = SIGBUS; break; case T_PAGEFLT: /* page fault */ i = trap_pfault(&frame, TRUE); if (i == -1) return; if (i == 0) goto out; ucode = T_PAGEFLT; break; case T_DIVIDE: /* integer divide fault */ ucode = FPE_INTDIV_TRAP; i = SIGFPE; break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI goto handle_powerfail; #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; panic("NMI indicates hardware failure"); #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ case T_OFLOW: /* integer overflow fault */ ucode = FPE_INTOVF_TRAP; i = SIGFPE; break; case T_BOUND: /* bounds check fault */ ucode = FPE_SUBRNG_TRAP; i = SIGFPE; break; case T_DNA: #if NNPX > 0 /* if a transparent fault (due to context switch "late") */ if (npxdna()) return; #endif /* NNPX > 0 */ if (!pmath_emulate) { i = SIGFPE; ucode = FPE_FPU_NP_TRAP; break; } i = (*pmath_emulate)(&frame); if (i == 0) { if (!(frame.tf_eflags & PSL_T)) return; frame.tf_eflags &= ~PSL_T; i = SIGTRAP; } /* else ucode = emulator_only_knows() XXX */ break; case T_FPOPFLT: /* FPU operand fetch fault */ ucode = T_FPOPFLT; i = SIGILL; break; } } else { /* kernel trap */ switch (type) { case T_PAGEFLT: /* page fault */ (void) trap_pfault(&frame, FALSE); return; case T_PROTFLT: /* general protection fault */ case T_SEGNPFLT: /* segment not present fault */ /* * Invalid segment selectors and out of bounds * %eip's and %esp's can be set up in user mode. * This causes a fault in kernel mode when the * kernel tries to return to user mode. We want * to get this fault so that we can fix the * problem here and not have to check all the * selectors and pointers when the user changes * them. */ #define MAYBE_DORETI_FAULT(where, whereto) \ do { \ if (frame.tf_eip == (int)where) { \ frame.tf_eip = (int)whereto; \ return; \ } \ } while (0) if (intr_nesting_level == 0) { MAYBE_DORETI_FAULT(doreti_iret, doreti_iret_fault); MAYBE_DORETI_FAULT(doreti_popl_ds, doreti_popl_ds_fault); MAYBE_DORETI_FAULT(doreti_popl_es, doreti_popl_es_fault); } if (curpcb && curpcb->pcb_onfault) { frame.tf_eip = (int)curpcb->pcb_onfault; return; } break; case T_TSSFLT: /* * PSL_NT can be set in user mode and isn't cleared * automatically when the kernel is entered. This * causes a TSS fault when the kernel attempts to * `iret' because the TSS link is uninitialized. 
We * want to get this fault so that we can fix the * problem here and not every time the kernel is * entered. */ if (frame.tf_eflags & PSL_NT) { frame.tf_eflags &= ~PSL_NT; return; } break; case T_TRCTRAP: /* trace trap */ if (frame.tf_eip == (int)IDTVEC(syscall)) { /* * We've just entered system mode via the * syscall lcall. Continue single stepping * silently until the syscall handler has * saved the flags. */ return; } if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { /* * The syscall handler has now saved the * flags. Stop single stepping it. */ frame.tf_eflags &= ~PSL_T; return; } /* * Fall through. */ case T_BPTFLT: /* * If DDB is enabled, let it handle the debugger trap. * Otherwise, debugger traps "can't happen". */ #ifdef DDB if (kdb_trap (type, 0, &frame)) return; #endif break; #if NISA > 0 case T_NMI: #ifdef POWERFAIL_NMI #ifndef TIMER_FREQ # define TIMER_FREQ 1193182 #endif handle_powerfail: { static unsigned lastalert = 0; if(time.tv_sec - lastalert > 10) { log(LOG_WARNING, "NMI: power fail\n"); sysbeep(TIMER_FREQ/880, hz); lastalert = time.tv_sec; } return; } #else /* !POWERFAIL_NMI */ #ifdef DDB /* NMI can be hooked up to a pushbutton for debugging */ printf ("NMI ... going to debugger\n"); if (kdb_trap (type, 0, &frame)) return; #endif /* DDB */ /* machine/parity/power fail/"kitchen sink" faults */ if (isa_nmi(code) == 0) return; /* FALL THROUGH */ #endif /* POWERFAIL_NMI */ #endif /* NISA > 0 */ } trap_fatal(&frame); return; } trapsignal(p, i, ucode); #ifdef DEBUG eva = rcr2(); if (type <= MAX_TRAP_MSG) { uprintf("fatal process exception: %s", trap_msg[type]); if ((type == T_PAGEFLT) || (type == T_PROTFLT)) uprintf(", fault VA = 0x%x", eva); uprintf("\n"); } #endif out: userret(p, &frame, sticks); } #ifdef notyet /* * This version doesn't allow a page fault to user space while * in the kernel. The rest of the kernel needs to be made "safe" * before this can be used. I think the only things remaining * to be made safe are the iBCS2 code and the process tracing/ * debugging code. */ static int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va < VM_MIN_KERNEL_ADDRESS) { vm_offset_t v; vm_page_t ptepg; if (p == NULL || (!usermode && va < VM_MAXUSER_ADDRESS && (curpcb == NULL || curpcb->pcb_onfault == NULL))) { trap_fatal(frame); return (-1); } /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Don't allow user-mode faults in kernel address space. 
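 */

/*
 * Illustration (not part of this commit): when trap_pfault() cannot
 * resolve a user-mode fault it returns SIGSEGV, or SIGBUS for a
 * protection failure, with the faulting address passed to sendsig()
 * through tf_err.  A minimal userland sketch that provokes such a fault:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <sys/types.h>
#include <sys/mman.h>

int
main(void)
{
	char *p;

	/* map a page, then revoke all access so the next touch faults */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	mprotect(p, 4096, PROT_NONE);
	*p = 1;			/* vm_fault() fails -> SIGSEGV/SIGBUS */
	return (0);
}
#endif

/*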
*/ if (usermode) goto nogo; /* * Since we know that kernel virtual addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(kernel_map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } #endif int trap_pfault(frame, usermode) struct trapframe *frame; int usermode; { vm_offset_t va; struct vmspace *vm = NULL; vm_map_t map = 0; int rv = 0; vm_prot_t ftype; int eva; struct proc *p = curproc; eva = rcr2(); va = trunc_page((vm_offset_t)eva); if (va >= KERNBASE) { /* * Don't allow user-mode faults in kernel address space. */ if (usermode) goto nogo; map = kernel_map; } else { /* * This is a fault on non-kernel virtual memory. * vm is initialized above to NULL. If curproc is NULL * or curproc->p_vmspace is NULL the fault is fatal. */ if (p != NULL) vm = p->p_vmspace; if (vm == NULL) goto nogo; map = &vm->vm_map; } if (frame->tf_err & PGEX_W) ftype = VM_PROT_READ | VM_PROT_WRITE; else ftype = VM_PROT_READ; if (map != kernel_map) { vm_offset_t v; /* * Keep swapout from messing with us during this * critical time. */ ++p->p_lock; /* * Grow the stack if necessary */ if ((caddr_t)va > vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { rv = KERN_FAILURE; --p->p_lock; goto nogo; } } /* * Check if page table is mapped, if not, * fault it first */ v = (vm_offset_t) vtopte(va); /* Fault the pte only if needed: */ if (*((int *)vtopte(v)) == 0) - (void) vm_fault(map, trunc_page(v), VM_PROT_WRITE, FALSE); + (void) vm_fault(map, + trunc_page(v), VM_PROT_WRITE, FALSE); pmap_use_pt( vm_map_pmap(map), va); /* Fault in the user page: */ rv = vm_fault(map, va, ftype, FALSE); pmap_unuse_pt( vm_map_pmap(map), va); --p->p_lock; } else { /* * Since we know that kernel virtual addresses * always have pte pages mapped, we just have to fault * the page. */ rv = vm_fault(map, va, ftype, FALSE); } if (rv == KERN_SUCCESS) return (0); nogo: if (!usermode) { if (curpcb && curpcb->pcb_onfault) { frame->tf_eip = (int)curpcb->pcb_onfault; return (0); } trap_fatal(frame); return (-1); } /* kludge to pass faulting virtual address to sendsig */ frame->tf_err = eva; return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); } static void trap_fatal(frame) struct trapframe *frame; { int code, type, eva; struct soft_segment_descriptor softseg; code = frame->tf_err; type = frame->tf_trapno; eva = rcr2(); sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); if (type <= MAX_TRAP_MSG) printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg[type], ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); if (type == T_PAGEFLT) { printf("fault virtual address = 0x%x\n", eva); printf("fault code = %s %s, %s\n", code & PGEX_U ? "user" : "supervisor", code & PGEX_W ? "write" : "read", code & PGEX_P ?
"protection violation" : "page not present"); } printf("instruction pointer = 0x%x:0x%x\n", frame->tf_cs & 0xffff, frame->tf_eip); printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); printf(" = DPL %d, pres %d, def32 %d, gran %d\n", softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, softseg.ssd_gran); printf("processor eflags = "); if (frame->tf_eflags & PSL_T) printf("trace/trap, "); if (frame->tf_eflags & PSL_I) printf("interrupt enabled, "); if (frame->tf_eflags & PSL_NT) printf("nested task, "); if (frame->tf_eflags & PSL_RF) printf("resume, "); if (frame->tf_eflags & PSL_VM) printf("vm86, "); printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); printf("current process = "); if (curproc) { printf("%lu (%s)\n", (u_long)curproc->p_pid, curproc->p_comm ? curproc->p_comm : ""); } else { printf("Idle\n"); } printf("interrupt mask = "); if ((cpl & net_imask) == net_imask) printf("net "); if ((cpl & tty_imask) == tty_imask) printf("tty "); if ((cpl & bio_imask) == bio_imask) printf("bio "); if (cpl == 0) printf("none"); printf("\n"); #ifdef KDB if (kdb_trap(&psl)) return; #endif #ifdef DDB if (kdb_trap (type, 0, frame)) return; #endif if (type <= MAX_TRAP_MSG) panic(trap_msg[type]); else panic("unknown/reserved trap"); } /* * Double fault handler. Called when a fault occurs while writing * a frame for a trap/exception onto the stack. This usually occurs * when the stack overflows (such is the case with infinite recursion, * for example). * * XXX Note that the current PTD gets replaced by IdlePTD when the * task switch occurs. This means that the stack that was active at * the time of the double fault is not available at unless * the machine was idle when the double fault occurred. The downside * of this is that "trace " in ddb won't work. */ void dblfault_handler() { struct pcb *pcb = curpcb; if (pcb != NULL) { printf("\nFatal double fault:\n"); printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip); printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp); printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp); } panic("double fault"); } /* * Compensate for 386 brain damage (missing URKR). * This is a little simpler than the pagefault handler in trap() because * it the page tables have already been faulted in and high addresses * are thrown out early for other reasons. */ int trapwrite(addr) unsigned addr; { struct proc *p; vm_offset_t va, v; struct vmspace *vm; int rv; va = trunc_page((vm_offset_t)addr); /* * XXX - MAX is END. Changed > to >= for temp. fix. */ if (va >= VM_MAXUSER_ADDRESS) return (1); p = curproc; vm = p->p_vmspace; ++p->p_lock; if ((caddr_t)va >= vm->vm_maxsaddr && (caddr_t)va < (caddr_t)USRSTACK) { if (!grow(p, va)) { --p->p_lock; return (1); } } v = trunc_page(vtopte(va)); /* * wire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), FALSE); } /* * fault the data page */ rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); /* * unwire the pte page */ if (va < USRSTACK) { vm_map_pageable(&vm->vm_map, v, round_page(v+1), TRUE); } --p->p_lock; if (rv != KERN_SUCCESS) return 1; return (0); } /* * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. 
*/ void syscall(frame) struct trapframe frame; { caddr_t params; int i; struct sysent *callp; struct proc *p = curproc; u_quad_t sticks; int error; int args[8], rval[2]; u_int code; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("syscall"); p->p_md.md_regs = (int *)&frame; params = (caddr_t)frame.tf_esp + sizeof(int); code = frame.tf_eax; /* * Need to check if this is a 32 bit or 64 bit syscall. */ if (code == SYS_syscall) { /* * Code is first argument, followed by actual args. */ code = fuword(params); params += sizeof(int); } else if (code == SYS___syscall) { /* * Like syscall, but code is a quad, so as to maintain * quad alignment for the rest of the arguments. */ code = fuword(params); params += sizeof(quad_t); } if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; if ((i = callp->sy_narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif goto bad; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif rval[0] = 0; rval[1] = frame.tf_edx; error = (*callp->sy_call)(p, args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_edx = rval[1]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* * Reconstruct pc, assuming lcall $X,y is 7 bytes. */ frame.tf_eip -= 7; break; case EJUSTRETURN: break; default: bad: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. */ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #if defined(COMPAT_LINUX) || defined(LINUX) void linux_syscall(frame) struct trapframe frame; { struct proc *p = curproc; struct sysent *callp; u_quad_t sticks; int error; int rval[2]; u_int code; struct linux_syscall_args { int arg1; int arg2; int arg3; int arg4; int arg5; } args; args.arg1 = frame.tf_ebx; args.arg2 = frame.tf_ecx; args.arg3 = frame.tf_edx; args.arg4 = frame.tf_esi; args.arg5 = frame.tf_edi; sticks = p->p_sticks; if (ISPL(frame.tf_cs) != SEL_UPL) panic("linux syscall"); p->p_md.md_regs = (int *)&frame; code = frame.tf_eax; if (p->p_sysent->sv_mask) code &= p->p_sysent->sv_mask; if (code >= p->p_sysent->sv_size) callp = &p->p_sysent->sv_table[0]; else callp = &p->p_sysent->sv_table[code]; #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) ktrsyscall(p->p_tracep, code, callp->sy_narg, (int *)&args); #endif rval[0] = 0; error = (*callp->sy_call)(p, &args, rval); switch (error) { case 0: /* * Reinitialize proc pointer `p' as it may be different * if this is a child returning from fork syscall. */ p = curproc; frame.tf_eax = rval[0]; frame.tf_eflags &= ~PSL_C; break; case ERESTART: /* Reconstruct pc, subtract size of int 0x80 */ frame.tf_eip -= 2; break; case EJUSTRETURN: break; default: if (p->p_sysent->sv_errsize) if (error >= p->p_sysent->sv_errsize) error = -1; /* XXX */ else error = p->p_sysent->sv_errtbl[error]; frame.tf_eax = -error; frame.tf_eflags |= PSL_C; break; } if (frame.tf_eflags & PSL_T) { /* Traced syscall. 
*/ frame.tf_eflags &= ~PSL_T; trapsignal(p, SIGTRAP, 0); } userret(p, &frame, sticks); #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) ktrsysret(p->p_tracep, code, error, rval[0]); #endif } #endif /* COMPAT_LINUX || LINUX */ Index: head/sys/kern/sys_process.c =================================================================== --- head/sys/kern/sys_process.c (revision 13489) +++ head/sys/kern/sys_process.c (revision 13490) @@ -1,371 +1,371 @@ /* * Copyright (c) 1994, Sean Eric Fagan * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: sys_process.c,v 1.18 1995/12/16 21:43:47 bde Exp $ + * $Id: sys_process.c,v 1.19 1995/12/17 06:59:36 bde Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int pread (struct proc *procp, unsigned int addr, unsigned int *retval) { int rv; vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired, single_use; vm_pindex_t pindex; /* Map page into kernel space */ map = &procp->p_vmspace->vm_map; page_offset = addr - trunc_page(addr); pageno = trunc_page(addr); tmap = map; rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); if (rv != KERN_SUCCESS) return EINVAL; vm_map_lookup_done (tmap, out_entry); /* Find space in kernel_map for the page we're interested in */ - rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), &kva, - PAGE_SIZE, 1); + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); if (!rv) { vm_object_reference (object); rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); if (!rv) { *retval = 0; bcopy ((caddr_t)kva + page_offset, retval, sizeof *retval); } vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); } return rv; } static int pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { int rv; vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; boolean_t wired, single_use; vm_pindex_t pindex; boolean_t fix_prot = 0; /* Map page into kernel space */ map = &procp->p_vmspace->vm_map; page_offset = addr - trunc_page(addr); pageno = trunc_page(addr); /* * Check the permissions for the area we're interested in. */ if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE, VM_PROT_WRITE) == FALSE) { /* * If the page was not writable, we make it so. * XXX It is possible a page may *not* be read/executable, * if a process changes that! */ fix_prot = 1; /* The page isn't writable, so let's try making it so... */ if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE, VM_PROT_ALL, 0)) != KERN_SUCCESS) return EFAULT; /* I guess... */ } /* * Now we need to get the page. out_entry, out_prot, wired, and * single_use aren't used. One would think the vm code would be * a *bit* nicer... We use tmap because vm_map_lookup() can * change the map argument. */ tmap = map; rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); if (rv != KERN_SUCCESS) { return EINVAL; } /* * Okay, we've got the page. Let's release tmap. */ vm_map_lookup_done (tmap, out_entry); /* * Fault the page in... 
*/ vm_map_pageable(map, trunc_page(vtopte(pageno)), trunc_page(vtopte(pageno)) + PAGE_SIZE, FALSE); rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE); vm_map_pageable(map, trunc_page(vtopte(pageno)), trunc_page(vtopte(pageno)) + PAGE_SIZE, TRUE); if (rv != KERN_SUCCESS) return EFAULT; /* Find space in kernel_map for the page we're interested in */ - rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), &kva, - PAGE_SIZE, 1); - + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); if (!rv) { vm_object_reference (object); rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); if (!rv) { bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum); } vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); } if (fix_prot) vm_map_protect (map, pageno, pageno + PAGE_SIZE, VM_PROT_READ|VM_PROT_EXECUTE, 0); return rv; } /* * Process debugging system call. */ #ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; #endif int ptrace(curp, uap, retval) struct proc *curp; struct ptrace_args *uap; int *retval; { struct proc *p; int error = 0; *retval = 0; if (uap->req == PT_TRACE_ME) { curp->p_flag |= P_TRACED; return 0; } if ((p = pfind(uap->pid)) == NULL) { return ESRCH; } #ifdef PT_ATTACH if (uap->req != PT_ATTACH && ( (p->p_flag & P_TRACED) == 0 || (p->p_tptr && curp != p->p_tptr) || (!p->p_tptr && curp != p->p_pptr))) return ESRCH; #endif #ifdef PT_ATTACH if (uap->req != PT_ATTACH) { #endif if ((p->p_flag & P_TRACED) == 0) return EPERM; if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) return EBUSY; #ifdef PT_ATTACH } #endif /* * XXX The PT_ATTACH code is completely broken. It will * be obsoleted by a /proc filesystem, so is it worth it * to fix it? (Answer, probably. So that'll be next, * I guess.) */ switch (uap->req) { #ifdef PT_ATTACH case PT_ATTACH: if (curp->p_ucred->cr_uid != 0 && ( curp->p_ucred->cr_uid != p->p_ucred->cr_uid || curp->p_ucred->cr_uid != p->p_cred->p_svuid)) return EACCES; p->p_tptr = curp; p->p_flag |= P_TRACED; psignal(p, SIGSTOP); return 0; case PT_DETACH: if ((unsigned)uap->data >= NSIG) return EINVAL; p->p_flag &= ~P_TRACED; p->p_tptr = NULL; psignal(p->p_pptr, SIGCHLD); wakeup((caddr_t)p->p_pptr); s = splhigh(); if (p->p_stat == SSTOP) { p->p_xstat = uap->data; setrunnable(p); } else if (uap->data) { psignal(p, uap->data); } splx(s); return 0; # ifdef PT_INHERIT case PT_INHERIT: if ((p->p_flag & P_TRACED) == 0) return ESRCH; return 0; # endif /* PT_INHERIT */ #endif /* PT_ATTACH */ case PT_READ_I: case PT_READ_D: if ((error = pread (p, (unsigned int)uap->addr, retval))) return error; return 0; case PT_WRITE_I: case PT_WRITE_D: if ((error = pwrite (p, (unsigned int)uap->addr, (unsigned int)uap->data))) return error; return 0; case PT_STEP: if ((error = ptrace_single_step (p))) return error; /* fallthrough */ case PT_CONTINUE: /* * Continue at addr uap->addr with signal * uap->data; if uap->addr is 1, then we just * let the chips fall where they may. * * The only check I'll make right now is for * uap->data to be larger than NSIG; if so, we return * EINVAL. 
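 */

/*
 * Illustration (not part of this commit): the usual caller-side pattern
 * for the requests handled here -- the child volunteers with PT_TRACE_ME,
 * stops with SIGTRAP on exec, and the parent resumes it with PT_CONTINUE
 * (addr 1 meaning "wherever it stopped", data 0 meaning "no signal").
 * A minimal userland sketch:
 */
#if 0	/* editor's sketch only -- never compiled into the kernel */
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
	pid_t pid;
	int status;

	if ((pid = fork()) == 0) {
		ptrace(PT_TRACE_ME, 0, (caddr_t)0, 0);
		execl("/bin/echo", "echo", "traced", (char *)0);
		_exit(1);
	}
	waitpid(pid, &status, 0);		/* child stops on exec */
	ptrace(PT_CONTINUE, pid, (caddr_t)1, 0);
	waitpid(pid, &status, 0);		/* reap the real exit */
	return (0);
}
#endif

/*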
*/ if (uap->data >= NSIG) return EINVAL; if (uap->addr != (caddr_t)1) { fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); if ((error = ptrace_set_pc (p, (u_int)uap->addr))) return error; } p->p_xstat = uap->data; /* if (p->p_stat == SSTOP) */ setrunnable (p); return 0; case PT_READ_U: if ((u_int)uap->addr > (UPAGES * NBPG - sizeof(int))) { return EFAULT; } p->p_addr->u_kproc.kp_proc = *p; fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); *retval = *(int*)((u_int)p->p_addr + (u_int)uap->addr); return 0; case PT_WRITE_U: p->p_addr->u_kproc.kp_proc = *p; fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); return ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data); case PT_KILL: p->p_xstat = SIGKILL; setrunnable(p); return 0; #ifdef PT_GETREGS case PT_GETREGS: /* * copyout the registers into addr. There's no * size constraint!!! *GRRR* */ return ptrace_getregs(p, uap->addr); case PT_SETREGS: /* * copyin the registers from addr. Again, no * size constraint!!! *GRRRR* */ return ptrace_setregs (p, uap->addr); #endif /* PT_GETREGS */ default: break; } return 0; } int trace_req(p) struct proc *p; { return 1; } Index: head/sys/kern/vfs_bio.c =================================================================== --- head/sys/kern/vfs_bio.c (revision 13489) +++ head/sys/kern/vfs_bio.c (revision 13490) @@ -1,1654 +1,1667 @@ /* * Copyright (c) 1994 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. This work was done expressly for inclusion into FreeBSD. Other use * is allowed if this notation is included. * 5. Modifications may be freely made to this file if the above conditions * are met. * - * $Id: vfs_bio.c,v 1.82 1996/01/06 23:23:02 davidg Exp $ + * $Id: vfs_bio.c,v 1.83 1996/01/06 23:58:03 davidg Exp $ */ /* * this file contains a new buffer I/O scheme implementing a coherent * VM object and buffer cache scheme. Pains have been taken to make * sure that the performance degradation associated with schemes such * as this is not realized. * * Author: John S. Dyson * Significant help during the development and debugging phases * had been provided by David Greenman, also of the FreeBSD core team. 
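 */

/*
 * Illustration (not part of this commit): the canonical consumer of this
 * cache is a filesystem read path -- look the block up through the cache,
 * use the mapped data, release the buffer.  A hypothetical sketch (the
 * function name is invented for illustration; the usual kernel headers
 * are assumed):
 */
#if 0	/* editor's sketch only */
static int
example_read_block(struct vnode *vp, daddr_t blkno, int size)
{
	struct buf *bp;
	int error;

	/* returns cached data, or schedules and awaits the disk I/O */
	error = bread(vp, blkno, size, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* ... consume bp->b_data ... */
	brelse(bp);		/* give the buffer back to the queues */
	return (0);
}
#endif

/*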
*/ #include "opt_bounce.h" #define VMIO #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vfs_update __P((void)); static struct proc *updateproc; static struct kproc_desc up_kp = { "update", vfs_update, &updateproc }; SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) struct buf *buf; /* buffer header pool */ struct swqueue bswlist; int count_lock_queue __P((void)); static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to); static void vfs_clean_pages(struct buf * bp); static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); int needsbuffer; /* * Internal update daemon, process 3 * The variable vfs_update_wakeup allows for internal syncs. */ int vfs_update_wakeup; /* * buffers base kva */ caddr_t buffers_kva; /* * bogus page -- for I/O to/from partially complete buffers * this is a temporary solution to the problem, but it is not * really that bad. it would be better to split the buffer * for input in the case of buffers partially already in memory, * but the code is intricate enough already. */ vm_page_t bogus_page; static vm_offset_t bogus_offset; static int bufspace, maxbufspace; static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash; static struct bqueues bufqueues[BUFFER_QUEUES]; +extern int vm_swap_size; + #define BUF_MAXUSE 8 /* * Initialize buffer headers and related structures. */ void bufinit() { struct buf *bp; int i; TAILQ_INIT(&bswlist); LIST_INIT(&invalhash); /* first, make a null hash table */ for (i = 0; i < BUFHSZ; i++) LIST_INIT(&bufhashtbl[i]); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) TAILQ_INIT(&bufqueues[i]); buffers_kva = (caddr_t) kmem_alloc_pageable(buffer_map, MAXBSIZE * nbuf); /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; bzero(bp, sizeof *bp); bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; bp->b_data = buffers_kva + i * MAXBSIZE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_INSERT_HEAD(&invalhash, bp, b_hash); } /* * maxbufspace is currently calculated to support all filesystem blocks * to be 8K. If you happen to use a 16K filesystem, the size of the buffer * cache is still the same as it would be for 8K filesystems. This * keeps the size of the buffer cache "in check" for big block filesystems. */ maxbufspace = 2 * (nbuf + 8) * PAGE_SIZE; bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); bogus_page = vm_page_alloc(kernel_object, ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); } /* * remove the buffer from the appropriate free list */ void bremfree(struct buf * bp) { int s = splbio(); if (bp->b_qindex != QUEUE_NONE) { TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); bp->b_qindex = QUEUE_NONE; } else { panic("bremfree: removing a buffer when not on a queue"); } splx(s); } /* * Get a buffer with the specified data. Look in the cache first. 
*/ int bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, struct buf ** bpp) { struct buf *bp; bp = getblk(vp, blkno, size, 0, 0); *bpp = bp; /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); return (biowait(bp)); } return (0); } /* * Operates like bread, but also starts asynchronous I/O on * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf ** bpp) { struct buf *bp, *rabp; int i; int rv = 0, readwait = 0; *bpp = bp = getblk(vp, blkno, size, 0, 0); /* if not found in cache, do some I/O */ if ((bp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; bp->b_flags |= B_READ; bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (bp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); bp->b_rcred = cred; } vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); ++readwait; } for (i = 0; i < cnt; i++, rablkno++, rabsize++) { if (inmem(vp, *rablkno)) continue; rabp = getblk(vp, *rablkno, *rabsize, 0, 0); if ((rabp->b_flags & B_CACHE) == 0) { if (curproc != NULL) curproc->p_stats->p_ru.ru_inblock++; rabp->b_flags |= B_READ | B_ASYNC; rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); if (rabp->b_rcred == NOCRED) { if (cred != NOCRED) crhold(cred); rabp->b_rcred = cred; } vfs_busy_pages(rabp, 0); VOP_STRATEGY(rabp); } else { brelse(rabp); } } if (readwait) { rv = biowait(bp); } return (rv); } /* * Write, release buffer on completion. (Done by iodone * if async.) */ int bwrite(struct buf * bp) { int oldflags = bp->b_flags; if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!(bp->b_flags & B_BUSY)) panic("bwrite: buffer is not busy???"); bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); bp->b_flags |= B_WRITEINPROG; if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { reassignbuf(bp, bp->b_vp); } bp->b_vp->v_numoutput++; vfs_busy_pages(bp, 1); if (curproc != NULL) curproc->p_stats->p_ru.ru_oublock++; VOP_STRATEGY(bp); if ((oldflags & B_ASYNC) == 0) { int rtval = biowait(bp); if (oldflags & B_DELWRI) { reassignbuf(bp, bp->b_vp); } brelse(bp); return (rtval); } return (0); } int vn_bwrite(ap) struct vop_bwrite_args *ap; { return (bwrite(ap->a_bp)); } /* * Delayed write. (Buffer is marked dirty). */ void bdwrite(struct buf * bp) { if ((bp->b_flags & B_BUSY) == 0) { panic("bdwrite: buffer is not busy"); } if (bp->b_flags & B_INVAL) { brelse(bp); return; } if (bp->b_flags & B_TAPE) { bawrite(bp); return; } bp->b_flags &= ~(B_READ|B_RELBUF); if ((bp->b_flags & B_DELWRI) == 0) { bp->b_flags |= B_DONE | B_DELWRI; reassignbuf(bp, bp->b_vp); } /* * This bmap keeps the system from needing to do the bmap later, * perhaps when the system is attempting to do a sync. Since it * is likely that the indirect block -- or whatever other datastructure * that the filesystem needs is still in memory now, it is a good * thing to do this. Note also, that if the pageout daemon is * requesting a sync -- there might not be enough memory to do * the bmap then... So, this is important to do. */ if( bp->b_lblkno == bp->b_blkno) { VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); } /* * Set the *dirty* buffer range based upon the VM system dirty pages. 
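 */

/*
 * Illustration (not part of this commit): how a filesystem picks among
 * the three write flavors defined here; note that the delayed-write path
 * below now requeues the still-valid buffer with bqrelse() rather than
 * brelse().  A hypothetical sketch (the function name is invented for
 * illustration):
 */
#if 0	/* editor's sketch only */
static void
example_write_block(struct buf *bp, int sync, int lazy)
{
	if (sync)
		(void) bwrite(bp);	/* start the I/O and wait for it */
	else if (lazy)
		bdwrite(bp);		/* just mark dirty; written later */
	else
		bawrite(bp);		/* start the I/O, don't wait */
}
#endif

/*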
*/ vfs_setdirty(bp); /* * We need to do this here to satisfy the vnode_pager and the * pageout daemon, so that it thinks that the pages have been * "cleaned". Note that since the pages are in a delayed write * buffer -- the VFS layer "will" see that the pages get written * out on the next sync, or perhaps the cluster will be completed. */ vfs_clean_pages(bp); - brelse(bp); + bqrelse(bp); return; } /* * Asynchronous write. * Start output on a buffer, but do not wait for it to complete. * The buffer is released when the output completes. */ void bawrite(struct buf * bp) { bp->b_flags |= B_ASYNC; (void) VOP_BWRITE(bp); } /* * Release a buffer. */ void brelse(struct buf * bp) { int s; if (bp->b_flags & B_CLUSTER) { relpbuf(bp); return; } /* anyone need a "free" block? */ s = splbio(); if (needsbuffer) { needsbuffer = 0; wakeup(&needsbuffer); } /* anyone need this block? */ if (bp->b_flags & B_WANTED) { bp->b_flags &= ~(B_WANTED | B_AGE); wakeup(bp); } if (bp->b_flags & B_LOCKED) bp->b_flags &= ~B_ERROR; if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || (bp->b_bufsize <= 0)) { bp->b_flags |= B_INVAL; bp->b_flags &= ~(B_DELWRI | B_CACHE); - if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) + if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) { + if (bp->b_bufsize) + allocbuf(bp, 0); brelvp(bp); + } } /* * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer * constituted, so the B_INVAL flag is used to *invalidate* the buffer, * but the VM object is kept around. The B_NOCACHE flag is used to * invalidate the pages in the VM object. */ if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; vm_object_t obj; int i, resid; vm_page_t m; struct vnode *vp; int iototal = bp->b_bufsize; vp = bp->b_vp; if (!vp) panic("brelse: missing vp"); if (bp->b_npages) { vm_pindex_t poff; obj = (vm_object_t) vp->v_object; if (vp->v_type == VBLK) foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT; else foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; poff = OFF_TO_IDX(foff); for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, poff + i); if (!m) { panic("brelse: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } resid = IDX_TO_OFF(m->pindex+1) - foff; if (resid > iototal) resid = iototal; if (resid > 0) { /* * Don't invalidate the page if the local machine has already * modified it. This is the lesser of two evils, and should * be fixed. 
*/ if (bp->b_flags & (B_NOCACHE | B_ERROR)) { vm_page_test_dirty(m); if (m->dirty == 0) { vm_page_set_invalid(m, (vm_offset_t) foff, resid); if (m->valid == 0) vm_page_protect(m, VM_PROT_NONE); } } - } - foff += resid; - iototal -= resid; - } - } - - if (bp->b_flags & (B_INVAL | B_RELBUF)) { - for(i = 0; i < bp->b_npages; i++) { - m = bp->b_pages[i]; - --m->bmapped; - if (m->bmapped == 0) { - if (m->flags & PG_WANTED) { - m->flags &= ~PG_WANTED; - wakeup(m); - } - if ((m->busy == 0) && ((m->flags & PG_BUSY) == 0)) { - if (m->object->flags & OBJ_MIGHTBEDIRTY) { - vm_page_test_dirty(m); + if (resid >= PAGE_SIZE) { + if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + bp->b_flags |= B_INVAL; } - /* - * if page isn't valid, no sense in keeping it around - */ - if (m->valid == 0) { - vm_page_protect(m, VM_PROT_NONE); - vm_page_free(m); - /* - * if page isn't dirty and hasn't been referenced by - * a process, then cache it - */ - } else if ((m->dirty & m->valid) == 0 && - (m->flags & PG_REFERENCED) == 0 && - !pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { - vm_page_cache(m); - /* - * otherwise activate it - */ - } else if ((m->flags & PG_ACTIVE) == 0) { - vm_page_activate(m); - m->act_count = 0; + } else { + if (!vm_page_is_valid(m, + (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) { + bp->b_flags |= B_INVAL; } } } + foff += resid; + iototal -= resid; } - bufspace -= bp->b_bufsize; - pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); - bp->b_npages = 0; - bp->b_bufsize = 0; - bp->b_flags &= ~B_VMIO; - if (bp->b_vp) - brelvp(bp); } + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); } if (bp->b_qindex != QUEUE_NONE) panic("brelse: free buffer onto another queue???"); /* enqueue */ /* buffers with no memory */ if (bp->b_bufsize == 0) { bp->b_qindex = QUEUE_EMPTY; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers with junk contents */ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); bp->b_dev = NODEV; /* buffers that are locked */ } else if (bp->b_flags & B_LOCKED) { bp->b_qindex = QUEUE_LOCKED; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); /* buffers with stale but valid contents */ } else if (bp->b_flags & B_AGE) { bp->b_qindex = QUEUE_AGE; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); /* buffers with valid and quite potentially reuseable contents */ } else { bp->b_qindex = QUEUE_LRU; TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); } /* unlock */ bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); splx(s); } /* + * Release a buffer. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + if (needsbuffer) { + needsbuffer = 0; + wakeup(&needsbuffer); + } + + /* anyone need this block? 
*/ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); + + if (bp->b_flags & B_LOCKED) { + bp->b_flags &= ~B_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + + /* unlock */ + bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i; + vm_page_t m; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + if (m->flags & PG_WANTED) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + vm_page_unwire(m); + if (m->wire_count == 0) { + if (m->valid) { + /* + * this keeps pressure off of the process memory + */ + if ((vm_swap_size == 0) || + (cnt.v_free_count < cnt.v_free_min)) + vm_page_cache(m); + } else if ((m->hold_count == 0) && + ((m->flags & PG_BUSY) == 0) && + (m->busy == 0)) { + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } + } + } + bufspace -= bp->b_bufsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + bp->b_npages = 0; + bp->b_bufsize = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* * Check to see if a block is currently memory resident. */ __inline struct buf * gbincore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; bh = BUFHASH(vp, blkno); bp = bh->lh_first; /* Search hash chain */ while (bp != NULL) { /* hit */ if (bp->b_vp == vp && bp->b_lblkno == blkno && (bp->b_flags & B_INVAL) == 0) { break; } bp = bp->b_hash.le_next; } return (bp); } /* * this routine implements clustered async writes for * clearing out B_DELWRI buffers... This is much better * than the old way of writing only one buffer at a time. */ int vfs_bio_awrite(struct buf * bp) { int i; daddr_t lblkno = bp->b_lblkno; struct vnode *vp = bp->b_vp; int s; int ncl; struct buf *bpa; int nwritten; s = splbio(); /* * right now we support clustered writing only to regular files */ if ((vp->v_type == VREG) && (vp->v_mount != 0) && /* Only on nodes that have the size info */ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { int size; int maxcl; size = vp->v_mount->mnt_stat.f_iosize; maxcl = MAXPHYS / size; for (i = 1; i < maxcl; i++) { if ((bpa = gbincore(vp, lblkno + i)) && ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == (B_DELWRI | B_CLUSTEROK)) && (bpa->b_bufsize == size)) { if ((bpa->b_blkno == bpa->b_lblkno) || (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) break; } else { break; } } ncl = i; /* * this is a possible cluster write */ if (ncl != 1) { nwritten = cluster_wbuild(vp, size, lblkno, ncl); splx(s); return nwritten; } } bremfree(bp); splx(s); /* * default (old) behavior, writing out only one block */ bp->b_flags |= B_BUSY | B_ASYNC; nwritten = bp->b_bufsize; (void) VOP_BWRITE(bp); return nwritten; } /* * Find a buffer header which is available for use. */ static struct buf * getnewbuf(int slpflag, int slptimeo, int doingvmio) { struct buf *bp; int s; int nbyteswritten = 0; s = splbio(); start: if (bufspace >= maxbufspace) goto trytofreespace; /* can we constitute a new buffer? 
*/ if ((bp = bufqueues[QUEUE_EMPTY].tqh_first)) { if (bp->b_qindex != QUEUE_EMPTY) panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", bp->b_qindex); + bp->b_flags |= B_BUSY; bremfree(bp); goto fillbuf; } trytofreespace: /* * We keep the file I/O from hogging metadata I/O * This is desirable because file data is cached in the * VM/Buffer cache even if a buffer is freed. */ if ((bp = bufqueues[QUEUE_AGE].tqh_first)) { if (bp->b_qindex != QUEUE_AGE) panic("getnewbuf: inconsistent AGE queue, qindex=%d", bp->b_qindex); } else if ((bp = bufqueues[QUEUE_LRU].tqh_first)) { if (bp->b_qindex != QUEUE_LRU) panic("getnewbuf: inconsistent LRU queue, qindex=%d", bp->b_qindex); } if (!bp) { /* wait for a free buffer of any kind */ needsbuffer = 1; tsleep(&needsbuffer, (PRIBIO + 1) | slpflag, "newbuf", slptimeo); splx(s); return (0); } if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { --bp->b_usecount; TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); if (bufqueues[QUEUE_LRU].tqh_first != NULL) { TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); goto start; } } /* if we are a delayed write, convert to an async write */ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { nbyteswritten += vfs_bio_awrite(bp); if (!slpflag && !slptimeo) { splx(s); return (0); } goto start; } if (bp->b_flags & B_WANTED) { bp->b_flags &= ~B_WANTED; wakeup(bp); } bremfree(bp); + bp->b_flags |= B_BUSY; - if (bp->b_flags & B_VMIO) { - bp->b_flags |= B_RELBUF | B_BUSY | B_DONE; - brelse(bp); - bremfree(bp); - } + if (bp->b_flags & B_VMIO) + vfs_vmio_release(bp); if (bp->b_vp) brelvp(bp); fillbuf: /* we are not free, nor do we contain interesting data */ if (bp->b_rcred != NOCRED) { crfree(bp->b_rcred); bp->b_rcred = NOCRED; } if (bp->b_wcred != NOCRED) { crfree(bp->b_wcred); bp->b_wcred = NOCRED; } - bp->b_flags |= B_BUSY; + LIST_REMOVE(bp, b_hash); LIST_INSERT_HEAD(&invalhash, bp, b_hash); splx(s); if (bp->b_bufsize) { allocbuf(bp, 0); } bp->b_flags = B_BUSY; bp->b_dev = NODEV; bp->b_vp = NULL; bp->b_blkno = bp->b_lblkno = 0; bp->b_iodone = 0; bp->b_error = 0; bp->b_resid = 0; bp->b_bcount = 0; bp->b_npages = 0; bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_validoff = bp->b_validend = 0; bp->b_usecount = 2; if (bufspace >= maxbufspace + nbyteswritten) { s = splbio(); bp->b_flags |= B_INVAL; brelse(bp); goto trytofreespace; } return (bp); } /* * Check to see if a block is currently memory resident. */ struct buf * incore(struct vnode * vp, daddr_t blkno) { struct buf *bp; struct bufhashhdr *bh; int s = splbio(); - - bh = BUFHASH(vp, blkno); - bp = bh->lh_first; - - /* Search hash chain */ - while (bp != NULL) { - /* hit */ - if (bp->b_vp == vp && bp->b_lblkno == blkno && - (bp->b_flags & B_INVAL) == 0) { - break; - } - bp = bp->b_hash.le_next; - } + bp = gbincore(vp, blkno); splx(s); return (bp); } /* * Returns true if no I/O is needed to access the * associated VM object. This is like incore except * it also hunts around in the VM system for the data. 
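 */

/*
 * Editor's note: the following is an illustrative, self-contained sketch of
 * the hash-chain lookup performed by gbincore()/incore(); it is not part of
 * this change.  Every "x"-prefixed name is a hypothetical stand-in for the
 * kernel's struct buf/struct vnode and BUFHASH, and all locking is omitted.
 */
#include <stddef.h>
#include <stdint.h>

struct xvnode;

struct xbuf {
	struct xvnode *b_vp;		/* vnode the block belongs to */
	long b_lblkno;			/* logical block number */
	int b_flags;
	struct xbuf *b_hashnext;	/* hash-chain linkage */
};
#define	XB_INVAL	0x01
#define	XBUFHSZ		64

static struct xbuf *xbufhash[XBUFHSZ];

/* Hash a (vnode, logical block) pair to a chain head, like BUFHASH. */
static struct xbuf **
xbufhashchain(struct xvnode *vp, long blkno)
{
	return (&xbufhash[((uintptr_t)vp + (uintptr_t)blkno) % XBUFHSZ]);
}

/* Walk the chain; a hit must match vp and blkno and not be invalidated. */
static struct xbuf *
xgbincore(struct xvnode *vp, long blkno)
{
	struct xbuf *bp;

	for (bp = *xbufhashchain(vp, blkno); bp != NULL; bp = bp->b_hashnext)
		if (bp->b_vp == vp && bp->b_lblkno == blkno &&
		    (bp->b_flags & XB_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*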
*/ int inmem(struct vnode * vp, daddr_t blkno) { vm_object_t obj; vm_offset_t toff, tinc; vm_page_t m; vm_ooffset_t off; if (incore(vp, blkno)) return 1; if (vp->v_mount == NULL) return 0; if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0) return 0; obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > vp->v_mount->mnt_stat.f_iosize) tinc = vp->v_mount->mnt_stat.f_iosize; off = blkno * vp->v_mount->mnt_stat.f_iosize; for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); if (!m) return 0; if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0) return 0; } return 1; } /* * now we set the dirty range for the buffer -- * for NFS -- if the file is mapped and pages have * been written to, let it know. We want the * entire range of the buffer to be marked dirty if * any of the pages have been written to for consistancy * with the b_validoff, b_validend set in the nfs write * code, and used by the nfs read code. */ static void vfs_setdirty(struct buf *bp) { int i; vm_object_t object; vm_offset_t boffset, offset; /* * We qualify the scan for modified pages on whether the * object has been flushed yet. The OBJ_WRITEABLE flag * is not cleared simply by protecting pages off. */ if ((bp->b_flags & B_VMIO) && ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { /* * test the pages to see if they have been modified directly * by users through the VM system. */ for (i = 0; i < bp->b_npages; i++) vm_page_test_dirty(bp->b_pages[i]); /* * scan forwards for the first page modified */ for (i = 0; i < bp->b_npages; i++) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i << PAGE_SHIFT); if (boffset < bp->b_dirtyoff) { bp->b_dirtyoff = boffset; } /* * scan backwards for the last page modified */ for (i = bp->b_npages - 1; i >= 0; --i) { if (bp->b_pages[i]->dirty) { break; } } boffset = (i + 1); offset = boffset + bp->b_pages[0]->pindex; if (offset >= object->size) boffset = object->size - bp->b_pages[0]->pindex; if (bp->b_dirtyend < (boffset << PAGE_SHIFT)) bp->b_dirtyend = (boffset << PAGE_SHIFT); } } /* * Get a block given a specified block and offset into a file/device. */ struct buf * getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { struct buf *bp; int s; struct bufhashhdr *bh; s = splbio(); loop: if ((bp = gbincore(vp, blkno))) { if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; if (!tsleep(bp, (PRIBIO + 1) | slpflag, "getblk", slptimeo)) goto loop; splx(s); return (struct buf *) NULL; } bp->b_flags |= B_BUSY | B_CACHE; bremfree(bp); /* * check for size inconsistancies (note that they shouldn't happen * but do when filesystems don't handle the size changes correctly.) * We are conservative on metadata and don't just extend the buffer * but write and re-constitute it. */ if (bp->b_bcount != size) { if (bp->b_flags & B_VMIO) { allocbuf(bp, size); } else { bp->b_flags |= B_NOCACHE; VOP_BWRITE(bp); goto loop; } } - /* - * make sure that all pages in the buffer are valid, if they - * aren't, clear the cache flag. - * ASSUMPTION: - * if the buffer is greater than 1 page in size, it is assumed - * that the buffer address starts on a page boundary... 
- */ - if (bp->b_flags & B_VMIO) { - int szleft, i; - szleft = size; - for (i=0;i<bp->b_npages;i++) { - if (szleft > PAGE_SIZE) { - if ((bp->b_pages[i]->valid & VM_PAGE_BITS_ALL) != - VM_PAGE_BITS_ALL) { - bp->b_flags &= ~(B_CACHE|B_DONE); - break; - } - szleft -= PAGE_SIZE; - } else { - if (!vm_page_is_valid(bp->b_pages[i], - (((vm_offset_t) bp->b_data) & PAGE_MASK), - szleft)) { - bp->b_flags &= ~(B_CACHE|B_DONE); - break; - } - szleft = 0; - } - } - } if (bp->b_usecount < BUF_MAXUSE) ++bp->b_usecount; splx(s); return (bp); } else { vm_object_t obj; int doingvmio; if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) { doingvmio = 1; } else { doingvmio = 0; } if ((bp = getnewbuf(slpflag, slptimeo, doingvmio)) == 0) { if (slpflag || slptimeo) { splx(s); return NULL; } goto loop; } /* * This code is used to make sure that a buffer is not * created while the getnewbuf routine is blocked. * Normally the vnode is locked so this isn't a problem. * VBLK type I/O requests, however, don't lock the vnode. */ if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) { bp->b_flags |= B_INVAL; brelse(bp); goto loop; } /* * Insert the buffer into the hash, so that it can * be found by incore. */ bp->b_blkno = bp->b_lblkno = blkno; bgetvp(vp, bp); LIST_REMOVE(bp, b_hash); bh = BUFHASH(vp, blkno); LIST_INSERT_HEAD(bh, bp, b_hash); if (doingvmio) { bp->b_flags |= (B_VMIO | B_CACHE); #if defined(VFS_BIO_DEBUG) if (vp->v_type != VREG) printf("getblk: vmioing file type %d???\n", vp->v_type); #endif } else { bp->b_flags &= ~B_VMIO; } splx(s); allocbuf(bp, size); return (bp); } } /* * Get an empty, disassociated buffer of given size. */ struct buf * geteblk(int size) { struct buf *bp; while ((bp = getnewbuf(0, 0, 0)) == 0); allocbuf(bp, size); bp->b_flags |= B_INVAL; return (bp); } + /* * This code constitutes the buffer memory from either anonymous system * memory (in the case of non-VMIO operations) or from an associated * VM object (in the case of VMIO operations). * * Note that this code is tricky, and has many complications to resolve * deadlock or inconsistant data situations. Tread lightly!!! * * Modify the length of a buffer's underlying buffer storage without * destroying information (unless, of course the buffer is shrinking). 
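 */

/*
 * Editor's note: a minimal userland sketch of allocbuf()'s size rounding,
 * added for illustration only.  The X* constants are hypothetical stand-ins
 * for DEV_BSIZE, PAGE_SIZE and PAGE_SHIFT.
 */
#include <stdio.h>

#define	XDEV_BSIZE	512
#define	XPAGE_SIZE	4096
#define	XPAGE_SHIFT	12

/*
 * Round a request up to a DEV_BSIZE multiple, then to whole pages,
 * mirroring the newbsize/desiredpages arithmetic in allocbuf() below.
 */
static void
xbufsizes(int size, int *newbsize, int *desiredpages)
{
	*newbsize = (size + XDEV_BSIZE - 1) & ~(XDEV_BSIZE - 1);
	*desiredpages =
	    ((*newbsize + XPAGE_SIZE - 1) & ~(XPAGE_SIZE - 1)) >> XPAGE_SHIFT;
}

int
main(void)
{
	int nb, dp;

	xbufsizes(6000, &nb, &dp);
	printf("newbsize=%d desiredpages=%d\n", nb, dp);	/* 6144, 2 */
	return (0);
}

/*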
*/ int allocbuf(struct buf * bp, int size) { int s; int newbsize, mbsize; int i; if (!(bp->b_flags & B_BUSY)) panic("allocbuf: buffer not busy"); if ((bp->b_flags & B_VMIO) == 0) { /* * Just get anonymous memory from the kernel */ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); newbsize = round_page(size); if (newbsize < bp->b_bufsize) { vm_hold_free_pages( bp, (vm_offset_t) bp->b_data + newbsize, (vm_offset_t) bp->b_data + bp->b_bufsize); } else if (newbsize > bp->b_bufsize) { vm_hold_load_pages( bp, (vm_offset_t) bp->b_data + bp->b_bufsize, (vm_offset_t) bp->b_data + newbsize); } } else { vm_page_t m; int desiredpages; newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); desiredpages = (round_page(newbsize) >> PAGE_SHIFT); if (newbsize < bp->b_bufsize) { if (desiredpages < bp->b_npages) { - pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + - (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of vnode_pager_setsize + */ m = bp->b_pages[i]; s = splhigh(); while ((m->flags & PG_BUSY) || (m->busy != 0)) { m->flags |= PG_WANTED; tsleep(m, PVM, "biodep", 0); } splx(s); - if (m->bmapped == 0) { - printf("allocbuf: bmapped is zero for page %d\n", i); - panic("allocbuf: error"); - } - --m->bmapped; - if (m->bmapped == 0) { - vm_page_protect(m, VM_PROT_NONE); - vm_page_free(m); - } bp->b_pages[i] = NULL; + vm_page_unwire(m); } + pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); bp->b_npages = desiredpages; } } else if (newbsize > bp->b_bufsize) { vm_object_t obj; vm_offset_t tinc, toff; vm_ooffset_t off; vm_pindex_t objoff; int pageindex, curbpnpages; struct vnode *vp; int bsize; vp = bp->b_vp; if (vp->v_type == VBLK) bsize = DEV_BSIZE; else bsize = vp->v_mount->mnt_stat.f_iosize; if (bp->b_npages < desiredpages) { obj = vp->v_object; tinc = PAGE_SIZE; if (tinc > bsize) tinc = bsize; off = (vm_ooffset_t) bp->b_lblkno * bsize; doretry: curbpnpages = bp->b_npages; bp->b_flags |= B_CACHE; for (toff = 0; toff < newbsize; toff += tinc) { int bytesinpage; pageindex = toff >> PAGE_SHIFT; objoff = OFF_TO_IDX(off + toff); if (pageindex < curbpnpages) { m = bp->b_pages[pageindex]; +#ifdef VFS_BIO_DIAG if (m->pindex != objoff) panic("allocbuf: page changed offset??!!!?"); +#endif bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; - if (!vm_page_is_valid(m, + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)), bytesinpage)) { bp->b_flags &= ~B_CACHE; } - if ((m->flags & PG_ACTIVE) == 0) { - vm_page_activate(m); - m->act_count = 0; - } continue; } m = vm_page_lookup(obj, objoff); if (!m) { m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); if (!m) { - int j; - - for (j = bp->b_npages; j < pageindex; j++) { - PAGE_WAKEUP(bp->b_pages[j]); - } VM_WAIT; goto doretry; } - vm_page_activate(m); - m->act_count = 0; - m->valid = 0; + /* + * Normally it is unwise to clear PG_BUSY without + * PAGE_WAKEUP -- but it is okay here, as there is + * no chance for blocking between here and vm_page_alloc + */ + m->flags &= ~PG_BUSY; + vm_page_wire(m); bp->b_flags &= ~B_CACHE; } else if (m->flags & PG_BUSY) { - int j; - for (j = bp->b_npages; j < pageindex; j++) { - PAGE_WAKEUP(bp->b_pages[j]); - } - - s = splbio(); + s = splhigh(); m->flags |= PG_WANTED; tsleep(m, PVM, "pgtblk", 0); splx(s); goto doretry; } else { if ((curproc != pageproc) && - (m->flags & 
PG_CACHE) && - (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { + (m->queue == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { pagedaemon_wakeup(); } bytesinpage = tinc; if (tinc > (newbsize - toff)) bytesinpage = newbsize - toff; - if (!vm_page_is_valid(m, + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, (vm_offset_t) ((toff + off) & (PAGE_SIZE - 1)), bytesinpage)) { bp->b_flags &= ~B_CACHE; } - if ((m->flags & PG_ACTIVE) == 0) { - vm_page_activate(m); - m->act_count = 0; - } - m->flags |= PG_BUSY; + vm_page_wire(m); } bp->b_pages[pageindex] = m; curbpnpages = pageindex + 1; } - for (i = bp->b_npages; i < curbpnpages; i++) { - m = bp->b_pages[i]; - m->bmapped++; - PAGE_WAKEUP(m); - } - bp->b_npages = curbpnpages; +/* bp->b_data = buffers_kva + (bp - buf) * MAXBSIZE; - pmap_qenter((vm_offset_t) bp->b_data, bp->b_pages, bp->b_npages); - bp->b_data += off & (PAGE_SIZE - 1); +*/ + bp->b_data = (caddr_t) trunc_page(bp->b_data); + bp->b_npages = curbpnpages; + pmap_qenter((vm_offset_t) bp->b_data, + bp->b_pages, bp->b_npages); + ((vm_offset_t) bp->b_data) |= off & (PAGE_SIZE - 1); } } } bufspace += (newbsize - bp->b_bufsize); bp->b_bufsize = newbsize; bp->b_bcount = size; return 1; } /* * Wait for buffer I/O completion, returning error status. */ int biowait(register struct buf * bp) { int s; s = splbio(); while ((bp->b_flags & B_DONE) == 0) tsleep(bp, PRIBIO, "biowait", 0); splx(s); if (bp->b_flags & B_EINTR) { bp->b_flags &= ~B_EINTR; return (EINTR); } if (bp->b_flags & B_ERROR) { return (bp->b_error ? bp->b_error : EIO); } else { return (0); } } /* * Finish I/O on a buffer, calling an optional function. * This is usually called from interrupt level, so process blocking * is not *a good idea*. 
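 */

/*
 * Editor's note: an illustrative reduction of biowait()'s error mapping,
 * not part of this change.  The "x" names are hypothetical, and the
 * tsleep() loop that waits for B_DONE is elided.
 */
#include <errno.h>

#define	XB_ERROR	0x0001
#define	XB_EINTR	0x0002

struct xbiobuf {
	int b_flags;
	int b_error;
};

/*
 * Once the I/O has completed: an interrupted transfer yields EINTR, an
 * error yields b_error (or EIO if the driver left it unset), else success.
 */
static int
xbiowait_result(struct xbiobuf *bp)
{
	if (bp->b_flags & XB_EINTR) {
		bp->b_flags &= ~XB_EINTR;
		return (EINTR);
	}
	if (bp->b_flags & XB_ERROR)
		return (bp->b_error ? bp->b_error : EIO);
	return (0);
}

/*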
*/ void biodone(register struct buf * bp) { int s; s = splbio(); if (!(bp->b_flags & B_BUSY)) panic("biodone: buffer not busy"); if (bp->b_flags & B_DONE) { splx(s); printf("biodone: buffer already done\n"); return; } bp->b_flags |= B_DONE; if ((bp->b_flags & B_READ) == 0) { vwakeup(bp); } #ifdef BOUNCE_BUFFERS if (bp->b_flags & B_BOUNCE) vm_bounce_free(bp); #endif /* call optional completion function if requested */ if (bp->b_flags & B_CALL) { bp->b_flags &= ~B_CALL; (*bp->b_iodone) (bp); splx(s); return; } if (bp->b_flags & B_VMIO) { int i, resid; vm_ooffset_t foff; vm_page_t m; vm_object_t obj; int iosize; struct vnode *vp = bp->b_vp; if (vp->v_type == VBLK) foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; else foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; obj = vp->v_object; if (!obj) { panic("biodone: no object"); } #if defined(VFS_BIO_DEBUG) if (obj->paging_in_progress < bp->b_npages) { printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", obj->paging_in_progress, bp->b_npages); } #endif iosize = bp->b_bufsize; for (i = 0; i < bp->b_npages; i++) { int bogusflag = 0; m = bp->b_pages[i]; if (m == bogus_page) { bogusflag = 1; m = vm_page_lookup(obj, OFF_TO_IDX(foff)); if (!m) { #if defined(VFS_BIO_DEBUG) printf("biodone: page disappeared\n"); #endif --obj->paging_in_progress; continue; } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex); } #endif resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iosize) resid = iosize; /* * In the write case, the valid and clean bits are * already changed correctly, so we only need to do this * here in the read case. */ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { vm_page_set_validclean(m, (vm_offset_t) (foff & (PAGE_SIZE-1)), resid); } /* * when debugging new filesystems or buffer I/O methods, this * is the most common error that pops up. if you see this, you * have not set the page busy flag correctly!!! */ if (m->busy == 0) { printf("biodone: page busy < 0, " "pindex: %d, foff: 0x(%x,%x), " "resid: %d, index: %d\n", (int) m->pindex, (int)(foff >> 32), (int) foff & 0xffffffff, resid, i); if (vp->v_type != VBLK) printf(" iosize: %d, lblkno: %d, flags: 0x%lx, npages: %d\n", bp->b_vp->v_mount->mnt_stat.f_iosize, (int) bp->b_lblkno, bp->b_flags, bp->b_npages); else printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", (int) bp->b_lblkno, bp->b_flags, bp->b_npages); - printf(" valid: 0x%x, dirty: 0x%x, mapped: %d\n", - m->valid, m->dirty, m->bmapped); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); panic("biodone: page busy < 0\n"); } --m->busy; if ((m->busy == 0) && (m->flags & PG_WANTED)) { m->flags &= ~PG_WANTED; wakeup(m); } --obj->paging_in_progress; foff += resid; iosize -= resid; } if (obj && obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup(obj); } } /* * For asynchronous completions, release the buffer now. The brelse * checks for B_WANTED and will do the wakeup there if necessary - so * no need to do a wakeup here in the async case. 
*/ if (bp->b_flags & B_ASYNC) { - brelse(bp); + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); } else { wakeup(bp); } splx(s); } int count_lock_queue() { int count; struct buf *bp; count = 0; for (bp = bufqueues[QUEUE_LOCKED].tqh_first; bp != NULL; bp = bp->b_freelist.tqe_next) count++; return (count); } int vfs_update_interval = 30; static void vfs_update() { (void) spl0(); /* XXX redundant? wrong place? */ while (1) { tsleep(&vfs_update_wakeup, PUSER, "update", hz * vfs_update_interval); vfs_update_wakeup = 0; sync(curproc, NULL, NULL); } } static int sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS { int error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); if (!error) wakeup(&vfs_update_wakeup); return error; } SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); /* * This routine is called in lieu of iodone in the case of * incomplete I/O. This keeps the busy status for pages * consistant. */ void vfs_unbusy_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { struct vnode *vp = bp->b_vp; vm_object_t obj = vp->v_object; vm_ooffset_t foff; foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; if (m == bogus_page) { m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i); if (!m) { panic("vfs_unbusy_pages: page missing\n"); } bp->b_pages[i] = m; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } --obj->paging_in_progress; --m->busy; if ((m->busy == 0) && (m->flags & PG_WANTED)) { m->flags &= ~PG_WANTED; wakeup(m); } } if (obj->paging_in_progress == 0 && (obj->flags & OBJ_PIPWNT)) { obj->flags &= ~OBJ_PIPWNT; wakeup(obj); } } } /* * This routine is called before a device strategy routine. * It is used to tell the VM system that paging I/O is in * progress, and treat the pages associated with the buffer * almost as being PG_BUSY. Also the object paging_in_progress * flag is handled to make sure that the object doesn't become * inconsistant. */ void vfs_busy_pages(struct buf * bp, int clear_modify) { int i; if (bp->b_flags & B_VMIO) { vm_object_t obj = bp->b_vp->v_object; vm_ooffset_t foff; int iocount = bp->b_bufsize; if (bp->b_vp->v_type == VBLK) foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; else foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; vfs_setdirty(bp); for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iocount) resid = iocount; if ((bp->b_flags & B_CLUSTER) == 0) { obj->paging_in_progress++; m->busy++; } if (clear_modify) { vm_page_protect(m, VM_PROT_READ); vm_page_set_validclean(m, (vm_offset_t) (foff & (PAGE_SIZE-1)), resid); } else if (bp->b_bcount >= PAGE_SIZE) { if (m->valid && (bp->b_flags & B_CACHE) == 0) { bp->b_pages[i] = bogus_page; pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } } foff += resid; iocount -= resid; } } } /* * Tell the VM system that the pages associated with this buffer * are clean. This is used for delayed writes where the data is * going to go to disk eventually without additional VM intevention. 
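 */

/*
 * Editor's note: a small standalone demonstration of the per-page "resid"
 * walk used by vfs_busy_pages() above and vfs_clean_pages() below: each
 * iteration covers from the current file offset to the end of that page,
 * clamped to the bytes remaining.  Illustration only; the values are
 * hypothetical.
 */
#include <stdio.h>

#define	XPAGE_SIZE	4096LL

int
main(void)
{
	long long foff = 6144;		/* buffer starts mid-page */
	long long iocount = 8192;	/* bytes covered by the buffer */

	while (iocount > 0) {
		long long pindex = foff / XPAGE_SIZE;
		long long resid = (pindex + 1) * XPAGE_SIZE - foff;

		if (resid > iocount)
			resid = iocount;
		printf("page %lld: offset %lld, %lld bytes\n",
		    pindex, foff % XPAGE_SIZE, resid);
		foff += resid;
		iocount -= resid;
	}
	return (0);
}

/*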
*/ void vfs_clean_pages(struct buf * bp) { int i; if (bp->b_flags & B_VMIO) { vm_ooffset_t foff; int iocount = bp->b_bufsize; if (bp->b_vp->v_type == VBLK) foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; else foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; for (i = 0; i < bp->b_npages; i++) { vm_page_t m = bp->b_pages[i]; int resid = IDX_TO_OFF(m->pindex + 1) - foff; if (resid > iocount) resid = iocount; if (resid > 0) { vm_page_set_validclean(m, ((vm_offset_t) foff & (PAGE_SIZE-1)), resid); } foff += resid; iocount -= resid; } } } void vfs_bio_clrbuf(struct buf *bp) { int i; + int remapbuffer = 0; if( bp->b_flags & B_VMIO) { if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { int mask; mask = 0; for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE) mask |= (1 << (i/DEV_BSIZE)); if( bp->b_pages[0]->valid != mask) { bzero(bp->b_data, bp->b_bufsize); } bp->b_pages[0]->valid = mask; bp->b_resid = 0; return; } for(i=0;i<bp->b_npages;i++) { if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) continue; if( bp->b_pages[i]->valid == 0) { - if ((bp->b_pages[i]->flags & PG_ZERO) == 0) + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE); + } } else { int j; for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) { if( (bp->b_pages[i]->valid & (1<<j)) == 0) bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE); } } bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; } bp->b_resid = 0; } else { clrbuf(bp); } + if (remapbuffer) + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); } /* * vm_hold_load_pages and vm_hold_unload pages get pages into * a buffers address space. The pages are anonymous and are * not associated with a file object. */ void vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; + int index; to = round_page(to); + from = round_page(from); + index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; - for (pg = round_page(from); pg < to; pg += PAGE_SIZE) { + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { tryagain: p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), VM_ALLOC_NORMAL); if (!p) { VM_WAIT; goto tryagain; } vm_page_wire(p); pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); - bp->b_pages[(pg - trunc_page(bp->b_data)) >> PAGE_SHIFT] = p; + bp->b_pages[index] = p; PAGE_WAKEUP(p); - bp->b_npages++; } + bp->b_npages = to >> PAGE_SHIFT; } void vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) { vm_offset_t pg; vm_page_t p; int index; from = round_page(from); to = round_page(to); index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; for (pg = from; pg < to; pg += PAGE_SIZE, index++) { p = bp->b_pages[index]; - bp->b_pages[index] = 0; - pmap_kremove(pg); - vm_page_free(p); - --bp->b_npages; + if (p && (index < bp->b_npages)) { + if (p->busy) { + printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", + bp->b_blkno, bp->b_lblkno); + } + bp->b_pages[index] = NULL; + pmap_kremove(pg); + vm_page_unwire(p); + vm_page_free(p); + } } + bp->b_npages = from >> PAGE_SHIFT; } Index: head/sys/kern/vfs_cache.c =================================================================== --- head/sys/kern/vfs_cache.c (revision 13489) +++ head/sys/kern/vfs_cache.c (revision 13490) @@ -1,319 +1,325 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1995 * Poul-Henning Kamp. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94 - * $Id: vfs_cache.c,v 1.18 1995/12/14 09:52:47 phk Exp $ + * $Id: vfs_cache.c,v 1.19 1995/12/22 15:56:35 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include +#define MAXVNODEUSE 32 + /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (that we know a name to >not< exist) * we point out entry at our own "nchENOENT", to avoid too much special * casing in the inner loops of lookup. * * For simplicity (and economy of storage), names longer than * a maximum length of NCHNAMLEN are not cached; they occur * infrequently in any case, and are almost never of interest. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. 
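 */

/*
 * Editor's note: a compilable sketch of the two linkages described above --
 * a hash table for lookup and an LRU tail queue for reuse -- plus the
 * sentinel vnode used for negative entries.  Illustration only; every
 * "x"-prefixed name is a hypothetical stand-in.
 */
#include <sys/queue.h>

#define	XNCHNAMLEN	31
#define	XNCHASH		128

struct xcvnode { unsigned long v_id; };

struct xnamecache {
	LIST_ENTRY(xnamecache) nc_hash;		/* hash chain, for lookup */
	TAILQ_ENTRY(xnamecache) nc_lru;		/* LRU chain, for reuse */
	struct xcvnode *nc_dvp;			/* directory searched */
	struct xcvnode *nc_vp;			/* result (or sentinel) */
	int nc_nlen;
	char nc_name[XNCHNAMLEN + 1];
};

LIST_HEAD(xnchashhead, xnamecache);
static struct xnchashhead xnchashtbl[XNCHASH];
static TAILQ_HEAD(, xnamecache) xnclruhead =
    TAILQ_HEAD_INITIALIZER(xnclruhead);

/* Negative entries point here instead of at a real vnode. */
static struct xcvnode xnchENOENT;

/* Index by directory id plus the component-name hash, as above. */
static struct xnchashhead *
xncchain(struct xcvnode *dvp, unsigned long namehash)
{
	return (&xnchashtbl[(dvp->v_id + namehash) % XNCHASH]);
}

/*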
*/ static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ static u_long nchash; /* size of hash table */ struct nchstats nchstats; /* cache effectiveness statistics */ static struct vnode nchENOENT; /* our own "novnode" */ static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); static u_long numcache; u_long numvnodes; #ifdef NCH_STATISTICS u_long nchnbr; #define NCHNBR(ncp) (ncp)->nc_nbr = ++nchnbr; #define NCHHIT(ncp) (ncp)->nc_hits++ #else #define NCHNBR(ncp) #define NCHHIT(ncp) #endif #define PURGE(ncp) { \ LIST_REMOVE(ncp, nc_hash); \ ncp->nc_hash.le_prev = 0; \ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ TAILQ_INSERT_HEAD(&nclruhead, ncp, nc_lru); } #define TOUCH(ncp) { \ if (ncp->nc_lru.tqe_next == 0) { } else { \ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \ NCHNBR(ncp); } } /* * Lookup an entry in the cache * * We don't do this if the segment name is long, simply so the cache * can avoid holding long names (which would either waste space, or * add greatly to the complexity). * * Lookup is called with dvp pointing to the directory to search, * cnp pointing to the name of the entry being sought. * If the lookup succeeds, the vnode is returned in *vpp, and a status * of -1 is returned. * If the lookup determines that the name does not exist (negative cacheing), * a status of ENOENT is returned. * If the lookup fails, a status of zero is returned. */ int cache_lookup(dvp, vpp, cnp) struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { register struct namecache *ncp,*nnp; register struct nchashhead *ncpp; if (!doingcache) { cnp->cn_flags &= ~MAKEENTRY; return (0); } if (cnp->cn_namelen > NCHNAMLEN) { nchstats.ncs_long++; cnp->cn_flags &= ~MAKEENTRY; return (0); } ncpp = &nchashtbl[(dvp->v_id + cnp->cn_hash) % nchash]; for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { nnp = ncp->nc_hash.le_next; /* If one of the vp's went stale, don't bother anymore. */ if ((ncp->nc_dvpid != ncp->nc_dvp->v_id) || (ncp->nc_vpid != ncp->nc_vp->v_id)) { nchstats.ncs_falsehits++; PURGE(ncp); continue; } /* Now that we know the vp's to be valid, is it ours ? */ if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, (u_int)ncp->nc_nlen)) goto found; /* Fanatism considered bad. */ } nchstats.ncs_miss++; return (0); found: NCHHIT(ncp); /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { nchstats.ncs_badhits++; PURGE(ncp); return (0); } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp != &nchENOENT) { nchstats.ncs_goodhits++; TOUCH(ncp); *vpp = ncp->nc_vp; + if ((*vpp)->v_usage < MAXVNODEUSE) + (*vpp)->v_usage++; return (-1); } /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { nchstats.ncs_badhits++; PURGE(ncp); return (0); } /* The name does not exists */ nchstats.ncs_neghits++; TOUCH(ncp); return (ENOENT); } /* * Add an entry to the cache. 
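 */

/*
 * Editor's note: a hypothetical caller showing how cache_lookup()'s three
 * results are consumed: -1 is a positive hit (*vpp is set), ENOENT is a
 * cached "name does not exist", and 0 means the directory must really be
 * searched.  Illustration only; the "x" functions are stand-ins and take
 * a plain name rather than a struct componentname.
 */
#include <errno.h>

struct xlvnode;
int xcache_lookup(struct xlvnode *dvp, const char *name, struct xlvnode **vpp);
int xreal_lookup(struct xlvnode *dvp, const char *name, struct xlvnode **vpp);

static int
xdirlookup(struct xlvnode *dvp, const char *name, struct xlvnode **vpp)
{
	switch (xcache_lookup(dvp, name, vpp)) {
	case -1:
		return (0);		/* positive hit: *vpp already set */
	case ENOENT:
		return (ENOENT);	/* negative entry: fail fast */
	default:
		return (xreal_lookup(dvp, name, vpp));	/* miss: scan */
	}
}

/*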
*/ void cache_enter(dvp, vp, cnp) struct vnode *dvp; struct vnode *vp; struct componentname *cnp; { register struct namecache *ncp; register struct nchashhead *ncpp; if (!doingcache) return; if (cnp->cn_namelen > NCHNAMLEN) { printf("cache_enter: name too long"); return; } if (numcache < numvnodes) { /* Add one more entry */ ncp = (struct namecache *) malloc((u_long)sizeof *ncp, M_CACHE, M_WAITOK); bzero((char *)ncp, sizeof *ncp); numcache++; } else if (ncp = nclruhead.tqh_first) { /* reuse an old entry */ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); if (ncp->nc_hash.le_prev != 0) { LIST_REMOVE(ncp, nc_hash); ncp->nc_hash.le_prev = 0; } } else { /* give up */ return; } /* If vp is NULL this is a "negative" cache entry */ if (!vp) vp = &nchENOENT; /* fill in cache info */ ncp->nc_vp = vp; + if (vp->v_usage < MAXVNODEUSE) + ++vp->v_usage; ncp->nc_vpid = vp->v_id; ncp->nc_dvp = dvp; ncp->nc_dvpid = dvp->v_id; ncp->nc_nlen = cnp->cn_namelen; bcopy(cnp->cn_nameptr, ncp->nc_name, (unsigned)ncp->nc_nlen); TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); ncpp = &nchashtbl[(dvp->v_id + cnp->cn_hash) % nchash]; LIST_INSERT_HEAD(ncpp, ncp, nc_hash); } /* * Name cache initialization, from vfs_init() when we are booting */ void nchinit() { TAILQ_INIT(&nclruhead); nchashtbl = phashinit(desiredvnodes, M_CACHE, &nchash); cache_purge(&nchENOENT); /* Initialize v_id */ } /* * Invalidate all entries to a particular vnode. * * We actually just increment the v_id, that will do it. The stale entries * will be purged by lookup as they get found. * If the v_id wraps around, we need to ditch the entire cache, to avoid * confusion. * No valid vnode will ever have (v_id == 0). */ void cache_purge(vp) struct vnode *vp; { struct nchashhead *ncpp; static u_long nextvnodeid; vp->v_id = ++nextvnodeid; if (nextvnodeid != 0) return; for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { while(ncpp->lh_first) PURGE(ncpp->lh_first); } nchENOENT.v_id = ++nextvnodeid; vp->v_id = ++nextvnodeid; } /* * Flush all entries referencing a particular filesystem. * * Since we need to check it anyway, we will flush all the invalid * entries at the same time. * * If we purge anything, we scan the hash-bucket again. There is only * a handful of entries, so it cheap and simple. */ void cache_purgevfs(mp) struct mount *mp; { struct nchashhead *ncpp; struct namecache *ncp; /* Scan hash tables for applicable entries */ for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { ncp = ncpp->lh_first; while(ncp) { if (ncp->nc_dvpid != ncp->nc_dvp->v_id || ncp->nc_vpid != ncp->nc_vp->v_id || ncp->nc_dvp->v_mount == mp) { PURGE(ncp); ncp = ncpp->lh_first; } else { ncp = ncp->nc_hash.le_next; } } } } Index: head/sys/kern/vfs_cluster.c =================================================================== --- head/sys/kern/vfs_cluster.c (revision 13489) +++ head/sys/kern/vfs_cluster.c (revision 13490) @@ -1,715 +1,710 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * Modifications/enhancements: * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 - * $Id: vfs_cluster.c,v 1.30 1995/12/11 04:56:07 dyson Exp $ + * $Id: vfs_cluster.c,v 1.31 1995/12/22 16:06:46 bde Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef notyet_block_reallocation_enabled #ifdef DEBUG #include #include static int doreallocblks = 0; SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); #else #define doreallocblks 0 #endif #endif /* notyet_block_reallocation_enabled */ #ifdef notyet_block_reallocation_enabled static struct cluster_save * cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); #endif static struct buf * cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, daddr_t blkno, long size, int run)); static int totreads; static int totreadblocks; extern vm_page_t bogus_page; #ifdef DIAGNOSTIC /* * Set to 1 if reads of block zero should cause readahead to be done. * Set to 0 treats a read of block zero as a non-sequential read. * * Setting to one assumes that most reads of block zero of files are due to * sequential passes over the files (e.g. cat, sum) where additional blocks * will soon be needed. Setting to zero assumes that the majority are * surgical strikes to get particular info (e.g. size, file) where readahead * blocks will not be used and, in fact, push out other potentially useful * blocks from the cache. The former seems intuitive, but some quick tests * showed that the latter performed better from a system-wide point of view. */ int doclusterraz = 0; #define ISSEQREAD(vp, blk) \ (((blk) != 0 || doclusterraz) && \ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) #else #define ISSEQREAD(vp, blk) \ (/* (blk) != 0 && */ ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) #endif /* * allow for three entire read-aheads... The system will * adjust downwards rapidly if needed... */ #define RA_MULTIPLE_FAST 2 #define RA_MULTIPLE_SLOW 3 #define RA_SHIFTDOWN 1 /* approx lg2(RA_MULTIPLE) */ /* * This replaces bread. 
If this is a bread at the beginning of a file and * lastr is 0, we assume this is the first read and we'll read up to two * blocks if they are sequential. After that, we'll do regular read ahead * in clustered chunks. * bp is the block requested. * rbp is the read-ahead block. * If either is NULL, then you don't have to do the I/O. */ int cluster_read(vp, filesize, lblkno, size, cred, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; struct buf **bpp; { struct buf *bp, *rbp; daddr_t blkno, rablkno, origlblkno; int error, num_ra, alreadyincore; int i; int seq; error = 0; /* * get the requested block */ origlblkno = lblkno; *bpp = bp = getblk(vp, lblkno, size, 0, 0); seq = ISSEQREAD(vp, lblkno); /* * if it is in the cache, then check to see if the reads have been * sequential. If they have, then try some read-ahead, otherwise * back-off on prospective read-aheads. */ if (bp->b_flags & B_CACHE) { if (!seq) { vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; vp->v_ralen >>= RA_SHIFTDOWN; return 0; } else if( vp->v_maxra > lblkno) { - if ( (vp->v_maxra + (vp->v_ralen / RA_MULTIPLE_SLOW)) >= - (lblkno + vp->v_ralen)) { + if ( vp->v_maxra > lblkno + (vp->v_ralen / RA_MULTIPLE_SLOW) ) { if ((vp->v_ralen + 1) < RA_MULTIPLE_FAST*(MAXPHYS / size)) ++vp->v_ralen; return 0; } lblkno = vp->v_maxra; } else { lblkno += 1; } bp = NULL; } else { /* * if it isn't in the cache, then get a chunk from disk if * sequential, otherwise just get the block. */ bp->b_flags |= B_READ; lblkno += 1; curproc->p_stats->p_ru.ru_inblock++; /* XXX */ vp->v_ralen = 0; } /* * assume no read-ahead */ alreadyincore = 1; rablkno = lblkno; /* * if we have been doing sequential I/O, then do some read-ahead */ if (seq) { /* * bump ralen a bit... */ if ((vp->v_ralen + 1) < RA_MULTIPLE_SLOW*(MAXPHYS / size)) ++vp->v_ralen; /* * this code makes sure that the stuff that we have read-ahead * is still in the cache. If it isn't, we have been reading * ahead too much, and we need to back-off, otherwise we might * try to read more. */ - for (i = 0; i < vp->v_ralen; i++) { + for (i = 0; i < vp->v_maxra - lblkno; i++) { rablkno = lblkno + i; - alreadyincore = (int) gbincore(vp, rablkno); + alreadyincore = (int) incore(vp, rablkno); if (!alreadyincore) { - if (rablkno < vp->v_maxra) { - vp->v_maxra = rablkno; - vp->v_ralen >>= RA_SHIFTDOWN; - alreadyincore = 1; - } - break; - } else if (vp->v_maxra < rablkno) { - vp->v_maxra = rablkno + 1; + vp->v_maxra = rablkno; + vp->v_ralen >>= RA_SHIFTDOWN; + alreadyincore = 1; } } } /* * we now build the read-ahead buffer if it is desirable. 
*/ rbp = NULL; if (!alreadyincore && ((u_quad_t)(rablkno + 1) * size) <= filesize && !(error = VOP_BMAP(vp, rablkno, NULL, &blkno, &num_ra, NULL)) && blkno != -1) { if (num_ra > vp->v_ralen) num_ra = vp->v_ralen; if (num_ra) { rbp = cluster_rbuild(vp, filesize, rablkno, blkno, size, num_ra + 1); } else { rbp = getblk(vp, rablkno, size, 0, 0); rbp->b_flags |= B_READ | B_ASYNC; rbp->b_blkno = blkno; } } /* * handle the synchronous read */ if (bp) { if (bp->b_flags & (B_DONE | B_DELWRI)) panic("cluster_read: DONE bp"); else { vfs_busy_pages(bp, 0); error = VOP_STRATEGY(bp); vp->v_maxra = bp->b_lblkno + bp->b_bcount / size; totreads++; totreadblocks += bp->b_bcount / size; curproc->p_stats->p_ru.ru_inblock++; } } /* * and if we have read-aheads, do them too */ if (rbp) { vp->v_maxra = rbp->b_lblkno + rbp->b_bcount / size; - if (error || (rbp->b_flags & B_CACHE)) { + if (error) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + bqrelse(rbp); } else { if ((rbp->b_flags & B_CLUSTER) == 0) vfs_busy_pages(rbp, 0); (void) VOP_STRATEGY(rbp); totreads++; totreadblocks += rbp->b_bcount / size; curproc->p_stats->p_ru.ru_inblock++; } } if (bp && ((bp->b_flags & B_ASYNC) == 0)) return (biowait(bp)); return (error); } /* * If blocks are contiguous on disk, use this to provide clustered * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. */ static struct buf * cluster_rbuild(vp, filesize, lbn, blkno, size, run) struct vnode *vp; u_quad_t filesize; daddr_t lbn; daddr_t blkno; long size; int run; { struct buf *bp, *tbp; daddr_t bn; int i, inc, j; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", size, vp->v_mount->mnt_stat.f_iosize); #endif /* * avoid a division */ while ((u_quad_t) size * (lbn + run) > filesize) { --run; } tbp = getblk(vp, lbn, size, 0, 0); if (tbp->b_flags & B_CACHE) return tbp; tbp->b_blkno = blkno; tbp->b_flags |= B_ASYNC | B_READ; if( ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) return tbp; bp = trypbuf(); if (bp == 0) return tbp; (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; bp->b_iodone = cluster_callback; bp->b_blkno = blkno; bp->b_lblkno = lbn; pbgetvp(vp, bp); TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; inc = btodb(size); for (bn = blkno, i = 0; i < run; ++i, bn += inc) { if (i != 0) { if ((bp->b_npages * PAGE_SIZE) + round_page(size) > MAXPHYS) break; - if (gbincore(vp, lbn + i)) + if (incore(vp, lbn + i)) break; tbp = getblk(vp, lbn + i, size, 0, 0); if ((tbp->b_flags & B_CACHE) || (tbp->b_flags & B_VMIO) == 0) { - brelse(tbp); + bqrelse(tbp); break; } for (j=0;j<tbp->b_npages;j++) { if (tbp->b_pages[j]->valid) { break; } } if (j != tbp->b_npages) { /* * force buffer to be re-constituted later */ tbp->b_flags |= B_RELBUF; brelse(tbp); break; } tbp->b_flags |= B_READ | B_ASYNC; if (tbp->b_blkno == tbp->b_lblkno) { tbp->b_blkno = bn; } else if (tbp->b_blkno != bn) { brelse(tbp); break; } } TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); for (j = 0; j < tbp->b_npages; j += 1) { vm_page_t m; m = tbp->b_pages[j]; ++m->busy; ++m->object->paging_in_progress; if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) { m = bogus_page; } if ((bp->b_npages == 0) || (bp->b_pages[bp->b_npages-1] != m)) 
{ bp->b_pages[bp->b_npages] = m; bp->b_npages++; } } bp->b_bcount += tbp->b_bcount; bp->b_bufsize += tbp->b_bufsize; } pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *)bp->b_pages, bp->b_npages); return (bp); } /* * Cleanup after a clustered read or write. * This is complicated by the fact that any of the buffers might have * extra memory (if there were no empty buffer headers at allocbuf time) * that we will need to shift around. */ void cluster_callback(bp) struct buf *bp; { struct buf *nbp, *tbp; int error = 0; /* * Must propogate errors to all the components. */ if (bp->b_flags & B_ERROR) error = bp->b_error; pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. */ for (tbp = bp->b_cluster.cluster_head.tqh_first; tbp; tbp = nbp) { nbp = tbp->b_cluster.cluster_entry.tqe_next; if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; } biodone(tbp); } relpbuf(bp); } /* * Do clustered write for FFS. * * Three cases: * 1. Write is not sequential (write asynchronously) * Write is sequential: * 2. beginning of cluster - begin cluster * 3. middle of a cluster - add to cluster * 4. end of a cluster - asynchronously write cluster */ void cluster_write(bp, filesize) struct buf *bp; u_quad_t filesize; { struct vnode *vp; daddr_t lbn; int maxclen, cursize; int lblocksize; int async; vp = bp->b_vp; async = (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)); lblocksize = vp->v_mount->mnt_stat.f_iosize; lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { maxclen = MAXPHYS / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. * * If we are not writing at end of file, the process * seeked to another point in the file since its last * write, or we have reached our maximum cluster size, * then push the previous cluster. Otherwise try * reallocating to make it sequential. */ cursize = vp->v_lastw - vp->v_cstart + 1; #ifndef notyet_block_reallocation_enabled if (((u_quad_t)(lbn + 1) * lblocksize) != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async) cluster_wbuild(vp, lblocksize, vp->v_cstart, cursize); } #else if (!doreallocblks || (lbn + 1) * lblocksize != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { if (!async) cluster_wbuild(vp, lblocksize, vp->v_cstart, cursize); } else { struct buf **bpp, **endbp; struct cluster_save *buflist; buflist = cluster_collectbufs(vp, bp); endbp = &buflist->bs_children [buflist->bs_nchildren - 1]; if (VOP_REALLOCBLKS(vp, buflist)) { /* * Failed, push the previous cluster. */ for (bpp = buflist->bs_children; bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); cluster_wbuild(vp, lblocksize, vp->v_cstart, cursize); } else { /* * Succeeded, keep building cluster. */ for (bpp = buflist->bs_children; bpp <= endbp; bpp++) bdwrite(*bpp); free(buflist, M_SEGMENT); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; return; } } #endif /* notyet_block_reallocation_enabled */ } /* * Consider beginning a cluster. If at end of file, make * cluster as large as possible, otherwise find size of * existing cluster. 
*/ if (((u_quad_t) (lbn + 1) * lblocksize) != filesize && (bp->b_blkno == bp->b_lblkno) && (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; vp->v_lasta = bp->b_blkno; vp->v_cstart = lbn + 1; vp->v_lastw = lbn; return; } vp->v_clen = maxclen; if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; - if (!async) - bawrite(bp); - else - bdwrite(bp); + bawrite(bp); } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out. */ bdwrite(bp); - cluster_wbuild(vp, lblocksize, vp->v_cstart, - vp->v_clen + 1); + cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else /* * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); vp->v_lastw = lbn; vp->v_lasta = bp->b_blkno; } /* * This is an awful lot like cluster_rbuild...wish they could be combined. * The last lbn argument is the current block on which I/O is being * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). */ int cluster_wbuild(vp, size, start_lbn, len) struct vnode *vp; long size; daddr_t start_lbn; int len; { struct buf *bp, *tbp; int i, j, s; int totalwritten = 0; int dbsize = btodb(size); while (len > 0) { s = splbio(); if ( ((tbp = gbincore(vp, start_lbn)) == NULL) || ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { ++start_lbn; --len; splx(s); continue; } bremfree(tbp); tbp->b_flags |= B_BUSY; tbp->b_flags &= ~B_DONE; splx(s); /* * Extra memory in the buffer, punt on this buffer. XXX we could * handle this in most cases, but we would have to push the extra * memory down to after our max possible cluster size and then * potentially pull it back up if the cluster was terminated * prematurely--too much hassle. 
*/ if (((tbp->b_flags & B_CLUSTEROK) != B_CLUSTEROK) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != size) || len == 1) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } bp = trypbuf(); if (bp == NULL) { totalwritten += tbp->b_bufsize; bawrite(tbp); ++start_lbn; --len; continue; } TAILQ_INIT(&bp->b_cluster.cluster_head); bp->b_bcount = 0; bp->b_bufsize = 0; bp->b_npages = 0; bp->b_blkno = tbp->b_blkno; bp->b_lblkno = tbp->b_lblkno; (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & B_VMIO); bp->b_iodone = cluster_callback; pbgetvp(vp, bp); for (i = 0; i < len; ++i, ++start_lbn) { if (i != 0) { s = splbio(); if ((tbp = gbincore(vp, start_lbn)) == NULL) { splx(s); break; } if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & B_VMIO))) { splx(s); break; } if ((tbp->b_bcount != size) || ((bp->b_blkno + dbsize * i) != tbp->b_blkno) || ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) { splx(s); break; } bremfree(tbp); tbp->b_flags |= B_BUSY; tbp->b_flags &= ~B_DONE; splx(s); } - for (j = 0; j < tbp->b_npages; j += 1) { - vm_page_t m; - m = tbp->b_pages[j]; - ++m->busy; - ++m->object->paging_in_progress; - if ((bp->b_npages == 0) || - (bp->b_pages[bp->b_npages - 1] != m)) { - bp->b_pages[bp->b_npages] = m; - bp->b_npages++; + if (tbp->b_flags & B_VMIO) { + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } } } bp->b_bcount += size; bp->b_bufsize += size; tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); tbp->b_flags |= B_ASYNC; s = splbio(); reassignbuf(tbp, tbp->b_vp); /* put on clean list */ ++tbp->b_vp->v_numoutput; splx(s); TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, tbp, b_cluster.cluster_entry); } pmap_qenter(trunc_page((vm_offset_t) bp->b_data), (vm_page_t *) bp->b_pages, bp->b_npages); totalwritten += bp->b_bufsize; bawrite(bp); len -= i; } return totalwritten; } #ifdef notyet_block_reallocation_enabled /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. */ static struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &buflist->bs_children[i]); buflist->bs_children[i] = last_bp; buflist->bs_nchildren = i + 1; return (buflist); } #endif /* notyet_block_reallocation_enabled */ Index: head/sys/kern/vfs_export.c =================================================================== --- head/sys/kern/vfs_export.c (revision 13489) +++ head/sys/kern/vfs_export.c (revision 13490) @@ -1,1538 +1,1547 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. 
and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 - * $Id: vfs_subr.c,v 1.50 1996/01/02 18:13:20 davidg Exp $ + * $Id: vfs_subr.c,v 1.51 1996/01/04 21:12:26 wollman Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void printlockedvnodes __P((void)); #endif extern void vclean __P((struct vnode *vp, int flags)); extern void vfs_unmountroot __P((struct mount *rootfs)); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * Insq/Remq for the vnode usage lists. */ #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) #define bufremvn(bp) { \ LIST_REMOVE(bp, b_vnbufs); \ (bp)->b_vnbufs.le_next = NOLIST; \ } TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ u_long freevnodes = 0; struct mntlist mountlist; /* mounted filesystem list */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RD, &desiredvnodes, 0, ""); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + vm_object_cache_max; TAILQ_INIT(&vnode_free_list); CIRCLEQ_INIT(&mountlist); } /* * Lock a filesystem. * Used to prevent access to it while mounting and unmounting. 
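 */

/*
 * Editor's note: a minimal model of the flag-based sleep lock implemented
 * by vfs_lock()/vfs_unlock() below: one "locked" bit, plus a "waiter" bit
 * so the unlocker knows a wakeup is needed.  Illustration only;
 * xsleep()/xwakeup() are hypothetical stand-ins for tsleep()/wakeup().
 */
#define	XMNT_MLOCK	0x01
#define	XMNT_MWAIT	0x02

struct xlmount { int mnt_flag; };

void xsleep(void *chan);		/* blocks until xwakeup(chan) */
void xwakeup(void *chan);

static void
xvfs_lock(struct xlmount *mp)
{
	while (mp->mnt_flag & XMNT_MLOCK) {
		mp->mnt_flag |= XMNT_MWAIT;	/* record contention */
		xsleep(mp);
	}
	mp->mnt_flag |= XMNT_MLOCK;
}

static void
xvfs_unlock(struct xlmount *mp)
{
	mp->mnt_flag &= ~XMNT_MLOCK;
	if (mp->mnt_flag & XMNT_MWAIT) {
		mp->mnt_flag &= ~XMNT_MWAIT;
		xwakeup(mp);
	}
}

/*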
*/ int vfs_lock(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MLOCK) { mp->mnt_flag |= MNT_MWAIT; (void) tsleep((caddr_t) mp, PVFS, "vfslck", 0); } mp->mnt_flag |= MNT_MLOCK; return (0); } /* * Unlock a locked filesystem. * Panic if filesystem is not locked. */ void vfs_unlock(mp) register struct mount *mp; { if ((mp->mnt_flag & MNT_MLOCK) == 0) panic("vfs_unlock: not locked"); mp->mnt_flag &= ~MNT_MLOCK; if (mp->mnt_flag & MNT_MWAIT) { mp->mnt_flag &= ~MNT_MWAIT; wakeup((caddr_t) mp); } } /* * Mark a mount point as busy. * Used to synchronize access and to delay unmounting. */ int vfs_busy(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MPBUSY) { mp->mnt_flag |= MNT_MPWANT; (void) tsleep((caddr_t) &mp->mnt_flag, PVFS, "vfsbsy", 0); } if (mp->mnt_flag & MNT_UNMOUNT) return (1); mp->mnt_flag |= MNT_MPBUSY; return (0); } /* * Free a busy filesystem. * Panic if filesystem is not busy. */ void vfs_unbusy(mp) register struct mount *mp; { if ((mp->mnt_flag & MNT_MPBUSY) == 0) panic("vfs_unbusy: not busy"); mp->mnt_flag &= ~MNT_MPBUSY; if (mp->mnt_flag & MNT_MPWANT) { mp->mnt_flag &= ~MNT_MPWANT; wakeup((caddr_t) &mp->mnt_flag); } } void vfs_unmountroot(struct mount *rootfs) { struct mount *mp = rootfs; int error; if (vfs_busy(mp)) { printf("failed to unmount root\n"); return; } mp->mnt_flag |= MNT_UNMOUNT; if ((error = vfs_lock(mp))) { printf("lock of root filesystem failed (%d)\n", error); return; } vnode_pager_umount(mp); /* release cached vnodes */ cache_purgevfs(mp); /* remove cache entries for this file sys */ if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc))) printf("sync of root filesystem failed (%d)\n", error); if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) { printf("unmount of root filesystem failed ("); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } mp->mnt_flag &= ~MNT_UNMOUNT; vfs_unbusy(mp); } /* * Unmount all filesystems. Should only be called by halt(). */ void vfs_unmountall() { struct mount *mp, *nmp, *rootfs = NULL; int error; /* unmount all but rootfs */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; if (mp->mnt_flag & MNT_ROOTFS) { rootfs = mp; continue; } error = dounmount(mp, MNT_FORCE, initproc); if (error) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* and finally... */ if (rootfs) { vfs_unmountroot(rootfs); } else { printf("no root filesystem\n"); } } /* * Lookup a mount point by filesystem identifier. 
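vfs_lock()/vfs_unlock() and vfs_busy()/vfs_unbusy() above share one idiom: a "locked" bit the taker spins on, a "wanted" bit set before sleeping, and a wakeup issued only when someone declared interest. A rough userland analogue, substituting a pthread mutex and condition variable for tsleep()/wakeup() (the struct and function names here are invented for the sketch, not kernel API):

#include <pthread.h>
#include <stdio.h>

/* Models mnt_flag's MNT_MLOCK/MNT_MWAIT bits. */
struct flag_lock {
	int locked;
	int wanted;
	pthread_mutex_t mtx;
	pthread_cond_t  cv;	/* stands in for the tsleep()/wakeup() channel */
};

static void
flag_lock(struct flag_lock *f)
{
	pthread_mutex_lock(&f->mtx);
	while (f->locked) {
		f->wanted = 1;			/* MNT_MWAIT: ask to be woken */
		pthread_cond_wait(&f->cv, &f->mtx);
	}
	f->locked = 1;				/* MNT_MLOCK */
	pthread_mutex_unlock(&f->mtx);
}

static void
flag_unlock(struct flag_lock *f)
{
	pthread_mutex_lock(&f->mtx);
	f->locked = 0;
	if (f->wanted) {			/* wake only if someone asked */
		f->wanted = 0;
		pthread_cond_broadcast(&f->cv);
	}
	pthread_mutex_unlock(&f->mtx);
}

int main(void)
{
	struct flag_lock f = { 0, 0, PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER };

	flag_lock(&f);
	flag_unlock(&f);
	printf("lock/unlock handshake ok\n");
	return 0;
}

The same shape reappears later in this file as the VXLOCK/VXWANT interlock in vclean() and vgone().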
 */
struct mount *
getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
			return (mp);
	}
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
getnewfsid(mp, mtype)
	struct mount *mp;
	int mtype;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;

	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
	    vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
	    vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	register struct vnode *vp;

+retry:
	vp = vnode_free_list.tqh_first;
	/*
	 * we allocate a new vnode if
	 * 1. we don't have any free
	 *	Pretty obvious, we actually used to panic, but that
	 *	is a silly thing to do.
	 * 2. we haven't filled our pool yet
	 *	We don't want to trash the incore (VM-)vnodecache.
	 * 3. if less than 1/4th of our vnodes are free.
	 *	We don't want to trash the namei cache either.
	 */
	if (freevnodes < (numvnodes >> 2) ||
	    numvnodes < desiredvnodes ||
	    vp == NULL) {
		vp = (struct vnode *) malloc((u_long) sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *) vp, sizeof *vp);
		numvnodes++;
	} else {
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+		if (vp->v_usage > 0) {
+			--vp->v_usage;
+			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+			goto retry;
+		}
		freevnodes--;
-		if (vp->v_usecount)
-			panic("free vnode isn't");
-
		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgone(vp);
+		if (vp->v_usecount)
+			panic("free vnode isn't");
+
#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_ralen = 0;
		vp->v_maxra = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
+		vp->v_usage = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
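The retry:/v_usage change to getnewvnode() above is a second-chance policy: the vnode at the head of the free list is recycled only once its usage credit is exhausted; otherwise a credit is spent and the vnode is rotated to the tail. Because every failed probe decrements a credit, the retry loop must terminate. A self-contained sketch using the same TAILQ macros, with a toy struct node standing in for the vnode:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int usage;			/* models v_usage */
	TAILQ_ENTRY(node) link;
};

TAILQ_HEAD(nlist, node);

/*
 * Second-chance recycling: the head keeps its place in line only by
 * spending usage credit; a node is recycled once its credit hits 0.
 */
static struct node *
recycle(struct nlist *freelist)
{
	struct node *n;

	for (;;) {
		n = TAILQ_FIRST(freelist);
		if (n == NULL)
			return NULL;	/* caller would malloc a fresh one */
		TAILQ_REMOVE(freelist, n, link);
		if (n->usage > 0) {
			--n->usage;	/* spend a credit ... */
			TAILQ_INSERT_TAIL(freelist, n, link);	/* ... go to the back */
			continue;	/* and retry from the new head */
		}
		return n;		/* no credit left: reuse this one */
	}
}

int main(void)
{
	struct nlist fl = TAILQ_HEAD_INITIALIZER(fl);
	struct node a = { 2 }, b = { 0 };

	TAILQ_INSERT_TAIL(&fl, &a, link);
	TAILQ_INSERT_TAIL(&fl, &b, link);
	/* a still has credit, so b (never re-referenced) is recycled first. */
	printf("recycled node with usage %d\n", recycle(&fl)->usage);
	return 0;
}

The hunk also moves the "free vnode isn't" panic to after vgone(), since the aging pass can now legitimately touch vnodes that vgone() has yet to settle.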
*/ if ((vp->v_mount = mp) == NULL) return; LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p))) return (error); if (vp->v_dirtyblkhd.lh_first != NULL) panic("vinvalbuf: dirty bufs"); } for (;;) { if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; s = splbio(); if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; error = tsleep((caddr_t) bp, slpflag | (PRIBIO + 1), "vinvalbuf", slptimeo); splx(s); if (error) return (error); break; } bremfree(bp); bp->b_flags |= B_BUSY; splx(s); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { (void) VOP_BWRITE(bp); break; } bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF); brelse(bp); } } s = splbio(); while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ object = vp->v_object; if (object != NULL) { vm_object_page_remove(object, 0, object->size, (flags & V_SAVE) ? TRUE : FALSE); } if (!(flags & V_SAVEMETA) && (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) panic("vinvalbuf: flush failed"); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; if (bp->b_vp) panic("bgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. */ s = splbio(); bufinsvn(bp, &vp->v_cleanblkhd); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; int s; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); /* * Delete from old vnode list, if on one. */ s = splbio(); if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); splx(s); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Associate a p-buffer with a vnode. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { if (bp->b_vp) panic("pbgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. 
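vinvalbuf() above can block on a B_BUSY buffer, and after sleeping its saved iterator may be stale, so it breaks out of the inner loop and rescans from the list head. The same restart-on-sleep shape, reduced to a toy singly linked list (the busy flag here just simulates a single sleep):

#include <stdio.h>
#include <stdlib.h>

struct buf {
	int busy;			/* would make us sleep in the kernel */
	int valid;
	struct buf *next;
};

/*
 * Invalidate every buffer on the list.  After "sleeping" on a busy
 * buffer the list may have changed underneath us, so restart from the
 * head rather than trusting a saved next pointer -- the same shape as
 * vinvalbuf()'s outer for (;;) over blist.
 */
static void
invalidate_all(struct buf *head)
{
	struct buf *bp;

restart:
	for (bp = head; bp != NULL; bp = bp->next) {
		if (bp->busy) {
			bp->busy = 0;	/* pretend we slept and it was released */
			goto restart;	/* iterator is stale: rescan */
		}
		bp->valid = 0;
	}
}

int main(void)
{
	struct buf c = { 0, 1, NULL }, b = { 1, 1, &c }, a = { 0, 1, &b };

	invalidate_all(&a);
	printf("valid flags: %d %d %d\n", a.valid, b.valid, c.valid);	/* 0 0 0 */
	return 0;
}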
*/ void pbrelvp(bp) register struct buf *bp; { struct vnode *vp; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { register struct buflists *listheadp; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; tbp = newvp->v_dirtyblkhd.lh_first; if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { bufinsvn(bp, &newvp->v_dirtyblkhd); } else { - while (tbp->b_vnbufs.le_next && (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { tbp = tbp->b_vnbufs.le_next; } LIST_INSERT_AFTER(tbp, bp, b_vnbufs); } } else { listheadp = &newvp->v_cleanblkhd; bufinsvn(bp, listheadp); } } /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. * Also used for memory file system special devices. */ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) return (0); error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp); if (error) { *vpp = 0; return (error); } vp = nvp; vp->v_type = VBLK; if ((nvp = checkalias(vp, dev, (struct mount *) 0))) { vput(vp); vp = nvp; } *vpp = vp; return (0); } /* * Check to see if the new vnode represents a special device * for which we already have a vnode (either because of * bdevvp() or because of a different vnode representing * the same block device). If such an alias exists, deallocate * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ struct vnode * checkalias(nvp, nvp_rdev, mp) register struct vnode *nvp; dev_t nvp_rdev; struct mount *mp; { register struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); vpp = &speclisth[SPECHASH(nvp_rdev)]; loop: for (vp = *vpp; vp; vp = vp->v_specnext) { if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vp->v_usecount == 0) { vgone(vp); goto loop; } if (vget(vp, 1)) goto loop; break; } if (vp == NULL || vp->v_tag != VT_NON) { MALLOC(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo), M_VNODE, M_WAITOK); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; nvp->v_specflags = 0; *vpp = nvp; if (vp != NULL) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; vput(vp); } return (NULLVP); } VOP_UNLOCK(vp); vclean(vp, 0); vp->v_op = nvp->v_op; vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); return (vp); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). 
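The reassignbuf() hunk in this commit is only a line wrap, but the code it touches is worth a gloss: dirty buffers are kept sorted by b_lblkno so they can be pushed out in file order. The insertion logic, reduced to a plain singly linked list:

#include <stdio.h>
#include <stdlib.h>

struct buf {
	long lblkno;
	struct buf *next;
};

/* Insert bp so the list stays sorted by ascending lblkno. */
static void
sorted_insert(struct buf **headp, struct buf *bp)
{
	struct buf *tbp = *headp;

	if (tbp == NULL || tbp->lblkno > bp->lblkno) {
		bp->next = tbp;		/* empty list, or new smallest: new head */
		*headp = bp;
		return;
	}
	/* walk until the next entry would be past us, then splice in */
	while (tbp->next != NULL && tbp->next->lblkno < bp->lblkno)
		tbp = tbp->next;
	bp->next = tbp->next;
	tbp->next = bp;
}

int main(void)
{
	struct buf a = { 10, NULL }, b = { 30, NULL }, c = { 20, NULL };
	struct buf *head = NULL, *p;

	sorted_insert(&head, &a);
	sorted_insert(&head, &b);
	sorted_insert(&head, &c);
	for (p = head; p != NULL; p = p->next)
		printf("%ld ", p->lblkno);	/* 10 20 30 */
	printf("\n");
	return 0;
}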
*/ int vget(vp, lockflag) register struct vnode *vp; int lockflag; { /* * If the vnode is in the process of being cleaned out for another * use, we wait for the cleaning to finish and then return failure. * Cleaning is determined either by checking that the VXLOCK flag is * set, or that the use count is zero with the back pointer set to * show that it has been removed from the free list by getnewvnode. * The VXLOCK flag may not have been set yet because vclean is blocked * in the VOP_LOCK call waiting for the VOP_INACTIVE to complete. */ if ((vp->v_flag & VXLOCK) || (vp->v_usecount == 0 && vp->v_freelist.tqe_prev == (struct vnode **) 0xdeadb)) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vget", 0); return (1); } if (vp->v_usecount == 0) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; } vp->v_usecount++; if (lockflag) VOP_LOCK(vp); return (0); } /* * Vnode reference, just increment the count */ void vref(vp) struct vnode *vp; { if (vp->v_usecount <= 0) panic("vref used where vget required"); vp->v_usecount++; } /* * vput(), just unlock and vrele() */ void vput(vp) register struct vnode *vp; { VOP_UNLOCK(vp); vrele(vp); } /* * Vnode release. * If count drops to zero, call inactive routine and return to freelist. */ void vrele(vp) register struct vnode *vp; { #ifdef DIAGNOSTIC if (vp == NULL) panic("vrele: null vp"); #endif vp->v_usecount--; if (vp->v_usecount > 0) return; if (vp->v_usecount < 0 /* || vp->v_writecount < 0 */ ) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif panic("vrele: negative reference cnt"); } if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); vp->v_flag &= ~VAGE; + vp->v_usage = 0; } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; VOP_INACTIVE(vp); } #ifdef DIAGNOSTIC /* * Page or buffer structure gets a reference. */ void vhold(vp) register struct vnode *vp; { vp->v_holdcnt++; } /* * Page or buffer structure frees a reference. */ void holdrele(vp) register struct vnode *vp; { if (vp->v_holdcnt <= 0) panic("holdrele: holdcnt"); vp->v_holdcnt--; } #endif /* DIAGNOSTIC */ /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { register struct vnode *vp, *nvp; int busy = 0; if ((mp->mnt_flag & MNT_MPBUSY) == 0) panic("vflush: not busy"); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; /* * Skip over a selected vnode. */ if (vp == skipvp) continue; /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) continue; /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) continue; /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { vgone(vp); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. 
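The one-line change in vrele() above pairs with the getnewvnode() aging: a vnode released with VAGE set goes to the head of the free list and now also loses its usage credit, making it the preferred recycling victim; everything else goes to the tail and is recycled last. A sketch of that placement policy (struct vn and release() are illustrative names):

#include <sys/queue.h>
#include <stdio.h>

struct vn {
	int aged;			/* models the VAGE flag */
	int usage;			/* models v_usage */
	const char *name;
	TAILQ_ENTRY(vn) link;
};

TAILQ_HEAD(vlist, vn);

/*
 * Release to the free list: aged vnodes go to the head (recycled
 * first) with their usage credit zeroed, so the second-chance logic
 * cannot save them; everything else goes to the tail.
 */
static void
release(struct vlist *fl, struct vn *vp)
{
	if (vp->aged) {
		TAILQ_INSERT_HEAD(fl, vp, link);
		vp->aged = 0;
		vp->usage = 0;		/* no second chance for aged vnodes */
	} else {
		TAILQ_INSERT_TAIL(fl, vp, link);
	}
}

int main(void)
{
	struct vlist fl = TAILQ_HEAD_INITIALIZER(fl);
	struct vn a = { 0, 1, "a" }, b = { 1, 3, "b" };

	release(&fl, &a);
	release(&fl, &b);		/* aged: jumps the queue */
	printf("next victim: %s (usage %d)\n",
	    TAILQ_FIRST(&fl)->name, TAILQ_FIRST(&fl)->usage);	/* b, 0 */
	return 0;
}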
For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { if (vp->v_type != VBLK && vp->v_type != VCHR) { vgone(vp); } else { vclean(vp, 0); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif busy++; } if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ void vclean(struct vnode *vp, int flags) { int active; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) VREF(vp); /* * Even if the count is zero, the VOP_INACTIVE routine may still have * the object locked while it cleans it out. The VOP_LOCK ensures that * the VOP_INACTIVE routine is done with its work. For active vnodes, * it ensures that no other activity can occur while the underlying * object is being cleaned out. */ VOP_LOCK(vp); /* * Prevent the vnode from being recycled or brought into use while we * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; /* * Clean out any buffers associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); /* * Any other processes trying to obtain this lock must first wait for * VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp); /* * If purging an active vnode, it must be closed and deactivated * before being reclaimed. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL); VOP_INACTIVE(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vclean: cannot reclaim"); if (active) vrele(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ void vgoneall(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, wait until * it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgall", 0); return; } /* * Ensure that vp will not be vgone'd while we are eliminating * its aliases. */ vp->v_flag |= VXLOCK; while (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type || vp == vq) continue; vgone(vq); break; } } /* * Remove the lock so that vgone below will really eliminate * the vnode after which time vgone will awaken any sleepers. */ vp->v_flag &= ~VXLOCK; } vgone(vp); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { register struct vnode *vq; struct vnode *vx; /* * If a vgone (or vclean) is already in progress, wait until it is * done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgone", 0); return; } /* * Clean out the filesystem specific data. */ vclean(vp, DOCLOSE); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) { LIST_REMOVE(vp, v_mntvnodes); vp->v_mount = NULL; } /* * If special device, remove it from special device alias list. 
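vclean()'s first move is the point of the comment above: if the vnode is active, take an extra reference before tearing it down so the count cannot reach zero mid-cleanup, and drop it again only after reclaim. The bare shape of that pin/teardown/unpin sequence, on a toy refcounted object:

#include <stdio.h>
#include <assert.h>

struct obj {
	int refs;
	int reclaimed;
};

static void obj_ref(struct obj *o)  { o->refs++; }
static void obj_rele(struct obj *o) { assert(o->refs > 0); o->refs--; }

/*
 * Teardown that is safe against the count reaching zero underneath
 * us: if the object is active, pin it with an extra reference for the
 * duration, as vclean() does with VREF()/vrele().
 */
static void
teardown(struct obj *o)
{
	int active = o->refs;

	if (active)
		obj_ref(o);		/* pin across the dangerous window */
	o->reclaimed = 1;		/* stand-in for VOP_RECLAIM() */
	if (active)
		obj_rele(o);		/* drop the pin */
}

int main(void)
{
	struct obj o = { 1, 0 };

	teardown(&o);
	printf("refs=%d reclaimed=%d\n", o.refs, o.reclaimed);	/* 1 1 */
	return 0;
}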
*/ if (vp->v_type == VBLK || vp->v_type == VCHR) { if (*vp->v_hashchain == vp) { *vp->v_hashchain = vp->v_specnext; } else { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_specnext != vp) continue; vq->v_specnext = vp->v_specnext; break; } if (vq == NULL) panic("missing bdev"); } if (vp->v_flag & VALIASED) { vx = NULL; for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vx) break; vx = vq; } if (vx == NULL) panic("missing alias"); if (vq == NULL) vx->v_flag &= ~VALIASED; vp->v_flag &= ~VALIASED; } FREE(vp->v_specinfo, M_VNODE); vp->v_specinfo = NULL; } /* * If it is on the freelist and not already at the head, move it to * the head of the list. The test of the back pointer and the * reference count of zero is because it will be removed from the free * list by getnewvnode, but will not have its reference count * incremented until after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to close the * previous instance of the underlying object. So, the back pointer is * explicitly set to `0xdeadb' in getnewvnode after removing it from * the freelist to ensure that we do not try to move it here. */ if (vp->v_usecount == 0 && vp->v_freelist.tqe_prev != (struct vnode **) 0xdeadb && vnode_free_list.tqh_first != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } vp->v_type = VBAD; } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { register struct vnode *vp; for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (dev != vp->v_rdev || type != vp->v_type) continue; *vpp = vp; return (1); } return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) register struct vnode *vp; { register struct vnode *vq, *vnext; int count; loop: if ((vp->v_flag & VALIASED) == 0) return (vp->v_usecount); for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vq->v_usecount == 0 && vq != vp) { vgone(vq); goto loop; } count += vq->v_usecount; } return (count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[64]; if (label != NULL) printf("%s: ", label); printf("type %s, usecount %d, writecount %d, refcount %ld,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. 
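The special-device branch of vgone() above unlinks the vnode from its singly linked SPECHASH chain by hand, walking to the predecessor when the vnode is not the chain head, and panicking ("missing bdev") if it cannot be found. The unlink itself, in isolation:

#include <stdio.h>
#include <stdlib.h>

struct snode {
	int dev;
	struct snode *specnext;
};

/*
 * Unlink vp from a singly linked hash chain, walking to its
 * predecessor when it is not the head -- the same shape as the
 * v_hashchain/v_specnext surgery in vgone().
 */
static int
chain_remove(struct snode **chainp, struct snode *vp)
{
	struct snode *vq;

	if (*chainp == vp) {
		*chainp = vp->specnext;
		return 0;
	}
	for (vq = *chainp; vq != NULL; vq = vq->specnext) {
		if (vq->specnext == vp) {
			vq->specnext = vp->specnext;
			return 0;
		}
	}
	return -1;			/* the kernel would panic("missing bdev") */
}

int main(void)
{
	struct snode c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct snode *chain = &a, *p;

	chain_remove(&chain, &b);
	for (p = chain; p != NULL; p = p->specnext)
		printf("%d ", p->dev);	/* 1 3 */
	printf("\n");
	return 0;
}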
*/ void printlockedvnodes(void) { register struct mount *mp; register struct vnode *vp; printf("Locked vnodes\n"); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) { for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) if (VOP_ISLOCKED(vp)) vprint((char *) 0, vp); } } #endif int kinfo_vdebug = 1; int kinfo_vgetfailed; #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { register struct mount *mp, *nmp; struct vnode *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_next; if (vfs_busy(mp)) continue; again: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { /* * Check that the vp is still associated with this * filesystem. RACE: could have been recycled onto * the same filesystem. */ if (vp->v_mount != mp) { if (kinfo_vdebug) printf("kinfo: vp changed\n"); goto again; } if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) { vfs_unbusy(mp); return (error); } } vfs_unbusy(mp); } return (0); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_specflags & SI_MOUNTEDON) return (EBUSY); if (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vq->v_specflags & SI_MOUNTEDON) return (EBUSY); } } return (0); } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. 
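sysctl_vnode() above follows the usual two-phase sysctl protocol: with no output buffer (req->oldptr == NULL) it returns a padded size estimate, with KINFO_VNODESLOP absorbing growth between the size probe and the copy, and otherwise it streams pointer/vnode pairs out. A minimal model of that protocol (struct req, dump_items, and SLOP are stand-ins for the sketch, not the real sysctl types):

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <errno.h>

/* Hypothetical stand-in for struct sysctl_req. */
struct req {
	void  *oldptr;			/* NULL means "just tell me the size" */
	size_t oldlen;			/* buffer size in, bytes written out */
};

#define SLOP 10				/* growth padding, like KINFO_VNODESLOP */

static int
dump_items(struct req *req, const int *items, size_t nitems)
{
	if (req->oldptr == NULL) {	/* phase 1: padded size estimate */
		req->oldlen = (nitems + SLOP) * sizeof(int);
		return 0;
	}
	if (req->oldlen < nitems * sizeof(int))
		return ENOMEM;		/* grew past the estimate */
	memcpy(req->oldptr, items, nitems * sizeof(int));	/* phase 2 */
	req->oldlen = nitems * sizeof(int);
	return 0;
}

int main(void)
{
	int items[3] = { 1, 2, 3 }, buf[16];
	struct req r = { NULL, 0 };

	dump_items(&r, items, 3);	/* ask for the size first */
	r.oldptr = buf;			/* then come back with a buffer */
	r.oldlen = sizeof(buf);
	dump_items(&r, items, 3);
	printf("copied %zu bytes, first item %d\n", r.oldlen, buf[0]);
	return 0;
}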
*/ static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); bzero((caddr_t) np, i); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* ARGSUSED */ static int vfs_free_netcred(struct radix_node *rn, void *w) { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. */ static void vfs_free_addrlist(struct netexport *nep) { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct mbuf *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = mtod(nam, struct sockaddr *); rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr) ((caddr_t) saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. 
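vfs_hang_addrlist() above makes a single malloc cover the netcred, the address, and the mask, then carves the pieces out with pointer arithmetic: saddr = (struct sockaddr *)(np + 1). The trailing-payload idiom on its own (struct cred and cred_create are hypothetical names for the sketch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cred {
	int flags;
	/* variable-length address bytes follow in the same allocation */
};

/*
 * One allocation, two logical objects: the header and its payload,
 * addressed as (np + 1) -- the idiom vfs_hang_addrlist() uses for the
 * netcred + sockaddr + mask block.
 */
static struct cred *
cred_create(int flags, const void *addr, size_t addrlen)
{
	struct cred *np = malloc(sizeof(*np) + addrlen);

	if (np == NULL)
		return NULL;
	np->flags = flags;
	memcpy(np + 1, addr, addrlen);	/* payload lives past the header */
	return np;
}

int main(void)
{
	const char addr[] = "192.0.2.1";
	struct cred *np = cred_create(7, addr, sizeof(addr));

	if (np != NULL) {
		printf("flags=%d addr=%s\n", np->flags, (char *)(np + 1));
		free(np);
	}
	return 0;
}

One allocation also means one free(): the error path at out: releases the whole block, addresses included.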
*/ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT)) continue; if (vp->v_object && (((vm_object_t) vp->v_object)->flags & OBJ_MIGHTBEDIRTY)) { vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE); } } } Index: head/sys/kern/vfs_subr.c =================================================================== --- head/sys/kern/vfs_subr.c (revision 13489) +++ head/sys/kern/vfs_subr.c (revision 13490) @@ -1,1538 +1,1547 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94 - * $Id: vfs_subr.c,v 1.50 1996/01/02 18:13:20 davidg Exp $ + * $Id: vfs_subr.c,v 1.51 1996/01/04 21:12:26 wollman Exp $ */ /* * External virtual filesystem routines */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void printlockedvnodes __P((void)); #endif extern void vclean __P((struct vnode *vp, int flags)); extern void vfs_unmountroot __P((struct mount *rootfs)); enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; /* * Insq/Remq for the vnode usage lists. */ #define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) #define bufremvn(bp) { \ LIST_REMOVE(bp, b_vnbufs); \ (bp)->b_vnbufs.le_next = NOLIST; \ } TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ u_long freevnodes = 0; struct mntlist mountlist; /* mounted filesystem list */ int desiredvnodes; SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RD, &desiredvnodes, 0, ""); static void vfs_free_addrlist __P((struct netexport *nep)); static int vfs_free_netcred __P((struct radix_node *rn, void *w)); static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, struct export_args *argp)); /* * Initialize the vnode management data structures. */ void vntblinit() { desiredvnodes = maxproc + vm_object_cache_max; TAILQ_INIT(&vnode_free_list); CIRCLEQ_INIT(&mountlist); } /* * Lock a filesystem. * Used to prevent access to it while mounting and unmounting. */ int vfs_lock(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MLOCK) { mp->mnt_flag |= MNT_MWAIT; (void) tsleep((caddr_t) mp, PVFS, "vfslck", 0); } mp->mnt_flag |= MNT_MLOCK; return (0); } /* * Unlock a locked filesystem. * Panic if filesystem is not locked. */ void vfs_unlock(mp) register struct mount *mp; { if ((mp->mnt_flag & MNT_MLOCK) == 0) panic("vfs_unlock: not locked"); mp->mnt_flag &= ~MNT_MLOCK; if (mp->mnt_flag & MNT_MWAIT) { mp->mnt_flag &= ~MNT_MWAIT; wakeup((caddr_t) mp); } } /* * Mark a mount point as busy. * Used to synchronize access and to delay unmounting. */ int vfs_busy(mp) register struct mount *mp; { while (mp->mnt_flag & MNT_MPBUSY) { mp->mnt_flag |= MNT_MPWANT; (void) tsleep((caddr_t) &mp->mnt_flag, PVFS, "vfsbsy", 0); } if (mp->mnt_flag & MNT_UNMOUNT) return (1); mp->mnt_flag |= MNT_MPBUSY; return (0); } /* * Free a busy filesystem. * Panic if filesystem is not busy. 
 */
void
vfs_unbusy(mp)
	register struct mount *mp;
{
	if ((mp->mnt_flag & MNT_MPBUSY) == 0)
		panic("vfs_unbusy: not busy");
	mp->mnt_flag &= ~MNT_MPBUSY;
	if (mp->mnt_flag & MNT_MPWANT) {
		mp->mnt_flag &= ~MNT_MPWANT;
		wakeup((caddr_t) &mp->mnt_flag);
	}
}

void
vfs_unmountroot(struct mount *rootfs)
{
	struct mount *mp = rootfs;
	int error;

	if (vfs_busy(mp)) {
		printf("failed to unmount root\n");
		return;
	}
	mp->mnt_flag |= MNT_UNMOUNT;
	if ((error = vfs_lock(mp))) {
		printf("lock of root filesystem failed (%d)\n", error);
		return;
	}
	vnode_pager_umount(mp);	/* release cached vnodes */
	cache_purgevfs(mp);	/* remove cache entries for this file sys */

	if ((error = VFS_SYNC(mp, MNT_WAIT, initproc->p_ucred, initproc)))
		printf("sync of root filesystem failed (%d)\n", error);

	if ((error = VFS_UNMOUNT(mp, MNT_FORCE, initproc))) {
		printf("unmount of root filesystem failed (");
		if (error == EBUSY)
			printf("BUSY)\n");
		else
			printf("%d)\n", error);
	}
	mp->mnt_flag &= ~MNT_UNMOUNT;
	vfs_unbusy(mp);
}

/*
 * Unmount all filesystems.  Should only be called by halt().
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp, *rootfs = NULL;
	int error;

	/* unmount all but rootfs */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		nmp = mp->mnt_list.cqe_prev;

		if (mp->mnt_flag & MNT_ROOTFS) {
			rootfs = mp;
			continue;
		}
		error = dounmount(mp, MNT_FORCE, initproc);
		if (error) {
			printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}

	/* and finally... */
	if (rootfs) {
		vfs_unmountroot(rootfs);
	} else {
		printf("no root filesystem\n");
	}
}

/*
 * Lookup a mount point by filesystem identifier.
 */
struct mount *
getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
			return (mp);
	}
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid
 */
void
getnewfsid(mp, mtype)
	struct mount *mp;
	int mtype;
{
	static u_short xxxfs_mntid;
	fsid_t tfsid;

	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		while (getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
}

/*
 * Set vnode attributes to VNOVAL
 */
void
vattr_null(vap)
	register struct vattr *vap;
{
	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
	    vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
	    vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	register struct vnode *vp;

+retry:
	vp = vnode_free_list.tqh_first;
	/*
	 * we allocate a new vnode if
	 * 1. we don't have any free
	 *	Pretty obvious, we actually used to panic, but that
	 *	is a silly thing to do.
	 * 2. we haven't filled our pool yet
	 *	We don't want to trash the incore (VM-)vnodecache.
	 * 3. if less than 1/4th of our vnodes are free.
* We don't want to trash the namei cache either. */ if (freevnodes < (numvnodes >> 2) || numvnodes < desiredvnodes || vp == NULL) { vp = (struct vnode *) malloc((u_long) sizeof *vp, M_VNODE, M_WAITOK); bzero((char *) vp, sizeof *vp); numvnodes++; } else { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + if (vp->v_usage > 0) { + --vp->v_usage; + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + goto retry; + } freevnodes--; - if (vp->v_usecount) - panic("free vnode isn't"); - /* see comment on why 0xdeadb is set at end of vgone (below) */ vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb; vp->v_lease = NULL; if (vp->v_type != VBAD) vgone(vp); + if (vp->v_usecount) + panic("free vnode isn't"); + #ifdef DIAGNOSTIC { int s; if (vp->v_data) panic("cleaned vnode isn't"); s = splbio(); if (vp->v_numoutput) panic("Clean vnode has pending I/O's"); splx(s); } #endif vp->v_flag = 0; vp->v_lastr = 0; vp->v_ralen = 0; vp->v_maxra = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; vp->v_writecount = 0; /* XXX */ + vp->v_usage = 0; } vp->v_type = VNON; cache_purge(vp); vp->v_tag = tag; vp->v_op = vops; insmntque(vp, mp); *vpp = vp; vp->v_usecount = 1; vp->v_data = 0; return (0); } /* * Move a vnode from one mount queue to another. */ void insmntque(vp, mp) register struct vnode *vp; register struct mount *mp; { /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) LIST_REMOVE(vp, v_mntvnodes); /* * Insert into list of vnodes for the new mount point, if available. */ if ((vp->v_mount = mp) == NULL) return; LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); } /* * Update outstanding I/O count and do wakeup if requested. */ void vwakeup(bp) register struct buf *bp; { register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; if ((vp = bp->b_vp)) { vp->v_numoutput--; if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; wakeup((caddr_t) &vp->v_numoutput); } } } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; int slpflag, slptimeo; { register struct buf *bp; struct buf *nbp, *blist; int s, error; vm_object_t object; if (flags & V_SAVE) { if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p))) return (error); if (vp->v_dirtyblkhd.lh_first != NULL) panic("vinvalbuf: dirty bufs"); } for (;;) { if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = blist->b_vnbufs.le_next; if (!blist) break; for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; s = splbio(); if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; error = tsleep((caddr_t) bp, slpflag | (PRIBIO + 1), "vinvalbuf", slptimeo); splx(s); if (error) return (error); break; } bremfree(bp); bp->b_flags |= B_BUSY; splx(s); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. 
*/ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { (void) VOP_BWRITE(bp); break; } bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF); brelse(bp); } } s = splbio(); while (vp->v_numoutput > 0) { vp->v_flag |= VBWAIT; tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); } splx(s); /* * Destroy the copy in the VM cache, too. */ object = vp->v_object; if (object != NULL) { vm_object_page_remove(object, 0, object->size, (flags & V_SAVE) ? TRUE : FALSE); } if (!(flags & V_SAVEMETA) && (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) panic("vinvalbuf: flush failed"); return (0); } /* * Associate a buffer with a vnode. */ void bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { int s; if (bp->b_vp) panic("bgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; /* * Insert onto list for new vnode. */ s = splbio(); bufinsvn(bp, &vp->v_cleanblkhd); splx(s); } /* * Disassociate a buffer from a vnode. */ void brelvp(bp) register struct buf *bp; { struct vnode *vp; int s; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); /* * Delete from old vnode list, if on one. */ s = splbio(); if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); splx(s); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Associate a p-buffer with a vnode. */ void pbgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { if (bp->b_vp) panic("pbgetvp: not free"); VHOLD(vp); bp->b_vp = vp; if (vp->v_type == VBLK || vp->v_type == VCHR) bp->b_dev = vp->v_rdev; else bp->b_dev = NODEV; } /* * Disassociate a p-buffer from a vnode. */ void pbrelvp(bp) register struct buf *bp; { struct vnode *vp; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. */ void reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { register struct buflists *listheadp; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { struct buf *tbp; tbp = newvp->v_dirtyblkhd.lh_first; if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { bufinsvn(bp, &newvp->v_dirtyblkhd); } else { - while (tbp->b_vnbufs.le_next && (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { tbp = tbp->b_vnbufs.le_next; } LIST_INSERT_AFTER(tbp, bp, b_vnbufs); } } else { listheadp = &newvp->v_cleanblkhd; bufinsvn(bp, listheadp); } } /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. * Also used for memory file system special devices. 
*/ int bdevvp(dev, vpp) dev_t dev; struct vnode **vpp; { register struct vnode *vp; struct vnode *nvp; int error; if (dev == NODEV) return (0); error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp); if (error) { *vpp = 0; return (error); } vp = nvp; vp->v_type = VBLK; if ((nvp = checkalias(vp, dev, (struct mount *) 0))) { vput(vp); vp = nvp; } *vpp = vp; return (0); } /* * Check to see if the new vnode represents a special device * for which we already have a vnode (either because of * bdevvp() or because of a different vnode representing * the same block device). If such an alias exists, deallocate * the existing contents and return the aliased vnode. The * caller is responsible for filling it with its new contents. */ struct vnode * checkalias(nvp, nvp_rdev, mp) register struct vnode *nvp; dev_t nvp_rdev; struct mount *mp; { register struct vnode *vp; struct vnode **vpp; if (nvp->v_type != VBLK && nvp->v_type != VCHR) return (NULLVP); vpp = &speclisth[SPECHASH(nvp_rdev)]; loop: for (vp = *vpp; vp; vp = vp->v_specnext) { if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vp->v_usecount == 0) { vgone(vp); goto loop; } if (vget(vp, 1)) goto loop; break; } if (vp == NULL || vp->v_tag != VT_NON) { MALLOC(nvp->v_specinfo, struct specinfo *, sizeof(struct specinfo), M_VNODE, M_WAITOK); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; nvp->v_specflags = 0; *vpp = nvp; if (vp != NULL) { nvp->v_flag |= VALIASED; vp->v_flag |= VALIASED; vput(vp); } return (NULLVP); } VOP_UNLOCK(vp); vclean(vp, 0); vp->v_op = nvp->v_op; vp->v_tag = nvp->v_tag; nvp->v_type = VNON; insmntque(vp, mp); return (vp); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. The vnode lock bit is set the * vnode is being eliminated in vgone. The process is awakened * when the transition is completed, and an error returned to * indicate that the vnode is no longer usable (possibly having * been changed to a new file system type). */ int vget(vp, lockflag) register struct vnode *vp; int lockflag; { /* * If the vnode is in the process of being cleaned out for another * use, we wait for the cleaning to finish and then return failure. * Cleaning is determined either by checking that the VXLOCK flag is * set, or that the use count is zero with the back pointer set to * show that it has been removed from the free list by getnewvnode. * The VXLOCK flag may not have been set yet because vclean is blocked * in the VOP_LOCK call waiting for the VOP_INACTIVE to complete. */ if ((vp->v_flag & VXLOCK) || (vp->v_usecount == 0 && vp->v_freelist.tqe_prev == (struct vnode **) 0xdeadb)) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vget", 0); return (1); } if (vp->v_usecount == 0) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); freevnodes--; } vp->v_usecount++; if (lockflag) VOP_LOCK(vp); return (0); } /* * Vnode reference, just increment the count */ void vref(vp) struct vnode *vp; { if (vp->v_usecount <= 0) panic("vref used where vget required"); vp->v_usecount++; } /* * vput(), just unlock and vrele() */ void vput(vp) register struct vnode *vp; { VOP_UNLOCK(vp); vrele(vp); } /* * Vnode release. * If count drops to zero, call inactive routine and return to freelist. 
*/ void vrele(vp) register struct vnode *vp; { #ifdef DIAGNOSTIC if (vp == NULL) panic("vrele: null vp"); #endif vp->v_usecount--; if (vp->v_usecount > 0) return; if (vp->v_usecount < 0 /* || vp->v_writecount < 0 */ ) { #ifdef DIAGNOSTIC vprint("vrele: negative ref count", vp); #endif panic("vrele: negative reference cnt"); } if (vp->v_flag & VAGE) { TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); vp->v_flag &= ~VAGE; + vp->v_usage = 0; } else { TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); } freevnodes++; VOP_INACTIVE(vp); } #ifdef DIAGNOSTIC /* * Page or buffer structure gets a reference. */ void vhold(vp) register struct vnode *vp; { vp->v_holdcnt++; } /* * Page or buffer structure frees a reference. */ void holdrele(vp) register struct vnode *vp; { if (vp->v_holdcnt <= 0) panic("holdrele: holdcnt"); vp->v_holdcnt--; } #endif /* DIAGNOSTIC */ /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If MNT_NOFORCE is specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If MNT_FORCE is specified, detach any active vnodes * that are found. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int vflush(mp, skipvp, flags) struct mount *mp; struct vnode *skipvp; int flags; { register struct vnode *vp, *nvp; int busy = 0; if ((mp->mnt_flag & MNT_MPBUSY) == 0) panic("vflush: not busy"); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { /* * Make sure this vnode wasn't reclaimed in getnewvnode(). * Start over if it has (it won't be on the list anymore). */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; /* * Skip over a selected vnode. */ if (vp == skipvp) continue; /* * Skip over a vnodes marked VSYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) continue; /* * If WRITECLOSE is set, only flush out regular file vnodes * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) continue; /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. */ if (vp->v_usecount == 0) { vgone(vp); continue; } /* * If FORCECLOSE is set, forcibly close the vnode. For block * or character devices, revert to an anonymous device. For * all other files, just kill them. */ if (flags & FORCECLOSE) { if (vp->v_type != VBLK && vp->v_type != VCHR) { vgone(vp); } else { vclean(vp, 0); vp->v_op = spec_vnodeop_p; insmntque(vp, (struct mount *) 0); } continue; } #ifdef DIAGNOSTIC if (busyprt) vprint("vflush: busy vnode", vp); #endif busy++; } if (busy) return (EBUSY); return (0); } /* * Disassociate the underlying file system from a vnode. */ void vclean(struct vnode *vp, int flags) { int active; /* * Check to see if the vnode is in use. If so we have to reference it * before we clean it out so that its count cannot fall to zero and * generate a race against ourselves to recycle it. */ if ((active = vp->v_usecount)) VREF(vp); /* * Even if the count is zero, the VOP_INACTIVE routine may still have * the object locked while it cleans it out. The VOP_LOCK ensures that * the VOP_INACTIVE routine is done with its work. For active vnodes, * it ensures that no other activity can occur while the underlying * object is being cleaned out. */ VOP_LOCK(vp); /* * Prevent the vnode from being recycled or brought into use while we * clean it out. 
*/ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); vp->v_flag |= VXLOCK; /* * Clean out any buffers associated with the vnode. */ if (flags & DOCLOSE) vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); /* * Any other processes trying to obtain this lock must first wait for * VXLOCK to clear, then call the new lock operation. */ VOP_UNLOCK(vp); /* * If purging an active vnode, it must be closed and deactivated * before being reclaimed. */ if (active) { if (flags & DOCLOSE) VOP_CLOSE(vp, FNONBLOCK, NOCRED, NULL); VOP_INACTIVE(vp); } /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp)) panic("vclean: cannot reclaim"); if (active) vrele(vp); /* * Done with purge, notify sleepers of the grim news. */ vp->v_op = dead_vnodeop_p; vp->v_tag = VT_NON; vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; wakeup((caddr_t) vp); } } /* * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. */ void vgoneall(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_flag & VALIASED) { /* * If a vgone (or vclean) is already in progress, wait until * it is done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgall", 0); return; } /* * Ensure that vp will not be vgone'd while we are eliminating * its aliases. */ vp->v_flag |= VXLOCK; while (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type || vp == vq) continue; vgone(vq); break; } } /* * Remove the lock so that vgone below will really eliminate * the vnode after which time vgone will awaken any sleepers. */ vp->v_flag &= ~VXLOCK; } vgone(vp); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(vp) register struct vnode *vp; { register struct vnode *vq; struct vnode *vx; /* * If a vgone (or vclean) is already in progress, wait until it is * done and return. */ if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t) vp, PINOD, "vgone", 0); return; } /* * Clean out the filesystem specific data. */ vclean(vp, DOCLOSE); /* * Delete from old mount point vnode list, if on one. */ if (vp->v_mount != NULL) { LIST_REMOVE(vp, v_mntvnodes); vp->v_mount = NULL; } /* * If special device, remove it from special device alias list. */ if (vp->v_type == VBLK || vp->v_type == VCHR) { if (*vp->v_hashchain == vp) { *vp->v_hashchain = vp->v_specnext; } else { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_specnext != vp) continue; vq->v_specnext = vp->v_specnext; break; } if (vq == NULL) panic("missing bdev"); } if (vp->v_flag & VALIASED) { vx = NULL; for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vx) break; vx = vq; } if (vx == NULL) panic("missing alias"); if (vq == NULL) vx->v_flag &= ~VALIASED; vp->v_flag &= ~VALIASED; } FREE(vp->v_specinfo, M_VNODE); vp->v_specinfo = NULL; } /* * If it is on the freelist and not already at the head, move it to * the head of the list. The test of the back pointer and the * reference count of zero is because it will be removed from the free * list by getnewvnode, but will not have its reference count * incremented until after calling vgone. If the reference count were * incremented first, vgone would (incorrectly) try to close the * previous instance of the underlying object. 
So, the back pointer is * explicitly set to `0xdeadb' in getnewvnode after removing it from * the freelist to ensure that we do not try to move it here. */ if (vp->v_usecount == 0 && vp->v_freelist.tqe_prev != (struct vnode **) 0xdeadb && vnode_free_list.tqh_first != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } vp->v_type = VBAD; } /* * Lookup a vnode by device number. */ int vfinddev(dev, type, vpp) dev_t dev; enum vtype type; struct vnode **vpp; { register struct vnode *vp; for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { if (dev != vp->v_rdev || type != vp->v_type) continue; *vpp = vp; return (1); } return (0); } /* * Calculate the total number of references to a special device. */ int vcount(vp) register struct vnode *vp; { register struct vnode *vq, *vnext; int count; loop: if ((vp->v_flag & VALIASED) == 0) return (vp->v_usecount); for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { vnext = vq->v_specnext; if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; /* * Alias, but not in use, so flush it out. */ if (vq->v_usecount == 0 && vq != vp) { vgone(vq); goto loop; } count += vq->v_usecount; } return (count); } /* * Print out a description of a vnode. */ static char *typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) char *label; register struct vnode *vp; { char buf[64]; if (label != NULL) printf("%s: ", label); printf("type %s, usecount %d, writecount %d, refcount %ld,", typename[vp->v_type], vp->v_usecount, vp->v_writecount, vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); if (vp->v_flag & VTEXT) strcat(buf, "|VTEXT"); if (vp->v_flag & VSYSTEM) strcat(buf, "|VSYSTEM"); if (vp->v_flag & VXLOCK) strcat(buf, "|VXLOCK"); if (vp->v_flag & VXWANT) strcat(buf, "|VXWANT"); if (vp->v_flag & VBWAIT) strcat(buf, "|VBWAIT"); if (vp->v_flag & VALIASED) strcat(buf, "|VALIASED"); if (buf[0] != '\0') printf(" flags (%s)", &buf[1]); if (vp->v_data == NULL) { printf("\n"); } else { printf("\n\t"); VOP_PRINT(vp); } } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ void printlockedvnodes(void) { register struct mount *mp; register struct vnode *vp; printf("Locked vnodes\n"); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = mp->mnt_list.cqe_next) { for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) if (VOP_ISLOCKED(vp)) vprint((char *) 0, vp); } } #endif int kinfo_vdebug = 1; int kinfo_vgetfailed; #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. */ /* ARGSUSED */ static int sysctl_vnode SYSCTL_HANDLER_ARGS { register struct mount *mp, *nmp; struct vnode *vp; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) req->lock = 0; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_next; if (vfs_busy(mp)) continue; again: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = vp->v_mntvnodes.le_next) { /* * Check that the vp is still associated with this * filesystem. RACE: could have been recycled onto * the same filesystem. 
*/ if (vp->v_mount != mp) { if (kinfo_vdebug) printf("kinfo: vp changed\n"); goto again; } if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || (error = SYSCTL_OUT(req, vp, VNODESZ))) { vfs_unbusy(mp); return (error); } } vfs_unbusy(mp); } return (0); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_vnode, "S,vnode", ""); /* * Check to see if a filesystem is mounted on a block device. */ int vfs_mountedon(vp) register struct vnode *vp; { register struct vnode *vq; if (vp->v_specflags & SI_MOUNTEDON) return (EBUSY); if (vp->v_flag & VALIASED) { for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) continue; if (vq->v_specflags & SI_MOUNTEDON) return (EBUSY); } } return (0); } /* * Build hash lists of net addresses and hang them off the mount point. * Called by ufs_mount() to set up the lists of export addresses. */ static int vfs_hang_addrlist(struct mount *mp, struct netexport *nep, struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; register int i; struct radix_node *rn; struct sockaddr *saddr, *smask = 0; struct domain *dom; int error; if (argp->ex_addrlen == 0) { if (mp->mnt_flag & MNT_DEFEXPORTED) return (EPERM); np = &nep->ne_defexported; np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; mp->mnt_flag |= MNT_DEFEXPORTED; return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); bzero((caddr_t) np, i); saddr = (struct sockaddr *) (np + 1); if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) smask->sa_len = argp->ex_masklen; } i = saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* * Seems silly to initialize every AF when most are not used, * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { dom->dom_rtattach((void **) &nep->ne_rtable[i], dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { error = ENOBUFS; goto out; } } rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, np->netc_rnodes); if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ error = EPERM; goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; np->netc_anon.cr_ref = 1; return (0); out: free(np, M_NETADDR); return (error); } /* ARGSUSED */ static int vfs_free_netcred(struct radix_node *rn, void *w) { register struct radix_node_head *rnh = (struct radix_node_head *) w; (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); free((caddr_t) rn, M_NETADDR); return (0); } /* * Free the net address hash lists that are hanging off the mount points. 
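vfs_free_netcred() above is shaped as a callback on purpose: the radix-tree code owns the traversal (rnh_walktree, used by vfs_free_addrlist just below), and the callback only has to delete and free the single node handed to it. A rough userland equivalent of that split, using a singly linked list in place of the radix tree; the names here are illustrative, not kernel API:

    #include <stdlib.h>

    struct entry {
        struct entry *next;
        /* payload omitted */
    };

    /* Traversal is owned by the container... */
    static void
    walk_entries(struct entry **headp, int (*f)(struct entry *, void *),
        void *arg)
    {
        struct entry *e, *next;

        for (e = *headp; e != NULL; e = next) {
            next = e->next;     /* fetch before the callback may free e */
            (*f)(e, arg);
        }
        *headp = NULL;
    }

    /* ...and the callback only deletes, as vfs_free_netcred() does. */
    static int
    free_entry(struct entry *e, void *arg)
    {
        (void)arg;
        free(e);
        return (0);
    }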
*/ static void vfs_free_addrlist(struct netexport *nep) { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) if ((rnh = nep->ne_rtable[i])) { (*rnh->rnh_walktree) (rnh, vfs_free_netcred, (caddr_t) rnh); free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } int vfs_export(mp, nep, argp) struct mount *mp; struct netexport *nep; struct export_args *argp; { int error; if (argp->ex_flags & MNT_DELEXPORT) { vfs_free_addrlist(nep); mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } return (0); } struct netcred * vfs_export_lookup(mp, nep, nam) register struct mount *mp; struct netexport *nep; struct mbuf *nam; { register struct netcred *np; register struct radix_node_head *rnh; struct sockaddr *saddr; np = NULL; if (mp->mnt_flag & MNT_EXPORTED) { /* * Lookup in the export list first. */ if (nam != NULL) { saddr = mtod(nam, struct sockaddr *); rnh = nep->ne_rtable[saddr->sa_family]; if (rnh != NULL) { np = (struct netcred *) (*rnh->rnh_matchaddr) ((caddr_t) saddr, rnh); if (np && np->netc_rnodes->rn_flags & RNF_ROOT) np = NULL; } } /* * If no address match, use the default if it exists. */ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) np = &nep->ne_defexported; } return (np); } /* * perform msync on all vnodes under a mount point * the mount point must be locked. */ void vfs_msync(struct mount *mp, int flags) { struct vnode *vp, *nvp; loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT)) continue; if (vp->v_object && (((vm_object_t) vp->v_object)->flags & OBJ_MIGHTBEDIRTY)) { vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE); } } } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 13489) +++ head/sys/kern/vfs_vnops.c (revision 13490) @@ -1,518 +1,518 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 - * $Id: vfs_vnops.c,v 1.21 1995/12/11 04:56:13 dyson Exp $ + * $Id: vfs_vnops.c,v 1.22 1995/12/17 21:23:24 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vn_closefile __P((struct file *fp, struct proc *p)); static int vn_ioctl __P((struct file *fp, int com, caddr_t data, struct proc *p)); static int vn_read __P((struct file *fp, struct uio *uio, struct ucred *cred)); static int vn_select __P((struct file *fp, int which, struct proc *p)); static int vn_vmio_open __P((struct vnode *vp, struct proc *p, struct ucred *cred)); static int vn_write __P((struct file *fp, struct uio *uio, struct ucred *cred)); struct fileops vnops = { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile }; /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. */ int vn_open(ndp, fmode, cmode) register struct nameidata *ndp; int fmode, cmode; { register struct vnode *vp; register struct proc *p = ndp->ni_cnd.cn_proc; register struct ucred *cred = p->p_ucred; struct vattr vat; struct vattr *vap = &vat; int error; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; error = namei(ndp); if (error) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; LEASE_CHECK(ndp->ni_dvp, p, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); if (error) return (error); fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF; error = namei(ndp); if (error) return (error); vp = ndp->ni_vp; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } if ((fmode & O_CREAT) == 0) { if (fmode & FREAD) { error = VOP_ACCESS(vp, VREAD, cred, p); if (error) goto bad; } if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; error = VOP_ACCESS(vp, VWRITE, cred, p); if (error) goto bad; } } if (fmode & O_TRUNC) { VOP_UNLOCK(vp); /* XXX */ LEASE_CHECK(vp, p, cred, LEASE_WRITE); VOP_LOCK(vp); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, cred, p); if (error) goto bad; } error = VOP_OPEN(vp, fmode, cred, p); if (error) goto bad; /* * this is here for VMIO support */ if (vp->v_type == VREG) { if ((error = 
vn_vmio_open(vp, p, cred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; return (0); bad: vput(vp); return (error); } /* * Check for write permissions on the specified vnode. * The read-only status of the file system is checked. * Also, prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, cred, p) register struct vnode *vp; int flags; struct ucred *cred; struct proc *p; { int error; if (flags & FWRITE) vp->v_writecount--; error = VOP_CLOSE(vp, flags, cred, p); vn_vmio_close(vp); return (error); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) enum uio_rw rw; struct vnode *vp; caddr_t base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *cred; int *aresid; struct proc *p; { struct uio auio; struct iovec aiov; int error; if ((ioflg & IO_NODELOCKED) == 0) VOP_LOCK(vp); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_procp = p; if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else { error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp); return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { register struct vnode *vp = (struct vnode *)fp->f_data; int count, error; LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_READ); VOP_LOCK(vp); uio->uio_offset = fp->f_offset; count = uio->uio_resid; error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0, cred); fp->f_offset += count - uio->uio_resid; VOP_UNLOCK(vp); return (error); } /* * File table vnode write routine. */ static int vn_write(fp, uio, cred) struct file *fp; struct uio *uio; struct ucred *cred; { register struct vnode *vp = (struct vnode *)fp->f_data; int count, error, ioflag = 0; if (vp->v_type == VREG && (fp->f_flag & O_APPEND)) ioflag |= IO_APPEND; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; LEASE_CHECK(vp, uio->uio_procp, cred, LEASE_WRITE); VOP_LOCK(vp); uio->uio_offset = fp->f_offset; count = uio->uio_resid; error = VOP_WRITE(vp, uio, ioflag, cred); if (ioflag & IO_APPEND) fp->f_offset = uio->uio_offset; else fp->f_offset += count - uio->uio_resid; VOP_UNLOCK(vp); return (error); } /* * File table vnode stat routine. 
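vn_read() and vn_write() above recover the transfer length the same way: uio_resid counts bytes still to move, so the bytes actually moved equal the resid saved before the VOP call minus the resid after it, and that difference is what advances fp->f_offset, even on a short or failed transfer. The bookkeeping in isolation, as a sketch with simplified types; do_transfer() stands in for VOP_READ/VOP_WRITE:

    #include <assert.h>
    #include <stddef.h>

    struct xuio {
        size_t uio_resid;       /* bytes still to transfer */
        long uio_offset;
    };

    /* Stand-in for VOP_READ/VOP_WRITE: moves some, maybe not all, bytes. */
    static int
    do_transfer(struct xuio *uio)
    {
        size_t moved = uio->uio_resid / 2;  /* short transfer on purpose */

        uio->uio_offset += moved;
        uio->uio_resid -= moved;
        return (0);
    }

    static int
    file_rw(long *f_offset, struct xuio *uio)
    {
        size_t count;
        int error;

        uio->uio_offset = *f_offset;    /* start where the file table says */
        count = uio->uio_resid;         /* bytes requested */
        error = do_transfer(uio);
        /* resid-before minus resid-after == bytes actually moved */
        *f_offset += count - uio->uio_resid;
        return (error);
    }

    int
    main(void)
    {
        struct xuio uio = { 100, 0 };
        long off = 4096;

        (void)file_rw(&off, &uio);
        assert(off == 4096 + 50);   /* offset advanced by bytes moved */
        return (0);
    }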
*/ int vn_stat(vp, sb, p) struct vnode *vp; register struct stat *sb; struct proc *p; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; vap = &vattr; error = VOP_GETATTR(vp, vap, p->p_ucred, p); if (error) return (error); /* * Copy from vattr table */ sb->st_dev = vap->va_fsid; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vp->v_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec= vap->va_mtime; sb->st_ctimespec = vap->va_ctime; sb->st_blksize = vap->va_blocksize; sb->st_flags = vap->va_flags; sb->st_gen = vap->va_gen; #if (S_BLKSIZE == 512) /* Optimize this case */ sb->st_blocks = vap->va_bytes >> 9; #else sb->st_blocks = vap->va_bytes / S_BLKSIZE; #endif return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, p) struct file *fp; int com; caddr_t data; struct proc *p; { register struct vnode *vp = ((struct vnode *)fp->f_data); struct vattr vattr; int error; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); } if (com == FIONBIO || com == FIOASYNC) /* XXX */ return (0); /* XXX */ /* fall into ... */ default: return (ENOTTY); case VFIFO: case VCHR: case VBLK: error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ if (p->p_session->s_ttyvp == vp) return (0); /* Get rid of reference to old control tty */ if (p->p_session->s_ttyvp) vrele(p->p_session->s_ttyvp); p->p_session->s_ttyvp = vp; VREF(vp); } return (error); } } /* * File table vnode select routine. */ static int vn_select(fp, which, p) struct file *fp; int which; struct proc *p; { return (VOP_SELECT(((struct vnode *)fp->f_data), which, fp->f_flag, fp->f_cred, p)); } /* * File table vnode close routine. */ static int vn_closefile(fp, p) struct file *fp; struct proc *p; { return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, fp->f_cred, p)); } static int vn_vmio_open(vp, p, cred) struct vnode *vp; struct proc *p; struct ucred *cred; { struct vattr vat; int error; /* * this is here for VMIO support */ - if (vp->v_type == VREG || vp->v_type == VBLK) { + if (vp->v_type == VREG /* || vp->v_type == VBLK */) { retry: if ((vp->v_flag & VVMIO) == 0) { if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) return error; - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { vm_object_t object; if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { VOP_UNLOCK(vp); tsleep(object, PVM, "vodead", 0); VOP_LOCK(vp); goto retry; } if (!object) panic("vn_open: VMIO object missing"); vm_object_reference(object); } } return 0; } void vn_vmio_close(vp) struct vnode *vp; { /* * this code is here for VMIO support, will eventually * be in vfs code. 
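The second hunk above converts the byte size from VOP_GETATTR into a page count before handing it to vnode_pager_alloc(), which apparently now takes its size in pages: the byte size is rounded up to a page boundary and then shifted down to a page index. Assuming the usual shapes of the vm macros (round_page() rounds up to PAGE_SIZE, OFF_TO_IDX() shifts right by PAGE_SHIFT), the arithmetic looks like this for 4K pages:

    #include <assert.h>

    /* Hypothetical stand-ins for the vm macros, assuming 4K pages. */
    #define XPAGE_SHIFT 12
    #define XPAGE_SIZE  (1UL << XPAGE_SHIFT)
    #define xround_page(x) (((x) + XPAGE_SIZE - 1) & ~(XPAGE_SIZE - 1))
    #define XOFF_TO_IDX(x) ((x) >> XPAGE_SHIFT)

    int
    main(void)
    {
        /* a 5000-byte file occupies two pages */
        assert(xround_page(5000) == 8192);
        assert(XOFF_TO_IDX(xround_page(5000)) == 2);
        /* without round_page, the trailing partial page would be lost */
        assert(XOFF_TO_IDX(5000) == 1);
        return (0);
    }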
*/ if (vp->v_flag & VVMIO) { vrele(vp); if (vp->v_object == NULL) panic("vn_close: VMIO object missing"); vm_object_deallocate(vp->v_object); } else vrele(vp); } Index: head/sys/miscfs/procfs/procfs_mem.c =================================================================== --- head/sys/miscfs/procfs/procfs_mem.c (revision 13489) +++ head/sys/miscfs/procfs/procfs_mem.c (revision 13490) @@ -1,246 +1,247 @@ /* * Copyright (c) 1993 Jan-Simon Pendry * Copyright (c) 1993 Sean Eric Fagan * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry and Sean Eric Fagan. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)procfs_mem.c 8.4 (Berkeley) 1/21/94 * - * $Id: procfs_mem.c,v 1.13 1995/12/11 04:56:31 dyson Exp $ + * $Id: procfs_mem.c,v 1.14 1995/12/17 07:19:24 bde Exp $ */ /* * This is a lightly hacked and merged version * of sef's pread/pwrite functions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int procfs_rwmem __P((struct proc *p, struct uio *uio)); static int procfs_rwmem(p, uio) struct proc *p; struct uio *uio; { int error; int writing; writing = uio->uio_rw == UIO_WRITE; /* * Only map in one page at a time. We don't have to, but it * makes things easier. This way is trivial - right? 
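The loop that follows carves the user range into chunks that never cross a page boundary: trunc_page() yields the page base, the leftover is the offset within the page, and the chunk length is capped by both the bytes remaining in the page and the bytes remaining in the request. The same arithmetic in a standalone form, 4K pages assumed:

    #include <assert.h>
    #include <stddef.h>

    #define XPAGE_SIZE 4096UL                   /* assume 4K pages */
    #define xtrunc_page(x) ((x) & ~(XPAGE_SIZE - 1))

    static size_t
    min_sz(size_t a, size_t b) { return (a < b ? a : b); }

    int
    main(void)
    {
        unsigned long uva = 0x1ffe;     /* 2 bytes before a page boundary */
        size_t resid = 100;

        while (resid > 0) {
            unsigned long pageno = xtrunc_page(uva);
            size_t page_offset = uva - pageno;
            size_t len = min_sz(XPAGE_SIZE - page_offset, resid);

            /* ...map pageno, copy len bytes starting at page_offset... */
            uva += len;
            resid -= len;
        }
        assert(uva == 0x1ffe + 100);
        return (0);
    }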
*/ do { vm_map_t map, tmap; vm_object_t object; vm_offset_t kva = 0; vm_offset_t uva; int page_offset; /* offset into page */ vm_offset_t pageno; /* page number */ vm_map_entry_t out_entry; vm_prot_t out_prot; vm_page_t m; boolean_t wired, single_use; vm_pindex_t pindex; u_int len; int fix_prot; uva = (vm_offset_t) uio->uio_offset; if (uva >= VM_MAXUSER_ADDRESS) { if (writing || (uva >= (VM_MAXUSER_ADDRESS + UPAGES * PAGE_SIZE))) { error = 0; break; } } /* * Get the page number of this segment. */ pageno = trunc_page(uva); page_offset = uva - pageno; /* * How many bytes to copy */ len = min(PAGE_SIZE - page_offset, uio->uio_resid); /* * The map we want... */ map = &p->p_vmspace->vm_map; /* * Check the permissions for the area we're interested * in. */ fix_prot = 0; if (writing) fix_prot = !vm_map_check_protection(map, pageno, pageno + PAGE_SIZE, VM_PROT_WRITE); if (fix_prot) { /* * If the page is not writable, we make it so. * XXX It is possible that a page may *not* be * read/executable, if a process changes that! * We will assume, for now, that a page is either * VM_PROT_ALL, or VM_PROT_READ|VM_PROT_EXECUTE. */ error = vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_ALL, 0); if (error) break; } /* * Now we need to get the page. out_entry, out_prot, wired, * and single_use aren't used. One would think the vm code * would be a *bit* nicer... We use tmap because * vm_map_lookup() can change the map argument. */ tmap = map; error = vm_map_lookup(&tmap, pageno, writing ? VM_PROT_WRITE : VM_PROT_READ, &out_entry, &object, &pindex, &out_prot, &wired, &single_use); /* * We're done with tmap now. */ if (!error) vm_map_lookup_done(tmap, out_entry); /* * Fault the page in... */ if (!error && writing && object->backing_object) { m = vm_page_lookup(object, pindex); if (m == 0) error = vm_fault(map, pageno, VM_PROT_WRITE, FALSE); } /* Find space in kernel_map for the page we're interested in */ if (!error) error = vm_map_find(kernel_map, object, - IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1); + IDX_TO_OFF(pindex), &kva, PAGE_SIZE, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); if (!error) { /* * Neither vm_map_lookup() nor vm_map_find() appear * to add a reference count to the object, so we do * that here and now. */ vm_object_reference(object); /* * Mark the page we just found as pageable. */ error = vm_map_pageable(kernel_map, kva, kva + PAGE_SIZE, 0); /* * Now do the i/o move. */ if (!error) error = uiomove((caddr_t)(kva + page_offset), len, uio); vm_map_remove(kernel_map, kva, kva + PAGE_SIZE); } if (fix_prot) vm_map_protect(map, pageno, pageno + PAGE_SIZE, VM_PROT_READ|VM_PROT_EXECUTE, 0); } while (error == 0 && uio->uio_resid > 0); return (error); } /* * Copy data in and out of the target process. * We do this by mapping the process's page into * the kernel and then doing a uiomove directly * from the kernel address space. */ int procfs_domem(curp, p, pfs, uio) struct proc *curp; struct proc *p; struct pfsnode *pfs; struct uio *uio; { int error; if (uio->uio_resid == 0) return (0); error = procfs_rwmem(p, uio); return (error); } /* * Given process (p), find the vnode from which * its text segment is being executed. * * It would be nice to grab this information from * the VM system, however, there is no sure-fire * way of doing that. Instead, fork(), exec() and * wait() all maintain the p_textvp field in the * process proc structure which contains a held * reference to the exec'ed vnode.
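The invariant behind that held reference is easy to state: whoever installs a new text vnode takes a reference before dropping the one being replaced. A toy sketch of the hand-off; xvref()/xvrele() are hypothetical stand-ins for the kernel's VREF()/vrele():

    struct xvnode { int refcnt; };

    static void xvref(struct xvnode *vp)  { vp->refcnt++; }
    static void xvrele(struct xvnode *vp) { vp->refcnt--; }

    static void
    set_textvp(struct xvnode **p_textvp, struct xvnode *newvp)
    {
        if (newvp != NULL)
            xvref(newvp);           /* hold the new text vnode... */
        if (*p_textvp != NULL)
            xvrele(*p_textvp);      /* ...before releasing the old one */
        *p_textvp = newvp;
    }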
*/ struct vnode * procfs_findtextvp(p) struct proc *p; { return (p->p_textvp); } Index: head/sys/msdosfs/msdosfs_denode.c =================================================================== --- head/sys/msdosfs/msdosfs_denode.c (revision 13489) +++ head/sys/msdosfs/msdosfs_denode.c (revision 13490) @@ -1,728 +1,730 @@ -/* $Id: msdosfs_denode.c,v 1.14 1995/12/03 16:41:53 bde Exp $ */ +/* $Id: msdosfs_denode.c,v 1.15 1995/12/07 12:47:19 davidg Exp $ */ /* $NetBSD: msdosfs_denode.c,v 1.9 1994/08/21 18:44:00 ws Exp $ */ /*- * Copyright (C) 1994 Wolfgang Solfrank. * Copyright (C) 1994 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. 
* * October 1992 */ #include #include #include #include #include #include #include #include #include /* defines "time" */ #include #include #include #include #include #include #include #include struct denode **dehashtbl; u_long dehash; /* size of hash table - 1 */ #define DEHASH(dev, deno) (((dev) + (deno)) & dehash) union _qcvt { quad_t qcvt; long val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } static struct denode * msdosfs_hashget __P((dev_t dev, u_long dirclust, u_long diroff)); static void msdosfs_hashins __P((struct denode *dep)); static void msdosfs_hashrem __P((struct denode *dep)); int msdosfs_init() { dehashtbl = hashinit(desiredvnodes/2, M_MSDOSFSMNT, &dehash); return 0; } static struct denode * msdosfs_hashget(dev, dirclust, diroff) dev_t dev; u_long dirclust; u_long diroff; { struct denode *dep; for (;;) for (dep = dehashtbl[DEHASH(dev, dirclust + diroff)];; dep = dep->de_next) { if (dep == NULL) return NULL; if (dirclust != dep->de_dirclust || diroff != dep->de_diroffset || dev != dep->de_dev || dep->de_refcnt == 0) continue; if (dep->de_flag & DE_LOCKED) { dep->de_flag |= DE_WANTED; (void) tsleep((caddr_t)dep, PINOD, "msdhgt", 0); break; } if (!vget(DETOV(dep), 1)) return dep; break; } /* NOTREACHED */ } static void msdosfs_hashins(dep) struct denode *dep; { struct denode **depp, *deq; depp = &dehashtbl[DEHASH(dep->de_dev, dep->de_dirclust + dep->de_diroffset)]; deq = *depp; if (deq) deq->de_prev = &dep->de_next; dep->de_next = deq; dep->de_prev = depp; *depp = dep; } static void msdosfs_hashrem(dep) struct denode *dep; { struct denode *deq; deq = dep->de_next; if (deq) deq->de_prev = dep->de_prev; *dep->de_prev = deq; #ifdef DIAGNOSTIC dep->de_next = NULL; dep->de_prev = NULL; #endif } /* * If deget() succeeds it returns with the gotten denode locked(). * * pmp - address of msdosfsmount structure of the filesystem containing * the denode of interest. The pm_dev field and the address of * the msdosfsmount structure are used. * dirclust - which cluster bp contains, if dirclust is 0 (root directory) * diroffset is relative to the beginning of the root directory, * otherwise it is cluster relative. * diroffset - offset past begin of cluster of denode we want * direntptr - address of the direntry structure of interest. If direntptr is * NULL, the block is read if necessary. * depp - returns the address of the gotten denode. */ int deget(pmp, dirclust, diroffset, direntptr, depp) struct msdosfsmount *pmp; /* so we know the maj/min number */ u_long dirclust; /* cluster this dir entry came from */ u_long diroffset; /* index of entry within the cluster */ struct direntry *direntptr; struct denode **depp; /* returns the addr of the gotten denode */ { int error; dev_t dev = pmp->pm_dev; struct mount *mntp = pmp->pm_mountp; struct denode *ldep; struct vnode *nvp; struct buf *bp; #ifdef MSDOSFS_DEBUG printf("deget(pmp %p, dirclust %ld, diroffset %x, direntptr %p, depp %p)\n", pmp, dirclust, diroffset, direntptr, depp); #endif /* * If dir entry is given and refers to a directory, convert to * canonical form */ if (direntptr && (direntptr->deAttributes & ATTR_DIRECTORY)) { dirclust = getushort(direntptr->deStartCluster); if (dirclust == MSDOSFSROOT) diroffset = MSDOSFSROOT_OFS; else diroffset = 0; } /* * See if the denode is in the denode cache. 
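The cache is keyed on (device, dirclust + diroffset); DEHASH folds those into a bucket index by masking with dehash, which works because hashinit() sizes the table to a power of two and stores size minus one. The bucket arithmetic on its own; the table size here is an assumption:

    #include <assert.h>

    /*
     * dehash is table_size - 1 with table_size a power of two, so
     * '& dehash' is a cheap modulo (this mirrors the DEHASH macro).
     */
    static unsigned long
    dehash_bucket(unsigned long dev, unsigned long dirclust,
        unsigned long diroff, unsigned long dehash)
    {
        return ((dev + dirclust + diroff) & dehash);
    }

    int
    main(void)
    {
        unsigned long dehash = 128 - 1;     /* assume a 128-bucket table */

        assert(dehash_bucket(5, 300, 32, dehash) == ((5 + 300 + 32) & 127));
        assert(dehash_bucket(5, 300, 32, dehash) < 128);
        return (0);
    }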
Use the location of * the directory entry to compute the hash value. For a subdir use the * address of its "." entry; for the root dir use cluster MSDOSFSROOT, * offset MSDOSFSROOT_OFS. * * NOTE: The check for de_refcnt > 0 below ensures the denode being * examined does not represent an unlinked but still open file. * Such files must not be accessible even when the directory * entry that represented the file happens to be reused while the * deleted file is still open. */ ldep = msdosfs_hashget(dev, dirclust, diroffset); if (ldep) { *depp = ldep; return 0; } /* * Directory entry was not in cache, have to create a vnode and * copy it from the passed disk buffer. */ /* getnewvnode() does a VREF() on the vnode */ error = getnewvnode(VT_MSDOSFS, mntp, msdosfs_vnodeop_p, &nvp); if (error) { *depp = 0; return error; } MALLOC(ldep, struct denode *, sizeof(struct denode), M_MSDOSFSNODE, M_WAITOK); bzero((caddr_t)ldep, sizeof *ldep); nvp->v_data = ldep; ldep->de_vnode = nvp; ldep->de_flag = 0; ldep->de_devvp = 0; ldep->de_lockf = 0; ldep->de_dev = dev; ldep->de_dirclust = dirclust; ldep->de_diroffset = diroffset; fc_purge(ldep, 0); /* init the fat cache for this denode */ /* * Insert the denode into the hash queue and lock the denode so it * can't be accessed until we've read it in and have done what we * need to it. */ VOP_LOCK(nvp); msdosfs_hashins(ldep); /* * Copy the directory entry into the denode area of the vnode. */ if (dirclust == MSDOSFSROOT && diroffset == MSDOSFSROOT_OFS) { /* * Directory entry for the root directory. There isn't one, * so we manufacture one. We should probably rummage * through the root directory and find a label entry (if it * exists), and then use the time and date from that entry * as the time and date for the root denode. */ ldep->de_Attributes = ATTR_DIRECTORY; ldep->de_StartCluster = MSDOSFSROOT; ldep->de_FileSize = pmp->pm_rootdirsize * pmp->pm_BytesPerSec; /* * fill in time and date so that dos2unixtime() doesn't * spit up when called from msdosfs_getattr() with root * denode */ ldep->de_Time = 0x0000; /* 00:00:00 */ ldep->de_Date = (0 << DD_YEAR_SHIFT) | (1 << DD_MONTH_SHIFT) | (1 << DD_DAY_SHIFT); /* Jan 1, 1980 */ /* leave the other fields as garbage */ } else { bp = NULL; if (!direntptr) { error = readep(pmp, dirclust, diroffset, &bp, &direntptr); if (error) return error; } DE_INTERNALIZE(ldep, direntptr); if (bp) brelse(bp); } /* * Fill in a few fields of the vnode and finish filling in the * denode. Then return the address of the found denode. */ ldep->de_pmp = pmp; ldep->de_devvp = pmp->pm_devvp; ldep->de_refcnt = 1; if (ldep->de_Attributes & ATTR_DIRECTORY) { /* * Since DOS directory entries that describe directories * have 0 in the filesize field, we take this opportunity * to find out the length of the directory and plug it into * the denode structure.
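Since the on-disk entry stores 0 for a directory's size, the code just below measures the directory by walking its FAT chain: pcbmap(), asked for an impossibly large cluster index, fails with E2BIG but reports how many clusters the chain really holds, and shifting by pm_cnshift (log2 of the cluster size) converts clusters to bytes. The shift in isolation, assuming 2K clusters:

    #include <assert.h>

    int
    main(void)
    {
        unsigned long nclusters = 3;
        unsigned long cnshift = 11;     /* assume 2K clusters: 1 << 11 */

        /*
         * Clusters-to-bytes is a shift because the cluster size is a
         * power of two (what 'size << pmp->pm_cnshift' does in deget).
         */
        assert((nclusters << cnshift) == 3 * 2048);
        return (0);
    }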
*/ u_long size; nvp->v_type = VDIR; if (ldep->de_StartCluster == MSDOSFSROOT) nvp->v_flag |= VROOT; else { error = pcbmap(ldep, 0xffff, 0, &size); if (error == E2BIG) { ldep->de_FileSize = size << pmp->pm_cnshift; error = 0; } else printf("deget(): pcbmap returned %d\n", error); } } else nvp->v_type = VREG; SETHIGH(ldep->de_modrev, mono_time.tv_sec); SETLOW(ldep->de_modrev, mono_time.tv_usec * 4294); VREF(ldep->de_devvp); *depp = ldep; return 0; } int deupdat(dep, tp, waitfor) struct denode *dep; struct timespec *tp; int waitfor; { int error; struct buf *bp; struct direntry *dirp; struct vnode *vp = DETOV(dep); #ifdef MSDOSFS_DEBUG printf("deupdat(): dep %p\n", dep); #endif /* * If the denode-modified and update-mtime bits are off, * or this denode is from a readonly filesystem, * or this denode is for a directory, * or the denode represents an open but unlinked file, * then don't do anything. DOS directory * entries that describe a directory do not ever get * updated. This is the way DOS treats them. */ if ((dep->de_flag & (DE_MODIFIED | DE_UPDATE)) == 0 || vp->v_mount->mnt_flag & MNT_RDONLY || dep->de_Attributes & ATTR_DIRECTORY || dep->de_refcnt <= 0) return 0; /* * Read in the cluster containing the directory entry we want to * update. */ error = readde(dep, &bp, &dirp); if (error) return error; /* * If the mtime is to be updated, put the passed-in time into the * directory entry. */ if (dep->de_flag & DE_UPDATE) { dep->de_Attributes |= ATTR_ARCHIVE; unix2dostime(tp, &dep->de_Date, &dep->de_Time); } /* * The mtime is now up to date. The denode will be unmodified soon. */ dep->de_flag &= ~(DE_MODIFIED | DE_UPDATE); /* * Copy the directory entry out of the denode into the cluster it * came from. */ DE_EXTERNALIZE(dirp, dep); /* * Write the cluster back to disk. If they asked us to wait * for the write to complete, then use bwrite() otherwise use * bdwrite(). */ error = 0; /* note that error is 0 from above, but ... */ if (waitfor) error = bwrite(bp); else bdwrite(bp); return error; } /* * Truncate the file described by dep to the length specified by length. */ int detrunc(dep, length, flags, cred, p) struct denode *dep; u_long length; int flags; struct ucred *cred; struct proc *p; { int error; int allerror; int vflags; u_long eofentry; u_long chaintofree; daddr_t bn; int boff; int isadir = dep->de_Attributes & ATTR_DIRECTORY; struct buf *bp; struct msdosfsmount *pmp = dep->de_pmp; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("detrunc(): file %s, length %d, flags %d\n", dep->de_Name, length, flags); #endif /* * Disallow attempts to truncate the root directory since it is of * fixed size. That's just the way dos filesystems are. We use * the VROOT bit in the vnode because checking for the directory * bit and a startcluster of 0 in the denode is not adequate to * recognize the root directory at this point in a file or * directory's life. */ if (DETOV(dep)->v_flag & VROOT) { printf( "detrunc(): can't truncate root directory, clust %ld, offset %ld\n", dep->de_dirclust, dep->de_diroffset); return EINVAL; } - vnode_pager_setsize(DETOV(dep), length); - if (dep->de_FileSize < length) + if (dep->de_FileSize < length) { + vnode_pager_setsize(DETOV(dep), length); return deextend(dep, length, cred); + } /* * If the desired length is 0 then remember the starting cluster of * the file and set the StartCluster field in the directory entry * to 0. If the desired length is not zero, then get the number of * the last cluster in the shortened file. Then get the number of
Then get the number of * the first cluster in the part of the file that is to be freed. * Then set the next cluster pointer in the last cluster of the * file to CLUST_EOFE. */ if (length == 0) { chaintofree = dep->de_StartCluster; dep->de_StartCluster = 0; eofentry = ~0; } else { error = pcbmap(dep, de_clcount(pmp, length) - 1, 0, &eofentry); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): pcbmap fails %d\n", error); #endif return error; } } fc_purge(dep, (length + pmp->pm_crbomask) >> pmp->pm_cnshift); /* * If the new length is not a multiple of the cluster size then we * must zero the tail end of the new last cluster in case it * becomes part of the file again because of a seek. */ if ((boff = length & pmp->pm_crbomask) != 0) { /* * should read from file vnode or filesystem vnode * depending on if file or dir */ if (isadir) { bn = cntobn(pmp, eofentry); error = bread(pmp->pm_devvp, bn, pmp->pm_bpcluster, NOCRED, &bp); } else { bn = de_blk(pmp, length); error = bread(DETOV(dep), bn, pmp->pm_bpcluster, NOCRED, &bp); } if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): bread fails %d\n", error); #endif return error; } /* * is this the right place for it? */ bzero(bp->b_data + boff, pmp->pm_bpcluster - boff); if (flags & IO_SYNC) bwrite(bp); else bdwrite(bp); } /* * Write out the updated directory entry. Even if the update fails * we free the trailing clusters. */ dep->de_FileSize = length; dep->de_flag |= DE_UPDATE; vflags = (length > 0 ? V_SAVE : 0) | V_SAVEMETA; vinvalbuf(DETOV(dep), vflags, cred, p, 0, 0); + vnode_pager_setsize(DETOV(dep), length); TIMEVAL_TO_TIMESPEC(&time, &ts); allerror = deupdat(dep, &ts, 1); #ifdef MSDOSFS_DEBUG printf("detrunc(): allerror %d, eofentry %d\n", allerror, eofentry); #endif /* * If we need to break the cluster chain for the file then do it * now. */ if (eofentry != ~0) { error = fatentry(FAT_GET_AND_SET, pmp, eofentry, &chaintofree, CLUST_EOFE); if (error) { #ifdef MSDOSFS_DEBUG printf("detrunc(): fatentry errors %d\n", error); #endif return error; } fc_setcache(dep, FC_LASTFC, (length - 1) >> pmp->pm_cnshift, eofentry); } /* * Now free the clusters removed from the file because of the * truncation. */ if (chaintofree != 0 && !MSDOSFSEOF(chaintofree)) freeclusterchain(pmp, chaintofree); return allerror; } /* * Extend the file described by dep to length specified by length. */ int deextend(dep, length, cred) struct denode *dep; off_t length; struct ucred *cred; { struct msdosfsmount *pmp = dep->de_pmp; u_long count; int error; struct timespec ts; /* * The root of a DOS filesystem cannot be extended. */ if (DETOV(dep)->v_flag & VROOT) return EINVAL; /* * Directories can only be extended by the superuser. * Is this really important? */ if (dep->de_Attributes & ATTR_DIRECTORY) { error = suser(cred, NULL); if (error) return error; } if (length <= dep->de_FileSize) panic("deextend: file too large"); /* * Compute the number of clusters to allocate. */ count = de_clcount(pmp, length) - de_clcount(pmp, dep->de_FileSize); if (count > 0) { if (count > pmp->pm_freeclustercount) return ENOSPC; error = extendfile(dep, count, NULL, NULL, DE_CLEAR); if (error) { /* truncate the added clusters away again */ (void) detrunc(dep, dep->de_FileSize, 0, cred, NULL); return error; } } dep->de_flag |= DE_UPDATE; dep->de_FileSize = length; TIMEVAL_TO_TIMESPEC(&time, &ts); return deupdat(dep, &ts, 1); } /* * Move a denode to its correct hash queue after the file it represents has * been moved to a new directory. 
*/ int reinsert(dep) struct denode *dep; { /* * Fix up the denode cache. If the denode is for a directory, * there is nothing to do since the hash is based on the starting * cluster of the directory file and that hasn't changed. If for a * file the hash is based on the location of the directory entry, * so we must remove it from the cache and re-enter it with the * hash based on the new location of the directory entry. */ if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) { msdosfs_hashrem(dep); msdosfs_hashins(dep); } return 0; } int msdosfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); #ifdef MSDOSFS_DEBUG printf("msdosfs_reclaim(): dep %p, file %s, refcnt %ld\n", dep, dep->de_Name, dep->de_refcnt); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_reclaim(): pushing active", vp); /* * Remove the denode from the denode hash chain we are in. */ msdosfs_hashrem(dep); cache_purge(vp); /* * Indicate that one less file on the filesystem is open. */ if (dep->de_devvp) { vrele(dep->de_devvp); dep->de_devvp = 0; } dep->de_flag = 0; FREE(dep, M_MSDOSFSNODE); vp->v_data = NULL; return 0; } int msdosfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct denode *dep = VTODE(vp); int error = 0; struct timespec ts; #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, de_Name[0] %x\n", dep, dep->de_Name[0]); #endif if (prtactive && vp->v_usecount != 0) vprint("msdosfs_inactive(): pushing active", vp); /* * Get rid of denodes related to stale file handles. Hmmm, what * does this really do? */ if (dep->de_Name[0] == SLOT_DELETED) { if ((vp->v_flag & VXLOCK) == 0) vgone(vp); return 0; } /* * If the file has been deleted and it is on a read/write * filesystem, then truncate the file, and mark the directory slot * as empty. (This may not be necessary for the dos filesystem.) */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): dep %p, refcnt %ld, mntflag %x, MNT_RDONLY %x\n", dep, dep->de_refcnt, vp->v_mount->mnt_flag, MNT_RDONLY); #endif VOP_LOCK(vp); if (dep->de_refcnt <= 0 && (vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { error = detrunc(dep, (u_long) 0, 0, NOCRED, NULL); dep->de_flag |= DE_UPDATE; dep->de_Name[0] = SLOT_DELETED; } if (dep->de_flag & (DE_MODIFIED | DE_UPDATE)) { TIMEVAL_TO_TIMESPEC(&time, &ts); deupdat(dep, &ts, 0); } VOP_UNLOCK(vp); dep->de_flag = 0; /* * If we are done with the denode, then reclaim it so that it can * be reused now. */ #ifdef MSDOSFS_DEBUG printf("msdosfs_inactive(): v_usecount %d, de_Name[0] %x\n", vp->v_usecount, dep->de_Name[0]); #endif if (vp->v_usecount == 0 && dep->de_Name[0] == SLOT_DELETED) vgone(vp); return error; } Index: head/sys/nfs/nfs_common.c =================================================================== --- head/sys/nfs/nfs_common.c (revision 13489) +++ head/sys/nfs/nfs_common.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
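The protocol version of the mount decides which table applies: an NFSv3 mount sends generic numbers unchanged, while a v2 mount translates through nfsv2_procid[], which is exactly the test nfsm_rpchead() makes further down. A sketch of that selection; the flag name and table contents here are placeholders:

    /* Placeholder table: generic procedure -> v2 procedure number. */
    static const int xv2_procid[] = { 0, 1, 2, 4, 6 };

    static int
    wire_procnum(int generic_proc, int mount_is_v3)
    {
        /* v3 speaks generic numbers natively; v2 needs the mapping */
        return (mount_is_v3 ? generic_proc : xv2_procid[generic_proc]);
    }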
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
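Each per-RPC list below is laid out for one consumption pattern: entry 0 is the default reply, the remaining entries are the permitted errors in ascending order, and a zero terminates the list. The filtering step as a standalone sketch; the error numbers in the test are illustrative:

    #include <assert.h>

    /*
     * First entry is the default return; the rest are the valid errors
     * for the RPC, ascending, terminated by 0.
     */
    static short
    errmap_filter(const short *tab, short err)
    {
        const short *sp;

        for (sp = tab + 1; *sp != 0; sp++)
            if (*sp == err)
                return (err);   /* permitted for this RPC */
        return (tab[0]);        /* not permitted: use the default */
    }

    int
    main(void)
    {
        static const short getattr_errs[] = { 5, 5, 70, 10001, 10006, 0 };

        assert(errmap_filter(getattr_errs, 70) == 70);  /* kept as-is */
        assert(errmap_filter(getattr_errs, 13) == 5);   /* filtered */
        return (0);
    }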
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
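The header that follows is eight fixed-size words, which is why a single nfsm_build() below can reserve them all: transaction id, message type, RPC version, program, program version, procedure, then the authentication flavor and length, each byte-swapped by txdr_unsigned(). A compact illustration of the layout; the constants are placeholders and htonl() stands in for txdr_unsigned():

    #include <arpa/inet.h>
    #include <stdint.h>

    #define X_RPC_CALL 0        /* message is a call, not a reply */
    #define X_RPC_VER 2         /* RPC protocol version */

    /*
     * Lay down the fixed eight-word RPC call header, mirroring the
     * sequence of *tl++ stores below (placeholder names throughout).
     */
    static void
    build_rpc_header(uint32_t hdr[8], uint32_t xid, uint32_t prog,
        uint32_t vers, uint32_t proc, uint32_t auth_type, uint32_t auth_len)
    {
        hdr[0] = htonl(xid);        /* transaction id */
        hdr[1] = htonl(X_RPC_CALL);
        hdr[2] = htonl(X_RPC_VER);
        hdr[3] = htonl(prog);       /* e.g. the NFS program number */
        hdr[4] = htonl(vers);
        hdr[5] = htonl(proc);       /* procedure within the program */
        hdr[6] = htonl(auth_type);  /* credential flavor */
        hdr[7] = htonl(auth_len);   /* credential length, rounded up */
    }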
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
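 *
 * The block of txdr_unsigned() assignments above trades a little
 * startup work for a cheaper fast path: constants such as rpc_call
 * and rpc_vers are byte-swapped once at boot and can then be stored
 * into headers verbatim.  A minimal sketch of the idea (names with
 * the _x suffix are hypothetical):
 *
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	static uint32_t rpc_call_x, rpc_vers_x;	// pre-encoded constants
 *
 *	static void
 *	rpc_const_init_x(void)
 *	{
 *		rpc_call_x = htonl(0);	// msg_type CALL, already XDR'd
 *		rpc_vers_x = htonl(2);	// RPC version 2, already XDR'd
 *	}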
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared up the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information.
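 *
 * The fallback above derives the vnode type from the S_IFMT bits of
 * fa_mode whenever fa_type is VNON or an untrustworthy VREG.  A
 * minimal userland sketch of that mapping (IFTOVT() is the real
 * macro; vtype_x and mode_to_vtype are hypothetical stand-ins):
 *
 *	#include <sys/stat.h>
 *
 *	enum vtype_x { XNON, XREG, XDIR, XBLK, XCHR, XLNK, XSOCK, XFIFO };
 *
 *	static enum vtype_x
 *	mode_to_vtype(unsigned short mode)
 *	{
 *		switch (mode & S_IFMT) {	// type lives in the top bits
 *		case S_IFREG:	return (XREG);
 *		case S_IFDIR:	return (XDIR);
 *		case S_IFBLK:	return (XBLK);
 *		case S_IFCHR:	return (XCHR);
 *		case S_IFLNK:	return (XLNK);
 *		case S_IFSOCK:	return (XSOCK);
 *		case S_IFIFO:	return (XFIFO);
 *		default:	return (XNON);	// no type bits: give up
 *		}
 *	}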
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
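 *
 * A flat-buffer analogue of the trim performed here: drop bytes from
 * the tail, then force the last nul bytes to NUL so a shortened XDR
 * string stays zero-padded on the wire.  A minimal sketch (buf_adj
 * is a hypothetical stand-in for the mbuf-chain version):
 *
 *	#include <string.h>
 *
 *	static size_t
 *	buf_adj(char *buf, size_t len, size_t trim, size_t nul)
 *	{
 *		len -= (trim > len) ? len : trim;	// trim the tail
 *		if (nul > 0 && nul <= len)
 *			memset(buf + len - nul, 0, nul);  // null fill
 *		return (len);
 *	}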
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
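 *
 * The cookie map being reset here is the one nfs_getcookie() above
 * maintains: a logical offset selects a directory block, and the
 * block index selects a (chunk, slot) pair in the linked nfsdmap
 * chunks, with block 0 pinned to the null cookie.  A sketch of the
 * addressing, assuming illustrative constants (_x names are
 * hypothetical):
 *
 *	#define DIRBLKSIZ_X	512	// assumed directory block size
 *	#define NUMCOOKIES_X	31	// assumed cookies per chunk
 *
 *	static void
 *	cookie_slot(long off, int *chunkp, int *slotp)
 *	{
 *		int pos = off / DIRBLKSIZ_X;	// which directory block
 *
 *		if (pos == 0) {		// block 0 gets the null cookie
 *			*chunkp = *slotp = -1;
 *			return;
 *		}
 *		pos--;
 *		*chunkp = pos / NUMCOOKIES_X;	// which nfsdmap entry
 *		*slotp = pos % NUMCOOKIES_X;	// slot in ndm_cookies[]
 *	}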
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/nfs/nfs_subs.c =================================================================== --- head/sys/nfs/nfs_subs.c (revision 13489) +++ head/sys/nfs/nfs_subs.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
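 *
 * A table like the one defined just below is consulted with a bounds
 * check on the wire procedure number, since a confused or malicious
 * client can send anything.  A minimal sketch of that translation
 * (the _x names and the table size are illustrative assumptions):
 *
 *	#define NPROCS_X 26	// assumed size of the mapping table
 *
 *	static int
 *	map_wire_proc(unsigned int wireproc, const int map[NPROCS_X],
 *	    int badproc)
 *	{
 *		if (wireproc >= NPROCS_X)
 *			return (badproc);	// out of range: reject
 *		return (map[wireproc]);
 *	}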
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
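 *
 * That layout, default entry first and then a 0-terminated ascending
 * list, lets nfsrv_errmap() filter with a single forward scan and an
 * early exit.  A sketch of the consuming side (errmap_filter is a
 * hypothetical stand-in):
 *
 *	static int
 *	errmap_filter(const short *tbl, int err)
 *	{
 *		const short *p = tbl;
 *
 *		while (*++p) {		// entries 1..n are sorted
 *			if (*p == err)
 *				return (err);	// legal for this RPC
 *			if (*p > err)
 *				break;		// passed it: not listed
 *		}
 *		return ((int)tbl[0]);	// fall back to the default
 *	}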
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
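 *
 * Note the XID discipline applied just below: the counter is bumped
 * for every request and zero is skipped, so a zero-filled reply can
 * never match an outstanding request.  As a standalone sketch
 * (next_xid is hypothetical):
 *
 *	#include <stdint.h>
 *
 *	static uint32_t
 *	next_xid(uint32_t *xidp)
 *	{
 *		if (++(*xidp) == 0)	// 0 is reserved: skip it
 *			(*xidp)++;
 *		return (*xidp);
 *	}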
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
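 *
 * The RPCAUTH_UNIX arm above marshals the classic AUTH_UNIX body:
 * stamp, machine name (empty here), uid, gid, then a counted list of
 * supplementary gids.  A minimal sketch of that layout, assuming the
 * caller sized the buffer at (5 + ngroups) words (the function name
 * is hypothetical):
 *
 *	#include <stdint.h>
 *	#include <arpa/inet.h>
 *
 *	static uint32_t *
 *	auth_unix_fill(uint32_t *p, uint32_t uid, uint32_t gid,
 *	    const uint32_t *gids, uint32_t ngroups)
 *	{
 *		uint32_t i;
 *
 *		*p++ = 0;		// stamp, unused here
 *		*p++ = 0;		// machine name: zero length
 *		*p++ = htonl(uid);
 *		*p++ = htonl(gid);
 *		*p++ = htonl(ngroups);	// gid list is counted
 *		for (i = 0; i < ngroups; i++)
 *			*p++ = htonl(gids[i]);
 *		return (p);
 *	}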
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
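 *
 * The nfs_ticks computation above converts the polling interval from
 * milliseconds to clock ticks, rounding to nearest and clamping at
 * one tick so the timer always advances.  A sketch (ms_to_ticks is
 * hypothetical):
 *
 *	static int
 *	ms_to_ticks(int hz, int ms)	// hz: kernel tick rate
 *	{
 *		int t = (hz * ms + 500) / 1000;	// round to nearest
 *
 *		return (t < 1 ? 1 : t);		// never zero
 *	}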
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared up the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information.
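 *
 * Below (and again in nfs_getattrcache()) the cached file size is
 * reconciled with the server's: while the node is marked NMODIFIED,
 * a stale, smaller server length must not shrink the local notion
 * of the file.  A minimal sketch of that rule (reconcile_size is
 * hypothetical):
 *
 *	static unsigned long
 *	reconcile_size(unsigned long srv, unsigned long local,
 *	    int locally_modified)
 *	{
 *		if (locally_modified && srv < local)
 *			return (local);	// keep our larger size
 *		return (srv);		// otherwise trust the server
 *	}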
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
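 *
 * For instance, trimming len == 7 with nul == 4 from a chain of
 * lengths {10, 10, 4}: the last mbuf is too short to absorb the trim,
 * so the correct chain length becomes 24 - 7 == 17; the scan below
 * stops at the second mbuf, shrinks it to 7 bytes, null-fills its
 * last 4 bytes, and the final loop zeroes the trailing mbuf's length.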
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
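 *
 * Resetting ndm_eocookie on the first directory map block is enough to
 * invalidate the map: nfs_getcookie() above never returns a slot at or
 * beyond ndm_eocookie unless it is adding new cookies.
 */

/*
 * Illustrative sketch, not part of this file: how nfs_getcookie() above
 * addresses a cookie slot.  A directory offset picks one nfsuint64 out
 * of a chain of nfsdmap blocks of NFSNUMCOOKIES slots each (offsets in
 * the first directory block get the shared null cookie instead).
 */
static void
nfs_cookie_addr_sketch(off_t off, int *blockp, int *slotp)
{
	int pos = off / NFS_DIRBLKSIZ;	/* directory block index */

	pos--;				/* block 0 uses the null cookie */
	*blockp = pos / NFSNUMCOOKIES;	/* which nfsdmap in the chain */
	*slotp = pos % NFSNUMCOOKIES;	/* index into ndm_cookies[] */
}
/*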
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/nfsclient/nfs_subs.c =================================================================== --- head/sys/nfsclient/nfs_subs.c (revision 13489) +++ head/sys/nfsclient/nfs_subs.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
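 *
 * The table below is indexed by the Version 2 on-the-wire procedure
 * number and yields the generic NFSPROC_* number; nfsv2_procid further
 * down performs the reverse translation when a request is sent to a
 * Version 2 server.  For instance, v2 wire procedure 17
 * (NFSV2PROC_STATFS) maps here to NFSPROC_FSSTAT, and nfsv2_procid
 * maps NFSPROC_FSSTAT back to NFSV2PROC_STATFS.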
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
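 *
 * For example, if an NFSv3 RMDIR fails with ENOTEMPTY, that value is
 * in nfsv3err_rmdir below and is returned unchanged by nfsrv_errmap();
 * an errno that is not in the list (say EPERM) is replaced by the
 * first entry, the default NFSERR_IO.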
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
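 *
 * The eight XDR words laid down first are the fixed part of an
 * RFC 1057 call message:
 *
 *	xid, CALL, rpc version (2), program, version, procedure,
 *	cred flavor (auth_type), cred length (authsiz)
 *
 * followed by the credential body and the verifier built below.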
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
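 *
 * (nqnfsstarttime ends up nqsrv_maxlease + nqsrv_clockskew +
 * nqsrv_writeslack seconds past boot, so any lease granted by a
 * previous incarnation of this server should have expired before
 * full service resumes.)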
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intends to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared up the issue, and requires fa_mode to not * contain any type information (while also introducing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. 
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/nfsserver/nfs_srvsubs.c =================================================================== --- head/sys/nfsserver/nfs_srvsubs.c (revision 13489) +++ head/sys/nfsserver/nfs_srvsubs.c (revision 13490) @@ -1,1979 +1,1979 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)nfs_subs.c 8.3 (Berkeley) 1/4/94 - * $Id: nfs_subs.c,v 1.26 1995/12/17 21:12:30 phk Exp $ + * $Id: nfs_subs.c,v 1.27 1996/01/13 23:27:56 phk Exp $ */ /* * These functions support the macros and help fiddle mbuf chains for * the nfs op functions. They do things like create the rpc header and * copy data between mbuf chains and uio lists. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef VFS_LKM #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef ISO #include #endif /* * Data items converted to xdr at startup, since they are constant * This is kinda hokey, but may save a little time doing byte swaps */ u_long nfs_xdrneg1; u_long rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, rpc_auth_kerb; u_long nfs_prog, nqnfs_prog, nfs_true, nfs_false; /* And other global data */ static u_long nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; enum vtype nv3tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO }; int nfs_ticks; struct nfs_reqq nfs_reqq; struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; int nfsd_head_flag; struct nfs_bufq nfs_bufq; struct nqtimerhead nqtimerhead; struct nqfhhashhead *nqfhhashtbl; u_long nqfhhash; #ifndef NFS_NOSERVER /* * Mapping of old NFS Version 2 RPC numbers to generic numbers. 
*/ int nfsv3_procid[NFS_NPROCS] = { NFSPROC_NULL, NFSPROC_GETATTR, NFSPROC_SETATTR, NFSPROC_NOOP, NFSPROC_LOOKUP, NFSPROC_READLINK, NFSPROC_READ, NFSPROC_NOOP, NFSPROC_WRITE, NFSPROC_CREATE, NFSPROC_REMOVE, NFSPROC_RENAME, NFSPROC_LINK, NFSPROC_SYMLINK, NFSPROC_MKDIR, NFSPROC_RMDIR, NFSPROC_READDIR, NFSPROC_FSSTAT, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP, NFSPROC_NOOP }; #endif /* NFS_NOSERVER */ /* * and the reverse mapping from generic to Version 2 procedure numbers */ int nfsv2_procid[NFS_NPROCS] = { NFSV2PROC_NULL, NFSV2PROC_GETATTR, NFSV2PROC_SETATTR, NFSV2PROC_LOOKUP, NFSV2PROC_NOOP, NFSV2PROC_READLINK, NFSV2PROC_READ, NFSV2PROC_WRITE, NFSV2PROC_CREATE, NFSV2PROC_MKDIR, NFSV2PROC_SYMLINK, NFSV2PROC_CREATE, NFSV2PROC_REMOVE, NFSV2PROC_RMDIR, NFSV2PROC_RENAME, NFSV2PROC_LINK, NFSV2PROC_READDIR, NFSV2PROC_NOOP, NFSV2PROC_STATFS, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, NFSV2PROC_NOOP, }; #ifndef NFS_NOSERVER /* * Maps errno values to nfs error numbers. * Use NFSERR_IO as the catch all for ones not specifically defined in * RFC 1094. */ static u_char nfsrv_v2errmap[ELAST] = { NFSERR_PERM, NFSERR_NOENT, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_EXIST, NFSERR_IO, NFSERR_NODEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_IO, NFSERR_ROFS, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_NAMETOL, NFSERR_IO, NFSERR_IO, NFSERR_NOTEMPTY, NFSERR_IO, NFSERR_IO, NFSERR_DQUOT, NFSERR_STALE, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, NFSERR_IO, }; /* * Maps errno values to nfs error numbers. * Although it is not obvious whether or not NFS clients really care if * a returned error value is in the specified list for the procedure, the * safest thing to do is filter them appropriately. For Version 2, the * X/Open XNFS document is the only specification that defines error values * for each RPC (The RFC simply lists all possible error values for all RPCs), * so I have decided to not do this for Version 2. * The first entry is the default error return and the rest are the valid * errors for that RPC in increasing numeric order. 
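 *
 * The lookup convention is therefore: scan the sorted tail for an exact
 * match and fall back to the first (default) entry.  A sketch of that
 * scan -- it is what nfsrv_errmap() at the end of this file does for
 * Version 3:
 */
#ifdef notdef	/* illustrative sketch, not part of this revision */
static int
nfsv3_filter_err(short *errlist, int err)
{
	register short *ep;

	/* errlist[0] is the default; the rest are sorted and 0-terminated. */
	for (ep = errlist + 1; *ep != 0; ep++) {
		if (*ep == err)
			return (err);
		if (*ep > err)		/* sorted, so no later match possible */
			break;
	}
	return ((int)errlist[0]);
}
#endif
/*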
*/ static short nfsv3err_null[] = { 0, 0, }; static short nfsv3err_getattr[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_setattr[] = { NFSERR_IO, NFSERR_PERM, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOT_SYNC, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_lookup[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_access[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_read[] = { NFSERR_IO, NFSERR_IO, NFSERR_NXIO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_write[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_INVAL, NFSERR_FBIG, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_create[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mkdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_symlink[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_mknod[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, NFSERR_BADTYPE, 0, }; static short nfsv3err_remove[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rmdir[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_ROFS, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_rename[] = { NFSERR_IO, NFSERR_NOENT, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_ISDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_NOTEMPTY, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_link[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_EXIST, NFSERR_XDEV, NFSERR_NOTDIR, NFSERR_INVAL, NFSERR_NOSPC, NFSERR_ROFS, NFSERR_MLINK, NFSERR_NAMETOL, NFSERR_DQUOT, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_NOTSUPP, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdir[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_readdirplus[] = { NFSERR_IO, NFSERR_IO, NFSERR_ACCES, NFSERR_NOTDIR, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_BAD_COOKIE, NFSERR_NOTSUPP, NFSERR_TOOSMALL, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsstat[] = { 
NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_fsinfo[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_pathconf[] = { NFSERR_STALE, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short nfsv3err_commit[] = { NFSERR_IO, NFSERR_IO, NFSERR_STALE, NFSERR_BADHANDLE, NFSERR_SERVERFAULT, 0, }; static short *nfsrv_v3errmap[] = { nfsv3err_null, nfsv3err_getattr, nfsv3err_setattr, nfsv3err_lookup, nfsv3err_access, nfsv3err_readlink, nfsv3err_read, nfsv3err_write, nfsv3err_create, nfsv3err_mkdir, nfsv3err_symlink, nfsv3err_mknod, nfsv3err_remove, nfsv3err_rmdir, nfsv3err_rename, nfsv3err_link, nfsv3err_readdir, nfsv3err_readdirplus, nfsv3err_fsstat, nfsv3err_fsinfo, nfsv3err_pathconf, nfsv3err_commit, }; #endif /* NFS_NOSERVER */ extern struct proc *nfs_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsrtt nfsrtt; extern time_t nqnfsstarttime; extern int nqsrv_clockskew; extern int nqsrv_writeslack; extern int nqsrv_maxlease; extern struct nfsstats nfsstats; extern int nqnfs_piggy[NFS_NPROCS]; extern nfstype nfsv2_type[9]; extern nfstype nfsv3_type[9]; extern struct nfsnodehashhead *nfsnodehashtbl; extern u_long nfsnodehash; #ifdef VFS_LKM struct getfh_args; extern int getfh(struct proc *, struct getfh_args *, int *); struct nfssvc_args; extern int nfssvc(struct proc *, struct nfssvc_args *, int *); #endif LIST_HEAD(nfsnodehashhead, nfsnode); /* * Create the header for an rpc request packet * The hsiz is the size of the rest of the nfs request header. * (just used to decide if a cluster is a good idea) */ struct mbuf * nfsm_reqh(vp, procid, hsiz, bposp) struct vnode *vp; u_long procid; int hsiz; caddr_t *bposp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; struct mbuf *mb2; struct nfsmount *nmp; int nqflag; MGET(mb, M_WAIT, MT_DATA); if (hsiz >= MINCLSIZE) MCLGET(mb, M_WAIT); mb->m_len = 0; bpos = mtod(mb, caddr_t); /* * For NQNFS, add lease request. */ if (vp) { nmp = VFSTONFS(vp->v_mount); if (nmp->nm_flag & NFSMNT_NQNFS) { nqflag = NQNFS_NEEDLEASE(vp, procid); if (nqflag) { nfsm_build(tl, u_long *, 2*NFSX_UNSIGNED); *tl++ = txdr_unsigned(nqflag); *tl = txdr_unsigned(nmp->nm_leaseterm); } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = 0; } } } /* Finally, return values */ *bposp = bpos; return (mb); } /* * Build the RPC header and fill in the authorization info. * The authorization string argument is only used when the credentials * come from outside of the kernel. * Returns the head of the mbuf list. */ struct mbuf * nfsm_rpchead(cr, nmflag, procid, auth_type, auth_len, auth_str, verf_len, verf_str, mrest, mrest_len, mbp, xidp) register struct ucred *cr; int nmflag; int procid; int auth_type; int auth_len; char *auth_str; int verf_len; char *verf_str; struct mbuf *mrest; int mrest_len; struct mbuf **mbp; u_long *xidp; { register struct mbuf *mb; register u_long *tl; register caddr_t bpos; register int i; struct mbuf *mreq, *mb2; int siz, grpsiz, authsiz; authsiz = nfsm_rndup(auth_len); MGETHDR(mb, M_WAIT, MT_DATA); if ((authsiz + 10 * NFSX_UNSIGNED) >= MINCLSIZE) { MCLGET(mb, M_WAIT); } else if ((authsiz + 10 * NFSX_UNSIGNED) < MHLEN) { MH_ALIGN(mb, authsiz + 10 * NFSX_UNSIGNED); } else { MH_ALIGN(mb, 8 * NFSX_UNSIGNED); } mb->m_len = 0; mreq = mb; bpos = mtod(mb, caddr_t); /* * First the RPC header. 
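 * This is the standard ONC RPC call preamble (cf. RFC 1057): xid,
 * message direction (CALL), RPC version 2, program, program version and
 * procedure number -- six words -- and the 8 * NFSX_UNSIGNED built
 * below also covers the two words of credential flavor and length that
 * come next.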
*/ nfsm_build(tl, u_long *, 8 * NFSX_UNSIGNED); if (++nfs_xid == 0) nfs_xid++; *tl++ = *xidp = txdr_unsigned(nfs_xid); *tl++ = rpc_call; *tl++ = rpc_vers; if (nmflag & NFSMNT_NQNFS) { *tl++ = txdr_unsigned(NQNFS_PROG); *tl++ = txdr_unsigned(NQNFS_VER3); } else { *tl++ = txdr_unsigned(NFS_PROG); if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(NFS_VER3); else *tl++ = txdr_unsigned(NFS_VER2); } if (nmflag & NFSMNT_NFSV3) *tl++ = txdr_unsigned(procid); else *tl++ = txdr_unsigned(nfsv2_procid[procid]); /* * And then the authorization cred. */ *tl++ = txdr_unsigned(auth_type); *tl = txdr_unsigned(authsiz); switch (auth_type) { case RPCAUTH_UNIX: nfsm_build(tl, u_long *, auth_len); *tl++ = 0; /* stamp ?? */ *tl++ = 0; /* NULL hostname */ *tl++ = txdr_unsigned(cr->cr_uid); *tl++ = txdr_unsigned(cr->cr_groups[0]); grpsiz = (auth_len >> 2) - 5; *tl++ = txdr_unsigned(grpsiz); for (i = 1; i <= grpsiz; i++) *tl++ = txdr_unsigned(cr->cr_groups[i]); break; case RPCAUTH_KERB4: siz = auth_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(auth_str, bpos, i); mb->m_len += i; auth_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } break; }; /* * And the verifier... */ nfsm_build(tl, u_long *, 2 * NFSX_UNSIGNED); if (verf_str) { *tl++ = txdr_unsigned(RPCAUTH_KERB4); *tl = txdr_unsigned(verf_len); siz = verf_len; while (siz > 0) { if (M_TRAILINGSPACE(mb) == 0) { MGET(mb2, M_WAIT, MT_DATA); if (siz >= MINCLSIZE) MCLGET(mb2, M_WAIT); mb->m_next = mb2; mb = mb2; mb->m_len = 0; bpos = mtod(mb, caddr_t); } i = min(siz, M_TRAILINGSPACE(mb)); bcopy(verf_str, bpos, i); mb->m_len += i; verf_str += i; bpos += i; siz -= i; } if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { for (i = 0; i < siz; i++) *bpos++ = '\0'; mb->m_len += siz; } } else { *tl++ = txdr_unsigned(RPCAUTH_NULL); *tl = 0; } mb->m_next = mrest; mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; mreq->m_pkthdr.rcvif = (struct ifnet *)0; *mbp = mb; return (mreq); } /* * copies mbuf chain to the uio scatter/gather list */ int nfsm_mbuftouio(mrep, uiop, siz, dpos) struct mbuf **mrep; register struct uio *uiop; int siz; caddr_t *dpos; { register char *mbufcp, *uiocp; register int xfer, left, len; register struct mbuf *mp; long uiosiz, rem; int error = 0; mp = *mrep; mbufcp = *dpos; len = mtod(mp, caddr_t)+mp->m_len-mbufcp; rem = nfsm_rndup(siz)-siz; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EFBIG); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { while (len == 0) { mp = mp->m_next; if (mp == NULL) return (EBADRPC); mbufcp = mtod(mp, caddr_t); len = mp->m_len; } xfer = (left > len) ? len : left; #ifdef notdef /* Not Yet.. 
*/ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (mbufcp, uiocp, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(mbufcp, uiocp, xfer); else copyout(mbufcp, uiocp, xfer); left -= xfer; len -= xfer; mbufcp += xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } *dpos = mbufcp; *mrep = mp; if (rem > 0) { if (len < rem) error = nfs_adv(mrep, dpos, rem, len); else *dpos += rem; } return (error); } /* * copies a uio scatter/gather list to an mbuf chain... */ int nfsm_uiotombuf(uiop, mq, siz, bpos) register struct uio *uiop; struct mbuf **mq; int siz; caddr_t *bpos; { register char *uiocp; register struct mbuf *mp, *mp2; register int xfer, left, mlen; int uiosiz, clflg, rem; char *cp; if (siz > MLEN) /* or should it >= MCLBYTES ?? */ clflg = 1; else clflg = 0; rem = nfsm_rndup(siz)-siz; mp = mp2 = *mq; while (siz > 0) { if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) return (EINVAL); left = uiop->uio_iov->iov_len; uiocp = uiop->uio_iov->iov_base; if (left > siz) left = siz; uiosiz = left; while (left > 0) { mlen = M_TRAILINGSPACE(mp); if (mlen == 0) { MGET(mp, M_WAIT, MT_DATA); if (clflg) MCLGET(mp, M_WAIT); mp->m_len = 0; mp2->m_next = mp; mp2 = mp; mlen = M_TRAILINGSPACE(mp); } xfer = (left > mlen) ? mlen : left; #ifdef notdef /* Not Yet.. */ if (uiop->uio_iov->iov_op != NULL) (*(uiop->uio_iov->iov_op)) (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else #endif if (uiop->uio_segflg == UIO_SYSSPACE) bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); else copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); mp->m_len += xfer; left -= xfer; uiocp += xfer; uiop->uio_offset += xfer; uiop->uio_resid -= xfer; } if (uiop->uio_iov->iov_len <= siz) { uiop->uio_iovcnt--; uiop->uio_iov++; } else { uiop->uio_iov->iov_base += uiosiz; uiop->uio_iov->iov_len -= uiosiz; } siz -= uiosiz; } if (rem > 0) { if (rem > M_TRAILINGSPACE(mp)) { MGET(mp, M_WAIT, MT_DATA); mp->m_len = 0; mp2->m_next = mp; } cp = mtod(mp, caddr_t)+mp->m_len; for (left = 0; left < rem; left++) *cp++ = '\0'; mp->m_len += rem; *bpos = cp; } else *bpos = mtod(mp, caddr_t)+mp->m_len; *mq = mp; return (0); } /* * Help break down an mbuf chain by setting the first siz bytes contiguous * pointed to by returned val. * This is used by the macros nfsm_dissect and nfsm_dissecton for tough * cases. (The macros use the vars. dpos and dpos2) */ int nfsm_disct(mdp, dposp, siz, left, cp2) struct mbuf **mdp; caddr_t *dposp; int siz; int left; caddr_t *cp2; { register struct mbuf *mp, *mp2; register int siz2, xfer; register caddr_t p; mp = *mdp; while (left == 0) { *mdp = mp = mp->m_next; if (mp == NULL) return (EBADRPC); left = mp->m_len; *dposp = mtod(mp, caddr_t); } if (left >= siz) { *cp2 = *dposp; *dposp += siz; } else if (mp->m_next == NULL) { return (EBADRPC); } else if (siz > MHLEN) { panic("nfs S too big"); } else { MGET(mp2, M_WAIT, MT_DATA); mp2->m_next = mp->m_next; mp->m_next = mp2; mp->m_len -= left; mp = mp2; *cp2 = p = mtod(mp, caddr_t); bcopy(*dposp, p, left); /* Copy what was left */ siz2 = siz-left; p += left; mp2 = mp->m_next; /* Loop around copying up the siz2 bytes */ while (siz2 > 0) { if (mp2 == NULL) return (EBADRPC); xfer = (siz2 > mp2->m_len) ? 
mp2->m_len : siz2; if (xfer > 0) { bcopy(mtod(mp2, caddr_t), p, xfer); NFSMADV(mp2, xfer); mp2->m_len -= xfer; p += xfer; siz2 -= xfer; } if (siz2 > 0) mp2 = mp2->m_next; } mp->m_len = siz; *mdp = mp2; *dposp = mtod(mp2, caddr_t); } return (0); } /* * Advance the position in the mbuf chain. */ int nfs_adv(mdp, dposp, offs, left) struct mbuf **mdp; caddr_t *dposp; int offs; int left; { register struct mbuf *m; register int s; m = *mdp; s = left; while (s < offs) { offs -= s; m = m->m_next; if (m == NULL) return (EBADRPC); s = m->m_len; } *mdp = m; *dposp = mtod(m, caddr_t)+offs; return (0); } /* * Copy a string into mbufs for the hard cases... */ int nfsm_strtmbuf(mb, bpos, cp, siz) struct mbuf **mb; char **bpos; char *cp; long siz; { register struct mbuf *m1 = 0, *m2; long left, xfer, len, tlen; u_long *tl; int putsize; putsize = 1; m2 = *mb; left = M_TRAILINGSPACE(m2); if (left > 0) { tl = ((u_long *)(*bpos)); *tl++ = txdr_unsigned(siz); putsize = 0; left -= NFSX_UNSIGNED; m2->m_len += NFSX_UNSIGNED; if (left > 0) { bcopy(cp, (caddr_t) tl, left); siz -= left; cp += left; m2->m_len += left; left = 0; } } /* Loop around adding mbufs */ while (siz > 0) { MGET(m1, M_WAIT, MT_DATA); if (siz > MLEN) MCLGET(m1, M_WAIT); m1->m_len = NFSMSIZ(m1); m2->m_next = m1; m2 = m1; tl = mtod(m1, u_long *); tlen = 0; if (putsize) { *tl++ = txdr_unsigned(siz); m1->m_len -= NFSX_UNSIGNED; tlen = NFSX_UNSIGNED; putsize = 0; } if (siz < m1->m_len) { len = nfsm_rndup(siz); xfer = siz; if (xfer < len) *(tl+(xfer>>2)) = 0; } else { xfer = len = m1->m_len; } bcopy(cp, (caddr_t) tl, xfer); m1->m_len = len+tlen; siz -= xfer; cp += xfer; } *mb = m1; *bpos = mtod(m1, caddr_t)+m1->m_len; return (0); } /* * Called once to initialize data structures... */ int nfs_init() { register int i; /* * Check to see if major data structures haven't bloated. */ if (sizeof (struct nfsnode) > NFS_NODEALLOC) { printf("struct nfsnode bloated (> %dbytes)\n", NFS_NODEALLOC); printf("Try reducing NFS_SMALLFH\n"); } if (sizeof (struct nfsmount) > NFS_MNTALLOC) { printf("struct nfsmount bloated (> %dbytes)\n", NFS_MNTALLOC); printf("Try reducing NFS_MUIDHASHSIZ\n"); } if (sizeof (struct nfssvc_sock) > NFS_SVCALLOC) { printf("struct nfssvc_sock bloated (> %dbytes)\n",NFS_SVCALLOC); printf("Try reducing NFS_UIDHASHSIZ\n"); } if (sizeof (struct nfsuid) > NFS_UIDALLOC) { printf("struct nfsuid bloated (> %dbytes)\n",NFS_UIDALLOC); printf("Try unionizing the nu_nickname and nu_flag fields\n"); } nfsrtt.pos = 0; rpc_vers = txdr_unsigned(RPC_VER2); rpc_call = txdr_unsigned(RPC_CALL); rpc_reply = txdr_unsigned(RPC_REPLY); rpc_msgdenied = txdr_unsigned(RPC_MSGDENIED); rpc_msgaccepted = txdr_unsigned(RPC_MSGACCEPTED); rpc_mismatch = txdr_unsigned(RPC_MISMATCH); rpc_autherr = txdr_unsigned(RPC_AUTHERR); rpc_auth_unix = txdr_unsigned(RPCAUTH_UNIX); rpc_auth_kerb = txdr_unsigned(RPCAUTH_KERB4); nfs_prog = txdr_unsigned(NFS_PROG); nqnfs_prog = txdr_unsigned(NQNFS_PROG); nfs_true = txdr_unsigned(TRUE); nfs_false = txdr_unsigned(FALSE); nfs_xdrneg1 = txdr_unsigned(-1); nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; /* Ensure async daemons disabled */ for (i = 0; i < NFS_MAXASYNCDAEMON; i++) nfs_iodwant[i] = (struct proc *)0; TAILQ_INIT(&nfs_bufq); nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ nfsrv_initcache(); /* Init the server request cache */ #endif /* * Initialize the nqnfs server stuff. 
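 * The start time is pushed out past boot by the maximum lease term plus
 * the allowed clock skew and write slack, so that by the time this
 * incarnation issues leases, anything granted before the reboot has
 * provably expired -- the usual crash-recovery argument for lease-based
 * consistency.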
*/ if (nqnfsstarttime == 0) { nqnfsstarttime = boottime.tv_sec + nqsrv_maxlease + nqsrv_clockskew + nqsrv_writeslack; NQLOADNOVRAM(nqnfsstarttime); CIRCLEQ_INIT(&nqtimerhead); nqfhhashtbl = hashinit(NQLCHSZ, M_NQLEASE, &nqfhhash); } /* * Initialize reply list and start timer */ TAILQ_INIT(&nfs_reqq); #ifndef NFS_NOSERVER nfs_timer(0); #endif #ifdef __FreeBSD__ /* * Set up lease_check and lease_updatetime so that other parts * of the system can call us, if we are loadable. */ #ifndef NFS_NOSERVER lease_check = nfs_lease_check; #endif lease_updatetime = nfs_lease_updatetime; vfsconf[MOUNT_NFS]->vfc_refcount++; /* make us non-unloadable */ #ifdef VFS_LKM sysent[SYS_nfssvc].sy_narg = 2; sysent[SYS_nfssvc].sy_call = nfssvc; #ifndef NFS_NOSERVER sysent[SYS_getfh].sy_narg = 2; sysent[SYS_getfh].sy_call = getfh; #endif #endif #endif return (0); } /* * Attribute cache routines. * nfs_loadattrcache() - loads or updates the cache contents from attributes * that are on the mbuf list * nfs_getattrcache() - returns valid attributes if found in cache, returns * error otherwise */ /* * Load the attribute cache (that lives in the nfsnode entry) with * the values on the mbuf list and * Iff vap not NULL * copy the attributes to *vaper */ int nfs_loadattrcache(vpp, mdp, dposp, vaper) struct vnode **vpp; struct mbuf **mdp; caddr_t *dposp; struct vattr *vaper; { register struct vnode *vp = *vpp; register struct vattr *vap; register struct nfs_fattr *fp; register struct nfsnode *np; register struct nfsnodehashhead *nhpp; register long t1; caddr_t cp2; int error = 0, rdev; struct mbuf *md; enum vtype vtyp; u_short vmode; struct timespec mtime; struct vnode *nvp; int v3 = NFS_ISV3(vp); md = *mdp; t1 = (mtod(md, caddr_t) + md->m_len) - *dposp; if (error = nfsm_disct(mdp, dposp, NFSX_FATTR(v3), t1, &cp2)) return (error); fp = (struct nfs_fattr *)cp2; if (v3) { vtyp = nfsv3tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); rdev = makedev(fxdr_unsigned(u_char, fp->fa3_rdev.specdata1), fxdr_unsigned(u_char, fp->fa3_rdev.specdata2)); fxdr_nfsv3time(&fp->fa3_mtime, &mtime); } else { vtyp = nfsv2tov_type(fp->fa_type); vmode = fxdr_unsigned(u_short, fp->fa_mode); /* * XXX * * The duplicate information returned in fa_type and fa_mode * is an ambiguity in the NFS version 2 protocol. * * VREG should be taken literally as a regular file. If a * server intents to return some type information differently * in the upper bits of the mode field (e.g. for sockets, or * FIFOs), NFSv2 mandates fa_type to be VNON. Anyway, we * leave the examination of the mode bits even in the VREG * case to avoid breakage for bogus servers, but we make sure * that there are actually type bits set in the upper part of * fa_mode (and failing that, trust the va_type field). * * NFSv3 cleared the issue, and requires fa_mode to not * contain any type information (while also introduing sockets * and FIFOs for fa_type). */ if (vtyp == VNON || (vtyp == VREG && (vmode & S_IFMT) != 0)) vtyp = IFTOVT(vmode); rdev = fxdr_unsigned(long, fp->fa2_rdev); fxdr_nfsv2time(&fp->fa2_mtime, &mtime); /* * Really ugly NFSv2 kludge. */ if (vtyp == VCHR && rdev == 0xffffffff) vtyp = VFIFO; } /* * If v_type == VNON it is a new node, so fill in the v_type, * n_mtime fields. Check to see if it represents a special * device, and if so, check for a possible alias. Once the * correct vnode has been obtained, fill in the rest of the * information. 
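 * In the block/character case, checkalias() may hand back an existing
 * vnode for the same device; the code below then migrates the nfsnode
 * to that alias (removing and re-inserting it in the nfsnode hash) and
 * discards the duplicate vnode with vrele()/vgone().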
*/ np = VTONFS(vp); if (vp->v_type != vtyp) { /* * If we had a lock and it turns out that the vnode * is an object which we don't want to lock (e.g. VDIR) * to avoid nasty hanging problems on a server crash, * then release it here. */ if (vtyp != VREG && VOP_ISLOCKED(vp)) VOP_UNLOCK(vp); vp->v_type = vtyp; if (vp->v_type == VFIFO) { vp->v_op = fifo_nfsv2nodeop_p; } if (vp->v_type == VCHR || vp->v_type == VBLK) { vp->v_op = spec_nfsv2nodeop_p; nvp = checkalias(vp, (dev_t)rdev, vp->v_mount); if (nvp) { /* * Discard unneeded vnode, but save its nfsnode. */ LIST_REMOVE(np, n_hash); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased node. */ np->n_vnode = nvp; nhpp = NFSNOHASH(nfs_hash(np->n_fhp, np->n_fhsize)); LIST_INSERT_HEAD(nhpp, np, n_hash); *vpp = vp = nvp; } } np->n_mtime = mtime.ts_sec; } vap = &np->n_vattr; vap->va_type = vtyp; vap->va_mode = (vmode & 07777); vap->va_rdev = (dev_t)rdev; vap->va_mtime = mtime; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; if (v3) { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); fxdr_hyper(&fp->fa3_size, &vap->va_size); vap->va_blocksize = NFS_FABLKSIZE; fxdr_hyper(&fp->fa3_used, &vap->va_bytes); vap->va_fileid = fxdr_unsigned(int, fp->fa3_fileid.nfsuquad[1]); fxdr_nfsv3time(&fp->fa3_atime, &vap->va_atime); fxdr_nfsv3time(&fp->fa3_ctime, &vap->va_ctime); vap->va_flags = 0; vap->va_filerev = 0; } else { vap->va_nlink = fxdr_unsigned(u_short, fp->fa_nlink); vap->va_uid = fxdr_unsigned(uid_t, fp->fa_uid); vap->va_gid = fxdr_unsigned(gid_t, fp->fa_gid); vap->va_size = fxdr_unsigned(u_long, fp->fa2_size); vap->va_blocksize = fxdr_unsigned(long, fp->fa2_blocksize); vap->va_bytes = fxdr_unsigned(long, fp->fa2_blocks) * NFS_FABLKSIZE; vap->va_fileid = fxdr_unsigned(long, fp->fa2_fileid); fxdr_nfsv2time(&fp->fa2_atime, &vap->va_atime); vap->va_flags = 0; vap->va_ctime.ts_sec = fxdr_unsigned(long, fp->fa2_ctime.nfsv2_sec); vap->va_ctime.ts_nsec = 0; vap->va_gen = fxdr_unsigned(u_long, fp->fa2_ctime.nfsv2_usec); vap->va_filerev = 0; } if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } np->n_attrstamp = time.tv_sec; if (vaper != NULL) { bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(*vap)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } } return (0); } /* * Check the time stamp * If the cache is valid, copy contents to *vap and return 0 * otherwise return an error */ int nfs_getattrcache(vp, vaper) register struct vnode *vp; struct vattr *vaper; { register struct nfsnode *np = VTONFS(vp); register struct vattr *vap; if ((time.tv_sec - np->n_attrstamp) >= NFS_ATTRTIMEO(np)) { nfsstats.attrcache_misses++; return (ENOENT); } nfsstats.attrcache_hits++; vap = &np->n_vattr; if (vap->va_size != np->n_size) { if (vap->va_type == VREG) { if (np->n_flag & NMODIFIED) { if (vap->va_size < np->n_size) vap->va_size = np->n_size; else np->n_size = vap->va_size; } else np->n_size = vap->va_size; vnode_pager_setsize(vp, (u_long)np->n_size); } else np->n_size = vap->va_size; } bcopy((caddr_t)vap, (caddr_t)vaper, sizeof(struct vattr)); if (np->n_flag & NCHG) { if (np->n_flag & NACC) 
vaper->va_atime = np->n_atim; if (np->n_flag & NUPD) vaper->va_mtime = np->n_mtim; } return (0); } #ifndef NFS_NOSERVER /* * Set up nameidata for a lookup() call and do it */ int nfs_namei(ndp, fhp, len, slp, nam, mdp, dposp, retdirp, p, kerbflag) register struct nameidata *ndp; fhandle_t *fhp; int len; struct nfssvc_sock *slp; struct mbuf *nam; struct mbuf **mdp; caddr_t *dposp; struct vnode **retdirp; struct proc *p; int kerbflag; { register int i, rem; register struct mbuf *md; register char *fromcp, *tocp; struct vnode *dp; int error, rdonly; struct componentname *cnp = &ndp->ni_cnd; *retdirp = (struct vnode *)0; MALLOC(cnp->cn_pnbuf, char *, len + 1, M_NAMEI, M_WAITOK); /* * Copy the name from the mbuf list to ndp->ni_pnbuf * and set the various ndp fields appropriately. */ fromcp = *dposp; tocp = cnp->cn_pnbuf; md = *mdp; rem = mtod(md, caddr_t) + md->m_len - fromcp; cnp->cn_hash = 0; for (i = 0; i < len; i++) { while (rem == 0) { md = md->m_next; if (md == NULL) { error = EBADRPC; goto out; } fromcp = mtod(md, caddr_t); rem = md->m_len; } if (*fromcp == '\0' || *fromcp == '/') { error = EACCES; goto out; } cnp->cn_hash += (unsigned char)*fromcp; *tocp++ = *fromcp++; rem--; } *tocp = '\0'; *mdp = md; *dposp = fromcp; len = nfsm_rndup(len)-len; if (len > 0) { if (rem >= len) *dposp += len; else if (error = nfs_adv(mdp, dposp, len, rem)) goto out; } ndp->ni_pathlen = tocp - cnp->cn_pnbuf; cnp->cn_nameptr = cnp->cn_pnbuf; /* * Extract and set starting directory. */ if (error = nfsrv_fhtovp(fhp, FALSE, &dp, ndp->ni_cnd.cn_cred, slp, nam, &rdonly, kerbflag)) goto out; if (dp->v_type != VDIR) { nfsrv_vrele(dp); error = ENOTDIR; goto out; } VREF(dp); *retdirp = dp; ndp->ni_startdir = dp; if (rdonly) cnp->cn_flags |= (NOCROSSMOUNT | RDONLY); else cnp->cn_flags |= NOCROSSMOUNT; /* * And call lookup() to do the real work */ cnp->cn_proc = p; if (error = lookup(ndp)) goto out; /* * Check for encountering a symbolic link */ if (cnp->cn_flags & ISSYMLINK) { if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; error = EINVAL; goto out; } nfsrv_vmio(ndp->ni_vp); /* * Check for saved name request */ if (cnp->cn_flags & (SAVENAME | SAVESTART)) { cnp->cn_flags |= HASBUF; return (0); } out: FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } /* * A fiddled version of m_adj() that ensures null fill to a long * boundary and only trims off the back end */ void nfsm_adj(mp, len, nul) struct mbuf *mp; register int len; int nul; { register struct mbuf *m; register int count, i; register char *cp; /* * Trim from tail. Scan the mbuf chain, * calculating its length and finding the last mbuf. * If the adjustment only affects this mbuf, then just * adjust and return. Otherwise, rescan and truncate * after the remaining size. */ count = 0; m = mp; for (;;) { count += m->m_len; if (m->m_next == (struct mbuf *)0) break; m = m->m_next; } if (m->m_len > len) { m->m_len -= len; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } return; } count -= len; if (count < 0) count = 0; /* * Correct length for chain is "count". * Find the mbuf with last data, adjust its length, * and toss data from remaining mbufs on chain. 
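 * For example, trimming len = 30 from a chain of lengths 60+60+20
 * (count = 140): the last mbuf is too short to absorb the trim, so the
 * chain is rescanned for the corrected length 110, the second mbuf is
 * cut to 50 (zeroing nul pad bytes at its new tail when requested) and
 * the trailing mbuf is emptied.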
*/ for (m = mp; m; m = m->m_next) { if (m->m_len >= count) { m->m_len = count; if (nul > 0) { cp = mtod(m, caddr_t)+m->m_len-nul; for (i = 0; i < nul; i++) *cp++ = '\0'; } break; } count -= m->m_len; } for (m = m->m_next;m;m = m->m_next) m->m_len = 0; } /* * Make these functions instead of macros, so that the kernel text size * doesn't get too big... */ void nfsm_srvwcc(nfsd, before_ret, before_vap, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int before_ret; register struct vattr *before_vap; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; if (before_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, 7 * NFSX_UNSIGNED); *tl++ = nfs_true; txdr_hyper(&(before_vap->va_size), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_mtime), tl); tl += 2; txdr_nfsv3time(&(before_vap->va_ctime), tl); } *bposp = bpos; *mbp = mb; nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); } void nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp) struct nfsrv_descript *nfsd; int after_ret; struct vattr *after_vap; struct mbuf **mbp; char **bposp; { register struct mbuf *mb = *mbp, *mb2; register char *bpos = *bposp; register u_long *tl; register struct nfs_fattr *fp; if (after_ret) { nfsm_build(tl, u_long *, NFSX_UNSIGNED); *tl = nfs_false; } else { nfsm_build(tl, u_long *, NFSX_UNSIGNED + NFSX_V3FATTR); *tl++ = nfs_true; fp = (struct nfs_fattr *)tl; nfsm_srvfattr(nfsd, after_vap, fp); } *mbp = mb; *bposp = bpos; } void nfsm_srvfattr(nfsd, vap, fp) register struct nfsrv_descript *nfsd; register struct vattr *vap; register struct nfs_fattr *fp; { fp->fa_nlink = txdr_unsigned(vap->va_nlink); fp->fa_uid = txdr_unsigned(vap->va_uid); fp->fa_gid = txdr_unsigned(vap->va_gid); if (nfsd->nd_flag & ND_NFSV3) { fp->fa_type = vtonfsv3_type(vap->va_type); fp->fa_mode = vtonfsv3_mode(vap->va_mode); txdr_hyper(&vap->va_size, &fp->fa3_size); txdr_hyper(&vap->va_bytes, &fp->fa3_used); fp->fa3_rdev.specdata1 = txdr_unsigned(major(vap->va_rdev)); fp->fa3_rdev.specdata2 = txdr_unsigned(minor(vap->va_rdev)); fp->fa3_fsid.nfsuquad[0] = 0; fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); fp->fa3_fileid.nfsuquad[0] = 0; fp->fa3_fileid.nfsuquad[1] = txdr_unsigned(vap->va_fileid); txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); } else { fp->fa_type = vtonfsv2_type(vap->va_type); fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); fp->fa2_size = txdr_unsigned(vap->va_size); fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); if (vap->va_type == VFIFO) fp->fa2_rdev = 0xffffffff; else fp->fa2_rdev = txdr_unsigned(vap->va_rdev); fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); fp->fa2_fsid = txdr_unsigned(vap->va_fsid); fp->fa2_fileid = txdr_unsigned(vap->va_fileid); txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); } } /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) * - get vp and export rights by calling VFS_FHTOVP() * - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon * - if not lockflag unlock it with VOP_UNLOCK() */ int nfsrv_fhtovp(fhp, lockflag, vpp, cred, slp, nam, rdonlyp, kerbflag) fhandle_t *fhp; int lockflag; struct vnode **vpp; struct 
ucred *cred; struct nfssvc_sock *slp; struct mbuf *nam; int *rdonlyp; int kerbflag; { register struct mount *mp; register int i; struct ucred *credanon; int error, exflags; *vpp = (struct vnode *)0; mp = getvfs(&fhp->fh_fsid); if (!mp) return (ESTALE); error = VFS_FHTOVP(mp, &fhp->fh_fid, nam, vpp, &exflags, &credanon); if (error) return (error); /* * Check/setup credentials. */ if (exflags & MNT_EXKERB) { if (!kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } } else if (kerbflag) { vput(*vpp); return (NFSERR_AUTHERR | AUTH_TOOWEAK); } else if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) { cred->cr_uid = credanon->cr_uid; for (i = 0; i < credanon->cr_ngroups && i < NGROUPS; i++) cred->cr_groups[i] = credanon->cr_groups[i]; cred->cr_ngroups = i; } if (exflags & MNT_EXRDONLY) *rdonlyp = 1; else *rdonlyp = 0; nfsrv_vmio(*vpp); if (!lockflag) VOP_UNLOCK(*vpp); return (0); } #endif /* NFS_NOSERVER */ /* * This function compares two net addresses by family and returns TRUE * if they are the same host. * If there is any doubt, return FALSE. * The AF_INET family is handled as a special case so that address mbufs * don't need to be saved to store "struct in_addr", which is only 4 bytes. */ int netaddr_match(family, haddr, nam) int family; union nethostaddr *haddr; struct mbuf *nam; { register struct sockaddr_in *inetaddr; switch (family) { case AF_INET: inetaddr = mtod(nam, struct sockaddr_in *); if (inetaddr->sin_family == AF_INET && inetaddr->sin_addr.s_addr == haddr->had_inetaddr) return (1); break; #ifdef ISO case AF_ISO: { register struct sockaddr_iso *isoaddr1, *isoaddr2; isoaddr1 = mtod(nam, struct sockaddr_iso *); isoaddr2 = mtod(haddr->had_nam, struct sockaddr_iso *); if (isoaddr1->siso_family == AF_ISO && isoaddr1->siso_nlen > 0 && isoaddr1->siso_nlen == isoaddr2->siso_nlen && SAME_ISOADDR(isoaddr1, isoaddr2)) return (1); break; } #endif /* ISO */ default: break; }; return (0); } static nfsuint64 nfs_nullcookie = { 0, 0 }; /* * This function finds the directory cookie that corresponds to the * logical byte offset given. */ nfsuint64 * nfs_getcookie(np, off, add) register struct nfsnode *np; off_t off; int add; { register struct nfsdmap *dp, *dp2; register int pos; pos = off / NFS_DIRBLKSIZ; if (pos == 0) { #ifdef DIAGNOSTIC if (add) panic("nfs getcookie add at 0"); #endif return (&nfs_nullcookie); } pos--; dp = np->n_cookies.lh_first; if (!dp) { if (add) { MALLOC(dp, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp->ndm_eocookie = 0; LIST_INSERT_HEAD(&np->n_cookies, dp, ndm_list); } else return ((nfsuint64 *)0); } while (pos >= NFSNUMCOOKIES) { pos -= NFSNUMCOOKIES; if (dp->ndm_list.le_next) { if (!add && dp->ndm_eocookie < NFSNUMCOOKIES && pos >= dp->ndm_eocookie) return ((nfsuint64 *)0); dp = dp->ndm_list.le_next; } else if (add) { MALLOC(dp2, struct nfsdmap *, sizeof (struct nfsdmap), M_NFSDIROFF, M_WAITOK); dp2->ndm_eocookie = 0; LIST_INSERT_AFTER(dp, dp2, ndm_list); dp = dp2; } else return ((nfsuint64 *)0); } if (pos >= dp->ndm_eocookie) { if (add) dp->ndm_eocookie = pos + 1; else return ((nfsuint64 *)0); } return (&dp->ndm_cookies[pos]); } /* * Invalidate cached directory information, except for the actual directory * blocks (which are invalidated separately). * Done mainly to avoid the use of stale offset cookies. 
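 * (Note on the nfsrv_vmio() change below: vnode_pager_alloc() is now
 * handed an object size in pages rather than bytes.  round_page()
 * rounds va_size up to a PAGE_SIZE boundary and OFF_TO_IDX() shifts
 * right by PAGE_SHIFT, so with 4K pages a 5000-byte file becomes
 * round_page(5000) = 8192, i.e. an object of 2 pages.)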
*/ void nfs_invaldir(vp) register struct vnode *vp; { register struct nfsnode *np = VTONFS(vp); #ifdef DIAGNOSTIC if (vp->v_type != VDIR) panic("nfs: invaldir not dir"); #endif np->n_direofoffset = 0; np->n_cookieverf.nfsuquad[0] = 0; np->n_cookieverf.nfsuquad[1] = 0; if (np->n_cookies.lh_first) np->n_cookies.lh_first->ndm_eocookie = 0; } /* * The write verifier has changed (probably due to a server reboot), so all * B_NEEDCOMMIT blocks will have to be written again. Since they are on the * dirty block list as B_DELWRI, all this takes is clearing the B_NEEDCOMMIT * flag. Once done the new write verifier can be set for the mount point. */ void nfs_clearcommit(mp) struct mount *mp; { register struct vnode *vp, *nvp; register struct buf *bp, *nbp; int s; s = splbio(); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { if (vp->v_mount != mp) /* Paranoia */ goto loop; nvp = vp->v_mntvnodes.le_next; for (bp = vp->v_dirtyblkhd.lh_first; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; if ((bp->b_flags & (B_BUSY | B_DELWRI | B_NEEDCOMMIT)) == (B_DELWRI | B_NEEDCOMMIT)) bp->b_flags &= ~B_NEEDCOMMIT; } } splx(s); } #ifndef NFS_NOSERVER /* * Map errnos to NFS error numbers. For Version 3 also filter out error * numbers not specified for the associated procedure. */ int nfsrv_errmap(nd, err) struct nfsrv_descript *nd; register int err; { register short *defaulterrp, *errp; if (nd->nd_flag & ND_NFSV3) { if (nd->nd_procnum <= NFSPROC_COMMIT) { errp = defaulterrp = nfsrv_v3errmap[nd->nd_procnum]; while (*++errp) { if (*errp == err) return (err); else if (*errp > err) break; } return ((int)*defaulterrp); } else return (err & 0xffff); } if (err <= ELAST) return ((int)nfsrv_v2errmap[err - 1]); return (NFSERR_IO); } int nfsrv_vmio(struct vnode *vp) { vm_object_t object; if ((vp == NULL) || (vp->v_type != VREG)) return 1; retry: if ((vp->v_flag & VVMIO) == 0) { struct vattr vat; struct proc *p = curproc; if (VOP_GETATTR(vp, &vat, p->p_ucred, p) != 0) panic("nfsrv_vmio: VOP_GETATTR failed"); - (void) vnode_pager_alloc(vp, vat.va_size, 0, 0); + (void) vnode_pager_alloc(vp, OFF_TO_IDX(round_page(vat.va_size)), 0, 0); vp->v_flag |= VVMIO; } else { if ((object = vp->v_object) && (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "nfdead", 0); goto retry; } if (!object) panic("nfsrv_vmio: VMIO object missing"); vm_object_reference(object); } return 0; } int nfsrv_vput(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vput(vp); vm_object_deallocate(vp->v_object); } else { vput(vp); } return 0; } int nfsrv_vrele(struct vnode *vp) { if ((vp->v_flag & VVMIO) && vp->v_object) { vrele(vp); vm_object_deallocate(vp->v_object); } else { vrele(vp); } return 0; } #endif /* NFS_NOSERVER */ Index: head/sys/sys/bio.h =================================================================== --- head/sys/sys/bio.h (revision 13489) +++ head/sys/sys/bio.h (revision 13490) @@ -1,248 +1,249 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.7 (Berkeley) 1/21/94 - * $Id: buf.h,v 1.25 1995/12/11 04:57:20 dyson Exp $ + * $Id: buf.h,v 1.26 1995/12/28 23:34:28 davidg Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #define NOLIST ((struct buf *)0x87654321) struct buf; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; typedef TAILQ_HEAD(buf_queue_head, buf) buf_queue_head, *buf_queue_head_t; /* * The buffer header describes an I/O operation in the kernel. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_actf, **b_actb; /* Device driver queue when active. *depricated* XXX */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_usecount; /* buffer use count */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ struct { caddr_t b_addr; /* Memory, superblocks, indirect etc. */ } b_un; void *b_saveaddr; /* Original b_addr for physio. */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); /* For nested b_iodone's. */ struct iodone_chain *b_iodone_chain; struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ int b_validoff; /* Offset in buffer of valid region. 
*/ int b_validend; /* Offset of end of valid region. */ daddr_t b_pblkno; /* physical block number */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_spc; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[(MAXPHYS + PAGE_SIZE - 1)/PAGE_SIZE]; int b_npages; }; /* Device driver compatibility definitions. */ #define b_active b_bcount /* Driver queue head: drive active. */ #define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ #define b_errcnt b_resid /* Retry count while I/O in progress. */ #define iodone biodone /* Old name for biodone. */ #define iowait biowait /* Old name for biowait. */ /* * These flags are kept in b_flags. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_GATHERED 0x00001000 /* LFS: already in a segment. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_BOUNCE 0x80000000 /* bounce buffer flag */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((unsigned long)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ extern LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_LRU 2 /* useful buffers */ #define QUEUE_VMIO 3 /* VMIO buffers */ #define QUEUE_AGE 4 /* not-useful buffers */ #define QUEUE_EMPTY 5 /* empty buffer headers*/ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. 
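 * (This revision also adds a declaration for bqrelse() below --
 * apparently a lighter-weight companion to brelse() that returns a
 * still-valid buffer straight to its free queue.)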
*/ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern TAILQ_HEAD(swqueue, buf) bswlist; __BEGIN_DECLS void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void brelse __P((struct buf *)); +void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((void)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); u_int minphys __P((struct buf *)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages(struct buf *); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); void reassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((void)); void vm_bounce_alloc __P((struct buf *)); void vm_bounce_free __P((struct buf *)); vm_offset_t vm_bounce_kva_alloc __P((int)); void vm_bounce_kva_alloc_free __P((vm_offset_t, int)); __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ Index: head/sys/sys/buf.h =================================================================== --- head/sys/sys/buf.h (revision 13489) +++ head/sys/sys/buf.h (revision 13490) @@ -1,248 +1,249 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)buf.h 8.7 (Berkeley) 1/21/94 - * $Id: buf.h,v 1.25 1995/12/11 04:57:20 dyson Exp $ + * $Id: buf.h,v 1.26 1995/12/28 23:34:28 davidg Exp $ */ #ifndef _SYS_BUF_H_ #define _SYS_BUF_H_ #include #define NOLIST ((struct buf *)0x87654321) struct buf; struct iodone_chain { long ic_prev_flags; void (*ic_prev_iodone) __P((struct buf *)); void *ic_prev_iodone_chain; struct { long ia_long; void *ia_ptr; } ic_args[5]; }; typedef TAILQ_HEAD(buf_queue_head, buf) buf_queue_head, *buf_queue_head_t; /* * The buffer header describes an I/O operation in the kernel. */ struct buf { LIST_ENTRY(buf) b_hash; /* Hash chain. */ LIST_ENTRY(buf) b_vnbufs; /* Buffer's associated vnode. */ TAILQ_ENTRY(buf) b_freelist; /* Free list position if not active. */ struct buf *b_actf, **b_actb; /* Device driver queue when active. *depricated* XXX */ TAILQ_ENTRY(buf) b_act; /* Device driver queue when active. *new* */ struct proc *b_proc; /* Associated proc; NULL if kernel. */ long b_flags; /* B_* flags. */ unsigned short b_qindex; /* buffer queue index */ unsigned char b_usecount; /* buffer use count */ int b_error; /* Errno value. */ long b_bufsize; /* Allocated buffer size. */ long b_bcount; /* Valid bytes in buffer. */ long b_resid; /* Remaining I/O. */ dev_t b_dev; /* Device associated with buffer. */ struct { caddr_t b_addr; /* Memory, superblocks, indirect etc. */ } b_un; void *b_saveaddr; /* Original b_addr for physio. */ daddr_t b_lblkno; /* Logical block number. */ daddr_t b_blkno; /* Underlying physical block number. */ /* Function to call upon completion. */ void (*b_iodone) __P((struct buf *)); /* For nested b_iodone's. */ struct iodone_chain *b_iodone_chain; struct vnode *b_vp; /* Device vnode. */ int b_dirtyoff; /* Offset in buffer of dirty region. */ int b_dirtyend; /* Offset of end of dirty region. */ struct ucred *b_rcred; /* Read credentials reference. */ struct ucred *b_wcred; /* Write credentials reference. */ int b_validoff; /* Offset in buffer of valid region. */ int b_validend; /* Offset of end of valid region. 
*/ daddr_t b_pblkno; /* physical block number */ caddr_t b_savekva; /* saved kva for transfer while bouncing */ void *b_driver1; /* for private use by the driver */ void *b_driver2; /* for private use by the driver */ void *b_spc; union cluster_info { TAILQ_HEAD(cluster_list_head, buf) cluster_head; TAILQ_ENTRY(buf) cluster_entry; } b_cluster; struct vm_page *b_pages[(MAXPHYS + PAGE_SIZE - 1)/PAGE_SIZE]; int b_npages; }; /* Device driver compatibility definitions. */ #define b_active b_bcount /* Driver queue head: drive active. */ #define b_data b_un.b_addr /* b_un.b_addr is not changeable. */ #define b_errcnt b_resid /* Retry count while I/O in progress. */ #define iodone biodone /* Old name for biodone. */ #define iowait biowait /* Old name for biowait. */ /* * These flags are kept in b_flags. */ #define B_AGE 0x00000001 /* Move to age queue when I/O done. */ #define B_APPENDWRITE 0x00000002 /* Append-write in progress. */ #define B_ASYNC 0x00000004 /* Start I/O, do not wait. */ #define B_BAD 0x00000008 /* Bad block revectoring in progress. */ #define B_BUSY 0x00000010 /* I/O in progress. */ #define B_CACHE 0x00000020 /* Bread found us in the cache. */ #define B_CALL 0x00000040 /* Call b_iodone from biodone. */ #define B_DELWRI 0x00000080 /* Delay I/O until buffer reused. */ #define B_DIRTY 0x00000100 /* Dirty page to be pushed out async. */ #define B_DONE 0x00000200 /* I/O completed. */ #define B_EINTR 0x00000400 /* I/O was interrupted */ #define B_ERROR 0x00000800 /* I/O error occurred. */ #define B_GATHERED 0x00001000 /* LFS: already in a segment. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_LOCKED 0x00004000 /* Locked in core (not reusable). */ #define B_NOCACHE 0x00008000 /* Do not cache block after use. */ #define B_MALLOC 0x00010000 /* malloced b_data */ #define B_CLUSTEROK 0x00020000 /* Pagein op, so swap() can count it. */ #define B_PHYS 0x00040000 /* I/O to user memory. */ #define B_RAW 0x00080000 /* Set by physio for raw transfers. */ #define B_READ 0x00100000 /* Read buffer. */ #define B_TAPE 0x00200000 /* Magnetic tape I/O. */ #define B_RELBUF 0x00400000 /* Release VMIO buffer. */ #define B_WANTED 0x00800000 /* Process wants this buffer. */ #define B_WRITE 0x00000000 /* Write buffer (pseudo flag). */ #define B_WRITEINPROG 0x01000000 /* Write in progress. */ #define B_XXX 0x02000000 /* Debugging flag. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ #define B_BOUNCE 0x80000000 /* bounce buffer flag */ /* * number of buffer hash entries */ #define BUFHSZ 512 /* * buffer hash table calculation, originally by David Greenman */ #define BUFHASH(vnp, bn) \ (&bufhashtbl[(((unsigned long)(vnp) >> 7)+(int)(bn)) % BUFHSZ]) /* * Definitions for the buffer free lists. */ #define BUFFER_QUEUES 6 /* number of free buffer queues */ extern LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; #define QUEUE_NONE 0 /* on no queue */ #define QUEUE_LOCKED 1 /* locked buffers */ #define QUEUE_LRU 2 /* useful buffers */ #define QUEUE_VMIO 3 /* VMIO buffers */ #define QUEUE_AGE 4 /* not-useful buffers */ #define QUEUE_EMPTY 5 /* empty buffer headers*/ /* * Zero out the buffer's data area. */ #define clrbuf(bp) { \ bzero((bp)->b_data, (u_int)(bp)->b_bcount); \ (bp)->b_resid = 0; \ } /* Flags to low-level allocation routines. 
*/ #define B_CLRBUF 0x01 /* Request allocated buffer be cleared. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ #ifdef KERNEL extern int nbuf; /* The number of buffer headers */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ extern struct buf *swbuf; /* Swap I/O buffer headers. */ extern int nswbuf; /* Number of swap I/O buffer headers. */ extern TAILQ_HEAD(swqueue, buf) bswlist; __BEGIN_DECLS void bufinit __P((void)); void bremfree __P((struct buf *)); int bread __P((struct vnode *, daddr_t, int, struct ucred *, struct buf **)); int breadn __P((struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **)); int bwrite __P((struct buf *)); void bdwrite __P((struct buf *)); void bawrite __P((struct buf *)); void brelse __P((struct buf *)); +void bqrelse __P((struct buf *)); int vfs_bio_awrite __P((struct buf *)); struct buf * getpbuf __P((void)); struct buf *incore __P((struct vnode *, daddr_t)); struct buf *gbincore __P((struct vnode *, daddr_t)); int inmem __P((struct vnode *, daddr_t)); struct buf *getblk __P((struct vnode *, daddr_t, int, int, int)); struct buf *geteblk __P((int)); int allocbuf __P((struct buf *, int)); int biowait __P((struct buf *)); void biodone __P((struct buf *)); void cluster_callback __P((struct buf *)); int cluster_read __P((struct vnode *, u_quad_t, daddr_t, long, struct ucred *, struct buf **)); int cluster_wbuild __P((struct vnode *, long, daddr_t, int)); void cluster_write __P((struct buf *, u_quad_t)); int physio __P((void (*)(struct buf *), struct buf *, dev_t, int, u_int (*)(struct buf *), struct uio *)); u_int minphys __P((struct buf *)); void vfs_bio_clrbuf __P((struct buf *)); void vfs_busy_pages __P((struct buf *, int clear_modify)); void vfs_unbusy_pages(struct buf *); void vwakeup __P((struct buf *)); void vmapbuf __P((struct buf *)); void vunmapbuf __P((struct buf *)); void relpbuf __P((struct buf *)); void brelvp __P((struct buf *)); void bgetvp __P((struct vnode *, struct buf *)); void pbgetvp __P((struct vnode *, struct buf *)); void pbrelvp __P((struct buf *)); void reassignbuf __P((struct buf *, struct vnode *)); struct buf *trypbuf __P((void)); void vm_bounce_alloc __P((struct buf *)); void vm_bounce_free __P((struct buf *)); vm_offset_t vm_bounce_kva_alloc __P((int)); void vm_bounce_kva_alloc_free __P((vm_offset_t, int)); __END_DECLS #endif #endif /* !_SYS_BUF_H_ */ Index: head/sys/sys/vnode.h =================================================================== --- head/sys/sys/vnode.h (revision 13489) +++ head/sys/sys/vnode.h (revision 13490) @@ -1,431 +1,432 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 - * $Id: vnode.h,v 1.27 1995/12/17 21:23:44 phk Exp $ + * $Id: vnode.h,v 1.28 1995/12/25 07:24:13 bde Exp $ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD }; /* * Vnode tag types. * These are for the benefit of external programs only (e.g., pstat) * and should NEVER be inspected by the kernel. */ enum vtagtype { VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_PC, VT_LFS, VT_LOFS, VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS, VT_AFS, VT_ISOFS, VT_UNION, VT_MSDOSFS, VT_DEVFS }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). 
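*/

/*
 * Editorial sketch (not part of the original header): the usual shape of
 * the per-filesystem private area mentioned above.  "struct myfsnode"
 * and the two macros are hypothetical; compare VTOI()/ITOV() in
 * ufs/ufs/inode.h, which follow the same pattern.
 */
#ifdef notdef
struct myfsnode {
	struct vnode *n_vnode;	/* back pointer to the owning vnode */
	/* ... filesystem-specific state ... */
};
#define	VTOMYFS(vp)	((struct myfsnode *)(vp)->v_data)
#define	MYFSTOV(np)	((np)->n_vnode)
#endif

/* Per-vnode buffer lists and the vnode structure itself follow.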
*/ LIST_HEAD(buflists, buf); typedef int vop_t __P((void *)); struct vnode { u_long v_flag; /* vnode flags (see below) */ short v_usecount; /* reference count of users */ short v_writecount; /* reference count of writers */ long v_holdcnt; /* page & buffer references */ daddr_t v_lastr; /* last read (read-ahead) */ u_long v_id; /* capability identifier */ struct mount *v_mount; /* ptr to vfs we are in */ vop_t **v_op; /* vnode operations vector */ TAILQ_ENTRY(vnode) v_freelist; /* vnode freelist */ LIST_ENTRY(vnode) v_mntvnodes; /* vnodes for mount point */ struct buflists v_cleanblkhd; /* clean blocklist head */ struct buflists v_dirtyblkhd; /* dirty blocklist head */ long v_numoutput; /* num of writes in progress */ enum vtype v_type; /* vnode type */ union { struct mount *vu_mountedhere;/* ptr to mounted vfs (VDIR) */ struct socket *vu_socket; /* unix ipc (VSOCK) */ struct specinfo *vu_specinfo; /* device (VCHR, VBLK) */ struct fifoinfo *vu_fifoinfo; /* fifo (VFIFO) */ } v_un; struct nqlease *v_lease; /* Soft reference to lease */ daddr_t v_lastw; /* last write (write cluster) */ daddr_t v_cstart; /* start block of cluster */ daddr_t v_lasta; /* last allocation */ int v_clen; /* length of current cluster */ int v_ralen; /* Read-ahead length */ + int v_usage; /* Vnode usage counter */ daddr_t v_maxra; /* last readahead block */ void *v_object; /* Place to store VM object */ enum vtagtype v_tag; /* type of underlying data */ void *v_data; /* private data for fs */ }; #define v_mountedhere v_un.vu_mountedhere #define v_socket v_un.vu_socket #define v_specinfo v_un.vu_specinfo #define v_fifoinfo v_un.vu_fifoinfo /* * Vnode flags. */ #define VROOT 0x0001 /* root of its file system */ #define VTEXT 0x0002 /* vnode is a pure text prototype */ #define VSYSTEM 0x0004 /* vnode being used by kernel */ #define VOLOCK 0x0008 /* vnode is locked waiting for an object */ #define VOWANT 0x0010 /* a process is waiting for VOLOCK */ #define VXLOCK 0x0100 /* vnode is locked to change underlying type */ #define VXWANT 0x0200 /* process is waiting for vnode */ #define VBWAIT 0x0400 /* waiting for output to complete */ #define VALIASED 0x0800 /* vnode has an alias */ #define VDIROP 0x1000 /* LFS: vnode is involved in a directory op */ #define VVMIO 0x2000 /* VMIO flag */ #define VNINACT 0x4000 /* LFS: skip ufs_inactive() in lfs_vunref */ #define VAGE 0x8000 /* Insert vnode at head of free list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* file's access mode and type */ short va_nlink; /* number of references to file */ uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ long va_fsid; /* file system id (dev for now) */ long va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags.
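*/

/*
 * Editorial sketch (not part of the original header): how setattr
 * callers use struct vattr.  VATTR_NULL() (defined below, KERNEL only)
 * fills every field with VNOVAL, so only fields assigned afterwards are
 * acted upon.  example_chmod() is a hypothetical name, and VOP_SETATTR
 * comes from vnode_if.h, which is included further below.
 */
#ifdef notdef
static int
example_chmod(struct vnode *vp, struct ucred *cred, struct proc *p)
{
	struct vattr va;

	VATTR_NULL(&va);	/* every field becomes VNOVAL */
	va.va_mode = 0644;	/* change only the file mode */
	return (VOP_SETATTR(vp, &va, cred, p));
}
#endif

/* Flags for va_vaflags, continued.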
*/ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ /* * Flags for ioflag. */ #define IO_UNIT 0x01 /* do I/O as atomic unit */ #define IO_APPEND 0x02 /* append write to end */ #define IO_SYNC 0x04 /* do I/O synchronously */ #define IO_NODELOCKED 0x08 /* underlying node already locked */ #define IO_NDELAY 0x10 /* FNDELAY flag set in file table */ #define IO_VMIO 0x20 /* data already in VMIO space */ /* * Modes. Some values same as Ixxx entries from inode.h for now. */ #define VSUID 04000 /* set user id on execution */ #define VSGID 02000 /* set group id on execution */ #define VSVTX 01000 /* save swapped text even after use */ #define VREAD 00400 /* read, write, execute permissions */ #define VWRITE 00200 #define VEXEC 00100 /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) #ifdef KERNEL /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writeable files */ #define DOCLOSE 0x0008 /* vclean: close active files */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_SAVEMETA 0x0002 /* vinvalbuf: leave indirect blocks */ #ifdef DIAGNOSTIC #define HOLDRELE(vp) holdrele(vp) #define VATTR_NULL(vap) vattr_null(vap) #define VHOLD(vp) vhold(vp) #define VREF(vp) vref(vp) void holdrele __P((struct vnode *)); void vhold __P((struct vnode *)); #else #define HOLDRELE(vp) (vp)->v_holdcnt-- /* decrease buf or page ref */ #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #define VHOLD(vp) (vp)->v_holdcnt++ /* increase buf or page ref */ #define VREF(vp) (vp)->v_usecount++ /* increase reference */ #endif #define NULLVP ((struct vnode *)NULL) #ifdef VFS_LKM #define VNODEOP_SET(f) DATA_SET(MODVNOPS,f) #else #define VNODEOP_SET(f) DATA_SET(vfs_opv_descs_,f) #endif /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern int desiredvnodes; /* number of vnodes desired */ extern int prtactive; /* nonzero to call vprint() */ extern struct vattr va_null; /* predefined null vattr structure */ /* * Macro/function to check for client cache inconsistency w.r.t. leasing. */ #define LEASE_READ 0x1 /* Check lease for readers */ #define LEASE_WRITE 0x2 /* Check lease for modifiers */ extern void (*lease_check) __P((struct vnode *vp, struct proc *p, struct ucred *ucred, int flag)); extern void (*lease_updatetime) __P((int deltat)); #ifdef NFS #ifdef NQNFS #define LEASE_CHECK(vp, p, cred, flag) lease_check((vp), (p), (cred), (flag)) #define LEASE_UPDATETIME(dt) lease_updatetime(dt) #else #define LEASE_CHECK(vp, p, cred, flag) #define LEASE_UPDATETIME(dt) #endif /* NQNFS */ #else #define LEASE_CHECK(vp, p, cred, flag) \ do { if(lease_check) lease_check((vp), (p), (cred), (flag)); } while(0) #define LEASE_UPDATETIME(dt) \ do { if(lease_updatetime) lease_updatetime(dt); } while(0) #endif /* NFS */ #endif /* KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments.
*/ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 #define VDESC_NOMAP_VPP 0x0100 #define VDESC_VPP_WILLRELE 0x0200 /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { int vdesc_offset; /* offset in vector--first for speed */ char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_proc_offset; /* proc location, if any */ int vdesc_componentname_offset; /* if any */ /* * Finally, we've got a list of private data (about each operation) * for each transport layer. (Support to manage this list is not * yet part of BSD.) */ caddr_t *vdesc_transports; }; #ifdef KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; /* * This macro is very helpful in defining those offsets in the vdesc struct. * * This is stolen from X11R4. I ignored all the fancy stuff for * Crays, so if you decide to port this to such a serious machine, * you might want to consult Intrinsic.h's XtOffset{,Of,To}. */ #define VOPARG_OFFSET(p_type,field) \ ((int) (((char *) (&(((p_type)NULL)->field))) - ((char *) NULL))) #define VOPARG_OFFSETOF(s_type,field) \ VOPARG_OFFSET(s_type*,field) #define VOPARG_OFFSETTO(S_TYPE,S_OFFSET,STRUCT_P) \ ((S_TYPE)(((char*)(STRUCT_P))+(S_OFFSET))) /* * This structure is used to configure the new vnodeops vector. */ struct vnodeopv_entry_desc { struct vnodeop_desc *opve_op; /* which operation this is */ vop_t *opve_impl; /* code implementing this operation */ }; struct vnodeopv_desc { /* ptr to the ptr to the vector where op should go */ vop_t ***opv_desc_vector_p; struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */ }; /* * A default routine which just returns an error. */ int vn_default_error __P((void)); /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; /* * VOCALL calls an op given an ops vector. We break it out because BSD's * vclean changes the ops vector and then wants to call ops with the old * vector. */ #define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP)) /* * This call works for vnodes in the kernel. */ #define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP)) #define VDESC(OP) (& __CONCAT(OP,_desc)) #define VOFFSET(OP) (VDESC(OP)->vdesc_offset) /* * Finally, include the default set of vnode operations. */ #include <vnode_if.h> /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct proc; struct stat; struct ucred; struct uio; struct vattr; struct vnode; struct vop_bwrite_args; int bdevvp __P((dev_t dev, struct vnode **vpp)); /* cache_* may belong in namei.h.
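*/

/*
 * Editorial sketch (not part of the original header): roughly what a
 * generated wrapper in vnode_if.h expands to, using the VDESC/VOFFSET/
 * VCALL macros above.  The argument-structure layout follows the
 * vop_generic_args convention (a_desc first); names are illustrative.
 */
#ifdef notdef
static int
example_VOP_FSYNC(struct vnode *vp, struct ucred *cred, int waitfor,
    struct proc *p)
{
	struct vop_fsync_args a;

	a.a_desc = VDESC(vop_fsync);	/* describes this operation */
	a.a_vp = vp;
	a.a_cred = cred;
	a.a_waitfor = waitfor;
	a.a_p = p;
	/* Index this vnode's op vector by the op's offset and call it. */
	return (VCALL(vp, VOFFSET(vop_fsync), &a));
}
#endif

/* Name-cache entry points; as noted above, cache_* may belong in namei.h.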
*/ void cache_enter __P((struct vnode *dvp, struct vnode *vp, struct componentname *cnp)); int cache_lookup __P((struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)); void cache_purge __P((struct vnode *vp)); void cache_purgevfs __P((struct mount *mp)); struct vnode * checkalias __P((struct vnode *vp, dev_t nvp_rdev, struct mount *mp)); int getnewvnode __P((enum vtagtype tag, struct mount *mp, vop_t **vops, struct vnode **vpp)); void insmntque __P((struct vnode *vp, struct mount *mp)); void vattr_null __P((struct vattr *vap)); int vcount __P((struct vnode *vp)); int vfinddev __P((dev_t dev, enum vtype type, struct vnode **vpp)); void vfs_opv_init __P((struct vnodeopv_desc **them)); int vget __P((struct vnode *vp, int lockflag)); void vgone __P((struct vnode *vp)); void vgoneall __P((struct vnode *vp)); int vinvalbuf __P((struct vnode *vp, int save, struct ucred *cred, struct proc *p, int slpflag, int slptimeo)); int vn_bwrite __P((struct vop_bwrite_args *ap)); int vn_close __P((struct vnode *vp, int flags, struct ucred *cred, struct proc *p)); int vn_open __P((struct nameidata *ndp, int fmode, int cmode)); int vn_rdwr __P((enum uio_rw rw, struct vnode *vp, caddr_t base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *cred, int *aresid, struct proc *p)); int vn_stat __P((struct vnode *vp, struct stat *sb, struct proc *p)); void vn_vmio_close __P((struct vnode *vp)); int vn_writechk __P((struct vnode *vp)); void vprint __P((char *label, struct vnode *vp)); void vput __P((struct vnode *vp)); void vref __P((struct vnode *vp)); void vrele __P((struct vnode *vp)); #endif /* KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: head/sys/ufs/ffs/ffs_balloc.c =================================================================== --- head/sys/ufs/ffs/ffs_balloc.c (revision 13489) +++ head/sys/ufs/ffs/ffs_balloc.c (revision 13490) @@ -1,291 +1,291 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_balloc.c 8.4 (Berkeley) 9/23/93 - * $Id: ffs_balloc.c,v 1.8 1995/05/28 04:32:23 davidg Exp $ + * $Id: ffs_balloc.c,v 1.9 1995/05/30 08:14:59 rgrimes Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Balloc defines the structure of file system storage * by allocating the physical blocks on a device given * the inode and the logical block number in a file. */ int ffs_balloc(ip, bn, size, cred, bpp, flags) register struct inode *ip; register daddr_t bn; int size; struct ucred *cred; struct buf **bpp; int flags; { register struct fs *fs; register daddr_t nb; struct buf *bp, *nbp; struct vnode *vp = ITOV(ip); struct indir indirs[NIADDR + 2]; daddr_t newb, lbn, *bap, pref; int osize, nsize, num, i, error; *bpp = NULL; if (bn < 0) return (EFBIG); fs = ip->i_fs; lbn = bn; /* * If the next write will extend the file into a new block, * and the file is currently composed of a fragment * this fragment has to be extended to be a full block. */ nb = lblkno(fs, ip->i_size); if (nb < NDADDR && nb < bn) { osize = blksize(fs, ip, nb); if (osize < fs->fs_bsize && osize > 0) { error = ffs_realloccg(ip, nb, ffs_blkpref(ip, nb, (int)nb, &ip->i_db[0]), osize, (int)fs->fs_bsize, cred, &bp); if (error) return (error); ip->i_size = (nb + 1) * fs->fs_bsize; ip->i_db[nb] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; if (flags & B_SYNC) bwrite(bp); else bawrite(bp); } } /* * The first NDADDR blocks are direct blocks */ if (bn < NDADDR) { nb = ip->i_db[bn]; if (nb != 0 && ip->i_size >= (bn + 1) * fs->fs_bsize) { error = bread(vp, bn, fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); *bpp = bp; return (0); } if (nb != 0) { /* * Consider need to reallocate a fragment. */ osize = fragroundup(fs, blkoff(fs, ip->i_size)); nsize = fragroundup(fs, size); if (nsize <= osize) { error = bread(vp, bn, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bp->b_blkno = fsbtodb(fs, nb); } else { error = ffs_realloccg(ip, bn, ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]), osize, nsize, cred, &bp); if (error) return (error); } } else { if (ip->i_size < (bn + 1) * fs->fs_bsize) nsize = fragroundup(fs, size); else nsize = fs->fs_bsize; error = ffs_alloc(ip, bn, ffs_blkpref(ip, bn, (int)bn, &ip->i_db[0]), nsize, cred, &newb); if (error) return (error); bp = getblk(vp, bn, nsize, 0, 0); bp->b_blkno = fsbtodb(fs, newb); if (flags & B_CLRBUF) vfs_bio_clrbuf(bp); } ip->i_db[bn] = dbtofsb(fs, bp->b_blkno); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bpp = bp; return (0); } /* * Determine the number of levels of indirection. */ pref = 0; error = ufs_getlbns(vp, bn, indirs, &num); if (error) return(error); #ifdef DIAGNOSTIC if (num < 1) panic ("ffs_balloc: ufs_bmaparray returned indirect block"); #endif /* * Fetch the first indirect block allocating if necessary. 
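* Editorial note (added): ufs_getlbns() filled indirs[] with the path of
 * indirect blocks leading to logical block bn; indirs[0].in_off names
 * the slot in ip->i_ib[], and indirs[1..num] give the blocks to visit at
 * each deeper level.  The code below walks that path, allocating missing
 * blocks as it goes; note that, as of this revision, an indirect block
 * that is only read (not modified) is dropped with the newly declared
 * bqrelse() instead of brelse(), returning it to its queue without
 * pushing it toward reuse.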
*/ --num; nb = ip->i_ib[indirs[0].in_off]; if (nb == 0) { pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb); if (error) return (error); nb = newb; bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0); bp->b_blkno = fsbtodb(fs, newb); vfs_bio_clrbuf(bp); /* * Write synchronously so that indirect blocks * never point at garbage. */ error = bwrite(bp); if (error) { ffs_blkfree(ip, nb, fs->fs_bsize); return (error); } ip->i_ib[indirs[0].in_off] = newb; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * Fetch through the indirect blocks, allocating as necessary. */ for (i = 1;;) { error = bread(vp, indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } bap = (daddr_t *)bp->b_data; nb = bap[indirs[i].in_off]; if (i == num) break; i += 1; if (nb != 0) { - brelse(bp); + bqrelse(bp); continue; } if (pref == 0) pref = ffs_blkpref(ip, lbn, 0, (daddr_t *)0); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb); if (error) { brelse(bp); return (error); } nb = newb; nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); vfs_bio_clrbuf(nbp); /* * Write synchronously so that indirect blocks * never point at garbage. */ error = bwrite(nbp); if (error) { ffs_blkfree(ip, nb, fs->fs_bsize); brelse(bp); return (error); } bap[indirs[i - 1].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } } /* * Get the data block, allocating if necessary. */ if (nb == 0) { pref = ffs_blkpref(ip, lbn, indirs[i].in_off, &bap[0]); error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize, cred, &newb); if (error) { brelse(bp); return (error); } nb = newb; nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); if (flags & B_CLRBUF) vfs_bio_clrbuf(nbp); bap[indirs[i].in_off] = nb; /* * If required, write synchronously, otherwise use * delayed write. */ if (flags & B_SYNC) { bwrite(bp); } else { bdwrite(bp); } *bpp = nbp; return (0); } brelse(bp); if (flags & B_CLRBUF) { error = bread(vp, lbn, (int)fs->fs_bsize, NOCRED, &nbp); if (error) { brelse(nbp); return (error); } } else { nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0); nbp->b_blkno = fsbtodb(fs, nb); } *bpp = nbp; return (0); } Index: head/sys/ufs/ffs/ffs_inode.c =================================================================== --- head/sys/ufs/ffs/ffs_inode.c (revision 13489) +++ head/sys/ufs/ffs/ffs_inode.c (revision 13490) @@ -1,522 +1,522 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_inode.c 8.5 (Berkeley) 12/30/93 - * $Id: ffs_inode.c,v 1.18 1995/12/11 04:57:37 dyson Exp $ + * $Id: ffs_inode.c,v 1.19 1996/01/05 18:31:48 wollman Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_indirtrunc __P((struct inode *, daddr_t, daddr_t, daddr_t, int, long *)); int ffs_init() { return (ufs_init()); } /* * Update the access, modified, and inode change times as specified by the * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively. The IN_MODIFIED * flag is used to specify that the inode needs to be updated even if none * of the times needs to be updated. The access and modified times are taken * from the second and third parameters; the inode change time is always * taken from the current time. If waitfor is set, then wait for the disk * write of the inode to complete. */ int ffs_update(ap) struct vop_update_args /* { struct vnode *a_vp; struct timeval *a_access; struct timeval *a_modify; int a_waitfor; } */ *ap; { register struct fs *fs; struct buf *bp; struct inode *ip; int error; time_t tv_sec; ip = VTOI(ap->a_vp); if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY) { ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); return (0); } if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0) return (0); /* * Use a copy of the current time to get consistent timestamps * (a_access and a_modify are sometimes aliases for &time). * * XXX in 2.0, a_access and a_modify are often pointers to the * same copy of `time'. This is not as good. Some callers forget * to make a copy; others make a copy too early (before the i/o * has completed)... * * XXX there should be a function or macro for reading the time * (e.g., some machines may require splclock()). */ tv_sec = time.tv_sec; if (ip->i_flag & IN_ACCESS) ip->i_atime.ts_sec = (ap->a_access == &time ? tv_sec : ap->a_access->tv_sec); if (ip->i_flag & IN_UPDATE) { ip->i_mtime.ts_sec = (ap->a_modify == &time ? tv_sec : ap->a_modify->tv_sec); ip->i_modrev++; } if (ip->i_flag & IN_CHANGE) ip->i_ctime.ts_sec = tv_sec; ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE); fs = ip->i_fs; /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. 
*/ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_din.di_ouid = ip->i_uid; /* XXX */ ip->i_din.di_ogid = ip->i_gid; /* XXX */ } /* XXX */ error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = ip->i_din; if (ap->a_waitfor && (ap->a_vp->v_mount->mnt_flag & MNT_ASYNC) == 0) return (bwrite(bp)); else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); return (0); } } #define SINGLE 0 /* index of single indirect block */ #define DOUBLE 1 /* index of double indirect block */ #define TRIPLE 2 /* index of triple indirect block */ /* * Truncate the inode oip to at most length size, freeing the * disk blocks. */ int ffs_truncate(ap) struct vop_truncate_args /* { struct vnode *a_vp; off_t a_length; int a_flags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *ovp = ap->a_vp; register daddr_t lastblock; register struct inode *oip; daddr_t bn, lbn, lastiblock[NIADDR], indir_lbn[NIADDR]; daddr_t oldblks[NDADDR + NIADDR], newblks[NDADDR + NIADDR]; off_t length = ap->a_length; register struct fs *fs; struct buf *bp; int offset, size, level; long count, nblocks, vflags, blocksreleased = 0; struct timeval tv; register int i; int aflags, error, allerror; off_t osize; oip = VTOI(ovp); fs = oip->i_fs; if (length < 0 || length > fs->fs_maxfilesize) return (EINVAL); tv = time; if (ovp->v_type == VLNK && (oip->i_size < ovp->v_mount->mnt_maxsymlinklen || oip->i_din.di_blocks == 0)) { #ifdef DIAGNOSTIC if (length != 0) panic("ffs_truncate: partial truncate of symlink"); #endif bzero((char *)&oip->i_shortlink, (u_int)oip->i_size); oip->i_size = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } if (oip->i_size == length) { oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 0)); } #ifdef QUOTA error = getinoquota(oip); if (error) return (error); #endif osize = oip->i_size; /* * Lengthen the size of the file. We must ensure that the * last byte of the file is allocated. Since the smallest * value of osize is 0, length will be at least 1. */ if (osize < length) { offset = blkoff(fs, length - 1); lbn = lblkno(fs, length - 1); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; + vnode_pager_setsize(ovp, length); error = ffs_balloc(oip, lbn, offset + 1, ap->a_cred, &bp, aflags); if (error) return (error); oip->i_size = length; if (aflags & B_SYNC) bwrite(bp); else if (ovp->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else bawrite(bp); - vnode_pager_setsize(ovp, length); oip->i_flag |= IN_CHANGE | IN_UPDATE; return (VOP_UPDATE(ovp, &tv, &tv, 1)); } /* * Shorten the size of the file. If the file is not being * truncated to a block boundary, the contents of the * partial block following the end of the file must be * zeroed in case it ever becomes accessible again because * of subsequent file growth.
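* Editorial worked example (added): with an 8192-byte block size,
 * truncating a file to length 5000 gives offset = blkoff(fs, 5000) =
 * 5000, so the code below re-allocates the final block and bzero()s
 * bytes 5000 through blksize - 1; if the file later grows back over
 * that range, readers see zeroes rather than the stale contents.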
*/ offset = blkoff(fs, length); if (offset == 0) { oip->i_size = length; } else { lbn = lblkno(fs, length); aflags = B_CLRBUF; if (ap->a_flags & IO_SYNC) aflags |= B_SYNC; error = ffs_balloc(oip, lbn, offset, ap->a_cred, &bp, aflags); if (error) return (error); oip->i_size = length; size = blksize(fs, oip, lbn); bzero((char *)bp->b_data + offset, (u_int)(size - offset)); allocbuf(bp, size); if (aflags & B_SYNC) bwrite(bp); else if (ovp->v_mount->mnt_flag & MNT_ASYNC) bdwrite(bp); else bawrite(bp); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers on disk before we start freeing * blocks. If we crash before free'ing blocks below, the blocks * will be returned to the free list. lastiblock values are also * normalized to -1 for calls to ffs_indirtrunc below. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)oldblks, sizeof oldblks); for (level = TRIPLE; level >= SINGLE; level--) if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } for (i = NDADDR - 1; i > lastblock; i--) oip->i_db[i] = 0; oip->i_flag |= IN_CHANGE | IN_UPDATE; - error = VOP_UPDATE(ovp, &tv, &tv, 0); + error = VOP_UPDATE(ovp, &tv, &tv, ((length > 0) ? 0 : 1)); if (error) allerror = error; /* * Having written the new inode to disk, save its new configuration * and put back the old block pointers long enough to process them. * Note that we save the new block configuration so we can check it * when we are done. */ bcopy((caddr_t)&oip->i_db[0], (caddr_t)newblks, sizeof newblks); bcopy((caddr_t)oldblks, (caddr_t)&oip->i_db[0], sizeof oldblks); oip->i_size = osize; vflags = ((length > 0) ? V_SAVE : 0) | V_SAVEMETA; allerror = vinvalbuf(ovp, vflags, ap->a_cred, ap->a_p, 0, 0); /* * Indirect blocks first. */ indir_lbn[SINGLE] = -NDADDR; indir_lbn[DOUBLE] = indir_lbn[SINGLE] - NINDIR(fs) - 1; indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - NINDIR(fs) * NINDIR(fs) - 1; for (level = TRIPLE; level >= SINGLE; level--) { bn = oip->i_ib[level]; if (bn != 0) { error = ffs_indirtrunc(oip, indir_lbn[level], fsbtodb(fs, bn), lastiblock[level], level, &count); if (error) allerror = error; blocksreleased += count; if (lastiblock[level] < 0) { oip->i_ib[level] = 0; ffs_blkfree(oip, bn, fs->fs_bsize); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = NDADDR - 1; i > lastblock; i--) { register long bsize; bn = oip->i_db[i]; if (bn == 0) continue; oip->i_db[i] = 0; bsize = blksize(fs, oip, i); ffs_blkfree(oip, bn, bsize); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = oip->i_db[lastblock]; if (bn != 0) { long oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, oip, lastblock); oip->i_size = length; newspace = blksize(fs, oip, lastblock); if (newspace == 0) panic("ffs_truncate: newspace"); if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. 
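* Editorial worked example (added): with 8192-byte blocks and 1024-byte
 * frags, shrinking the last block from oldspace = 8192 to newspace =
 * 2048 keeps numfrags(fs, 2048) = 2 frags, so the code below frees
 * oldspace - newspace = 6144 bytes starting at bn + 2, i.e. frags 2
 * through 7 of the old block.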
*/ bn += numfrags(fs, newspace); ffs_blkfree(oip, bn, oldspace - newspace); blocksreleased += btodb(oldspace - newspace); } } done: #ifdef DIAGNOSTIC for (level = SINGLE; level <= TRIPLE; level++) if (newblks[NDADDR + level] != oip->i_ib[level]) panic("ffs_truncate1"); for (i = 0; i < NDADDR; i++) if (newblks[i] != oip->i_db[i]) panic("ffs_truncate2"); if (length == 0 && (ovp->v_dirtyblkhd.lh_first || ovp->v_cleanblkhd.lh_first)) panic("ffs_truncate3"); #endif /* DIAGNOSTIC */ /* * Put back the real size. */ oip->i_size = length; oip->i_blocks -= blocksreleased; if (oip->i_blocks < 0) /* sanity */ oip->i_blocks = 0; oip->i_flag |= IN_CHANGE; vnode_pager_setsize(ovp, length); #ifdef QUOTA (void) chkdq(oip, -blocksreleased, NOCRED, 0); #endif return (allerror); } /* * Release blocks associated with the inode ip and stored in the indirect * block bn. Blocks are free'd in LIFO order up to (but not including) * lastbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * NB: triple indirect blocks are untested. */ static int ffs_indirtrunc(ip, lbn, dbn, lastbn, level, countp) register struct inode *ip; daddr_t lbn, lastbn; daddr_t dbn; int level; long *countp; { register int i; struct buf *bp; register struct fs *fs = ip->i_fs; register daddr_t *bap; struct vnode *vp; daddr_t *copy, nb, nlbn, last; long blkcount, factor; int nblocks, blocksreleased = 0; int error = 0, allerror = 0; /* * Calculate index in current block of last * block to be kept. -1 indicates the entire * block so we need not calculate the index. */ factor = 1; for (i = SINGLE; i < level; i++) factor *= NINDIR(fs); last = lastbn; if (lastbn > 0) last /= factor; nblocks = btodb(fs->fs_bsize); /* * Get buffer of block pointers, zero those entries corresponding * to blocks to be free'd, and update on disk copy first. Since * double(triple) indirect before single(double) indirect, calls * to bmap on these blocks will fail. However, we already have * the on disk address, so we have to set the b_blkno field * explicitly instead of letting bread do everything for us. */ vp = ITOV(ip); bp = getblk(vp, lbn, (int)fs->fs_bsize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { curproc->p_stats->p_ru.ru_inblock++; /* pay for read */ bp->b_flags |= B_READ; if (bp->b_bcount > bp->b_bufsize) panic("ffs_indirtrunc: bad buffer size"); bp->b_blkno = dbn; vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); error = biowait(bp); } if (error) { brelse(bp); *countp = 0; return (error); } bap = (daddr_t *)bp->b_data; MALLOC(copy, daddr_t *, fs->fs_bsize, M_TEMP, M_WAITOK); bcopy((caddr_t)bap, (caddr_t)copy, (u_int)fs->fs_bsize); bzero((caddr_t)&bap[last + 1], (u_int)(NINDIR(fs) - (last + 1)) * sizeof (daddr_t)); if (last == -1) bp->b_flags |= B_INVAL; if ((vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { error = bwrite(bp); } else { bawrite(bp); error = 0; } if (error) allerror = error; bap = copy; /* * Recursively free totally unused blocks. */ for (i = NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last; i--, nlbn += factor) { nb = bap[i]; if (nb == 0) continue; if (level > SINGLE) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), (daddr_t)-1, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } ffs_blkfree(ip, nb, fs->fs_bsize); blocksreleased += nblocks; } /* * Recursively free last partial block. 
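* Editorial note (added): the loop above freed every entry past "last",
 * leaving i == last on exit.  bap[i] is the one indirect block that may
 * still map live data; "lastbn % factor" (factor being NINDIR(fs)
 * raised to the indirection level) converts lastbn into an index valid
 * one level down, and the recursive call prunes that surviving subtree.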
*/ if (level > SINGLE && lastbn >= 0) { last = lastbn % factor; nb = bap[i]; if (nb != 0) { error = ffs_indirtrunc(ip, nlbn, fsbtodb(fs, nb), last, level - 1, &blkcount); if (error) allerror = error; blocksreleased += blkcount; } } FREE(copy, M_TEMP); *countp = blocksreleased; return (allerror); } Index: head/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c (revision 13489) +++ head/sys/ufs/ffs/ffs_vfsops.c (revision 13490) @@ -1,1047 +1,1047 @@ /* * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 - * $Id: ffs_vfsops.c,v 1.33 1996/01/05 18:31:49 wollman Exp $ + * $Id: ffs_vfsops.c,v 1.34 1996/01/14 18:54:59 bde Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ffs_sbupdate __P((struct ufsmount *, int)); static int ffs_reload __P((struct mount *,struct ucred *,struct proc *)); static int ffs_oldfscompat __P((struct fs *)); static int ffs_mount __P((struct mount *, char *, caddr_t, struct nameidata *, struct proc *)); struct vfsops ufs_vfsops = { ffs_mount, ufs_start, ffs_unmount, ufs_root, ufs_quotactl, ffs_statfs, ffs_sync, ffs_vget, ffs_fhtovp, ffs_vptofh, ffs_init, }; VFS_SET(ufs_vfsops, ufs, MOUNT_UFS, 0); extern u_long nextgennumber; /* * ffs_mount * * Called when mounting local physical media * * PARAMETERS: * mountroot * mp mount point structure * path NULL (flag for root mount!!!) 
* data * ndp * p process (user credentials check [statfs]) * * mount * mp mount point structure * path path to mount point * data pointer to argument struct in user space * ndp mount point namei() return (used for * credentials on reload), reused to look * up block device. * p process (user credentials check) * * RETURNS: 0 Success * !0 error number (errno.h) * * LOCK STATE: * * ENTRY * mount point is locked * EXIT * mount point is locked * * NOTES: * A NULL path can be used for a flag since the mount * system call will fail with EFAULT in copyinstr in * namei() if it is a genuine NULL from the user. */ static int ffs_mount( mp, path, data, ndp, p) register struct mount *mp; /* mount struct pointer*/ char *path; /* path to mount point*/ caddr_t data; /* arguments to FS specific mount*/ struct nameidata *ndp; /* mount point credentials*/ struct proc *p; /* process requesting mount*/ { u_int size; int err = 0; struct vnode *devvp; struct ufs_args args; struct ufsmount *ump = 0; register struct fs *fs; int flags; /* * Use NULL path to flag a root mount */ if( path == NULL) { /* *** * Mounting root file system *** */ /* Get vnode for root device*/ if( bdevvp( rootdev, &rootvp)) panic("ffs_mountroot: can't setup bdevvp for root"); /* * FS specific handling */ mp->mnt_flag |= MNT_RDONLY; /* XXX globally applicable?*/ /* * Attempt mount */ if( ( err = ffs_mountfs(rootvp, mp, p)) != 0) { /* fs specific cleanup (if any)*/ goto error_1; } goto dostatfs; /* success*/ } /* *** * Mounting non-root file system or updating a file system *** */ /* copy in user arguments*/ err = copyin(data, (caddr_t)&args, sizeof (struct ufs_args)); if (err) goto error_1; /* can't get arguments*/ /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; err = 0; if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) { flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (vfs_busy(mp)) { err = EBUSY; goto error_1; } err = ffs_flushfiles(mp, flags, p); vfs_unbusy(mp); } if (!err && (mp->mnt_flag & MNT_RELOAD)) err = ffs_reload(mp, ndp->ni_cnd.cn_cred, p); if (err) { goto error_1; } if (fs->fs_ronly && (mp->mnt_flag & MNT_WANTRDWR)) { if (!fs->fs_clean) { if (mp->mnt_flag & MNT_FORCE) { printf("WARNING: %s was not properly dismounted.\n",fs->fs_fsmnt); } else { printf("WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck.\n", fs->fs_fsmnt); err = EPERM; goto error_1; } } fs->fs_ronly = 0; } if (fs->fs_ronly == 0) { fs->fs_clean = 0; ffs_sbupdate(ump, MNT_WAIT); } /* if not updating name...*/ if (args.fspec == 0) { /* * Process export requests. Jumping to "success" * will return the vfs_export() error code. */ err = vfs_export(mp, &ump->um_export, &args.export); goto success; } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible block device. 
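* Editorial note (added): NDINIT()/namei() below translate the
 * user-supplied args.fspec path into a vnode; the checks that follow
 * reject anything that is not a block device (VBLK) whose major number
 * is within nblkdev, since only such a vnode can back a UFS mount.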
*/ NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args.fspec, p); err = namei(ndp); if (err) { /* can't get devvp!*/ goto error_1; } devvp = ndp->ni_vp; if (devvp->v_type != VBLK) { err = ENOTBLK; goto error_2; } if (major(devvp->v_rdev) >= nblkdev) { err = ENXIO; goto error_2; } if (mp->mnt_flag & MNT_UPDATE) { /* ******************** * UPDATE ******************** */ if (devvp != ump->um_devvp) err = EINVAL; /* needs translation */ else vrele(devvp); /* * Update device name only on success */ if( !err) { /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); } } else { /* ******************** * NEW MOUNT ******************** */ /* * Since this is a new mount, we want the names for * the device and the mount point copied in. If an * error occurs, the mountpoint is discarded by the * upper level code. */ /* Save "last mounted on" info for mount point (NULL pad)*/ copyinstr( path, /* mount point*/ mp->mnt_stat.f_mntonname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntonname + size, MNAMELEN - size); /* Save "mounted from" info for mount point (NULL pad)*/ copyinstr( args.fspec, /* device name*/ mp->mnt_stat.f_mntfromname, /* save area*/ MNAMELEN - 1, /* max size*/ &size); /* real size*/ bzero( mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); err = ffs_mountfs(devvp, mp, p); } if (err) { goto error_2; } dostatfs: /* * Initialize FS stat information in mount struct; uses both * mp->mnt_stat.f_mntonname and mp->mnt_stat.f_mntfromname * * This code is common to root and non-root mounts */ (void)VFS_STATFS(mp, &mp->mnt_stat, p); goto success; error_2: /* error with devvp held*/ /* release devvp before failing*/ vrele(devvp); error_1: /* no state to back out*/ success: return( err); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. */ static int ffs_reload(mp, cred, p) register struct mount *mp; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp, *devvp; struct inode *ip; struct csum *space; struct buf *bp; struct fs *fs; int i, blks, size, error; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; if (vinvalbuf(devvp, 0, cred, p, 0, 0)) panic("ffs_reload: dirty1"); /* * Step 2: re-read superblock from disk. */ error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp); if (error) return (error); fs = (struct fs *)bp->b_data; if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || fs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOUFS(mp)->um_fs; bcopy(&fs->fs_csp[0], &((struct fs *)bp->b_data)->fs_csp[0], sizeof(fs->fs_csp)); bcopy(bp->b_data, fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; brelse(bp); ffs_oldfscompat(fs); /* * Step 3: re-read summary information from disk. 
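* Editorial note (added): the cylinder-group summary area is fs_cssize
 * bytes long and is read back one full block (fs_frag frags) at a
 * time; the final transfer is trimmed to (blks - i) * fs_fsize so the
 * short tail is not over-read.  The same loop shape appears in
 * ffs_mountfs() and ffs_sbupdate() below.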
*/ blks = howmany(fs->fs_cssize, fs->fs_fsize); space = fs->fs_csp[0]; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, fs->fs_csp[fragstoblks(fs, i)], (u_int)size); brelse(bp); } loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { nvp = vp->v_mntvnodes.le_next; /* * Step 4: invalidate all inactive vnodes. */ if (vp->v_usecount == 0) { vgone(vp); continue; } /* * Step 5: invalidate all cached file data. */ if (vget(vp, 1)) goto loop; if (vinvalbuf(vp, 0, cred, p, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 6: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); return (error); } ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); brelse(bp); vput(vp); if (vp->v_mount != mp) goto loop; } return (0); } /* * Common code for mount and mountroot */ int ffs_mountfs(devvp, mp, p) register struct vnode *devvp; struct mount *mp; struct proc *p; { register struct ufsmount *ump; struct buf *bp; register struct fs *fs; dev_t dev = devvp->v_rdev; struct partinfo dpart; caddr_t base, space; int havepart = 0, blks; int error, i, size; int ronly; u_int strsize; /* * Disallow multiple mounts of the same device. * Disallow mounting of a device that is currently in use * (except for root, which might share swap device for miniroot). * Flush out any old buffers remaining from a previous use. */ error = vfs_mountedon(devvp); if (error) return (error); if (vcount(devvp) > 1 && devvp != rootvp) return (EBUSY); error = vinvalbuf(devvp, V_SAVE, p->p_ucred, p, 0, 0); if (error) return (error); ronly = (mp->mnt_flag & MNT_RDONLY) != 0; error = VOP_OPEN(devvp, ronly ? FREAD : FREAD|FWRITE, FSCRED, p); if (error) return (error); if (VOP_IOCTL(devvp, DIOCGPART, (caddr_t)&dpart, FREAD, NOCRED, p) != 0) size = DEV_BSIZE; else { havepart = 1; size = dpart.disklab->d_secsize; } bp = NULL; ump = NULL; error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp); if (error) goto out; fs = (struct fs *)bp->b_data; if (fs->fs_magic != FS_MAGIC || fs->fs_bsize > MAXBSIZE || fs->fs_bsize < sizeof(struct fs)) { error = EINVAL; /* XXX needs translation */ goto out; } if (!fs->fs_clean) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s was not properly dismounted.\n",fs->fs_fsmnt); } else { printf("WARNING: R/W mount of %s denied. 
Filesystem is not clean - run fsck.\n",fs->fs_fsmnt); error = EPERM; goto out; } } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK); bzero((caddr_t)ump, sizeof *ump); ump->um_fs = malloc((u_long)fs->fs_sbsize, M_UFSMNT, M_WAITOK); bcopy(bp->b_data, ump->um_fs, (u_int)fs->fs_sbsize); if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; brelse(bp); bp = NULL; fs = ump->um_fs; fs->fs_ronly = ronly; if (ronly == 0) { fs->fs_fmod = 1; fs->fs_clean = 0; } blks = howmany(fs->fs_cssize, fs->fs_fsize); base = space = malloc((u_long)fs->fs_cssize, M_UFSMNT, M_WAITOK); for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) { free(base, M_UFSMNT); goto out; } bcopy(bp->b_data, space, (u_int)size); fs->fs_csp[fragstoblks(fs, i)] = (struct csum *)space; space += size; brelse(bp); bp = NULL; } mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = (long)dev; mp->mnt_stat.f_fsid.val[1] = MOUNT_UFS; mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; mp->mnt_flag |= MNT_LOCAL; ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; devvp->v_specflags |= SI_MOUNTEDON; ffs_oldfscompat(fs); /* * Set FS local "last mounted on" information (NULL pad) */ copystr( mp->mnt_stat.f_mntonname, /* mount point*/ fs->fs_fsmnt, /* copy area*/ sizeof(fs->fs_fsmnt) - 1, /* max size*/ &strsize); /* real size*/ bzero( fs->fs_fsmnt + strsize, sizeof(fs->fs_fsmnt) - strsize); if( mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) ffs_sbupdate(ump, MNT_WAIT); return (0); out: if (bp) brelse(bp); (void)VOP_CLOSE(devvp, ronly ? FREAD : FREAD|FWRITE, NOCRED, p); if (ump) { free(ump->um_fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; } return (error); } /* * Sanity checks for old file systems. * * XXX - goes away some day. */ static int ffs_oldfscompat(fs) struct fs *fs; { fs->fs_npsect = max(fs->fs_npsect, fs->fs_nsect); /* XXX */ fs->fs_interleave = max(fs->fs_interleave, 1); /* XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ fs->fs_nrpos = 8; /* XXX */ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ #if 0 int i; /* XXX */ quad_t sizepb = fs->fs_bsize; /* XXX */ fs->fs_maxfilesize = fs->fs_bsize * NDADDR - 1; /* XXX */ for (i = 0; i < NIADDR; i++) { /* XXX */ sizepb *= NINDIR(fs); /* XXX */ fs->fs_maxfilesize += sizepb; /* XXX */ } /* XXX */ #endif fs->fs_maxfilesize = (u_quad_t) 1LL << 39; fs->fs_qbmask = ~fs->fs_bmask; /* XXX */ fs->fs_qfmask = ~fs->fs_fmask; /* XXX */ } /* XXX */ return (0); } /* * unmount system call */ int ffs_unmount(mp, mntflags, p) struct mount *mp; int mntflags; struct proc *p; { register struct ufsmount *ump; register struct fs *fs; int error, flags, ronly; flags = 0; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } error = ffs_flushfiles(mp, flags, p); if (error) return (error); ump = VFSTOUFS(mp); fs = ump->um_fs; ronly = fs->fs_ronly; if (!ronly) { fs->fs_clean = 1; ffs_sbupdate(ump, MNT_WAIT); } ump->um_devvp->v_specflags &= ~SI_MOUNTEDON; error = VOP_CLOSE(ump->um_devvp, ronly ? 
FREAD : FREAD|FWRITE, NOCRED, p); /* vrele(ump->um_devvp); */ vn_vmio_close(ump->um_devvp); free(fs->fs_csp[0], M_UFSMNT); free(fs, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = (qaddr_t)0; mp->mnt_flag &= ~MNT_LOCAL; return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, p) register struct mount *mp; int flags; struct proc *p; { register struct ufsmount *ump; int error; if (!doforce) flags &= ~FORCECLOSE; ump = VFSTOUFS(mp); #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, NULLVP, SKIPSYSTEM|flags); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP) continue; quotaoff(p, mp, i); } /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } #endif error = vflush(mp, NULLVP, flags); return (error); } /* * Get file system statistics. */ int ffs_statfs(mp, sbp, p) struct mount *mp; register struct statfs *sbp; struct proc *p; { register struct ufsmount *ump; register struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_MAGIC) panic("ffs_statfs"); sbp->f_type = MOUNT_UFS; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree; sbp->f_bavail = freespace(fs, fs->fs_minfree); sbp->f_files = fs->fs_ncg * fs->fs_ipg - ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree; if (sbp != &mp->mnt_stat) { bcopy((caddr_t)mp->mnt_stat.f_mntonname, (caddr_t)&sbp->f_mntonname[0], MNAMELEN); bcopy((caddr_t)mp->mnt_stat.f_mntfromname, (caddr_t)&sbp->f_mntfromname[0], MNAMELEN); } return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ int ffs_sync(mp, waitfor, cred, p) struct mount *mp; int waitfor; struct ucred *cred; struct proc *p; { register struct vnode *vp, *nvp; register struct inode *ip; register struct ufsmount *ump = VFSTOUFS(mp); register struct fs *fs; struct timeval tv; int error, allerror = 0; fs = ump->um_fs; /* * Write back modified superblock. * Consistency check that the superblock * is still in the buffer cache. */ if (fs->fs_fmod != 0) { if (fs->fs_ronly != 0) { /* XXX */ printf("fs = %s\n", fs->fs_fsmnt); panic("update: rofs mod"); } fs->fs_fmod = 0; fs->fs_time = time.tv_sec; allerror = ffs_sbupdate(ump, waitfor); } /* * Write back each (modified) inode. */ loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * If the vnode that we are about to sync is no longer * associated with this mount point, start over. */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; if (VOP_ISLOCKED(vp)) continue; ip = VTOI(vp); if ((((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0)) && vp->v_dirtyblkhd.lh_first == NULL) continue; if (vp->v_type != VCHR) { if (vget(vp, 1)) goto loop; error = VOP_FSYNC(vp, cred, waitfor, p); if (error) allerror = error; vput(vp); } else { tv = time; /* VOP_UPDATE(vp, &tv, &tv, waitfor == MNT_WAIT); */ VOP_UPDATE(vp, &tv, &tv, 0); } } /* * Force stale file system control information to be flushed. */ error = VOP_FSYNC(ump->um_devvp, cred, waitfor, p); if (error) allerror = error; #ifdef QUOTA qsync(mp); #endif return (allerror); } /* * Look up a FFS dinode number to find its incore vnode, otherwise read it * in from disk. 
If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ffs_inode_hash_lock; int ffs_vget(mp, ino, vpp) struct mount *mp; ino_t ino; struct vnode **vpp; { register struct fs *fs; register struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; dev_t dev; int type, error; ump = VFSTOUFS(mp); dev = ump->um_dev; restart: if ((*vpp = ufs_ihashget(dev, ino)) != NULL) return (0); /* * Lock out the creation of new entries in the FFS hash table in * case getnewvnode() or MALLOC() blocks, otherwise a duplicate * may occur! */ if (ffs_inode_hash_lock) { while (ffs_inode_hash_lock) { ffs_inode_hash_lock = -1; tsleep(&ffs_inode_hash_lock, PVM, "ffsvgt", 0); } goto restart; } ffs_inode_hash_lock = 1; /* Allocate a new vnode/inode. */ error = getnewvnode(VT_UFS, mp, ffs_vnodeop_p, &vp); if (error) { if (ffs_inode_hash_lock < 0) wakeup(&ffs_inode_hash_lock); ffs_inode_hash_lock = 0; *vpp = NULL; return (error); } type = ump->um_devvp->v_tag == VT_MFS ? M_MFSNODE : M_FFSNODE; /* XXX */ MALLOC(ip, struct inode *, sizeof(struct inode), type, M_WAITOK); bzero((caddr_t)ip, sizeof(struct inode)); vp->v_data = ip; ip->i_vnode = vp; ip->i_fs = fs = ump->um_fs; ip->i_dev = dev; ip->i_number = ino; #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif /* * Put it onto its hash chain and lock it so that other requests for * this inode will block if they arrive while we are sleeping waiting * for old data structures to be purged or for the contents of the * disk portion of this inode to be read. */ ufs_ihashins(ip); if (ffs_inode_hash_lock < 0) wakeup(&ffs_inode_hash_lock); ffs_inode_hash_lock = 0; /* Read in the disk contents for the inode, copy into the inode. */ error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ - vput(vp); brelse(bp); + vput(vp); *vpp = NULL; return (error); } ip->i_din = *((struct dinode *)bp->b_data + ino_to_fsbo(fs, ino)); - brelse(bp); + bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, ffs_specop_p, FFS_FIFOOPS, &vp); if (error) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization now that aliasing has been resolved. */ ip->i_devvp = ump->um_devvp; VREF(ip->i_devvp); /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { if (++nextgennumber < (u_long)time.tv_sec) nextgennumber = time.tv_sec; ip->i_gen = nextgennumber; if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) ip->i_flag |= IN_MODIFIED; } /* * Ensure that uid and gid are correct. This is a temporary * fix until fsck has been changed to do the update. 
*/ if (fs->fs_inodefmt < FS_44INODEFMT) { /* XXX */ ip->i_uid = ip->i_din.di_ouid; /* XXX */ ip->i_gid = ip->i_din.di_ogid; /* XXX */ } /* XXX */ *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ int ffs_fhtovp(mp, fhp, nam, vpp, exflagsp, credanonp) register struct mount *mp; struct fid *fhp; struct mbuf *nam; struct vnode **vpp; int *exflagsp; struct ucred **credanonp; { register struct ufid *ufhp; struct fs *fs; ufhp = (struct ufid *)fhp; fs = VFSTOUFS(mp)->um_fs; if (ufhp->ufid_ino < ROOTINO || ufhp->ufid_ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); return (ufs_check_export(mp, ufhp, nam, vpp, exflagsp, credanonp)); } /* * Vnode pointer to File handle */ /* ARGSUSED */ int ffs_vptofh(vp, fhp) struct vnode *vp; struct fid *fhp; { register struct inode *ip; register struct ufid *ufhp; ip = VTOI(vp); ufhp = (struct ufid *)fhp; ufhp->ufid_len = sizeof(struct ufid); ufhp->ufid_ino = ip->i_number; ufhp->ufid_gen = ip->i_gen; return (0); } /* * Write a superblock and associated information back to disk. */ static int ffs_sbupdate(mp, waitfor) struct ufsmount *mp; int waitfor; { register struct fs *fs = mp->um_fs; register struct buf *bp; int blks; caddr_t space; int i, size, error = 0; bp = getblk(mp->um_devvp, SBLOCK, (int)fs->fs_sbsize, 0, 0); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); /* Restore compatibility to old file systems. XXX */ if (fs->fs_postblformat == FS_42POSTBLFMT) /* XXX */ ((struct fs *)bp->b_data)->fs_nrpos = -1; /* XXX */ if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); blks = howmany(fs->fs_cssize, fs->fs_fsize); space = (caddr_t)fs->fs_csp[0]; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0); bcopy(space, bp->b_data, (u_int)size); space += size; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } return (error); } Index: head/sys/ufs/ufs/ufs_bmap.c =================================================================== --- head/sys/ufs/ufs/ufs_bmap.c (revision 13489) +++ head/sys/ufs/ufs/ufs_bmap.c (revision 13490) @@ -1,317 +1,317 @@ /* * Copyright (c) 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_bmap.c 8.6 (Berkeley) 1/21/94 - * $Id: ufs_bmap.c,v 1.9 1995/09/04 00:21:09 dyson Exp $ + * $Id: ufs_bmap.c,v 1.10 1995/11/05 23:07:37 dyson Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include /* * Bmap converts the logical block number of a file to its physical block * number on the disk. The conversion is done by using the logical block * number to index into the array of block pointers described by the dinode. */ int ufs_bmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct vnode **a_vpp; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { /* * Check for underlying vnode requests and ensure that logical * to physical mapping is requested. */ if (ap->a_vpp != NULL) *ap->a_vpp = VTOI(ap->a_vp)->i_devvp; if (ap->a_bnp == NULL) return (0); return (ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL, ap->a_runp, ap->a_runb)); } /* * Indirect blocks are now on the vnode for the file. They are given negative * logical block numbers. Indirect blocks are addressed by the negative * address of the first data block to which they point. Double indirect blocks * are addressed by one less than the address of the first indirect block to * which they point. Triple indirect blocks are addressed by one less than * the address of the first double indirect block to which they point. * * ufs_bmaparray does the bmap conversion, and if requested returns the * array of logical blocks which must be traversed to get to a block. * Each entry contains the offset into that block that gets you to the * next block and the disk address of the block (if it is assigned).
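 *
 * (Worked example, not from the original comment, assuming an 8K
 * filesystem block with 2048 four-byte pointers per indirect block
 * and NDADDR == 12: the single indirect block covering data blocks
 * 12..2059 lives at logical block -12; the double indirect block
 * covering data blocks 2060..4196363 lives at -2061, one less than
 * -2060, the address of the first single indirect block it points
 * to; the triple indirect block follows the same rule one level up.)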
*/ int ufs_bmaparray(vp, bn, bnp, ap, nump, runp, runb) struct vnode *vp; register daddr_t bn; daddr_t *bnp; struct indir *ap; int *nump; int *runp; int *runb; { register struct inode *ip; struct buf *bp; struct ufsmount *ump; struct mount *mp; struct vnode *devvp; struct indir a[NIADDR+1], *xap; daddr_t daddr; long metalbn; int error, maxrun = 0, num; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); #ifdef DIAGNOSTIC if (ap != NULL && nump == NULL || ap == NULL && nump != NULL) panic("ufs_bmaparray: invalid arguments"); #endif if (runp) { /* * XXX * If MAXPHYS is the largest transfer the disks can handle, * we probably want maxrun to be 1 block less so that we * don't create a block larger than the device can handle. */ *runp = 0; maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1; } if (runb) { *runb = 0; } xap = ap == NULL ? a : ap; if (!nump) nump = # error = ufs_getlbns(vp, bn, xap, nump); if (error) return (error); num = *nump; if (num == 0) { *bnp = blkptrtodb(ump, ip->i_db[bn]); if (*bnp == 0) *bnp = -1; else if (runp) { daddr_t bnb = bn; for (++bn; bn < NDADDR && *runp < maxrun && is_sequential(ump, ip->i_db[bn - 1], ip->i_db[bn]); ++bn, ++*runp); bn = bnb; if (runb && (bn > 0)) { for (--bn; (bn >= 0) && (*runb < maxrun) && is_sequential(ump, ip->i_db[bn], ip->i_db[bn+1]); --bn, ++*runb); } } return (0); } /* Get disk address out of indirect block array */ daddr = ip->i_ib[xap->in_off]; devvp = VFSTOUFS(vp->v_mount)->um_devvp; for (bp = NULL, ++xap; --num; ++xap) { /* * Exit the loop if there is no disk address assigned yet and * the indirect block isn't in the cache, or if we were * looking for an indirect block and we've found it. */ metalbn = xap->in_lbn; if ((daddr == 0 && !incore(vp, metalbn)) || metalbn == bn) break; /* * If we get here, we've either got the block in the cache * or we have a disk address for it, go fetch it. */ if (bp) - brelse(bp); + bqrelse(bp); xap->in_exists = 1; bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { #ifdef DIAGNOSTIC if (!daddr) panic("ufs_bmaparry: indirect block not in cache"); #endif bp->b_blkno = blkptrtodb(ump, daddr); bp->b_flags |= B_READ; vfs_busy_pages(bp, 0); VOP_STRATEGY(bp); curproc->p_stats->p_ru.ru_inblock++; /* XXX */ error = biowait(bp); if (error) { brelse(bp); return (error); } } daddr = ((daddr_t *)bp->b_data)[xap->in_off]; if (num == 1 && daddr && runp) { for (bn = xap->in_off + 1; bn < MNINDIR(ump) && *runp < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn - 1], ((daddr_t *)bp->b_data)[bn]); ++bn, ++*runp); bn = xap->in_off; if (runb && bn) { for(--bn; bn > 0 && *runb < maxrun && is_sequential(ump, ((daddr_t *)bp->b_data)[bn], ((daddr_t *)bp->b_data)[bn+1]); --bn, ++*runb); } } } if (bp) - brelse(bp); + bqrelse(bp); daddr = blkptrtodb(ump, daddr); *bnp = daddr == 0 ? -1 : daddr; return (0); } /* * Create an array of logical block number/offset pairs which represent the * path of indirect blocks required to access a data block. The first "pair" * contains the logical block number of the appropriate single, double or * triple indirect block and the offset into the inode indirect block array. * Note, the logical block number of the inode single/double/triple indirect * block appears twice in the array, once with the offset into the i_ib and * once with the offset into the page itself. 
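 *
 * (Worked example, illustrative only, with the same 2048-pointer,
 * NDADDR == 12 geometry as above: asking for data block 2065 -- the
 * block at slot 5 of the first single indirect block reached through
 * the double indirect path -- yields three entries:
 *
 *	{ in_lbn = -2061, in_off = 1 }	i_ib[1], the double indirect
 *	{ in_lbn = -2061, in_off = 0 }	slot 0 within that block
 *	{ in_lbn = -2060, in_off = 5 }	slot 5 of the single indirect
 *
 * after which the data block itself can be read.)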
*/ int ufs_getlbns(vp, bn, ap, nump) struct vnode *vp; register daddr_t bn; struct indir *ap; int *nump; { long metalbn, realbn; struct ufsmount *ump; int blockcnt, i, numlevels, off; ump = VFSTOUFS(vp->v_mount); if (nump) *nump = 0; numlevels = 0; realbn = bn; if ((long)bn < 0) bn = -(long)bn; /* The first NDADDR blocks are direct blocks. */ if (bn < NDADDR) return (0); /* * Determine the number of levels of indirection. After this loop * is done, blockcnt indicates the number of data blocks possible * at the given level of indirection, and NIADDR - i is the number * of levels of indirection needed to locate the requested block. */ for (blockcnt = 1, i = NIADDR, bn -= NDADDR;; i--, bn -= blockcnt) { if (i == 0) return (EFBIG); blockcnt *= MNINDIR(ump); if (bn < blockcnt) break; } /* Calculate the address of the first meta-block. */ if (realbn >= 0) metalbn = -(realbn - bn + NIADDR - i); else metalbn = -(-realbn - bn + NIADDR - i); /* * At each iteration, off is the offset into the bap array which is * an array of disk addresses at the current level of indirection. * The logical block number and the offset in that block are stored * into the argument array. */ ap->in_lbn = metalbn; ap->in_off = off = NIADDR - i; ap->in_exists = 0; ap++; for (++numlevels; i <= NIADDR; i++) { /* If searching for a meta-data block, quit when found. */ if (metalbn == realbn) break; blockcnt /= MNINDIR(ump); off = (bn / blockcnt) % MNINDIR(ump); ++numlevels; ap->in_lbn = metalbn; ap->in_off = off; ap->in_exists = 0; ++ap; metalbn -= -1 + off * blockcnt; } if (nump) *nump = numlevels; return (0); } Index: head/sys/ufs/ufs/ufs_readwrite.c =================================================================== --- head/sys/ufs/ufs/ufs_readwrite.c (revision 13489) +++ head/sys/ufs/ufs/ufs_readwrite.c (revision 13490) @@ -1,435 +1,438 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_readwrite.c 8.7 (Berkeley) 1/21/94 - * $Id: ufs_readwrite.c,v 1.18 1996/01/06 12:49:53 phk Exp $ + * $Id: ufs_readwrite.c,v 1.19 1996/01/07 09:42:36 phk Exp $ */ #ifdef LFS_READWRITE #define BLKSIZE(a, b, c) blksize(a) #define FS struct lfs #define I_FS i_lfs #define READ lfs_read #define READ_S "lfs_read" #define WRITE lfs_write #define WRITE_S "lfs_write" #define fs_bsize lfs_bsize #define fs_maxfilesize lfs_maxfilesize #else #define BLKSIZE(a, b, c) blksize(a, b, c) #define FS struct fs #define I_FS i_fs #define READ ffs_read #define READ_S "ffs_read" #define WRITE ffs_write #define WRITE_S "ffs_write" #include #include #include #endif /* * Vnode op for reading. */ /* ARGSUSED */ int READ(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct inode *ip; register struct uio *uio; register FS *fs; struct buf *bp; daddr_t lbn, nextlbn; off_t bytesinfile; long size, xfersize, blkoffset; int error; u_short mode; vp = ap->a_vp; ip = VTOI(vp); mode = ip->i_mode; uio = ap->a_uio; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("%s: mode", READ_S); if (vp->v_type == VLNK) { if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen) panic("%s: short symlink", READ_S); } else if (vp->v_type != VREG && vp->v_type != VDIR) panic("%s: type %d", READ_S, vp->v_type); #endif fs = ip->I_FS; if ((u_quad_t)uio->uio_offset > fs->fs_maxfilesize) return (EFBIG); for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0) break; lbn = lblkno(fs, uio->uio_offset); nextlbn = lbn + 1; size = BLKSIZE(fs, ip, lbn); blkoffset = blkoff(fs, uio->uio_offset); + xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (bytesinfile < xfersize) xfersize = bytesinfile; #ifdef LFS_READWRITE (void)lfs_check(vp, lbn); error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); #else if (lblktosize(fs, nextlbn) > ip->i_size) error = bread(vp, lbn, size, NOCRED, &bp); else if (doclusterread) error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp); else if (lbn - 1 == vp->v_lastr) { int nextsize = BLKSIZE(fs, ip, nextlbn); error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp); } else error = bread(vp, lbn, size, NOCRED, &bp); #endif - if (error) + if (error) { + brelse(bp); + bp = NULL; break; + } vp->v_lastr = lbn; /* * We should only get non-zero b_resid when an I/O error * has occurred, which should cause us to break above. * However, if the short read did not cause an error, * then we want to ensure that we do not uiomove bad * or uninitialized data. 
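 * (For instance: if a request for an 8192-byte block transfers only
 * the first 4096 bytes without B_ERROR being set, b_resid is 4096;
 * the clipping below shrinks size, and with it xfersize, so that
 * uiomove() never copies from the unread half of the buffer.)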
*/ size -= bp->b_resid; if (size < xfersize) { if (size == 0) break; xfersize = size; } if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; if (error) break; - brelse(bp); + bqrelse(bp); } if (bp != NULL) - brelse(bp); + bqrelse(bp); ip->i_flag |= IN_ACCESS; return (error); } /* * Vnode op for writing. */ int WRITE(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { register struct vnode *vp; register struct uio *uio; register struct inode *ip; register FS *fs; struct buf *bp; struct proc *p; daddr_t lbn; off_t osize; int blkoffset, error, flags, ioflag, resid, size, xfersize; struct timeval tv; ioflag = ap->a_ioflag; uio = ap->a_uio; vp = ap->a_vp; ip = VTOI(vp); #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_WRITE) panic("%s: mode", WRITE_S); #endif switch (vp->v_type) { case VREG: if (ioflag & IO_APPEND) uio->uio_offset = ip->i_size; if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) return (EPERM); /* FALLTHROUGH */ case VLNK: break; case VDIR: if ((ioflag & IO_SYNC) == 0) panic("%s: nonsync dir write", WRITE_S); break; default: panic("%s: type", WRITE_S); } fs = ip->I_FS; if (uio->uio_offset < 0 || (u_quad_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) return (EFBIG); /* * Maybe this should be above the vnode op call, but so long as * file servers have no limits, I don't think it matters. */ p = uio->uio_procp; if (vp->v_type == VREG && p && uio->uio_offset + uio->uio_resid > p->p_rlimit[RLIMIT_FSIZE].rlim_cur) { psignal(p, SIGXFSZ); return (EFBIG); } resid = uio->uio_resid; osize = ip->i_size; flags = ioflag & IO_SYNC ? B_SYNC : 0; for (error = 0; uio->uio_resid > 0;) { lbn = lblkno(fs, uio->uio_offset); blkoffset = blkoff(fs, uio->uio_offset); xfersize = fs->fs_bsize - blkoffset; if (uio->uio_resid < xfersize) xfersize = uio->uio_resid; if (uio->uio_offset + xfersize > ip->i_size) vnode_pager_setsize(vp, uio->uio_offset + xfersize); #ifdef LFS_READWRITE (void)lfs_check(vp, lbn); error = lfs_balloc(vp, xfersize, lbn, &bp); #else if (fs->fs_bsize > xfersize) flags |= B_CLRBUF; else flags &= ~B_CLRBUF; error = ffs_balloc(ip, lbn, blkoffset + xfersize, ap->a_cred, &bp, flags); #endif if (error) break; if (uio->uio_offset + xfersize > ip->i_size) { ip->i_size = uio->uio_offset + xfersize; } size = BLKSIZE(fs, ip, lbn) - bp->b_resid; if (size < xfersize) xfersize = size; if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag |= IN_RECURSE; error = uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio); if (uio->uio_segflg != UIO_NOCOPY) ip->i_flag &= ~IN_RECURSE; #ifdef LFS_READWRITE (void)VOP_BWRITE(bp); #else if (ioflag & IO_VMIO) bp->b_flags |= B_RELBUF; if (ioflag & IO_SYNC) { (void)bwrite(bp); - } else if (xfersize + blkoffset == fs->fs_bsize && - (vp->v_mount->mnt_flag & MNT_ASYNC) == 0) { + } else if (xfersize + blkoffset == fs->fs_bsize) { if (doclusterwrite) { bp->b_flags |= B_CLUSTEROK; cluster_write(bp, ip->i_size); } else { bawrite(bp); } } else { bp->b_flags |= B_CLUSTEROK; bdwrite(bp); } #endif if (error || xfersize == 0) break; ip->i_flag |= IN_CHANGE | IN_UPDATE; } /* * If we successfully wrote any data, and we are not the superuser * we clear the setuid and setgid bits as a precaution against * tampering. 
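 * (Example, for illustration: a 04755 set-uid executable written to
 * by its non-root owner comes out of the mask below as plain 0755,
 * since ip->i_mode &= ~(ISUID | ISGID) strips exactly the two
 * privileged bits while leaving the permission bits alone.)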
*/ if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0) ip->i_mode &= ~(ISUID | ISGID); if (error) { if (ioflag & IO_UNIT) { (void)VOP_TRUNCATE(vp, osize, ioflag & IO_SYNC, ap->a_cred, uio->uio_procp); uio->uio_offset -= resid - uio->uio_resid; uio->uio_resid = resid; } } else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) { tv = time; error = VOP_UPDATE(vp, &tv, &tv, 1); } return (error); } #ifndef LFS_READWRITE /* * get page routine */ int ffs_getpages(ap) struct vop_getpages_args *ap; { off_t foff, physoffset; int i, size, bsize; struct vnode *dp; int bbackwards, bforwards; int pbackwards, pforwards; int firstpage; int reqlblkno; daddr_t reqblkno; int poff; int pcount; int rtval; int pagesperblock; pcount = round_page(ap->a_count) / PAGE_SIZE; /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block * then, the entire page is valid -- */ if (ap->a_m[ap->a_reqpage]->valid) { ap->a_m[ap->a_reqpage]->valid = VM_PAGE_BITS_ALL; for (i = 0; i < pcount; i++) { if (i != ap->a_reqpage) vnode_pager_freepage(ap->a_m[i]); } return VM_PAGER_OK; } bsize = ap->a_vp->v_mount->mnt_stat.f_iosize; /* * foff is the file offset of the required page * reqlblkno is the logical block that contains the page * poff is the index of the page into the logical block */ foff = IDX_TO_OFF(ap->a_m[ap->a_reqpage]->pindex) + ap->a_offset; reqlblkno = foff / bsize; poff = (foff % bsize) / PAGE_SIZE; if ( VOP_BMAP( ap->a_vp, reqlblkno, &dp, &reqblkno, &bforwards, &bbackwards) || (reqblkno == -1)) { for(i = 0; i < pcount; i++) { if (i != ap->a_reqpage) vnode_pager_freepage(ap->a_m[i]); } if (reqblkno == -1) { if ((ap->a_m[ap->a_reqpage]->flags & PG_ZERO) == 0) vm_page_zero_fill(ap->a_m[ap->a_reqpage]); ap->a_m[ap->a_reqpage]->dirty = 0; ap->a_m[ap->a_reqpage]->valid = VM_PAGE_BITS_ALL; return VM_PAGER_OK; } else { return VM_PAGER_ERROR; } } physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE; pagesperblock = bsize / PAGE_SIZE; /* * find the first page that is contiguous... * note that pbackwards is the number of pages that are contiguous * backwards. */ firstpage = 0; if (ap->a_count) { pbackwards = poff + bbackwards * pagesperblock; if (ap->a_reqpage > pbackwards) { firstpage = ap->a_reqpage - pbackwards; for(i=0;ia_m[i]); } /* * pforwards is the number of pages that are contiguous * after the current page. */ pforwards = (pagesperblock - (poff + 1)) + bforwards * pagesperblock; if (pforwards < (pcount - (ap->a_reqpage + 1))) { for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++) vnode_pager_freepage(ap->a_m[i]); pcount = ap->a_reqpage + pforwards + 1; } /* * number of pages for I/O corrected for the non-contig pages at * the beginning of the array. */ pcount -= firstpage; } /* * calculate the size of the transfer */ size = pcount * PAGE_SIZE; if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > ((vm_object_t) ap->a_vp->v_object)->un_pager.vnp.vnp_size) size = ((vm_object_t) ap->a_vp->v_object)->un_pager.vnp.vnp_size - IDX_TO_OFF(ap->a_m[firstpage]->pindex); physoffset -= IDX_TO_OFF(ap->a_m[ap->a_reqpage]->pindex); rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size, (ap->a_reqpage - firstpage), physoffset); return (rtval); } #endif Index: head/sys/ufs/ufs/ufs_vnops.c =================================================================== --- head/sys/ufs/ufs/ufs_vnops.c (revision 13489) +++ head/sys/ufs/ufs/ufs_vnops.c (revision 13490) @@ -1,2151 +1,2152 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. 
* (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)ufs_vnops.c 8.10 (Berkeley) 4/1/94 - * $Id: ufs_vnops.c,v 1.35 1995/12/11 04:57:49 dyson Exp $ + * $Id: ufs_vnops.c,v 1.36 1996/01/05 18:31:58 wollman Exp $ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int ufs_chmod __P((struct vnode *, int, struct ucred *, struct proc *)); static int ufs_chown __P((struct vnode *, uid_t, gid_t, struct ucred *, struct proc *)); #ifdef EXT2FS #include #include #include #endif /* EXT2FS */ union _qcvt { quad_t qcvt; long val[2]; }; #define SETHIGH(q, h) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_HIGHWORD] = (h); \ (q) = tmp.qcvt; \ } #define SETLOW(q, l) { \ union _qcvt tmp; \ tmp.qcvt = (q); \ tmp.val[_QUAD_LOWWORD] = (l); \ (q) = tmp.qcvt; \ } /* * Create a regular file */ int ufs_create(ap) struct vop_create_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { int error; error = ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode), ap->a_dvp, ap->a_vpp, ap->a_cnp); if (error) return (error); return (0); } /* * Mknod vnode call */ /* ARGSUSED */ int ufs_mknod(ap) struct vop_mknod_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode **vpp = ap->a_vpp; struct inode *ip; int error; error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode), ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); ip = VTOI(*vpp); ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; if (vap->va_rdev != VNOVAL) { /* * Want to be able to use this to make badblock * inodes, so don't truncate the dev number. */ ip->i_rdev = vap->va_rdev; } /* * Remove inode so that it will be reloaded by VFS_VGET and * checked to see if it is an alias of an existing entry in * the inode cache. */ vput(*vpp); (*vpp)->v_type = VNON; vgone(*vpp); *vpp = 0; return (0); } /* * Open called. * * Nothing to do. */ /* ARGSUSED */ int ufs_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * Files marked append-only must be opened for appending. */ if ((VTOI(ap->a_vp)->i_flags & APPEND) && (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE) return (EPERM); return (0); } /* * Close called. * * Update the times on the inode. */ /* ARGSUSED */ int ufs_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); if (vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) ITIMES(ip, &time, &time); return (0); } int ufs_access(ap) struct vop_access_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; mode_t mask, mode = ap->a_mode; register gid_t *gp; int i, error; /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ if (mode & VWRITE) { switch (vp->v_type) { case VDIR: case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); #ifdef QUOTA if (error = getinoquota(ip)) return (error); #endif break; } } /* If immutable bit set, nobody gets to write it. 
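 * (Note, added for clarity: this check precedes the uid 0 shortcut
 * below, so even the superuser collects EPERM on a VWRITE request
 * once the immutable flag is set on the file; only clearing the
 * flag -- itself restricted by securelevel in ufs_setattr() --
 * reopens the file for writing.)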
*/ if ((mode & VWRITE) && (ip->i_flags & IMMUTABLE)) return (EPERM); /* Otherwise, user id 0 always gets access. */ if (cred->cr_uid == 0) return (0); mask = 0; /* Otherwise, check the owner. */ if (cred->cr_uid == ip->i_uid) { if (mode & VEXEC) mask |= S_IXUSR; if (mode & VREAD) mask |= S_IRUSR; if (mode & VWRITE) mask |= S_IWUSR; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check the groups. */ for (i = 0, gp = cred->cr_groups; i < cred->cr_ngroups; i++, gp++) if (ip->i_gid == *gp) { if (mode & VEXEC) mask |= S_IXGRP; if (mode & VREAD) mask |= S_IRGRP; if (mode & VWRITE) mask |= S_IWGRP; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* Otherwise, check everyone else. */ if (mode & VEXEC) mask |= S_IXOTH; if (mode & VREAD) mask |= S_IROTH; if (mode & VWRITE) mask |= S_IWOTH; return ((ip->i_mode & mask) == mask ? 0 : EACCES); } /* ARGSUSED */ int ufs_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); register struct vattr *vap = ap->a_vap; ITIMES(ip, &time, &time); /* * Copy from inode table */ vap->va_fsid = ip->i_dev; vap->va_fileid = ip->i_number; vap->va_mode = ip->i_mode & ~IFMT; vap->va_nlink = ip->i_nlink; vap->va_uid = ip->i_uid; vap->va_gid = ip->i_gid; vap->va_rdev = (dev_t)ip->i_rdev; vap->va_size = ip->i_din.di_size; vap->va_atime = ip->i_atime; vap->va_mtime = ip->i_mtime; vap->va_ctime = ip->i_ctime; vap->va_flags = ip->i_flags; vap->va_gen = ip->i_gen; /* this doesn't belong here */ if (vp->v_type == VBLK) vap->va_blocksize = BLKDEV_IOSIZE; else if (vp->v_type == VCHR) vap->va_blocksize = MAXBSIZE; else vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize; vap->va_bytes = dbtob(ip->i_blocks); vap->va_type = vp->v_type; vap->va_filerev = ip->i_modrev; return (0); } /* * Set attribute vnode op. called from several syscalls */ int ufs_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct proc *a_p; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp = ap->a_vp; struct inode *ip = VTOI(vp); struct ucred *cred = ap->a_cred; struct proc *p = ap->a_p; struct timeval atimeval, mtimeval; int error; /* * Check for unsettable attributes. */ if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } if (vap->va_flags != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != ip->i_uid && (error = suser(cred, &p->p_acflag))) return (error); if (cred->cr_uid == 0) { if ((ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) && securelevel > 0) return (EPERM); ip->i_flags = vap->va_flags; } else { if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) return (EPERM); ip->i_flags &= SF_SETTABLE; ip->i_flags |= (vap->va_flags & UF_SETTABLE); } ip->i_flag |= IN_CHANGE; if (vap->va_flags & (IMMUTABLE | APPEND)) return (0); } if (ip->i_flags & (IMMUTABLE | APPEND)) return (EPERM); /* * Go through the fields and update iff not VNOVAL. 
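 * (Illustration: a utimes(2) call arrives here with only va_atime
 * and va_mtime filled in; every other field still holds VNOVAL, so
 * only the timestamp branch below does any work.)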
*/ if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred, p); if (error) return (error); } if (vap->va_size != VNOVAL) { /* * Disallow write attempts on read-only file systems; * unless the file is a socket, fifo, or a block or * character device resident on the file system. */ switch (vp->v_type) { case VDIR: return (EISDIR); case VLNK: case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); break; } error = VOP_TRUNCATE(vp, vap->va_size, 0, cred, p); if (error) return (error); } ip = VTOI(vp); if (vap->va_atime.ts_sec != VNOVAL || vap->va_mtime.ts_sec != VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); if (cred->cr_uid != ip->i_uid && (error = suser(cred, &p->p_acflag)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, cred, p)))) return (error); if (vap->va_atime.ts_sec != VNOVAL) ip->i_flag |= IN_ACCESS; if (vap->va_mtime.ts_sec != VNOVAL) ip->i_flag |= IN_CHANGE | IN_UPDATE; atimeval.tv_sec = vap->va_atime.ts_sec; atimeval.tv_usec = vap->va_atime.ts_nsec / 1000; mtimeval.tv_sec = vap->va_mtime.ts_sec; mtimeval.tv_usec = vap->va_mtime.ts_nsec / 1000; error = VOP_UPDATE(vp, &atimeval, &mtimeval, 1); if (error) return (error); } error = 0; if (vap->va_mode != (mode_t)VNOVAL) { if (vp->v_mount->mnt_flag & MNT_RDONLY) return (EROFS); error = ufs_chmod(vp, (int)vap->va_mode, cred, p); } return (error); } /* * Change the mode on a file. * Inode must be locked before calling. */ static int ufs_chmod(vp, mode, cred, p) register struct vnode *vp; register int mode; register struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); int error; if (cred->cr_uid != ip->i_uid) { error = suser(cred, &p->p_acflag); if (error) return (error); } if (cred->cr_uid) { if (vp->v_type != VDIR && (mode & S_ISTXT)) return (EFTYPE); if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) return (EPERM); } ip->i_mode &= ~ALLPERMS; ip->i_mode |= (mode & ALLPERMS); ip->i_flag |= IN_CHANGE; return (0); } /* * Perform chown operation on inode ip; * inode must be locked prior to call. */ static int ufs_chown(vp, uid, gid, cred, p) register struct vnode *vp; uid_t uid; gid_t gid; struct ucred *cred; struct proc *p; { register struct inode *ip = VTOI(vp); uid_t ouid; gid_t ogid; int error = 0; #ifdef QUOTA register int i; long change; #endif if (uid == (uid_t)VNOVAL) uid = ip->i_uid; if (gid == (gid_t)VNOVAL) gid = ip->i_gid; /* * If we don't own the file, are trying to change the owner * of the file, or are not a member of the target group, * the caller must be superuser or the call fails. 
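 * (Concretely: a non-superuser may chgrp a file they own into
 * another group they are a member of -- uid unchanged, target group
 * passes groupmember() -- but changing the owner uid always falls
 * through to the suser() check.)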
*/ if ((cred->cr_uid != ip->i_uid || uid != ip->i_uid || (gid != ip->i_gid && !groupmember((gid_t)gid, cred))) && (error = suser(cred, &p->p_acflag))) return (error); ogid = ip->i_gid; ouid = ip->i_uid; #ifdef QUOTA if (error = getinoquota(ip)) return (error); if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } change = ip->i_blocks; (void) chkdq(ip, -change, cred, CHOWN); (void) chkiq(ip, -1, cred, CHOWN); for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } #endif ip->i_gid = gid; ip->i_uid = uid; #ifdef QUOTA if ((error = getinoquota(ip)) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } if ((error = chkdq(ip, change, cred, CHOWN)) == 0) { if ((error = chkiq(ip, 1, cred, CHOWN)) == 0) goto good; else (void) chkdq(ip, -change, cred, CHOWN|FORCE); } for (i = 0; i < MAXQUOTAS; i++) { dqrele(vp, ip->i_dquot[i]); ip->i_dquot[i] = NODQUOT; } } ip->i_gid = ogid; ip->i_uid = ouid; if (getinoquota(ip) == 0) { if (ouid == uid) { dqrele(vp, ip->i_dquot[USRQUOTA]); ip->i_dquot[USRQUOTA] = NODQUOT; } if (ogid == gid) { dqrele(vp, ip->i_dquot[GRPQUOTA]); ip->i_dquot[GRPQUOTA] = NODQUOT; } (void) chkdq(ip, change, cred, FORCE|CHOWN); (void) chkiq(ip, 1, cred, FORCE|CHOWN); (void) getinoquota(ip); } return (error); good: if (getinoquota(ip)) panic("chown: lost quota"); #endif /* QUOTA */ if (ouid != uid || ogid != gid) ip->i_flag |= IN_CHANGE; if (ouid != uid && cred->cr_uid != 0) ip->i_mode &= ~ISUID; if (ogid != gid && cred->cr_uid != 0) ip->i_mode &= ~ISGID; return (0); } /* ARGSUSED */ int ufs_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (ENOTTY); } /* ARGSUSED */ int ufs_select(ap) struct vop_select_args /* { struct vnode *a_vp; int a_which; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { /* * We should really check to see if I/O is possible. */ return (1); } /* * Mmap a file * * NB Currently unsupported. */ /* ARGSUSED */ int ufs_mmap(ap) struct vop_mmap_args /* { struct vnode *a_vp; int a_fflags; struct ucred *a_cred; struct proc *a_p; } */ *ap; { return (EINVAL); } /* * Seek on a file * * Nothing to do, so just return. 
*/ /* ARGSUSED */ int ufs_seek(ap) struct vop_seek_args /* { struct vnode *a_vp; off_t a_oldoff; off_t a_newoff; struct ucred *a_cred; } */ *ap; { return (0); } int ufs_remove(ap) struct vop_remove_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct inode *ip; struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; int error; ip = VTOI(vp); if ((ip->i_flags & (IMMUTABLE | APPEND)) || (VTOI(dvp)->i_flags & APPEND)) { error = EPERM; goto out; } #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_dirremove(dvp, ap->a_cnp); } else { error = ufs_dirremove(dvp, ap->a_cnp); } #else error = ufs_dirremove(dvp, ap->a_cnp); #endif /* EXT2FS */ if (error == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } out: if (dvp == vp) vrele(vp); else vput(vp); vput(dvp); return (error); } /* * link vnode call */ int ufs_link(ap) struct vop_link_args /* { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *tdvp = ap->a_tdvp; struct componentname *cnp = ap->a_cnp; struct inode *ip; struct timeval tv; int error; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_link: no name"); #endif if (vp->v_mount != tdvp->v_mount) { VOP_ABORTOP(tdvp, cnp); error = EXDEV; goto out2; } if (vp != tdvp && (error = VOP_LOCK(vp))) { VOP_ABORTOP(tdvp, cnp); goto out2; } ip = VTOI(vp); if ((nlink_t)ip->i_nlink >= LINK_MAX) { VOP_ABORTOP(tdvp, cnp); error = EMLINK; goto out1; } if (ip->i_flags & (IMMUTABLE | APPEND)) { VOP_ABORTOP(tdvp, cnp); error = EPERM; goto out1; } ip->i_nlink++; ip->i_flag |= IN_CHANGE; tv = time; error = VOP_UPDATE(vp, &tv, &tv, 1); if (!error) { #ifdef EXT2FS if (IS_EXT2_VNODE(tdvp)) { error = ext2_direnter(ip, tdvp, cnp); } else { error = ufs_direnter(ip, tdvp, cnp); } #else error = ufs_direnter(ip, tdvp, cnp); #endif /* EXT2FS */ } if (error) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; } FREE(cnp->cn_pnbuf, M_NAMEI); out1: if (vp != tdvp) VOP_UNLOCK(vp); out2: vput(tdvp); return (error); } /* * Rename system call. * rename("foo", "bar"); * is essentially * unlink("bar"); * link("foo", "bar"); * unlink("foo"); * but ``atomically''. Can't do full commit without saving state in the * inode on disk which isn't feasible at this time. Best we can do is * always guarantee the target exists. * * Basic algorithm is: * * 1) Bump link count on source while we're linking it to the * target. This also ensures the inode won't be deleted out * from underneath us while we work (it may be truncated by * a concurrent `trunc' or `open' for creation). * 2) Link source to destination. If destination already exists, * delete it first. * 3) Unlink source reference to inode if still around. If a * directory was moved and the parent of the destination * is different from the source, patch the ".." entry in the * directory.
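 *
 * (Illustrative timeline: renaming "foo" over an existing "bar",
 * where foo's inode starts at i_nlink == 1: step 1 raises it to 2,
 * step 2 points the "bar" entry at it and drops the old target's
 * link, and step 3 removes the "foo" entry, returning i_nlink to 1.
 * A crash in between leaves the count too high, which fsck can
 * repair, never too low, which would free a live inode.)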
*/ int ufs_rename(ap) struct vop_rename_args /* { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; } */ *ap; { struct vnode *tvp = ap->a_tvp; register struct vnode *tdvp = ap->a_tdvp; struct vnode *fvp = ap->a_fvp; struct vnode *fdvp = ap->a_fdvp; struct componentname *tcnp = ap->a_tcnp; struct componentname *fcnp = ap->a_fcnp; struct inode *ip, *xp, *dp; struct dirtemplate dirbuf; struct timeval tv; int doingdirectory = 0, oldparent = 0, newparent = 0; int error = 0; u_char namlen; #ifdef DIAGNOSTIC if ((tcnp->cn_flags & HASBUF) == 0 || (fcnp->cn_flags & HASBUF) == 0) panic("ufs_rename: no name"); #endif /* * Check for cross-device rename. */ if ((fvp->v_mount != tdvp->v_mount) || (tvp && (fvp->v_mount != tvp->v_mount))) { error = EXDEV; abortit: VOP_ABORTOP(tdvp, tcnp); /* XXX, why not in NFS? */ if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); VOP_ABORTOP(fdvp, fcnp); /* XXX, why not in NFS? */ vrele(fdvp); vrele(fvp); return (error); } /* * Check if just deleting a link name. */ if (tvp && ((VTOI(tvp)->i_flags & (IMMUTABLE | APPEND)) || (VTOI(tdvp)->i_flags & APPEND))) { error = EPERM; goto abortit; } if (fvp == tvp) { if (fvp->v_type == VDIR) { error = EINVAL; goto abortit; } /* Release destination completely. */ VOP_ABORTOP(tdvp, tcnp); vput(tdvp); vput(tvp); /* Delete source. */ vrele(fdvp); vrele(fvp); fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); fcnp->cn_nameiop = DELETE; (void) relookup(fdvp, &fvp, fcnp); return (VOP_REMOVE(fdvp, fvp, fcnp)); } error = VOP_LOCK(fvp); if (error) goto abortit; dp = VTOI(fdvp); ip = VTOI(fvp); if ((ip->i_flags & (IMMUTABLE | APPEND)) || (dp->i_flags & APPEND)) { VOP_UNLOCK(fvp); error = EPERM; goto abortit; } if ((ip->i_mode & IFMT) == IFDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') || dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT || (ip->i_flag & IN_RENAME)) { VOP_UNLOCK(fvp); error = EINVAL; goto abortit; } ip->i_flag |= IN_RENAME; oldparent = dp->i_number; doingdirectory++; } vrele(fdvp); /* * When the target exists, both the directory * and target vnodes are returned locked. */ dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); /* * 1) Bump link count while we're moving stuff * around. If we crash somewhere before * completing our work, the link count * may be wrong, but correctable. */ ip->i_nlink++; ip->i_flag |= IN_CHANGE; tv = time; error = VOP_UPDATE(fvp, &tv, &tv, 1); if (error) { VOP_UNLOCK(fvp); goto bad; } /* * If ".." must be changed (i.e. the directory gets a new * parent) then the source directory must not be in the * directory hierarchy above the target, as this would * orphan everything below the source directory. Also * the user must have write permission in the source so * as to be able to change "..". We must repeat the call * to namei, as the parent directory is unlocked by the * call to checkpath().
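 * (Concretely: "mv /a /a/b/c" must fail, since /a lies on the path
 * from the target back to the root; checkpath() walks the ".."
 * chain upward from the target directory and errors out if it
 * meets the source directory first.)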
*/ error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_proc); VOP_UNLOCK(fvp); if (oldparent != dp->i_number) newparent = dp->i_number; if (doingdirectory && newparent) { if (error) /* write access check above */ goto bad; if (xp != NULL) vput(tvp); #ifdef EXT2FS if (IS_EXT2_VNODE(tdvp)) { error = ext2_checkpath(ip, dp, tcnp->cn_cred); } else { error = ufs_checkpath(ip, dp, tcnp->cn_cred); } #else error = ufs_checkpath(ip, dp, tcnp->cn_cred); #endif /* EXT2FS */ if (error) goto out; if ((tcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost to startdir"); error = relookup(tdvp, &tvp, tcnp); if (error) goto out; dp = VTOI(tdvp); xp = NULL; if (tvp) xp = VTOI(tvp); } /* * 2) If target doesn't exist, link the target * to the source and unlink the source. * Otherwise, rewrite the target directory * entry to reference the source inode and * expunge the original entry's existence. */ if (xp == NULL) { if (dp->i_dev != ip->i_dev) panic("rename: EXDEV"); /* * Account for ".." in new directory. * When source and destination have the same * parent we don't fool with the link count. */ if (doingdirectory && newparent) { if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto bad; } dp->i_nlink++; dp->i_flag |= IN_CHANGE; error = VOP_UPDATE(tdvp, &tv, &tv, 1); if (error) goto bad; } #ifdef EXT2FS if (IS_EXT2_VNODE(tdvp)) { error = ext2_direnter(ip, tdvp, tcnp); } else { error = ufs_direnter(ip, tdvp, tcnp); } #else error = ufs_direnter(ip, tdvp, tcnp); #endif /* EXT2FS */ if (error) { if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; (void)VOP_UPDATE(tdvp, &tv, &tv, 1); } goto bad; } vput(tdvp); } else { if (xp->i_dev != dp->i_dev || xp->i_dev != ip->i_dev) panic("rename: EXDEV"); /* * Short circuit rename(foo, foo). */ if (xp->i_number == ip->i_number) panic("rename: same file"); /* * If the parent directory is "sticky", then the user must * own the parent directory, or the destination of the rename, * otherwise the destination may not be changed (except by * root). This implements append-only directories. */ if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 && tcnp->cn_cred->cr_uid != dp->i_uid && xp->i_uid != tcnp->cn_cred->cr_uid) { error = EPERM; goto bad; } /* * Target must be empty if a directory and have no links * to it. Also, ensure source and target are compatible * (both directories, or both not directories). */ if ((xp->i_mode&IFMT) == IFDIR) { #ifdef EXT2FS if (! (IS_EXT2_VNODE(ITOV(xp)) ? ext2_dirempty : ufs_dirempty) #else if (! ufs_dirempty #endif /* EXT2FS */ (xp, dp->i_number, tcnp->cn_cred) || xp->i_nlink > 2) { error = ENOTEMPTY; goto bad; } if (!doingdirectory) { error = ENOTDIR; goto bad; } cache_purge(tdvp); } else if (doingdirectory) { error = EISDIR; goto bad; } #ifdef EXT2FS if (IS_EXT2_VNODE(ITOV(dp))) { error = ext2_dirrewrite(dp, ip, tcnp); } else { error = ufs_dirrewrite(dp, ip, tcnp); } #else error = ufs_dirrewrite(dp, ip, tcnp); #endif /* EXT2FS */ if (error) goto bad; /* * If the target directory is in the same * directory as the source directory, * decrement the link count on the parent * of the target directory. */ if (doingdirectory && !newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } vput(tdvp); /* * Adjust the link count of the target to * reflect the dirrewrite above. If this is * a directory it is empty and there are * no links to it, so we can squash the inode and * any space associated with it. 
We disallowed * renaming over top of a directory with links to * it above, as the remaining link would point to * a directory without "." or ".." entries. */ xp->i_nlink--; if (doingdirectory) { if (--xp->i_nlink != 0) panic("rename: linked directory"); error = VOP_TRUNCATE(tvp, (off_t)0, IO_SYNC, tcnp->cn_cred, tcnp->cn_proc); } xp->i_flag |= IN_CHANGE; vput(tvp); xp = NULL; } /* * 3) Unlink the source. */ fcnp->cn_flags &= ~MODMASK; fcnp->cn_flags |= LOCKPARENT | LOCKLEAF; if ((fcnp->cn_flags & SAVESTART) == 0) panic("ufs_rename: lost from startdir"); (void) relookup(fdvp, &fvp, fcnp); if (fvp != NULL) { xp = VTOI(fvp); dp = VTOI(fdvp); } else { /* * From name has disappeared. */ if (doingdirectory) panic("rename: lost dir entry"); vrele(ap->a_fvp); return (0); } /* * Ensure that the directory entry still exists and has not * changed while the new name has been entered. If the source is * a file then the entry may have been unlinked or renamed. In * either case there is no further work to be done. If the source * is a directory then it cannot have been rmdir'ed; its link * count of three would cause a rmdir to fail with ENOTEMPTY. * The IN_RENAME flag ensures that it cannot be moved by another * rename. */ if (xp != ip) { if (doingdirectory) panic("rename: lost dir entry"); } else { /* * If the source is a directory with a * new parent, the link count of the old * parent directory must be decremented * and ".." set to point to the new parent. */ if (doingdirectory && newparent) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; error = vn_rdwr(UIO_READ, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED, tcnp->cn_cred, (int *)0, (struct proc *)0); if (error == 0) { # if (BYTE_ORDER == LITTLE_ENDIAN) if (fvp->v_mount->mnt_maxsymlinklen <= 0) namlen = dirbuf.dotdot_type; else namlen = dirbuf.dotdot_namlen; # else namlen = dirbuf.dotdot_namlen; # endif #ifdef EXT2FS if(IS_EXT2_VNODE(fvp)) namlen = ((struct odirtemplate *) &dirbuf)->dotdot_namlen; #endif /* EXT2FS */ if (namlen != 2 || dirbuf.dotdot_name[0] != '.' || dirbuf.dotdot_name[1] != '.') { ufs_dirbad(xp, (doff_t)12, "rename: mangled dir"); } else { dirbuf.dotdot_ino = newparent; (void) vn_rdwr(UIO_WRITE, fvp, (caddr_t)&dirbuf, sizeof (struct dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_SYNC, tcnp->cn_cred, (int *)0, (struct proc *)0); cache_purge(fdvp); } } } #ifdef EXT2FS if (IS_EXT2_VNODE(fdvp)) { error = ext2_dirremove(fdvp, fcnp); } else { error = ufs_dirremove(fdvp, fcnp); } #else error = ufs_dirremove(fdvp, fcnp); #endif /* EXT2FS */ if (!error) { xp->i_nlink--; xp->i_flag |= IN_CHANGE; } xp->i_flag &= ~IN_RENAME; } if (dp) vput(fdvp); if (xp) vput(fvp); vrele(ap->a_fvp); return (error); bad: if (xp) vput(ITOV(xp)); vput(ITOV(dp)); out: if (VOP_LOCK(fvp) == 0) { ip->i_nlink--; ip->i_flag |= IN_CHANGE; ip->i_flag &= ~IN_RENAME; vput(fvp); } else vrele(fvp); return (error); } /* * A virgin directory (no blushing please). 
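 * (Layout note, for illustration: the template below holds a "."
 * entry with a 12-byte record length followed by a ".." entry whose
 * record length is DIRBLKSIZ - 12, so the two records exactly fill
 * one directory block; with the default DIRBLKSIZ of 512 the ".."
 * record claims the remaining 500 bytes.)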
*/ static struct dirtemplate mastertemplate = { 0, 12, DT_DIR, 1, { '.', 0 }, 0, DIRBLKSIZ - 12, DT_DIR, 2, { '.', '.', 0 } }; static struct odirtemplate omastertemplate = { 0, 12, 1, { '.', 0 }, 0, DIRBLKSIZ - 12, 2, { '.', '.', 0 } }; /* * Mkdir system call */ int ufs_mkdir(ap) struct vop_mkdir_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; } */ *ap; { register struct vnode *dvp = ap->a_dvp; register struct vattr *vap = ap->a_vap; register struct componentname *cnp = ap->a_cnp; register struct inode *ip, *dp; struct vnode *tvp; struct dirtemplate dirtemplate, *dtp; struct timeval tv; int error, dmode; #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_mkdir: no name"); #endif dp = VTOI(dvp); if ((nlink_t)dp->i_nlink >= LINK_MAX) { error = EMLINK; goto out; } dmode = vap->va_mode & 0777; dmode |= IFDIR; /* * Must simulate part of ufs_makeinode here to acquire the inode, * but not have it entered in the parent directory. The entry is * made later after writing "." and ".." entries. */ error = VOP_VALLOC(dvp, dmode, cnp->cn_cred, &tvp); if (error) goto out; ip = VTOI(tvp); ip->i_uid = cnp->cn_cred->cr_uid; ip->i_gid = dp->i_gid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { free(cnp->cn_pnbuf, M_NAMEI); VOP_VFREE(tvp, ip->i_number, dmode); vput(tvp); vput(dvp); return (error); } #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = dmode; tvp->v_type = VDIR; /* Rest init'd in getnewvnode(). */ ip->i_nlink = 2; tv = time; error = VOP_UPDATE(tvp, &tv, &tv, 1); /* * Bump link count in parent directory * to reflect work done below. Should * be done before reference is created * so reparation is possible if we crash. */ dp->i_nlink++; dp->i_flag |= IN_CHANGE; error = VOP_UPDATE(dvp, &tv, &tv, 1); if (error) goto bad; /* Initialize directory with "." and ".." from static template. */ if (dvp->v_mount->mnt_maxsymlinklen > 0 #ifdef EXT2FS /* omastertemplate is what we want for EXT2 */ && !IS_EXT2_VNODE(dvp) #endif /* EXT2FS */ ) dtp = &mastertemplate; else dtp = (struct dirtemplate *)&omastertemplate; dirtemplate = *dtp; dirtemplate.dot_ino = ip->i_number; dirtemplate.dotdot_ino = dp->i_number; #ifdef EXT2FS /* note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE * so let's just redefine it - for this function only */ #undef DIRBLKSIZ #define DIRBLKSIZ (IS_EXT2_VNODE(dvp) ? \ VTOI(dvp)->i_e2fs->s_blocksize : DEV_BSIZE) if(IS_EXT2_VNODE(dvp)) dirtemplate.dotdot_reclen = DIRBLKSIZ - 12; #endif /* EXT2FS */ error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)&dirtemplate, sizeof (dirtemplate), (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_SYNC, cnp->cn_cred, (int *)0, (struct proc *)0); if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; goto bad; } if (DIRBLKSIZ > VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_bsize) panic("ufs_mkdir: blksize"); /* XXX should grow with balloc() */ else { ip->i_size = DIRBLKSIZ; ip->i_flag |= IN_CHANGE; } /* Directory set up, now install its entry in the parent directory. */ #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_direnter(ip, dvp, cnp); } else { error = ufs_direnter(ip, dvp, cnp); } #else error = ufs_direnter(ip, dvp, cnp); #endif /* EXT2FS */ if (error) { dp->i_nlink--; dp->i_flag |= IN_CHANGE; } bad: /* * No need to do an explicit VOP_TRUNCATE here, vrele will do this * for us because we set the link count to 0.
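 * (That is: the error path below zeroes i_nlink before vput(); when
 * the last reference drops, the inactive routine sees the zero link
 * count and reclaims the inode along with any blocks it had been
 * given, exactly as if it had been truncated and freed by hand.)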
*/ if (error) { ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); } else *ap->a_vpp = tvp; out: FREE(cnp->cn_pnbuf, M_NAMEI); vput(dvp); return (error); #ifdef EXT2FS #undef DIRBLKSIZ #define DIRBLKSIZ DEV_BSIZE #endif /* EXT2FS */ } /* * Rmdir system call. */ int ufs_rmdir(ap) struct vop_rmdir_args /* { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; struct inode *ip, *dp; int error; ip = VTOI(vp); dp = VTOI(dvp); /* * No rmdir "." please. */ if (dp == ip) { vrele(dvp); vput(vp); return (EINVAL); } /* * Verify the directory is empty (and valid). * (Rmdir ".." won't be valid since * ".." will contain a reference to * the current directory and thus be * non-empty.) */ error = 0; if (ip->i_nlink != 2 || #ifdef EXT2FS !(IS_EXT2_VNODE(ITOV(ip)) ? ext2_dirempty : ufs_dirempty) (ip, dp->i_number, cnp->cn_cred)) { #else !ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) { #endif /* EXT2FS */ error = ENOTEMPTY; goto out; } if ((dp->i_flags & APPEND) || (ip->i_flags & (IMMUTABLE | APPEND))) { error = EPERM; goto out; } /* * Delete reference to directory before purging * inode. If we crash in between, the directory * will be reattached to lost+found, */ #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_dirremove(dvp, cnp); } else { error = ufs_dirremove(dvp, cnp); } #else error = ufs_dirremove(dvp, cnp); #endif /* EXT2FS */ if (error) goto out; dp->i_nlink--; dp->i_flag |= IN_CHANGE; cache_purge(dvp); vput(dvp); dvp = NULL; /* * Truncate inode. The only stuff left * in the directory is "." and "..". The * "." reference is inconsequential since * we're quashing it. The ".." reference * has already been adjusted above. We've * removed the "." reference and the reference * in the parent directory, but there may be * other hard links so decrement by 2 and * worry about them later. */ ip->i_nlink -= 2; error = VOP_TRUNCATE(vp, (off_t)0, IO_SYNC, cnp->cn_cred, cnp->cn_proc); cache_purge(ITOV(ip)); out: if (dvp) vput(dvp); vput(vp); return (error); } /* * symlink -- make a symbolic link */ int ufs_symlink(ap) struct vop_symlink_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; } */ *ap; { register struct vnode *vp, **vpp = ap->a_vpp; register struct inode *ip; int len, error; error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp, vpp, ap->a_cnp); if (error) return (error); vp = *vpp; len = strlen(ap->a_target); if (len < vp->v_mount->mnt_maxsymlinklen) { ip = VTOI(vp); bcopy(ap->a_target, (char *)ip->i_shortlink, len); ip->i_size = len; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else error = vn_rdwr(UIO_WRITE, vp, ap->a_target, len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED, ap->a_cnp->cn_cred, (int *)0, (struct proc *)0); vput(vp); return (error); } /* * Vnode op for reading directories. * * The routine below assumes that the on-disk format of a directory * is the same as that defined by . If the on-disk * format changes, then it will be necessary to do a conversion * from the on-disk format that read returns to the format defined * by . */ int ufs_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_ncookies; u_int **cookies; } */ *ap; { register struct uio *uio = ap->a_uio; off_t off; int count, lost, error; if (ap->a_ncookies != NULL) /* * Ensure that the block is aligned. The caller can use * the cookies to determine where in the block to start. 
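 * (Example: with DIRBLKSIZ == 512 an offset of 1234 is masked down
 * to 1024 by the statement below -- 1234 & ~(512 - 1) == 1024 --
 * and the caller then consults the returned cookies to skip the
 * 210 bytes of entries that precede the original offset within
 * that block.)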
*/ uio->uio_offset &= ~(DIRBLKSIZ - 1); off = uio->uio_offset; count = uio->uio_resid; count &= ~(DIRBLKSIZ - 1); lost = uio->uio_resid - count; if (count < DIRBLKSIZ || (uio->uio_offset & (DIRBLKSIZ -1))) return (EINVAL); uio->uio_resid = count; uio->uio_iov->iov_len = count; # if (BYTE_ORDER == LITTLE_ENDIAN) if (ap->a_vp->v_mount->mnt_maxsymlinklen > 0) { error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); } else { struct dirent *dp, *edp; struct uio auio; struct iovec aiov; caddr_t dirbuf; int readcnt; u_char tmp; auio = *uio; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; aiov.iov_len = count; MALLOC(dirbuf, caddr_t, count, M_TEMP, M_WAITOK); aiov.iov_base = dirbuf; error = VOP_READ(ap->a_vp, &auio, 0, ap->a_cred); if (error == 0) { readcnt = count - auio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { tmp = dp->d_namlen; dp->d_namlen = dp->d_type; dp->d_type = tmp; if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, uio); } FREE(dirbuf, M_TEMP); } # else error = VOP_READ(ap->a_vp, uio, 0, ap->a_cred); # endif if (!error && ap->a_ncookies != NULL) { struct dirent* dpStart; struct dirent* dpEnd; struct dirent* dp; int ncookies; u_int *cookies; u_int *cookiep; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) panic("ufs_readdir: unexpected uio from NFS server"); dpStart = (struct dirent *) (uio->uio_iov->iov_base - (uio->uio_offset - off)); dpEnd = (struct dirent *) uio->uio_iov->iov_base; for (dp = dpStart, ncookies = 0; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) ncookies++; MALLOC(cookies, u_int *, ncookies * sizeof(u_int), M_TEMP, M_WAITOK); for (dp = dpStart, cookiep = cookies; dp < dpEnd; dp = (struct dirent *)((caddr_t) dp + dp->d_reclen)) { off += dp->d_reclen; *cookiep++ = (u_int) off; } *ap->a_ncookies = ncookies; *ap->a_cookies = cookies; } if (ap->a_eofflag) *ap->a_eofflag = VTOI(ap->a_vp)->i_size <= uio->uio_offset; uio->uio_resid += lost; return (error); } /* * Return target name of a symbolic link */ int ufs_readlink(ap) struct vop_readlink_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); int isize; isize = ip->i_size; if ((isize < vp->v_mount->mnt_maxsymlinklen) || (ip->i_din.di_blocks == 0)) { /* XXX - for old fastlink support */ uiomove((char *)ip->i_shortlink, isize, ap->a_uio); return (0); } return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred)); } /* * Ufs abort op, called after namei() when a CREATE/DELETE isn't actually * done. If a buffer has been saved in anticipation of a CREATE, delete it. */ /* ARGSUSED */ int ufs_abortop(ap) struct vop_abortop_args /* { struct vnode *a_dvp; struct componentname *a_cnp; } */ *ap; { if ((ap->a_cnp->cn_flags & (HASBUF | SAVESTART)) == HASBUF) FREE(ap->a_cnp->cn_pnbuf, M_NAMEI); return (0); } /* * Lock an inode. If its already locked, set the WANT bit and sleep. 
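* * In outline the sleep lock below is (a condensed sketch; the real code also handles recursive holders via IN_RECURSE and records the holder's pid): acquire with * * while (ip->i_flag & IN_LOCKED) { ip->i_flag |= IN_WANTED; (void) tsleep((caddr_t)ip, PINOD, "ufslk2", 0); } * ip->i_flag |= IN_LOCKED; * * and release with * * ip->i_flag &= ~IN_LOCKED; * if (ip->i_flag & IN_WANTED) { ip->i_flag &= ~IN_WANTED; wakeup((caddr_t)ip); }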
*/ int ufs_lock(ap) struct vop_lock_args /* { struct vnode *a_vp; } */ *ap; { struct proc *p = curproc; register struct vnode *vp = ap->a_vp; register struct inode *ip; start: while (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; (void) tsleep((caddr_t)vp, PINOD, "ufslk1", 0); } if (vp->v_tag == VT_NON) return (ENOENT); ip = VTOI(vp); if (ip->i_flag & IN_LOCKED) { if (p->p_pid == ip->i_lockholder) { if( (ip->i_flag & IN_RECURSE) == 0) panic("ufs_lock: recursive lock not expected, pid: %d\n", ip->i_lockholder); } else { ip->i_flag |= IN_WANTED; #ifdef DIAGNOSTIC if (p) ip->i_lockwaiter = p->p_pid; else ip->i_lockwaiter = -1; #endif (void) tsleep((caddr_t)ip, PINOD, "ufslk2", 0); goto start; } } #ifdef DIAGNOSTIC ip->i_lockwaiter = 0; if (((ip->i_flag & IN_RECURSE) == 0) && (ip->i_lockholder != 0)) panic("lockholder (%d) != 0", ip->i_lockholder); if (p && p->p_pid == 0) printf("locking by process 0\n"); #endif if ((ip->i_flag & IN_RECURSE) == 0) ip->i_lockcount = 1; else ++ip->i_lockcount; if (p) ip->i_lockholder = p->p_pid; else ip->i_lockholder = -1; ip->i_flag |= IN_LOCKED; return (0); } /* * Unlock an inode. If WANT bit is on, wakeup. */ int lockcount = 90; int ufs_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); #ifdef DIAGNOSTIC struct proc *p = curproc; if ((ip->i_flag & IN_LOCKED) == 0) { vprint("ufs_unlock: unlocked inode", ap->a_vp); panic("ufs_unlock NOT LOCKED"); } if (p && p->p_pid != ip->i_lockholder && p->p_pid > -1 && ip->i_lockholder > -1 && lockcount++ < 100) panic("unlocker (%d) != lock holder (%d)", p->p_pid, ip->i_lockholder); #endif if (--ip->i_lockcount > 0) { if ((ip->i_flag & IN_RECURSE) == 0) panic("ufs_unlock: recursive lock prematurely released, pid=%d\n", ip->i_lockholder); return (0); } ip->i_lockholder = 0; ip->i_flag &= ~(IN_LOCKED|IN_RECURSE); if (ip->i_flag & IN_WANTED) { ip->i_flag &= ~IN_WANTED; wakeup((caddr_t)ip); } return (0); } /* * Check for a locked inode. */ int ufs_islocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { if (VTOI(ap->a_vp)->i_flag & IN_LOCKED) return (1); return (0); } /* * Calculate the logical to physical mapping if not done already, * then call the device strategy routine. */ int ufs_strategy(ap) struct vop_strategy_args /* { struct buf *a_bp; } */ *ap; { register struct buf *bp = ap->a_bp; register struct vnode *vp = bp->b_vp; register struct inode *ip; int error; ip = VTOI(vp); if (vp->v_type == VBLK || vp->v_type == VCHR) panic("ufs_strategy: spec"); if (bp->b_blkno == bp->b_lblkno) { error = VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); if (error) { bp->b_error = error; bp->b_flags |= B_ERROR; biodone(bp); return (error); } if ((long)bp->b_blkno == -1) vfs_bio_clrbuf(bp); } if ((long)bp->b_blkno == -1) { biodone(bp); return (0); } vp = ip->i_devvp; bp->b_dev = vp->v_rdev; VOCALL (vp->v_op, VOFFSET(vop_strategy), ap); return (0); } /* * Print out the contents of an inode. */ int ufs_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct inode *ip = VTOI(vp); printf("tag VT_UFS, ino %ld, on dev %d, %d", ip->i_number, major(ip->i_dev), minor(ip->i_dev)); if (vp->v_type == VFIFO) fifo_printinfo(vp); printf("%s\n", (ip->i_flag & IN_LOCKED) ? 
" (LOCKED)" : ""); if (ip->i_lockholder == 0) return (0); printf("\towner pid %lu", (u_long)ip->i_lockholder); if (ip->i_lockwaiter) printf(" waiting pid %lu", (u_long)ip->i_lockwaiter); printf("\n"); return (0); } /* * Read wrapper for special devices. */ int ufsspec_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for special devices. */ int ufsspec_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (VOCALL (spec_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for special devices. * * Update the times on the inode then do device close. */ int ufsspec_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) ITIMES(ip, &time, &time); return (VOCALL (spec_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Read wrapper for fifo's */ int ufsfifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set access flag. */ VTOI(ap->a_vp)->i_flag |= IN_ACCESS; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_read), ap)); } /* * Write wrapper for fifo's. */ int ufsfifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { /* * Set update and change flags. */ VTOI(ap->a_vp)->i_flag |= IN_CHANGE | IN_UPDATE; return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_write), ap)); } /* * Close wrapper for fifo's. * * Update the times on the inode then do device close. */ int ufsfifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct proc *a_p; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); if (ap->a_vp->v_usecount > 1 && !(ip->i_flag & IN_LOCKED)) ITIMES(ip, &time, &time); return (VOCALL (fifo_vnodeop_p, VOFFSET(vop_close), ap)); } /* * Return POSIX pathconf information applicable to ufs filesystems. */ int ufs_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); case _PC_NO_TRUNC: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Advisory record locking support */ int ufs_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { register struct inode *ip = VTOI(ap->a_vp); return (lf_advlock(ap, &(ip->i_lockf), ip->i_size)); } /* * Initialize the vnode associated with a new inode, handle aliased * vnodes. 
*/ int ufs_vinit(mntp, specops, fifoops, vpp) struct mount *mntp; vop_t **specops; vop_t **fifoops; struct vnode **vpp; { struct inode *ip; struct vnode *vp, *nvp; vp = *vpp; ip = VTOI(vp); switch(vp->v_type = IFTOVT(ip->i_mode)) { case VCHR: case VBLK: vp->v_op = specops; nvp = checkalias(vp, ip->i_rdev, mntp); if (nvp) { /* * Discard unneeded vnode, but save its inode. */ ufs_ihashrem(ip); VOP_UNLOCK(vp); nvp->v_data = vp->v_data; vp->v_data = NULL; vp->v_op = spec_vnodeop_p; vrele(vp); vgone(vp); /* * Reinitialize aliased inode. */ vp = nvp; ip->i_vnode = vp; ufs_ihashins(ip); } break; case VFIFO: vp->v_op = fifoops; break; default: break; } if (ip->i_number == ROOTINO) vp->v_flag |= VROOT; /* * Initialize modrev times */ SETHIGH(ip->i_modrev, mono_time.tv_sec); SETLOW(ip->i_modrev, mono_time.tv_usec * 4294); *vpp = vp; return (0); } /* * Allocate a new inode. */ int ufs_makeinode(mode, dvp, vpp, cnp) int mode; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; { register struct inode *ip, *pdir; struct timeval tv; struct vnode *tvp; int error; pdir = VTOI(dvp); #ifdef DIAGNOSTIC if ((cnp->cn_flags & HASBUF) == 0) panic("ufs_makeinode: no name"); #endif *vpp = NULL; if ((mode & IFMT) == 0) mode |= IFREG; error = VOP_VALLOC(dvp, mode, cnp->cn_cred, &tvp); if (error) { free(cnp->cn_pnbuf, M_NAMEI); vput(dvp); return (error); } ip = VTOI(tvp); ip->i_gid = pdir->i_gid; if ((mode & IFMT) == IFLNK) ip->i_uid = pdir->i_uid; else ip->i_uid = cnp->cn_cred->cr_uid; #ifdef QUOTA if ((error = getinoquota(ip)) || (error = chkiq(ip, 1, cnp->cn_cred, 0))) { free(cnp->cn_pnbuf, M_NAMEI); VOP_VFREE(tvp, ip->i_number, mode); vput(tvp); vput(dvp); return (error); } #endif ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE; ip->i_mode = mode; tvp->v_type = IFTOVT(mode); /* Rest init'd in getnewvnode(). */ ip->i_nlink = 1; if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) && suser(cnp->cn_cred, NULL)) ip->i_mode &= ~ISGID; /* * Make sure inode goes to disk before directory entry. */ tv = time; error = VOP_UPDATE(tvp, &tv, &tv, 1); if (error) goto bad; #ifdef EXT2FS if (IS_EXT2_VNODE(dvp)) { error = ext2_direnter(ip, dvp, cnp); } else { error = ufs_direnter(ip, dvp, cnp); } #else error = ufs_direnter(ip, dvp, cnp); #endif /* EXT2FS */ if (error) goto bad; + if ((cnp->cn_flags & SAVESTART) == 0) FREE(cnp->cn_pnbuf, M_NAMEI); vput(dvp); *vpp = tvp; return (0); bad: /* * Write error occurred trying to update the inode * or the directory so must deallocate the inode. */ free(cnp->cn_pnbuf, M_NAMEI); vput(dvp); ip->i_nlink = 0; ip->i_flag |= IN_CHANGE; vput(tvp); return (error); } Index: head/sys/vm/default_pager.c =================================================================== --- head/sys/vm/default_pager.c (revision 13489) +++ head/sys/vm/default_pager.c (revision 13490) @@ -1,145 +1,145 @@ /* * Copyright (c) 1995, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by David Greenman. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: default_pager.c,v 1.4 1995/12/11 04:57:56 dyson Exp $ + * $Id: default_pager.c,v 1.5 1995/12/14 09:54:46 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include static vm_object_t default_pager_alloc __P((void *, vm_size_t, vm_prot_t, vm_ooffset_t)); static void default_pager_dealloc __P((vm_object_t)); static int default_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static int default_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); static boolean_t default_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); /* * pagerops for OBJT_DEFAULT - "default pager". */ struct pagerops defaultpagerops = { NULL, default_pager_alloc, default_pager_dealloc, default_pager_getpages, default_pager_putpages, default_pager_haspage, NULL }; /* * no_pager_alloc just returns an initialized object. */ static vm_object_t default_pager_alloc(handle, size, prot, offset) void *handle; register vm_size_t size; vm_prot_t prot; vm_ooffset_t offset; { if (handle != NULL) panic("default_pager_alloc: handle specified"); - return vm_object_allocate(OBJT_DEFAULT, offset + size); + return vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(offset) + size); } static void default_pager_dealloc(object) vm_object_t object; { /* * OBJT_DEFAULT objects have no special resources allocated to them. */ } /* * The default pager has no backing store, so we always return * failure. */ static int default_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { return VM_PAGER_FAIL; } static int default_pager_putpages(object, m, c, sync, rtvals) vm_object_t object; vm_page_t *m; int c; boolean_t sync; int *rtvals; { int i; /* * Try to convert the object type into a OBJT_SWAP. * If the swp structure allocation fails, convert it * back to OBJT_DEFAULT and return failure. Otherwise * pass this putpages to the swap pager. 
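* * This is the lazy-allocation trick that keeps OBJT_DEFAULT objects cheap: anonymous memory pays no swap bookkeeping cost until the pageout daemon first writes one of its pages, and only then is the object converted in place, roughly (a sketch of the step taken just below): * * object->type = OBJT_SWAP; * if (swap_pager_swp_alloc(object, M_KERNEL) != 0) * object->type = OBJT_DEFAULT; (back out and fail)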
*/ object->type = OBJT_SWAP; if (swap_pager_swp_alloc(object, M_KERNEL) != 0) { object->type = OBJT_DEFAULT; for (i = 0; i < c; i++) rtvals[i] = VM_PAGER_FAIL; return VM_PAGER_FAIL; } return swap_pager_putpages(object, m, c, sync, rtvals); } static boolean_t default_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { return FALSE; } Index: head/sys/vm/device_pager.c =================================================================== --- head/sys/vm/device_pager.c (revision 13489) +++ head/sys/vm/device_pager.c (revision 13490) @@ -1,297 +1,297 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)device_pager.c 8.1 (Berkeley) 6/11/93 - * $Id: device_pager.c,v 1.18 1995/12/13 15:13:54 julian Exp $ + * $Id: device_pager.c,v 1.19 1995/12/14 09:54:49 phk Exp $ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void dev_pager_init __P((void)); static vm_object_t dev_pager_alloc __P((void *, vm_size_t, vm_prot_t, vm_ooffset_t)); static void dev_pager_dealloc __P((vm_object_t)); static int dev_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static int dev_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); static boolean_t dev_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); /* list of device pager objects */ static struct pagerlst dev_pager_object_list; /* list of available vm_page_t's */ static TAILQ_HEAD(, vm_page) dev_pager_fakelist; static vm_page_t dev_pager_getfake __P((vm_offset_t)); static void dev_pager_putfake __P((vm_page_t)); static int dev_pager_alloc_lock, dev_pager_alloc_lock_want; struct pagerops devicepagerops = { dev_pager_init, dev_pager_alloc, dev_pager_dealloc, dev_pager_getpages, dev_pager_putpages, dev_pager_haspage, NULL }; static void dev_pager_init() { TAILQ_INIT(&dev_pager_object_list); TAILQ_INIT(&dev_pager_fakelist); } static vm_object_t dev_pager_alloc(handle, size, prot, foff) void *handle; vm_size_t size; vm_prot_t prot; vm_ooffset_t foff; { dev_t dev; d_mmap_t *mapfunc; vm_object_t object; unsigned int npages, off; /* * Make sure this device can be mapped. */ dev = (dev_t) (u_long) handle; mapfunc = cdevsw[major(dev)]->d_mmap; if (mapfunc == NULL || mapfunc == (d_mmap_t *)nullop) { printf("obsolete map function %p\n", (void *)mapfunc); return (NULL); } /* * Offset should be page aligned. */ if (foff & (PAGE_SIZE - 1)) return (NULL); /* * Check that the specified range of the device allows the desired * protection. * * XXX assumes VM_PROT_* == PROT_* */ - npages = atop(round_page(size)); + npages = size; for (off = foff; npages--; off += PAGE_SIZE) if ((*mapfunc) (dev, off, (int) prot) == -1) return (NULL); /* * Lock to prevent object creation race condition. */ while (dev_pager_alloc_lock) { dev_pager_alloc_lock_want++; tsleep(&dev_pager_alloc_lock, PVM, "dvpall", 0); dev_pager_alloc_lock_want--; } dev_pager_alloc_lock = 1; /* * Look up pager, creating as necessary. */ object = vm_pager_object_lookup(&dev_pager_object_list, handle); if (object == NULL) { /* * Allocate object and associate it with the pager. */ object = vm_object_allocate(OBJT_DEVICE, - OFF_TO_IDX(foff + size)); + OFF_TO_IDX(foff) + size); object->handle = handle; TAILQ_INIT(&object->un_pager.devp.devp_pglist); TAILQ_INSERT_TAIL(&dev_pager_object_list, object, pager_object_list); } else { /* * Gain a reference to the object. */ vm_object_reference(object); - if (OFF_TO_IDX(foff + size) > object->size) - object->size = OFF_TO_IDX(foff + size); + if (OFF_TO_IDX(foff) + size > object->size) + object->size = OFF_TO_IDX(foff) + size; } dev_pager_alloc_lock = 0; if (dev_pager_alloc_lock_want) wakeup(&dev_pager_alloc_lock); return (object); } static void dev_pager_dealloc(object) vm_object_t object; { vm_page_t m; TAILQ_REMOVE(&dev_pager_object_list, object, pager_object_list); /* * Free up our fake pages.
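* * (Clarifying note: these are the PG_FICTITIOUS pages manufactured by dev_pager_getfake() below; they name device physical addresses rather than managed RAM, so teardown simply returns them to dev_pager_fakelist instead of to the page queues.)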
*/ while ((m = object->un_pager.devp.devp_pglist.tqh_first) != 0) { TAILQ_REMOVE(&object->un_pager.devp.devp_pglist, m, pageq); dev_pager_putfake(m); } } static int dev_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { vm_offset_t offset; vm_offset_t paddr; vm_page_t page; dev_t dev; int i, s; d_mmap_t *mapfunc; int prot; dev = (dev_t) (u_long) object->handle; offset = m[reqpage]->pindex + OFF_TO_IDX(object->paging_offset); prot = PROT_READ; /* XXX should pass in? */ mapfunc = cdevsw[major(dev)]->d_mmap; if (mapfunc == NULL || mapfunc == (d_mmap_t *)nullop) panic("dev_pager_getpage: no map function"); paddr = pmap_phys_address((*mapfunc) ((dev_t) dev, (int) offset << PAGE_SHIFT, prot)); #ifdef DIAGNOSTIC if (paddr == -1) panic("dev_pager_getpage: map function returns error"); #endif /* * Replace the passed in reqpage page with our own fake page and free up * all of the original pages. */ page = dev_pager_getfake(paddr); TAILQ_INSERT_TAIL(&object->un_pager.devp.devp_pglist, page, pageq); for (i = 0; i < count; i++) { PAGE_WAKEUP(m[i]); vm_page_free(m[i]); } s = splhigh(); vm_page_insert(page, object, offset); splx(s); return (VM_PAGER_OK); } static int dev_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { panic("dev_pager_putpage called"); } static boolean_t dev_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { if (before != NULL) *before = 0; if (after != NULL) *after = 0; return (TRUE); } static vm_page_t dev_pager_getfake(paddr) vm_offset_t paddr; { vm_page_t m; int i; if (dev_pager_fakelist.tqh_first == NULL) { m = (vm_page_t) malloc(PAGE_SIZE * 2, M_VMPGDATA, M_WAITOK); for (i = (PAGE_SIZE * 2) / sizeof(*m); i > 0; i--) { TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq); m++; } } m = dev_pager_fakelist.tqh_first; TAILQ_REMOVE(&dev_pager_fakelist, m, pageq); m->flags = PG_BUSY | PG_FICTITIOUS; m->valid = VM_PAGE_BITS_ALL; m->dirty = 0; m->busy = 0; - m->bmapped = 0; + m->queue = PQ_NONE; m->wire_count = 1; m->phys_addr = paddr; return (m); } static void dev_pager_putfake(m) vm_page_t m; { if (!(m->flags & PG_FICTITIOUS)) panic("dev_pager_putfake: bad page"); TAILQ_INSERT_TAIL(&dev_pager_fakelist, m, pageq); } Index: head/sys/vm/swap_pager.c =================================================================== --- head/sys/vm/swap_pager.c (revision 13489) +++ head/sys/vm/swap_pager.c (revision 13490) @@ -1,1620 +1,1630 @@ /* * Copyright (c) 1994 John S. Dyson * Copyright (c) 1990 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$ * * @(#)swap_pager.c 8.9 (Berkeley) 3/21/94 - * $Id: swap_pager.c,v 1.57 1995/12/14 09:54:52 phk Exp $ + * $Id: swap_pager.c,v 1.58 1995/12/17 07:19:55 bde Exp $ */ /* * Quick hack to page to dedicated partition(s). * TODO: * Add multiprocessor locks * Deal with async writes in a better fashion */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef NPENDINGIO #define NPENDINGIO 10 #endif static int nswiodone; int swap_pager_full; extern int vm_swap_size; static int no_swap_space = 1; struct rlist *swaplist; #define MAX_PAGEOUT_CLUSTER 16 TAILQ_HEAD(swpclean, swpagerclean); typedef struct swpagerclean *swp_clean_t; static struct swpagerclean { TAILQ_ENTRY(swpagerclean) spc_list; int spc_flags; struct buf *spc_bp; vm_object_t spc_object; vm_offset_t spc_kva; int spc_count; vm_page_t spc_m[MAX_PAGEOUT_CLUSTER]; } swcleanlist[NPENDINGIO]; /* spc_flags values */ #define SPC_ERROR 0x01 #define SWB_EMPTY (-1) /* list of completed page cleans */ static struct swpclean swap_pager_done; /* list of pending page cleans */ static struct swpclean swap_pager_inuse; /* list of free pager clean structs */ static struct swpclean swap_pager_free; /* list of "named" anon region objects */ static struct pagerlst swap_pager_object_list; /* list of "unnamed" anon region objects */ struct pagerlst swap_pager_un_object_list; #define SWAP_FREE_NEEDED 0x1 /* need a swap block */ #define SWAP_FREE_NEEDED_BY_PAGEOUT 0x2 static int swap_pager_needflags; static struct pagerlst *swp_qs[] = { &swap_pager_object_list, &swap_pager_un_object_list, (struct pagerlst *) 0 }; /* * pagerops for OBJT_SWAP - "swap pager". 
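* * A pagerops vector plays the same role for pager types that a vnodeop vector plays for filesystems: a caller holding only the vm_object_t dispatches through the table entry for the object's type, along the lines of this sketch (table and field names as in the vm_pager code of this era; treat them as illustrative): * * rv = (*pagertab[object->type]->pgo_getpages)(object, m, count, reqpage);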
*/ static vm_object_t swap_pager_alloc __P((void *handle, vm_size_t size, vm_prot_t prot, vm_ooffset_t offset)); static void swap_pager_dealloc __P((vm_object_t object)); static boolean_t swap_pager_haspage __P((vm_object_t object, vm_pindex_t pindex, int *before, int *after)); static int swap_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static void swap_pager_init __P((void)); static void swap_pager_sync __P((void)); struct pagerops swappagerops = { swap_pager_init, swap_pager_alloc, swap_pager_dealloc, swap_pager_getpages, swap_pager_putpages, swap_pager_haspage, swap_pager_sync }; static int npendingio = NPENDINGIO; static int dmmin; int dmmax; static __pure int swap_pager_block_index __P((vm_pindex_t pindex)) __pure2; static __pure int swap_pager_block_offset __P((vm_pindex_t pindex)) __pure2; static daddr_t *swap_pager_diskaddr __P((vm_object_t object, vm_pindex_t pindex, int *valid)); static void swap_pager_finish __P((swp_clean_t spc)); static void swap_pager_freepage __P((vm_page_t m)); static void swap_pager_free_swap __P((vm_object_t object)); static void swap_pager_freeswapspace __P((vm_object_t object, unsigned int from, unsigned int to)); static int swap_pager_getswapspace __P((vm_object_t object, unsigned int amount, daddr_t *rtval)); static void swap_pager_iodone __P((struct buf *)); static void swap_pager_iodone1 __P((struct buf *bp)); static void swap_pager_reclaim __P((void)); static void swap_pager_ridpages __P((vm_page_t *m, int count, int reqpage)); static void swap_pager_setvalid __P((vm_object_t object, vm_offset_t offset, int valid)); static void swapsizecheck __P((void)); static inline void swapsizecheck() { if (vm_swap_size < 128 * btodb(PAGE_SIZE)) { if (swap_pager_full == 0) printf("swap_pager: out of space\n"); swap_pager_full = 1; } else if (vm_swap_size > 192 * btodb(PAGE_SIZE)) swap_pager_full = 0; } static void swap_pager_init() { TAILQ_INIT(&swap_pager_object_list); TAILQ_INIT(&swap_pager_un_object_list); /* * Initialize clean lists */ TAILQ_INIT(&swap_pager_inuse); TAILQ_INIT(&swap_pager_done); TAILQ_INIT(&swap_pager_free); /* * Calculate the swap allocation constants. 
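* * Worked numbers, assuming the usual i386 values (PAGE_SIZE 4096, DEV_BSIZE 512, CLBYTES 4096) and an SWB_NPAGES of 8: dmmin = 4096 / 512 = 8 disk blocks, i.e. one page, and dmmax = btodb(8 * 4096) * 2 = 128 blocks, so swap allocations are kept within 64KB regions.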
*/ dmmin = CLBYTES / DEV_BSIZE; dmmax = btodb(SWB_NPAGES * PAGE_SIZE) * 2; } void swap_pager_swap_init() { swp_clean_t spc; struct buf *bp; int i; /* * kva's are allocated here so that we don't need to keep doing * kmem_alloc pageables at runtime */ for (i = 0, spc = swcleanlist; i < npendingio; i++, spc++) { spc->spc_kva = kmem_alloc_pageable(pager_map, PAGE_SIZE * MAX_PAGEOUT_CLUSTER); if (!spc->spc_kva) { break; } spc->spc_bp = malloc(sizeof(*bp), M_TEMP, M_KERNEL); if (!spc->spc_bp) { kmem_free_wakeup(pager_map, spc->spc_kva, PAGE_SIZE); break; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); } } int swap_pager_swp_alloc(object, wait) vm_object_t object; int wait; { sw_blk_t swb; int nblocks; int i, j; nblocks = (object->size + SWB_NPAGES - 1) / SWB_NPAGES; swb = malloc(nblocks * sizeof(*swb), M_VMPGDATA, wait); if (swb == NULL) return 1; for (i = 0; i < nblocks; i++) { swb[i].swb_valid = 0; swb[i].swb_locked = 0; for (j = 0; j < SWB_NPAGES; j++) swb[i].swb_block[j] = SWB_EMPTY; } object->un_pager.swp.swp_nblocks = nblocks; object->un_pager.swp.swp_allocsize = 0; object->un_pager.swp.swp_blocks = swb; object->un_pager.swp.swp_poip = 0; if (object->handle != NULL) { TAILQ_INSERT_TAIL(&swap_pager_object_list, object, pager_object_list); } else { TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); } return 0; } /* * Allocate an object and associated resources. * Note that if we are called from the pageout daemon (handle == NULL) * we should not wait for memory as it could result in deadlock. */ static vm_object_t swap_pager_alloc(handle, size, prot, offset) void *handle; register vm_size_t size; vm_prot_t prot; vm_ooffset_t offset; { vm_object_t object; /* * If this is a "named" anonymous region, look it up and use the * object if it exists, otherwise allocate a new one. */ if (handle) { object = vm_pager_object_lookup(&swap_pager_object_list, handle); if (object != NULL) { vm_object_reference(object); } else { /* * XXX - there is a race condition here. Two processes * can request the same named object simultaneously, * and if one blocks for memory, the result is a disaster. * Probably quite rare, but is yet another reason to just * rip support of "named anonymous regions" out altogether.
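* * (Note on the size computation fixed below: as of this revision the pager alloc interfaces take "size" already in pages while "offset" is still in bytes, so only the offset belongs inside OFF_TO_IDX(). With 4096-byte pages, an offset of 8192 and a size of 5 pages, the old OFF_TO_IDX(offset + PAGE_SIZE - 1 + size) works out to 3 pages, while the corrected OFF_TO_IDX(offset + PAGE_SIZE - 1) + size gives the intended 7.)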
*/ object = vm_object_allocate(OBJT_SWAP, - OFF_TO_IDX(offset+ PAGE_SIZE - 1 + size)); + OFF_TO_IDX(offset + PAGE_SIZE - 1) + size); object->handle = handle; (void) swap_pager_swp_alloc(object, M_WAITOK); } } else { object = vm_object_allocate(OBJT_SWAP, - OFF_TO_IDX(offset + PAGE_SIZE - 1 + size)); + OFF_TO_IDX(offset + PAGE_SIZE - 1) + size); (void) swap_pager_swp_alloc(object, M_WAITOK); } return (object); } /* * returns disk block associated with pager and offset * additionally, as a side effect returns a flag indicating * if the block has been written */ inline static daddr_t * swap_pager_diskaddr(object, pindex, valid) vm_object_t object; vm_pindex_t pindex; int *valid; { register sw_blk_t swb; int ix; if (valid) *valid = 0; ix = pindex / SWB_NPAGES; if ((ix >= object->un_pager.swp.swp_nblocks) || (pindex >= object->size)) { return (FALSE); } swb = &object->un_pager.swp.swp_blocks[ix]; ix = pindex % SWB_NPAGES; if (valid) *valid = swb->swb_valid & (1 << ix); return &swb->swb_block[ix]; } /* * Utility routine to set the valid (written) bit for * a block associated with a pager and offset */ static void swap_pager_setvalid(object, offset, valid) vm_object_t object; vm_offset_t offset; int valid; { register sw_blk_t swb; int ix; ix = offset / SWB_NPAGES; if (ix >= object->un_pager.swp.swp_nblocks) return; swb = &object->un_pager.swp.swp_blocks[ix]; ix = offset % SWB_NPAGES; if (valid) swb->swb_valid |= (1 << ix); else swb->swb_valid &= ~(1 << ix); return; } /* * this routine allocates swap space with a fragmentation * minimization policy. */ static int swap_pager_getswapspace(object, amount, rtval) vm_object_t object; unsigned int amount; daddr_t *rtval; { unsigned location; vm_swap_size -= amount; if (!rlist_alloc(&swaplist, amount, &location)) { vm_swap_size += amount; return 0; } else { swapsizecheck(); object->un_pager.swp.swp_allocsize += amount; *rtval = location; return 1; } } /* * this routine frees swap space with a fragmentation * minimization policy. 
*/ static void swap_pager_freeswapspace(object, from, to) vm_object_t object; unsigned int from; unsigned int to; { rlist_free(&swaplist, from, to); vm_swap_size += (to - from) + 1; object->un_pager.swp.swp_allocsize -= (to - from) + 1; swapsizecheck(); } /* * this routine frees swap blocks from a specified pager */ void swap_pager_freespace(object, start, size) vm_object_t object; vm_pindex_t start; vm_size_t size; { vm_pindex_t i; int s; s = splbio(); for (i = start; i < start + size; i += 1) { int valid; daddr_t *addr = swap_pager_diskaddr(object, i, &valid); if (addr && *addr != SWB_EMPTY) { swap_pager_freeswapspace(object, *addr, *addr + btodb(PAGE_SIZE) - 1); if (valid) { swap_pager_setvalid(object, i, 0); } *addr = SWB_EMPTY; } } splx(s); } static void swap_pager_free_swap(object) vm_object_t object; { register int i, j; register sw_blk_t swb; int first_block=0, block_count=0; int s; /* * Free left over swap blocks */ s = splbio(); for (i = 0, swb = object->un_pager.swp.swp_blocks; i < object->un_pager.swp.swp_nblocks; i++, swb++) { for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY) { /* * initially the length of the run is zero */ if (block_count == 0) { first_block = swb->swb_block[j]; block_count = btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; /* * if the new block can be included into the current run */ } else if (swb->swb_block[j] == first_block + block_count) { block_count += btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; /* * terminate the previous run, and start a new one */ } else { swap_pager_freeswapspace(object, first_block, (unsigned) first_block + block_count - 1); first_block = swb->swb_block[j]; block_count = btodb(PAGE_SIZE); swb->swb_block[j] = SWB_EMPTY; } } } } if (block_count) { swap_pager_freeswapspace(object, first_block, (unsigned) first_block + block_count - 1); } splx(s); } /* * swap_pager_reclaim frees up over-allocated space from all pagers * this eliminates internal fragmentation due to allocation of space * for segments that are never swapped to. It has been written so that * it does not block until the rlist_free operation occurs; it keeps * the queues consistent.
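* * Only one process runs the scan at a time, serialized by the static in_reclaim flag and a tsleep/wakeup handshake, and each pass gathers at most MAXRECLAIM allocated-but-never-written blocks before freeing them in one batch at splbio.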
*/ /* * Maximum number of blocks (pages) to reclaim per pass */ #define MAXRECLAIM 128 static void swap_pager_reclaim() { vm_object_t object; int i, j, k; int s; int reclaimcount; static struct { int address; vm_object_t object; } reclaims[MAXRECLAIM]; static int in_reclaim; /* * allow only one process to be in the swap_pager_reclaim subroutine */ s = splbio(); if (in_reclaim) { tsleep(&in_reclaim, PSWP, "swrclm", 0); splx(s); return; } in_reclaim = 1; reclaimcount = 0; /* for each pager queue */ for (k = 0; swp_qs[k]; k++) { object = swp_qs[k]->tqh_first; while (object && (reclaimcount < MAXRECLAIM)) { /* * see if any blocks associated with a pager have been * allocated but not used (written) */ if (object->paging_in_progress == 0) { for (i = 0; i < object->un_pager.swp.swp_nblocks; i++) { sw_blk_t swb = &object->un_pager.swp.swp_blocks[i]; if (swb->swb_locked) continue; for (j = 0; j < SWB_NPAGES; j++) { if (swb->swb_block[j] != SWB_EMPTY && (swb->swb_valid & (1 << j)) == 0) { reclaims[reclaimcount].address = swb->swb_block[j]; reclaims[reclaimcount++].object = object; swb->swb_block[j] = SWB_EMPTY; if (reclaimcount >= MAXRECLAIM) goto rfinished; } } } } object = object->pager_object_list.tqe_next; } } rfinished: /* * free the blocks that have been added to the reclaim list */ for (i = 0; i < reclaimcount; i++) { swap_pager_freeswapspace(reclaims[i].object, reclaims[i].address, reclaims[i].address + btodb(PAGE_SIZE) - 1); } splx(s); in_reclaim = 0; wakeup(&in_reclaim); } /* * swap_pager_copy copies blocks from one pager to another and * destroys the source pager */ void swap_pager_copy(srcobject, srcoffset, dstobject, dstoffset, offset) vm_object_t srcobject; vm_pindex_t srcoffset; vm_object_t dstobject; vm_pindex_t dstoffset; vm_pindex_t offset; { vm_pindex_t i; int origsize; int s; if (vm_swap_size) no_swap_space = 0; origsize = srcobject->un_pager.swp.swp_allocsize; /* * remove the source object from the swap_pager internal queue */ if (srcobject->handle == NULL) { TAILQ_REMOVE(&swap_pager_un_object_list, srcobject, pager_object_list); } else { TAILQ_REMOVE(&swap_pager_object_list, srcobject, pager_object_list); } s = splbio(); while (srcobject->un_pager.swp.swp_poip) { tsleep(srcobject, PVM, "spgout", 0); } splx(s); /* * clean all of the pages that are currently active and finished */ swap_pager_sync(); s = splbio(); /* * transfer source to destination */ for (i = 0; i < dstobject->size; i += 1) { int srcvalid, dstvalid; daddr_t *srcaddrp = swap_pager_diskaddr(srcobject, i + offset + srcoffset, &srcvalid); daddr_t *dstaddrp; /* * see if the source has space allocated */ if (srcaddrp && *srcaddrp != SWB_EMPTY) { /* * if the source is valid and the dest has no space, * then copy the allocation from the source to the * dest. */ if (srcvalid) { dstaddrp = swap_pager_diskaddr(dstobject, i + dstoffset, &dstvalid); /* * if the dest already has a valid block, * deallocate the source block without * copying. */ if (!dstvalid && dstaddrp && *dstaddrp != SWB_EMPTY) { swap_pager_freeswapspace(dstobject, *dstaddrp, *dstaddrp + btodb(PAGE_SIZE) - 1); *dstaddrp = SWB_EMPTY; } if (dstaddrp && *dstaddrp == SWB_EMPTY) { *dstaddrp = *srcaddrp; *srcaddrp = SWB_EMPTY; dstobject->un_pager.swp.swp_allocsize += btodb(PAGE_SIZE); srcobject->un_pager.swp.swp_allocsize -= btodb(PAGE_SIZE); swap_pager_setvalid(dstobject, i + dstoffset, 1); } } /* * if the source is not empty at this point, then * deallocate the space.
*/ if (*srcaddrp != SWB_EMPTY) { swap_pager_freeswapspace(srcobject, *srcaddrp, *srcaddrp + btodb(PAGE_SIZE) - 1); *srcaddrp = SWB_EMPTY; } } } splx(s); /* * Free left over swap blocks */ swap_pager_free_swap(srcobject); if (srcobject->un_pager.swp.swp_allocsize) { printf("swap_pager_copy: *warning* pager with %d blocks (orig: %d)\n", srcobject->un_pager.swp.swp_allocsize, origsize); } free(srcobject->un_pager.swp.swp_blocks, M_VMPGDATA); srcobject->un_pager.swp.swp_blocks = NULL; return; } static void swap_pager_dealloc(object) vm_object_t object; { int s; /* * Remove from list right away so lookups will fail if we block for * pageout completion. */ if (object->handle == NULL) { TAILQ_REMOVE(&swap_pager_un_object_list, object, pager_object_list); } else { TAILQ_REMOVE(&swap_pager_object_list, object, pager_object_list); } /* * Wait for all pageouts to finish and remove all entries from * cleaning list. */ s = splbio(); while (object->un_pager.swp.swp_poip) { tsleep(object, PVM, "swpout", 0); } splx(s); swap_pager_sync(); /* * Free left over swap blocks */ swap_pager_free_swap(object); if (object->un_pager.swp.swp_allocsize) { printf("swap_pager_dealloc: *warning* freeing pager with %d blocks\n", object->un_pager.swp.swp_allocsize); } /* * Free swap management resources */ free(object->un_pager.swp.swp_blocks, M_VMPGDATA); object->un_pager.swp.swp_blocks = NULL; } static inline __pure int swap_pager_block_index(pindex) vm_pindex_t pindex; { return (pindex / SWB_NPAGES); } static inline __pure int swap_pager_block_offset(pindex) vm_pindex_t pindex; { return (pindex % SWB_NPAGES); } /* * swap_pager_haspage returns TRUE if the pager has data that has * been written out. */ static boolean_t swap_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { register sw_blk_t swb; int ix; if (before != NULL) *before = 0; if (after != NULL) *after = 0; ix = pindex / SWB_NPAGES; if (ix >= object->un_pager.swp.swp_nblocks) { return (FALSE); } swb = &object->un_pager.swp.swp_blocks[ix]; ix = pindex % SWB_NPAGES; if (swb->swb_block[ix] != SWB_EMPTY) { if (swb->swb_valid & (1 << ix)) { int tix; if (before) { for(tix = ix - 1; tix >= 0; --tix) { if ((swb->swb_valid & (1 << tix)) == 0) break; if ((swb->swb_block[tix] + (ix - tix) * (PAGE_SIZE/DEV_BSIZE)) != swb->swb_block[ix]) break; (*before)++; } } if (after) { for(tix = ix + 1; tix < SWB_NPAGES; tix++) { if ((swb->swb_valid & (1 << tix)) == 0) break; if ((swb->swb_block[tix] - (tix - ix) * (PAGE_SIZE/DEV_BSIZE)) != swb->swb_block[ix]) break; (*after)++; } } return TRUE; } } return (FALSE); } /* * swap_pager_freepage is a convenience routine that clears the busy * bit and deallocates a page. */ static void swap_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * swap_pager_ridpages is a convenience routine that deallocates all * but the required page. This is usually used in error returns that * need to invalidate the "extra" readahead pages.
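* * A worked example of the read clustering this supports in swap_pager_getpages() below (illustrative block numbers): with btodb(PAGE_SIZE) equal to 8, a required page at swap block 640 keeps its neighbours only if page i-1 sits at block 632, page i+1 at block 648, and all of them land in the same dmmax region; any page that breaks the run is released again via swap_pager_freepage().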
*/ static void swap_pager_ridpages(m, count, reqpage) vm_page_t *m; int count; int reqpage; { int i; for (i = 0; i < count; i++) if (i != reqpage) swap_pager_freepage(m[i]); } /* * swap_pager_iodone1 is the completion routine for both reads and async writes */ static void swap_pager_iodone1(bp) struct buf *bp; { bp->b_flags |= B_DONE; bp->b_flags &= ~B_ASYNC; wakeup(bp); } static int swap_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count, reqpage; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i; boolean_t rv; vm_offset_t kva, off[count]; swp_clean_t spc; vm_pindex_t paging_offset; int reqaddr[count]; int sequential; int first, last; int failed; int reqdskregion; object = m[reqpage]->object; paging_offset = OFF_TO_IDX(object->paging_offset); sequential = (m[reqpage]->pindex == (object->last_read + 1)); for (i = 0; i < count; i++) { vm_pindex_t fidx = m[i]->pindex + paging_offset; int ix = swap_pager_block_index(fidx); if (ix >= object->un_pager.swp.swp_nblocks) { int j; if (i <= reqpage) { swap_pager_ridpages(m, count, reqpage); return (VM_PAGER_FAIL); } for (j = i; j < count; j++) { swap_pager_freepage(m[j]); } count = i; break; } swb[i] = &object->un_pager.swp.swp_blocks[ix]; off[i] = swap_pager_block_offset(fidx); reqaddr[i] = swb[i]->swb_block[off[i]]; } /* make sure that our required input request exists */ if (reqaddr[reqpage] == SWB_EMPTY || (swb[reqpage]->swb_valid & (1 << off[reqpage])) == 0) { swap_pager_ridpages(m, count, reqpage); return (VM_PAGER_FAIL); } reqdskregion = reqaddr[reqpage] / dmmax; /* * search backwards for the first contiguous page to transfer */ failed = 0; first = 0; for (i = reqpage - 1; i >= 0; --i) { if (sequential || failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (first == 0) first = i + 1; } } /* * search forwards for the last contiguous page to transfer */ failed = 0; last = count; for (i = reqpage + 1; i < count; i++) { if (failed || (reqaddr[i] == SWB_EMPTY) || (swb[i]->swb_valid & (1 << off[i])) == 0 || (reqaddr[i] != (reqaddr[reqpage] + (i - reqpage) * btodb(PAGE_SIZE))) || ((reqaddr[i] / dmmax) != reqdskregion)) { failed = 1; swap_pager_freepage(m[i]); if (last == count) last = i; } } count = last; if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; reqaddr[i - first] = reqaddr[i]; off[i - first] = off[i]; } count -= first; reqpage -= first; } ++swb[reqpage]->swb_locked; /* * at this point: "m" is a pointer to the array of vm_page_t for * paging I/O "count" is the number of vm_page_t entries represented * by "m" "object" is the vm_object_t for I/O "reqpage" is the index * into "m" for the page actually faulted */ spc = NULL; /* we might not use an spc data structure */ if ((count == 1) && (swap_pager_free.tqh_first != NULL)) { spc = swap_pager_free.tqh_first; TAILQ_REMOVE(&swap_pager_free, spc, spc_list); kva = spc->spc_kva; bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_vnbufs.le_next = NOLIST; } else { /* * Get a swap buffer header to perform the IO */ bp = getpbuf(); kva = (vm_offset_t) bp->b_data; } /* * map our page(s) into kva for input */ pmap_qenter(kva, m, count); bp->b_flags = B_BUSY | B_READ | B_CALL | B_PAGING; bp->b_iodone = swap_pager_iodone1; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred =
bp->b_proc->p_ucred; crhold(bp->b_rcred); crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva; bp->b_blkno = reqaddr[0]; bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; pbgetvp(swapdev_vp, bp); cnt.v_swapin++; cnt.v_swappgsin += count; /* * perform the I/O */ VOP_STRATEGY(bp); /* * wait for the sync I/O to complete */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "swread", 0); } if (bp->b_flags & B_ERROR) { printf("swap_pager: I/O error - pagein failed; blkno %d, size %d, error %d\n", bp->b_blkno, bp->b_bcount, bp->b_error); rv = VM_PAGER_ERROR; } else { rv = VM_PAGER_OK; } /* * relpbuf does this, but we maintain our own buffer list also... */ if (bp->b_vp) pbrelvp(bp); splx(s); swb[reqpage]->swb_locked--; /* * remove the mapping for kernel virtual */ pmap_qremove(kva, count); if (spc) { m[reqpage]->object->last_read = m[reqpage]->pindex; if (bp->b_flags & B_WANTED) wakeup(bp); /* * if we have used an spc, we need to free it. */ if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); } else { /* * release the physical I/O buffer */ relpbuf(bp); /* * finish up input if everything is ok */ if (rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; m[i]->flags &= ~PG_ZERO; if (i != reqpage) { /* * whether or not to leave the page * activated is up in the air, but we * should put the page on a page queue * somewhere. (it already is in the * object). After some empirical * results, it is best to deactivate * the readahead pages. */ vm_page_deactivate(m[i]); /* * just in case someone was asking for * this page we now tell them that it * is ok to use */ m[i]->valid = VM_PAGE_BITS_ALL; PAGE_WAKEUP(m[i]); } } m[reqpage]->object->last_read = m[count-1]->pindex; /* * If we're out of swap space, then attempt to free * some whenever pages are brought in. We must clear * the clean flag so that the page contents will be * preserved.
*/ if (swap_pager_full) { for (i = 0; i < count; i++) { m[i]->dirty = VM_PAGE_BITS_ALL; } swap_pager_freespace(object, m[0]->pindex + paging_offset, count); } } else { swap_pager_ridpages(m, count, reqpage); } } if (rv == VM_PAGER_OK) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[reqpage])); m[reqpage]->valid = VM_PAGE_BITS_ALL; m[reqpage]->dirty = 0; } return (rv); } int swap_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { register struct buf *bp; sw_blk_t swb[count]; register int s; int i, j, ix; boolean_t rv; vm_offset_t kva, off, fidx; swp_clean_t spc; vm_pindex_t paging_pindex; int reqaddr[count]; int failed; if (vm_swap_size) no_swap_space = 0; if (no_swap_space) { for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_FAIL; return VM_PAGER_FAIL; } spc = NULL; object = m[0]->object; paging_pindex = OFF_TO_IDX(object->paging_offset); failed = 0; for (j = 0; j < count; j++) { fidx = m[j]->pindex + paging_pindex; ix = swap_pager_block_index(fidx); swb[j] = 0; if (ix >= object->un_pager.swp.swp_nblocks) { rtvals[j] = VM_PAGER_FAIL; failed = 1; continue; } else { rtvals[j] = VM_PAGER_OK; } swb[j] = &object->un_pager.swp.swp_blocks[ix]; swb[j]->swb_locked++; if (failed) { rtvals[j] = VM_PAGER_FAIL; continue; } off = swap_pager_block_offset(fidx); reqaddr[j] = swb[j]->swb_block[off]; if (reqaddr[j] == SWB_EMPTY) { daddr_t blk; int tries; int ntoget; tries = 0; s = splbio(); /* * if any other pages have been allocated in this * block, we only try to get one page. */ for (i = 0; i < SWB_NPAGES; i++) { if (swb[j]->swb_block[i] != SWB_EMPTY) break; } ntoget = (i == SWB_NPAGES) ? SWB_NPAGES : 1; /* * this code is a little conservative, but works (the * intent of this code is to allocate small chunks for * small objects) */ if ((off == 0) && ((fidx + ntoget) > object->size)) { ntoget = object->size - fidx; } retrygetspace: if (!swap_pager_full && ntoget > 1 && swap_pager_getswapspace(object, ntoget * btodb(PAGE_SIZE), &blk)) { for (i = 0; i < ntoget; i++) { swb[j]->swb_block[i] = blk + btodb(PAGE_SIZE) * i; swb[j]->swb_valid = 0; } reqaddr[j] = swb[j]->swb_block[off]; } else if (!swap_pager_getswapspace(object, btodb(PAGE_SIZE), &swb[j]->swb_block[off])) { /* * if the allocation has failed, we try to * reclaim space and retry. */ if (++tries == 1) { swap_pager_reclaim(); goto retrygetspace; } rtvals[j] = VM_PAGER_AGAIN; failed = 1; swap_pager_full = 1; } else { reqaddr[j] = swb[j]->swb_block[off]; swb[j]->swb_valid &= ~(1 << off); } splx(s); } } /* * search forwards for the last contiguous page to transfer */ failed = 0; for (i = 0; i < count; i++) { if (failed || (reqaddr[i] != reqaddr[0] + i * btodb(PAGE_SIZE)) || ((reqaddr[i] / dmmax) != (reqaddr[0] / dmmax)) || (rtvals[i] != VM_PAGER_OK)) { failed = 1; if (rtvals[i] == VM_PAGER_OK) rtvals[i] = VM_PAGER_AGAIN; } } for (i = 0; i < count; i++) { if (rtvals[i] != VM_PAGER_OK) { if (swb[i]) --swb[i]->swb_locked; } } for (i = 0; i < count; i++) if (rtvals[i] != VM_PAGER_OK) break; if (i == 0) { return VM_PAGER_AGAIN; } count = i; for (i = 0; i < count; i++) { if (reqaddr[i] == SWB_EMPTY) { printf("I/O to empty block???? -- pindex: %d, i: %d\n", m[i]->pindex, i); } } /* * For synchronous writes, we clean up all completed async pageouts.
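* * Note that the logic below insists on at least three entries on swap_pager_free before taking one (the tqh_first/tqe_next/tqe_next chain test); the tail of the list is held back as a reserve so the pageout daemon can always make progress, and other processes sleep on &swap_pager_free until swap_pager_iodone() replenishes it.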
*/ if (sync == TRUE) { swap_pager_sync(); } kva = 0; /* * get a swap pager clean data structure, block until we get it */ if (swap_pager_free.tqh_first == NULL || swap_pager_free.tqh_first->spc_list.tqe_next == NULL || swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { s = splbio(); if (curproc == pageproc) { +retryfree: /* * pageout daemon needs a swap control block */ swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT|SWAP_FREE_NEEDED; /* * if it does not get one within a short time, then * there is a potential deadlock, so we go-on trying - * to free pages. + * to free pages. It is important to block here as opposed + * to returning, thereby allowing the pageout daemon to continue. + * It is likely that pageout daemon will start suboptimally + * reclaiming vnode backed pages if we don't block. Since the + * I/O subsystem is probably already fully utilized, might as + * well wait. */ - tsleep(&swap_pager_free, PVM, "swpfre", hz/10); - swap_pager_sync(); - if (swap_pager_free.tqh_first == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { - splx(s); - return VM_PAGER_AGAIN; + if (tsleep(&swap_pager_free, PVM, "swpfre", hz/5)) { + swap_pager_sync(); + if (swap_pager_free.tqh_first == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { + splx(s); + return VM_PAGER_AGAIN; + } + } else { + /* + * we make sure that pageouts aren't taking up all of + * the free swap control blocks. + */ + swap_pager_sync(); + if (swap_pager_free.tqh_first == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { + goto retryfree; + } } - } else + } else { pagedaemon_wakeup(); - while (swap_pager_free.tqh_first == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next == NULL || - swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { - if (curproc == pageproc) { - swap_pager_needflags |= SWAP_FREE_NEEDED_BY_PAGEOUT; - if((cnt.v_free_count + cnt.v_cache_count) > cnt.v_free_reserved) - wakeup(&cnt.v_free_count); - } - - swap_pager_needflags |= SWAP_FREE_NEEDED; - tsleep(&swap_pager_free, PVM, "swpfre", 0); - if (curproc == pageproc) - swap_pager_sync(); - else + while (swap_pager_free.tqh_first == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next == NULL || + swap_pager_free.tqh_first->spc_list.tqe_next->spc_list.tqe_next == NULL) { + swap_pager_needflags |= SWAP_FREE_NEEDED; + tsleep(&swap_pager_free, PVM, "swpfre", 0); pagedaemon_wakeup(); + } } splx(s); } spc = swap_pager_free.tqh_first; TAILQ_REMOVE(&swap_pager_free, spc, spc_list); kva = spc->spc_kva; /* * map our page(s) into kva for I/O */ pmap_qenter(kva, m, count); /* * get the base I/O offset into the swap file */ for (i = 0; i < count; i++) { fidx = m[i]->pindex + paging_pindex; off = swap_pager_block_offset(fidx); /* * set the valid bit */ swb[i]->swb_valid |= (1 << off); /* * and unlock the data structure */ swb[i]->swb_locked--; } /* * Get a swap buffer header and perform the IO */ bp = spc->spc_bp; bzero(bp, sizeof *bp); bp->b_spc = spc; bp->b_vnbufs.le_next = NOLIST; bp->b_flags = B_BUSY | B_PAGING; bp->b_proc = &proc0; /* XXX (but without B_PHYS set this is ok) */ bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_data = (caddr_t) 
kva; bp->b_blkno = reqaddr[0]; pbgetvp(swapdev_vp, bp); bp->b_bcount = PAGE_SIZE * count; bp->b_bufsize = PAGE_SIZE * count; swapdev_vp->v_numoutput++; /* * If this is an async write we set up additional buffer fields and * place a "cleaning" entry on the inuse queue. */ s = splbio(); if (sync == FALSE) { spc->spc_flags = 0; spc->spc_object = object; for (i = 0; i < count; i++) spc->spc_m[i] = m[i]; spc->spc_count = count; /* * the completion routine for async writes */ bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone; bp->b_dirtyoff = 0; bp->b_dirtyend = bp->b_bcount; object->un_pager.swp.swp_poip++; TAILQ_INSERT_TAIL(&swap_pager_inuse, spc, spc_list); } else { object->un_pager.swp.swp_poip++; bp->b_flags |= B_CALL; bp->b_iodone = swap_pager_iodone1; } cnt.v_swapout++; cnt.v_swappgsout += count; /* * perform the I/O */ VOP_STRATEGY(bp); if (sync == FALSE) { if ((bp->b_flags & B_DONE) == B_DONE) { swap_pager_sync(); } splx(s); for (i = 0; i < count; i++) { rtvals[i] = VM_PAGER_PEND; } return VM_PAGER_PEND; } /* * wait for the sync I/O to complete */ while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "swwrt", 0); } if (bp->b_flags & B_ERROR) { printf("swap_pager: I/O error - pageout failed; blkno %d, size %d, error %d\n", bp->b_blkno, bp->b_bcount, bp->b_error); rv = VM_PAGER_ERROR; } else { rv = VM_PAGER_OK; } object->un_pager.swp.swp_poip--; if (object->un_pager.swp.swp_poip == 0) wakeup(object); if (bp->b_vp) pbrelvp(bp); if (bp->b_flags & B_WANTED) wakeup(bp); splx(s); /* * remove the mapping for kernel virtual */ pmap_qremove(kva, count); /* * if we have written the page, then indicate that the page is clean. */ if (rv == VM_PAGER_OK) { for (i = 0; i < count; i++) { if (rtvals[i] == VM_PAGER_OK) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; /* * optimization, if a page has been read * during the pageout process, we activate it. */ - if ((m[i]->flags & PG_ACTIVE) == 0 && + if ((m[i]->queue != PQ_ACTIVE) && ((m[i]->flags & (PG_WANTED|PG_REFERENCED)) || pmap_is_referenced(VM_PAGE_TO_PHYS(m[i])))) { vm_page_activate(m[i]); } } } } else { for (i = 0; i < count; i++) { rtvals[i] = rv; } } if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if (swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); return (rv); } static void swap_pager_sync() { register swp_clean_t spc, tspc; register int s; tspc = NULL; if (swap_pager_done.tqh_first == NULL) return; for (;;) { s = splbio(); /* * Look up and removal from done list must be done at splbio() * to avoid conflicts with swap_pager_iodone. */ while ((spc = swap_pager_done.tqh_first) != 0) { pmap_qremove(spc->spc_kva, spc->spc_count); swap_pager_finish(spc); TAILQ_REMOVE(&swap_pager_done, spc, spc_list); goto doclean; } /* * No operations done, thats all we can do for now. */ splx(s); break; /* * The desired page was found to be busy earlier in the scan * but has since completed. 
*/ doclean: if (tspc && tspc == spc) { tspc = NULL; } spc->spc_flags = 0; TAILQ_INSERT_TAIL(&swap_pager_free, spc, spc_list); if (swap_pager_needflags & SWAP_FREE_NEEDED) { wakeup(&swap_pager_free); } if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) pagedaemon_wakeup(); swap_pager_needflags &= ~(SWAP_FREE_NEEDED|SWAP_FREE_NEEDED_BY_PAGEOUT); splx(s); } return; } void swap_pager_finish(spc) register swp_clean_t spc; { vm_object_t object = spc->spc_m[0]->object; int i; object->paging_in_progress -= spc->spc_count; if ((object->paging_in_progress == 0) && (object->flags & OBJ_PIPWNT)) { object->flags &= ~OBJ_PIPWNT; wakeup(object); } /* * If no error, mark as clean and inform the pmap system. If error, * mark as dirty so we will try again. (XXX could get stuck doing * this, should give up after awhile) */ if (spc->spc_flags & SPC_ERROR) { for (i = 0; i < spc->spc_count; i++) { printf("swap_pager_finish: I/O error, clean of page %lx failed\n", (u_long) VM_PAGE_TO_PHYS(spc->spc_m[i])); } } else { for (i = 0; i < spc->spc_count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(spc->spc_m[i])); spc->spc_m[i]->dirty = 0; - if ((spc->spc_m[i]->flags & PG_ACTIVE) == 0 && + if ((spc->spc_m[i]->queue != PQ_ACTIVE) && ((spc->spc_m[i]->flags & PG_WANTED) || pmap_is_referenced(VM_PAGE_TO_PHYS(spc->spc_m[i])))) vm_page_activate(spc->spc_m[i]); } } for (i = 0; i < spc->spc_count; i++) { /* * we wakeup any processes that are waiting on these pages. */ PAGE_WAKEUP(spc->spc_m[i]); } nswiodone -= spc->spc_count; return; } /* * swap_pager_iodone */ static void swap_pager_iodone(bp) register struct buf *bp; { register swp_clean_t spc; int s; s = splbio(); spc = (swp_clean_t) bp->b_spc; TAILQ_REMOVE(&swap_pager_inuse, spc, spc_list); TAILQ_INSERT_TAIL(&swap_pager_done, spc, spc_list); if (bp->b_flags & B_ERROR) { spc->spc_flags |= SPC_ERROR; printf("swap_pager: I/O error - async %s failed; blkno %lu, size %ld, error %d\n", (bp->b_flags & B_READ) ? "pagein" : "pageout", (u_long) bp->b_blkno, bp->b_bcount, bp->b_error); } if (bp->b_vp) pbrelvp(bp); if (bp->b_flags & B_WANTED) wakeup(bp); if (bp->b_rcred != NOCRED) crfree(bp->b_rcred); if (bp->b_wcred != NOCRED) crfree(bp->b_wcred); nswiodone += spc->spc_count; if (--spc->spc_object->un_pager.swp.swp_poip == 0) { wakeup(spc->spc_object); } if ((swap_pager_needflags & SWAP_FREE_NEEDED) || swap_pager_inuse.tqh_first == 0) { swap_pager_needflags &= ~SWAP_FREE_NEEDED; wakeup(&swap_pager_free); } if( swap_pager_needflags & SWAP_FREE_NEEDED_BY_PAGEOUT) { swap_pager_needflags &= ~SWAP_FREE_NEEDED_BY_PAGEOUT; pagedaemon_wakeup(); } if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } if ((swap_pager_inuse.tqh_first == NULL) || ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min && nswiodone + cnt.v_free_count + cnt.v_cache_count >= cnt.v_free_min)) { pagedaemon_wakeup(); } splx(s); } Index: head/sys/vm/vm_fault.c =================================================================== --- head/sys/vm/vm_fault.c (revision 13489) +++ head/sys/vm/vm_fault.c (revision 13490) @@ -1,1000 +1,998 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_fault.c,v 1.38 1995/12/07 12:48:10 davidg Exp $ + * $Id: vm_fault.c,v 1.39 1995/12/11 04:58:06 dyson Exp $ */ /* * Page fault handling module. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int vm_fault_additional_pages __P((vm_page_t, int, int, vm_page_t *, int *)); #define VM_FAULT_READ_AHEAD 4 #define VM_FAULT_READ_BEHIND 3 #define VM_FAULT_READ (VM_FAULT_READ_AHEAD+VM_FAULT_READ_BEHIND+1) /* * vm_fault: * * Handle a page fault occuring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. 
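 *
 * For orientation, a machine-dependent trap handler invokes this
 * roughly as follows (a sketch only; the locals are hypothetical
 * and the real trap code differs in detail):
 *
 *	vm_offset_t va = trunc_page(fault_addr);
 *	int rv = vm_fault(&p->p_vmspace->vm_map, va,
 *	    write_fault ? VM_PROT_WRITE : VM_PROT_READ, FALSE);
 *	if (rv != KERN_SUCCESS)
 *		(the fault is fatal; post a signal to the process)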
* * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault(map, vaddr, fault_type, change_wiring) vm_map_t map; vm_offset_t vaddr; vm_prot_t fault_type; boolean_t change_wiring; { vm_object_t first_object; vm_pindex_t first_pindex; vm_map_entry_t entry; register vm_object_t object; register vm_pindex_t pindex; vm_page_t m; vm_page_t first_m; vm_prot_t prot; int result; boolean_t wired; boolean_t su; boolean_t lookup_still_valid; vm_page_t old_m; vm_object_t next_object; vm_page_t marray[VM_FAULT_READ]; int hardfault = 0; struct vnode *vp = NULL; cnt.v_vm_faults++; /* needs lock XXX */ /* * Recovery actions */ #define FREE_PAGE(m) { \ PAGE_WAKEUP(m); \ vm_page_free(m); \ } #define RELEASE_PAGE(m) { \ PAGE_WAKEUP(m); \ - if ((m->flags & PG_ACTIVE) == 0) vm_page_activate(m); \ + if (m->queue != PQ_ACTIVE) vm_page_activate(m); \ } #define UNLOCK_MAP { \ if (lookup_still_valid) { \ vm_map_lookup_done(map, entry); \ lookup_still_valid = FALSE; \ } \ } #define UNLOCK_THINGS { \ vm_object_pip_wakeup(object); \ if (object != first_object) { \ FREE_PAGE(first_m); \ vm_object_pip_wakeup(first_object); \ } \ UNLOCK_MAP; \ if (vp != NULL) VOP_UNLOCK(vp); \ } #define UNLOCK_AND_DEALLOCATE { \ UNLOCK_THINGS; \ vm_object_deallocate(first_object); \ } RetryFault:; /* * Find the backing store object and offset into it to begin the * search. */ if ((result = vm_map_lookup(&map, vaddr, fault_type, &entry, &first_object, &first_pindex, &prot, &wired, &su)) != KERN_SUCCESS) { return (result); } vp = vnode_pager_lock(first_object); lookup_still_valid = TRUE; if (wired) fault_type = prot; first_m = NULL; /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. */ first_object->ref_count++; first_object->paging_in_progress++; /* * INVARIANTS (through entire routine): * * 1) At all times, we must either have the object lock or a busy * page in some object to prevent some other process from trying to * bring in the same page. * * Note that we cannot hold any locks during the pager access or when * waiting for memory, so we use a busy page then. * * Note also that we aren't as concerned about more than one thead * attempting to pager_data_unlock the same page at once, so we don't * hold the page as busy then, but do record the highest unlock value * so far. [Unlock requests may also be delivered out of order.] * * 2) Once we have a busy page, we must remove it from the pageout * queues, so that the pageout daemon will not grab it away. * * 3) To prevent another process from racing us down the shadow chain * and entering a new page in the top object before we do, we must * keep a busy page in the top object while following the shadow * chain. * * 4) We must increment paging_in_progress on any object for which * we have a busy page, to prevent vm_object_collapse from removing * the busy page without our noticing. */ /* * Search for the page at object/offset. */ object = first_object; pindex = first_pindex; /* * See whether this page is resident */ while (TRUE) { m = vm_page_lookup(object, pindex); if (m != NULL) { /* * If the page is being brought in, wait for it and * then retry. 
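 * The spl dance below guards against a lost wakeup: the busy bits
 * are re-tested at splhigh() before PG_WANTED is set and the process
 * sleeps, since otherwise the page could be un-busied (and
 * PAGE_WAKEUP() run) between the first test and the tsleep(),
 * leaving the faulting process asleep with nobody left to wake it.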
*/ if ((m->flags & PG_BUSY) || m->busy) { int s; UNLOCK_THINGS; s = splhigh(); if ((m->flags & PG_BUSY) || m->busy) { m->flags |= PG_WANTED | PG_REFERENCED; cnt.v_intrans++; tsleep(m, PSWP, "vmpfw", 0); } splx(s); vm_object_deallocate(first_object); goto RetryFault; } /* * Mark page busy for other processes, and the pagedaemon. */ m->flags |= PG_BUSY; - if ((m->flags & PG_CACHE) && + if ((m->queue == PQ_CACHE) && (cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_reserved) { UNLOCK_AND_DEALLOCATE; VM_WAIT; PAGE_WAKEUP(m); goto RetryFault; } - if (m->valid && ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && - m->object != kernel_object && m->object != kmem_object) { + if (m->valid && + ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) && + m->object != kernel_object && m->object != kmem_object) { goto readrest; } break; } if (((object->type != OBJT_DEFAULT) && (!change_wiring || wired)) || (object == first_object)) { if (pindex >= object->size) { UNLOCK_AND_DEALLOCATE; return (KERN_PROTECTION_FAILURE); } /* * Allocate a new page for this object/offset pair. */ m = vm_page_alloc(object, pindex, - vp?VM_ALLOC_NORMAL:(VM_ALLOC_NORMAL|VM_ALLOC_ZERO)); + vp?VM_ALLOC_NORMAL:VM_ALLOC_ZERO); if (m == NULL) { UNLOCK_AND_DEALLOCATE; VM_WAIT; goto RetryFault; } } readrest: if (object->type != OBJT_DEFAULT && (!change_wiring || wired)) { int rv; int faultcount; int reqpage; /* * now we find out if any other pages should be paged * in at this time this routine checks to see if the * pages surrounding this fault reside in the same * object as the page for this fault. If they do, * then they are faulted in also into the object. The * array "marray" returned contains an array of * vm_page_t structs where one of them is the * vm_page_t passed to the routine. The reqpage * return value is the index into the marray for the * vm_page_t passed to the routine. */ faultcount = vm_fault_additional_pages( m, VM_FAULT_READ_BEHIND, VM_FAULT_READ_AHEAD, marray, &reqpage); /* * Call the pager to retrieve the data, if any, after * releasing the lock on the map. */ UNLOCK_MAP; rv = faultcount ? vm_pager_get_pages(object, marray, faultcount, reqpage) : VM_PAGER_FAIL; if (rv == VM_PAGER_OK) { /* * Found the page. Leave it busy while we play * with it. */ /* * Relookup in case pager changed page. Pager * is responsible for disposition of old page * if moved. */ m = vm_page_lookup(object, pindex); if( !m) { UNLOCK_AND_DEALLOCATE; goto RetryFault; } hardfault++; break; } /* * Remove the bogus page (which does not exist at this * object/offset); before doing so, we must get back * our object lock to preserve our invariant. * * Also wake up any other process that may want to bring * in this page. * * If this is the top-level object, we must leave the * busy page to prevent another process from rushing * past us, and inserting the page in that object at * the same time that we are. */ if (rv == VM_PAGER_ERROR) printf("vm_fault: pager input (probably hardware) error, PID %d failure\n", curproc->p_pid); /* * Data outside the range of the pager or an I/O error */ /* * XXX - the check for kernel_map is a kludge to work * around having the machine panic on a kernel space * fault w/ I/O error. */ if (((map != kernel_map) && (rv == VM_PAGER_ERROR)) || (rv == VM_PAGER_BAD)) { FREE_PAGE(m); UNLOCK_AND_DEALLOCATE; return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE); } if (object != first_object) { FREE_PAGE(m); /* * XXX - we cannot just fall out at this * point, m has been freed and is invalid! 
*/ } } /* * We get here if the object has default pager (or unwiring) or the * pager doesn't have the page. */ if (object == first_object) first_m = m; /* * Move on to the next object. Lock the next object before * unlocking the current one. */ pindex += OFF_TO_IDX(object->backing_object_offset); next_object = object->backing_object; if (next_object == NULL) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (object != first_object) { vm_object_pip_wakeup(object); object = first_object; pindex = first_pindex; m = first_m; } first_m = NULL; if ((m->flags & PG_ZERO) == 0) vm_page_zero_fill(m); m->valid = VM_PAGE_BITS_ALL; cnt.v_zfod++; break; } else { if (object != first_object) { vm_object_pip_wakeup(object); } object = next_object; object->paging_in_progress++; } } if ((m->flags & PG_BUSY) == 0) panic("vm_fault: not busy after main loop"); /* * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock * is held.] */ old_m = m; /* save page that would be copied */ /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (object != first_object) { /* * We only really need to copy if we want to write it. */ if (fault_type & VM_PROT_WRITE) { /* * If we try to collapse first_object at this point, * we may deadlock when we try to get the lock on an * intermediate object (since we have the bottom * object locked). We can't unlock the bottom object, * because the page we found may move (by collapse) if * we do. * * Instead, we first copy the page. Then, when we have * no more use for the bottom object, we unlock it and * try to collapse. * * Note that we copy the page even if we didn't need * to... that's the breaks. */ /* * We already have an empty page in first_object - use * it. */ vm_page_copy(m, first_m); first_m->valid = VM_PAGE_BITS_ALL; /* * If another map is truly sharing this page with us, * we have to flush all uses of the original page, * since we can't distinguish those which want the * original from those which need the new copy. * * XXX If we know that only one map has access to this * page, then we could avoid the pmap_page_protect() * call. */ - if ((m->flags & PG_ACTIVE) == 0) + if (m->queue != PQ_ACTIVE) vm_page_activate(m); - vm_page_protect(m, VM_PROT_NONE); /* * We no longer need the old page or object. */ PAGE_WAKEUP(m); vm_object_pip_wakeup(object); /* * Only use the new page below... */ cnt.v_cow_faults++; m = first_m; object = first_object; pindex = first_pindex; /* * Now that we've gotten the copy out of the way, * let's try to collapse the top object. * * But we have to play ugly games with * paging_in_progress to do that... */ vm_object_pip_wakeup(object); vm_object_collapse(object); object->paging_in_progress++; } else { prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!lookup_still_valid) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; /* * Since map entries may be pageable, make sure we can take a * page fault on them. */ /* * To avoid trying to write_lock the map while another process * has it read_locked (in vm_map_pageable), we do not try for * write permission. If the page is still writable, we will * get write permission. If it is not, or has been marked * needs_copy, we enter the mapping without write permission, * and will merely take another fault. 
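 * Concretely, the protections from the re-lookup are intersected
 * with the original grant (prot &= retry_prot below), so a mapping
 * that lost write permission while the map was unlocked is simply
 * entered read-only and pays one extra fault later, instead of
 * requiring a write-lock on the map here.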
*/ result = vm_map_lookup(&map, vaddr, fault_type & ~VM_PROT_WRITE, &entry, &retry_object, &retry_pindex, &retry_prot, &wired, &su); /* * If we don't need the page any longer, put it on the active * list (the easiest thing to do here). If no one needs it, * pageout will grab it eventually. */ if (result != KERN_SUCCESS) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; return (result); } lookup_still_valid = TRUE; if ((retry_object != first_object) || (retry_pindex != first_pindex)) { RELEASE_PAGE(m); UNLOCK_AND_DEALLOCATE; goto RetryFault; } /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ prot &= retry_prot; } /* * (the various bits we're fiddling with here are locked by the * object's lock) */ /* * It's critically important that a wired-down page be faulted only * once in each map for which it is wired. */ /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter may cause other faults. We don't put the page * back on the active queue until later so that the page-out daemon * won't find us (yet). */ if (prot & VM_PROT_WRITE) { m->flags |= PG_WRITEABLE; m->object->flags |= OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY; /* * If the fault is a write, we know that this page is being * written NOW. This will save on the pmap_is_modified() calls * later. */ if (fault_type & VM_PROT_WRITE) { m->dirty = VM_PAGE_BITS_ALL; } } m->flags |= PG_MAPPED|PG_REFERENCED; m->flags &= ~PG_ZERO; pmap_enter(map->pmap, vaddr, VM_PAGE_TO_PHYS(m), prot, wired); #if 0 if (change_wiring == 0 && wired == 0) pmap_prefault(map->pmap, vaddr, entry, first_object); #endif /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if (change_wiring) { if (wired) vm_page_wire(m); else vm_page_unwire(m); } else { - if ((m->flags & PG_ACTIVE) == 0) + if (m->queue != PQ_ACTIVE) vm_page_activate(m); } if (curproc && (curproc->p_flag & P_INMEM) && curproc->p_stats) { if (hardfault) { curproc->p_stats->p_ru.ru_majflt++; } else { curproc->p_stats->p_ru.ru_minflt++; } } - if ((m->flags & PG_BUSY) == 0) - printf("page not busy: %d\n", m->pindex); /* * Unlock everything, and return */ PAGE_WAKEUP(m); UNLOCK_AND_DEALLOCATE; return (KERN_SUCCESS); } /* * vm_fault_wire: * * Wire down a range of virtual addresses in a map. */ int vm_fault_wire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va; register pmap_t pmap; int rv; pmap = vm_map_pmap(map); /* * Inform the physical mapping system that the range of addresses may * not fault, so that page tables and such can be locked down as well. */ pmap_pageable(pmap, start, end, FALSE); /* * We simulate a fault to get the page and enter it in the physical * map. */ for (va = start; va < end; va += PAGE_SIZE) { while( curproc != pageproc && (cnt.v_free_count <= cnt.v_pageout_free_min)) VM_WAIT; rv = vm_fault(map, va, VM_PROT_READ|VM_PROT_WRITE, TRUE); if (rv) { if (va != start) vm_fault_unwire(map, start, va); return (rv); } } return (KERN_SUCCESS); } /* * vm_fault_unwire: * * Unwire a range of virtual addresses in a map. 
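 *
 * The expected pairing with vm_fault_wire() is (a sketch; start and
 * end are hypothetical page-aligned bounds):
 *
 *	if ((rv = vm_fault_wire(map, start, end)) != KERN_SUCCESS)
 *		return (rv);
 *	... use the wired range ...
 *	vm_fault_unwire(map, start, end);
 *
 * vm_fault_wire() unwires [start, va) itself on failure, so callers
 * never see a half-wired range.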
*/ void vm_fault_unwire(map, start, end) vm_map_t map; vm_offset_t start, end; { register vm_offset_t va, pa; register pmap_t pmap; pmap = vm_map_pmap(map); /* * Since the pages are wired down, we must be able to get their * mappings from the physical map system. */ for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa == (vm_offset_t) 0) { panic("unwire: page not in pmap"); } pmap_change_wiring(pmap, va, FALSE); vm_page_unwire(PHYS_TO_VM_PAGE(pa)); } /* * Inform the physical mapping system that the range of addresses may * fault, so that page tables and such may be unwired themselves. */ pmap_pageable(pmap, start, end, TRUE); } /* * Routine: * vm_fault_copy_entry * Function: * Copy all of the pages from a wired-down map entry to another. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry) vm_map_t dst_map; vm_map_t src_map; vm_map_entry_t dst_entry; vm_map_entry_t src_entry; { vm_object_t dst_object; vm_object_t src_object; vm_ooffset_t dst_offset; vm_ooffset_t src_offset; vm_prot_t prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; #ifdef lint src_map++; #endif /* lint */ src_object = src_entry->object.vm_object; src_offset = src_entry->offset; /* * Create the top-level object for the destination entry. (Doesn't * actually shadow anything - we copy the pages directly.) */ dst_object = vm_object_allocate(OBJT_DEFAULT, (vm_size_t) OFF_TO_IDX(dst_entry->end - dst_entry->start)); dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; prot = dst_entry->max_protection; /* * Loop through all of the pages in the entry's range, copying each * one from the source object (it should be there) to the destination * object. */ for (vaddr = dst_entry->start, dst_offset = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) { /* * Allocate a page in the destination object */ do { dst_m = vm_page_alloc(dst_object, OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_WAIT; } } while (dst_m == NULL); /* * Find the page in the source object, and copy it in. * (Because the source is wired down, the page will be in * memory.) */ src_m = vm_page_lookup(src_object, OFF_TO_IDX(dst_offset + src_offset)); if (src_m == NULL) panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); /* * Enter it in the pmap... */ dst_m->flags |= PG_WRITEABLE|PG_MAPPED; dst_m->flags &= ~PG_ZERO; pmap_enter(dst_map->pmap, vaddr, VM_PAGE_TO_PHYS(dst_m), prot, FALSE); /* * Mark it no longer busy, and put it on the active list. */ vm_page_activate(dst_m); PAGE_WAKEUP(dst_m); } } /* * This routine checks around the requested page for other pages that * might be able to be faulted in. This routine brackets the viable * pages for the pages to be paged in. 
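 * A worked example (hypothetical numbers): for pindex 10 with
 * rbehind 3 and rahead 4, the candidate window is pindexes 7..14,
 * i.e. rbehind + rahead + 1 = 8 pages.  The backward scan shrinks
 * the window at the first already-resident page, the forward scan
 * stops at the first resident page, and the cluster is abandoned in
 * favor of the single requested page when free memory is tight or
 * the pager cannot supply the neighbors (cbehind/cahead).  With
 * nothing clamped, *reqpage comes back as 3, the requested page's
 * index within marray.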
* * Inputs: * m, rbehind, rahead * * Outputs: * marray (array of vm_page_t), reqpage (index of requested page) * * Return value: * number of pages in marray */ int vm_fault_additional_pages(m, rbehind, rahead, marray, reqpage) vm_page_t m; int rbehind; int rahead; vm_page_t *marray; int *reqpage; { int i; vm_object_t object; vm_pindex_t pindex, startpindex, endpindex, tpindex; vm_offset_t size; vm_page_t rtm; int treqpage; int cbehind, cahead; object = m->object; pindex = m->pindex; /* * if the requested page is not available, then give up now */ if (!vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + pindex, &cbehind, &cahead)) return 0; if ((cbehind == 0) && (cahead == 0)) { *reqpage = 0; marray[0] = m; return 1; } if (rahead > cahead) { rahead = cahead; } if (rbehind > cbehind) { rbehind = cbehind; } /* * try to do any readahead that we might have free pages for. */ if ((rahead + rbehind) > ((cnt.v_free_count + cnt.v_cache_count) - cnt.v_free_reserved)) { pagedaemon_wakeup(); *reqpage = 0; marray[0] = m; return 1; } /* * scan backward for the read behind pages -- in memory or on disk not * in same object */ tpindex = pindex - 1; if (tpindex < pindex) { if (rbehind > pindex) rbehind = pindex; startpindex = pindex - rbehind; while (tpindex >= startpindex) { if (vm_page_lookup( object, tpindex)) { startpindex = tpindex + 1; break; } if (tpindex == 0) break; tpindex -= 1; } } else { startpindex = pindex; } /* * scan forward for the read ahead pages -- in memory or on disk not * in same object */ tpindex = pindex + 1; endpindex = pindex + (rahead + 1); if (endpindex > object->size) endpindex = object->size; while (tpindex < endpindex) { if ( vm_page_lookup(object, tpindex)) { break; } tpindex += 1; } endpindex = tpindex; /* calculate number of bytes of pages */ size = endpindex - startpindex; /* calculate the page offset of the required page */ treqpage = pindex - startpindex; /* see if we have space (again) */ if ((cnt.v_free_count + cnt.v_cache_count) > (cnt.v_free_reserved + size)) { /* * get our pages and don't block for them */ for (i = 0; i < size; i++) { if (i != treqpage) { rtm = vm_page_alloc(object, startpindex + i, VM_ALLOC_NORMAL); if (rtm == NULL) { if (i < treqpage) { int j; for (j = 0; j < i; j++) { FREE_PAGE(marray[j]); } *reqpage = 0; marray[0] = m; return 1; } else { size = i; *reqpage = treqpage; return size; } } marray[i] = rtm; } else { marray[i] = m; } } *reqpage = treqpage; return size; } *reqpage = 0; marray[0] = m; return 1; } Index: head/sys/vm/vm_glue.c =================================================================== --- head/sys/vm/vm_glue.c (revision 13489) +++ head/sys/vm/vm_glue.c (revision 13490) @@ -1,582 +1,633 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_glue.c 8.6 (Berkeley) 1/5/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_glue.c,v 1.33 1995/12/14 09:54:57 phk Exp $ + * $Id: vm_glue.c,v 1.35 1996/01/04 21:13:14 wollman Exp $ */ #include "opt_sysvipc.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include #include #include #include /* * System initialization * * Note: proc0 from proc.h */ static void vm_init_limits __P((void *)); SYSINIT(vm_limits, SI_SUB_VM_CONF, SI_ORDER_FIRST, vm_init_limits, &proc0) /* * THIS MUST BE THE LAST INITIALIZATION ITEM!!! * * Note: run scheduling should be divorced from the vm system. */ static void scheduler __P((void *)); SYSINIT(scheduler, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, scheduler, NULL) static void swapout __P((struct proc *)); extern char kstack[]; /* vm_map_t upages_map; */ int kernacc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_offset_t saddr, eaddr; vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; saddr = trunc_page(addr); eaddr = round_page(addr + len); rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot); return (rv == TRUE); } int useracc(addr, len, rw) caddr_t addr; int len, rw; { boolean_t rv; vm_prot_t prot = rw == B_READ ? 
VM_PROT_READ : VM_PROT_WRITE; /* * XXX - check separately to disallow access to user area and user * page tables - they are in the map. * * XXX - VM_MAXUSER_ADDRESS is an end address, not a max. It was once * only used (as an end address) in trap.c. Use it as an end address * here too. This bogusness has spread. I just fixed where it was * used as a max in vm_mmap.c. */ if ((vm_offset_t) addr + len > /* XXX */ VM_MAXUSER_ADDRESS || (vm_offset_t) addr + len < (vm_offset_t) addr) { return (FALSE); } rv = vm_map_check_protection(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr + len), prot); return (rv == TRUE); } #ifdef KGDB /* * Change protections on kernel pages from addr to addr+len * (presumably so debugger can plant a breakpoint). * All addresses are assumed to reside in the Sysmap, */ chgkprot(addr, len, rw) register caddr_t addr; int len, rw; { vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE; vm_map_protect(kernel_map, trunc_page(addr), round_page(addr + len), prot, FALSE); } #endif void vslock(addr, len) caddr_t addr; u_int len; { vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr + len), FALSE); } void vsunlock(addr, len, dirtied) caddr_t addr; u_int len; int dirtied; { #ifdef lint dirtied++; #endif /* lint */ vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr), round_page(addr + len), TRUE); } /* * Implement fork's actions on an address space. * Here we arrange for the address space to be copied or referenced, * allocate a user struct (pcb and kernel stack), then call the * machine-dependent layer to fill those in and make the new process * ready to run. * NOTE: the kernel stack may be at a different location in the child * process, and thus addresses of automatic variables may be invalid * after cpu_fork returns in the child process. We do nothing here * after cpu_fork returns. */ int vm_fork(p1, p2, isvfork) register struct proc *p1, *p2; int isvfork; { register struct user *up; - vm_offset_t addr, ptaddr; + vm_offset_t addr, ptaddr, ptpa; int error, i; - struct vm_map *vp; + vm_map_t vp; + pmap_t pvp; + vm_page_t stkm; while ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) { VM_WAIT; } /* * avoid copying any of the parent's pagetables or other per-process * objects that reside in the map by marking all of them * non-inheritable */ (void) vm_map_inherit(&p1->p_vmspace->vm_map, UPT_MIN_ADDRESS - UPAGES * PAGE_SIZE, VM_MAX_ADDRESS, VM_INHERIT_NONE); p2->p_vmspace = vmspace_fork(p1->p_vmspace); #ifdef SYSVSHM if (p1->p_vmspace->vm_shm) shmfork(p1, p2, isvfork); #endif /* * Allocate a wired-down (for now) pcb and kernel stack for the * process */ addr = (vm_offset_t) kstack; vp = &p2->p_vmspace->vm_map; + pvp = &p2->p_vmspace->vm_pmap; /* get new pagetables and kernel stack */ - (void) vm_map_find(vp, NULL, 0, &addr, UPT_MAX_ADDRESS - addr, FALSE); + (void) vm_map_find(vp, NULL, 0, &addr, UPT_MAX_ADDRESS - addr, FALSE, + VM_PROT_ALL, VM_PROT_ALL, 0); - /* force in the page table encompassing the UPAGES */ - ptaddr = trunc_page((u_int) vtopte(addr)); - error = vm_map_pageable(vp, ptaddr, ptaddr + PAGE_SIZE, FALSE); - if (error) - panic("vm_fork: wire of PT failed. error=%d", error); - - /* and force in (demand-zero) the UPAGES */ - error = vm_map_pageable(vp, addr, addr + UPAGES * PAGE_SIZE, FALSE); - if (error) - panic("vm_fork: wire of UPAGES failed. 
error=%d", error); - /* get a kernel virtual address for the UPAGES for this proc */ up = (struct user *) kmem_alloc_pageable(u_map, UPAGES * PAGE_SIZE); if (up == NULL) panic("vm_fork: u_map allocation failed"); - /* and force-map the upages into the kernel pmap */ - for (i = 0; i < UPAGES; i++) - pmap_kenter(((vm_offset_t) up) + PAGE_SIZE * i, - pmap_extract(vp->pmap, addr + PAGE_SIZE * i)); + p2->p_vmspace->vm_upages_obj = vm_object_allocate( OBJT_DEFAULT, + UPAGES); + ptaddr = trunc_page((u_int) vtopte(kstack)); + (void) vm_fault(vp, ptaddr, VM_PROT_READ|VM_PROT_WRITE, FALSE); + ptpa = pmap_extract(pvp, ptaddr); + if (ptpa == 0) { + panic("vm_fork: no pte for UPAGES"); + } + stkm = PHYS_TO_VM_PAGE(ptpa); + vm_page_hold(stkm); + + for(i=0;ip_vmspace->vm_upages_obj, i, VM_ALLOC_ZERO)) == NULL) { + VM_WAIT; + } + + vm_page_wire(m); + m->flags &= ~PG_BUSY; + pmap_enter( pvp, (vm_offset_t) kstack + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m), VM_PROT_READ|VM_PROT_WRITE, 1); + pmap_kenter(((vm_offset_t) up) + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m)); + if ((m->flags & PG_ZERO) == 0) + bzero(((caddr_t) up) + i * PAGE_SIZE, PAGE_SIZE); + m->flags &= ~PG_ZERO; + m->valid = VM_PAGE_BITS_ALL; + } + vm_page_unhold(stkm); + p2->p_addr = up; /* * p_stats and p_sigacts currently point at fields in the user struct * but not at &u, instead at p_addr. Copy p_sigacts and parts of * p_stats; zero the rest of p_stats (statistics). */ p2->p_stats = &up->u_stats; p2->p_sigacts = &up->u_sigacts; up->u_sigacts = *p1->p_sigacts; bzero(&up->u_stats.pstat_startzero, (unsigned) ((caddr_t) &up->u_stats.pstat_endzero - (caddr_t) &up->u_stats.pstat_startzero)); bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy, ((caddr_t) &up->u_stats.pstat_endcopy - (caddr_t) &up->u_stats.pstat_startcopy)); /* * cpu_fork will copy and update the kernel stack and pcb, and make * the child ready to run. It marks the child so that it can return * differently than the parent. It returns twice, once in the parent * process and once in the child. */ return (cpu_fork(p1, p2)); } /* * Set default limits for VM system. * Called for proc 0, and then inherited by all others. * * XXX should probably act directly on proc0. */ static void vm_init_limits(udata) void *udata; { register struct proc *p = udata; int rss_limit; /* * Set up the initial limits on process VM. Set the maximum resident * set size to be half of (reasonably) available memory. Since this * is a soft limit, it comes into effect only when the system is out * of memory - half of main memory helps to favor smaller processes, * and reduces thrashing of the object cache. */ p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ; p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ; p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ; p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ; /* limit the limit to no less than 2MB */ rss_limit = max(cnt.v_free_count, 512); p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(rss_limit); p->p_rlimit[RLIMIT_RSS].rlim_max = RLIM_INFINITY; } void faultin(p) struct proc *p; { vm_offset_t i; vm_offset_t ptaddr; int s; if ((p->p_flag & P_INMEM) == 0) { - vm_map_t map; + vm_map_t map = &p->p_vmspace->vm_map; + pmap_t pmap = &p->p_vmspace->vm_pmap; + vm_page_t stkm, m; + vm_offset_t ptpa; int error; ++p->p_lock; - map = &p->p_vmspace->vm_map; - /* force the page table encompassing the kernel stack (upages) */ ptaddr = trunc_page((u_int) vtopte(kstack)); - error = vm_map_pageable(map, ptaddr, ptaddr + PAGE_SIZE, FALSE); - if (error) - panic("faultin: wire of PT failed. 
error=%d", error); + (void) vm_fault(map, ptaddr, VM_PROT_READ|VM_PROT_WRITE, FALSE); + ptpa = pmap_extract(&p->p_vmspace->vm_pmap, ptaddr); + if (ptpa == 0) { + panic("vm_fork: no pte for UPAGES"); + } + stkm = PHYS_TO_VM_PAGE(ptpa); + vm_page_hold(stkm); - /* wire in the UPAGES */ - error = vm_map_pageable(map, (vm_offset_t) kstack, - (vm_offset_t) kstack + UPAGES * PAGE_SIZE, FALSE); - if (error) - panic("faultin: wire of UPAGES failed. error=%d", error); + for(i=0;ip_vmspace->vm_pmap, - (vm_offset_t) kstack + off); +retry: + if ((m = vm_page_lookup(p->p_vmspace->vm_upages_obj, i)) == NULL) { + if ((m = vm_page_alloc(p->p_vmspace->vm_upages_obj, i, VM_ALLOC_NORMAL)) == NULL) { + VM_WAIT; + goto retry; + } + } else { + if ((m->flags & PG_BUSY) || m->busy) { + m->flags |= PG_WANTED; + tsleep(m, PVM, "swinuw",0); + goto retry; + } + } + vm_page_wire(m); + if (m->valid == VM_PAGE_BITS_ALL) + m->flags &= ~PG_BUSY; + splx(s); - if (pa == 0) - panic("faultin: missing page for UPAGES\n"); - - pmap_kenter(((vm_offset_t) p->p_addr) + off, pa); + pmap_enter( pmap, (vm_offset_t) kstack + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m), VM_PROT_READ|VM_PROT_WRITE, TRUE); + pmap_kenter(((vm_offset_t) p->p_addr) + i * PAGE_SIZE, + VM_PAGE_TO_PHYS(m)); + if (m->valid != VM_PAGE_BITS_ALL) { + int rv; + rv = vm_pager_get_pages(p->p_vmspace->vm_upages_obj, + &m, 1, 0); + if (rv != VM_PAGER_OK) + panic("faultin: cannot get upages for proc: %d\n", p->p_pid); + m->valid = VM_PAGE_BITS_ALL; + m->flags &= ~PG_BUSY; + } } + vm_page_unhold(stkm); + s = splhigh(); if (p->p_stat == SRUN) setrunqueue(p); p->p_flag |= P_INMEM; /* undo the effect of setting SLOCK above */ --p->p_lock; splx(s); } } /* * This swapin algorithm attempts to swap-in processes only if there * is enough space for them. Of course, if a process waits for a long * time, it will be swapped in anyway. */ /* ARGSUSED*/ static void scheduler(dummy) void *dummy; { register struct proc *p; register int pri; struct proc *pp; int ppri; loop: while ((cnt.v_free_count + cnt.v_cache_count) < (cnt.v_free_reserved + UPAGES + 2)) { VM_WAIT; } pp = NULL; ppri = INT_MIN; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { - if (p->p_stat == SRUN && (p->p_flag & (P_INMEM | P_SWAPPING)) == 0) { + if (p->p_stat == SRUN && + (p->p_flag & (P_INMEM | P_SWAPPING)) == 0) { int mempri; pri = p->p_swtime + p->p_slptime - p->p_nice * 8; mempri = pri > 0 ? pri : 0; /* * if this process is higher priority and there is * enough space, then select this process instead of * the previous selection. */ if (pri > ppri) { pp = p; ppri = pri; } } } /* * Nothing to do, back to sleep */ if ((p = pp) == NULL) { tsleep(&proc0, PVM, "sched", 0); goto loop; } /* * We would like to bring someone in. (only if there is space). */ faultin(p); p->p_swtime = 0; goto loop; } #define swappable(p) \ (((p)->p_lock == 0) && \ ((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|P_WEXIT|P_PHYSIO|P_SWAPPING)) == P_INMEM) extern int vm_pageout_free_min; /* * Swapout is driven by the pageout daemon. Very simple, we find eligible * procs and unwire their u-areas. We try to always "swap" at least one * process in case we need the room for a swapin. * If any procs have been sleeping/stopped for at least maxslp seconds, * they are swapped. Else, we swap the longest-sleeping or stopped process, * if any, otherwise the longest-resident process. 
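 *
 * A process qualifies only when swappable(p) holds, i.e. it is not
 * locked and, of the flag bits tested, exactly P_INMEM is set:
 *
 *	((p)->p_lock == 0) &&
 *	((p)->p_flag & (P_TRACED|P_NOSWAP|P_SYSTEM|P_INMEM|
 *	    P_WEXIT|P_PHYSIO|P_SWAPPING)) == P_INMEM
 *
 * The loop below additionally skips realtime processes, processes
 * sleeping on critical events (priority below PSOCK), and processes
 * that have been asleep for four seconds or less.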
*/ void swapout_procs() { register struct proc *p; struct proc *outp, *outp2; int outpri, outpri2; int didswap = 0; outp = outp2 = NULL; outpri = outpri2 = INT_MIN; retry: for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if (!swappable(p)) continue; switch (p->p_stat) { default: continue; case SSLEEP: case SSTOP: /* * do not swapout a realtime process */ if (p->p_rtprio.type == RTP_PRIO_REALTIME) continue; /* * do not swapout a process waiting on a critical * event of some kind */ if (((p->p_priority & 0x7f) < PSOCK) || (p->p_slptime <= 4)) continue; vm_map_reference(&p->p_vmspace->vm_map); /* * do not swapout a process that is waiting for VM * datastructures there is a possible deadlock. */ if (!lock_try_write(&p->p_vmspace->vm_map.lock)) { vm_map_deallocate(&p->p_vmspace->vm_map); continue; } vm_map_unlock(&p->p_vmspace->vm_map); /* * If the process has been asleep for awhile and had * most of its pages taken away already, swap it out. */ swapout(p); vm_map_deallocate(&p->p_vmspace->vm_map); didswap++; goto retry; } } /* * If we swapped something out, and another process needed memory, * then wakeup the sched process. */ if (didswap) wakeup(&proc0); } static void swapout(p) register struct proc *p; { vm_map_t map = &p->p_vmspace->vm_map; + pmap_t pmap = &p->p_vmspace->vm_pmap; vm_offset_t ptaddr; int i; ++p->p_stats->p_ru.ru_nswap; /* * remember the process resident count */ p->p_vmspace->vm_swrss = p->p_vmspace->vm_pmap.pm_stats.resident_count; (void) splhigh(); p->p_flag &= ~P_INMEM; p->p_flag |= P_SWAPPING; if (p->p_stat == SRUN) remrq(p); (void) spl0(); /* * let the upages be paged */ - for(i=0;ip_vmspace->vm_upages_obj, i)) == NULL) + panic("swapout: upage already missing???"); + m->dirty = VM_PAGE_BITS_ALL; + vm_page_unwire(m); pmap_kremove( (vm_offset_t) p->p_addr + PAGE_SIZE * i); - - vm_map_pageable(map, (vm_offset_t) kstack, - (vm_offset_t) kstack + UPAGES * PAGE_SIZE, TRUE); - - ptaddr = trunc_page((u_int) vtopte(kstack)); - vm_map_pageable(map, ptaddr, ptaddr + PAGE_SIZE, TRUE); + } + pmap_remove(pmap, (vm_offset_t) kstack, + (vm_offset_t) kstack + PAGE_SIZE * UPAGES); p->p_flag &= ~P_SWAPPING; p->p_swtime = 0; } #ifdef DDB /* * DEBUG stuff */ int indent; #include /* see subr_prf.c */ /*ARGSUSED2*/ void #if __STDC__ iprintf(const char *fmt,...) #else iprintf(fmt /* , va_alist */ ) char *fmt; /* va_dcl */ #endif { register int i; va_list ap; for (i = indent; i >= 8; i -= 8) printf("\t"); while (--i >= 0) printf(" "); va_start(ap, fmt); printf("%r", fmt, ap); va_end(ap); } #endif /* DDB */ Index: head/sys/vm/vm_kern.c =================================================================== --- head/sys/vm/vm_kern.c (revision 13489) +++ head/sys/vm/vm_kern.c (revision 13490) @@ -1,462 +1,464 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_kern.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_kern.c,v 1.19 1995/12/10 14:52:09 bde Exp $ + * $Id: vm_kern.c,v 1.20 1995/12/11 04:58:09 dyson Exp $ */ /* * Kernel memory management. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include vm_map_t buffer_map; vm_map_t kernel_map; vm_map_t kmem_map; vm_map_t mb_map; int mb_map_full; vm_map_t io_map; vm_map_t clean_map; vm_map_t phys_map; vm_map_t exec_map; vm_map_t u_map; /* * kmem_alloc_pageable: * * Allocate pageable memory to the kernel's address map. * "map" must be kernel_map or a submap of kernel_map. */ vm_offset_t kmem_alloc_pageable(map, size) vm_map_t map; register vm_size_t size; { vm_offset_t addr; register int result; size = round_page(size); addr = vm_map_min(map); result = vm_map_find(map, NULL, (vm_offset_t) 0, - &addr, size, TRUE); + &addr, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); if (result != KERN_SUCCESS) { return (0); } return (addr); } /* * Allocate wired-down memory in the kernel's address map * or a submap. 
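 *
 * Typical use is (a sketch; nbytes is hypothetical):
 *
 *	vm_offset_t va = kmem_alloc(kernel_map, round_page(nbytes));
 *	if (va == 0)
 *		(the map had no room; fail or wait)
 *	...
 *	kmem_free(kernel_map, va, round_page(nbytes));
 *
 * The memory comes back wired and zero-filled.  Because this path
 * can block in VM_WAIT, interrupt-time callers must go through
 * kmem_malloc() on kmem_map instead.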
*/ vm_offset_t kmem_alloc(map, size) register vm_map_t map; register vm_size_t size; { vm_offset_t addr; register vm_offset_t offset; vm_offset_t i; size = round_page(size); /* * Use the kernel object for wired-down kernel pages. Assume that no * region of the kernel object is referenced more than once. */ /* * Locate sufficient space in the map. This will give us the final * virtual address for the new memory, and thus will tell us the * offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); return (0); } offset = addr - VM_MIN_KERNEL_ADDRESS; vm_object_reference(kernel_object); - vm_map_insert(map, kernel_object, offset, addr, addr + size); + vm_map_insert(map, kernel_object, offset, addr, addr + size, + VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); /* * Guarantee that there are pages already in this object before * calling vm_map_pageable. This is to prevent the following * scenario: * * 1) Threads have swapped out, so that there is a pager for the * kernel_object. 2) The kmsg zone is empty, and so we are * kmem_allocing a new page for it. 3) vm_map_pageable calls vm_fault; * there is no page, but there is a pager, so we call * pager_data_request. But the kmsg zone is empty, so we must * kmem_alloc. 4) goto 1 5) Even if the kmsg zone is not empty: when * we get the data back from the pager, it will be (very stale) * non-zero data. kmem_alloc is defined to return zero-filled memory. * * We're intentionally not activating the pages we allocate to prevent a * race with page-out. vm_map_pageable will wire the pages. */ for (i = 0; i < size; i += PAGE_SIZE) { vm_page_t mem; while ((mem = vm_page_alloc(kernel_object, - OFF_TO_IDX(offset + i), - (VM_ALLOC_NORMAL|VM_ALLOC_ZERO))) == NULL) { + OFF_TO_IDX(offset + i), VM_ALLOC_ZERO)) == NULL) { VM_WAIT; } if ((mem->flags & PG_ZERO) == 0) vm_page_zero_fill(mem); mem->flags &= ~(PG_BUSY|PG_ZERO); mem->valid = VM_PAGE_BITS_ALL; } /* * And finally, mark the data as non-pageable. */ (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); /* * Try to coalesce the map */ vm_map_simplify(map, addr); return (addr); } /* * kmem_free: * * Release a region of kernel virtual memory allocated * with kmem_alloc, and return the physical pages * associated with that region. */ void kmem_free(map, addr, size) vm_map_t map; register vm_offset_t addr; vm_size_t size; { (void) vm_map_remove(map, trunc_page(addr), round_page(addr + size)); } /* * kmem_suballoc: * * Allocates a map to manage a subrange * of the kernel virtual address space. 
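 * For instance (a sketch; the name and size are hypothetical), a
 * boot-time submap is carved out along these lines:
 *
 *	vm_offset_t minaddr, maxaddr;
 *	vm_map_t my_map = kmem_suballoc(kernel_map, &minaddr,
 *	    &maxaddr, 16 * PAGE_SIZE, TRUE);
 *
 * after which allocations against my_map consume only that window
 * of kernel address space.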
* * Arguments are as follows: * * parent Map to take range from * size Size of range to find * min, max Returned endpoints of map * pageable Can the region be paged */ vm_map_t kmem_suballoc(parent, min, max, size, pageable) register vm_map_t parent; vm_offset_t *min, *max; register vm_size_t size; boolean_t pageable; { register int ret; vm_map_t result; size = round_page(size); *min = (vm_offset_t) vm_map_min(parent); ret = vm_map_find(parent, NULL, (vm_offset_t) 0, - min, size, TRUE); + min, size, TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); if (ret != KERN_SUCCESS) { printf("kmem_suballoc: bad status return of %d.\n", ret); panic("kmem_suballoc"); } *max = *min + size; pmap_reference(vm_map_pmap(parent)); result = vm_map_create(vm_map_pmap(parent), *min, *max, pageable); if (result == NULL) panic("kmem_suballoc: cannot create submap"); if ((ret = vm_map_submap(parent, *min, *max, result)) != KERN_SUCCESS) panic("kmem_suballoc: unable to change range to submap"); return (result); } /* * Allocate wired-down memory in the kernel's address map for the higher * level kernel memory allocator (kern/kern_malloc.c). We cannot use * kmem_alloc() because we may need to allocate memory at interrupt * level where we cannot block (canwait == FALSE). * * This routine has its own private kernel submap (kmem_map) and object * (kmem_object). This, combined with the fact that only malloc uses * this routine, ensures that we will never block in map or object waits. * * Note that this still only works in a uni-processor environment and * when called at splhigh(). * * We don't worry about expanding the map (adding entries) since entries * for wired maps are statically allocated. */ vm_offset_t kmem_malloc(map, size, waitflag) register vm_map_t map; register vm_size_t size; boolean_t waitflag; { register vm_offset_t offset, i; vm_map_entry_t entry; vm_offset_t addr; vm_page_t m; if (map != kmem_map && map != mb_map) panic("kmem_malloc: map != {kmem,mb}_map"); size = round_page(size); addr = vm_map_min(map); /* * Locate sufficient space in the map. This will give us the final * virtual address for the new memory, and thus will tell us the * offset within the kernel map. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr)) { vm_map_unlock(map); if (map == mb_map) { mb_map_full = TRUE; log(LOG_ERR, "mb_map full\n"); return (0); } if (waitflag == M_WAITOK) panic("kmem_malloc: kmem_map too small"); return (0); } offset = addr - vm_map_min(kmem_map); vm_object_reference(kmem_object); - vm_map_insert(map, kmem_object, offset, addr, addr + size); + vm_map_insert(map, kmem_object, offset, addr, addr + size, + VM_PROT_ALL, VM_PROT_ALL, 0); /* * If we can wait, just mark the range as wired (will fault pages as * necessary). */ if (waitflag == M_WAITOK) { vm_map_unlock(map); (void) vm_map_pageable(map, (vm_offset_t) addr, addr + size, FALSE); vm_map_simplify(map, addr); return (addr); } /* * If we cannot wait then we must allocate all memory up front, * pulling it off the active queue to prevent pageout. */ for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_alloc(kmem_object, OFF_TO_IDX(offset + i), (waitflag == M_NOWAIT) ? VM_ALLOC_INTERRUPT : VM_ALLOC_SYSTEM); /* * Ran out of space, free everything up and return. Don't need * to lock page queues here as we know that the pages we got * aren't on any queues. 
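 * (The pages came from vm_page_alloc() still marked PG_BUSY and were
 * never placed on the active, inactive or cache queues.  Note also
 * how waitflag selects the allocation class above: M_NOWAIT callers
 * may dip into the interrupt reserve via VM_ALLOC_INTERRUPT, while
 * all others use VM_ALLOC_SYSTEM.)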
*/ if (m == NULL) { while (i != 0) { i -= PAGE_SIZE; m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); vm_page_free(m); } vm_map_delete(map, addr, addr + size); vm_map_unlock(map); return (0); } m->flags &= ~(PG_BUSY|PG_ZERO); m->valid = VM_PAGE_BITS_ALL; } /* * Mark map entry as non-pageable. Assert: vm_map_insert() will never * be able to extend the previous entry so there will be a new entry * exactly corresponding to this address range and it will have * wired_count == 0. */ if (!vm_map_lookup_entry(map, addr, &entry) || entry->start != addr || entry->end != addr + size || entry->wired_count) panic("kmem_malloc: entry not found or misaligned"); entry->wired_count++; /* * Loop thru pages, entering them in the pmap. (We cannot add them to * the wired count without wrapping the vm_page_queue_lock in * splimp...) */ for (i = 0; i < size; i += PAGE_SIZE) { m = vm_page_lookup(kmem_object, OFF_TO_IDX(offset + i)); + vm_page_wire(m); pmap_kenter(addr + i, VM_PAGE_TO_PHYS(m)); } vm_map_unlock(map); vm_map_simplify(map, addr); return (addr); } /* * kmem_alloc_wait * * Allocates pageable memory from a sub-map of the kernel. If the submap * has no room, the caller sleeps waiting for more memory in the submap. * */ vm_offset_t kmem_alloc_wait(map, size) vm_map_t map; vm_size_t size; { vm_offset_t addr; size = round_page(size); for (;;) { /* * To make this work for more than one map, use the map's lock * to lock out sleepers/wakers. */ vm_map_lock(map); if (vm_map_findspace(map, 0, size, &addr) == 0) break; /* no space now; see if we can ever get space */ if (vm_map_max(map) - vm_map_min(map) < size) { vm_map_unlock(map); return (0); } vm_map_unlock(map); tsleep(map, PVM, "kmaw", 0); } - vm_map_insert(map, NULL, (vm_offset_t) 0, addr, addr + size); + vm_map_insert(map, NULL, (vm_offset_t) 0, addr, addr + size, VM_PROT_ALL, VM_PROT_ALL, 0); vm_map_unlock(map); return (addr); } /* * kmem_free_wakeup * * Returns memory to a submap of the kernel, and wakes up any processes * waiting for memory in that map. */ void kmem_free_wakeup(map, addr, size) vm_map_t map; vm_offset_t addr; vm_size_t size; { vm_map_lock(map); (void) vm_map_delete(map, trunc_page(addr), round_page(addr + size)); wakeup(map); vm_map_unlock(map); } /* * Create the kernel map; insert a mapping covering kernel text, data, bss, * and all space allocated thus far (`boostrap' data). The new map will thus * map the range between VM_MIN_KERNEL_ADDRESS and `start' as allocated, and * the range between `start' and `end' as free. */ void kmem_init(start, end) vm_offset_t start, end; { register vm_map_t m; m = vm_map_create(kernel_pmap, VM_MIN_KERNEL_ADDRESS, end, FALSE); vm_map_lock(m); /* N.B.: cannot use kgdb to debug, starting with this assignment ... */ kernel_map = m; (void) vm_map_insert(m, NULL, (vm_offset_t) 0, - VM_MIN_KERNEL_ADDRESS, start); + VM_MIN_KERNEL_ADDRESS, start, VM_PROT_ALL, VM_PROT_ALL, 0); /* ... and ending with the completion of the above `insert' */ vm_map_unlock(m); } Index: head/sys/vm/vm_map.c =================================================================== --- head/sys/vm/vm_map.c (revision 13489) +++ head/sys/vm/vm_map.c (revision 13490) @@ -1,2327 +1,2337 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.c 8.3 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_map.c,v 1.30 1995/12/14 09:54:59 phk Exp $ + * $Id: vm_map.c,v 1.31 1996/01/04 21:13:17 wollman Exp $ */ /* * Virtual memory mapping module. */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Virtual memory maps provide for the mapping, protection, * and sharing of virtual memory objects. In addition, * this module provides for an efficient virtual copy of * memory from one map to another. * * Synchronization is required prior to most operations. * * Maps consist of an ordered doubly-linked list of simple * entries; a single hint is used to speed up lookups. 
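* * (An illustrative sketch, not from the source: with hypothetical addresses, a map holding two entries is threaded as * * &map->header <-> [0x1000,0x2000) <-> [0x3000,0x7000) <-> &map->header * * with map->hint caching the entry returned by the last successful lookup.)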
* * In order to properly represent the sharing of virtual * memory regions among maps, the map structure is bi-level. * Top-level ("address") maps refer to regions of sharable * virtual memory. These regions are implemented as * ("sharing") maps, which then refer to the actual virtual * memory objects. When two address maps "share" memory, * their top-level maps both have references to the same * sharing map. When memory is virtual-copied from one * address map to another, the references in the sharing * maps are actually copied -- no copying occurs at the * virtual memory object level. * * Since portions of maps are specified by start/end addresses, * which may not align with existing map entries, all * routines merely "clip" entries to these start/end values. * [That is, an entry is split into two, bordering at a * start or end value.] Note that these clippings may not * always be necessary (as the two resulting entries are then * not changed); however, the clipping is done for convenience. * No attempt is currently made to "glue back together" two * abutting entries. * * As mentioned above, virtual copy operations are performed * by copying VM object references from one sharing map to * another, and then marking both regions as copy-on-write. * It is important to note that only one writeable reference * to a VM object region exists in any map -- this means that * shadow object creation can be delayed until a write operation * occurs. */ /* * vm_map_startup: * * Initialize the vm_map module. Must be called before * any other vm_map routines. * * Map and entry structures are allocated from the general * purpose memory pool with some exceptions: * * - The kernel map and kmem submap are allocated statically. * - Kernel map entries are allocated out of a static pool. * * These restrictions are necessary since malloc() uses the * maps and requires map entries. */ vm_offset_t kentry_data; vm_size_t kentry_data_size; static vm_map_entry_t kentry_free; static vm_map_t kmap_free; +extern char kstack[]; static int kentry_count; static vm_offset_t mapvm_start, mapvm, mapvmmax; static int mapvmpgcnt; static void _vm_map_clip_end __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static void _vm_map_clip_start __P((vm_map_t, vm_map_entry_t, vm_offset_t)); static vm_map_entry_t vm_map_entry_create __P((vm_map_t)); static void vm_map_entry_delete __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_dispose __P((vm_map_t, vm_map_entry_t)); static void vm_map_entry_unwire __P((vm_map_t, vm_map_entry_t)); static void vm_map_copy_entry __P((vm_map_t, vm_map_t, vm_map_entry_t, vm_map_entry_t)); #ifdef notyet static void vm_map_simplify_entry __P((vm_map_t, vm_map_entry_t)); #endif void vm_map_startup() { register int i; register vm_map_entry_t mep; vm_map_t mp; /* * Static map structures for allocation before initialization of * kernel map or kmem map. vm_map_create knows how to deal with them. */ kmap_free = mp = (vm_map_t) kentry_data; i = MAX_KMAP; while (--i > 0) { mp->header.next = (vm_map_entry_t) (mp + 1); mp++; } mp++->header.next = NULL; /* * Form a free list of statically allocated kernel map entries with * the rest. */ kentry_free = mep = (vm_map_entry_t) mp; kentry_count = i = (kentry_data_size - MAX_KMAP * sizeof *mp) / sizeof *mep; while (--i > 0) { mep->next = mep + 1; mep++; } mep->next = NULL; } /* * Allocate a vmspace structure, including a vm_map and pmap, * and initialize those structures. The refcnt is set to 1. * The remaining fields must be initialized by the caller.
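* * A minimal usage sketch (illustrative only; the bounds shown are the customary user-process values, not a requirement of this interface): * * struct vmspace *vm; * * vm = vmspace_alloc(VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS, TRUE); * ... * vmspace_free(vm);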
*/ struct vmspace * vmspace_alloc(min, max, pageable) vm_offset_t min, max; int pageable; { register struct vmspace *vm; if (mapvmpgcnt == 0 && mapvm == 0) { int s; mapvmpgcnt = (cnt.v_page_count * sizeof(struct vm_map_entry) + PAGE_SIZE - 1) / PAGE_SIZE; s = splhigh(); mapvm_start = mapvm = kmem_alloc_pageable(kmem_map, mapvmpgcnt * PAGE_SIZE); mapvmmax = mapvm_start + mapvmpgcnt * PAGE_SIZE; splx(s); if (!mapvm) mapvmpgcnt = 0; } MALLOC(vm, struct vmspace *, sizeof(struct vmspace), M_VMMAP, M_WAITOK); bzero(vm, (caddr_t) &vm->vm_startcopy - (caddr_t) vm); vm_map_init(&vm->vm_map, min, max, pageable); pmap_pinit(&vm->vm_pmap); vm->vm_map.pmap = &vm->vm_pmap; /* XXX */ vm->vm_refcnt = 1; return (vm); } void vmspace_free(vm) register struct vmspace *vm; { if (vm->vm_refcnt == 0) panic("vmspace_free: attempt to free already freed vmspace"); if (--vm->vm_refcnt == 0) { + int s, i; + + pmap_remove(&vm->vm_pmap, (vm_offset_t) kstack, (vm_offset_t) kstack+UPAGES*PAGE_SIZE); + /* * Lock the map, to wait out all other references to it. * Delete all of the mappings and pages they hold, then call * the pmap module to reclaim anything left. */ vm_map_lock(&vm->vm_map); + vm_object_deallocate(vm->vm_upages_obj); (void) vm_map_delete(&vm->vm_map, vm->vm_map.min_offset, vm->vm_map.max_offset); vm_map_unlock(&vm->vm_map); while( vm->vm_map.ref_count != 1) tsleep(&vm->vm_map.ref_count, PVM, "vmsfre", 0); --vm->vm_map.ref_count; pmap_release(&vm->vm_pmap); FREE(vm, M_VMMAP); } } /* * vm_map_create: * * Creates and returns a new empty VM map with * the given physical map structure, and having * the given lower and upper address bounds. */ vm_map_t vm_map_create(pmap, min, max, pageable) pmap_t pmap; vm_offset_t min, max; boolean_t pageable; { register vm_map_t result; if (kmem_map == NULL) { result = kmap_free; kmap_free = (vm_map_t) result->header.next; if (result == NULL) panic("vm_map_create: out of maps"); } else MALLOC(result, vm_map_t, sizeof(struct vm_map), M_VMMAP, M_WAITOK); vm_map_init(result, min, max, pageable); result->pmap = pmap; return (result); } /* * Initialize an existing vm_map structure * such as that in the vmspace structure. * The pmap is set elsewhere. */ void vm_map_init(map, min, max, pageable) register struct vm_map *map; vm_offset_t min, max; boolean_t pageable; { map->header.next = map->header.prev = &map->header; map->nentries = 0; map->size = 0; map->ref_count = 1; map->is_main_map = TRUE; map->min_offset = min; map->max_offset = max; map->entries_pageable = pageable; map->first_free = &map->header; map->hint = &map->header; map->timestamp = 0; lock_init(&map->lock, TRUE); } /* * vm_map_entry_create: [ internal use only ] * * Allocates a VM map entry for insertion. * No entry fields are filled in. */ static struct vm_map_entry *mappool; static int mappoolcnt; static vm_map_entry_t vm_map_entry_create(map) vm_map_t map; { vm_map_entry_t entry; int i; #define KENTRY_LOW_WATER 64 #define MAPENTRY_LOW_WATER 128 /* * This is a *very* nasty (and sort of incomplete) hack!!!! */ if (kentry_count < KENTRY_LOW_WATER) { if (mapvmpgcnt && mapvm) { vm_page_t m; m = vm_page_alloc(kmem_object, OFF_TO_IDX(mapvm - vm_map_min(kmem_map)), (map == kmem_map) ?
VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL); if (m) { int newentries; newentries = (PAGE_SIZE / sizeof(struct vm_map_entry)); vm_page_wire(m); m->flags &= ~PG_BUSY; m->valid = VM_PAGE_BITS_ALL; pmap_enter(vm_map_pmap(kmem_map), mapvm, VM_PAGE_TO_PHYS(m), VM_PROT_DEFAULT, 1); m->flags |= PG_WRITEABLE|PG_MAPPED; entry = (vm_map_entry_t) mapvm; mapvm += PAGE_SIZE; --mapvmpgcnt; for (i = 0; i < newentries; i++) { vm_map_entry_dispose(kernel_map, entry); entry++; } } } } if (map == kernel_map || map == kmem_map || map == pager_map) { entry = kentry_free; if (entry) { kentry_free = entry->next; --kentry_count; return entry; } entry = mappool; if (entry) { mappool = entry->next; --mappoolcnt; return entry; } } else { entry = mappool; if (entry) { mappool = entry->next; --mappoolcnt; return entry; } MALLOC(entry, vm_map_entry_t, sizeof(struct vm_map_entry), M_VMMAPENT, M_WAITOK); } if (entry == NULL) panic("vm_map_entry_create: out of map entries"); return (entry); } /* * vm_map_entry_dispose: [ internal use only ] * * Inverse of vm_map_entry_create. */ static void vm_map_entry_dispose(map, entry) vm_map_t map; vm_map_entry_t entry; { if ((kentry_count < KENTRY_LOW_WATER) || ((vm_offset_t) entry >= kentry_data && (vm_offset_t) entry < (kentry_data + kentry_data_size)) || ((vm_offset_t) entry >= mapvm_start && (vm_offset_t) entry < mapvmmax)) { entry->next = kentry_free; kentry_free = entry; ++kentry_count; return; } else { if (mappoolcnt < MAPENTRY_LOW_WATER) { entry->next = mappool; mappool = entry; ++mappoolcnt; return; } FREE(entry, M_VMMAPENT); } } /* * vm_map_entry_{un,}link: * * Insert/remove entries from maps. */ #define vm_map_entry_link(map, after_where, entry) \ { \ (map)->nentries++; \ (entry)->prev = (after_where); \ (entry)->next = (after_where)->next; \ (entry)->prev->next = (entry); \ (entry)->next->prev = (entry); \ } #define vm_map_entry_unlink(map, entry) \ { \ (map)->nentries--; \ (entry)->next->prev = (entry)->prev; \ (entry)->prev->next = (entry)->next; \ } /* * vm_map_reference: * * Creates another valid reference to the given map. * */ void vm_map_reference(map) register vm_map_t map; { if (map == NULL) return; map->ref_count++; } /* * vm_map_deallocate: * * Removes a reference from the specified map, * destroying it if no references remain. * The map should not be locked. */ void vm_map_deallocate(map) register vm_map_t map; { register int c; if (map == NULL) return; c = map->ref_count; if (c == 0) panic("vm_map_deallocate: deallocating already freed map"); if (c != 1) { --map->ref_count; wakeup(&map->ref_count); return; } /* * Lock the map, to wait out all other references to it. */ vm_map_lock(map); (void) vm_map_delete(map, map->min_offset, map->max_offset); --map->ref_count; if( map->ref_count != 0) { vm_map_unlock(map); return; } pmap_destroy(map->pmap); FREE(map, M_VMMAP); } /* * vm_map_insert: * * Inserts the given whole VM object into the target * map at the specified address range. The object's * size should match that of the address range. * * Requires that the map be locked, and leaves it so. */ int -vm_map_insert(map, object, offset, start, end) +vm_map_insert(map, object, offset, start, end, prot, max, cow) vm_map_t map; vm_object_t object; vm_ooffset_t offset; vm_offset_t start; vm_offset_t end; + vm_prot_t prot, max; + int cow; { register vm_map_entry_t new_entry; register vm_map_entry_t prev_entry; vm_map_entry_t temp_entry; /* * Check that the start and end points are not bogus. 
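* * (The new `cow' argument is a mask of MAP_COPY_NEEDED and MAP_COPY_ON_WRITE from vm_map.h; a hypothetical copy-on-write insertion would pass MAP_COPY_NEEDED | MAP_COPY_ON_WRITE, while the kernel-map callers in vm_kern.c pass 0.)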
*/ if ((start < map->min_offset) || (end > map->max_offset) || (start >= end)) return (KERN_INVALID_ADDRESS); /* * Find the entry prior to the proposed starting address; if it's part * of an existing entry, this range is bogus. */ if (vm_map_lookup_entry(map, start, &temp_entry)) return (KERN_NO_SPACE); prev_entry = temp_entry; /* * Assert that the next entry doesn't overlap the end point. */ if ((prev_entry->next != &map->header) && (prev_entry->next->start < end)) return (KERN_NO_SPACE); /* * See if we can avoid creating a new entry by extending one of our * neighbors. */ if (object == NULL) { if ((prev_entry != &map->header) && (prev_entry->end == start) && (map->is_main_map) && (prev_entry->is_a_map == FALSE) && (prev_entry->is_sub_map == FALSE) && (prev_entry->inheritance == VM_INHERIT_DEFAULT) && - (prev_entry->protection == VM_PROT_DEFAULT) && - (prev_entry->max_protection == VM_PROT_DEFAULT) && + (prev_entry->protection == prot) && + (prev_entry->max_protection == max) && (prev_entry->wired_count == 0)) { if (vm_object_coalesce(prev_entry->object.vm_object, OFF_TO_IDX(prev_entry->offset), (vm_size_t) (prev_entry->end - prev_entry->start), (vm_size_t) (end - prev_entry->end))) { /* * Coalesced the two objects - can extend the * previous map entry to include the new * range. */ map->size += (end - prev_entry->end); prev_entry->end = end; return (KERN_SUCCESS); } } } /* * Create a new entry */ new_entry = vm_map_entry_create(map); new_entry->start = start; new_entry->end = end; new_entry->is_a_map = FALSE; new_entry->is_sub_map = FALSE; new_entry->object.vm_object = object; new_entry->offset = offset; - new_entry->copy_on_write = FALSE; - new_entry->needs_copy = FALSE; + if (cow & MAP_COPY_NEEDED) + new_entry->needs_copy = TRUE; + else + new_entry->needs_copy = FALSE; + if (cow & MAP_COPY_ON_WRITE) + new_entry->copy_on_write = TRUE; + else + new_entry->copy_on_write = FALSE; + if (map->is_main_map) { new_entry->inheritance = VM_INHERIT_DEFAULT; - new_entry->protection = VM_PROT_DEFAULT; - new_entry->max_protection = VM_PROT_DEFAULT; + new_entry->protection = prot; + new_entry->max_protection = max; new_entry->wired_count = 0; } /* * Insert the new entry into the list */ vm_map_entry_link(map, prev_entry, new_entry); map->size += new_entry->end - new_entry->start; /* * Update the free space hint */ - if ((map->first_free == prev_entry) && (prev_entry->end >= new_entry->start)) + if ((map->first_free == prev_entry) && + (prev_entry->end >= new_entry->start)) map->first_free = new_entry; return (KERN_SUCCESS); } /* * SAVE_HINT: * * Saves the specified entry as the hint for * future lookups. */ #define SAVE_HINT(map,value) \ (map)->hint = (value); /* * vm_map_lookup_entry: [ internal use only ] * * Finds the map entry containing (or * immediately preceding) the specified address * in the given map; the entry is returned * in the "entry" parameter. The boolean * result indicates whether the address is * actually contained in the map. */ boolean_t vm_map_lookup_entry(map, address, entry) register vm_map_t map; register vm_offset_t address; vm_map_entry_t *entry; /* OUT */ { register vm_map_entry_t cur; register vm_map_entry_t last; /* * Start looking either from the head of the list, or from the hint. */ cur = map->hint; if (cur == &map->header) cur = cur->next; if (address >= cur->start) { /* * Go from hint to end of list. * * But first, make a quick check to see if we are already looking * at the entry we want (which is usually the case). 
Note also * that we don't need to save the hint here... it is the same * hint (unless we are at the header, in which case the hint * didn't buy us anything anyway). */ last = &map->header; if ((cur != last) && (cur->end > address)) { *entry = cur; return (TRUE); } } else { /* * Go from start to hint, *inclusively* */ last = cur->next; cur = map->header.next; } /* * Search linearly */ while (cur != last) { if (cur->end > address) { if (address >= cur->start) { /* * Save this lookup for future hints, and * return */ *entry = cur; SAVE_HINT(map, cur); return (TRUE); } break; } cur = cur->next; } *entry = cur->prev; SAVE_HINT(map, *entry); return (FALSE); } /* * Find sufficient space for `length' bytes in the given map, starting at * `start'. The map must be locked. Returns 0 on success, 1 on no space. */ int vm_map_findspace(map, start, length, addr) register vm_map_t map; register vm_offset_t start; vm_size_t length; vm_offset_t *addr; { register vm_map_entry_t entry, next; register vm_offset_t end; if (start < map->min_offset) start = map->min_offset; if (start > map->max_offset) return (1); /* * Look for the first possible address; if there's already something * at this address, we have to start after it. */ if (start == map->min_offset) { if ((entry = map->first_free) != &map->header) start = entry->end; } else { vm_map_entry_t tmp; if (vm_map_lookup_entry(map, start, &tmp)) start = tmp->end; entry = tmp; } /* * Look through the rest of the map, trying to fit a new region in the * gap between existing regions, or after the very last region. */ for (;; start = (entry = next)->end) { /* * Find the end of the proposed new region. Be sure we didn't * go beyond the end of the map, or wrap around the address; * if so, we lose. Otherwise, if this is the last entry, or * if the proposed new region fits before the next entry, we * win. */ end = start + length; if (end > map->max_offset || end < start) return (1); next = entry->next; if (next == &map->header || next->start >= end) break; } SAVE_HINT(map, entry); *addr = start; if (map == kernel_map && round_page(start + length) > kernel_vm_end) pmap_growkernel(round_page(start + length)); return (0); } /* * vm_map_find finds an unallocated region in the target address * map with the given length. The search is defined to be * first-fit from the specified address; the region found is * returned in the same parameter. 
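* * With this change the caller supplies the protections and copy-on-write flags directly; an illustrative kernel-map call, a sketch mirroring the kmem_suballoc() usage above rather than a new interface: * * addr = vm_map_min(kernel_map); * rv = vm_map_find(kernel_map, NULL, (vm_ooffset_t) 0, &addr, * round_page(size), TRUE, VM_PROT_ALL, VM_PROT_ALL, 0); * if (rv != KERN_SUCCESS) * return (0);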
* */ int -vm_map_find(map, object, offset, addr, length, find_space) +vm_map_find(map, object, offset, addr, length, find_space, prot, max, cow) vm_map_t map; vm_object_t object; vm_ooffset_t offset; vm_offset_t *addr; /* IN/OUT */ vm_size_t length; boolean_t find_space; + vm_prot_t prot, max; + int cow; { register vm_offset_t start; int result, s = 0; start = *addr; if (map == kmem_map) s = splhigh(); vm_map_lock(map); if (find_space) { if (vm_map_findspace(map, start, length, addr)) { vm_map_unlock(map); if (map == kmem_map) splx(s); return (KERN_NO_SPACE); } start = *addr; } - result = vm_map_insert(map, object, offset, start, start + length); + result = vm_map_insert(map, object, offset, + start, start + length, prot, max, cow); vm_map_unlock(map); if (map == kmem_map) splx(s); return (result); } #ifdef notyet /* * vm_map_simplify_entry: [ internal use only ] * * Simplify the given map entry by: * removing extra sharing maps * [XXX maybe later] merging with a neighbor */ static void vm_map_simplify_entry(map, entry) vm_map_t map; vm_map_entry_t entry; { #ifdef lint map++; #endif /* * If this entry corresponds to a sharing map, then see if we can * remove the level of indirection. If it's not a sharing map, then it * points to a VM object, so see if we can merge with either of our * neighbors. */ if (entry->is_sub_map) return; if (entry->is_a_map) { #if 0 vm_map_t my_share_map; int count; my_share_map = entry->object.share_map; count = my_share_map->ref_count; if (count == 1) { /* * Can move the region from entry->start to entry->end * (+ entry->offset) in my_share_map into place of * entry. Later. */ } #endif } else { /* * Try to merge with our neighbors. * * Conditions for merge are: * * 1. entries are adjacent. 2. both entries point to objects * with null pagers. * * If a merge is possible, we replace the two entries with a * single entry, then merge the two objects into a single * object. * * Now, all that is left to do is write the code! */ } } #endif /* * vm_map_clip_start: [ internal use only ] * * Asserts that the given entry begins at or after * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_start(map, entry, startaddr) \ { \ if (startaddr > entry->start) \ _vm_map_clip_start(map, entry, startaddr); \ } /* * This routine is called only when it is known that * the entry must be split. */ static void _vm_map_clip_start(map, entry, start) register vm_map_t map; register vm_map_entry_t entry; register vm_offset_t start; { register vm_map_entry_t new_entry; /* * See if we can simplify this entry first */ /* vm_map_simplify_entry(map, entry); */ /* * Split off the front portion -- note that we must insert the new * entry BEFORE this one, so that this entry has the specified * starting address. */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->end = start; entry->offset += (start - entry->start); entry->start = start; vm_map_entry_link(map, entry->prev, new_entry); if (entry->is_a_map || entry->is_sub_map) vm_map_reference(new_entry->object.share_map); else vm_object_reference(new_entry->object.vm_object); } /* * vm_map_clip_end: [ internal use only ] * * Asserts that the given entry ends at or before * the specified address; if necessary, * it splits the entry into two. */ #define vm_map_clip_end(map, entry, endaddr) \ { \ if (endaddr < entry->end) \ _vm_map_clip_end(map, entry, endaddr); \ } /* * This routine is called only when it is known that * the entry must be split. 
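* * Illustrative effect (hypothetical addresses): clipping the entry [0x1000, 0x4000) with offset 0 at end address 0x2000 yields [0x1000, 0x2000) with offset 0 followed by [0x2000, 0x4000) with offset 0x1000, so both halves still map the same object pages.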
*/ static void _vm_map_clip_end(map, entry, end) register vm_map_t map; register vm_map_entry_t entry; register vm_offset_t end; { register vm_map_entry_t new_entry; /* * Create a new entry and insert it AFTER the specified entry */ new_entry = vm_map_entry_create(map); *new_entry = *entry; new_entry->start = entry->end = end; new_entry->offset += (end - entry->start); vm_map_entry_link(map, entry, new_entry); if (entry->is_a_map || entry->is_sub_map) vm_map_reference(new_entry->object.share_map); else vm_object_reference(new_entry->object.vm_object); } /* * VM_MAP_RANGE_CHECK: [ internal use only ] * * Asserts that the starting and ending region * addresses fall within the valid range of the map. */ #define VM_MAP_RANGE_CHECK(map, start, end) \ { \ if (start < vm_map_min(map)) \ start = vm_map_min(map); \ if (end > vm_map_max(map)) \ end = vm_map_max(map); \ if (start > end) \ start = end; \ } /* * vm_map_submap: [ kernel use only ] * * Mark the given range as handled by a subordinate map. * * This range must have been created with vm_map_find, * and no other operations may have been performed on this * range prior to calling vm_map_submap. * * Only a limited number of operations can be performed * within this range after calling vm_map_submap: * vm_fault * [Don't try vm_map_copy!] * * To remove a submapping, one must first remove the * range from the superior map, and then destroy the * submap (if desired). [Better yet, don't try it.] */ int vm_map_submap(map, start, end, submap) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; vm_map_t submap; { vm_map_entry_t entry; register int result = KERN_INVALID_ARGUMENT; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; vm_map_clip_end(map, entry, end); if ((entry->start == start) && (entry->end == end) && (!entry->is_a_map) && (entry->object.vm_object == NULL) && (!entry->copy_on_write)) { entry->is_a_map = FALSE; entry->is_sub_map = TRUE; vm_map_reference(entry->object.sub_map = submap); result = KERN_SUCCESS; } vm_map_unlock(map); return (result); } /* * vm_map_protect: * * Sets the protection of the specified address * region in the target map. If "set_max" is * specified, the maximum protection is to be set; * otherwise, only the current protection is affected. */ int vm_map_protect(map, start, end, new_prot, set_max) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_prot_t new_prot; register boolean_t set_max; { register vm_map_entry_t current; vm_map_entry_t entry; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &entry)) { vm_map_clip_start(map, entry, start); } else entry = entry->next; /* * Make a first pass to check for protection violations. */ current = entry; while ((current != &map->header) && (current->start < end)) { if (current->is_sub_map) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } if ((new_prot & current->max_protection) != new_prot) { vm_map_unlock(map); return (KERN_PROTECTION_FAILURE); } current = current->next; } /* * Go back and fix up protections. [Note that clipping is not * necessary the second time.]
*/ current = entry; while ((current != &map->header) && (current->start < end)) { vm_prot_t old_prot; vm_map_clip_end(map, current, end); old_prot = current->protection; if (set_max) current->protection = (current->max_protection = new_prot) & old_prot; else current->protection = new_prot; /* * Update physical map if necessary. Worry about copy-on-write * here -- CHECK THIS XXX */ if (current->protection != old_prot) { #define MASK(entry) ((entry)->copy_on_write ? ~VM_PROT_WRITE : \ VM_PROT_ALL) #define max(a,b) ((a) > (b) ? (a) : (b)) if (current->is_a_map) { vm_map_entry_t share_entry; vm_offset_t share_end; vm_map_lock(current->object.share_map); (void) vm_map_lookup_entry( current->object.share_map, current->offset, &share_entry); share_end = current->offset + (current->end - current->start); while ((share_entry != &current->object.share_map->header) && (share_entry->start < share_end)) { pmap_protect(map->pmap, (max(share_entry->start, current->offset) - current->offset + current->start), min(share_entry->end, share_end) - current->offset + current->start, current->protection & MASK(share_entry)); share_entry = share_entry->next; } vm_map_unlock(current->object.share_map); } else pmap_protect(map->pmap, current->start, current->end, current->protection & MASK(entry)); #undef max #undef MASK } current = current->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_inherit: * * Sets the inheritance of the specified address * range in the target map. Inheritance * affects how the map will be shared with * child maps at the time of vm_map_fork. */ int vm_map_inherit(map, start, end, new_inheritance) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_inherit_t new_inheritance; { register vm_map_entry_t entry; vm_map_entry_t temp_entry; switch (new_inheritance) { case VM_INHERIT_NONE: case VM_INHERIT_COPY: case VM_INHERIT_SHARE: break; default: return (KERN_INVALID_ARGUMENT); } vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); if (vm_map_lookup_entry(map, start, &temp_entry)) { entry = temp_entry; vm_map_clip_start(map, entry, start); } else entry = temp_entry->next; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->inheritance = new_inheritance; entry = entry->next; } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_pageable: * * Sets the pageability of the specified address * range in the target map. Regions specified * as not pageable require locked-down physical * memory and physical page maps. * * The map must not be locked, but a reference * must remain to the map throughout the call. */ int vm_map_pageable(map, start, end, new_pageable) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register boolean_t new_pageable; { register vm_map_entry_t entry; vm_map_entry_t start_entry; register vm_offset_t failed = 0; int rv; vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); /* * Only one pageability change may take place at one time, since * vm_fault assumes it will be called only once for each * wiring/unwiring. Therefore, we have to make sure we're actually * changing the pageability for the entire region. We do so before * making any changes. */ if (vm_map_lookup_entry(map, start, &start_entry) == FALSE) { vm_map_unlock(map); return (KERN_INVALID_ADDRESS); } entry = start_entry; /* * Actions are rather different for wiring and unwiring, so we have * two separate cases. */ if (new_pageable) { vm_map_clip_start(map, entry, start); /* * Unwiring.
First ensure that the range to be unwired is * really wired down and that there are no holes. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0 || (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end))) { vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Now decrement the wiring count for each region. If a region * becomes completely unwired, unwire its physical pages and * mappings. */ lock_set_recursive(&map->lock); entry = start_entry; while ((entry != &map->header) && (entry->start < end)) { vm_map_clip_end(map, entry, end); entry->wired_count--; if (entry->wired_count == 0) vm_fault_unwire(map, entry->start, entry->end); entry = entry->next; } lock_clear_recursive(&map->lock); } else { /* * Wiring. We must do this in two passes: * * 1. Holding the write lock, we create any shadow or zero-fill * objects that need to be created. Then we clip each map * entry to the region to be wired and increment its wiring * count. We create objects before clipping the map entries * to avoid object proliferation. * * 2. We downgrade to a read lock, and call vm_fault_wire to * fault in the pages for any newly wired area (wired_count is * 1). * * Downgrading to a read lock for vm_fault_wire avoids a possible * deadlock with another process that may have faulted on one * of the pages to be wired (it would mark the page busy, * blocking us, then in turn block on the map lock that we * hold). Because of problems in the recursive lock package, * we cannot upgrade to a write lock in vm_map_lookup. Thus, * any actions that require the write lock must be done * beforehand. Because we keep the read lock on the map, the * copy-on-write status of the entries we modify here cannot * change. */ /* * Pass 1. */ while ((entry != &map->header) && (entry->start < end)) { if (entry->wired_count == 0) { /* * Perform actions of vm_map_lookup that need * the write lock on the map: create a shadow * object for a copy-on-write region, or an * object for a zero-fill region. * * We don't have to do this for entries that * point to sharing maps, because we won't * hold the lock on the sharing map. */ if (!entry->is_a_map && !entry->is_sub_map) { if (entry->needs_copy && ((entry->protection & VM_PROT_WRITE) != 0)) { vm_object_shadow(&entry->object.vm_object, &entry->offset, OFF_TO_IDX(entry->end - entry->start)); entry->needs_copy = FALSE; } else if (entry->object.vm_object == NULL) { entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(entry->end - entry->start)); entry->offset = (vm_offset_t) 0; } } } vm_map_clip_start(map, entry, start); vm_map_clip_end(map, entry, end); entry->wired_count++; /* * Check for holes */ if (entry->end < end && (entry->next == &map->header || entry->next->start > entry->end)) { /* * Found one. Object creation actions do not * need to be undone, but the wired counts * need to be restored. */ while (entry != &map->header && entry->end > start) { entry->wired_count--; entry = entry->prev; } vm_map_unlock(map); return (KERN_INVALID_ARGUMENT); } entry = entry->next; } /* * Pass 2. */ /* * HACK HACK HACK HACK * * If we are wiring in the kernel map or a submap of it, * unlock the map to avoid deadlocks. We trust that the * kernel is well-behaved, and therefore will not do * anything destructive to this region of the map while * we have it unlocked. We cannot trust user processes * to do the same. 
* * HACK HACK HACK HACK */ if (vm_map_pmap(map) == kernel_pmap) { vm_map_unlock(map); /* trust me ... */ } else { lock_set_recursive(&map->lock); lock_write_to_read(&map->lock); } rv = 0; entry = start_entry; while (entry != &map->header && entry->start < end) { /* * If vm_fault_wire fails for any page we need to undo * what has been done. We decrement the wiring count * for those pages which have not yet been wired (now) * and unwire those that have (later). * * XXX this violates the locking protocol on the map, * needs to be fixed. */ if (rv) entry->wired_count--; else if (entry->wired_count == 1) { rv = vm_fault_wire(map, entry->start, entry->end); if (rv) { failed = entry->start; entry->wired_count--; } } entry = entry->next; } if (vm_map_pmap(map) == kernel_pmap) { vm_map_lock(map); } else { lock_clear_recursive(&map->lock); } if (rv) { vm_map_unlock(map); (void) vm_map_pageable(map, start, failed, TRUE); return (rv); } } vm_map_unlock(map); return (KERN_SUCCESS); } /* * vm_map_clean * * Push any dirty cached pages in the address range to their pager. * If syncio is TRUE, dirty pages are written synchronously. * If invalidate is TRUE, any cached pages are freed as well. * * Returns an error if any part of the specified range is not mapped. */ int vm_map_clean(map, start, end, syncio, invalidate) vm_map_t map; vm_offset_t start; vm_offset_t end; boolean_t syncio; boolean_t invalidate; { register vm_map_entry_t current; vm_map_entry_t entry; vm_size_t size; vm_object_t object; vm_ooffset_t offset; vm_map_lock_read(map); VM_MAP_RANGE_CHECK(map, start, end); if (!vm_map_lookup_entry(map, start, &entry)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } /* * Make a first pass to check for holes. */ for (current = entry; current->start < end; current = current->next) { if (current->is_sub_map) { vm_map_unlock_read(map); return (KERN_INVALID_ARGUMENT); } if (end > current->end && (current->next == &map->header || current->end != current->next->start)) { vm_map_unlock_read(map); return (KERN_INVALID_ADDRESS); } } /* * Make a second pass, cleaning/uncaching pages from the indicated * objects as we go. */ for (current = entry; current->start < end; current = current->next) { offset = current->offset + (start - current->start); size = (end <= current->end ? end : current->end) - start; if (current->is_a_map || current->is_sub_map) { register vm_map_t smap; vm_map_entry_t tentry; vm_size_t tsize; smap = current->object.share_map; vm_map_lock_read(smap); (void) vm_map_lookup_entry(smap, offset, &tentry); tsize = tentry->end - offset; if (tsize < size) size = tsize; object = tentry->object.vm_object; offset = tentry->offset + (offset - tentry->start); vm_map_unlock_read(smap); } else { object = current->object.vm_object; } if (object && (object->type == OBJT_VNODE)) { /* * Flush pages if writing is allowed. XXX should we continue * on an error? * * XXX Doing async I/O and then removing all the pages from * the object before it completes is probably a very bad * idea. */ if (current->protection & VM_PROT_WRITE) vm_object_page_clean(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size), syncio, TRUE); if (invalidate) vm_object_page_remove(object, OFF_TO_IDX(offset), OFF_TO_IDX(offset + size), FALSE); } start += size; } vm_map_unlock_read(map); return (KERN_SUCCESS); } /* * vm_map_entry_unwire: [ internal use only ] * * Make the region specified by this entry pageable. * * The map in question should be locked. * [This is the reason for this routine's existence.] 
*/ static void vm_map_entry_unwire(map, entry) vm_map_t map; register vm_map_entry_t entry; { vm_fault_unwire(map, entry->start, entry->end); entry->wired_count = 0; } /* * vm_map_entry_delete: [ internal use only ] * * Deallocate the given entry from the target map. */ static void vm_map_entry_delete(map, entry) register vm_map_t map; register vm_map_entry_t entry; { if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); vm_map_entry_unlink(map, entry); map->size -= entry->end - entry->start; if (entry->is_a_map || entry->is_sub_map) vm_map_deallocate(entry->object.share_map); else vm_object_deallocate(entry->object.vm_object); vm_map_entry_dispose(map, entry); } /* * vm_map_delete: [ internal use only ] * * Deallocates the given address range from the target * map. * * When called with a sharing map, removes pages from * that region from all physical maps. */ int vm_map_delete(map, start, end) register vm_map_t map; vm_offset_t start; register vm_offset_t end; { register vm_map_entry_t entry; vm_map_entry_t first_entry; /* * Find the start of the region, and clip it */ if (!vm_map_lookup_entry(map, start, &first_entry)) entry = first_entry->next; else { entry = first_entry; vm_map_clip_start(map, entry, start); /* * Fix the lookup hint now, rather than each time through the * loop. */ SAVE_HINT(map, entry->prev); } /* * Save the free space hint */ if (map->first_free->start >= start) map->first_free = entry->prev; /* * Step through all entries in this region */ while ((entry != &map->header) && (entry->start < end)) { vm_map_entry_t next; register vm_offset_t s, e; register vm_object_t object; vm_map_clip_end(map, entry, end); next = entry->next; s = entry->start; e = entry->end; /* * Unwire before removing addresses from the pmap; otherwise, * unwiring will put the entries back in the pmap. */ object = entry->object.vm_object; if (entry->wired_count != 0) vm_map_entry_unwire(map, entry); /* * If this is a sharing map, we must remove *all* references * to this data, since we can't find all of the physical maps * which are sharing it. */ if (object == kernel_object || object == kmem_object) vm_object_page_remove(object, OFF_TO_IDX(entry->offset), OFF_TO_IDX(entry->offset + (e - s)), FALSE); else if (!map->is_main_map) vm_object_pmap_remove(object, OFF_TO_IDX(entry->offset), OFF_TO_IDX(entry->offset + (e - s))); else pmap_remove(map->pmap, s, e); /* * Delete the entry (which may delete the object) only after * removing all pmap entries pointing to its pages. * (Otherwise, its page frames may be reallocated, and any * modify bits will be set in the wrong object!) */ vm_map_entry_delete(map, entry); entry = next; } return (KERN_SUCCESS); } /* * vm_map_remove: * * Remove the given address range from the target map. * This is the exported form of vm_map_delete. */ int vm_map_remove(map, start, end) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; { register int result, s = 0; if (map == kmem_map) s = splhigh(); vm_map_lock(map); VM_MAP_RANGE_CHECK(map, start, end); result = vm_map_delete(map, start, end); vm_map_unlock(map); if (map == kmem_map) splx(s); return (result); } /* * vm_map_check_protection: * * Assert that the target map allows the specified * privilege on the entire address region given. * The entire region must be allocated.
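* * A sketch of a typical caller (hypothetical; page rounding and the error path are the caller's concern): * * if (!vm_map_check_protection(map, trunc_page(addr), * round_page(addr + len), VM_PROT_READ)) * return (FALSE);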
*/ boolean_t vm_map_check_protection(map, start, end, protection) register vm_map_t map; register vm_offset_t start; register vm_offset_t end; register vm_prot_t protection; { register vm_map_entry_t entry; vm_map_entry_t tmp_entry; if (!vm_map_lookup_entry(map, start, &tmp_entry)) { return (FALSE); } entry = tmp_entry; while (start < end) { if (entry == &map->header) { return (FALSE); } /* * No holes allowed! */ if (start < entry->start) { return (FALSE); } /* * Check protection associated with entry. */ if ((entry->protection & protection) != protection) { return (FALSE); } /* go to next entry */ start = entry->end; entry = entry->next; } return (TRUE); } /* * vm_map_copy_entry: * * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. */ static void vm_map_copy_entry(src_map, dst_map, src_entry, dst_entry) vm_map_t src_map, dst_map; register vm_map_entry_t src_entry, dst_entry; { vm_pindex_t temp_pindex; if (src_entry->is_sub_map || dst_entry->is_sub_map) return; if (dst_entry->object.vm_object != NULL) printf("vm_map_copy_entry: dst_entry object not NULL!\n"); /* * If our destination map was wired down, unwire it now. */ if (dst_entry->wired_count != 0) vm_map_entry_unwire(dst_map, dst_entry); - /* - * If we're dealing with a sharing map, we must remove the destination - * pages from all maps (since we cannot know which maps this sharing - * map belongs in). - */ - - if (dst_map->is_main_map) - pmap_remove(dst_map->pmap, dst_entry->start, dst_entry->end); - else - vm_object_pmap_remove(dst_entry->object.vm_object, - OFF_TO_IDX(dst_entry->offset), - OFF_TO_IDX(dst_entry->offset + - (dst_entry->end - dst_entry->start))); - if (src_entry->wired_count == 0) { boolean_t src_needs_copy; /* * If the source entry is marked needs_copy, it is already * write-protected. */ if (!src_entry->needs_copy) { boolean_t su; /* * If the source entry has only one mapping, we can * just protect the virtual address range. */ if (!(su = src_map->is_main_map)) { su = (src_map->ref_count == 1); } +#ifdef VM_MAP_OLD if (su) { pmap_protect(src_map->pmap, src_entry->start, src_entry->end, src_entry->protection & ~VM_PROT_WRITE); } else { +#endif vm_object_pmap_copy(src_entry->object.vm_object, OFF_TO_IDX(src_entry->offset), OFF_TO_IDX(src_entry->offset + (src_entry->end - src_entry->start))); +#ifdef VM_MAP_OLD } +#endif } /* * Make a copy of the object. */ temp_pindex = OFF_TO_IDX(dst_entry->offset); vm_object_copy(src_entry->object.vm_object, OFF_TO_IDX(src_entry->offset), &dst_entry->object.vm_object, &temp_pindex, &src_needs_copy); dst_entry->offset = IDX_TO_OFF(temp_pindex); /* * If we didn't get a copy-object now, mark the source map * entry so that a shadow will be created to hold its changed * pages. */ if (src_needs_copy) src_entry->needs_copy = TRUE; /* * The destination always needs to have a shadow created. */ dst_entry->needs_copy = TRUE; /* * Mark the entries copy-on-write, so that write-enabling the * entry won't make copy-on-write pages writable. */ src_entry->copy_on_write = TRUE; dst_entry->copy_on_write = TRUE; pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start, dst_entry->end - dst_entry->start, src_entry->start); } else { /* * Of course, wired down pages can't be set copy-on-write. 
* Cause wired pages to be copied into the new map by * simulating faults (the new pages are pageable) */ vm_fault_copy_entry(dst_map, src_map, dst_entry, src_entry); } } /* * vmspace_fork: * Create a new process vmspace structure and vm_map * based on those of an existing process. The new map * is based on the old map, according to the inheritance * values on the regions in that map. * * The source map must not be locked. */ struct vmspace * vmspace_fork(vm1) register struct vmspace *vm1; { register struct vmspace *vm2; vm_map_t old_map = &vm1->vm_map; vm_map_t new_map; vm_map_entry_t old_entry; vm_map_entry_t new_entry; pmap_t new_pmap; vm_map_lock(old_map); vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, old_map->entries_pageable); bcopy(&vm1->vm_startcopy, &vm2->vm_startcopy, (caddr_t) (vm1 + 1) - (caddr_t) &vm1->vm_startcopy); new_pmap = &vm2->vm_pmap; /* XXX */ new_map = &vm2->vm_map; /* XXX */ old_entry = old_map->header.next; while (old_entry != &old_map->header) { if (old_entry->is_sub_map) panic("vm_map_fork: encountered a submap"); switch (old_entry->inheritance) { case VM_INHERIT_NONE: break; case VM_INHERIT_SHARE: /* * Clone the entry, referencing the sharing map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->wired_count = 0; ++new_entry->object.vm_object->ref_count; /* * Insert the entry into the new map -- we know we're * inserting at the end of the new map. */ vm_map_entry_link(new_map, new_map->header.prev, new_entry); /* * Update the physical map */ pmap_copy(new_map->pmap, old_map->pmap, new_entry->start, (old_entry->end - old_entry->start), old_entry->start); break; case VM_INHERIT_COPY: /* * Clone the entry and link into the map. */ new_entry = vm_map_entry_create(new_map); *new_entry = *old_entry; new_entry->wired_count = 0; new_entry->object.vm_object = NULL; new_entry->is_a_map = FALSE; vm_map_entry_link(new_map, new_map->header.prev, new_entry); - vm_map_copy_entry(old_map, new_map, old_entry, new_entry); + vm_map_copy_entry(old_map, new_map, old_entry, + new_entry); break; } old_entry = old_entry->next; } new_map->size = old_map->size; vm_map_unlock(old_map); return (vm2); } /* * vm_map_lookup: * * Finds the VM object, offset, and * protection for a given virtual address in the * specified map, assuming a page fault of the * type specified. * * Leaves the map in question locked for read; return * values are guaranteed until a vm_map_lookup_done * call is performed. Note that the map argument * is in/out; the returned map must be used in * the call to vm_map_lookup_done. * * A handle (out_entry) is returned for use in * vm_map_lookup_done, to make that fast. * * If a lookup is requested with "write protection" * specified, the map may be changed to perform virtual * copying operations, although the data referenced will * remain the same. */ int vm_map_lookup(var_map, vaddr, fault_type, out_entry, object, pindex, out_prot, wired, single_use) vm_map_t *var_map; /* IN/OUT */ register vm_offset_t vaddr; register vm_prot_t fault_type; vm_map_entry_t *out_entry; /* OUT */ vm_object_t *object; /* OUT */ vm_pindex_t *pindex; /* OUT */ vm_prot_t *out_prot; /* OUT */ boolean_t *wired; /* OUT */ boolean_t *single_use; /* OUT */ { vm_map_t share_map; vm_offset_t share_offset; register vm_map_entry_t entry; register vm_map_t map = *var_map; register vm_prot_t prot; register boolean_t su; RetryLookup:; /* * Lookup the faulting address. 
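* (For reference, a fault-handler-style caller is expected to bracket the lookup as in this sketch, variable names hypothetical: * * result = vm_map_lookup(&map, vaddr, fault_type, &entry, * &object, &pindex, &prot, &wired, &su); * ... * vm_map_lookup_done(map, entry); * * keeping the map read-locked in between.)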
*/ vm_map_lock_read(map); #define RETURN(why) \ { \ vm_map_unlock_read(map); \ return(why); \ } /* * If the map has an interesting hint, try it before calling full * blown lookup routine. */ entry = map->hint; *out_entry = entry; if ((entry == &map->header) || (vaddr < entry->start) || (vaddr >= entry->end)) { vm_map_entry_t tmp_entry; /* * Entry was either not a valid hint, or the vaddr was not * contained in the entry, so do a full lookup. */ if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) RETURN(KERN_INVALID_ADDRESS); entry = tmp_entry; *out_entry = entry; } /* * Handle submaps. */ if (entry->is_sub_map) { vm_map_t old_map = map; *var_map = map = entry->object.sub_map; vm_map_unlock_read(old_map); goto RetryLookup; } /* * Check whether this task is allowed to have this page. */ prot = entry->protection; if ((fault_type & (prot)) != fault_type) RETURN(KERN_PROTECTION_FAILURE); /* * If this page is not pageable, we have to get it for all possible * accesses. */ *wired = (entry->wired_count != 0); if (*wired) prot = fault_type = entry->protection; /* * If we don't already have a VM object, track it down. */ su = !entry->is_a_map; if (su) { share_map = map; share_offset = vaddr; } else { vm_map_entry_t share_entry; /* * Compute the sharing map, and offset into it. */ share_map = entry->object.share_map; share_offset = (vaddr - entry->start) + entry->offset; /* * Look for the backing store object and offset */ vm_map_lock_read(share_map); if (!vm_map_lookup_entry(share_map, share_offset, &share_entry)) { vm_map_unlock_read(share_map); RETURN(KERN_INVALID_ADDRESS); } entry = share_entry; } /* * If the entry was copy-on-write, we either ... */ if (entry->needs_copy) { /* * If we want to write the page, we may as well handle that * now since we've got the sharing map locked. * * If we don't need to write the page, we just demote the * permissions allowed. */ if (fault_type & VM_PROT_WRITE) { /* * Make a new object, and place it in the object * chain. Note that no new references have appeared * -- one just moved from the share map to the new * object. */ if (lock_read_to_write(&share_map->lock)) { if (share_map != map) vm_map_unlock_read(map); goto RetryLookup; } vm_object_shadow( &entry->object.vm_object, &entry->offset, OFF_TO_IDX(entry->end - entry->start)); entry->needs_copy = FALSE; lock_write_to_read(&share_map->lock); } else { /* * We're attempting to read a copy-on-write page -- * don't allow writes. */ prot &= (~VM_PROT_WRITE); } } /* * Create an object if necessary. */ if (entry->object.vm_object == NULL) { if (lock_read_to_write(&share_map->lock)) { if (share_map != map) vm_map_unlock_read(map); goto RetryLookup; } entry->object.vm_object = vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(entry->end - entry->start)); entry->offset = 0; lock_write_to_read(&share_map->lock); } /* * Return the object/offset from this entry. If the entry was * copy-on-write or empty, it has been fixed up. */ *pindex = OFF_TO_IDX((share_offset - entry->start) + entry->offset); *object = entry->object.vm_object; /* * Return whether this is the only map sharing this data. */ if (!su) { su = (share_map->ref_count == 1); } *out_prot = prot; *single_use = su; return (KERN_SUCCESS); #undef RETURN } /* * vm_map_lookup_done: * * Releases locks acquired by a vm_map_lookup * (according to the handle returned by that lookup). */ void vm_map_lookup_done(map, entry) register vm_map_t map; vm_map_entry_t entry; { /* * If this entry references a map, unlock it first. 
*/ if (entry->is_a_map) vm_map_unlock_read(entry->object.share_map); /* * Unlock the main-level map */ vm_map_unlock_read(map); } /* * Routine: vm_map_simplify * Purpose: * Attempt to simplify the map representation in * the vicinity of the given starting address. * Note: * This routine is intended primarily to keep the * kernel maps more compact -- they generally don't * benefit from the "expand a map entry" technology * at allocation time because the adjacent entry * is often wired down. */ void vm_map_simplify(map, start) vm_map_t map; vm_offset_t start; { vm_map_entry_t this_entry; vm_map_entry_t prev_entry; vm_map_lock(map); if ( (vm_map_lookup_entry(map, start, &this_entry)) && ((prev_entry = this_entry->prev) != &map->header) && (prev_entry->end == start) && (map->is_main_map) && (prev_entry->is_a_map == FALSE) && (prev_entry->is_sub_map == FALSE) && (this_entry->is_a_map == FALSE) && (this_entry->is_sub_map == FALSE) && (prev_entry->inheritance == this_entry->inheritance) && (prev_entry->protection == this_entry->protection) && (prev_entry->max_protection == this_entry->max_protection) && (prev_entry->wired_count == this_entry->wired_count) && (prev_entry->copy_on_write == this_entry->copy_on_write) && (prev_entry->needs_copy == this_entry->needs_copy) && (prev_entry->object.vm_object == this_entry->object.vm_object) && ((prev_entry->offset + (prev_entry->end - prev_entry->start)) == this_entry->offset) ) { if (map->first_free == this_entry) map->first_free = prev_entry; if (!this_entry->object.vm_object->paging_in_progress) { SAVE_HINT(map, prev_entry); vm_map_entry_unlink(map, this_entry); prev_entry->end = this_entry->end; vm_object_deallocate(this_entry->object.vm_object); vm_map_entry_dispose(map, this_entry); } } vm_map_unlock(map); } #ifdef DDB /* * vm_map_print: [ debug ] */ void vm_map_print(imap, full, dummy3, dummy4) /* db_expr_t */ int imap; boolean_t full; /* db_expr_t */ int dummy3; char *dummy4; { register vm_map_entry_t entry; register vm_map_t map = (vm_map_t)imap; /* XXX */ iprintf("%s map 0x%x: pmap=0x%x,ref=%d,nentries=%d,version=%d\n", (map->is_main_map ? "Task" : "Share"), (int) map, (int) (map->pmap), map->ref_count, map->nentries, map->timestamp); if (!full && indent) return; indent += 2; for (entry = map->header.next; entry != &map->header; entry = entry->next) { iprintf("map entry 0x%x: start=0x%x, end=0x%x, ", (int) entry, (int) entry->start, (int) entry->end); if (map->is_main_map) { static char *inheritance_name[4] = {"share", "copy", "none", "donate_copy"}; printf("prot=%x/%x/%s, ", entry->protection, entry->max_protection, inheritance_name[entry->inheritance]); if (entry->wired_count != 0) printf("wired, "); } if (entry->is_a_map || entry->is_sub_map) { printf("share=0x%x, offset=0x%x\n", (int) entry->object.share_map, (int) entry->offset); if ((entry->prev == &map->header) || (!entry->prev->is_a_map) || (entry->prev->object.share_map != entry->object.share_map)) { indent += 2; vm_map_print((int)entry->object.share_map, full, 0, (char *)0); indent -= 2; } } else { printf("object=0x%x, offset=0x%x", (int) entry->object.vm_object, (int) entry->offset); if (entry->copy_on_write) printf(", copy (%s)", entry->needs_copy ? 
"needed" : "done"); printf("\n"); if ((entry->prev == &map->header) || (entry->prev->is_a_map) || (entry->prev->object.vm_object != entry->object.vm_object)) { indent += 2; vm_object_print((int)entry->object.vm_object, full, 0, (char *)0); indent -= 2; } } } indent -= 2; } #endif Index: head/sys/vm/vm_map.h =================================================================== --- head/sys/vm/vm_map.h (revision 13489) +++ head/sys/vm/vm_map.h (revision 13490) @@ -1,234 +1,241 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_map.h 8.3 (Berkeley) 3/15/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
* - * $Id: vm_map.h,v 1.9 1995/12/11 04:58:14 dyson Exp $ + * $Id: vm_map.h,v 1.10 1995/12/14 09:55:00 phk Exp $ */ /* * Virtual memory map module definitions. */ #ifndef _VM_MAP_ #define _VM_MAP_ /* * Types defined: * * vm_map_t the high-level address map data structure. * vm_map_entry_t an entry in an address map. * vm_map_version_t a timestamp of a map, for use with vm_map_lookup */ /* * Objects which live in maps may be either VM objects, or * another map (called a "sharing map") which denotes read-write * sharing with other maps. */ union vm_map_object { struct vm_object *vm_object; /* object object */ struct vm_map *share_map; /* share map */ struct vm_map *sub_map; /* belongs to another map */ }; /* * Address map entries consist of start and end addresses, * a VM object (or sharing map) and offset into that object, * and user-exported inheritance and protection information. * Also included is control information for virtual copy operations. */ struct vm_map_entry { struct vm_map_entry *prev; /* previous entry */ struct vm_map_entry *next; /* next entry */ vm_offset_t start; /* start address */ vm_offset_t end; /* end address */ union vm_map_object object; /* object I point to */ vm_ooffset_t offset; /* offset into object */ boolean_t is_a_map:1, /* Is "object" a map? */ is_sub_map:1, /* Is "object" a submap? */ /* Only in sharing maps: */ copy_on_write:1, /* is data copy-on-write */ needs_copy:1; /* does object need to be copied */ /* Only in task maps: */ vm_prot_t protection; /* protection code */ vm_prot_t max_protection; /* maximum protection */ vm_inherit_t inheritance; /* inheritance */ int wired_count; /* can be paged if = 0 */ }; /* * Maps are doubly-linked lists of map entries, kept sorted * by address. A single hint is provided to start * searches again from the last successful search, * insertion, or removal. */ struct vm_map { struct pmap *pmap; /* Physical map */ lock_data_t lock; /* Lock for map data */ struct vm_map_entry header; /* List of entries */ int nentries; /* Number of entries */ vm_size_t size; /* virtual size */ boolean_t is_main_map; /* Am I a main map? */ int ref_count; /* Reference count */ vm_map_entry_t hint; /* hint for quick lookups */ vm_map_entry_t first_free; /* First free space hint */ boolean_t entries_pageable; /* map entries pageable?? */ unsigned int timestamp; /* Version number */ #define min_offset header.start #define max_offset header.end }; /* * Shareable process virtual address space. * May eventually be merged with vm_map. * Several fields are temporary (text, data stuff). */ struct vmspace { struct vm_map vm_map; /* VM address map */ struct pmap vm_pmap; /* private physical map */ int vm_refcnt; /* number of references */ caddr_t vm_shm; /* SYS5 shared memory private data XXX */ + vm_object_t vm_upages_obj; /* UPAGES object */ /* we copy from vm_startcopy to the end of the structure on fork */ #define vm_startcopy vm_rssize segsz_t vm_rssize; /* current resident set size in pages */ segsz_t vm_swrss; /* resident set size before last swap */ segsz_t vm_tsize; /* text size (pages) XXX */ segsz_t vm_dsize; /* data size (pages) XXX */ segsz_t vm_ssize; /* stack size (pages) */ caddr_t vm_taddr; /* user virtual address of text XXX */ caddr_t vm_daddr; /* user virtual address of data XXX */ caddr_t vm_maxsaddr; /* user VA at max stack growth */ caddr_t vm_minsaddr; /* user VA at max stack growth */ }; /* * Map versions are used to validate a previous lookup attempt. 
* * Since lookup operations may involve both a main map and * a sharing map, it is necessary to have a timestamp from each. * [If the main map timestamp has changed, the share_map and * associated timestamp are no longer valid; the map version * does not include a reference for the imbedded share_map.] */ typedef struct { int main_timestamp; vm_map_t share_map; int share_timestamp; } vm_map_version_t; /* * Macros: vm_map_lock, etc. * Function: * Perform locking on the data portion of a map. */ #define vm_map_lock(map) { \ lock_write(&(map)->lock); \ (map)->timestamp++; \ } #define vm_map_unlock(map) lock_write_done(&(map)->lock) #define vm_map_lock_read(map) lock_read(&(map)->lock) #define vm_map_unlock_read(map) lock_read_done(&(map)->lock) /* * Functions implemented as macros */ #define vm_map_min(map) ((map)->min_offset) #define vm_map_max(map) ((map)->max_offset) #define vm_map_pmap(map) ((map)->pmap) /* XXX: number of kernel maps and entries to statically allocate */ #define MAX_KMAP 10 #define MAX_KMAPENT 128 +/* + * Copy-on-write flags for vm_map operations + */ +#define MAP_COPY_NEEDED 0x1 +#define MAP_COPY_ON_WRITE 0x2 + #ifdef KERNEL extern vm_offset_t kentry_data; extern vm_size_t kentry_data_size; boolean_t vm_map_check_protection __P((vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t)); int vm_map_copy __P((vm_map_t, vm_map_t, vm_offset_t, vm_size_t, vm_offset_t, boolean_t, boolean_t)); struct pmap; vm_map_t vm_map_create __P((struct pmap *, vm_offset_t, vm_offset_t, boolean_t)); void vm_map_deallocate __P((vm_map_t)); int vm_map_delete __P((vm_map_t, vm_offset_t, vm_offset_t)); -int vm_map_find __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t)); +int vm_map_find __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, boolean_t, vm_prot_t, vm_prot_t, int)); int vm_map_findspace __P((vm_map_t, vm_offset_t, vm_size_t, vm_offset_t *)); int vm_map_inherit __P((vm_map_t, vm_offset_t, vm_offset_t, vm_inherit_t)); void vm_map_init __P((struct vm_map *, vm_offset_t, vm_offset_t, boolean_t)); -int vm_map_insert __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t)); +int vm_map_insert __P((vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t, vm_offset_t, vm_prot_t, vm_prot_t, int)); int vm_map_lookup __P((vm_map_t *, vm_offset_t, vm_prot_t, vm_map_entry_t *, vm_object_t *, vm_pindex_t *, vm_prot_t *, boolean_t *, boolean_t *)); void vm_map_lookup_done __P((vm_map_t, vm_map_entry_t)); boolean_t vm_map_lookup_entry __P((vm_map_t, vm_offset_t, vm_map_entry_t *)); int vm_map_pageable __P((vm_map_t, vm_offset_t, vm_offset_t, boolean_t)); int vm_map_clean __P((vm_map_t, vm_offset_t, vm_offset_t, boolean_t, boolean_t)); int vm_map_protect __P((vm_map_t, vm_offset_t, vm_offset_t, vm_prot_t, boolean_t)); void vm_map_reference __P((vm_map_t)); int vm_map_remove __P((vm_map_t, vm_offset_t, vm_offset_t)); void vm_map_simplify __P((vm_map_t, vm_offset_t)); void vm_map_startup __P((void)); int vm_map_submap __P((vm_map_t, vm_offset_t, vm_offset_t, vm_map_t)); #endif #endif /* _VM_MAP_ */ Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 13489) +++ head/sys/vm/vm_mmap.c (revision 13490) @@ -1,746 +1,739 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. 
* * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 - * $Id: vm_mmap.c,v 1.33 1995/12/13 12:28:39 dyson Exp $ + * $Id: vm_mmap.c,v 1.34 1995/12/17 07:19:57 bde Exp $ */ /* * Mapped file (mmap) interface to VM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif /* ARGSUSED */ int sbrk(p, uap, retval) struct proc *p; struct sbrk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* ARGSUSED */ int sstk(p, uap, retval) struct proc *p; struct sstk_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(p, uap, retval) struct proc *p; struct getpagesize_args *uap; int *retval; { *retval = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { caddr_t addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int mmap(p, uap, retval) struct proc *p; register struct mmap_args *uap; int *retval; { register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; vm_offset_t addr; vm_size_t size; vm_prot_t prot, maxprot; caddr_t handle; int flags, error; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; /* * Address (if FIXED) must be page aligned. 
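(That is, the PAGE_MASK bits of addr must be zero when MAP_FIXED is given.)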
Size is implicitly rounded * to a page boundary. */ addr = (vm_offset_t) uap->addr; if (((flags & MAP_FIXED) && (addr & PAGE_MASK)) || (ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); size = (vm_size_t) round_page(uap->len); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); } /* * XXX if no hint provided for a non-fixed mapping place it after the * end of the largest possible heap. * * There should really be a pmap call to determine a reasonable location. */ if (addr == 0 && (flags & MAP_FIXED) == 0) addr = round_page(p->p_vmspace->vm_daddr + MAXDSIZ); if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. */ if (((unsigned) uap->fd) >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EINVAL); vp = (struct vnode *) fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) return (EINVAL); /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; } else { /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? 
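(execute permission is granted unconditionally for file mappings at this point; the read and write bits are derived from the open-mode flags just below)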
*/ if (fp->f_flag & FREAD) maxprot |= VM_PROT_READ; else if (prot & PROT_READ) return (EACCES); if (flags & MAP_SHARED) { if (fp->f_flag & FWRITE) maxprot |= VM_PROT_WRITE; else if (prot & PROT_WRITE) return (EACCES); } else maxprot |= VM_PROT_WRITE; handle = (caddr_t) vp; } } error = vm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot, flags, handle, uap->pos); if (error == 0) *retval = (int) addr; return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(p, uap, retval) struct proc *p; register struct ommap_args *uap; int *retval; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 #define OMAP_INHERIT 0x0800 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; if (uap->flags & OMAP_INHERIT) nargs.flags |= MAP_INHERIT; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(p, &nargs, retval)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { caddr_t addr; int len; int flags; }; #endif int msync(p, uap, retval) struct proc *p; struct msync_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int flags; vm_map_t map; int rv; map = &p->p_vmspace->vm_map; addr = (vm_offset_t) uap->addr; size = round_page((vm_size_t) uap->len); flags = uap->flags; if (((int) addr & PAGE_MASK) || addr + size < addr || (flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); /* * XXX Gak! If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we don't * really keep track of individual mmaps so we approximate by flushing * the range of the map entry containing addr. This can be incorrect * if the region splits or is coalesced with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv == FALSE) return (EINVAL); addr = entry->start; size = entry->end - entry->start; } /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: break; case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_FAILURE: return (EIO); default: return (EINVAL); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { caddr_t addr; int len; }; #endif int munmap(p, uap, retval) register struct proc *p; register struct munmap_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; vm_map_t map; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->len < 0) return (EINVAL); size = (vm_size_t) round_page(uap->len); if (size == 0) return (0); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). 
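* For example (illustrative 32-bit values, not from this change): addr = 0xfffff000 * with size = 0x2000 makes addr + size wrap around to 0x1000, which the * (addr + size < addr) test below rejects.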
*/ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); map = &p->p_vmspace->vm_map; /* * Make sure entire range is allocated. */ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) return (EINVAL); /* returns nothing but KERN_SUCCESS anyway */ (void) vm_map_remove(map, addr, addr + size); return (0); } void munmapfd(p, fd) struct proc *p; int fd; { /* * XXX should unmap any regions mapped to this file */ p->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { caddr_t addr; int len; int prot; }; #endif int mprotect(p, uap, retval) struct proc *p; struct mprotect_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; register vm_prot_t prot; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->len < 0) return (EINVAL); size = (vm_size_t) uap->len; prot = uap->prot & VM_PROT_ALL; switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot, FALSE)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { caddr_t addr; int len; int behav; }; #endif /* ARGSUSED */ int madvise(p, uap, retval) struct proc *p; struct madvise_args *uap; int *retval; { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { caddr_t addr; int len; char *vec; }; #endif /* ARGSUSED */ int mincore(p, uap, retval) struct proc *p; struct mincore_args *uap; int *retval; { vm_offset_t addr; vm_offset_t end; char *vec; addr = trunc_page((vm_offset_t) uap->addr); end = addr + round_page((vm_size_t) uap->len); if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) return (EINVAL); if (end < addr) return (EINVAL); vec = uap->vec; while(addr < end) { int error; if (pmap_extract(&p->p_vmspace->vm_pmap, addr)) { error = subyte( vec, 1); } else { error = subyte( vec, 0); } if (error) return EFAULT; vec++; addr += PAGE_SIZE; } return (0); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { caddr_t addr; size_t len; }; #endif int mlock(p, uap, retval) struct proc *p; struct mlock_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int error; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); size = round_page((vm_size_t) uap->len); if (atop(size) + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef pmap_wired_count if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return (EAGAIN); #else error = suser(p->p_ucred, &p->p_acflag); if (error) return (error); #endif error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size, FALSE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { caddr_t addr; size_t len; }; #endif int munlock(p, uap, retval) struct proc *p; struct munlock_args *uap; int *retval; { vm_offset_t addr; vm_size_t size; int error; addr = (vm_offset_t) uap->addr; if ((addr & PAGE_MASK) || uap->addr + uap->len < uap->addr) return (EINVAL); #ifndef pmap_wired_count error = suser(p->p_ucred, &p->p_acflag); if (error) return (error); #endif size = round_page((vm_size_t) uap->len); error = vm_map_pageable(&p->p_vmspace->vm_map, addr, addr + size, TRUE); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * Internal version of mmap. * Currently used by mmap, exec, and sys5 shared memory. 
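* * As of this revision a private (non-MAP_SHARED) file mapping passes its * copy-on-write setup down to vm_map_find() as flags instead of patching * the map entry afterwards; condensed from the code below: * * docow = MAP_COPY_ON_WRITE | MAP_COPY_NEEDED; * rv = vm_map_find(map, object, foff, addr, size, fitit, * prot, maxprot, docow); *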
* Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(map, addr, size, prot, maxprot, flags, handle, foff) register vm_map_t map; register vm_offset_t *addr; register vm_size_t size; vm_prot_t prot, maxprot; register int flags; caddr_t handle; /* XXX should be vp */ vm_ooffset_t foff; { boolean_t fitit; - vm_object_t object; + vm_object_t object, object2; struct vnode *vp = NULL; objtype_t type; int rv = KERN_SUCCESS; - vm_size_t objsize; + vm_ooffset_t objsize; + int docow; struct proc *p = curproc; if (size == 0) return (0); objsize = size = round_page(size); /* * We currently can only deal with page aligned file offsets. * The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (flags & MAP_ANON) { type = OBJT_SWAP; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { vp = (struct vnode *) handle; if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = (caddr_t) vp->v_rdev; } else { struct vattr vat; int error; error = VOP_GETATTR(vp, &vat, p->p_ucred, p); if (error) return (error); - objsize = vat.va_size; + objsize = round_page(vat.va_size); type = OBJT_VNODE; } } - object = vm_pager_allocate(type, handle, objsize, prot, foff); + object = vm_pager_allocate(type, handle, OFF_TO_IDX(objsize), prot, foff); if (object == NULL) return (type == OBJT_DEVICE ? EINVAL : ENOMEM); - rv = vm_map_find(map, object, foff, addr, size, fitit); + object2 = NULL; + docow = 0; + if ((flags & (MAP_ANON|MAP_SHARED)) == 0 && (type != OBJT_DEVICE)) { + docow = MAP_COPY_ON_WRITE; + if (objsize < size) { + object2 = vm_object_allocate( OBJT_DEFAULT, + OFF_TO_IDX(size - (foff & ~(PAGE_SIZE - 1)))); + object2->backing_object = object; + object2->backing_object_offset = foff; + TAILQ_INSERT_TAIL(&object->shadow_head, + object2, shadow_list); + } else { + docow |= MAP_COPY_NEEDED; + } + } + if (object2) + rv = vm_map_find(map, object2, 0, addr, size, fitit, + prot, maxprot, docow); + else + rv = vm_map_find(map, object, foff, addr, size, fitit, + prot, maxprot, docow); + + if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ - vm_object_deallocate(object); + if (object2) + vm_object_deallocate(object2); + else + vm_object_deallocate(object); goto out; } /* - * mmap a COW regular file - */ - if ((flags & (MAP_ANON|MAP_SHARED)) == 0 && (type != OBJT_DEVICE)) { - vm_map_entry_t entry; - if (!vm_map_lookup_entry(map, *addr, &entry)) { - panic("vm_mmap: missing map entry!!!"); - } - entry->copy_on_write = TRUE; - /* - * This will create the processes private object on - * an as needed basis. - */ - entry->needs_copy = TRUE; - - /* - * set pages COW and protect for read access only - */ - vm_object_pmap_copy(object, foff, foff + size); - - } - - /* * "Pre-fault" resident pages. 
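* (That is, enter pmap mappings up front for pages of the vnode object * that are already resident, sparing the process one soft fault per page * on first touch; with this change the kernel map is excluded.)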
*/ - if ((type == OBJT_VNODE) && (map->pmap != NULL)) { + if ((map != kernel_map) && + (type == OBJT_VNODE) && (map->pmap != NULL)) { pmap_object_init_pt(map->pmap, *addr, object, (vm_pindex_t) OFF_TO_IDX(foff), size); } - /* - * Correct protection (default is VM_PROT_ALL). If maxprot is - * different than prot, we must set both explicitly. - */ - rv = KERN_SUCCESS; - if (maxprot != VM_PROT_ALL) - rv = vm_map_protect(map, *addr, *addr + size, maxprot, TRUE); - if (rv == KERN_SUCCESS && prot != maxprot) - rv = vm_map_protect(map, *addr, *addr + size, prot, FALSE); - if (rv != KERN_SUCCESS) { - (void) vm_map_remove(map, *addr, *addr + size); - goto out; - } /* * Shared memory is also shared with children. */ if (flags & MAP_SHARED) { rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) { (void) vm_map_remove(map, *addr, *addr + size); goto out; } } out: switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: head/sys/vm/vm_object.c =================================================================== --- head/sys/vm/vm_object.c (revision 13489) +++ head/sys/vm/vm_object.c (revision 13490) @@ -1,1393 +1,1445 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_object.c 8.5 (Berkeley) 3/22/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. 
* * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_object.c,v 1.61 1996/01/04 18:32:31 davidg Exp $ + * $Id: vm_object.c,v 1.62 1996/01/04 21:13:20 wollman Exp $ */ /* * Virtual memory object module. */ #include "opt_ddb.h" #include #include #include #include /* for curproc, pageproc */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB static void DDB_vm_object_check __P((void)); #endif static void _vm_object_allocate __P((objtype_t, vm_size_t, vm_object_t)); #ifdef DDB static int _vm_object_in_map __P((vm_map_t map, vm_object_t object, vm_map_entry_t entry)); static int vm_object_in_map __P((vm_object_t object)); #endif static void vm_object_qcollapse __P((vm_object_t object)); #ifdef not_used static void vm_object_deactivate_pages __P((vm_object_t)); #endif static void vm_object_terminate __P((vm_object_t)); static void vm_object_cache_trim __P((void)); /* * Virtual memory objects maintain the actual data * associated with allocated virtual memory. A given * page of memory exists within exactly one object. * * An object is only deallocated when all "references" * are given up. Only one "reference" to a given * region of an object should be writeable. * * Associated with each object is a list of all resident * memory pages belonging to that object; this list is * maintained by the "vm_page" module, and locked by the object's * lock. * * Each object also records a "pager" routine which is * used to retrieve (and store) pages to the proper backing * storage. In addition, objects may be backed by other * objects from which they were virtual-copied. 
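* * An illustrative shadow chain (not taken from this file): * * shadow2 -> shadow1 -> original_object * * where each arrow is a backing_object pointer; each level holds only * the pages written through it and defers to the object behind it for * everything else.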
* * The only items within the object structure which are * modified after time of creation are: * reference count locked by object's lock * pager routine locked by object's lock * */ int vm_object_cache_max; struct object_q vm_object_cached_list; static int vm_object_cached; struct object_q vm_object_list; static long vm_object_count; vm_object_t kernel_object; vm_object_t kmem_object; static struct vm_object kernel_object_store; static struct vm_object kmem_object_store; extern int vm_pageout_page_count; static long object_collapses; static long object_bypasses; static void _vm_object_allocate(type, size, object) objtype_t type; vm_size_t size; register vm_object_t object; { TAILQ_INIT(&object->memq); TAILQ_INIT(&object->shadow_head); object->type = type; object->size = size; object->ref_count = 1; object->flags = 0; object->paging_in_progress = 0; object->resident_page_count = 0; object->handle = NULL; object->paging_offset = (vm_ooffset_t) 0; object->backing_object = NULL; object->backing_object_offset = (vm_ooffset_t) 0; object->last_read = 0; TAILQ_INSERT_TAIL(&vm_object_list, object, object_list); vm_object_count++; } /* * vm_object_init: * * Initialize the VM objects module. */ void vm_object_init() { TAILQ_INIT(&vm_object_cached_list); TAILQ_INIT(&vm_object_list); vm_object_count = 0; vm_object_cache_max = 84; if (cnt.v_page_count > 1000) vm_object_cache_max += (cnt.v_page_count - 1000) / 3; kernel_object = &kernel_object_store; _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kernel_object); kmem_object = &kmem_object_store; _vm_object_allocate(OBJT_DEFAULT, OFF_TO_IDX(VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS), kmem_object); } /* * vm_object_allocate: * * Returns a new object with the given size. */ vm_object_t vm_object_allocate(type, size) objtype_t type; vm_size_t size; { register vm_object_t result; result = (vm_object_t) malloc((u_long) sizeof *result, M_VMOBJ, M_WAITOK); _vm_object_allocate(type, size, result); return (result); } /* * vm_object_reference: * * Gets another reference to the given object. */ inline void vm_object_reference(object) register vm_object_t object; { if (object == NULL) return; if (object->ref_count == 0) { if ((object->flags & OBJ_CANPERSIST) == 0) panic("vm_object_reference: non-persistent object with 0 ref_count"); TAILQ_REMOVE(&vm_object_cached_list, object, cached_list); vm_object_cached--; } object->ref_count++; } /* * vm_object_deallocate: * * Release a reference to the specified object, * gained either through a vm_object_allocate * or a vm_object_reference call. When all references * are gone, storage associated with this object * may be relinquished. * * No object may be locked. 
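* * A minimal usage sketch (hypothetical calls, not code from this file): * * obj = vm_object_allocate(OBJT_DEFAULT, 1); ref_count is now 1 * vm_object_reference(obj); ref_count is now 2 * vm_object_deallocate(obj); ref_count drops back to 1 * vm_object_deallocate(obj); last reference: terminate or cache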
*/ void vm_object_deallocate(object) vm_object_t object; { vm_object_t temp; while (object != NULL) { if (object->ref_count == 0) panic("vm_object_deallocate: object deallocated too many times"); /* * Lose the reference */ object->ref_count--; if (object->ref_count != 0) { if ((object->ref_count == 1) && (object->handle == NULL) && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { vm_object_t robject; robject = object->shadow_head.tqh_first; if ((robject != NULL) && (robject->handle == NULL) && (robject->type == OBJT_DEFAULT || robject->type == OBJT_SWAP)) { int s; robject->ref_count += 2; object->ref_count += 2; do { s = splhigh(); while (robject->paging_in_progress) { robject->flags |= OBJ_PIPWNT; tsleep(robject, PVM, "objde1", 0); } while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "objde2", 0); } splx(s); } while( object->paging_in_progress || robject->paging_in_progress); object->ref_count -= 2; robject->ref_count -= 2; if( robject->ref_count == 0) { robject->ref_count += 1; object = robject; continue; } vm_object_collapse(robject); return; } } /* * If there are still references, then we are done. */ return; } if (object->type == OBJT_VNODE) { struct vnode *vp = object->handle; vp->v_flag &= ~VTEXT; } /* * See if this object can persist and has some resident * pages. If so, enter it in the cache. */ if (object->flags & OBJ_CANPERSIST) { if (object->resident_page_count != 0) { vm_object_page_clean(object, 0, 0 ,TRUE, TRUE); TAILQ_INSERT_TAIL(&vm_object_cached_list, object, cached_list); vm_object_cached++; vm_object_cache_trim(); return; } else { object->flags &= ~OBJ_CANPERSIST; } } /* * Make sure no one uses us. */ object->flags |= OBJ_DEAD; temp = object->backing_object; if (temp) TAILQ_REMOVE(&temp->shadow_head, object, shadow_list); vm_object_terminate(object); /* unlocks and deallocates object */ object = temp; } } /* * vm_object_terminate actually destroys the specified object, freeing * up all previously used resources. * * The object must be locked. */ static void vm_object_terminate(object) register vm_object_t object; { register vm_page_t p; int s; /* * wait for the pageout daemon to be done with the object */ s = splhigh(); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "objtrm", 0); } splx(s); if (object->paging_in_progress != 0) panic("vm_object_deallocate: pageout in progress"); /* * Clean and free the pages, as appropriate. All references to the * object are gone, so we don't need to lock it. */ if (object->type == OBJT_VNODE) { struct vnode *vp = object->handle; VOP_LOCK(vp); vm_object_page_clean(object, 0, 0, TRUE, FALSE); vinvalbuf(vp, V_SAVE, NOCRED, NULL, 0, 0); VOP_UNLOCK(vp); } /* * Now free the pages. For internal objects, this also removes them * from paging queues. */ while ((p = object->memq.tqh_first) != NULL) { if (p->flags & PG_BUSY) printf("vm_object_terminate: freeing busy page\n"); PAGE_WAKEUP(p); vm_page_free(p); cnt.v_pfree++; } /* * Let the pager know object is dead. */ vm_pager_deallocate(object); TAILQ_REMOVE(&vm_object_list, object, object_list); vm_object_count--; wakeup(object); /* * Free the space for the object. */ free((caddr_t) object, M_VMOBJ); } /* * vm_object_page_clean * * Clean all dirty pages in the specified range of object. * Leaves page on whatever queue it is currently on. * * Odd semantics: if start == end, we clean everything. * * The object must be locked. 
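* * E.g. vm_object_page_clean(object, 0, 0, TRUE, TRUE) synchronously * writes back every dirty page of a vnode-backed object and holds the * vnode lock for the duration (illustrative call).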
*/ void vm_object_page_clean(object, start, end, syncio, lockflag) vm_object_t object; vm_pindex_t start; vm_pindex_t end; boolean_t syncio; boolean_t lockflag; { - register vm_page_t p; + register vm_page_t p, np, tp; register vm_offset_t tstart, tend; + vm_pindex_t pi; int s; struct vnode *vp; int runlen; + int maxf; + int chkb; + int maxb; + int i; + vm_page_t maf[vm_pageout_page_count]; + vm_page_t mab[vm_pageout_page_count]; vm_page_t ma[vm_pageout_page_count]; if (object->type != OBJT_VNODE || (object->flags & OBJ_MIGHTBEDIRTY) == 0) return; vp = object->handle; if (lockflag) VOP_LOCK(vp); object->flags |= OBJ_CLEANING; tstart = start; if (end == 0) { tend = object->size; } else { tend = end; } if ((tstart == 0) && (tend == object->size)) { object->flags &= ~(OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY); } + for(p = object->memq.tqh_first; p; p = p->listq.tqe_next) + p->flags |= PG_CLEANCHK; - runlen = 0; - for(;tstart < tend; tstart += 1) { -relookup: - p = vm_page_lookup(object, tstart); - if (!p) { - if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; - } +rescan: + for(p = object->memq.tqh_first; p; p = np) { + np = p->listq.tqe_next; + + pi = p->pindex; + if (((p->flags & PG_CLEANCHK) == 0) || + (pi < tstart) || (pi >= tend) || + (p->valid == 0) || (p->queue == PQ_CACHE)) { + p->flags &= ~PG_CLEANCHK; continue; } - if ((p->valid == 0) || (p->flags & PG_CACHE)) { - if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; - } + + vm_page_test_dirty(p); + if ((p->dirty & p->valid) == 0) { + p->flags &= ~PG_CLEANCHK; continue; } - vm_page_protect(p, VM_PROT_READ); - s = splhigh(); - while ((p->flags & PG_BUSY) || p->busy) { - if (runlen > 0) { - splx(s); - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; - goto relookup; - } + if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED|PG_REFERENCED; tsleep(p, PVM, "vpcwai", 0); splx(s); - goto relookup; + goto rescan; } splx(s); + + maxf = 0; + for(i=1;i<vm_pageout_page_count;i++) { + if (tp = vm_page_lookup(object, pi + i)) { + if ((tp->flags & PG_BUSY) || + (tp->flags & PG_CLEANCHK) == 0) + break; + vm_page_test_dirty(tp); + if ((tp->dirty & tp->valid) == 0) { + tp->flags &= ~PG_CLEANCHK; + break; + } + maf[ i - 1 ] = tp; + maxf++; + continue; + } + break; + } - if (p->dirty == 0) - vm_page_test_dirty(p); - - if ((p->valid & p->dirty) != 0) { - ma[runlen] = p; - p->flags |= PG_BUSY; - runlen++; - if (runlen >= vm_pageout_page_count) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; + maxb = 0; + chkb = vm_pageout_page_count - maxf; + if (chkb) { + for(i = 1; i < chkb;i++) { + if (tp = vm_page_lookup(object, pi - i)) { + if ((tp->flags & PG_BUSY) || + (tp->flags & PG_CLEANCHK) == 0) + break; + vm_page_test_dirty(tp); + if ((tp->dirty & tp->valid) == 0) { + tp->flags &= ~PG_CLEANCHK; + break; + } + mab[ i - 1 ] = tp; + maxb++; + continue; + } + break; } - } else if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - runlen = 0; } - + + for(i=0;i<maxb;i++) { + int index = (maxb - i) - 1; + ma[index] = mab[i]; + ma[index]->flags |= PG_BUSY; + ma[index]->flags &= ~PG_CLEANCHK; + vm_page_protect(ma[index], VM_PROT_READ); + } + vm_page_protect(p, VM_PROT_READ); + p->flags |= PG_BUSY; + p->flags &= ~PG_CLEANCHK; + ma[maxb] = p; + for(i=0;i<maxf;i++) { + int index = (maxb + i) + 1; + ma[index] = maf[i]; + ma[index]->flags |= PG_BUSY; + ma[index]->flags &= ~PG_CLEANCHK; + vm_page_protect(ma[index], VM_PROT_READ); + } + runlen = maxb + maxf + 1; +/* + printf("maxb: %d, maxf: %d, runlen: %d, offset: %d\n", maxb, maxf, runlen, ma[0]->pindex); +*/ + vm_pageout_flush(ma, runlen, 0); + goto rescan; } - if (runlen > 0) { - vm_pageout_flush(ma, runlen, syncio); - } VOP_FSYNC(vp, NULL, syncio, curproc); if (lockflag) VOP_UNLOCK(vp); object->flags &=
~OBJ_CLEANING; return; } #ifdef not_used /* XXX I cannot tell if this should be an exported symbol */ /* * vm_object_deactivate_pages * * Deactivate all pages in the specified object. (Keep its pages * in memory even though it is no longer referenced.) * * The object must be locked. */ static void vm_object_deactivate_pages(object) register vm_object_t object; { register vm_page_t p, next; for (p = object->memq.tqh_first; p != NULL; p = next) { next = p->listq.tqe_next; vm_page_deactivate(p); } } #endif /* * Trim the object cache to size. */ static void vm_object_cache_trim() { register vm_object_t object; while (vm_object_cached > vm_object_cache_max) { object = vm_object_cached_list.tqh_first; vm_object_reference(object); pager_cache(object, FALSE); } } /* * vm_object_pmap_copy: * * Makes all physical pages in the specified * object range copy-on-write. No writeable * references to these pages should remain. * * The object must *not* be locked. */ void vm_object_pmap_copy(object, start, end) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; { register vm_page_t p; if (object == NULL || (object->flags & OBJ_WRITEABLE) == 0) return; for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { vm_page_protect(p, VM_PROT_READ); } object->flags &= ~OBJ_WRITEABLE; } /* * vm_object_pmap_remove: * * Removes all physical pages in the specified * object range from all physical maps. * * The object must *not* be locked. */ void vm_object_pmap_remove(object, start, end) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; { register vm_page_t p; if (object == NULL) return; for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { - vm_page_protect(p, VM_PROT_NONE); + if (p->pindex >= start && p->pindex < end) + vm_page_protect(p, VM_PROT_NONE); } } /* * vm_object_copy: * * Create a new object which is a copy of an existing * object, and mark all of the pages in the existing * object 'copy-on-write'. The new object has one reference. * Returns the new object. * * May defer the copy until later if the object is not backed * up by a non-default pager. */ void vm_object_copy(src_object, src_offset, dst_object, dst_offset, src_needs_copy) register vm_object_t src_object; vm_pindex_t src_offset; vm_object_t *dst_object;/* OUT */ vm_pindex_t *dst_offset;/* OUT */ boolean_t *src_needs_copy; /* OUT */ { if (src_object == NULL) { /* * Nothing to copy */ *dst_object = NULL; *dst_offset = 0; *src_needs_copy = FALSE; return; } /* * Try to collapse the object before copying it. */ if (src_object->handle == NULL && (src_object->type == OBJT_DEFAULT || src_object->type == OBJT_SWAP)) vm_object_collapse(src_object); /* * Make another reference to the object */ src_object->ref_count++; *dst_object = src_object; *dst_offset = src_offset; /* * Must make a shadow when write is desired */ *src_needs_copy = TRUE; return; } /* * vm_object_shadow: * * Create a new object which is backed by the * specified existing object range. The source * object reference is deallocated. * * The new object and offset into that object * are returned in the source parameters. 
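* * Calling convention, sketched with hypothetical variables: * * vm_object_t obj = some_object; vm_ooffset_t off = some_offset; * vm_object_shadow(&obj, &off, length); * * afterwards obj names the new OBJT_DEFAULT shadow, off is 0, and the * caller's old reference has moved behind the shadow.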
*/ void vm_object_shadow(object, offset, length) vm_object_t *object; /* IN/OUT */ vm_ooffset_t *offset; /* IN/OUT */ vm_size_t length; { register vm_object_t source; register vm_object_t result; source = *object; /* * Allocate a new object with the given length */ if ((result = vm_object_allocate(OBJT_DEFAULT, length)) == NULL) panic("vm_object_shadow: no object for shadowing"); /* * The new object shadows the source object, adding a reference to it. * Our caller changes his reference to point to the new object, * removing a reference to the source object. Net result: no change * of reference count. */ result->backing_object = source; if (source) TAILQ_INSERT_TAIL(&result->backing_object->shadow_head, result, shadow_list); /* * Store the offset into the source object, and fix up the offset into * the new object. */ result->backing_object_offset = *offset; /* * Return the new things */ *offset = 0; *object = result; } /* * this version of collapse allows the operation to occur earlier and * when paging_in_progress is true for an object... This is not a complete * operation, but should plug 99.9% of the rest of the leaks. */ static void vm_object_qcollapse(object) register vm_object_t object; { register vm_object_t backing_object; register vm_pindex_t backing_offset_index, paging_offset_index; vm_pindex_t backing_object_paging_offset_index; vm_pindex_t new_pindex; register vm_page_t p, pp; register vm_size_t size; backing_object = object->backing_object; if (backing_object->ref_count != 1) return; backing_object->ref_count += 2; backing_offset_index = OFF_TO_IDX(object->backing_object_offset); backing_object_paging_offset_index = OFF_TO_IDX(backing_object->paging_offset); paging_offset_index = OFF_TO_IDX(object->paging_offset); size = object->size; p = backing_object->memq.tqh_first; while (p) { vm_page_t next; next = p->listq.tqe_next; - if ((p->flags & (PG_BUSY | PG_FICTITIOUS | PG_CACHE)) || - !p->valid || p->hold_count || p->wire_count || p->busy) { + if ((p->flags & (PG_BUSY | PG_FICTITIOUS)) || + (p->queue == PQ_CACHE) || !p->valid || p->hold_count || p->wire_count || p->busy) { p = next; continue; } vm_page_protect(p, VM_PROT_NONE); new_pindex = p->pindex - backing_offset_index; if (p->pindex < backing_offset_index || new_pindex >= size) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index+p->pindex, 1); vm_page_free(p); } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, paging_offset_index + new_pindex, NULL, NULL))) { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index + p->pindex, 1); vm_page_free(p); } else { if (backing_object->type == OBJT_SWAP) swap_pager_freespace(backing_object, backing_object_paging_offset_index + p->pindex, 1); vm_page_rename(p, object, new_pindex); p->dirty = VM_PAGE_BITS_ALL; } } p = next; } backing_object->ref_count -= 2; } /* * vm_object_collapse: * * Collapse an object with the object backing it. * Pages in the backing object are moved into the * parent, and the backing object is deallocated. */ void vm_object_collapse(object) vm_object_t object; { vm_object_t backing_object; vm_ooffset_t backing_offset; vm_size_t size; vm_pindex_t new_pindex, backing_offset_index; vm_page_t p, pp; while (TRUE) { /* * Verify that the conditions are right for collapse: * * The object exists and no pages in it are currently being paged * out. 
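* (The common source of such chains is copy-on-write: shadow objects * created at fork time become redundant once one side exits, and * collapsing folds them back into a single object.)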
*/ if (object == NULL) return; /* * Make sure there is a backing object. */ if ((backing_object = object->backing_object) == NULL) return; /* * we check the backing object first, because it is most likely * not collapsable. */ if (backing_object->handle != NULL || (backing_object->type != OBJT_DEFAULT && backing_object->type != OBJT_SWAP) || (backing_object->flags & OBJ_DEAD) || object->handle != NULL || (object->type != OBJT_DEFAULT && object->type != OBJT_SWAP) || (object->flags & OBJ_DEAD)) { return; } if (object->paging_in_progress != 0 || backing_object->paging_in_progress != 0) { vm_object_qcollapse(object); return; } /* * We know that we can either collapse the backing object (if * the parent is the only reference to it) or (perhaps) remove * the parent's reference to it. */ backing_offset = object->backing_object_offset; backing_offset_index = OFF_TO_IDX(backing_offset); size = object->size; /* * If there is exactly one reference to the backing object, we * can collapse it into the parent. */ if (backing_object->ref_count == 1) { backing_object->flags |= OBJ_DEAD; /* * We can collapse the backing object. * * Move all in-memory pages from backing_object to the * parent. Pages that have been paged out will be * overwritten by any of the parent's pages that * shadow them. */ while ((p = backing_object->memq.tqh_first) != 0) { new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this * page falls outside the parent, dispose of * it. * * Otherwise, move it as planned. */ if (p->pindex < backing_offset_index || new_pindex >= size) { vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } else { pp = vm_page_lookup(object, new_pindex); if (pp != NULL || (object->type == OBJT_SWAP && vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL))) { vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } else { vm_page_rename(p, object, new_pindex); } } } /* * Move the pager from backing_object to object. */ if (backing_object->type == OBJT_SWAP) { backing_object->paging_in_progress++; if (object->type == OBJT_SWAP) { object->paging_in_progress++; /* * copy shadow object pages into ours * and destroy unneeded pages in * shadow object. */ swap_pager_copy( backing_object, OFF_TO_IDX(backing_object->paging_offset), object, OFF_TO_IDX(object->paging_offset), OFF_TO_IDX(object->backing_object_offset)); vm_object_pip_wakeup(object); } else { object->paging_in_progress++; /* * move the shadow backing_object's pager data to * "object" and convert "object" type to OBJT_SWAP. */ object->type = OBJT_SWAP; object->un_pager.swp.swp_nblocks = backing_object->un_pager.swp.swp_nblocks; object->un_pager.swp.swp_allocsize = backing_object->un_pager.swp.swp_allocsize; object->un_pager.swp.swp_blocks = backing_object->un_pager.swp.swp_blocks; object->un_pager.swp.swp_poip = /* XXX */ backing_object->un_pager.swp.swp_poip; object->paging_offset = backing_object->paging_offset + backing_offset; TAILQ_INSERT_TAIL(&swap_pager_un_object_list, object, pager_object_list); /* * Convert backing object from OBJT_SWAP to * OBJT_DEFAULT. XXX - only the TAILQ_REMOVE is * actually necessary. */ backing_object->type = OBJT_DEFAULT; TAILQ_REMOVE(&swap_pager_un_object_list, backing_object, pager_object_list); /* * free unnecessary blocks */ swap_pager_freespace(object, 0, OFF_TO_IDX(object->paging_offset)); vm_object_pip_wakeup(object); } vm_object_pip_wakeup(backing_object); } /* * Object now shadows whatever backing_object did. 
* Note that the reference to backing_object->backing_object * moves from within backing_object to within object. */ TAILQ_REMOVE(&object->backing_object->shadow_head, object, shadow_list); if (backing_object->backing_object) TAILQ_REMOVE(&backing_object->backing_object->shadow_head, backing_object, shadow_list); object->backing_object = backing_object->backing_object; if (object->backing_object) TAILQ_INSERT_TAIL(&object->backing_object->shadow_head, object, shadow_list); object->backing_object_offset += backing_object->backing_object_offset; /* * Discard backing_object. * * Since the backing object has no pages, no pager left, * and no object references within it, all that is * necessary is to dispose of it. */ TAILQ_REMOVE(&vm_object_list, backing_object, object_list); vm_object_count--; free((caddr_t) backing_object, M_VMOBJ); object_collapses++; } else { /* * If all of the pages in the backing object are * shadowed by the parent object, the parent object no * longer has to shadow the backing object; it can * shadow the next one in the chain. * * The backing object must not be paged out - we'd have * to check all of the paged-out pages, as well. */ if (backing_object->type != OBJT_DEFAULT) { return; } /* * Should have a check for a 'small' number of pages * here. */ for (p = backing_object->memq.tqh_first; p; p = p->listq.tqe_next) { new_pindex = p->pindex - backing_offset_index; /* * If the parent has a page here, or if this * page falls outside the parent, keep going. * * Otherwise, the backing_object must be left in * the chain. */ if (p->pindex >= backing_offset_index && new_pindex <= size) { pp = vm_page_lookup(object, new_pindex); if ((pp == NULL || pp->valid == 0) && !vm_pager_has_page(object, OFF_TO_IDX(object->paging_offset) + new_pindex, NULL, NULL)) { /* * Page still needed. Can't go any * further. */ return; } } } /* * Make the parent shadow the next object in the * chain. Deallocating backing_object will not remove * it, since its reference count is at least 2. */ TAILQ_REMOVE(&object->backing_object->shadow_head, object, shadow_list); vm_object_reference(object->backing_object = backing_object->backing_object); if (object->backing_object) TAILQ_INSERT_TAIL(&object->backing_object->shadow_head, object, shadow_list); object->backing_object_offset += backing_object->backing_object_offset; /* * Drop the reference count on backing_object. Since * its ref_count was at least 2, it will not vanish; * so we don't need to call vm_object_deallocate. */ if (backing_object->ref_count == 1) printf("should have called obj deallocate\n"); backing_object->ref_count--; object_bypasses++; } /* * Try again with this object's new backing object. */ } } /* * vm_object_page_remove: [internal] * * Removes all physical pages in the specified * object range from the object's list of pages. * * The object must be locked. 
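* * E.g. vm_object_page_remove(object, 0, OFF_TO_IDX(len), FALSE) frees * every resident page in the range regardless of modification state; * with clean_only == TRUE dirty pages are spared, and as of this change * wired pages are only invalidated, never freed (illustrative call).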
*/ void vm_object_page_remove(object, start, end, clean_only) register vm_object_t object; register vm_pindex_t start; register vm_pindex_t end; boolean_t clean_only; { register vm_page_t p, next; unsigned int size; int s; if (object == NULL) return; object->paging_in_progress++; again: size = end - start; if (size > 4 || size >= object->size / 4) { for (p = object->memq.tqh_first; p != NULL; p = next) { next = p->listq.tqe_next; + if (p->wire_count != 0) { + vm_page_protect(p, VM_PROT_NONE); + p->valid = 0; + continue; + } if ((start <= p->pindex) && (p->pindex < end)) { s = splhigh(); - if (p->bmapped) { - splx(s); - continue; - } if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED; tsleep(p, PVM, "vmopar", 0); splx(s); goto again; } splx(s); if (clean_only) { vm_page_test_dirty(p); if (p->valid & p->dirty) continue; } vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } } } else { while (size > 0) { - while ((p = vm_page_lookup(object, start)) != 0) { - s = splhigh(); - if (p->bmapped) { - splx(s); - break; + if ((p = vm_page_lookup(object, start)) != 0) { + if (p->wire_count != 0) { + p->valid = 0; + vm_page_protect(p, VM_PROT_NONE); + start += 1; + size -= 1; + continue; } + s = splhigh(); if ((p->flags & PG_BUSY) || p->busy) { p->flags |= PG_WANTED; tsleep(p, PVM, "vmopar", 0); splx(s); goto again; } splx(s); if (clean_only) { vm_page_test_dirty(p); - if (p->valid & p->dirty) + if (p->valid & p->dirty) { + start += 1; + size -= 1; continue; + } } vm_page_protect(p, VM_PROT_NONE); PAGE_WAKEUP(p); vm_page_free(p); } start += 1; size -= 1; } } vm_object_pip_wakeup(object); } /* * Routine: vm_object_coalesce * Function: Coalesces two objects backing up adjoining * regions of memory into a single object. * * returns TRUE if objects were combined. * * NOTE: Only works at the moment if the second object is NULL - * if it's not, which object do we lock first? * * Parameters: * prev_object First object to coalesce * prev_offset Offset into prev_object * next_object Second object into coalesce * next_offset Offset into next_object * * prev_size Size of reference to prev_object * next_size Size of reference to next_object * * Conditions: * The object must *not* be locked. */ boolean_t vm_object_coalesce(prev_object, prev_pindex, prev_size, next_size) register vm_object_t prev_object; vm_pindex_t prev_pindex; vm_size_t prev_size, next_size; { vm_size_t newsize; if (prev_object == NULL) { return (TRUE); } /* * Try to collapse the object first */ vm_object_collapse(prev_object); /* * Can't coalesce if: . more than one reference . paged out . shadows * another object . has a copy elsewhere (any of which mean that the * pages not mapped to prev_entry may be in use anyway) */ if (prev_object->ref_count > 1 || prev_object->type != OBJT_DEFAULT || prev_object->backing_object != NULL) { return (FALSE); } prev_size >>= PAGE_SHIFT; next_size >>= PAGE_SHIFT; /* * Remove any pages that may still be in the object from a previous * deallocation. */ vm_object_page_remove(prev_object, prev_pindex + prev_size, prev_pindex + prev_size + next_size, FALSE); /* * Extend the object if necessary. 
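* (For instance prev_pindex 16 with prev_size 4 and next_size 2 pages * yields newsize 22; the object is only ever grown here, never shrunk.)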
*/ newsize = prev_pindex + prev_size + next_size; if (newsize > prev_object->size) prev_object->size = newsize; return (TRUE); } #ifdef DDB static int _vm_object_in_map(map, object, entry) vm_map_t map; vm_object_t object; vm_map_entry_t entry; { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; int entcount; if (map == 0) return 0; if (entry == 0) { tmpe = map->header.next; entcount = map->nentries; while (entcount-- && (tmpe != &map->header)) { if( _vm_object_in_map(map, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (entry->is_sub_map || entry->is_a_map) { tmpm = entry->object.share_map; tmpe = tmpm->header.next; entcount = tmpm->nentries; while (entcount-- && tmpe != &tmpm->header) { if( _vm_object_in_map(tmpm, object, tmpe)) { return 1; } tmpe = tmpe->next; } } else if (obj = entry->object.vm_object) { for(; obj; obj=obj->backing_object) if( obj == object) { return 1; } } return 0; } static int vm_object_in_map( object) vm_object_t object; { struct proc *p; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { if( !p->p_vmspace /* || (p->p_flag & (P_SYSTEM|P_WEXIT)) */) continue; /* if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } */ if( _vm_object_in_map(&p->p_vmspace->vm_map, object, 0)) return 1; } if( _vm_object_in_map( kernel_map, object, 0)) return 1; if( _vm_object_in_map( kmem_map, object, 0)) return 1; if( _vm_object_in_map( pager_map, object, 0)) return 1; if( _vm_object_in_map( buffer_map, object, 0)) return 1; if( _vm_object_in_map( io_map, object, 0)) return 1; if( _vm_object_in_map( phys_map, object, 0)) return 1; if( _vm_object_in_map( mb_map, object, 0)) return 1; if( _vm_object_in_map( u_map, object, 0)) return 1; return 0; } #ifdef DDB static void DDB_vm_object_check() { vm_object_t object; /* * make sure that internal objs are in a map somewhere * and none have zero ref counts. 
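* (A report from this check indicates a leaked anonymous object: one * that still holds references yet is reachable from no map entry.)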
*/ for (object = vm_object_list.tqh_first; object != NULL; object = object->object_list.tqe_next) { if (object->handle == NULL && (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP)) { if (object->ref_count == 0) { printf("vmochk: internal obj has zero ref count: %d\n", object->size); } if (!vm_object_in_map(object)) { printf("vmochk: internal obj is not in a map: " "ref: %d, size: %d: 0x%x, backing_object: 0x%x\n", object->ref_count, object->size, object->size, object->backing_object); } } } } #endif /* DDB */ /* * vm_object_print: [ debug ] */ void vm_object_print(iobject, full, dummy3, dummy4) /* db_expr_t */ int iobject; boolean_t full; /* db_expr_t */ int dummy3; char *dummy4; { vm_object_t object = (vm_object_t)iobject; /* XXX */ register vm_page_t p; register int count; if (object == NULL) return; iprintf("Object 0x%x: size=0x%x, res=%d, ref=%d, ", (int) object, (int) object->size, object->resident_page_count, object->ref_count); printf("offset=0x%x, backing_object=(0x%x)+0x%x\n", (int) object->paging_offset, (int) object->backing_object, (int) object->backing_object_offset); printf("cache: next=%p, prev=%p\n", object->cached_list.tqe_next, object->cached_list.tqe_prev); if (!full) return; indent += 2; count = 0; for (p = object->memq.tqh_first; p != NULL; p = p->listq.tqe_next) { if (count == 0) iprintf("memory:="); else if (count == 6) { printf("\n"); iprintf(" ..."); count = 0; } else printf(","); count++; printf("(off=0x%lx,page=0x%lx)", (u_long) p->pindex, (u_long) VM_PAGE_TO_PHYS(p)); } if (count != 0) printf("\n"); indent -= 2; } #endif /* DDB */ Index: head/sys/vm/vm_page.c =================================================================== --- head/sys/vm/vm_page.c (revision 13489) +++ head/sys/vm/vm_page.c (revision 13490) @@ -1,1135 +1,1122 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $Id: vm_page.c,v 1.44 1995/12/17 07:19:58 bde Exp $ + * $Id: vm_page.c,v 1.45 1996/01/04 21:13:23 wollman Exp $ */ /* * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Resident memory management module. */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB extern void DDB_print_page_info __P((void)); #endif /* * Associated with page of user-allocatable memory is a * page structure. */ static struct pglist *vm_page_buckets; /* Array of buckets */ static int vm_page_bucket_count; /* How big is array? */ static int vm_page_hash_mask; /* Mask for hash function */ struct pglist vm_page_queue_free; struct pglist vm_page_queue_zero; struct pglist vm_page_queue_active; struct pglist vm_page_queue_inactive; struct pglist vm_page_queue_cache; +int no_queue; + +struct { + struct pglist *pl; + int *cnt; +} vm_page_queues[PQ_CACHE+1] = { + {NULL, &no_queue}, + { &vm_page_queue_free, &cnt.v_free_count}, + { &vm_page_queue_zero, &cnt.v_free_count}, + { &vm_page_queue_inactive, &cnt.v_inactive_count}, + { &vm_page_queue_active, &cnt.v_active_count}, + { &vm_page_queue_cache, &cnt.v_cache_count} +}; + vm_page_t vm_page_array; static int vm_page_array_size; long first_page; static long last_page; static vm_size_t page_mask; static int page_shift; int vm_page_zero_count; /* * map of contiguous valid DEV_BSIZE chunks in a page * (this list is valid for page sizes upto 16*DEV_BSIZE) */ static u_short vm_page_dev_bsize_chunks[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff, 0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff }; static inline __pure int vm_page_hash __P((vm_object_t object, vm_pindex_t pindex)) __pure2; static void vm_page_unqueue __P((vm_page_t )); /* * vm_set_page_size: * * Sets the page size, perhaps based upon the memory * size. Must be called before any use of page-size * dependent functions. * * Sets page_shift and page_mask from cnt.v_page_size. 
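The vm_page_queues[] table added above is the heart of this change: the old per-queue flag bits (PG_FREE, PG_ACTIVE, PG_INACTIVE, PG_CACHE) collapse into a single small queue index, so taking a page off whichever queue holds it becomes one table lookup, one TAILQ_REMOVE and one counter decrement. A minimal userland sketch of the same pattern; the demo_* names are illustrative only, not kernel API:

#include <stdio.h>
#include <sys/queue.h>

enum { DQ_NONE, DQ_FREE, DQ_ACTIVE, DQ_NQUEUES };

struct demo_page {
	TAILQ_ENTRY(demo_page) pageq;
	int queue;			/* DQ_* index, like vm_page.queue */
};
TAILQ_HEAD(demo_pglist, demo_page);

static struct demo_pglist freeq = TAILQ_HEAD_INITIALIZER(freeq);
static struct demo_pglist activeq = TAILQ_HEAD_INITIALIZER(activeq);
static int free_cnt, active_cnt, no_cnt;

/* index -> {list head, counter}, cf. vm_page_queues[] above */
static struct {
	struct demo_pglist *pl;
	int *cnt;
} demo_queues[DQ_NQUEUES] = {
	{ NULL, &no_cnt },
	{ &freeq, &free_cnt },
	{ &activeq, &active_cnt },
};

/* one code path unlinks a page from any queue, cf. vm_page_unqueue() */
static void
demo_unqueue(struct demo_page *m)
{
	int queue = m->queue;

	if (queue == DQ_NONE)
		return;
	m->queue = DQ_NONE;
	TAILQ_REMOVE(demo_queues[queue].pl, m, pageq);
	--(*demo_queues[queue].cnt);
}

int
main(void)
{
	struct demo_page p = { .queue = DQ_FREE };

	TAILQ_INSERT_TAIL(&freeq, &p, pageq);
	free_cnt++;
	demo_unqueue(&p);
	printf("free_cnt now %d\n", free_cnt);	/* prints 0 */
	return (0);
}

Note how two queues can share one counter simply by pointing two table slots at it; the real table above does exactly that, aiming both PQ_FREE and PQ_ZERO at &cnt.v_free_count.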
*/ void vm_set_page_size() { if (cnt.v_page_size == 0) cnt.v_page_size = DEFAULT_PAGE_SIZE; page_mask = cnt.v_page_size - 1; if ((page_mask & cnt.v_page_size) != 0) panic("vm_set_page_size: page size not a power of two"); for (page_shift = 0;; page_shift++) if ((1 << page_shift) == cnt.v_page_size) break; } /* * vm_page_startup: * * Initializes the resident memory module. * * Allocates memory for the page cells, and * for the object/offset-to-page hash table headers. * Each page cell is initialized and placed on the free list. */ vm_offset_t vm_page_startup(starta, enda, vaddr) register vm_offset_t starta; vm_offset_t enda; register vm_offset_t vaddr; { register vm_offset_t mapped; register vm_page_t m; register struct pglist *bucket; vm_size_t npages, page_range; register vm_offset_t new_start; int i; vm_offset_t pa; int nblocks; vm_offset_t first_managed_page; /* the biggest memory array is the second group of pages */ vm_offset_t start; vm_offset_t biggestone, biggestsize; vm_offset_t total; total = 0; biggestsize = 0; biggestone = 0; nblocks = 0; vaddr = round_page(vaddr); for (i = 0; phys_avail[i + 1]; i += 2) { phys_avail[i] = round_page(phys_avail[i]); phys_avail[i + 1] = trunc_page(phys_avail[i + 1]); } for (i = 0; phys_avail[i + 1]; i += 2) { int size = phys_avail[i + 1] - phys_avail[i]; if (size > biggestsize) { biggestone = i; biggestsize = size; } ++nblocks; total += size; } start = phys_avail[biggestone]; /* * Initialize the queue headers for the free queue, the active queue * and the inactive queue. */ TAILQ_INIT(&vm_page_queue_free); TAILQ_INIT(&vm_page_queue_zero); TAILQ_INIT(&vm_page_queue_active); TAILQ_INIT(&vm_page_queue_inactive); TAILQ_INIT(&vm_page_queue_cache); /* * Allocate (and initialize) the hash table buckets. * * The number of buckets MUST BE a power of 2, and the actual value is * the next power of 2 greater than the number of physical pages in * the system. * * Note: This computation can be tweaked if desired. */ vm_page_buckets = (struct pglist *) vaddr; bucket = vm_page_buckets; if (vm_page_bucket_count == 0) { - vm_page_bucket_count = 1; + vm_page_bucket_count = 2; while (vm_page_bucket_count < atop(total)) vm_page_bucket_count <<= 1; } vm_page_hash_mask = vm_page_bucket_count - 1; /* * Validate these addresses. */ new_start = start + vm_page_bucket_count * sizeof(struct pglist); new_start = round_page(new_start); mapped = vaddr; vaddr = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; bzero((caddr_t) mapped, vaddr - mapped); mapped = vaddr; for (i = 0; i < vm_page_bucket_count; i++) { TAILQ_INIT(bucket); bucket++; } /* * round (or truncate) the addresses to our page size. */ /* * Pre-allocate maps and map entries that cannot be dynamically * allocated via malloc(). The maps include the kernel_map and * kmem_map which must be initialized before malloc() will work * (obviously). Also could include pager maps which would be * allocated before kmeminit. * * Allow some kernel map entries... this should be plenty since people * shouldn't be cluttering up the kernel map (they should use their * own maps). */ kentry_data_size = MAX_KMAP * sizeof(struct vm_map) + MAX_KMAPENT * sizeof(struct vm_map_entry); kentry_data_size = round_page(kentry_data_size); kentry_data = (vm_offset_t) vaddr; vaddr += kentry_data_size; /* * Validate these zone addresses. 
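vm_page_startup() above rounds vm_page_bucket_count up to a power of two (now starting from 2) so that vm_page_hash_mask = count - 1 turns hash reduction into a single AND instead of a modulo. A standalone check of that equivalence, with an assumed page count:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	unsigned npages = 5000;		/* e.g. atop(total) on a small box */
	unsigned buckets = 2;
	unsigned mask, key;

	while (buckets < npages)
		buckets <<= 1;		/* 8192 for npages == 5000 */
	mask = buckets - 1;		/* 0x1fff: all low bits set */

	/* for power-of-two sizes, AND and modulo agree for any key */
	for (key = 0; key < 100000; key++)
		assert((key & mask) == (key % buckets));
	printf("%u buckets, mask 0x%x\n", buckets, mask);
	return (0);
}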
*/ new_start = start + (vaddr - mapped); pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); bzero((caddr_t) mapped, (vaddr - mapped)); start = round_page(new_start); /* * Compute the number of pages of memory that will be available for * use (taking into account the overhead of a page structure per * page). */ first_page = phys_avail[0] / PAGE_SIZE; last_page = phys_avail[(nblocks - 1) * 2 + 1] / PAGE_SIZE; page_range = last_page - (phys_avail[0] / PAGE_SIZE); npages = (total - (page_range * sizeof(struct vm_page)) - (start - phys_avail[biggestone])) / PAGE_SIZE; /* * Initialize the mem entry structures now, and put them in the free * queue. */ vm_page_array = (vm_page_t) vaddr; mapped = vaddr; /* * Validate these addresses. */ new_start = round_page(start + page_range * sizeof(struct vm_page)); mapped = pmap_map(mapped, start, new_start, VM_PROT_READ | VM_PROT_WRITE); start = new_start; first_managed_page = start / PAGE_SIZE; /* * Clear all of the page structures */ bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page)); vm_page_array_size = page_range; cnt.v_page_count = 0; cnt.v_free_count = 0; for (i = 0; phys_avail[i + 1] && npages > 0; i += 2) { if (i == biggestone) pa = ptoa(first_managed_page); else pa = phys_avail[i]; while (pa < phys_avail[i + 1] && npages-- > 0) { ++cnt.v_page_count; ++cnt.v_free_count; m = PHYS_TO_VM_PAGE(pa); - m->flags = PG_FREE; + m->queue = PQ_FREE; + m->flags = 0; m->phys_addr = pa; TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); pa += PAGE_SIZE; } } return (mapped); } /* * vm_page_hash: * * Distributes the object/offset key pair among hash buckets. * * NOTE: This macro depends on vm_page_bucket_count being a power of 2. */ static inline __pure int vm_page_hash(object, pindex) vm_object_t object; vm_pindex_t pindex; { return ((unsigned) object + pindex) & vm_page_hash_mask; } /* * vm_page_insert: [ internal use only ] * * Inserts the given mem entry into the object/object-page * table and object list. * * The object and page must be locked, and must be splhigh. */ inline void -vm_page_insert(mem, object, pindex) - register vm_page_t mem; +vm_page_insert(m, object, pindex) + register vm_page_t m; register vm_object_t object; register vm_pindex_t pindex; { register struct pglist *bucket; - if (mem->flags & PG_TABLED) + if (m->flags & PG_TABLED) panic("vm_page_insert: already inserted"); /* * Record the object/offset pair in this page */ - mem->object = object; - mem->pindex = pindex; + m->object = object; + m->pindex = pindex; /* * Insert it into the object_object/offset hash table */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; - TAILQ_INSERT_TAIL(bucket, mem, hashq); + TAILQ_INSERT_TAIL(bucket, m, hashq); /* * Now link into the object's list of backed pages. */ - TAILQ_INSERT_TAIL(&object->memq, mem, listq); - mem->flags |= PG_TABLED; + TAILQ_INSERT_TAIL(&object->memq, m, listq); + m->flags |= PG_TABLED; /* * And show that the object has one more resident page. */ object->resident_page_count++; } /* * vm_page_remove: [ internal use only ] * NOTE: used by device pager as well -wfj * * Removes the given mem entry from the object/offset-page * table and the object page list. * * The object and page must be locked, and at splhigh. 
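vm_page_insert() and vm_page_remove() below keep every resident page on two lists at once -- a hash bucket for object/pindex lookup and the owning object's memq for whole-object walks -- which works because struct vm_page carries one TAILQ_ENTRY per list (hashq and listq). A reduced sketch of the pattern, with illustrative types:

#include <stdio.h>
#include <sys/queue.h>

struct node {
	TAILQ_ENTRY(node) hashq;	/* links for the hash bucket */
	TAILQ_ENTRY(node) listq;	/* links for the per-object list */
	int pindex;
};
TAILQ_HEAD(nlist, node);

int
main(void)
{
	struct nlist bucket = TAILQ_HEAD_INITIALIZER(bucket);
	struct nlist memq = TAILQ_HEAD_INITIALIZER(memq);
	struct node n = { .pindex = 7 };

	/* insert on both lists, cf. vm_page_insert() */
	TAILQ_INSERT_TAIL(&bucket, &n, hashq);
	TAILQ_INSERT_TAIL(&memq, &n, listq);

	/* removal from one list leaves the other intact */
	TAILQ_REMOVE(&bucket, &n, hashq);
	printf("memq still has pindex %d\n", memq.tqh_first->pindex);
	return (0);
}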
*/ inline void -vm_page_remove(mem) - register vm_page_t mem; +vm_page_remove(m) + register vm_page_t m; { register struct pglist *bucket; - if (!(mem->flags & PG_TABLED)) + if (!(m->flags & PG_TABLED)) return; /* * Remove from the object_object/offset hash table */ - bucket = &vm_page_buckets[vm_page_hash(mem->object, mem->pindex)]; - TAILQ_REMOVE(bucket, mem, hashq); + bucket = &vm_page_buckets[vm_page_hash(m->object, m->pindex)]; + TAILQ_REMOVE(bucket, m, hashq); /* * Now remove from the object's list of backed pages. */ - TAILQ_REMOVE(&mem->object->memq, mem, listq); + TAILQ_REMOVE(&m->object->memq, m, listq); /* * And show that the object has one fewer resident page. */ - mem->object->resident_page_count--; + m->object->resident_page_count--; - mem->flags &= ~PG_TABLED; + m->flags &= ~PG_TABLED; } /* * vm_page_lookup: * * Returns the page associated with the object/offset * pair specified; if none is found, NULL is returned. * * The object must be locked. No side effects. */ vm_page_t vm_page_lookup(object, pindex) register vm_object_t object; register vm_pindex_t pindex; { - register vm_page_t mem; + register vm_page_t m; register struct pglist *bucket; int s; /* * Search the hash table for this object/offset pair */ bucket = &vm_page_buckets[vm_page_hash(object, pindex)]; s = splhigh(); - for (mem = bucket->tqh_first; mem != NULL; mem = mem->hashq.tqe_next) { - if ((mem->object == object) && (mem->pindex == pindex)) { + for (m = bucket->tqh_first; m != NULL; m = m->hashq.tqe_next) { + if ((m->object == object) && (m->pindex == pindex)) { splx(s); - return (mem); + return (m); } } splx(s); return (NULL); } /* * vm_page_rename: * * Move the given memory entry from its * current object to the specified target object/offset. * * The object must be locked. */ void -vm_page_rename(mem, new_object, new_pindex) - register vm_page_t mem; +vm_page_rename(m, new_object, new_pindex) + register vm_page_t m; register vm_object_t new_object; vm_pindex_t new_pindex; { int s; s = splhigh(); - vm_page_remove(mem); - vm_page_insert(mem, new_object, new_pindex); + vm_page_remove(m); + vm_page_insert(m, new_object, new_pindex); splx(s); } /* * vm_page_unqueue must be called at splhigh(); */ static inline void -vm_page_unqueue(vm_page_t mem) +vm_page_unqueue(vm_page_t m) { - int origflags; - - origflags = mem->flags; - - if ((origflags & (PG_ACTIVE|PG_INACTIVE|PG_CACHE)) == 0) + int queue = m->queue; + if (queue == PQ_NONE) return; - - if (origflags & PG_ACTIVE) { - TAILQ_REMOVE(&vm_page_queue_active, mem, pageq); - cnt.v_active_count--; - mem->flags &= ~PG_ACTIVE; - } else if (origflags & PG_INACTIVE) { - TAILQ_REMOVE(&vm_page_queue_inactive, mem, pageq); - cnt.v_inactive_count--; - mem->flags &= ~PG_INACTIVE; - } else if (origflags & PG_CACHE) { - TAILQ_REMOVE(&vm_page_queue_cache, mem, pageq); - cnt.v_cache_count--; - mem->flags &= ~PG_CACHE; - if (cnt.v_cache_count + cnt.v_free_count < cnt.v_free_reserved) + m->queue = PQ_NONE; + TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); + --(*vm_page_queues[queue].cnt); + if (queue == PQ_CACHE) { + if ((cnt.v_cache_count + cnt.v_free_count) < + (cnt.v_free_min + cnt.v_cache_min)) pagedaemon_wakeup(); } return; } /* * vm_page_alloc: * * Allocate and return a memory cell associated * with this VM object/offset pair. * * page_req classes: * VM_ALLOC_NORMAL normal process request * VM_ALLOC_SYSTEM system *really* needs a page * VM_ALLOC_INTERRUPT interrupt time request - * or in: * VM_ALLOC_ZERO zero page * * Object must be locked. 
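On the allocation classes listed above: VM_ALLOC_ZERO is no longer a bit OR'd into page_req (formerly 0x80) but a class in its own right, so the function below can switch() on page_req directly. What varies per class is only how low free memory may fall before the request must draw from the cache queue or fail outright. A condensed model of that policy (parameter names invented for the demo; the ZERO class's preference for the pre-zeroed queue is omitted here):

#include <stdio.h>

enum alloc_class { ALLOC_NORMAL, ALLOC_INTERRUPT, ALLOC_SYSTEM, ALLOC_ZERO };

/*
 * Which queue may the request draw from, given the free page count
 * (PQ_FREE and PQ_ZERO share one counter) and the cache count?
 * Returns 'f' for the free/zero pool, 'c' for cache, or 0 for
 * "fail and wake the pagedaemon".
 */
static char
pick_queue(enum alloc_class class, int free, int cache,
    int free_reserved, int interrupt_free_min)
{
	switch (class) {
	case ALLOC_INTERRUPT:
		return (free > 0 ? 'f' : 0);	/* never dips into cache */
	case ALLOC_SYSTEM:
		if (free >= free_reserved ||
		    (cache == 0 && free >= interrupt_free_min))
			return ('f');
		break;
	default:				/* NORMAL and ZERO */
		if (free >= free_reserved)
			return ('f');
		break;
	}
	return (cache > 0 ? 'c' : 0);
}

int
main(void)
{
	/* low memory: a normal request must fall back to the cache queue */
	printf("%c\n", pick_queue(ALLOC_NORMAL, 3, 10, 8, 2));	/* 'c' */
	/* a system request with an empty cache may dig into the reserve */
	printf("%c\n", pick_queue(ALLOC_SYSTEM, 3, 0, 8, 2));	/* 'f' */
	return (0);
}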
*/ vm_page_t vm_page_alloc(object, pindex, page_req) vm_object_t object; vm_pindex_t pindex; int page_req; { - register vm_page_t mem; + register vm_page_t m; + int queue; int s; #ifdef DIAGNOSTIC - mem = vm_page_lookup(object, pindex); - if (mem) + m = vm_page_lookup(object, pindex); + if (m) panic("vm_page_alloc: page already allocated"); #endif if ((curproc == pageproc) && (page_req != VM_ALLOC_INTERRUPT)) { page_req = VM_ALLOC_SYSTEM; }; s = splhigh(); - switch ((page_req & ~(VM_ALLOC_ZERO))) { + switch (page_req) { + case VM_ALLOC_NORMAL: if (cnt.v_free_count >= cnt.v_free_reserved) { - if (page_req & VM_ALLOC_ZERO) { - mem = vm_page_queue_zero.tqh_first; - if (mem) { - --vm_page_zero_count; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } else { - mem = vm_page_queue_free.tqh_first; - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } - } else { - mem = vm_page_queue_free.tqh_first; - if (mem) { - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } else { - --vm_page_zero_count; - mem = vm_page_queue_zero.tqh_first; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } + m = vm_page_queue_free.tqh_first; + if (m == NULL) { + --vm_page_zero_count; + m = vm_page_queue_zero.tqh_first; } - cnt.v_free_count--; } else { - mem = vm_page_queue_cache.tqh_first; - if (mem != NULL) { - TAILQ_REMOVE(&vm_page_queue_cache, mem, pageq); - vm_page_remove(mem); - mem->flags = PG_BUSY; - cnt.v_cache_count--; + m = vm_page_queue_cache.tqh_first; + if (m == NULL) { + splx(s); + pagedaemon_wakeup(); + return (NULL); + } + } + break; + + case VM_ALLOC_ZERO: + if (cnt.v_free_count >= cnt.v_free_reserved) { + m = vm_page_queue_zero.tqh_first; + if (m) { + --vm_page_zero_count; } else { + m = vm_page_queue_free.tqh_first; + } + } else { + m = vm_page_queue_cache.tqh_first; + if (m == NULL) { splx(s); pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_SYSTEM: if ((cnt.v_free_count >= cnt.v_free_reserved) || ((cnt.v_cache_count == 0) && (cnt.v_free_count >= cnt.v_interrupt_free_min))) { - if (page_req & VM_ALLOC_ZERO) { - mem = vm_page_queue_zero.tqh_first; - if (mem) { + m = vm_page_queue_free.tqh_first; + if (m == NULL) { --vm_page_zero_count; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } else { - mem = vm_page_queue_free.tqh_first; - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; + m = vm_page_queue_zero.tqh_first; } - } else { - mem = vm_page_queue_free.tqh_first; - if (mem) { - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } else { - --vm_page_zero_count; - mem = vm_page_queue_zero.tqh_first; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; - } - } - cnt.v_free_count--; } else { - mem = vm_page_queue_cache.tqh_first; - if (mem != NULL) { - TAILQ_REMOVE(&vm_page_queue_cache, mem, pageq); - vm_page_remove(mem); - mem->flags = PG_BUSY; - cnt.v_cache_count--; - } else { + m = vm_page_queue_cache.tqh_first; + if (m == NULL) { splx(s); pagedaemon_wakeup(); return (NULL); } } break; case VM_ALLOC_INTERRUPT: if (cnt.v_free_count > 0) { - mem = vm_page_queue_free.tqh_first; - if (mem) { - TAILQ_REMOVE(&vm_page_queue_free, mem, pageq); - mem->flags = PG_BUSY; - } else { + m = vm_page_queue_free.tqh_first; + if (m == NULL) { --vm_page_zero_count; - mem = vm_page_queue_zero.tqh_first; - TAILQ_REMOVE(&vm_page_queue_zero, mem, pageq); - mem->flags = PG_BUSY|PG_ZERO; + m = 
vm_page_queue_zero.tqh_first; } - cnt.v_free_count--; } else { splx(s); pagedaemon_wakeup(); - return NULL; + return (NULL); } break; default: panic("vm_page_alloc: invalid allocation class"); } - mem->wire_count = 0; - mem->hold_count = 0; - mem->act_count = 0; - mem->busy = 0; - mem->valid = 0; - mem->dirty = 0; - mem->bmapped = 0; + queue = m->queue; + TAILQ_REMOVE(vm_page_queues[queue].pl, m, pageq); + --(*vm_page_queues[queue].cnt); + if (queue == PQ_ZERO) { + m->flags = PG_ZERO|PG_BUSY; + } else if (queue == PQ_CACHE) { + vm_page_remove(m); + m->flags = PG_BUSY; + } else { + m->flags = PG_BUSY; + } + m->wire_count = 0; + m->hold_count = 0; + m->act_count = 0; + m->busy = 0; + m->valid = 0; + m->dirty = 0; + m->queue = PQ_NONE; /* XXX before splx until vm_page_insert is safe */ - vm_page_insert(mem, object, pindex); + vm_page_insert(m, object, pindex); splx(s); /* * Don't wakeup too often - wakeup the pageout daemon when * we would be nearly out of memory. */ - if (((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min) || - (cnt.v_free_count < cnt.v_pageout_free_min)) + if (((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min)) || + (cnt.v_free_count < cnt.v_pageout_free_min)) pagedaemon_wakeup(); - return (mem); + return (m); } vm_offset_t vm_page_alloc_contig(size, low, high, alignment) vm_offset_t size; vm_offset_t low; vm_offset_t high; vm_offset_t alignment; { int i, s, start; vm_offset_t addr, phys, tmp_addr; vm_page_t pga = vm_page_array; if ((alignment & (alignment - 1)) != 0) panic("vm_page_alloc_contig: alignment must be a power of 2"); start = 0; s = splhigh(); again: /* * Find first page in array that is free, within range, and aligned. */ for (i = start; i < cnt.v_page_count; i++) { phys = VM_PAGE_TO_PHYS(&pga[i]); - if (((pga[i].flags & PG_FREE) == PG_FREE) && + if ((pga[i].queue == PQ_FREE) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0)) break; } /* * If the above failed or we will exceed the upper bound, fail. */ - if ((i == cnt.v_page_count) || ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { + if ((i == cnt.v_page_count) || + ((VM_PAGE_TO_PHYS(&pga[i]) + size) > high)) { splx(s); return (NULL); } start = i; /* * Check successive pages for contiguous and free. */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { if ((VM_PAGE_TO_PHYS(&pga[i]) != - (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || - ((pga[i].flags & PG_FREE) != PG_FREE)) { + (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || + (pga[i].queue != PQ_FREE)) { start++; goto again; } } /* * We've found a contiguous chunk that meets our requirements. * Allocate kernel VM, unfree and assign the physical pages to it and * return kernel VM pointer. */ tmp_addr = addr = kmem_alloc_pageable(kernel_map, size); for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; TAILQ_REMOVE(&vm_page_queue_free, m, pageq); cnt.v_free_count--; m->valid = VM_PAGE_BITS_ALL; m->flags = 0; m->dirty = 0; m->wire_count = 0; m->act_count = 0; - m->bmapped = 0; m->busy = 0; + m->queue = PQ_NONE; vm_page_insert(m, kernel_object, OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS)); vm_page_wire(m); pmap_kenter(tmp_addr, VM_PAGE_TO_PHYS(m)); tmp_addr += PAGE_SIZE; } splx(s); return (addr); } /* * vm_page_free: * * Returns the given page to the free list, * disassociating it from any VM object. * * Object and page must be locked prior to entry.
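vm_page_alloc_contig() above searches vm_page_array by brute force: find an aligned free page with a physical address in [low, high), then verify each successor is free and physically adjacent, restarting the search past the failure point otherwise. The same scan in miniature over a boolean free map (sizes and names invented for the demo; the kernel must also compare VM_PAGE_TO_PHYS() values, since the array may span holes in physical memory):

#include <stdio.h>

#define NPAGES 16

/*
 * 1 = free; find `run` consecutive free pages whose start is aligned
 * to `align` pages (align must be a power of two, cf. the panic above).
 */
static int
find_contig(const int *free_map, int run, int align)
{
	int i, j;

	for (i = 0; i + run <= NPAGES; i++) {
		if (!free_map[i] || (i & (align - 1)) != 0)
			continue;
		for (j = 1; j < run; j++)
			if (!free_map[i + j])
				break;
		if (j == run)
			return (i);	/* start of the run */
	}
	return (-1);
}

int
main(void)
{
	int free_map[NPAGES] = {
		1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
	};

	/* four contiguous pages, start aligned to a 4-page boundary */
	printf("run starts at page %d\n", find_contig(free_map, 4, 4));
	return (0);
}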
*/ void -vm_page_free(mem) - register vm_page_t mem; +vm_page_free(m) + register vm_page_t m; { int s; - int flags; + int flags = m->flags; s = splhigh(); - vm_page_remove(mem); - vm_page_unqueue(mem); - - flags = mem->flags; - if (mem->bmapped || mem->busy || flags & (PG_BUSY|PG_FREE)) { - if (flags & PG_FREE) + if (m->busy || (flags & PG_BUSY) || (m->queue == PQ_FREE)) { + printf("vm_page_free: pindex(%ld), busy(%d), PG_BUSY(%d)\n", + m->pindex, m->busy, (flags & PG_BUSY) ? 1 : 0); + if (m->queue == PQ_FREE) panic("vm_page_free: freeing free page"); - printf("vm_page_free: pindex(%ld), bmapped(%d), busy(%d), PG_BUSY(%d)\n", - mem->pindex, mem->bmapped, mem->busy, (flags & PG_BUSY) ? 1 : 0); - panic("vm_page_free: freeing busy page"); + else + panic("vm_page_free: freeing busy page"); } + vm_page_remove(m); + vm_page_unqueue(m); + +/* if ((flags & PG_WANTED) != 0) - wakeup(mem); + wakeup(m); +*/ if ((flags & PG_FICTITIOUS) == 0) { - if (mem->wire_count) { - if (mem->wire_count > 1) { - printf("vm_page_free: wire count > 1 (%d)", mem->wire_count); + if (m->wire_count) { + if (m->wire_count > 1) { + printf("vm_page_free: wire count > 1 (%d)", m->wire_count); panic("vm_page_free: invalid wire count"); } cnt.v_wire_count--; - mem->wire_count = 0; + m->wire_count = 0; } - mem->flags |= PG_FREE; - TAILQ_INSERT_TAIL(&vm_page_queue_free, mem, pageq); + m->queue = PQ_FREE; + TAILQ_INSERT_TAIL(&vm_page_queue_free, m, pageq); splx(s); /* * if pageout daemon needs pages, then tell it that there are * some free. */ if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } cnt.v_free_count++; /* * wakeup processes that are waiting on memory if we hit a * high water mark. And wakeup scheduler process if we have * lots of memory. this process will swapin processes. */ if ((cnt.v_free_count + cnt.v_cache_count) == cnt.v_free_min) { wakeup(&cnt.v_free_count); wakeup(&proc0); } } else { splx(s); } cnt.v_tfree++; } /* * vm_page_wire: * * Mark this page as wired down by yet * another map, removing it from paging queues * as necessary. * * The page queues must be locked. */ void -vm_page_wire(mem) - register vm_page_t mem; +vm_page_wire(m) + register vm_page_t m; { int s; - if (mem->wire_count == 0) { + if (m->wire_count == 0) { s = splhigh(); - vm_page_unqueue(mem); + vm_page_unqueue(m); splx(s); cnt.v_wire_count++; } - mem->flags |= PG_WRITEABLE|PG_MAPPED; - mem->wire_count++; + m->wire_count++; + m->flags |= PG_MAPPED; } /* * vm_page_unwire: * * Release one wiring of this page, potentially * enabling it to be paged again. * * The page queues must be locked. */ void -vm_page_unwire(mem) - register vm_page_t mem; +vm_page_unwire(m) + register vm_page_t m; { int s; s = splhigh(); - if (mem->wire_count) - mem->wire_count--; - if (mem->wire_count == 0) { - TAILQ_INSERT_TAIL(&vm_page_queue_active, mem, pageq); - cnt.v_active_count++; - mem->flags |= PG_ACTIVE; + if (m->wire_count > 0) + m->wire_count--; + + if (m->wire_count == 0) { cnt.v_wire_count--; + TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); + m->queue = PQ_ACTIVE; + if( m->act_count < ACT_MAX) + m->act_count += 1; + cnt.v_active_count++; } splx(s); } /* * vm_page_activate: * * Put the specified page on the active list (if appropriate). * * The page queues must be locked. 
*/ void vm_page_activate(m) register vm_page_t m; { int s; s = splhigh(); - if (m->flags & PG_ACTIVE) + if (m->queue == PQ_ACTIVE) panic("vm_page_activate: already active"); - if (m->flags & PG_CACHE) + if (m->queue == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); if (m->wire_count == 0) { TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); - m->flags |= PG_ACTIVE; + m->queue = PQ_ACTIVE; if (m->act_count < 5) m->act_count = 5; else if( m->act_count < ACT_MAX) m->act_count += 1; cnt.v_active_count++; } splx(s); } /* * vm_page_deactivate: * * Returns the given page to the inactive list, * indicating that no physical maps have access * to this page. [Used by the physical mapping system.] * * The page queues must be locked. */ void vm_page_deactivate(m) register vm_page_t m; { int spl; /* * Only move active pages -- ignore locked or already inactive ones. * * XXX: sometimes we get pages which aren't wired down or on any queue - * we need to put them on the inactive queue also, otherwise we lose * track of them. Paul Mackerras (paulus@cs.anu.edu.au) 9-Jan-93. */ + if (m->queue == PQ_INACTIVE) + return; spl = splhigh(); - if (!(m->flags & PG_INACTIVE) && m->wire_count == 0 && - m->hold_count == 0) { - if (m->flags & PG_CACHE) + if (m->wire_count == 0 && m->hold_count == 0) { + if (m->queue == PQ_CACHE) cnt.v_reactivated++; vm_page_unqueue(m); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); - m->flags |= PG_INACTIVE; + m->queue = PQ_INACTIVE; cnt.v_inactive_count++; m->act_count = 0; } splx(spl); } /* * vm_page_cache * * Put the specified page onto the page cache queue (if appropriate). */ void vm_page_cache(m) register vm_page_t m; { int s; - if ((m->flags & (PG_CACHE | PG_BUSY)) || m->busy || m->wire_count || - m->bmapped) + if ((m->flags & PG_BUSY) || m->busy || m->wire_count) return; + if (m->queue == PQ_CACHE) + return; + vm_page_protect(m, VM_PROT_NONE); s = splhigh(); vm_page_unqueue(m); - vm_page_protect(m, VM_PROT_NONE); - TAILQ_INSERT_TAIL(&vm_page_queue_cache, m, pageq); - m->flags |= PG_CACHE; + m->queue = PQ_CACHE; cnt.v_cache_count++; if ((cnt.v_free_count + cnt.v_cache_count) == cnt.v_free_min) { wakeup(&cnt.v_free_count); wakeup(&proc0); } if (vm_pageout_pages_needed) { wakeup(&vm_pageout_pages_needed); vm_pageout_pages_needed = 0; } - splx(s); } /* * vm_page_zero_fill: * * Zero-fill the specified page. * Written as a standard pagein routine, to * be used by the zero-fill object. 
*/ boolean_t vm_page_zero_fill(m) vm_page_t m; { pmap_zero_page(VM_PAGE_TO_PHYS(m)); return (TRUE); } /* * vm_page_copy: * * Copy one page to another */ void vm_page_copy(src_m, dest_m) vm_page_t src_m; vm_page_t dest_m; { pmap_copy_page(VM_PAGE_TO_PHYS(src_m), VM_PAGE_TO_PHYS(dest_m)); dest_m->valid = VM_PAGE_BITS_ALL; } /* * mapping function for valid bits or for dirty bits in * a page */ inline int vm_page_bits(int base, int size) { u_short chunk; if ((base == 0) && (size >= PAGE_SIZE)) return VM_PAGE_BITS_ALL; size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); base = (base % PAGE_SIZE) / DEV_BSIZE; chunk = vm_page_dev_bsize_chunks[size / DEV_BSIZE]; return (chunk << base) & VM_PAGE_BITS_ALL; } /* * set a page valid and clean */ void vm_page_set_validclean(m, base, size) vm_page_t m; int base; int size; { int pagebits = vm_page_bits(base, size); m->valid |= pagebits; m->dirty &= ~pagebits; if( base == 0 && size == PAGE_SIZE) pmap_clear_modify(VM_PAGE_TO_PHYS(m)); } /* * set a page (partially) invalid */ void vm_page_set_invalid(m, base, size) vm_page_t m; int base; int size; { int bits; m->valid &= ~(bits = vm_page_bits(base, size)); if (m->valid == 0) m->dirty &= ~bits; } /* * is (partial) page valid? */ int vm_page_is_valid(m, base, size) vm_page_t m; int base; int size; { int bits = vm_page_bits(base, size); if (m->valid && ((m->valid & bits) == bits)) return 1; else return 0; } void vm_page_test_dirty(m) vm_page_t m; { if ((m->dirty != VM_PAGE_BITS_ALL) && pmap_is_modified(VM_PAGE_TO_PHYS(m))) { m->dirty = VM_PAGE_BITS_ALL; } } #ifdef DDB void DDB_print_page_info(void) { printf("cnt.v_free_count: %d\n", cnt.v_free_count); printf("cnt.v_cache_count: %d\n", cnt.v_cache_count); printf("cnt.v_inactive_count: %d\n", cnt.v_inactive_count); printf("cnt.v_active_count: %d\n", cnt.v_active_count); printf("cnt.v_wire_count: %d\n", cnt.v_wire_count); printf("cnt.v_free_reserved: %d\n", cnt.v_free_reserved); printf("cnt.v_free_min: %d\n", cnt.v_free_min); printf("cnt.v_free_target: %d\n", cnt.v_free_target); printf("cnt.v_cache_min: %d\n", cnt.v_cache_min); printf("cnt.v_inactive_target: %d\n", cnt.v_inactive_target); } #endif Index: head/sys/vm/vm_page.h =================================================================== --- head/sys/vm/vm_page.h (revision 13489) +++ head/sys/vm/vm_page.h (revision 13490) @@ -1,298 +1,303 @@ /* * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_page.h 8.2 (Berkeley) 12/13/93 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_page.h,v 1.22 1995/11/20 12:19:32 phk Exp $ + * $Id: vm_page.h,v 1.23 1995/12/11 04:58:26 dyson Exp $ */ /* * Resident memory system definitions. */ #ifndef _VM_PAGE_ #define _VM_PAGE_ #include /* * Management of resident (logical) pages. * * A small structure is kept for each resident * page, indexed by page number. Each structure * is an element of several lists: * * A hash table bucket used to quickly * perform object/offset lookups * * A list of all pages for a given object, * so they can be quickly deactivated at * time of deallocation. * * An ordered list of pages due for pageout. * * In addition, the structure contains the object * and offset to which this page belongs (for pageout), * and sundry status bits. * * Fields in this structure are locked either by the lock on the * object that the page belongs to (O) or by the lock on the page * queues (P). 
*/ TAILQ_HEAD(pglist, vm_page); struct vm_page { TAILQ_ENTRY(vm_page) pageq; /* queue info for FIFO queue or free list (P) */ TAILQ_ENTRY(vm_page) hashq; /* hash table links (O) */ TAILQ_ENTRY(vm_page) listq; /* pages in same object (O) */ vm_object_t object; /* which object am I in (O,P) */ vm_pindex_t pindex; /* offset into object (O,P) */ vm_offset_t phys_addr; /* physical address of page */ - + u_short queue:4, /* page queue index */ + flags:12; /* see below */ u_short wire_count; /* wired down maps refs (P) */ - u_short flags; /* see below */ short hold_count; /* page hold count */ - u_short act_count; /* page usage count */ - u_short bmapped; /* number of buffers mapped */ - u_short busy; /* page busy count */ - u_short valid; /* map of valid DEV_BSIZE chunks */ - u_short dirty; /* map of dirty DEV_BSIZE chunks */ + u_char act_count; /* page usage count */ + u_char busy; /* page busy count */ + /* NOTE that these must support one bit per DEV_BSIZE in a page!!! */ + /* so, on normal X86 kernels, they must be at least 8 bits wide */ + u_char valid; /* map of valid DEV_BSIZE chunks */ + u_char dirty; /* map of dirty DEV_BSIZE chunks */ }; +#define PQ_NONE 0 +#define PQ_FREE 1 +#define PQ_ZERO 2 +#define PQ_INACTIVE 3 +#define PQ_ACTIVE 4 +#define PQ_CACHE 5 + /* * These are the flags defined for vm_page. * * Note: PG_FILLED and PG_DIRTY are added for the filesystems. */ -#define PG_INACTIVE 0x0001 /* page is in inactive list (P) */ -#define PG_ACTIVE 0x0002 /* page is in active list (P) */ -#define PG_BUSY 0x0010 /* page is in transit (O) */ -#define PG_WANTED 0x0020 /* someone is waiting for page (O) */ -#define PG_TABLED 0x0040 /* page is in VP table (O) */ -#define PG_FICTITIOUS 0x0100 /* physical page doesn't exist (O) */ -#define PG_WRITEABLE 0x0200 /* page is mapped writeable */ -#define PG_MAPPED 0x0400 /* page is mapped */ -#define PG_ZERO 0x0800 /* page is zeroed */ -#define PG_REFERENCED 0x1000 /* page has been referenced */ -#define PG_CACHE 0x4000 /* On VMIO cache */ -#define PG_FREE 0x8000 /* page is in free list */ +#define PG_BUSY 0x01 /* page is in transit (O) */ +#define PG_WANTED 0x02 /* someone is waiting for page (O) */ +#define PG_TABLED 0x04 /* page is in VP table (O) */ +#define PG_FICTITIOUS 0x08 /* physical page doesn't exist (O) */ +#define PG_WRITEABLE 0x10 /* page is mapped writeable */ +#define PG_MAPPED 0x20 /* page is mapped */ +#define PG_ZERO 0x40 /* page is zeroed */ +#define PG_REFERENCED 0x80 /* page has been referenced */ +#define PG_CLEANCHK 0x100 /* page has been checked for cleaning */ /* * Misc constants. */ #define ACT_DECLINE 1 #define ACT_ADVANCE 3 #define ACT_MAX 100 #define PFCLUSTER_BEHIND 3 #define PFCLUSTER_AHEAD 3 #ifdef KERNEL /* * Each pageable resident page falls into one of five lists: * * free * Available for allocation now. * * The following are all LRU sorted: * * cache * Almost available for allocation. Still in an * object, but clean and immediately freeable at * non-interrupt times. * * inactive * Low activity, candidates for reclamation. * This is the list of pages that should be * paged out next. * * active * Pages that are "active" i.e. they have been * recently referenced.
* * zero * Pages that are really free and have been pre-zeroed * */ extern struct pglist vm_page_queue_free; /* memory free queue */ extern struct pglist vm_page_queue_zero; /* zeroed memory free queue */ extern struct pglist vm_page_queue_active; /* active memory queue */ extern struct pglist vm_page_queue_inactive; /* inactive memory queue */ extern struct pglist vm_page_queue_cache; /* cache memory queue */ extern int vm_page_zero_count; extern vm_page_t vm_page_array; /* First resident page in table */ extern long first_page; /* first physical page number */ /* ... represented in vm_page_array */ extern long last_page; /* last physical page number */ /* ... represented in vm_page_array */ /* [INCLUSIVE] */ extern vm_offset_t first_phys_addr; /* physical address for first_page */ extern vm_offset_t last_phys_addr; /* physical address for last_page */ #define VM_PAGE_TO_PHYS(entry) ((entry)->phys_addr) #define IS_VM_PHYSADDR(pa) \ ((pa) >= first_phys_addr && (pa) <= last_phys_addr) #define PHYS_TO_VM_PAGE(pa) \ (&vm_page_array[atop(pa) - first_page ]) /* * Functions implemented as macros */ #define PAGE_ASSERT_WAIT(m, interruptible) { \ (m)->flags |= PG_WANTED; \ assert_wait((int) (m), (interruptible)); \ } #define PAGE_WAKEUP(m) { \ (m)->flags &= ~PG_BUSY; \ if ((m)->flags & PG_WANTED) { \ (m)->flags &= ~PG_WANTED; \ wakeup((caddr_t) (m)); \ } \ } #if PAGE_SIZE == 4096 #define VM_PAGE_BITS_ALL 0xff #endif #if PAGE_SIZE == 8192 #define VM_PAGE_BITS_ALL 0xffff #endif #define VM_ALLOC_NORMAL 0 #define VM_ALLOC_INTERRUPT 1 #define VM_ALLOC_SYSTEM 2 -#define VM_ALLOC_ZERO 0x80 +#define VM_ALLOC_ZERO 3 void vm_page_activate __P((vm_page_t)); vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int)); void vm_page_cache __P((register vm_page_t)); void vm_page_copy __P((vm_page_t, vm_page_t)); void vm_page_deactivate __P((vm_page_t)); void vm_page_free __P((vm_page_t)); void vm_page_insert __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_page_t vm_page_lookup __P((vm_object_t, vm_pindex_t)); void vm_page_remove __P((vm_page_t)); void vm_page_rename __P((vm_page_t, vm_object_t, vm_pindex_t)); vm_offset_t vm_page_startup __P((vm_offset_t, vm_offset_t, vm_offset_t)); void vm_page_unwire __P((vm_page_t)); void vm_page_wire __P((vm_page_t)); boolean_t vm_page_zero_fill __P((vm_page_t)); void vm_page_set_validclean __P((vm_page_t, int, int)); void vm_page_set_invalid __P((vm_page_t, int, int)); int vm_page_is_valid __P((vm_page_t, int, int)); void vm_page_test_dirty __P((vm_page_t)); int vm_page_bits __P((int, int)); /* * Keep page from being freed by the page daemon * much of the same effect as wiring, except much lower * overhead and should be used only for *very* temporary * holding ("wiring"). 
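On the widths chosen in the struct above: valid and dirty shrink to u_char because, as the new comment says, they need one bit per DEV_BSIZE chunk, and a 4096-byte page holds 4096/512 = 8 chunks -- hence VM_PAGE_BITS_ALL == 0xff for PAGE_SIZE 4096. vm_page_bits() in vm_page.c builds the chunk masks from the vm_page_dev_bsize_chunks[] lookup table; a standalone rendering of that arithmetic with the i386 constants:

#include <stdio.h>

#define DEV_BSIZE 512
#define PAGE_SIZE 4096
#define VM_PAGE_BITS_ALL 0xff		/* 8 chunks per 4K page */

static unsigned short chunks[] = {	/* n low bits set, n = 0..16 */
	0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff,
	0x1ff, 0x3ff, 0x7ff, 0xfff, 0x1fff, 0x3fff, 0x7fff, 0xffff
};

/* same computation as vm_page_bits() */
static int
page_bits(int base, int size)
{
	if (base == 0 && size >= PAGE_SIZE)
		return (VM_PAGE_BITS_ALL);
	size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);	/* round up */
	base = (base % PAGE_SIZE) / DEV_BSIZE;			/* chunk # */
	return ((chunks[size / DEV_BSIZE] << base) & VM_PAGE_BITS_ALL);
}

int
main(void)
{
	printf("0x%x\n", page_bits(1024, 2048));	/* 0x3c: chunks 2-5 */
	printf("0x%x\n", page_bits(0, 4096));		/* 0xff: whole page */
	return (0);
}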
*/ static __inline void vm_page_hold(vm_page_t mem) { mem->hold_count++; } #ifdef DIAGNOSTIC #include /* make GCC shut up */ #endif static __inline void vm_page_unhold(vm_page_t mem) { #ifdef DIAGNOSTIC if (--mem->hold_count < 0) panic("vm_page_unhold: hold count < 0!!!"); #else --mem->hold_count; #endif } static __inline void vm_page_protect(vm_page_t mem, int prot) { if (prot == VM_PROT_NONE) { if (mem->flags & (PG_WRITEABLE|PG_MAPPED)) { pmap_page_protect(VM_PAGE_TO_PHYS(mem), prot); mem->flags &= ~(PG_WRITEABLE|PG_MAPPED); } } else if ((prot == VM_PROT_READ) && (mem->flags & PG_WRITEABLE)) { pmap_page_protect(VM_PAGE_TO_PHYS(mem), prot); mem->flags &= ~PG_WRITEABLE; } } #endif /* KERNEL */ #endif /* !_VM_PAGE_ */ Index: head/sys/vm/vm_pageout.c =================================================================== --- head/sys/vm/vm_pageout.c (revision 13489) +++ head/sys/vm/vm_pageout.c (revision 13490) @@ -1,989 +1,984 @@ /* * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. 
* * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. * - * $Id: vm_pageout.c,v 1.62 1995/12/11 04:58:28 dyson Exp $ + * $Id: vm_pageout.c,v 1.63 1995/12/14 09:55:09 phk Exp $ */ /* * The proverbial page-out daemon. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout __P((void)); static int vm_pageout_clean __P((vm_page_t, int)); static int vm_pageout_scan __P((void)); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT_KT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) /* the kernel process "vm_daemon"*/ static void vm_daemon __P((void)); static struct proc *vmproc; static struct kproc_desc vm_kp = { "vmdaemon", vm_daemon, &vmproc }; SYSINIT_KT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) int vm_pages_needed; /* Event on which pageout daemon sleeps */ int vm_pageout_pages_needed; /* flag saying that the pageout daemon needs pages */ extern int npendingio; static int vm_pageout_req_swapout; /* XXX */ static int vm_daemon_needed; extern int nswiodone; extern int vm_swap_size; extern int vfs_update_wakeup; #define MAXSCAN 1024 /* maximum number of pages to scan in queues */ #define MAXLAUNDER (cnt.v_page_count > 1800 ? 32 : 16) #define VM_PAGEOUT_PAGE_COUNT 16 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; int vm_page_max_wired; /* XXX max # of wired pages system-wide */ typedef int freeer_fcn_t __P((vm_map_t, vm_object_t, int, int)); static void vm_pageout_map_deactivate_pages __P((vm_map_t, vm_map_entry_t, int *, freeer_fcn_t *)); static freeer_fcn_t vm_pageout_object_deactivate_pages; static void vm_req_vmdaemon __P((void)); /* * vm_pageout_clean: * * Clean the page and remove it from the laundry. * * We set the busy bit to cause potential page faults on this page to * block. * * And we set pageout-in-progress to keep the object from disappearing * during pageout. This guarantees that the page won't move from the * inactive queue. (However, any other page on the inactive queue may * move!) */ static int vm_pageout_clean(m, sync) vm_page_t m; int sync; { register vm_object_t object; vm_page_t mc[2*VM_PAGEOUT_PAGE_COUNT]; int pageout_count; int i, forward_okay, backward_okay, page_base; vm_pindex_t pindex = m->pindex; object = m->object; /* * If not OBJT_SWAP, additional memory may be needed to do the pageout. * Try to avoid the deadlock. */ if ((sync != VM_PAGEOUT_FORCE) && (object->type != OBJT_SWAP) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_pageout_free_min)) return 0; /* * Don't mess with the page if it's busy. */ if ((!sync && m->hold_count != 0) || ((m->busy != 0) || (m->flags & PG_BUSY))) return 0; /* * Try collapsing before it's too late. 
*/ if (!sync && object->backing_object) { vm_object_collapse(object); } mc[VM_PAGEOUT_PAGE_COUNT] = m; pageout_count = 1; page_base = VM_PAGEOUT_PAGE_COUNT; forward_okay = TRUE; if (pindex != 0) backward_okay = TRUE; else backward_okay = FALSE; /* * Scan object for clusterable pages. * * We can cluster ONLY if: ->> the page is NOT * clean, wired, busy, held, or mapped into a * buffer, and one of the following: * 1) The page is inactive, or a seldom used * active page. * -or- * 2) we force the issue. */ for (i = 1; (i < vm_pageout_page_count) && (forward_okay || backward_okay); i++) { vm_page_t p; /* * See if forward page is clusterable. */ if (forward_okay) { /* * Stop forward scan at end of object. */ if ((pindex + i) > object->size) { forward_okay = FALSE; goto do_backward; } p = vm_page_lookup(object, pindex + i); if (p) { - if ((p->flags & (PG_BUSY|PG_CACHE)) || p->busy) { + if ((p->queue == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { forward_okay = FALSE; goto do_backward; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && - ((p->flags & PG_INACTIVE) || + ((p->queue == PQ_INACTIVE) || (sync == VM_PAGEOUT_FORCE)) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[VM_PAGEOUT_PAGE_COUNT + i] = p; pageout_count++; if (pageout_count == vm_pageout_page_count) break; } else { forward_okay = FALSE; } } else { forward_okay = FALSE; } } do_backward: /* * See if backward page is clusterable. */ if (backward_okay) { /* * Stop backward scan at beginning of object. */ if ((pindex - i) == 0) { backward_okay = FALSE; } p = vm_page_lookup(object, pindex - i); if (p) { - if ((p->flags & (PG_BUSY|PG_CACHE)) || p->busy) { + if ((p->queue == PQ_CACHE) || (p->flags & PG_BUSY) || p->busy) { backward_okay = FALSE; continue; } vm_page_test_dirty(p); if ((p->dirty & p->valid) != 0 && - ((p->flags & PG_INACTIVE) || + ((p->queue == PQ_INACTIVE) || (sync == VM_PAGEOUT_FORCE)) && (p->wire_count == 0) && (p->hold_count == 0)) { mc[VM_PAGEOUT_PAGE_COUNT - i] = p; pageout_count++; page_base--; if (pageout_count == vm_pageout_page_count) break; } else { backward_okay = FALSE; } } else { backward_okay = FALSE; } } } /* * we allow reads during pageouts... */ for (i = page_base; i < (page_base + pageout_count); i++) { mc[i]->flags |= PG_BUSY; vm_page_protect(mc[i], VM_PROT_READ); } return vm_pageout_flush(&mc[page_base], pageout_count, sync); } int vm_pageout_flush(mc, count, sync) vm_page_t *mc; int count; int sync; { register vm_object_t object; int pageout_status[count]; int anyok = 0; int i; object = mc[0]->object; object->paging_in_progress += count; vm_pager_put_pages(object, mc, count, ((sync || (object == kernel_object)) ? TRUE : FALSE), pageout_status); for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; switch (pageout_status[i]) { case VM_PAGER_OK: ++anyok; break; case VM_PAGER_PEND: ++anyok; break; case VM_PAGER_BAD: /* * Page outside of range of object. Right now we * essentially lose the changes by pretending it * worked. */ pmap_clear_modify(VM_PAGE_TO_PHYS(mt)); mt->dirty = 0; break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If page couldn't be paged out, then reactivate the * page so it doesn't clog the inactive list. (We * will try paging out it again later). */ - if (mt->flags & PG_INACTIVE) + if (mt->queue == PQ_INACTIVE) vm_page_activate(mt); break; case VM_PAGER_AGAIN: break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. 
*/ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); - if ((mt->flags & (PG_REFERENCED|PG_WANTED)) || - pmap_is_referenced(VM_PAGE_TO_PHYS(mt))) { - pmap_clear_reference(VM_PAGE_TO_PHYS(mt)); - mt->flags &= ~PG_REFERENCED; - if (mt->flags & PG_INACTIVE) - vm_page_activate(mt); - } PAGE_WAKEUP(mt); } } return anyok; } /* * vm_pageout_object_deactivate_pages * * deactivate enough pages to satisfy the inactive target * requirements or if vm_page_proc_limit is set, then * deactivate all of the pages in the object and its * backing_objects. * * The object and map must be locked. */ static int vm_pageout_object_deactivate_pages(map, object, count, map_remove_only) vm_map_t map; vm_object_t object; int count; int map_remove_only; { register vm_page_t p, next; int rcount; int dcount; dcount = 0; if (count == 0) count = 1; if (object->type == OBJT_DEVICE) return 0; if (object->backing_object) { if (object->backing_object->ref_count == 1) dcount += vm_pageout_object_deactivate_pages(map, object->backing_object, count / 2 + 1, map_remove_only); else vm_pageout_object_deactivate_pages(map, object->backing_object, count, 1); } if (object->paging_in_progress) return dcount; /* * scan the objects entire memory queue */ rcount = object->resident_page_count; p = object->memq.tqh_first; while (p && (rcount-- > 0)) { next = p->listq.tqe_next; cnt.v_pdpages++; if (p->wire_count != 0 || p->hold_count != 0 || p->busy != 0 || + (p->flags & PG_BUSY) || !pmap_page_exists(vm_map_pmap(map), VM_PAGE_TO_PHYS(p))) { p = next; continue; } /* * if a page is active, not wired and is in the processes * pmap, then deactivate the page. */ - if ((p->flags & (PG_ACTIVE | PG_BUSY)) == PG_ACTIVE) { + if (p->queue == PQ_ACTIVE) { if (!pmap_is_referenced(VM_PAGE_TO_PHYS(p)) && - (p->flags & (PG_REFERENCED|PG_WANTED)) == 0) { + (p->flags & PG_REFERENCED) == 0) { p->act_count -= min(p->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we * deactivate */ if (!p->act_count) { if (!map_remove_only) vm_page_deactivate(p); vm_page_protect(p, VM_PROT_NONE); /* * else if on the next go-around we * will deactivate the page we need to * place the page on the end of the * queue to age the other pages in * memory. */ } else { TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); } /* * see if we are done yet */ - if (p->flags & PG_INACTIVE) { + if (p->queue == PQ_INACTIVE) { --count; ++dcount; if (count <= 0 && cnt.v_inactive_count > cnt.v_inactive_target) { return dcount; } } } else { /* * Move the page to the bottom of the queue. */ pmap_clear_reference(VM_PAGE_TO_PHYS(p)); p->flags &= ~PG_REFERENCED; if (p->act_count < ACT_MAX) p->act_count += ACT_ADVANCE; TAILQ_REMOVE(&vm_page_queue_active, p, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, p, pageq); } - } else if ((p->flags & (PG_INACTIVE | PG_BUSY)) == PG_INACTIVE) { + } else if (p->queue == PQ_INACTIVE) { vm_page_protect(p, VM_PROT_NONE); } p = next; } return dcount; } - /* * deactivate some number of pages in a map, try to do it fairly, but * that is really hard to do. 
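The act_count handling above is the page-aging policy used throughout the scanner: a referenced page gains ACT_ADVANCE (capped at ACT_MAX) and is requeued at the tail, while an unreferenced one loses ACT_DECLINE and is deactivated only once the count reaches zero. Gaining 3 per reference but losing 1 per idle scan keeps recently useful pages resident across short lulls. One scan step, modeled in isolation:

#include <stdio.h>

#define ACT_DECLINE 1
#define ACT_ADVANCE 3
#define ACT_MAX 100

/* one scanner visit: returns 1 when the page should be deactivated */
static int
age_page(int *act_count, int referenced)
{
	if (referenced) {
		if (*act_count < ACT_MAX)
			*act_count += ACT_ADVANCE;
		return (0);
	}
	if (*act_count > ACT_DECLINE)
		*act_count -= ACT_DECLINE;	/* cf. min(act_count, ...) */
	else
		*act_count = 0;
	return (*act_count == 0);
}

int
main(void)
{
	int act = 5, visits = 0;

	/* an untouched page with act_count 5 survives four more visits */
	while (!age_page(&act, 0))
		visits++;
	printf("deactivated on visit %d\n", visits + 1);	/* 5 */
	return (0);
}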
*/ static void vm_pageout_map_deactivate_pages(map, entry, count, freeer) vm_map_t map; vm_map_entry_t entry; int *count; freeer_fcn_t *freeer; { vm_map_t tmpm; vm_map_entry_t tmpe; vm_object_t obj; if (*count <= 0) return; vm_map_reference(map); if (!lock_try_read(&map->lock)) { vm_map_deallocate(map); return; } if (entry == 0) { tmpe = map->header.next; while (tmpe != &map->header && *count > 0) { vm_pageout_map_deactivate_pages(map, tmpe, count, freeer); tmpe = tmpe->next; }; } else if (entry->is_sub_map || entry->is_a_map) { tmpm = entry->object.share_map; tmpe = tmpm->header.next; while (tmpe != &tmpm->header && *count > 0) { vm_pageout_map_deactivate_pages(tmpm, tmpe, count, freeer); tmpe = tmpe->next; }; } else if ((obj = entry->object.vm_object) != 0) { *count -= (*freeer) (map, obj, *count, TRUE); } lock_read_done(&map->lock); vm_map_deallocate(map); return; } static void vm_req_vmdaemon() { static int lastrun = 0; if ((ticks > (lastrun + hz / 10)) || (ticks < lastrun)) { wakeup(&vm_daemon_needed); lastrun = ticks; } } /* * vm_pageout_scan does the dirty work for the pageout daemon. */ static int vm_pageout_scan() { vm_page_t m; int page_shortage, maxscan, maxlaunder, pcount; int pages_freed; vm_page_t next; struct proc *p, *bigproc; vm_offset_t size, bigsize; vm_object_t object; int force_wakeup = 0; int vnodes_skipped = 0; pages_freed = 0; /* * Start scanning the inactive queue for pages we can free. We keep * scanning until we have enough free pages or we have scanned through * the entire queue. If we encounter dirty pages, we start cleaning * them. */ maxlaunder = (cnt.v_inactive_target > MAXLAUNDER) ? MAXLAUNDER : cnt.v_inactive_target; rescan1: maxscan = cnt.v_inactive_count; m = vm_page_queue_inactive.tqh_first; while ((m != NULL) && (maxscan-- > 0) && ((cnt.v_cache_count + cnt.v_free_count) < (cnt.v_cache_min + cnt.v_free_target))) { vm_page_t next; cnt.v_pdpages++; next = m->pageq.tqe_next; #if defined(VM_DIAGNOSE) - if ((m->flags & PG_INACTIVE) == 0) { + if (m->queue != PQ_INACTIVE) { printf("vm_pageout_scan: page not inactive?\n"); break; } #endif /* * dont mess with busy pages */ - if (m->hold_count || m->busy || (m->flags & PG_BUSY)) { + if (m->busy || (m->flags & PG_BUSY)) { + m = next; + continue; + } + if (m->hold_count) { TAILQ_REMOVE(&vm_page_queue_inactive, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_inactive, m, pageq); m = next; continue; } + if (((m->flags & PG_REFERENCED) == 0) && pmap_is_referenced(VM_PAGE_TO_PHYS(m))) { m->flags |= PG_REFERENCED; } if (m->object->ref_count == 0) { m->flags &= ~PG_REFERENCED; pmap_clear_reference(VM_PAGE_TO_PHYS(m)); } - if ((m->flags & (PG_REFERENCED|PG_WANTED)) != 0) { + if ((m->flags & PG_REFERENCED) != 0) { m->flags &= ~PG_REFERENCED; pmap_clear_reference(VM_PAGE_TO_PHYS(m)); vm_page_activate(m); if (m->act_count < ACT_MAX) m->act_count += ACT_ADVANCE; m = next; continue; } - vm_page_test_dirty(m); if (m->dirty == 0) { - if (m->bmapped == 0) { - if (m->valid == 0) { - pmap_page_protect(VM_PAGE_TO_PHYS(m), VM_PROT_NONE); - vm_page_free(m); - cnt.v_dfree++; - } else { - vm_page_cache(m); - } - ++pages_freed; - } else { - m = next; - continue; - } + vm_page_test_dirty(m); + } else if (m->dirty != 0) + m->dirty = VM_PAGE_BITS_ALL; + if (m->valid == 0) { + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + cnt.v_dfree++; + ++pages_freed; + } else if (m->dirty == 0) { + vm_page_cache(m); + ++pages_freed; } else if (maxlaunder > 0) { int written; struct vnode *vp = NULL; object = m->object; if (object->flags & OBJ_DEAD) { 
m = next; continue; } if (object->type == OBJT_VNODE) { vp = object->handle; if (VOP_ISLOCKED(vp) || vget(vp, 1)) { if (object->flags & OBJ_MIGHTBEDIRTY) ++vnodes_skipped; m = next; continue; } } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ written = vm_pageout_clean(m, 0); if (vp) vput(vp); if (!next) { break; } maxlaunder -= written; /* * if the next page has been re-activated, start * scanning again */ - if ((next->flags & PG_INACTIVE) == 0) { + if (next->queue != PQ_INACTIVE) { vm_pager_sync(); goto rescan1; } } m = next; } /* * Compute the page shortage. If we are still very low on memory be * sure that we will move a minimal amount of pages from active to * inactive. */ page_shortage = cnt.v_inactive_target - (cnt.v_free_count + cnt.v_inactive_count + cnt.v_cache_count); if (page_shortage <= 0) { if (pages_freed == 0) { page_shortage = cnt.v_free_min - cnt.v_free_count; } else { page_shortage = 1; } } maxscan = MAXSCAN; pcount = cnt.v_active_count; m = vm_page_queue_active.tqh_first; - while ((m != NULL) && (maxscan > 0) && (pcount-- > 0) && (page_shortage > 0)) { + while ((m != NULL) && (maxscan > 0) && + (pcount-- > 0) && (page_shortage > 0)) { cnt.v_pdpages++; next = m->pageq.tqe_next; /* * Don't deactivate pages that are busy. */ if ((m->busy != 0) || (m->flags & PG_BUSY) || (m->hold_count != 0)) { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); m = next; - /* printf("busy: s: %d, f: 0x%x, h: %d\n", - m->busy, m->flags, m->hold_count); */ continue; } if (m->object->ref_count && - ((m->flags & (PG_REFERENCED|PG_WANTED)) || - pmap_is_referenced(VM_PAGE_TO_PHYS(m)))) { + ((m->flags & PG_REFERENCED) || + pmap_is_referenced(VM_PAGE_TO_PHYS(m))) ) { pmap_clear_reference(VM_PAGE_TO_PHYS(m)); m->flags &= ~PG_REFERENCED; if (m->act_count < ACT_MAX) { m->act_count += ACT_ADVANCE; } TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); } else { m->flags &= ~PG_REFERENCED; pmap_clear_reference(VM_PAGE_TO_PHYS(m)); m->act_count -= min(m->act_count, ACT_DECLINE); /* * if the page act_count is zero -- then we deactivate */ if (!m->act_count && (page_shortage > 0)) { if (m->object->ref_count == 0) { --page_shortage; vm_page_test_dirty(m); - if ((m->bmapped == 0) && (m->dirty == 0) ) { + if (m->dirty == 0) { m->act_count = 0; vm_page_cache(m); } else { vm_page_deactivate(m); } } else { vm_page_deactivate(m); --page_shortage; } } else if (m->act_count) { TAILQ_REMOVE(&vm_page_queue_active, m, pageq); TAILQ_INSERT_TAIL(&vm_page_queue_active, m, pageq); } } maxscan--; m = next; } /* * We try to maintain some *really* free pages, this allows interrupt * code to be guaranteed space. */ while (cnt.v_free_count < cnt.v_free_reserved) { m = vm_page_queue_cache.tqh_first; if (!m) break; vm_page_free(m); cnt.v_dfree++; } /* * If we didn't get enough free pages, and we have skipped a vnode * in a writeable object, wakeup the sync daemon. And kick swapout * if we did not get enough free pages. 
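The page_shortage computation in vm_pageout_scan() above decides how many active pages to push toward the inactive queue: normally the inactive target minus everything already free, inactive or cached, but when that comes out non-positive while the pass freed nothing, it falls back to the v_free_min deficit so the scan still makes progress. The same logic extracted, with sample numbers invented for the demo:

#include <stdio.h>

/* how many active pages should be moved toward the inactive queue? */
static int
page_shortage(int inactive_target, int free, int inactive, int cache,
    int pages_freed, int free_min)
{
	int shortage = inactive_target - (free + inactive + cache);

	if (shortage <= 0)		/* inactive queue looks healthy */
		shortage = pages_freed == 0 ? free_min - free : 1;
	return (shortage);
}

int
main(void)
{
	/*
	 * Healthy inactive queue but nothing freed this pass: fall back
	 * to the free_min deficit (61 - 40 = 21 pages to deactivate).
	 */
	printf("%d\n", page_shortage(256, 40, 300, 50, 0, 61));
	return (0);
}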
*/ - if ((cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_target) { + if ((cnt.v_cache_count + cnt.v_free_count) < + (cnt.v_free_target + cnt.v_cache_min) ) { if (vnodes_skipped && (cnt.v_cache_count + cnt.v_free_count) < cnt.v_free_min) { if (!vfs_update_wakeup) { vfs_update_wakeup = 1; wakeup(&vfs_update_wakeup); } } /* * now swap processes out if we are in low memory conditions */ if (!swap_pager_full && vm_swap_size && vm_pageout_req_swapout == 0) { vm_pageout_req_swapout = 1; vm_req_vmdaemon(); } } if ((cnt.v_inactive_count + cnt.v_free_count + cnt.v_cache_count) < (cnt.v_inactive_target + cnt.v_free_min)) { vm_req_vmdaemon(); } /* * make sure that we have swap space -- if we are low on memory and * swap -- then kill the biggest process. */ if ((vm_swap_size == 0 || swap_pager_full) && ((cnt.v_free_count + cnt.v_cache_count) < cnt.v_free_min)) { bigproc = NULL; bigsize = 0; for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { /* * if this is a system process, skip it */ if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) || ((p->p_pid < 48) && (vm_swap_size != 0))) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get the process size */ size = p->p_vmspace->vm_pmap.pm_stats.resident_count; /* * if this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { bigproc = p; bigsize = size; } } if (bigproc != NULL) { printf("Process %lu killed by vm_pageout -- out of swap\n", (u_long) bigproc->p_pid); psignal(bigproc, SIGKILL); bigproc->p_estcpu = 0; bigproc->p_nice = PRIO_MIN; resetpriority(bigproc); wakeup(&cnt.v_free_count); } } return force_wakeup; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout() { (void) spl0(); /* * Initialize some paging parameters. */ cnt.v_interrupt_free_min = 2; if (cnt.v_page_count > 1024) cnt.v_free_min = 4 + (cnt.v_page_count - 1024) / 200; else cnt.v_free_min = 4; /* * free_reserved needs to include enough for the largest swap pager * structures plus enough for any pv_entry structs when paging. */ cnt.v_pageout_free_min = 6 + cnt.v_page_count / 1024 + cnt.v_interrupt_free_min; cnt.v_free_reserved = cnt.v_pageout_free_min + 6; cnt.v_free_target = 3 * cnt.v_free_min + cnt.v_free_reserved; cnt.v_free_min += cnt.v_free_reserved; if (cnt.v_page_count > 1024) { cnt.v_cache_max = (cnt.v_free_count - 1024) / 2; cnt.v_cache_min = (cnt.v_free_count - 1024) / 8; cnt.v_inactive_target = 2*cnt.v_cache_min + 192; } else { cnt.v_cache_min = 0; cnt.v_cache_max = 0; cnt.v_inactive_target = cnt.v_free_count / 4; } /* XXX does not really belong here */ if (vm_page_max_wired == 0) vm_page_max_wired = cnt.v_free_count / 3; swap_pager_swap_init(); /* * The pageout daemon is never done, so loop forever.
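/*
 * Illustrative sketch, not part of the commit: the out-of-swap
 * victim-selection loop above, reduced to plain C. The proc list,
 * P_SYSTEM flag, and pm_stats.resident_count are modeled by a
 * hypothetical struct; as in the code, system processes, init (pid 1),
 * and low pids are skipped (the commit additionally conditions the
 * low-pid test on swap being configured) and the largest resident
 * process is chosen.
 */
#include <stdio.h>
#include <stddef.h>

struct fakeproc {
        int pid;
        int is_system;          /* stands in for P_SYSTEM */
        long resident_pages;    /* stands in for pm_stats.resident_count */
};

static struct fakeproc *pick_victim(struct fakeproc *p, int n)
{
        struct fakeproc *big = NULL;
        long bigsize = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (p[i].is_system || p[i].pid == 1 || p[i].pid < 48)
                        continue;       /* never kill kernel helpers/init */
                if (p[i].resident_pages > bigsize) {
                        big = &p[i];
                        bigsize = p[i].resident_pages;
                }
        }
        return big;
}

int main(void)
{
        struct fakeproc procs[] = {
                { 1, 0, 9000 }, { 30, 1, 500 }, { 120, 0, 2048 },
                { 333, 0, 8192 }, { 400, 0, 1024 },
        };
        struct fakeproc *v = pick_victim(procs, 5);
        if (v != NULL)
                printf("would kill pid %d (%ld pages)\n", v->pid,
                    v->resident_pages);
        return 0;
}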
*/ while (TRUE) { int s = splhigh(); if (!vm_pages_needed || ((cnt.v_free_count >= cnt.v_free_reserved) && (cnt.v_free_count + cnt.v_cache_count >= cnt.v_free_min))) { vm_pages_needed = 0; tsleep(&vm_pages_needed, PVM, "psleep", 0); } vm_pages_needed = 0; splx(s); cnt.v_pdwakeups++; vm_pager_sync(); vm_pageout_scan(); vm_pager_sync(); wakeup(&cnt.v_free_count); wakeup(kmem_map); } } static void vm_daemon() { vm_object_t object; struct proc *p; while (TRUE) { tsleep(&vm_daemon_needed, PUSER, "psleep", 0); if (vm_pageout_req_swapout) { swapout_procs(); vm_pageout_req_swapout = 0; } /* * scan the processes for exceeding their rlimits or if * process is swapped out -- deactivate pages */ for (p = (struct proc *) allproc; p != NULL; p = p->p_next) { int overage; quad_t limit; vm_offset_t size; /* * if this is a system process or if we have already * looked at this process, skip it. */ if (p->p_flag & (P_SYSTEM | P_WEXIT)) { continue; } /* * if the process is in a non-running type state, * don't touch it. */ if (p->p_stat != SRUN && p->p_stat != SSLEEP) { continue; } /* * get a limit */ limit = qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, p->p_rlimit[RLIMIT_RSS].rlim_max); /* * let processes that are swapped out really be * swapped out set the limit to nothing (will force a * swap-out.) */ if ((p->p_flag & P_INMEM) == 0) limit = 0; /* XXX */ size = p->p_vmspace->vm_pmap.pm_stats.resident_count * PAGE_SIZE; if (limit >= 0 && size >= limit) { overage = (size - limit) >> PAGE_SHIFT; vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, (vm_map_entry_t) 0, &overage, vm_pageout_object_deactivate_pages); } } /* * we remove cached objects that have no RSS... */ restart: object = vm_object_cached_list.tqh_first; while (object) { /* * if there are no resident pages -- get rid of the object */ if (object->resident_page_count == 0) { vm_object_reference(object); pager_cache(object, FALSE); goto restart; } object = object->cached_list.tqe_next; } } } Index: head/sys/vm/vm_unix.c =================================================================== --- head/sys/vm/vm_unix.c (revision 13489) +++ head/sys/vm/vm_unix.c (revision 13490) @@ -1,119 +1,121 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
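/*
 * Illustrative sketch, not part of the commit: the tsleep()/wakeup()
 * handshake that drives vm_pageout() and vm_daemon() above, loosely
 * modeled with POSIX threads. Sleeping on an address maps onto a
 * condition variable plus a predicate; all names below are
 * hypothetical. Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t need = PTHREAD_COND_INITIALIZER;
static int pages_needed = 0;
static int shutting_down = 0;

static void *daemon_loop(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        while (!shutting_down) {
                while (!pages_needed && !shutting_down)
                        pthread_cond_wait(&need, &lock);  /* ~tsleep() */
                if (pages_needed) {
                        pages_needed = 0;
                        printf("daemon: scanning queues\n");
                }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, daemon_loop, NULL);

        pthread_mutex_lock(&lock);
        pages_needed = 1;                 /* ~wakeup(&vm_pages_needed) */
        pthread_cond_signal(&need);
        pthread_mutex_unlock(&lock);

        pthread_mutex_lock(&lock);
        shutting_down = 1;
        pthread_cond_broadcast(&need);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        return 0;
}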
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_unix.c 1.1 89/11/07$ * * @(#)vm_unix.c 8.1 (Berkeley) 6/11/93 - * $Id: vm_unix.c,v 1.8 1995/11/12 06:43:28 bde Exp $ + * $Id: vm_unix.c,v 1.9 1995/12/07 12:48:29 davidg Exp $ */ /* * Traditional sbrk/grow interface to VM */ #include #include #include #include #include #include #include #include #include #include #include +#include #ifndef _SYS_SYSPROTO_H_ struct obreak_args { char *nsize; }; #endif /* ARGSUSED */ int obreak(p, uap, retval) struct proc *p; struct obreak_args *uap; int *retval; { register struct vmspace *vm = p->p_vmspace; vm_offset_t new, old; int rv; register int diff; old = (vm_offset_t) vm->vm_daddr; new = round_page(uap->nsize); if ((int) (new - old) > p->p_rlimit[RLIMIT_DATA].rlim_cur) return (ENOMEM); old = round_page(old + ctob(vm->vm_dsize)); diff = new - old; if (diff > 0) { if (swap_pager_full) { return (ENOMEM); } - rv = vm_map_find(&vm->vm_map, NULL, 0, &old, diff, FALSE); + rv = vm_map_find(&vm->vm_map, NULL, 0, &old, diff, FALSE, + VM_PROT_ALL, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize += btoc(diff); } else if (diff < 0) { diff = -diff; rv = vm_map_remove(&vm->vm_map, new, new + diff); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize -= btoc(diff); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct ovadvise_args { int anom; }; #endif /* ARGSUSED */ int ovadvise(p, uap, retval) struct proc *p; struct ovadvise_args *uap; int *retval; { return (EINVAL); } Index: head/sys/vm/vnode_pager.c =================================================================== --- head/sys/vm/vnode_pager.c (revision 13489) +++ head/sys/vm/vnode_pager.c (revision 13490) @@ -1,960 +1,961 @@ /* * Copyright (c) 1990 University of Utah. * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * Copyright (c) 1993, 1994 John S. Dyson * Copyright (c) 1995, David Greenman * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vnode_pager.c 7.5 (Berkeley) 4/20/91 - * $Id: vnode_pager.c,v 1.56 1995/12/14 09:55:14 phk Exp $ + * $Id: vnode_pager.c,v 1.57 1995/12/17 23:29:56 dyson Exp $ */ /* * Page to/from files (vnodes). */ /* * TODO: * Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will * greatly re-simplify the vnode_pager. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static vm_offset_t vnode_pager_addr __P((struct vnode *vp, vm_ooffset_t address, int *run)); static void vnode_pager_iodone __P((struct buf *bp)); static int vnode_pager_input_smlfs __P((vm_object_t object, vm_page_t m)); static int vnode_pager_input_old __P((vm_object_t object, vm_page_t m)); static void vnode_pager_dealloc __P((vm_object_t)); static int vnode_pager_getpages __P((vm_object_t, vm_page_t *, int, int)); static int vnode_pager_putpages __P((vm_object_t, vm_page_t *, int, boolean_t, int *)); static boolean_t vnode_pager_haspage __P((vm_object_t, vm_pindex_t, int *, int *)); struct pagerops vnodepagerops = { NULL, vnode_pager_alloc, vnode_pager_dealloc, vnode_pager_getpages, vnode_pager_putpages, vnode_pager_haspage, NULL }; static int vnode_pager_leaf_getpages __P((vm_object_t object, vm_page_t *m, int count, int reqpage)); static int vnode_pager_leaf_putpages __P((vm_object_t object, vm_page_t *m, int count, boolean_t sync, int *rtvals)); /* * Allocate (or lookup) pager for a vnode. * Handle is a vnode pointer. */ vm_object_t vnode_pager_alloc(handle, size, prot, offset) void *handle; vm_size_t size; vm_prot_t prot; vm_ooffset_t offset; { vm_object_t object; struct vnode *vp; /* * Pageout to vnode, no can do yet. */ if (handle == NULL) return (NULL); vp = (struct vnode *) handle; /* * Prevent race condition when allocating the object. This * can happen with NFS vnodes since the nfsnode isn't locked. */ while (vp->v_flag & VOLOCK) { vp->v_flag |= VOWANT; tsleep(vp, PVM, "vnpobj", 0); } vp->v_flag |= VOLOCK; /* * If the object is being terminated, wait for it to * go away. */ - while (((object = vp->v_object) != NULL) && (object->flags & OBJ_DEAD)) { + while (((object = vp->v_object) != NULL) && + (object->flags & OBJ_DEAD)) { tsleep(object, PVM, "vadead", 0); } if (object == NULL) { /* * And an object of the appropriate size */ object = vm_object_allocate(OBJT_VNODE, size); object->flags = OBJ_CANPERSIST; /* * Hold a reference to the vnode and initialize object data. 
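/*
 * Illustrative sketch, not part of the commit: the VOLOCK/VOWANT flag
 * protocol used in vnode_pager_alloc() above, modeled with POSIX
 * threads. In the kernel the flag pair plus tsleep()/wakeup() form a
 * tiny sleep lock that serializes object allocation per vnode; here a
 * mutex/condvar plays the role of sleeping on the vnode address. The
 * struct and function names are hypothetical. Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

#define VOLOCK 0x1      /* someone is allocating the object */
#define VOWANT 0x2      /* someone else is waiting for them */

struct fakevnode {
        int flags;
        pthread_mutex_t mtx;
        pthread_cond_t cv;
};

static void vn_obj_lock(struct fakevnode *vp)
{
        pthread_mutex_lock(&vp->mtx);
        while (vp->flags & VOLOCK) {
                vp->flags |= VOWANT;              /* record the waiter */
                pthread_cond_wait(&vp->cv, &vp->mtx);
        }
        vp->flags |= VOLOCK;
        pthread_mutex_unlock(&vp->mtx);
}

static void vn_obj_unlock(struct fakevnode *vp)
{
        pthread_mutex_lock(&vp->mtx);
        vp->flags &= ~VOLOCK;
        if (vp->flags & VOWANT) {                 /* ~wakeup(vp) */
                vp->flags &= ~VOWANT;
                pthread_cond_broadcast(&vp->cv);
        }
        pthread_mutex_unlock(&vp->mtx);
}

int main(void)
{
        struct fakevnode v = { 0, PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER };
        vn_obj_lock(&v);
        printf("allocating object race-free\n");
        vn_obj_unlock(&v);
        return 0;
}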
*/ VREF(vp); object->un_pager.vnp.vnp_size = (vm_ooffset_t) size * PAGE_SIZE; object->handle = handle; vp->v_object = object; } else { /* * vm_object_reference() will remove the object from the cache if * found and gain a reference to the object. */ vm_object_reference(object); } if (vp->v_type == VREG) vp->v_flag |= VVMIO; vp->v_flag &= ~VOLOCK; if (vp->v_flag & VOWANT) { vp->v_flag &= ~VOWANT; wakeup(vp); } return (object); } static void vnode_pager_dealloc(object) vm_object_t object; { register struct vnode *vp = object->handle; if (vp == NULL) panic("vnode_pager_dealloc: pager already dealloced"); if (object->paging_in_progress) { int s = splbio(); while (object->paging_in_progress) { object->flags |= OBJ_PIPWNT; tsleep(object, PVM, "vnpdea", 0); } splx(s); } object->handle = NULL; vp->v_object = NULL; vp->v_flag &= ~(VTEXT | VVMIO); vp->v_flag |= VAGE; vrele(vp); } static boolean_t vnode_pager_haspage(object, pindex, before, after) vm_object_t object; vm_pindex_t pindex; int *before; int *after; { struct vnode *vp = object->handle; daddr_t bn; int err; daddr_t reqblock; int poff; int bsize; int pagesperblock, blocksperpage; /* * If filesystem no longer mounted or offset beyond end of file we do * not have the page. */ if ((vp->v_mount == NULL) || (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)) return FALSE; bsize = vp->v_mount->mnt_stat.f_iosize; pagesperblock = bsize / PAGE_SIZE; blocksperpage = 0; if (pagesperblock > 0) { reqblock = pindex / pagesperblock; } else { blocksperpage = (PAGE_SIZE / bsize); reqblock = pindex * blocksperpage; } err = VOP_BMAP(vp, reqblock, (struct vnode **) 0, &bn, after, before); if (err) return TRUE; if ( bn == -1) return FALSE; if (pagesperblock > 0) { poff = pindex - (reqblock * pagesperblock); if (before) { *before *= pagesperblock; *before += poff; } if (after) { int numafter; *after *= pagesperblock; numafter = pagesperblock - (poff + 1); if (IDX_TO_OFF(pindex + numafter) > object->un_pager.vnp.vnp_size) { numafter = OFF_TO_IDX((object->un_pager.vnp.vnp_size - IDX_TO_OFF(pindex))); } *after += numafter; } } else { if (before) { *before /= blocksperpage; } if (after) { *after /= blocksperpage; } } return TRUE; } /* * Lets the VM system know about a change in size for a file. * We adjust our own internal size and flush any cached pages in * the associated object that are affected by the size change. * * Note: this routine may be invoked as a result of a pager put * operation (possibly at object termination time), so we must be careful. */ void vnode_pager_setsize(vp, nsize) struct vnode *vp; vm_ooffset_t nsize; { vm_object_t object = vp->v_object; if (object == NULL) return; /* * Hasn't changed size */ if (nsize == object->un_pager.vnp.vnp_size) return; /* * File has shrunk. Toss any cached pages beyond the new EOF. */ if (nsize < object->un_pager.vnp.vnp_size) { vm_ooffset_t nsizerounded; nsizerounded = IDX_TO_OFF(OFF_TO_IDX(nsize + PAGE_SIZE - 1)); if (nsizerounded < object->un_pager.vnp.vnp_size) { vm_object_page_remove(object, OFF_TO_IDX(nsize + PAGE_SIZE - 1), OFF_TO_IDX(object->un_pager.vnp.vnp_size), FALSE); } /* * this gets rid of garbage at the end of a page that is now * only partially backed by the vnode... 
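/*
 * Illustrative sketch, not part of the commit: the page-to-filesystem-
 * block conversions done by vnode_pager_haspage() above, standalone.
 * With bsize >= PAGE_SIZE, several pages share one block
 * (pagesperblock); with bsize < PAGE_SIZE, one page spans several
 * blocks (blocksperpage). The sizes chosen here are hypothetical.
 */
#include <stdio.h>

#define PAGE_SIZE 4096L

int main(void)
{
        long pindex = 13;       /* page index within the file */
        long bsize;

        bsize = 8192;           /* big-block case, e.g. an 8K filesystem */
        if (bsize >= PAGE_SIZE) {
                long pagesperblock = bsize / PAGE_SIZE;
                long reqblock = pindex / pagesperblock;
                long poff = pindex - reqblock * pagesperblock;
                printf("page %ld -> fs block %ld, page %ld within it\n",
                    pindex, reqblock, poff);
        }

        bsize = 1024;           /* small-block case, e.g. a 1K filesystem */
        if (bsize < PAGE_SIZE) {
                long blocksperpage = PAGE_SIZE / bsize;
                long reqblock = pindex * blocksperpage;
                printf("page %ld -> starts at fs block %ld (%ld blocks)\n",
                    pindex, reqblock, blocksperpage);
        }
        return 0;
}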
*/ if (nsize & PAGE_MASK) { vm_offset_t kva; vm_page_t m; m = vm_page_lookup(object, OFF_TO_IDX(nsize)); if (m) { kva = vm_pager_map_page(m); bzero((caddr_t) kva + (nsize & PAGE_MASK), (int) (round_page(nsize) - nsize)); vm_pager_unmap_page(kva); } } } object->un_pager.vnp.vnp_size = nsize; object->size = OFF_TO_IDX(nsize + PAGE_SIZE - 1); } void vnode_pager_umount(mp) register struct mount *mp; { struct vnode *vp, *nvp; loop: for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { /* * Vnode can be reclaimed by getnewvnode() while we * traverse the list. */ if (vp->v_mount != mp) goto loop; /* * Save the next pointer now since uncaching may terminate the * object and render vnode invalid */ nvp = vp->v_mntvnodes.le_next; if (vp->v_object != NULL) { VOP_LOCK(vp); vnode_pager_uncache(vp); VOP_UNLOCK(vp); } } } /* * Remove vnode associated object from the object cache. * This routine must be called with the vnode locked. * * XXX unlock the vnode. * We must do this since uncaching the object may result in its * destruction which may initiate paging activity which may necessitate * re-locking the vnode. */ void vnode_pager_uncache(vp) struct vnode *vp; { vm_object_t object; /* * Not a mapped vnode */ object = vp->v_object; if (object == NULL) return; vm_object_reference(object); VOP_UNLOCK(vp); pager_cache(object, FALSE); VOP_LOCK(vp); return; } void vnode_pager_freepage(m) vm_page_t m; { PAGE_WAKEUP(m); vm_page_free(m); } /* * calculate the linear (byte) disk address of specified virtual * file address */ static vm_offset_t vnode_pager_addr(vp, address, run) struct vnode *vp; vm_ooffset_t address; int *run; { int rtaddress; int bsize; daddr_t block; struct vnode *rtvp; int err; daddr_t vblock; int voffset; if ((int) address < 0) return -1; if (vp->v_mount == NULL) return -1; bsize = vp->v_mount->mnt_stat.f_iosize; vblock = address / bsize; voffset = address % bsize; err = VOP_BMAP(vp, vblock, &rtvp, &block, run, NULL); if (err || (block == -1)) rtaddress = -1; else { rtaddress = block + voffset / DEV_BSIZE; if( run) { *run += 1; *run *= bsize/PAGE_SIZE; *run -= voffset/PAGE_SIZE; } } return rtaddress; } /* * interrupt routine for I/O completion */ static void vnode_pager_iodone(bp) struct buf *bp; { bp->b_flags |= B_DONE; wakeup(bp); } /* * small block file system vnode pager input */ static int vnode_pager_input_smlfs(object, m) vm_object_t object; vm_page_t m; { int i; int s; struct vnode *dp, *vp; struct buf *bp; vm_offset_t kva; int fileaddr; vm_offset_t bsize; int error = 0; vp = object->handle; if (vp->v_mount == NULL) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; VOP_BMAP(vp, 0, &dp, 0, NULL, NULL); kva = vm_pager_map_page(m); for (i = 0; i < PAGE_SIZE / bsize; i++) { if ((vm_page_bits(IDX_TO_OFF(m->pindex) + i * bsize, bsize) & m->valid)) continue; fileaddr = vnode_pager_addr(vp, IDX_TO_OFF(m->pindex) + i * bsize, (int *)0); if (fileaddr != -1) { bp = getpbuf(); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_un.b_addr = (caddr_t) kva + i * bsize; bp->b_blkno = fileaddr; pbgetvp(dp, bp); bp->b_bcount = bsize; bp->b_bufsize = bsize; /* do the input */ VOP_STRATEGY(bp); /* we definitely need to be at splbio here */ s = splbio(); while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "vnsrd", 0); } splx(s); if ((bp->b_flags & B_ERROR) 
!= 0) error = EIO; /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); if (error) break; vm_page_set_validclean(m, (i * bsize) & (PAGE_SIZE-1), bsize); } else { vm_page_set_validclean(m, (i * bsize) & (PAGE_SIZE-1), bsize); bzero((caddr_t) kva + i * bsize, bsize); } } vm_pager_unmap_page(kva); pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->flags &= ~PG_ZERO; if (error) { return VM_PAGER_ERROR; } return VM_PAGER_OK; } /* * old style vnode pager input routine */ static int vnode_pager_input_old(object, m) vm_object_t object; vm_page_t m; { struct uio auio; struct iovec aiov; int error; int size; vm_offset_t kva; error = 0; /* * Return failure if beyond current EOF */ if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size) { return VM_PAGER_BAD; } else { size = PAGE_SIZE; if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex); /* * Allocate a kernel virtual address and initialize so that * we can use VOP_READ/WRITE routines. */ kva = vm_pager_map_page(m); aiov.iov_base = (caddr_t) kva; aiov.iov_len = size; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = IDX_TO_OFF(m->pindex); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_resid = size; auio.uio_procp = (struct proc *) 0; error = VOP_READ(object->handle, &auio, 0, curproc->p_ucred); if (!error) { register int count = size - auio.uio_resid; if (count == 0) error = EINVAL; else if (count != PAGE_SIZE) bzero((caddr_t) kva + count, PAGE_SIZE - count); } vm_pager_unmap_page(kva); } pmap_clear_modify(VM_PAGE_TO_PHYS(m)); m->dirty = 0; m->flags &= ~PG_ZERO; return error ? VM_PAGER_ERROR : VM_PAGER_OK; } /* * generic vnode pager input routine */ static int vnode_pager_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { int rtval; struct vnode *vp; vp = object->handle; rtval = VOP_GETPAGES(vp, m, count*PAGE_SIZE, reqpage, 0); if (rtval == EOPNOTSUPP) return vnode_pager_leaf_getpages(object, m, count, reqpage); else return rtval; } static int vnode_pager_leaf_getpages(object, m, count, reqpage) vm_object_t object; vm_page_t *m; int count; int reqpage; { vm_offset_t kva; off_t foff; int i, size, bsize, first, firstaddr; struct vnode *dp, *vp; int runpg; int runend; struct buf *bp; int s; int error = 0; vp = object->handle; if (vp->v_mount == NULL) return VM_PAGER_BAD; bsize = vp->v_mount->mnt_stat.f_iosize; /* get the UNDERLYING device for the file with VOP_BMAP() */ /* * originally, we did not check for an error return value -- assuming * an fs always has a bmap entry point -- that assumption is wrong!!! */ foff = IDX_TO_OFF(m[reqpage]->pindex); /* * if we can't bmap, use old VOP code */ if (VOP_BMAP(vp, 0, &dp, 0, NULL, NULL)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } cnt.v_vnodein++; cnt.v_vnodepgsin++; return vnode_pager_input_old(object, m[reqpage]); /* * if the blocksize is smaller than a page size, then use * special small filesystem code. NFS sometimes has a small * blocksize, but it can handle large reads itself.
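/*
 * Illustrative sketch, not part of the commit: the file-offset to
 * disk-address arithmetic of vnode_pager_addr() above, standalone.
 * VOP_BMAP is replaced by a hypothetical lookup table; the key line is
 * the same: the target address is the block's disk address plus the
 * intra-block offset expressed in DEV_BSIZE (512-byte) sectors.
 */
#include <stdio.h>

#define DEV_BSIZE 512L

int main(void)
{
        long bsize = 8192;              /* filesystem block size */
        long address = 3 * 8192 + 4096; /* byte offset into the file */
        long lblkno[] = { 100, 220, 300, 464 };   /* hypothetical bmap */

        long vblock = address / bsize;  /* logical fs block */
        long voffset = address % bsize; /* offset within that block */
        long block = lblkno[vblock];    /* disk address, in sectors */
        long rtaddress = block + voffset / DEV_BSIZE;

        printf("byte %ld -> sector %ld\n", address, rtaddress);
        return 0;
}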
*/ } else if ((PAGE_SIZE / bsize) > 1 && (vp->v_mount->mnt_stat.f_type != MOUNT_NFS)) { for (i = 0; i < count; i++) { if (i != reqpage) { vnode_pager_freepage(m[i]); } } cnt.v_vnodein++; cnt.v_vnodepgsin++; return vnode_pager_input_smlfs(object, m[reqpage]); } /* * if ANY DEV_BSIZE blocks are valid on a large filesystem block * then, the entire page is valid -- */ if (m[reqpage]->valid) { m[reqpage]->valid = VM_PAGE_BITS_ALL; for (i = 0; i < count; i++) { if (i != reqpage) vnode_pager_freepage(m[i]); } return VM_PAGER_OK; } /* * here on direct device I/O */ firstaddr = -1; /* * calculate the run that includes the required page */ for(first = 0, i = 0; i < count; i = runend) { firstaddr = vnode_pager_addr(vp, IDX_TO_OFF(m[i]->pindex), &runpg); if (firstaddr == -1) { if (i == reqpage && foff < object->un_pager.vnp.vnp_size) { panic("vnode_pager_getpages: unexpected missing page: firstaddr: %d, foff: %ld, vnp_size: %d", firstaddr, foff, object->un_pager.vnp.vnp_size); } vnode_pager_freepage(m[i]); runend = i + 1; first = runend; continue; } runend = i + runpg; if (runend <= reqpage) { int j; for (j = i; j < runend; j++) { vnode_pager_freepage(m[j]); } } else { if (runpg < (count - first)) { for (i = first + runpg; i < count; i++) vnode_pager_freepage(m[i]); count = first + runpg; } break; } first = runend; } /* * the first and last page have been calculated now, move input pages * to be zero based... */ if (first != 0) { for (i = first; i < count; i++) { m[i - first] = m[i]; } count -= first; reqpage -= first; } /* * calculate the file virtual address for the transfer */ foff = IDX_TO_OFF(m[0]->pindex); /* * calculate the size of the transfer */ size = count * PAGE_SIZE; if ((foff + size) > object->un_pager.vnp.vnp_size) size = object->un_pager.vnp.vnp_size - foff; /* * round up physical size for real devices */ if (dp->v_type == VBLK || dp->v_type == VCHR) size = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); bp = getpbuf(); kva = (vm_offset_t) bp->b_data; /* * and map the pages to be read into the kva */ pmap_qenter(kva, m, count); /* build a minimal buffer header */ bp->b_flags = B_BUSY | B_READ | B_CALL; bp->b_iodone = vnode_pager_iodone; /* B_PHYS is not set, but it is nice to fill this in */ bp->b_proc = curproc; bp->b_rcred = bp->b_wcred = bp->b_proc->p_ucred; if (bp->b_rcred != NOCRED) crhold(bp->b_rcred); if (bp->b_wcred != NOCRED) crhold(bp->b_wcred); bp->b_blkno = firstaddr; pbgetvp(dp, bp); bp->b_bcount = size; bp->b_bufsize = size; cnt.v_vnodein++; cnt.v_vnodepgsin += count; /* do the input */ VOP_STRATEGY(bp); s = splbio(); /* we definitely need to be at splbio here */ while ((bp->b_flags & B_DONE) == 0) { tsleep(bp, PVM, "vnread", 0); } splx(s); if ((bp->b_flags & B_ERROR) != 0) error = EIO; if (!error) { if (size != count * PAGE_SIZE) bzero((caddr_t) kva + size, PAGE_SIZE * count - size); } pmap_qremove(kva, count); /* * free the buffer header back to the swap buffer pool */ relpbuf(bp); for (i = 0; i < count; i++) { pmap_clear_modify(VM_PAGE_TO_PHYS(m[i])); m[i]->dirty = 0; m[i]->valid = VM_PAGE_BITS_ALL; m[i]->flags &= ~PG_ZERO; if (i != reqpage) { /* * whether or not to leave the page activated is up in * the air, but we should put the page on a page queue * somewhere. (it already is in the object). Result: * It appears that empirical results show that * deactivating pages is best.
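/*
 * Illustrative sketch, not part of the commit: the array surgery in
 * vnode_pager_leaf_getpages() above, which trims the page array to the
 * run of contiguous disk blocks containing the requested page and then
 * shifts it to be zero-based. Pages here are just ints for clarity;
 * the run boundaries are hypothetical.
 */
#include <stdio.h>

int main(void)
{
        int m[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
        int count = 8, reqpage = 3;
        int first = 2, runpg = 4;   /* hypothetical: run covers m[2..5] */
        int i;

        /* pages after the run are dropped (freed in the real code) */
        if (first + runpg < count)
                count = first + runpg;

        /* pages before the run are dropped, the rest shifted to index 0 */
        if (first != 0) {
                for (i = first; i < count; i++)
                        m[i - first] = m[i];
                count -= first;
                reqpage -= first;
        }

        printf("count=%d reqpage=%d first page=%d\n", count, reqpage, m[0]);
        return 0;
}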
*/ /* * just in case someone was asking for this page, we * now tell them that it is ok to use */ if (!error) { vm_page_deactivate(m[i]); PAGE_WAKEUP(m[i]); } else { vnode_pager_freepage(m[i]); } } } if (error) { printf("vnode_pager_getpages: I/O read error\n"); } return (error ? VM_PAGER_ERROR : VM_PAGER_OK); } static int vnode_pager_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int rtval; struct vnode *vp; vp = object->handle; rtval = VOP_PUTPAGES(vp, m, count*PAGE_SIZE, sync, rtvals, 0); if (rtval == EOPNOTSUPP) return vnode_pager_leaf_putpages(object, m, count, sync, rtvals); else return rtval; } /* * generic vnode pager output routine */ static int vnode_pager_leaf_putpages(object, m, count, sync, rtvals) vm_object_t object; vm_page_t *m; int count; boolean_t sync; int *rtvals; { int i; struct vnode *vp; int maxsize, ncount; vm_ooffset_t poffset; struct uio auio; struct iovec aiov; int error; vp = object->handle; for (i = 0; i < count; i++) rtvals[i] = VM_PAGER_AGAIN; if ((int) m[0]->pindex < 0) { printf("vnode_pager_putpages: attempt to write meta-data!!! -- 0x%x(%x)\n", m[0]->pindex, m[0]->dirty); rtvals[0] = VM_PAGER_BAD; return VM_PAGER_BAD; } maxsize = count * PAGE_SIZE; ncount = count; poffset = IDX_TO_OFF(m[0]->pindex); if (maxsize + poffset > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > poffset) maxsize = object->un_pager.vnp.vnp_size - poffset; else maxsize = 0; ncount = (maxsize + PAGE_SIZE - 1) / PAGE_SIZE; if (ncount < count) { for (i = ncount; i < count; i++) { rtvals[i] = VM_PAGER_BAD; } #ifdef BOGUS if (ncount == 0) { printf("vnode_pager_putpages: write past end of file: %d, %lu\n", poffset, (unsigned long) object->un_pager.vnp.vnp_size); return rtvals[0]; } #endif } } for (i = 0; i < count; i++) { m[i]->busy++; m[i]->flags &= ~PG_BUSY; } aiov.iov_base = (caddr_t) 0; aiov.iov_len = maxsize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = poffset; auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_WRITE; auio.uio_resid = maxsize; auio.uio_procp = (struct proc *) 0; error = VOP_WRITE(vp, &auio, IO_VMIO|(sync?IO_SYNC:0), curproc->p_ucred); cnt.v_vnodeout++; cnt.v_vnodepgsout += ncount; if (error) { printf("vnode_pager_putpages: I/O error %d\n", error); } if (auio.uio_resid) { printf("vnode_pager_putpages: residual I/O %d at %ld\n", auio.uio_resid, m[0]->pindex); } for (i = 0; i < count; i++) { m[i]->busy--; if (i < ncount) { rtvals[i] = VM_PAGER_OK; } if ((m[i]->busy == 0) && (m[i]->flags & PG_WANTED)) wakeup(m[i]); } return rtvals[0]; } struct vnode * vnode_pager_lock(object) vm_object_t object; { for (; object != NULL; object = object->backing_object) { if (object->type != OBJT_VNODE) continue; VOP_LOCK(object->handle); return object->handle; } return NULL; }
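/*
 * Illustrative sketch, not part of the commit: the backing-object walk
 * in vnode_pager_lock() above, as plain C over a hypothetical struct.
 * Shadow objects (anonymous copy-on-write layers) are skipped until a
 * vnode-backed object is found, if any.
 */
#include <stdio.h>
#include <stddef.h>

enum objtype { OBJT_DEFAULT, OBJT_VNODE };

struct fakeobj {
        enum objtype type;
        const char *handle;         /* stands in for the vnode pointer */
        struct fakeobj *backing_object;
};

static const char *find_vnode(struct fakeobj *object)
{
        for (; object != NULL; object = object->backing_object) {
                if (object->type != OBJT_VNODE)
                        continue;
                return object->handle;   /* the real code locks it first */
        }
        return NULL;
}

int main(void)
{
        struct fakeobj file = { OBJT_VNODE, "/some/file", NULL };
        struct fakeobj shadow = { OBJT_DEFAULT, NULL, &file };
        const char *h = find_vnode(&shadow);
        printf("vnode: %s\n", h ? h : "(none)");
        return 0;
}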