Index: user/attilio/rm_vmobj_cache/sys/dev/agp/agp.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/dev/agp/agp.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/dev/agp/agp.c	(revision 267237)
@@ -1,998 +1,998 @@
 /*-
  * Copyright (c) 2000 Doug Rabson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_agp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/agpio.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 
 #include <dev/agp/agppriv.h>
 #include <dev/agp/agpvar.h>
 #include <dev/agp/agpreg.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/rman.h>
 
 /* Declare version 1 of the "agp" module. */
 MODULE_VERSION(agp, 1);
 
 /* Malloc type tag passed as M_AGP to the allocations in this file. */
 MALLOC_DEFINE(M_AGP, "agp", "AGP data structures");
 
 				/* agp_drv.c */
 /* Character-device entry points; the definitions appear later in this file. */
 static d_open_t agp_open;
 static d_close_t agp_close;
 static d_ioctl_t agp_ioctl;
 static d_mmap_t agp_mmap;
 
 /*
  * Device switch handed to make_dev() by agp_generic_attach() when it
  * creates the "agpgart" node.
  */
 static struct cdevsw agp_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	agp_open,
 	.d_close =	agp_close,
 	.d_ioctl =	agp_ioctl,
 	.d_mmap =	agp_mmap,
 	.d_name =	"agp",
 };
 
 /*
  * The "agp" devclass: looked up in agp_generic_attach() and consulted by
  * agp_find_device().
  */
 static devclass_t agp_devclass;
 
 /* Helper functions for implementing chipset mini drivers. */
 
 /*
  * Flush the CPU caches.  On i386 and amd64 this issues wbinvd(); on any
  * other architecture the body is compiled out and the call does nothing.
  */
 void
 agp_flush_cache(void)
 {
 #if defined(__i386__) || defined(__amd64__)
 	wbinvd();
 #endif
 }
 
 /*
  * Return the offset reported by pci_find_cap() for the device's AGP
  * capability, or 0 when pci_find_cap() fails.
  */
 u_int8_t
 agp_find_caps(device_t dev)
 {
 	int offset = 0;
 
 	if (pci_find_cap(dev, PCIY_AGP, &offset) == 0)
 		return (offset);
 	return (0);
 }
 
 /*
  * Find an AGP display device (if any).
  */
 static device_t
 agp_find_display(void)
 {
 	devclass_t pci = devclass_find("pci");
 	device_t bus, dev = 0;
 	device_t *kids;
 	int busnum, numkids, i;
 
 	for (busnum = 0; busnum < devclass_get_maxunit(pci); busnum++) {
 		bus = devclass_get_device(pci, busnum);
 		if (!bus)
 			continue;
 		if (device_get_children(bus, &kids, &numkids) != 0)
 			continue;
 		for (i = 0; i < numkids; i++) {
 			dev = kids[i];
 			if (pci_get_class(dev) == PCIC_DISPLAY
 			    && pci_get_subclass(dev) == PCIS_DISPLAY_VGA)
 				if (agp_find_caps(dev)) {
 					free(kids, M_TEMP);
 					return dev;
 				}
 					
 		}
 		free(kids, M_TEMP);
 	}
 
 	return 0;
 }
 
 /*
  * Allocate a GATT for the device's current aperture, with one 32-bit
  * entry per AGP page.  The entry table is zeroed, its physical address is
  * recorded in ag_physical and the CPU cache is flushed.  Returns NULL when
  * the aperture contains no pages or when either allocation fails.
  */
 struct agp_gatt *
 agp_alloc_gatt(device_t dev)
 {
 	u_int32_t aperture, nentries;
 	struct agp_gatt *table;
 
 	aperture = AGP_GET_APERTURE(dev);
 	nentries = aperture >> AGP_PAGE_SHIFT;
 
 	if (bootverbose) {
 		device_printf(dev,
 		    "allocating GATT for aperture of size %dM\n",
 		    aperture / (1024*1024));
 	}
 
 	if (nentries == 0) {
 		device_printf(dev, "bad aperture size\n");
 		return (NULL);
 	}
 
 	table = malloc(sizeof(*table), M_AGP, M_NOWAIT);
 	if (table == NULL)
 		return (NULL);
 
 	table->ag_entries = nentries;
 	table->ag_virtual = contigmalloc(nentries * sizeof(u_int32_t), M_AGP,
 	    0, 0, ~0, PAGE_SIZE, 0);
 	if (table->ag_virtual == NULL) {
 		if (bootverbose)
 			device_printf(dev, "contiguous allocation failed\n");
 		free(table, M_AGP);
 		return (NULL);
 	}
 	bzero(table->ag_virtual, nentries * sizeof(u_int32_t));
 	table->ag_physical = vtophys((vm_offset_t) table->ag_virtual);
 	agp_flush_cache();
 
 	return (table);
 }
 
 /*
  * Release a GATT obtained from agp_alloc_gatt(): the contiguous entry
  * table first, then the descriptor itself.
  */
 void
 agp_free_gatt(struct agp_gatt *gatt)
 {
 	void *entries = gatt->ag_virtual;
 
 	contigfree(entries, gatt->ag_entries * sizeof(u_int32_t), M_AGP);
 	free(gatt, M_AGP);
 }
 
 /*
  * Limits on AGP memory, used by agp_generic_attach().  Each row is
  * { system memory in MB, AGP memory ceiling in MB }; attach selects the
  * first row whose first column is not smaller than the machine's memory
  * size, or the last row when none qualifies.
  */
 static u_int agp_max[][2] = {
 	{0,	0},
 	{32,	4},
 	{64,	28},
 	{128,	96},
 	{256,	204},
 	{512,	440},
 	{1024,	942},
 	{2048,	1920},
 	{4096,	3932}
 };
 /* Number of rows in agp_max. */
 #define agp_max_size	(sizeof(agp_max) / sizeof(agp_max[0]))
 
 /**
  * Record which PCI resource ID represents the AGP aperture.
  *
  * If this is never called, the default AGP aperture resource of
  * AGP_APBASE is used.  Must be called before agp_generic_attach().
  */
 void
 agp_set_aperture_resource(device_t dev, int rid)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	softc->as_aperture_rid = rid;
 }
 
 /*
  * Attach work common to the chipset drivers: allocate the aperture
  * resource (skipped when the rid is -1), derive the AGP memory ceiling
  * from the amount of system memory, set up the softc lock and memory list,
  * and create the "agpgart" device node.  Returns 0, or ENOMEM when the
  * aperture resource cannot be allocated.
  */
 int
 agp_generic_attach(device_t dev)
 {
 	struct agp_softc *sc;
 	u_int memsize;
 	int i;
 
 	sc = device_get_softc(dev);
 
 	/*
 	 * Find and map the aperture, RF_SHAREABLE for DRM but not RF_ACTIVE
 	 * because the kernel doesn't need to map it.
 	 */
 	if (sc->as_aperture_rid != -1) {
 		/* A rid of 0 means "not set": use the default BAR. */
 		if (sc->as_aperture_rid == 0)
 			sc->as_aperture_rid = AGP_APBASE;
 
 		sc->as_aperture = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
 		    &sc->as_aperture_rid, RF_SHAREABLE);
 		if (sc->as_aperture == NULL)
 			return (ENOMEM);
 	}
 
 	/*
 	 * Work out an upper bound for agp memory allocation. This
 	 * uses a heuristic table from the Linux driver.
 	 */
 	memsize = ptoa(realmem) >> 20;
 	i = 0;
 	while (i < agp_max_size && memsize > agp_max[i][0])
 		i++;
 	/* More memory than the table knows about: clamp to the last row. */
 	if (i == agp_max_size)
 		i = agp_max_size - 1;
 	sc->as_maxmem = agp_max[i][1] << 20U;
 
 	/*
 	 * The lock is used to prevent re-entry to
 	 * agp_generic_bind_memory() since that function can sleep.
 	 */
 	mtx_init(&sc->as_lock, "agp lock", NULL, MTX_DEF);
 
 	/*
 	 * Initialise stuff for the userland device.
 	 */
 	agp_devclass = devclass_find("agp");
 	TAILQ_INIT(&sc->as_memory);
 	sc->as_nextid = 1;
 
 	sc->as_devnode = make_dev(&agp_cdevsw,
 	    0, UID_ROOT, GID_WHEEL, 0600, "agpgart");
 	/* The cdev entry points recover the device_t through si_drv1. */
 	sc->as_devnode->si_drv1 = dev;
 
 	return (0);
 }
 
 /*
  * Destroy the device node that agp_generic_attach() created.
  */
 void
 agp_free_cdev(device_t dev)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	destroy_dev(softc->as_devnode);
 }
 
 /*
  * Release the aperture resource (when one was allocated), destroy the
  * softc lock and flush the CPU cache.
  */
 void
 agp_free_res(device_t dev)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	if (softc->as_aperture != NULL) {
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    softc->as_aperture_rid, softc->as_aperture);
 	}
 	mtx_destroy(&softc->as_lock);
 	agp_flush_cache();
 }
 
 /*
  * Undo agp_generic_attach().  Always returns 0.
  */
 int
 agp_generic_detach(device_t dev)
 {
 
 	/* Reverse of attach: remove the device node, then the resources. */
 	agp_free_cdev(dev);
 	agp_free_res(dev);
 	return (0);
 }
 
 /**
  * Default AGP aperture size detection: report the size of the PCI
  * resource that backs the aperture.
  */
 u_int32_t
 agp_generic_get_aperture(device_t dev)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	return (rman_get_size(softc->as_aperture));
 }
 
 /**
  * Default AGP aperture size setting function.  The resource size cannot
  * be changed, so only a request for the current size succeeds (0); any
  * other size yields EINVAL.
  */
 int
 agp_generic_set_aperture(device_t dev, u_int32_t aperture)
 {
 	const u_int32_t present = AGP_GET_APERTURE(dev);
 
 	return (present == aperture ? 0 : EINVAL);
 }
 
 /*
  * This does the enable logic for v3, with the same topology
  * restrictions as in place for v2 -- one bus, one device on the bus.
  *
  * The AGP status registers of dev and mdev are read, a command word is
  * negotiated from them together with the requested mode, and that word is
  * written to the AGP command register of both devices.  Always returns 0.
  */
 static int
 agp_v3_enable(device_t dev, device_t mdev, u_int32_t mode)
 {
 	u_int32_t tstatus, mstatus;
 	u_int32_t command;
 	int rq, sba, fw, rate, arqsz, cal;
 
 	tstatus = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4);
 	mstatus = pci_read_config(mdev, agp_find_caps(mdev) + AGP_STATUS, 4);
 
 	/* Set RQ to the min of mode, tstatus and mstatus */
 	rq = AGP_MODE_GET_RQ(mode);
 	if (AGP_MODE_GET_RQ(tstatus) < rq)
 		rq = AGP_MODE_GET_RQ(tstatus);
 	if (AGP_MODE_GET_RQ(mstatus) < rq)
 		rq = AGP_MODE_GET_RQ(mstatus);
 
 	/*
 	 * ARQSZ - Set the value to the maximum one.
 	 * Don't allow the mode register to override values.
 	 *
 	 * The maximum has to be tracked in arqsz itself.  Comparing against
 	 * and assigning to rq would overwrite the request depth negotiated
 	 * above and leave arqsz at whatever the mode word contained.
 	 */
 	arqsz = AGP_MODE_GET_ARQSZ(mode);
 	if (AGP_MODE_GET_ARQSZ(tstatus) > arqsz)
 		arqsz = AGP_MODE_GET_ARQSZ(tstatus);
 	if (AGP_MODE_GET_ARQSZ(mstatus) > arqsz)
 		arqsz = AGP_MODE_GET_ARQSZ(mstatus);
 
 	/* Calibration cycle - don't allow override by mode register */
 	cal = AGP_MODE_GET_CAL(tstatus);
 	if (AGP_MODE_GET_CAL(mstatus) < cal)
 		cal = AGP_MODE_GET_CAL(mstatus);
 
 	/* SBA must be supported for AGP v3. */
 	sba = 1;
 
 	/* Set FW if all three support it. */
 	fw = (AGP_MODE_GET_FW(tstatus)
 	       & AGP_MODE_GET_FW(mstatus)
 	       & AGP_MODE_GET_FW(mode));
 
 	/* Figure out the max rate; fall back to 4x when 8x is not common. */
 	rate = (AGP_MODE_GET_RATE(tstatus)
 		& AGP_MODE_GET_RATE(mstatus)
 		& AGP_MODE_GET_RATE(mode));
 	if (rate & AGP_MODE_V3_RATE_8x)
 		rate = AGP_MODE_V3_RATE_8x;
 	else
 		rate = AGP_MODE_V3_RATE_4x;
 	if (bootverbose)
 		device_printf(dev, "Setting AGP v3 mode %d\n", rate * 4);
 
 	/* Clear dev's command register before programming the new mode. */
 	pci_write_config(dev, agp_find_caps(dev) + AGP_COMMAND, 0, 4);
 
 	/* Construct the new mode word and tell the hardware */
 	command = AGP_MODE_SET_RQ(0, rq);
 	command = AGP_MODE_SET_ARQSZ(command, arqsz);
 	command = AGP_MODE_SET_CAL(command, cal);
 	command = AGP_MODE_SET_SBA(command, sba);
 	command = AGP_MODE_SET_FW(command, fw);
 	command = AGP_MODE_SET_RATE(command, rate);
 	command = AGP_MODE_SET_MODE_3(command, 1);
 	command = AGP_MODE_SET_AGP(command, 1);
 	pci_write_config(dev, agp_find_caps(dev) + AGP_COMMAND, command, 4);
 	pci_write_config(mdev, agp_find_caps(mdev) + AGP_COMMAND, command, 4);
 
 	return (0);
 }
 
 /*
  * Enable logic for AGP v2.  Negotiates a command word from the status
  * registers of dev and mdev plus the requested mode, then writes it to
  * the AGP command register of both devices.  Always returns 0.
  */
 static int
 agp_v2_enable(device_t dev, device_t mdev, u_int32_t mode)
 {
 	u_int32_t tstatus, mstatus, command;
 	int rq, sba, fw, rate;
 
 	tstatus = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4);
 	mstatus = pci_read_config(mdev, agp_find_caps(mdev) + AGP_STATUS, 4);
 
 	/* RQ is the smallest of the three request depths. */
 	rq = AGP_MODE_GET_RQ(mode);
 	if (AGP_MODE_GET_RQ(tstatus) < rq)
 		rq = AGP_MODE_GET_RQ(tstatus);
 	if (AGP_MODE_GET_RQ(mstatus) < rq)
 		rq = AGP_MODE_GET_RQ(mstatus);
 
 	/* SBA and FW are enabled only when all three agree. */
 	sba = (AGP_MODE_GET_SBA(tstatus) &
 	    AGP_MODE_GET_SBA(mstatus) &
 	    AGP_MODE_GET_SBA(mode));
 	fw = (AGP_MODE_GET_FW(tstatus) &
 	    AGP_MODE_GET_FW(mstatus) &
 	    AGP_MODE_GET_FW(mode));
 
 	/* Pick the fastest rate common to all three, defaulting to 1x. */
 	rate = (AGP_MODE_GET_RATE(tstatus) &
 	    AGP_MODE_GET_RATE(mstatus) &
 	    AGP_MODE_GET_RATE(mode));
 	if (rate & AGP_MODE_V2_RATE_4x) {
 		rate = AGP_MODE_V2_RATE_4x;
 	} else if (rate & AGP_MODE_V2_RATE_2x) {
 		rate = AGP_MODE_V2_RATE_2x;
 	} else {
 		rate = AGP_MODE_V2_RATE_1x;
 	}
 	if (bootverbose)
 		device_printf(dev, "Setting AGP v2 mode %d\n", rate);
 
 	/* Assemble the command word and program both devices with it. */
 	command = AGP_MODE_SET_RQ(0, rq);
 	command = AGP_MODE_SET_SBA(command, sba);
 	command = AGP_MODE_SET_FW(command, fw);
 	command = AGP_MODE_SET_RATE(command, rate);
 	command = AGP_MODE_SET_AGP(command, 1);
 	pci_write_config(dev, agp_find_caps(dev) + AGP_COMMAND, command, 4);
 	pci_write_config(mdev, agp_find_caps(mdev) + AGP_COMMAND, command, 4);
 
 	return (0);
 }
 
 /*
  * Generic enable method: locate the AGP display device and program it
  * and dev with a negotiated mode.  Returns ENXIO when no display is
  * found, otherwise the result of the v2 or v3 enable routine.
  */
 int
 agp_generic_enable(device_t dev, u_int32_t mode)
 {
 	device_t mdev;
 	u_int32_t tstatus, mstatus;
 
 	mdev = agp_find_display();
 	if (mdev == NULL) {
 		AGP_DPF("can't find display\n");
 		return (ENXIO);
 	}
 
 	tstatus = pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4);
 	mstatus = pci_read_config(mdev, agp_find_caps(mdev) + AGP_STATUS, 4);
 
 	/*
 	 * Check display and bridge for AGP v3 support.  AGP v3 allows
 	 * more variety in topology than v2, e.g. multiple AGP devices
 	 * attached to one bridge, or multiple AGP bridges in one
 	 * system.  This doesn't attempt to address those situations,
 	 * but should work fine for a classic single AGP slot system
 	 * with AGP v3.
 	 */
 	if (!AGP_MODE_GET_MODE_3(mode) ||
 	    !AGP_MODE_GET_MODE_3(tstatus) ||
 	    !AGP_MODE_GET_MODE_3(mstatus))
 		return (agp_v2_enable(dev, mdev, mode));
 
 	return (agp_v3_enable(dev, mdev, mode));
 }
 
 /*
  * Generic allocation method.  Creates an unbound agp_memory block of
  * `size' bytes backed by a fresh VM object, links it onto the softc's
  * list and charges it against as_allocated.  Returns NULL when size is
  * not a multiple of AGP_PAGE_SIZE, when the allocation would exceed
  * as_maxmem, or when type is non-zero.
  */
 struct agp_memory *
 agp_generic_alloc_memory(device_t dev, int type, vm_size_t size)
 {
 	struct agp_softc *sc;
 	struct agp_memory *block;
 
 	sc = device_get_softc(dev);
 
 	if ((size & (AGP_PAGE_SIZE - 1)) != 0)
 		return (NULL);
 
 	if (sc->as_allocated + size > sc->as_maxmem)
 		return (NULL);
 
 	/* Only type 0 memory is handled here. */
 	if (type != 0) {
 		printf("agp_generic_alloc_memory: unsupported type %d\n",
 		       type);
 		return (NULL);
 	}
 
 	block = malloc(sizeof(*block), M_AGP, M_WAITOK);
 	block->am_id = sc->as_nextid++;
 	block->am_size = size;
 	block->am_type = 0;
 	block->am_obj = vm_object_allocate(OBJT_DEFAULT,
 	    atop(round_page(size)));
 	block->am_physical = 0;
 	block->am_offset = 0;
 	block->am_is_bound = 0;
 	TAILQ_INSERT_TAIL(&sc->as_memory, block, am_link);
 	sc->as_allocated += size;
 
 	return (block);
 }
 
 /*
  * Generic free method.  Refuses with EBUSY while the block is bound;
  * otherwise returns its size to the pool, unlinks it, drops the VM object
  * reference, frees the descriptor and returns 0.
  */
 int
 agp_generic_free_memory(device_t dev, struct agp_memory *mem)
 {
 	struct agp_softc *softc;
 
 	if (mem->am_is_bound)
 		return (EBUSY);
 
 	softc = device_get_softc(dev);
 	softc->as_allocated -= mem->am_size;
 	TAILQ_REMOVE(&softc->as_memory, mem, am_link);
 	vm_object_deallocate(mem->am_obj);
 	free(mem, M_AGP);
 	return (0);
 }
 
 /*
  * Generic bind method: wire the pages backing `mem' and install them in
  * the GATT starting at aperture offset `offset'.
  *
  * Returns 0 on success.  Returns EINVAL when the offset is not AGP-page
  * aligned, when the block would run past the end of the aperture, or when
  * the block is already bound.  Returns the error from AGP_BIND_PAGE when
  * installing an entry fails; in that case every entry installed so far is
  * removed again.  On the paths that reach `bad' the pages are unwired
  * before returning.
  */
 int
 agp_generic_bind_memory(device_t dev, struct agp_memory *mem,
 			vm_offset_t offset)
 {
 	struct agp_softc *sc = device_get_softc(dev);
 	vm_offset_t i, j, k;
 	vm_page_t m;
 	int error;
 
 	/* Do some sanity checks first. */
 	if ((offset & (AGP_PAGE_SIZE - 1)) != 0 ||
 	    offset + mem->am_size > AGP_GET_APERTURE(dev)) {
 		device_printf(dev, "binding memory at bad offset %#x\n",
 		    (int)offset);
 		return EINVAL;
 	}
 
 	/*
 	 * Allocate the pages early, before acquiring the lock,
 	 * because vm_page_grab() may sleep and we can't hold a mutex
 	 * while sleeping.
 	 */
 	VM_OBJECT_WLOCK(mem->am_obj);
 	for (i = 0; i < mem->am_size; i += PAGE_SIZE) {
 		/*
 		 * Find a page from the object and wire it
 		 * down. This page will be mapped using one or more
 		 * entries in the GATT (assuming that PAGE_SIZE >=
 		 * AGP_PAGE_SIZE). If this is the first call to bind,
 		 * the pages will be allocated and zeroed.
 		 */
 		m = vm_page_grab(mem->am_obj, OFF_TO_IDX(i),
 		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		AGP_DPF("found page pa=%#jx\n", (uintmax_t)VM_PAGE_TO_PHYS(m));
 	}
 	VM_OBJECT_WUNLOCK(mem->am_obj);
 
 	mtx_lock(&sc->as_lock);
 
 	if (mem->am_is_bound) {
 		device_printf(dev, "memory already bound\n");
 		error = EINVAL;
 		/*
 		 * The cleanup at `bad' asserts that the object lock is held,
 		 * and i = 0 makes it unbusy every page.
 		 */
 		VM_OBJECT_WLOCK(mem->am_obj);
 		i = 0;
 		goto bad;
 	}
 	
 	/*
 	 * Bind the individual pages and flush the chipset's
 	 * TLB.
 	 */
 	VM_OBJECT_WLOCK(mem->am_obj);
 	for (i = 0; i < mem->am_size; i += PAGE_SIZE) {
 		m = vm_page_lookup(mem->am_obj, OFF_TO_IDX(i));
 
 		/*
 		 * Install entries in the GATT, making sure that if
 		 * AGP_PAGE_SIZE < PAGE_SIZE and mem->am_size is not
 		 * aligned to PAGE_SIZE, we don't modify too many GATT 
 		 * entries.
 		 */
 		for (j = 0; j < PAGE_SIZE && i + j < mem->am_size;
 		     j += AGP_PAGE_SIZE) {
 			vm_offset_t pa = VM_PAGE_TO_PHYS(m) + j;
 			AGP_DPF("binding offset %#jx to pa %#jx\n",
 				(uintmax_t)offset + i + j, (uintmax_t)pa);
 			error = AGP_BIND_PAGE(dev, offset + i + j, pa);
 			if (error) {
 				/*
 				 * Bail out. Reverse all the mappings
 				 * and unwire the pages.
 				 */
 				for (k = 0; k < i + j; k += AGP_PAGE_SIZE)
 					AGP_UNBIND_PAGE(dev, offset + k);
 				goto bad;
 			}
 		}
 		/* This page is fully mapped; release its busy state. */
 		vm_page_xunbusy(m);
 	}
 	VM_OBJECT_WUNLOCK(mem->am_obj);
 
 	/*
 	 * Flush the cpu cache since we are providing a new mapping
 	 * for these pages.
 	 */
 	agp_flush_cache();
 
 	/*
 	 * Make sure the chipset gets the new mappings.
 	 */
 	AGP_FLUSH_TLB(dev);
 
 	mem->am_offset = offset;
 	mem->am_is_bound = 1;
 
 	mtx_unlock(&sc->as_lock);
 
 	return 0;
 bad:
 	/*
 	 * Error exit, entered with as_lock and the object write lock held.
 	 * Pages below index i were already unbusied by the bind loop, so
 	 * only pages at or beyond i are unbusied here; every page is
 	 * unwired.
 	 */
 	mtx_unlock(&sc->as_lock);
 	VM_OBJECT_ASSERT_WLOCKED(mem->am_obj);
 	for (k = 0; k < mem->am_size; k += PAGE_SIZE) {
 		m = vm_page_lookup(mem->am_obj, OFF_TO_IDX(k));
 		if (k >= i)
 			vm_page_xunbusy(m);
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(mem->am_obj);
 
 	return error;
 }
 
 /*
  * Generic unbind method: remove the GATT entries covering `mem', unwire
  * its pages and mark the block unbound.  Runs with as_lock held
  * throughout.  Returns 0, or EINVAL when the block is not bound.
  */
 int
 agp_generic_unbind_memory(device_t dev, struct agp_memory *mem)
 {
 	struct agp_softc *sc = device_get_softc(dev);
 	vm_page_t m;
 	int i;
 
 	mtx_lock(&sc->as_lock);
 
 	if (!mem->am_is_bound) {
 		device_printf(dev, "memory is not bound\n");
 		mtx_unlock(&sc->as_lock);
 		return EINVAL;
 	}
 
 
 	/*
 	 * Unbind the individual pages and flush the chipset's
 	 * TLB. Unwire the pages so they can be swapped.
 	 */
 	/* GATT entries are removed in AGP_PAGE_SIZE steps ... */
 	for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE)
 		AGP_UNBIND_PAGE(dev, mem->am_offset + i);
 	/* ... while the backing pages are walked in PAGE_SIZE steps. */
 	VM_OBJECT_WLOCK(mem->am_obj);
 	for (i = 0; i < mem->am_size; i += PAGE_SIZE) {
 		m = vm_page_lookup(mem->am_obj, atop(i));
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(mem->am_obj);
 		
 	agp_flush_cache();
 	AGP_FLUSH_TLB(dev);
 
 	mem->am_offset = 0;
 	mem->am_is_bound = 0;
 
 	mtx_unlock(&sc->as_lock);
 
 	return 0;
 }
 
 /* Helper functions for implementing user/kernel api */
 
 /*
  * Claim the device for `state'.  Succeeds (0) only when the device is
  * currently free; otherwise returns EBUSY and leaves the state alone.
  */
 static int
 agp_acquire_helper(device_t dev, enum agp_acquire_state state)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	if (softc->as_state == AGP_ACQUIRE_FREE) {
 		softc->as_state = state;
 		return (0);
 	}
 
 	return (EBUSY);
 }
 
 /*
  * Give up a claim made with `state'.  Releasing an already free device
  * is a no-op that returns 0; releasing a device held in a different state
  * returns EBUSY.
  */
 static int
 agp_release_helper(device_t dev, enum agp_acquire_state state)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	if (softc->as_state != AGP_ACQUIRE_FREE) {
 		if (softc->as_state != state)
 			return (EBUSY);
 		softc->as_state = AGP_ACQUIRE_FREE;
 	}
 
 	return (0);
 }
 
 /*
  * Look up the memory block with the given id on the softc's list.
  * Returns NULL when no block carries that id.
  */
 static struct agp_memory *
 agp_find_memory(device_t dev, int id)
 {
 	struct agp_softc *softc;
 	struct agp_memory *cur;
 
 	softc = device_get_softc(dev);
 
 	AGP_DPF("searching for memory block %d\n", id);
 	for (cur = TAILQ_FIRST(&softc->as_memory); cur != NULL;
 	    cur = TAILQ_NEXT(cur, am_link)) {
 		AGP_DPF("considering memory block %d\n", cur->am_id);
 		if (cur->am_id == id)
 			return (cur);
 	}
 	return (NULL);
 }
 
 /* Implementation of the userland ioctl api */
 
 /*
  * AGPIOC_INFO: fill in the userland agp_info structure with the bridge
  * id, the AGP status register, the aperture base and size (in MB) and the
  * page accounting.  Always returns 0.
  */
 static int
 agp_info_user(device_t dev, agp_info *info)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 
 	bzero(info, sizeof(*info));
 	info->bridge_id = pci_get_devid(dev);
 	info->agp_mode =
 	    pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4);
 	if (softc->as_aperture == NULL)
 		info->aper_base = 0;
 	else
 		info->aper_base = rman_get_start(softc->as_aperture);
 	info->aper_size = AGP_GET_APERTURE(dev) >> 20;
 	info->pg_system = softc->as_maxmem >> AGP_PAGE_SHIFT;
 	info->pg_total = info->pg_system;
 	info->pg_used = softc->as_allocated >> AGP_PAGE_SHIFT;
 
 	return (0);
 }
 
 /*
  * AGPIOC_SETUP: enable the device with the mode supplied by userland.
  */
 static int
 agp_setup_user(device_t dev, agp_setup *setup)
 {
 	int error;
 
 	error = AGP_ENABLE(dev, setup->agp_mode);
 	return (error);
 }
 
 /*
  * AGPIOC_ALLOCATE: allocate pg_count AGP pages of the requested type and
  * report the new block's key and physical address back through `alloc'.
  * Returns ENOMEM when the allocation method yields no block.
  */
 static int
 agp_allocate_user(device_t dev, agp_allocate *alloc)
 {
 	struct agp_memory *block;
 
 	block = AGP_ALLOC_MEMORY(dev, alloc->type,
 	    alloc->pg_count << AGP_PAGE_SHIFT);
 	if (block == NULL)
 		return (ENOMEM);
 
 	alloc->key = block->am_id;
 	alloc->physical = block->am_physical;
 	return (0);
 }
 
 /*
  * AGPIOC_DEALLOCATE: free the memory block identified by `id'.
  *
  * Returns ENOENT when no block has that id.  Otherwise returns whatever
  * the free method reports, so a refused free is visible to the caller
  * instead of being reported as success (agp_generic_free_memory() returns
  * EBUSY while the block is still bound).
  */
 static int
 agp_deallocate_user(device_t dev, int id)
 {
 	struct agp_memory *mem = agp_find_memory(dev, id);
 
 	if (mem == NULL)
 		return (ENOENT);
 
 	return (AGP_FREE_MEMORY(dev, mem));
 }
 
 /*
  * AGPIOC_BIND: bind the block named by bind->key at AGP page number
  * bind->pg_start.  Returns ENOENT for an unknown key, otherwise the
  * result of the bind method.
  */
 static int
 agp_bind_user(device_t dev, agp_bind *bind)
 {
 	struct agp_memory *block;
 
 	block = agp_find_memory(dev, bind->key);
 	if (block == NULL)
 		return (ENOENT);
 
 	return (AGP_BIND_MEMORY(dev, block,
 	    bind->pg_start << AGP_PAGE_SHIFT));
 }
 
 /*
  * AGPIOC_UNBIND: unbind the block named by unbind->key.  Returns ENOENT
  * for an unknown key, otherwise the result of the unbind method.
  */
 static int
 agp_unbind_user(device_t dev, agp_unbind *unbind)
 {
 	struct agp_memory *block;
 
 	block = agp_find_memory(dev, unbind->key);
 	if (block == NULL)
 		return (ENOENT);
 
 	return (AGP_UNBIND_MEMORY(dev, block));
 }
 
 /*
  * AGPIOC_CHIPSET_FLUSH: forward to the driver's chipset flush method.
  */
 static int
 agp_chipset_flush(device_t dev)
 {
 	int error;
 
 	error = AGP_CHIPSET_FLUSH(dev);
 	return (error);
 }
 
 /*
  * d_open handler.  The first open marks the softc as open and marks the
  * device busy; later opens change nothing.  Always returns 0.
  */
 static int
 agp_open(struct cdev *kdev, int oflags, int devtype, struct thread *td)
 {
 	device_t dev;
 	struct agp_softc *softc;
 
 	dev = kdev->si_drv1;
 	softc = device_get_softc(dev);
 	if (softc->as_isopen)
 		return (0);
 
 	softc->as_isopen = 1;
 	device_busy(dev);
 	return (0);
 }
 
 static int
 agp_close(struct cdev *kdev, int fflag, int devtype, struct thread *td)
 {
 	device_t dev = kdev->si_drv1;
 	struct agp_softc *sc = device_get_softc(dev);
 	struct agp_memory *mem;
 
 	/*
 	 * Clear the GATT and force release on last close
 	 */
 	while ((mem = TAILQ_FIRST(&sc->as_memory)) != 0) {
 		if (mem->am_is_bound)
 			AGP_UNBIND_MEMORY(dev, mem);
 		AGP_FREE_MEMORY(dev, mem);
 	}
 	if (sc->as_state == AGP_ACQUIRE_USER)
 		agp_release_helper(dev, AGP_ACQUIRE_USER);
 	sc->as_isopen = 0;
 	device_unbusy(dev);
 
 	return 0;
 }
 
 /*
  * d_ioctl handler: dispatch each AGPIOC_* command to its helper.
  * Unrecognised commands return EINVAL.
  */
 static int
 agp_ioctl(struct cdev *kdev, u_long cmd, caddr_t data, int fflag, struct thread *td)
 {
 	device_t dev;
 	int error;
 
 	dev = kdev->si_drv1;
 
 	switch (cmd) {
 	case AGPIOC_INFO:
 		error = agp_info_user(dev, (agp_info *) data);
 		break;
 	case AGPIOC_ACQUIRE:
 		error = agp_acquire_helper(dev, AGP_ACQUIRE_USER);
 		break;
 	case AGPIOC_RELEASE:
 		error = agp_release_helper(dev, AGP_ACQUIRE_USER);
 		break;
 	case AGPIOC_SETUP:
 		error = agp_setup_user(dev, (agp_setup *)data);
 		break;
 	case AGPIOC_ALLOCATE:
 		error = agp_allocate_user(dev, (agp_allocate *)data);
 		break;
 	case AGPIOC_DEALLOCATE:
 		error = agp_deallocate_user(dev, *(int *) data);
 		break;
 	case AGPIOC_BIND:
 		error = agp_bind_user(dev, (agp_bind *)data);
 		break;
 	case AGPIOC_UNBIND:
 		error = agp_unbind_user(dev, (agp_unbind *)data);
 		break;
 	case AGPIOC_CHIPSET_FLUSH:
 		error = agp_chipset_flush(dev);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * d_mmap handler: translate an offset within the aperture into a
  * physical address.  Returns 0 with *paddr set, or -1 when there is no
  * aperture resource or the offset lies outside the aperture.
  */
 static int
 agp_mmap(struct cdev *kdev, vm_ooffset_t offset, vm_paddr_t *paddr,
     int prot, vm_memattr_t *memattr)
 {
 	device_t dev = kdev->si_drv1;
 	struct agp_softc *sc = device_get_softc(dev);
 
 	/* Without an aperture resource there is nothing to map. */
 	if (sc->as_aperture == NULL)
 		return (-1);
 
 	/*
 	 * Valid offsets are [0, aperture size).  An offset equal to the
 	 * size would map the page just past the end of the aperture, and a
 	 * negative offset (vm_ooffset_t is a signed type on FreeBSD) would
 	 * map memory below its start.
 	 */
 	if (offset < 0 || offset >= AGP_GET_APERTURE(dev))
 		return (-1);
 
 	*paddr = rman_get_start(sc->as_aperture) + offset;
 	return (0);
 }
 
 /* Implementation of the kernel api */
 
 /*
  * Return the first attached device in the "agp" devclass, or NULL when
  * the devclass has not been looked up yet, its device list cannot be
  * obtained, or none of its devices is attached.
  */
 device_t
 agp_find_device(void)
 {
 	device_t *children, child;
 	int i, count;
 
 	if (agp_devclass == NULL)
 		return (NULL);
 	if (devclass_get_devices(agp_devclass, &children, &count) != 0)
 		return (NULL);
 
 	child = NULL;
 	for (i = 0; i < count; i++) {
 		if (device_is_attached(children[i])) {
 			child = children[i];
 			break;
 		}
 	}
 	/* The device array is released here on every path. */
 	free(children, M_TEMP);
 	return (child);
 }
 
 /*
  * Report the device's current acquisition state.
  */
 enum agp_acquire_state
 agp_state(device_t dev)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 	return (softc->as_state);
 }
 
 /*
  * Kernel API: fill in `info' with the AGP status register, the aperture
  * base (0 when there is no aperture resource) and size, and the memory
  * limit and usage recorded in the softc.
  */
 void
 agp_get_info(device_t dev, struct agp_info *info)
 {
 	struct agp_softc *softc;
 
 	softc = device_get_softc(dev);
 
 	info->ai_mode =
 	    pci_read_config(dev, agp_find_caps(dev) + AGP_STATUS, 4);
 	if (softc->as_aperture == NULL)
 		info->ai_aperture_base = 0;
 	else
 		info->ai_aperture_base = rman_get_start(softc->as_aperture);
 	info->ai_aperture_size = AGP_GET_APERTURE(dev);
 	info->ai_memory_allowed = softc->as_maxmem;
 	info->ai_memory_used = softc->as_allocated;
 }
 
 /*
  * Kernel API: claim the device on behalf of a kernel consumer.
  */
 int
 agp_acquire(device_t dev)
 {
 
 	return (agp_acquire_helper(dev, AGP_ACQUIRE_KERNEL));
 }
 
 /*
  * Kernel API: drop a claim made with agp_acquire().
  */
 int
 agp_release(device_t dev)
 {
 
 	return (agp_release_helper(dev, AGP_ACQUIRE_KERNEL));
 }
 
 /*
  * Kernel API: enable the device with the requested mode word.
  */
 int
 agp_enable(device_t dev, u_int32_t mode)
 {
 
 	return (AGP_ENABLE(dev, mode));
 }
 
 /*
  * Kernel API: allocate AGP memory and return it as an opaque handle.
  */
 void *
 agp_alloc_memory(device_t dev, int type, vm_size_t bytes)
 {
 	struct agp_memory *block;
 
 	block = AGP_ALLOC_MEMORY(dev, type, bytes);
 	return ((void *)block);
 }
 
 /*
  * Kernel API: free the block behind an opaque handle.  The result of the
  * free method is not reported to the caller.
  */
 void
 agp_free_memory(device_t dev, void *handle)
 {
 	struct agp_memory *block;
 
 	block = (struct agp_memory *)handle;
 	AGP_FREE_MEMORY(dev, block);
 }
 
 /*
  * Kernel API: bind the block behind an opaque handle at `offset'.
  */
 int
 agp_bind_memory(device_t dev, void *handle, vm_offset_t offset)
 {
 	struct agp_memory *block;
 
 	block = (struct agp_memory *)handle;
 	return (AGP_BIND_MEMORY(dev, block, offset));
 }
 
 /*
  * Kernel API: unbind the block behind an opaque handle.
  */
 int
 agp_unbind_memory(device_t dev, void *handle)
 {
 	struct agp_memory *block;
 
 	block = (struct agp_memory *)handle;
 	return (AGP_UNBIND_MEMORY(dev, block));
 }
 
 /*
  * Kernel API: copy the size, physical address, aperture offset and bound
  * flag of the block behind an opaque handle into `mi'.
  */
 void
 agp_memory_info(device_t dev, void *handle, struct agp_memory_info *mi)
 {
 	const struct agp_memory *block = handle;
 
 	mi->ami_size = block->am_size;
 	mi->ami_physical = block->am_physical;
 	mi->ami_offset = block->am_offset;
 	mi->ami_is_bound = block->am_is_bound;
 }
Index: user/attilio/rm_vmobj_cache/sys/dev/agp/agp_i810.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/dev/agp/agp_i810.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/dev/agp/agp_i810.c	(revision 267237)
@@ -1,2570 +1,2570 @@
 /*-
  * Copyright (c) 2000 Doug Rabson
  * Copyright (c) 2000 Ruslan Ermilov
  * Copyright (c) 2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Fixes for 830/845G support: David Dawes <dawes@xfree86.org>
  * 852GM/855GM/865G support added by David Dawes <dawes@xfree86.org>
  *
  * This is generic Intel GTT handling code, morphed from the AGP
  * bridge code.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * KTR_AGP_I810 is 0 as shipped; changing the "#if 0" below to "#if 1"
  * defines it as KTR_DEV instead.
  * NOTE(review): presumably this is the KTR class mask for the trace points
  * in this file, so 0 leaves them disabled -- confirm against its users.
  */
 #if 0
 #define	KTR_AGP_I810	KTR_DEV
 #else
 #define	KTR_AGP_I810	0
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 
 #include <dev/agp/agppriv.h>
 #include <dev/agp/agpreg.h>
 #include <dev/agp/agp_i810.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pci_private.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <machine/md_var.h>
 #include <sys/rman.h>
 
 /* Malloc type M_AGP is declared here; its definition is not in this file's visible part. */
 MALLOC_DECLARE(M_AGP);
 
 /* Forward declaration so the prototypes below can take match pointers. */
 struct agp_i810_match;
 
 /*
  * Forward declarations of the per-chipset hooks.  They are grouped by the
  * struct agp_i810_driver member they are assigned to further down.
  */
 
 /* check_active: take the bridge device, return an int status. */
 static int agp_i810_check_active(device_t bridge_dev);
 static int agp_i830_check_active(device_t bridge_dev);
 static int agp_i915_check_active(device_t bridge_dev);
 static int agp_sb_check_active(device_t bridge_dev);
 
 /* set_desc: set the device description for a matched device. */
 static void agp_82852_set_desc(device_t dev,
     const struct agp_i810_match *match);
 static void agp_i810_set_desc(device_t dev, const struct agp_i810_match *match);
 
 /* dump_regs: print chipset registers. */
 static void agp_i810_dump_regs(device_t dev);
 static void agp_i830_dump_regs(device_t dev);
 static void agp_i855_dump_regs(device_t dev);
 static void agp_i915_dump_regs(device_t dev);
 static void agp_i965_dump_regs(device_t dev);
 static void agp_sb_dump_regs(device_t dev);
 
 /* get_stolen_size */
 static int agp_i810_get_stolen_size(device_t dev);
 static int agp_i830_get_stolen_size(device_t dev);
 static int agp_i915_get_stolen_size(device_t dev);
 static int agp_sb_get_stolen_size(device_t dev);
 
 /* get_gtt_mappable_entries */
 static int agp_i810_get_gtt_mappable_entries(device_t dev);
 static int agp_i830_get_gtt_mappable_entries(device_t dev);
 static int agp_i915_get_gtt_mappable_entries(device_t dev);
 
 /* get_gtt_total_entries */
 static int agp_i810_get_gtt_total_entries(device_t dev);
 static int agp_i965_get_gtt_total_entries(device_t dev);
 static int agp_gen5_get_gtt_total_entries(device_t dev);
 static int agp_sb_get_gtt_total_entries(device_t dev);
 
 /* install_gatt */
 static int agp_i810_install_gatt(device_t dev);
 static int agp_i830_install_gatt(device_t dev);
 
 /* deinstall_gatt */
 static void agp_i810_deinstall_gatt(device_t dev);
 static void agp_i830_deinstall_gatt(device_t dev);
 
 /* install_gtt_pte: install one GTT entry from a physical address and flags. */
 static void agp_i810_install_gtt_pte(device_t dev, u_int index,
     vm_offset_t physical, int flags);
 static void agp_i830_install_gtt_pte(device_t dev, u_int index,
     vm_offset_t physical, int flags);
 static void agp_i915_install_gtt_pte(device_t dev, u_int index,
     vm_offset_t physical, int flags);
 static void agp_i965_install_gtt_pte(device_t dev, u_int index,
     vm_offset_t physical, int flags);
 static void agp_g4x_install_gtt_pte(device_t dev, u_int index,
     vm_offset_t physical, int flags);
 static void agp_sb_install_gtt_pte(device_t dev, u_int index,
     vm_offset_t physical, int flags);
 
 /* write_gtt: write a 32-bit PTE value at the given index. */
 static void agp_i810_write_gtt(device_t dev, u_int index, uint32_t pte);
 static void agp_i915_write_gtt(device_t dev, u_int index, uint32_t pte);
 static void agp_i965_write_gtt(device_t dev, u_int index, uint32_t pte);
 static void agp_g4x_write_gtt(device_t dev, u_int index, uint32_t pte);
 static void agp_sb_write_gtt(device_t dev, u_int index, uint32_t pte);
 
 /* read_gtt_pte: read back the 32-bit PTE value at the given index. */
 static u_int32_t agp_i810_read_gtt_pte(device_t dev, u_int index);
 static u_int32_t agp_i915_read_gtt_pte(device_t dev, u_int index);
 static u_int32_t agp_i965_read_gtt_pte(device_t dev, u_int index);
 static u_int32_t agp_g4x_read_gtt_pte(device_t dev, u_int index);
 
 /* read_gtt_pte_paddr: same lookup, returned as a vm_paddr_t. */
 static vm_paddr_t agp_i810_read_gtt_pte_paddr(device_t dev, u_int index);
 static vm_paddr_t agp_i915_read_gtt_pte_paddr(device_t dev, u_int index);
 static vm_paddr_t agp_sb_read_gtt_pte_paddr(device_t dev, u_int index);
 
 /* set_aperture */
 static int agp_i810_set_aperture(device_t dev, u_int32_t aperture);
 static int agp_i830_set_aperture(device_t dev, u_int32_t aperture);
 static int agp_i915_set_aperture(device_t dev, u_int32_t aperture);
 
 /* chipset_flush_setup */
 static int agp_i810_chipset_flush_setup(device_t dev);
 static int agp_i915_chipset_flush_setup(device_t dev);
 static int agp_i965_chipset_flush_setup(device_t dev);
 
 /* chipset_flush_teardown */
 static void agp_i810_chipset_flush_teardown(device_t dev);
 static void agp_i915_chipset_flush_teardown(device_t dev);
 static void agp_i965_chipset_flush_teardown(device_t dev);
 
 /* chipset_flush */
 static void agp_i810_chipset_flush(device_t dev);
 static void agp_i830_chipset_flush(device_t dev);
 static void agp_i915_chipset_flush(device_t dev);
 
 /*
  * Chipset families, stored in agp_i810_driver.chiptype.  The values are
  * spelled out; they are the same ones the compiler assigned implicitly.
  */
 enum {
 	CHIP_I810 = 0,	/* i810/i815 */
 	CHIP_I830 = 1,	/* 830M/845G */
 	CHIP_I855 = 2,	/* 852GM/855GM/865G */
 	CHIP_I915 = 3,	/* 915G/915GM */
 	CHIP_I965 = 4,	/* G965 */
 	CHIP_G33 = 5,	/* G33/Q33/Q35 */
 	CHIP_IGD = 6,	/* Pineview */
 	CHIP_G4X = 7,	/* G45/Q45 */
 	CHIP_SB = 8,	/* SandyBridge */
 };
 
 /*
  * Register/GATT BAR layout by chipset family:
  *  - i810 through i855: registers at BAR 1; the GATT gets allocated by us.
  *  - i915: registers in BAR 0; the GATT is at the start of the stolen
  *    memory and should only be accessed by the OS through BAR 3.
  *  - G965: registers and GATT in the same BAR (0) -- the first 512KB is
  *    registers, the second 512KB is GATT.
  */
 static struct resource_spec agp_i810_res_spec[] = {
 	{
 		.type = SYS_RES_MEMORY,
 		.rid = AGP_I810_MMADR,
 		.flags = RF_ACTIVE | RF_SHAREABLE
 	},
 	{ .type = -1, .rid = 0 }	/* end of list */
 };
 
 /* i915-style layout: one memory resource for MMADR, one for GTTADR. */
 static struct resource_spec agp_i915_res_spec[] = {
 	{
 		.type = SYS_RES_MEMORY,
 		.rid = AGP_I915_MMADR,
 		.flags = RF_ACTIVE | RF_SHAREABLE
 	},
 	{
 		.type = SYS_RES_MEMORY,
 		.rid = AGP_I915_GTTADR,
 		.flags = RF_ACTIVE | RF_SHAREABLE
 	},
 	{ .type = -1, .rid = 0 }	/* end of list */
 };
 
 /* i965-style layout: a single memory resource, GTTMMADR. */
 static struct resource_spec agp_i965_res_spec[] = {
 	{
 		.type = SYS_RES_MEMORY,
 		.rid = AGP_I965_GTTMMADR,
 		.flags = RF_ACTIVE | RF_SHAREABLE
 	},
 	{ .type = -1, .rid = 0 }	/* end of list */
 };
 
 /* G4X-style layout: one memory resource for MMADR, one for GTTADR. */
 static struct resource_spec agp_g4x_res_spec[] = {
 	{
 		.type = SYS_RES_MEMORY,
 		.rid = AGP_G4X_MMADR,
 		.flags = RF_ACTIVE | RF_SHAREABLE
 	},
 	{
 		.type = SYS_RES_MEMORY,
 		.rid = AGP_G4X_GTTADR,
 		.flags = RF_ACTIVE | RF_SHAREABLE
 	},
 	{ .type = -1, .rid = 0 }	/* end of list */
 };
 
 /*
  * Per-device state of the Intel GTT driver.  The member order is kept as
  * is; the generic struct agp_softc comes first.
  */
 struct agp_i810_softc {
 	struct agp_softc agp;		/* generic AGP state */
 	uint32_t initial_aperture;	/* aperture size at startup */
 	struct agp_gatt *gatt;
 	uint32_t dcache_size;		/* i810 only */
 	uint32_t stolen;		/* number of i830/845 gtt entries
 					   for stolen memory */
 	u_int stolen_size;		/* BIOS-reserved graphics memory */
 	u_int gtt_total_entries;	/* total number of gtt ptes */
 	u_int gtt_mappable_entries;	/* number of gtt ptes mappable by
 					   the CPU */
 	device_t bdev;			/* bridge device */
 	void *argb_cursor;		/* contigmalloc area for ARGB cursor */
 	struct resource *sc_res[2];	/* NOTE(review): presumably filled
 					   from the driver's res_spec (at
 					   most two entries) -- confirm */
 	const struct agp_i810_match *match;	/* matched table entry */
 	int sc_flush_page_rid;
 	struct resource *sc_flush_page_res;
 	void *sc_flush_page_vaddr;
 	int sc_bios_allocated_flush_page;
 };
 
 /*
  * NOTE(review): file-scope device handle; it is not assigned in the part
  * of the file reviewed here -- presumably the attached Intel AGP device,
  * confirm against attach/detach.
  */
 static device_t intel_agp;
 
 struct agp_i810_driver {
 	int chiptype;
 	int gen;
 	int busdma_addr_mask_sz;
 	struct resource_spec *res_spec;
 	int (*check_active)(device_t);
 	void (*set_desc)(device_t, const struct agp_i810_match *);
 	void (*dump_regs)(device_t);
 	int (*get_stolen_size)(device_t);
 	int (*get_gtt_total_entries)(device_t);
 	int (*get_gtt_mappable_entries)(device_t);
 	int (*install_gatt)(device_t);
 	void (*deinstall_gatt)(device_t);
 	void (*write_gtt)(device_t, u_int, uint32_t);
 	void (*install_gtt_pte)(device_t, u_int, vm_offset_t, int);
 	u_int32_t (*read_gtt_pte)(device_t, u_int);
 	vm_paddr_t (*read_gtt_pte_paddr)(device_t , u_int);
 	int (*set_aperture)(device_t, u_int32_t);
 	int (*chipset_flush_setup)(device_t);
 	void (*chipset_flush_teardown)(device_t);
 	void (*chipset_flush)(device_t);
 };
 
 /* Ops table for CHIP_I810, gen 1: every hook is the i810 variant. */
 static const struct agp_i810_driver agp_i810_i810_driver = {
 	/* description */
 	.chiptype = CHIP_I810,
 	.gen = 1,
 	.busdma_addr_mask_sz = 32,
 	.res_spec = agp_i810_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i810_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i810_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i810_get_stolen_size,
 	.get_gtt_total_entries = agp_i810_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i810_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i810_install_gatt,
 	.deinstall_gatt = agp_i810_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i810_write_gtt,
 	.install_gtt_pte = agp_i810_install_gtt_pte,
 	.read_gtt_pte = agp_i810_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i810_set_aperture,
 	.chipset_flush_setup = agp_i810_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i810_chipset_flush_teardown,
 	.chipset_flush = agp_i810_chipset_flush,
 };
 
 /*
  * Ops table for the i815: CHIP_I810 chiptype with gen 2.  It uses the i810
  * hooks except for the i830 mappable-entries and chipset-flush variants.
  */
 static const struct agp_i810_driver agp_i810_i815_driver = {
 	/* description */
 	.chiptype = CHIP_I810,
 	.gen = 2,
 	.busdma_addr_mask_sz = 32,
 	.res_spec = agp_i810_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i810_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i810_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i810_get_stolen_size,
 	.get_gtt_total_entries = agp_i810_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i830_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i810_install_gatt,
 	.deinstall_gatt = agp_i810_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i810_write_gtt,
 	.install_gtt_pte = agp_i810_install_gtt_pte,
 	.read_gtt_pte = agp_i810_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i810_set_aperture,
 	.chipset_flush_setup = agp_i810_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i810_chipset_flush_teardown,
 	.chipset_flush = agp_i830_chipset_flush,
 };
 
 /* Ops table for CHIP_I830, gen 2. */
 static const struct agp_i810_driver agp_i810_i830_driver = {
 	/* description */
 	.chiptype = CHIP_I830,
 	.gen = 2,
 	.busdma_addr_mask_sz = 32,
 	.res_spec = agp_i810_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i830_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i830_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i830_get_stolen_size,
 	.get_gtt_total_entries = agp_i810_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i830_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i810_write_gtt,
 	.install_gtt_pte = agp_i830_install_gtt_pte,
 	.read_gtt_pte = agp_i810_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i830_set_aperture,
 	.chipset_flush_setup = agp_i810_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i810_chipset_flush_teardown,
 	.chipset_flush = agp_i830_chipset_flush,
 };
 
 /*
  * Ops table for CHIP_I855, gen 2.  The description is chosen at probe
  * time by agp_82852_set_desc() rather than taken from the match table.
  */
 static const struct agp_i810_driver agp_i810_i855_driver = {
 	/* description */
 	.chiptype = CHIP_I855,
 	.gen = 2,
 	.busdma_addr_mask_sz = 32,
 	.res_spec = agp_i810_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i830_check_active,
 	.set_desc = agp_82852_set_desc,
 	.dump_regs = agp_i855_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_total_entries = agp_i810_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i810_write_gtt,
 	.install_gtt_pte = agp_i830_install_gtt_pte,
 	.read_gtt_pte = agp_i810_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i830_set_aperture,
 	.chipset_flush_setup = agp_i810_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i810_chipset_flush_teardown,
 	.chipset_flush = agp_i830_chipset_flush,
 };
 
 /*
  * Ops table for the 865G: CHIP_I855 chiptype, gen 2.  Differs from the
  * i855 table in set_desc and set_aperture.
  */
 static const struct agp_i810_driver agp_i810_i865_driver = {
 	/* description */
 	.chiptype = CHIP_I855,
 	.gen = 2,
 	.busdma_addr_mask_sz = 32,
 	.res_spec = agp_i810_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i830_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i855_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_total_entries = agp_i810_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i810_write_gtt,
 	.install_gtt_pte = agp_i830_install_gtt_pte,
 	.read_gtt_pte = agp_i810_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i810_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i810_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i810_chipset_flush_teardown,
 	.chipset_flush = agp_i830_chipset_flush,
 };
 
 /* Ops table for CHIP_I915, gen 3. */
 static const struct agp_i810_driver agp_i810_i915_driver = {
 	/* description */
 	.chiptype = CHIP_I915,
 	.gen = 3,
 	.busdma_addr_mask_sz = 32,
 	.res_spec = agp_i915_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i915_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i915_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_total_entries = agp_i810_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i915_write_gtt,
 	.install_gtt_pte = agp_i915_install_gtt_pte,
 	.read_gtt_pte = agp_i915_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i915_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i915_chipset_flush_teardown,
 	.chipset_flush = agp_i915_chipset_flush,
 };
 
 /* Ops table for CHIP_I965, gen 4, with a busdma address mask size of 36. */
 static const struct agp_i810_driver agp_i810_g965_driver = {
 	/* description */
 	.chiptype = CHIP_I965,
 	.gen = 4,
 	.busdma_addr_mask_sz = 36,
 	.res_spec = agp_i965_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_i915_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i965_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_total_entries = agp_i965_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_i965_write_gtt,
 	.install_gtt_pte = agp_i965_install_gtt_pte,
 	.read_gtt_pte = agp_i965_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i965_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i965_chipset_flush_teardown,
 	.chipset_flush = agp_i915_chipset_flush,
 };
 
 static const struct agp_i810_driver agp_i810_g33_driver = {
 	.chiptype = CHIP_G33,
 	.gen = 3,
 	.busdma_addr_mask_sz = 36,
 	.res_spec = agp_i915_res_spec,
 	.check_active = agp_i915_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i965_dump_regs,
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	.get_gtt_total_entries = agp_i965_get_gtt_total_entries,
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	.write_gtt = agp_i915_write_gtt,
 	.install_gtt_pte = agp_i915_install_gtt_pte,
 	.read_gtt_pte = agp_i915_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr,
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i965_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i965_chipset_flush_teardown,
 	.chipset_flush = agp_i915_chipset_flush,
 };
 
 static const struct agp_i810_driver agp_i810_igd_driver = {
 	.chiptype = CHIP_IGD,
 	.gen = 3,
 	.busdma_addr_mask_sz = 36,
 	.res_spec = agp_i915_res_spec,
 	.check_active = agp_i915_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i915_dump_regs,
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	.get_gtt_total_entries = agp_i965_get_gtt_total_entries,
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	.write_gtt = agp_i915_write_gtt,
 	.install_gtt_pte = agp_i915_install_gtt_pte,
 	.read_gtt_pte = agp_i915_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr,
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i965_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i965_chipset_flush_teardown,
 	.chipset_flush = agp_i915_chipset_flush,
 };
 
 static const struct agp_i810_driver agp_i810_g4x_driver = {
 	.chiptype = CHIP_G4X,
 	.gen = 5,
 	.busdma_addr_mask_sz = 36,
 	.res_spec = agp_i965_res_spec,
 	.check_active = agp_i915_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_i965_dump_regs,
 	.get_stolen_size = agp_i915_get_stolen_size,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	.get_gtt_total_entries = agp_gen5_get_gtt_total_entries,
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	.write_gtt = agp_g4x_write_gtt,
 	.install_gtt_pte = agp_g4x_install_gtt_pte,
 	.read_gtt_pte = agp_g4x_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_i915_read_gtt_pte_paddr,
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i965_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i965_chipset_flush_teardown,
 	.chipset_flush = agp_i915_chipset_flush,
 };
 
 /*
  * Ops table for CHIP_SB, gen 6, with a busdma address mask size of 40.
  * The match table uses it for both SandyBridge and IvyBridge devices.
  */
 static const struct agp_i810_driver agp_i810_sb_driver = {
 	/* description */
 	.chiptype = CHIP_SB,
 	.gen = 6,
 	.busdma_addr_mask_sz = 40,
 	.res_spec = agp_g4x_res_spec,
 	/* probe and diagnostics */
 	.check_active = agp_sb_check_active,
 	.set_desc = agp_i810_set_desc,
 	.dump_regs = agp_sb_dump_regs,
 	/* sizing */
 	.get_stolen_size = agp_sb_get_stolen_size,
 	.get_gtt_total_entries = agp_sb_get_gtt_total_entries,
 	.get_gtt_mappable_entries = agp_i915_get_gtt_mappable_entries,
 	/* GATT */
 	.install_gatt = agp_i830_install_gatt,
 	.deinstall_gatt = agp_i830_deinstall_gatt,
 	/* GTT entries */
 	.write_gtt = agp_sb_write_gtt,
 	.install_gtt_pte = agp_sb_install_gtt_pte,
 	.read_gtt_pte = agp_g4x_read_gtt_pte,
 	.read_gtt_pte_paddr = agp_sb_read_gtt_pte_paddr,
 	/* aperture and flush */
 	.set_aperture = agp_i915_set_aperture,
 	.chipset_flush_setup = agp_i810_chipset_flush_setup,
 	.chipset_flush_teardown = agp_i810_chipset_flush_teardown,
 	.chipset_flush = agp_i810_chipset_flush,
 };
 
 /*
  * Table of supported graphics controllers.
  *
  * For adding new devices, devid is the id of the graphics controller
  * (pci:0:2:0, for example).  The placeholder (usually at pci:0:2:1) for the
  * second head should never be added.  Each entry carries the description
  * string and the ops table for its chipset; the table is terminated by an
  * entry whose devid is 0.
  */
 static const struct agp_i810_match {
 	int devid;
 	char *name;
 	const struct agp_i810_driver *driver;
 } agp_i810_matches[] = {
 	/* i810 / i815 */
 	{ .devid = 0x71218086, .driver = &agp_i810_i810_driver,
 	  .name = "Intel 82810 (i810 GMCH) SVGA controller" },
 	{ .devid = 0x71238086, .driver = &agp_i810_i810_driver,
 	  .name = "Intel 82810-DC100 (i810-DC100 GMCH) SVGA controller" },
 	{ .devid = 0x71258086, .driver = &agp_i810_i810_driver,
 	  .name = "Intel 82810E (i810E GMCH) SVGA controller" },
 	{ .devid = 0x11328086, .driver = &agp_i810_i815_driver,
 	  .name = "Intel 82815 (i815 GMCH) SVGA controller" },
 
 	/* i830 / i845 */
 	{ .devid = 0x35778086, .driver = &agp_i810_i830_driver,
 	  .name = "Intel 82830M (830M GMCH) SVGA controller" },
 	{ .devid = 0x25628086, .driver = &agp_i810_i830_driver,
 	  .name = "Intel 82845M (845M GMCH) SVGA controller" },
 
 	/* i852 / i855 / i865 */
 	{ .devid = 0x35828086, .driver = &agp_i810_i855_driver,
 	  .name = "Intel 82852/855GM SVGA controller" },
 	{ .devid = 0x25728086, .driver = &agp_i810_i865_driver,
 	  .name = "Intel 82865G (865G GMCH) SVGA controller" },
 
 	/* i915 / i945 */
 	{ .devid = 0x25828086, .driver = &agp_i810_i915_driver,
 	  .name = "Intel 82915G (915G GMCH) SVGA controller" },
 	{ .devid = 0x258A8086, .driver = &agp_i810_i915_driver,
 	  .name = "Intel E7221 SVGA controller" },
 	{ .devid = 0x25928086, .driver = &agp_i810_i915_driver,
 	  .name = "Intel 82915GM (915GM GMCH) SVGA controller" },
 	{ .devid = 0x27728086, .driver = &agp_i810_i915_driver,
 	  .name = "Intel 82945G (945G GMCH) SVGA controller" },
 	{ .devid = 0x27A28086, .driver = &agp_i810_i915_driver,
 	  .name = "Intel 82945GM (945GM GMCH) SVGA controller" },
 	{ .devid = 0x27AE8086, .driver = &agp_i810_i915_driver,
 	  .name = "Intel 945GME SVGA controller" },
 
 	/* i965 */
 	{ .devid = 0x29728086, .driver = &agp_i810_g965_driver,
 	  .name = "Intel 946GZ SVGA controller" },
 	{ .devid = 0x29828086, .driver = &agp_i810_g965_driver,
 	  .name = "Intel G965 SVGA controller" },
 	{ .devid = 0x29928086, .driver = &agp_i810_g965_driver,
 	  .name = "Intel Q965 SVGA controller" },
 	{ .devid = 0x29A28086, .driver = &agp_i810_g965_driver,
 	  .name = "Intel G965 SVGA controller" },
 
 	/* G33 */
 	{ .devid = 0x29B28086, .driver = &agp_i810_g33_driver,
 	  .name = "Intel Q35 SVGA controller" },
 	{ .devid = 0x29C28086, .driver = &agp_i810_g33_driver,
 	  .name = "Intel G33 SVGA controller" },
 	{ .devid = 0x29D28086, .driver = &agp_i810_g33_driver,
 	  .name = "Intel Q33 SVGA controller" },
 
 	/* Pineview */
 	{ .devid = 0xA0018086, .driver = &agp_i810_igd_driver,
 	  .name = "Intel Pineview SVGA controller" },
 	{ .devid = 0xA0118086, .driver = &agp_i810_igd_driver,
 	  .name = "Intel Pineview (M) SVGA controller" },
 
 	/* Mobile i965 */
 	{ .devid = 0x2A028086, .driver = &agp_i810_g965_driver,
 	  .name = "Intel GM965 SVGA controller" },
 	{ .devid = 0x2A128086, .driver = &agp_i810_g965_driver,
 	  .name = "Intel GME965 SVGA controller" },
 
 	/* G4X / Ironlake */
 	{ .devid = 0x2A428086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel GM45 SVGA controller" },
 	{ .devid = 0x2E028086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel Eaglelake SVGA controller" },
 	{ .devid = 0x2E128086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel Q45 SVGA controller" },
 	{ .devid = 0x2E228086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel G45 SVGA controller" },
 	{ .devid = 0x2E328086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel G41 SVGA controller" },
 	{ .devid = 0x00428086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel Ironlake (D) SVGA controller" },
 	{ .devid = 0x00468086, .driver = &agp_i810_g4x_driver,
 	  .name = "Intel Ironlake (M) SVGA controller" },
 
 	/* SandyBridge */
 	{ .devid = 0x01028086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge desktop GT1 IG" },
 	{ .devid = 0x01128086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge desktop GT2 IG" },
 	{ .devid = 0x01228086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge desktop GT2+ IG" },
 	{ .devid = 0x01068086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge mobile GT1 IG" },
 	{ .devid = 0x01168086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge mobile GT2 IG" },
 	{ .devid = 0x01268086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge mobile GT2+ IG" },
 	{ .devid = 0x010a8086, .driver = &agp_i810_sb_driver,
 	  .name = "SandyBridge server IG" },
 
 	/* IvyBridge */
 	{ .devid = 0x01528086, .driver = &agp_i810_sb_driver,
 	  .name = "IvyBridge desktop GT1 IG" },
 	{ .devid = 0x01628086, .driver = &agp_i810_sb_driver,
 	  .name = "IvyBridge desktop GT2 IG" },
 	{ .devid = 0x01568086, .driver = &agp_i810_sb_driver,
 	  .name = "IvyBridge mobile GT1 IG" },
 	{ .devid = 0x01668086, .driver = &agp_i810_sb_driver,
 	  .name = "IvyBridge mobile GT2 IG" },
 	{ .devid = 0x015a8086, .driver = &agp_i810_sb_driver,
 	  .name = "IvyBridge server GT1 IG" },
 	{ .devid = 0x016a8086, .driver = &agp_i810_sb_driver,
 	  .name = "IvyBridge server GT2 IG" },
 
 	/* Terminator. */
 	{ .devid = 0 }
 };
 
 /*
  * Look up 'dev' in agp_i810_matches.  Only PCI display/VGA devices are
  * considered.  Returns the matching table entry, or NULL when the device
  * is of another class or its devid is not listed.
  */
 static const struct agp_i810_match*
 agp_i810_match(device_t dev)
 {
 	const struct agp_i810_match *entry;
 	int devid;
 
 	if (pci_get_class(dev) != PCIC_DISPLAY ||
 	    pci_get_subclass(dev) != PCIS_DISPLAY_VGA)
 		return (NULL);
 
 	devid = pci_get_devid(dev);
 	/* The table ends with an entry whose devid is 0. */
 	for (entry = agp_i810_matches; entry->devid != 0; entry++) {
 		if (entry->devid == devid)
 			return (entry);
 	}
 	return (NULL);
 }
 
 /*
  * Find the bridge device: the result of pci_find_dbsf(0, 0, 0, 0) is
  * returned as is.  'dev' is not consulted.
  */
 static device_t
 agp_i810_find_bridge(device_t dev)
 {
 	device_t bridge;
 
 	bridge = pci_find_dbsf(0, 0, 0, 0);
 	return (bridge);
 }
 
 /*
  * Identify hook: add an "agp" child to 'parent' when it has none yet and
  * 'parent' itself is listed in the match table.  'driver' is unused.
  */
 static void
 agp_i810_identify(driver_t *driver, device_t parent)
 {
 
 	/* Do not add a second "agp" child. */
 	if (device_find_child(parent, "agp", -1) != NULL)
 		return;
 	if (agp_i810_match(parent) == NULL)
 		return;
 	device_add_child(parent, "agp", -1);
 }
 
 /*
  * i810: read the one-byte SMRAM register of the bridge and test its GMS
  * field.  Returns ENXIO when the field reads as disabled, 0 otherwise.
  */
 static int
 agp_i810_check_active(device_t bridge_dev)
 {
 	u_int8_t reg;
 
 	reg = pci_read_config(bridge_dev, AGP_I810_SMRAM, 1);
 	return ((reg & AGP_I810_SMRAM_GMS) != AGP_I810_SMRAM_GMS_DISABLED ?
 	    0 : ENXIO);
 }
 
 /*
  * i830: read the one-byte GCC1 register of the bridge and test its DEV2
  * field.  Returns ENXIO when the field reads as disabled, 0 otherwise.
  */
 static int
 agp_i830_check_active(device_t bridge_dev)
 {
 	int reg;
 
 	reg = pci_read_config(bridge_dev, AGP_I830_GCC1, 1);
 	return ((reg & AGP_I830_GCC1_DEV2) != AGP_I830_GCC1_DEV2_DISABLED ?
 	    0 : ENXIO);
 }
 
 /*
  * i915: read the four-byte DEVEN register of the bridge and test its D2F0
  * field.  Returns ENXIO when the field reads as disabled, 0 otherwise.
  */
 static int
 agp_i915_check_active(device_t bridge_dev)
 {
 	int reg;
 
 	reg = pci_read_config(bridge_dev, AGP_I915_DEVEN, 4);
 	return ((reg & AGP_I915_DEVEN_D2F0) != AGP_I915_DEVEN_D2F0_DISABLED ?
 	    0 : ENXIO);
 }
 
 /*
  * SandyBridge: read the four-byte DEVEN register of the bridge (same
  * AGP_I915_DEVEN offset) and test its D2EN field.  Returns ENXIO when the
  * field reads as disabled, 0 otherwise.
  */
 static int
 agp_sb_check_active(device_t bridge_dev)
 {
 	int reg;
 
 	reg = pci_read_config(bridge_dev, AGP_I915_DEVEN, 4);
 	return ((reg & AGP_SB_DEVEN_D2EN) != AGP_SB_DEVEN_D2EN_DISABLED ?
 	    0 : ENXIO);
 }
 
 /*
  * Set the device description for the 852/855 family.  The one-byte CAPID
  * register of 'dev' selects the exact variant; unknown values get a
  * generic description.  'match' is not used.
  */
 static void
 agp_82852_set_desc(device_t dev, const struct agp_i810_match *match)
 {
 	const char *desc;
 
 	switch (pci_read_config(dev, AGP_I85X_CAPID, 1)) {
 	case AGP_I855_GME:
 		desc = "Intel 82855GME (855GME GMCH) SVGA controller";
 		break;
 	case AGP_I855_GM:
 		desc = "Intel 82855GM (855GM GMCH) SVGA controller";
 		break;
 	case AGP_I852_GME:
 		desc = "Intel 82852GME (852GME GMCH) SVGA controller";
 		break;
 	case AGP_I852_GM:
 		desc = "Intel 82852GM (852GM GMCH) SVGA controller";
 		break;
 	default:
 		desc = "Intel 8285xM (85xGM GMCH) SVGA controller";
 		break;
 	}
 	device_set_desc(dev, desc);
 }
 
 /* Set the device description to the name stored in the match entry. */
 static void
 agp_i810_set_desc(device_t dev, const struct agp_i810_match *match)
 {
 	const char *desc;
 
 	desc = match->name;
 	device_set_desc(dev, desc);
 }
 
 /*
  * Probe hook.  Returns ENXIO when the "agp" unit is disabled by a hint,
  * the device is not in the match table, or no bridge device is found;
  * returns the check_active error when the chipset hook reports the
  * graphics device as inactive.  Otherwise the description is set and
  * BUS_PROBE_DEFAULT is returned.
  */
 static int
 agp_i810_probe(device_t dev)
 {
 	const struct agp_i810_match *entry;
 	const struct agp_i810_driver *drv;
 	device_t bridge;
 	int error;
 
 	if (resource_disabled("agp", device_get_unit(dev)))
 		return (ENXIO);
 
 	entry = agp_i810_match(dev);
 	if (entry == NULL)
 		return (ENXIO);
 	drv = entry->driver;
 
 	bridge = agp_i810_find_bridge(dev);
 	if (bridge == NULL) {
 		if (bootverbose)
 			printf("I810: can't find bridge device\n");
 		return (ENXIO);
 	}
 
 	/* Check whether the internal graphics device has been activated. */
 	error = drv->check_active(bridge);
 	if (error != 0) {
 		if (bootverbose)
 			printf("i810: disabled, not probing\n");
 		return (error);
 	}
 
 	drv->set_desc(dev, entry);
 	return (BUS_PROBE_DEFAULT);
 }
 
 /*
  * Print PGTBL_CTL (read from the first memory resource) and the bridge's
  * two-byte MISCC config register.
  */
 static void
 agp_i810_dump_regs(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t val;
 
 	sc = device_get_softc(dev);
 
 	val = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL);
 	device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I810_MISCC, 2);
 	device_printf(dev, "AGP_I810_MISCC: 0x%04x\n", val);
 }
 
 /*
  * Print PGTBL_CTL (read from the first memory resource) and the bridge's
  * one-byte i830 GCC1 config register.
  */
 static void
 agp_i830_dump_regs(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t val;
 
 	sc = device_get_softc(dev);
 
 	val = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL);
 	device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I830_GCC1, 1);
 	device_printf(dev, "AGP_I830_GCC1: 0x%02x\n", val);
 }
 
 /*
  * Print PGTBL_CTL (read from the first memory resource) and the bridge's
  * one-byte i855 GCC1 config register.
  */
 static void
 agp_i855_dump_regs(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t val;
 
 	sc = device_get_softc(dev);
 
 	val = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL);
 	device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I855_GCC1, 1);
 	device_printf(dev, "AGP_I855_GCC1: 0x%02x\n", val);
 }
 
 /*
  * Print PGTBL_CTL (read from the first memory resource) and the bridge's
  * one-byte GCC1 and MSAC config registers.
  */
 static void
 agp_i915_dump_regs(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t val;
 
 	sc = device_get_softc(dev);
 
 	val = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL);
 	device_printf(dev, "AGP_I810_PGTBL_CTL: %08x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I855_GCC1, 1);
 	device_printf(dev, "AGP_I855_GCC1: 0x%02x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I915_MSAC, 1);
 	device_printf(dev, "AGP_I915_MSAC: 0x%02x\n", val);
 }
 
 /*
  * Print PGTBL_CTL2 (read from the first memory resource) and the bridge's
  * one-byte GCC1 and i965 MSAC config registers.
  */
 static void
 agp_i965_dump_regs(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t val;
 
 	sc = device_get_softc(dev);
 
 	val = bus_read_4(sc->sc_res[0], AGP_I965_PGTBL_CTL2);
 	device_printf(dev, "AGP_I965_PGTBL_CTL2: %08x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I855_GCC1, 1);
 	device_printf(dev, "AGP_I855_GCC1: 0x%02x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_I965_MSAC, 1);
 	device_printf(dev, "AGP_I965_MSAC: 0x%02x\n", val);
 }
 
 /*
  * Print GFX_MODE (read from the first memory resource) and the bridge's
  * two-byte SNB GCC1 config register.
  */
 static void
 agp_sb_dump_regs(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t val;
 
 	sc = device_get_softc(dev);
 
 	val = bus_read_4(sc->sc_res[0], AGP_SNB_GFX_MODE);
 	device_printf(dev, "AGP_SNB_GFX_MODE: %08x\n", val);
 
 	val = pci_read_config(sc->bdev, AGP_SNB_GCC1, 2);
 	device_printf(dev, "AGP_SNB_GCC1: 0x%04x\n", val);
 }
 
 /*
  * i810: record that there is no stolen memory (both the entry count and
  * the byte size are set to 0).  Always returns 0.
  */
 static int
 agp_i810_get_stolen_size(device_t dev)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	softc->stolen = 0;
 	softc->stolen_size = 0;
 	return (0);
 }
 
 /*
  * i830: derive the stolen-memory size from the GMS field of the bridge's
  * one-byte GCC1 register and store it in the softc, both as a byte count
  * (stolen_size) and as a count of 4096-byte entries (stolen) after
  * subtracting 132KB.  For an unrecognized GMS value 'stolen' is cleared,
  * a message is printed and EINVAL is returned; otherwise returns 0.
  */
 static int
 agp_i830_get_stolen_size(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	unsigned int gcc1, stolen_kb;
 
 	sc = device_get_softc(dev);
 
 	gcc1 = pci_read_config(sc->bdev, AGP_I830_GCC1, 1);
 	switch (gcc1 & AGP_I830_GCC1_GMS) {
 	case AGP_I830_GCC1_GMS_STOLEN_512:
 		stolen_kb = 512;
 		break;
 	case AGP_I830_GCC1_GMS_STOLEN_1024:
 		stolen_kb = 1024;
 		break;
 	case AGP_I830_GCC1_GMS_STOLEN_8192:
 		stolen_kb = 8192;
 		break;
 	default:
 		sc->stolen = 0;
 		device_printf(dev,
 		    "unknown memory configuration, disabling (GCC1 %x)\n",
 		    gcc1);
 		return (EINVAL);
 	}
 
 	/*
 	 * NOTE(review): the 132KB deduction is taken from the original
 	 * code; what it reserves is not stated in this file -- confirm.
 	 */
 	sc->stolen = (stolen_kb - 132) * 1024 / 4096;
 	sc->stolen_size = stolen_kb * 1024;
 	return (0);
 }
 
 /*
  * Work out how much memory the BIOS has stolen on i855 through G4X parts.
  *
  * The GTT size (in KB) is derived per chip type first, then the stolen size
  * (in KB) is decoded from the GMS field of GCC1 (called MGGC on i915+).
  * On success sc->stolen_size holds the stolen size in bytes and sc->stolen
  * the number of 4KB pages left once the GTT and 4KB of BIOS display space
  * are deducted.  If the decoded stolen size does not exceed that overhead
  * (including the encodings that decode to 0 on chips that do not support
  * them), sc->stolen is set to 0 instead of letting the unsigned subtraction
  * wrap around.
  *
  * Returns 0 on success, or EINVAL for an unknown chip type, page table
  * size or memory configuration.
  */
 static int
 agp_i915_get_stolen_size(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	unsigned int gcc1, stolen, gtt_size;
 	int gen3_plus, g4x_class;
 
 	sc = device_get_softc(dev);
 	gen3_plus = sc->match->driver->gen > 2;
 	g4x_class = sc->match->driver->chiptype == CHIP_I965 ||
 	    sc->match->driver->chiptype == CHIP_G4X;
 
 	/*
 	 * Stolen memory is set up at the beginning of the aperture by
 	 * the BIOS, consisting of the GATT followed by 4kb for the
 	 * BIOS display.
 	 */
 	switch (sc->match->driver->chiptype) {
 	case CHIP_I855:
 		gtt_size = 128;
 		break;
 	case CHIP_I915:
 		gtt_size = 256;
 		break;
 	case CHIP_I965:
 		switch (bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL) &
 			AGP_I810_PGTBL_SIZE_MASK) {
 		case AGP_I810_PGTBL_SIZE_128KB:
 			gtt_size = 128;
 			break;
 		case AGP_I810_PGTBL_SIZE_256KB:
 			gtt_size = 256;
 			break;
 		case AGP_I810_PGTBL_SIZE_512KB:
 			gtt_size = 512;
 			break;
 		case AGP_I965_PGTBL_SIZE_1MB:
 			gtt_size = 1024;
 			break;
 		case AGP_I965_PGTBL_SIZE_2MB:
 			gtt_size = 2048;
 			break;
 		case AGP_I965_PGTBL_SIZE_1_5MB:
 			gtt_size = 1024 + 512;
 			break;
 		default:
 			device_printf(dev, "Bad PGTBL size\n");
 			return (EINVAL);
 		}
 		break;
 	case CHIP_G33:
 		gcc1 = pci_read_config(sc->bdev, AGP_I855_GCC1, 2);
 		switch (gcc1 & AGP_G33_MGGC_GGMS_MASK) {
 		case AGP_G33_MGGC_GGMS_SIZE_1M:
 			gtt_size = 1024;
 			break;
 		case AGP_G33_MGGC_GGMS_SIZE_2M:
 			gtt_size = 2048;
 			break;
 		default:
 			device_printf(dev, "Bad PGTBL size\n");
 			return (EINVAL);
 		}
 		break;
 	case CHIP_IGD:
 	case CHIP_G4X:
 		gtt_size = 0;
 		break;
 	default:
 		device_printf(dev, "Bad chiptype\n");
 		return (EINVAL);
 	}
 
 	/* GCC1 is called MGGC on i915+ */
 	gcc1 = pci_read_config(sc->bdev, AGP_I855_GCC1, 1);
 	switch (gcc1 & AGP_I855_GCC1_GMS) {
 	case AGP_I855_GCC1_GMS_STOLEN_1M:
 		stolen = 1024;
 		break;
 	case AGP_I855_GCC1_GMS_STOLEN_4M:
 		stolen = 4 * 1024;
 		break;
 	case AGP_I855_GCC1_GMS_STOLEN_8M:
 		stolen = 8 * 1024;
 		break;
 	case AGP_I855_GCC1_GMS_STOLEN_16M:
 		stolen = 16 * 1024;
 		break;
 	case AGP_I855_GCC1_GMS_STOLEN_32M:
 		stolen = 32 * 1024;
 		break;
 	/* The following encodings only count on generations above 2. */
 	case AGP_I915_GCC1_GMS_STOLEN_48M:
 		stolen = gen3_plus ? 48 * 1024 : 0;
 		break;
 	case AGP_I915_GCC1_GMS_STOLEN_64M:
 		stolen = gen3_plus ? 64 * 1024 : 0;
 		break;
 	case AGP_G33_GCC1_GMS_STOLEN_128M:
 		stolen = gen3_plus ? 128 * 1024 : 0;
 		break;
 	case AGP_G33_GCC1_GMS_STOLEN_256M:
 		stolen = gen3_plus ? 256 * 1024 : 0;
 		break;
 	/* The following encodings only count on CHIP_I965 and CHIP_G4X. */
 	case AGP_G4X_GCC1_GMS_STOLEN_96M:
 		stolen = g4x_class ? 96 * 1024 : 0;
 		break;
 	case AGP_G4X_GCC1_GMS_STOLEN_160M:
 		stolen = g4x_class ? 160 * 1024 : 0;
 		break;
 	case AGP_G4X_GCC1_GMS_STOLEN_224M:
 		stolen = g4x_class ? 224 * 1024 : 0;
 		break;
 	case AGP_G4X_GCC1_GMS_STOLEN_352M:
 		stolen = g4x_class ? 352 * 1024 : 0;
 		break;
 	default:
 		device_printf(dev,
 		    "unknown memory configuration, disabling (GCC1 %x)\n",
 		    gcc1);
 		return (EINVAL);
 	}
 
 	/* Account for the 4KB that follows the GTT. */
 	gtt_size += 4;
 	sc->stolen_size = stolen * 1024;
 	/* Both operands are unsigned: never subtract past zero. */
 	if (stolen > gtt_size)
 		sc->stolen = (stolen - gtt_size) * 1024 / 4096;
 	else
 		sc->stolen = 0;
 
 	return (0);
 }
 
 /*
  * Decode the stolen-memory field of the SNB GMCH control word (config
  * register AGP_SNB_GCC1 on the bridge device).
  *
  * sc->stolen_size receives the size in bytes and sc->stolen the derived
  * page count.  An encoding that matches none of the known values is
  * treated as "no stolen memory": both fields become 0 rather than leaving
  * stolen_size untouched and letting the unsigned "size - 4" wrap around.
  * Always returns 0.
  */
 static int
 agp_sb_get_stolen_size(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	unsigned int stolen_mb;
 	uint16_t gmch_ctl;
 
 	sc = device_get_softc(dev);
 	gmch_ctl = pci_read_config(sc->bdev, AGP_SNB_GCC1, 2);
 	switch (gmch_ctl & AGP_SNB_GMCH_GMS_STOLEN_MASK) {
 	case AGP_SNB_GMCH_GMS_STOLEN_32M:
 		stolen_mb = 32;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_64M:
 		stolen_mb = 64;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_96M:
 		stolen_mb = 96;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_128M:
 		stolen_mb = 128;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_160M:
 		stolen_mb = 160;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_192M:
 		stolen_mb = 192;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_224M:
 		stolen_mb = 224;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_256M:
 		stolen_mb = 256;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_288M:
 		stolen_mb = 288;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_320M:
 		stolen_mb = 320;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_352M:
 		stolen_mb = 352;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_384M:
 		stolen_mb = 384;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_416M:
 		stolen_mb = 416;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_448M:
 		stolen_mb = 448;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_480M:
 		stolen_mb = 480;
 		break;
 	case AGP_SNB_GMCH_GMS_STOLEN_512M:
 		stolen_mb = 512;
 		break;
 	default:
 		/* Unknown or zero encoding: report no stolen memory. */
 		stolen_mb = 0;
 		break;
 	}
 
 	sc->stolen_size = stolen_mb * 1024 * 1024;
 	/* Same arithmetic as before for real sizes; guarded for zero. */
 	if (sc->stolen_size > 4)
 		sc->stolen = (sc->stolen_size - 4) / 4096;
 	else
 		sc->stolen = 0;
 	return (0);
 }
 
 /*
  * Derive the number of mappable GTT entries on i810 from the window-size
  * field of the MISCC config register: a 32MB aperture when the field equals
  * AGP_I810_MISCC_WINSIZE_32, otherwise 64MB.  The result, in pages of
  * 1 << AGP_PAGE_SHIFT bytes, is stored in sc->gtt_mappable_entries.
  * Always returns 0.
  */
 static int
 agp_i810_get_gtt_mappable_entries(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint16_t miscc = pci_read_config(sc->bdev, AGP_I810_MISCC, 2);
 	uint32_t ap_mb;
 
 	ap_mb = ((miscc & AGP_I810_MISCC_WINSIZE) ==
 	    AGP_I810_MISCC_WINSIZE_32) ? 32 : 64;
 	sc->gtt_mappable_entries = (ap_mb * 1024 * 1024) >> AGP_PAGE_SHIFT;
 	return (0);
 }
 
 /*
  * Derive the number of mappable GTT entries on i830 from the GMASIZE field
  * of the GCC1 config register: a 64MB aperture when the field equals
  * AGP_I830_GCC1_GMASIZE_64, otherwise 128MB.  The page count is stored in
  * sc->gtt_mappable_entries.  Always returns 0.
  */
 static int
 agp_i830_get_gtt_mappable_entries(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint16_t gmch_ctl = pci_read_config(sc->bdev, AGP_I830_GCC1, 2);
 	uint32_t ap_mb;
 
 	ap_mb = ((gmch_ctl & AGP_I830_GCC1_GMASIZE) ==
 	    AGP_I830_GCC1_GMASIZE_64) ? 64 : 128;
 	sc->gtt_mappable_entries = (ap_mb * 1024 * 1024) >> AGP_PAGE_SHIFT;
 	return (0);
 }
 
 /*
  * On i915 and later the mappable entry count is simply the current
  * aperture size, as reported by AGP_GET_APERTURE(), expressed in pages.
  * Always returns 0.
  */
 static int
 agp_i915_get_gtt_mappable_entries(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t aperture_bytes = AGP_GET_APERTURE(dev);
 
 	sc->gtt_mappable_entries = aperture_bytes >> AGP_PAGE_SHIFT;
 	return (0);
 }
 
 /*
  * For this chip class the total number of GTT entries equals the number
  * of mappable entries already stored in the softc.  Always returns 0.
  */
 static int
 agp_i810_get_gtt_total_entries(device_t dev)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	softc->gtt_total_entries = softc->gtt_mappable_entries;
 	return (0);
 }
 
 /*
  * Read the page table size field of PGTBL_CTL and convert it into a total
  * GTT entry count (table size in bytes divided by 4 bytes per entry),
  * stored in sc->gtt_total_entries.
  *
  * Returns 0 on success.  An unrecognised size encoding is logged and
  * ENXIO is returned with sc->gtt_total_entries left untouched.
  */
 static int
 agp_i965_get_gtt_total_entries(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t ctl, pgtbl_kb;
 
 	ctl = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL);
 	switch (ctl & AGP_I810_PGTBL_SIZE_MASK) {
 	case AGP_I810_PGTBL_SIZE_128KB:
 		pgtbl_kb = 128;
 		break;
 	case AGP_I810_PGTBL_SIZE_256KB:
 		pgtbl_kb = 256;
 		break;
 	case AGP_I810_PGTBL_SIZE_512KB:
 		pgtbl_kb = 512;
 		break;
 	/* GTT pagetable sizes bigger than 512KB are not possible on G33! */
 	case AGP_I810_PGTBL_SIZE_1MB:
 		pgtbl_kb = 1024;
 		break;
 	case AGP_I810_PGTBL_SIZE_2MB:
 		pgtbl_kb = 2 * 1024;
 		break;
 	case AGP_I810_PGTBL_SIZE_1_5MB:
 		pgtbl_kb = 1024 + 512;
 		break;
 	default:
 		device_printf(dev, "Unknown page table size\n");
 		return (ENXIO);
 	}
 
 	sc->gtt_total_entries = pgtbl_kb * 1024 / 4;
 	return (0);
 }
 
 /*
  * Reprogram the global GTT size.  The enable bit in PGTBL_CTL2 (the
  * per-process page table control) is cleared first, then the size field of
  * PGTBL_CTL is replaced with 'sz', which must already be encoded as one of
  * the AGP_I810_PGTBL_SIZE_* values.
  */
 static void
 agp_gen5_adjust_pgtbl_size(device_t dev, uint32_t sz)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t reg;
 
 	/* Disable per-process page table. */
 	reg = bus_read_4(sc->sc_res[0], AGP_I965_PGTBL_CTL2);
 	bus_write_4(sc->sc_res[0], AGP_I965_PGTBL_CTL2,
 	    reg & ~AGP_I810_PGTBL_ENABLED);
 
 	/* Write the new ggtt size. */
 	reg = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL);
 	reg = (reg & ~AGP_I810_PGTBL_SIZE_MASK) | sz;
 	bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, reg);
 }
 
 /*
  * Gen5 variant: translate the GTT size field of the GCC1 config register
  * into a PGTBL_CTL size encoding, program it through
  * agp_gen5_adjust_pgtbl_size(), then let the i965 routine read the size
  * back and fill in sc->gtt_total_entries.
  *
  * Returns ENXIO (after logging) for an unknown size field, otherwise the
  * result of agp_i965_get_gtt_total_entries().
  */
 static int
 agp_gen5_get_gtt_total_entries(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t pgtbl_sz;
 	uint16_t gcc1;
 
 	gcc1 = pci_read_config(sc->bdev, AGP_I830_GCC1, 2);
 	switch (gcc1 & AGP_G4x_GCC1_SIZE_MASK) {
 	case AGP_G4x_GCC1_SIZE_1M:
 	case AGP_G4x_GCC1_SIZE_VT_1M:
 		pgtbl_sz = AGP_I810_PGTBL_SIZE_1MB;
 		break;
 	case AGP_G4x_GCC1_SIZE_VT_1_5M:
 		pgtbl_sz = AGP_I810_PGTBL_SIZE_1_5MB;
 		break;
 	case AGP_G4x_GCC1_SIZE_2M:
 	case AGP_G4x_GCC1_SIZE_VT_2M:
 		pgtbl_sz = AGP_I810_PGTBL_SIZE_2MB;
 		break;
 	default:
 		device_printf(dev, "Unknown page table size\n");
 		return (ENXIO);
 	}
 
 	agp_gen5_adjust_pgtbl_size(dev, pgtbl_sz);
 	return (agp_i965_get_gtt_total_entries(dev));
 }
 
 /*
  * SNB variant: decode the GTT size field of the AGP_SNB_GCC1 config word
  * into a total entry count (table bytes / 4).  A size of 0M or any
  * unrecognised encoding is reported and rejected with ENXIO; otherwise
  * sc->gtt_total_entries is set and 0 is returned.
  */
 static int
 agp_sb_get_gtt_total_entries(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint16_t gcc1 = pci_read_config(sc->bdev, AGP_SNB_GCC1, 2);
 
 	switch (gcc1 & AGP_SNB_GTT_SIZE_MASK) {
 	case AGP_SNB_GTT_SIZE_1M:
 		sc->gtt_total_entries = 1024 * 1024 / 4;
 		return (0);
 	case AGP_SNB_GTT_SIZE_2M:
 		sc->gtt_total_entries = 2 * 1024 * 1024 / 4;
 		return (0);
 	case AGP_SNB_GTT_SIZE_0M:
 	default:
 		printf("Bad GTT size mask: 0x%04x\n", gcc1);
 		return (ENXIO);
 	}
 }
 
 /*
  * Allocate and install the GATT on i810.
  *
  * Records whether on-chip dcache is populated (4MB if the DRT register says
  * so, else 0), allocates a 64KB physically contiguous table, zeroes
  * ag_entries 32-bit slots of it, flushes the CPU cache and finally writes
  * the table's physical address with bit 0 set into PGTBL_CTL.
  *
  * Returns 0 on success or ENOMEM when the contiguous allocation fails.
  */
 static int
 agp_i810_install_gatt(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	struct agp_gatt *gatt = sc->gatt;
 
 	/* Some i810s have on-chip memory called dcache. */
 	sc->dcache_size = ((bus_read_1(sc->sc_res[0], AGP_I810_DRT) &
 	    AGP_I810_DRT_POPULATED) != 0) ? 4 * 1024 * 1024 : 0;
 
 	/* According to the specs the gatt on the i810 must be 64k. */
 	gatt->ag_virtual = contigmalloc(64 * 1024, M_AGP, 0, 0, ~0,
 	    PAGE_SIZE, 0);
 	if (gatt->ag_virtual == NULL) {
 		if (bootverbose)
 			device_printf(dev, "contiguous allocation failed\n");
 		return (ENOMEM);
 	}
 
 	bzero(gatt->ag_virtual, gatt->ag_entries * sizeof(u_int32_t));
 	gatt->ag_physical = vtophys((vm_offset_t)gatt->ag_virtual);
 	agp_flush_cache();
 
 	/* Install the GATT. */
 	bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL,
 	    gatt->ag_physical | 1);
 	return (0);
 }
 
 /*
  * Enable the GATT on i830-class hardware.  No table is allocated here:
  * PGTBL_CTL is read, rewritten with bit 0 set, and the same value with
  * bit 0 cleared is remembered as the GATT physical address.
  * Always returns 0.
  */
 static int
 agp_i830_install_gatt(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t ctl;
 
 	/*
 	 * The i830 automatically initializes the 128k gatt on boot.
 	 * GATT address is already in there, make sure it's enabled.
 	 */
 	ctl = bus_read_4(sc->sc_res[0], AGP_I810_PGTBL_CTL) | 1;
 	bus_write_4(sc->sc_res[0], AGP_I810_PGTBL_CTL, ctl);
 
 	sc->gatt->ag_physical = ctl & ~1;
 	return (0);
 }
 
 /*
  * Device attach method.
  *
  * Locates the bridge device and the matching chip description, performs the
  * generic AGP attach, verifies that physical memory does not exceed what
  * the chip's address mask can reach, allocates the bus resources and the
  * GATT descriptor, and then runs the per-chip setup hooks in order
  * (stolen size, GATT install, mappable/total entry counts, chipset flush
  * setup).
  *
  * Every failure after agp_generic_attach() has succeeded undoes it with
  * agp_generic_detach(), including the physical-memory check, which
  * previously returned without doing so.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 agp_i810_attach(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	int error;
 
 	sc = device_get_softc(dev);
 	sc->bdev = agp_i810_find_bridge(dev);
 	if (sc->bdev == NULL)
 		return (ENOENT);
 
 	sc->match = agp_i810_match(dev);
 
 	/* Gen 1 and 2 use AGP_APBASE for the aperture, later ones GMADR. */
 	agp_set_aperture_resource(dev, sc->match->driver->gen <= 2 ?
 	    AGP_APBASE : AGP_I915_GMADR);
 	error = agp_generic_attach(dev);
 	if (error)
 		return (error);
 
 	if (ptoa((vm_paddr_t)Maxmem) >
 	    (1ULL << sc->match->driver->busdma_addr_mask_sz) - 1) {
 		device_printf(dev, "agp_i810 does not support physical "
 		    "memory above %ju.\n", (uintmax_t)(1ULL <<
 		    sc->match->driver->busdma_addr_mask_sz) - 1);
 		/* Release what agp_generic_attach() set up. */
 		agp_generic_detach(dev);
 		return (ENOENT);
 	}
 
 	if (bus_alloc_resources(dev, sc->match->driver->res_spec, sc->sc_res)) {
 		agp_generic_detach(dev);
 		return (ENODEV);
 	}
 
 	sc->initial_aperture = AGP_GET_APERTURE(dev);
 	sc->gatt = malloc(sizeof(struct agp_gatt), M_AGP, M_WAITOK);
 	sc->gatt->ag_entries = AGP_GET_APERTURE(dev) >> AGP_PAGE_SHIFT;
 
 	/* Run the per-chip hooks; stop at the first one that fails. */
 	if ((error = sc->match->driver->get_stolen_size(dev)) != 0 ||
 	    (error = sc->match->driver->install_gatt(dev)) != 0 ||
 	    (error = sc->match->driver->get_gtt_mappable_entries(dev)) != 0 ||
 	    (error = sc->match->driver->get_gtt_total_entries(dev)) != 0 ||
 	    (error = sc->match->driver->chipset_flush_setup(dev)) != 0) {
 		bus_release_resources(dev, sc->match->driver->res_spec,
 		    sc->sc_res);
 		free(sc->gatt, M_AGP);
 		agp_generic_detach(dev);
 		return (error);
 	}
 
 	intel_agp = dev;
 	device_printf(dev, "aperture size is %dM",
 	    sc->initial_aperture / 1024 / 1024);
 	/* sc->stolen counts 4KB pages, hence the "* 4" to print KB. */
 	if (sc->stolen > 0)
 		printf(", detected %dk stolen memory\n", sc->stolen * 4);
 	else
 		printf("\n");
 	if (bootverbose) {
 		sc->match->driver->dump_regs(dev);
 		device_printf(dev, "Mappable GTT entries: %d\n",
 		    sc->gtt_mappable_entries);
 		device_printf(dev, "Total GTT entries: %d\n",
 		    sc->gtt_total_entries);
 	}
 	return (0);
 }
 
 /*
  * Undo agp_i810_install_gatt(): zero PGTBL_CTL and release the 64KB
  * contiguous table.
  */
 static void
 agp_i810_deinstall_gatt(device_t dev)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	bus_write_4(softc->sc_res[0], AGP_I810_PGTBL_CTL, 0);
 	contigfree(softc->gatt->ag_virtual, 64 * 1024, M_AGP);
 }
 
 /*
  * Undo agp_i830_install_gatt(): clear bit 0 of PGTBL_CTL while leaving the
  * remaining bits of the register as they were read.
  */
 static void
 agp_i830_deinstall_gatt(device_t dev)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	unsigned int ctl;
 
 	ctl = bus_read_4(softc->sc_res[0], AGP_I810_PGTBL_CTL);
 	bus_write_4(softc->sc_res[0], AGP_I810_PGTBL_CTL, ctl & ~1);
 }
 
 /*
  * Device detach method.  Tears things down in this order: the character
  * device, the GATT (per-chip hook), the chipset flush state (per-chip
  * hook), the aperture size (restored to its attach-time value), the GATT
  * descriptor, the bus resources and finally the generic AGP resources.
  * Always returns 0.
  */
 static int
 agp_i810_detach(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 
 	agp_free_cdev(dev);
 
 	/* Clear the GATT base, then drop the flush page state. */
 	sc->match->driver->deinstall_gatt(dev);
 	sc->match->driver->chipset_flush_teardown(dev);
 
 	/* Put the aperture back the way it started. */
 	AGP_SET_APERTURE(dev, sc->initial_aperture);
 
 	free(sc->gatt, M_AGP);
 	bus_release_resources(dev, sc->match->driver->res_spec, sc->sc_res);
 	agp_free_res(dev);
 
 	return (0);
 }
 
 /*
  * Device resume method: re-apply the attach-time aperture size, point
  * PGTBL_CTL back at the GATT (physical address with bit 0 set) and hand
  * over to bus_generic_resume(), whose result is returned.
  */
 static int
 agp_i810_resume(device_t dev)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	AGP_SET_APERTURE(dev, softc->initial_aperture);
 
 	/* Install the GATT. */
 	bus_write_4(softc->sc_res[0], AGP_I810_PGTBL_CTL,
 	    softc->gatt->ag_physical | 1);
 
 	return (bus_generic_resume(dev));
 }
 
 /**
  * Sets the aperture size on i810 by rewriting the window-size field of the
  * MISCC config register.  Only 32MB and 64MB are accepted; anything else
  * is logged and rejected with EINVAL.  Returns 0 on success.
  *
  * This whole function is likely bogus, as the kernel would probably need to
  * reconfigure the placement of the AGP aperture if a larger size is
  * requested, which doesn't happen currently.
  */
 static int
 agp_i810_set_aperture(device_t dev, u_int32_t aperture)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	u_int16_t miscc, winsize;
 
 	/* Double check for sanity before touching the register. */
 	switch (aperture) {
 	case 32 * 1024 * 1024:
 		winsize = AGP_I810_MISCC_WINSIZE_32;
 		break;
 	case 64 * 1024 * 1024:
 		winsize = AGP_I810_MISCC_WINSIZE_64;
 		break;
 	default:
 		device_printf(dev, "bad aperture size %d\n", aperture);
 		return (EINVAL);
 	}
 
 	miscc = pci_read_config(sc->bdev, AGP_I810_MISCC, 2);
 	miscc &= ~AGP_I810_MISCC_WINSIZE;
 	miscc |= winsize;
 	pci_write_config(sc->bdev, AGP_I810_MISCC, miscc, 2);
 	return (0);
 }
 
 /*
  * Sets the aperture size on i830 by rewriting the GMASIZE field of the
  * GCC1 config register.  Only 64MB and 128MB are accepted; anything else
  * is logged and rejected with EINVAL.  Returns 0 on success.
  */
 static int
 agp_i830_set_aperture(device_t dev, u_int32_t aperture)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	u_int16_t gcc1, gmasize;
 
 	switch (aperture) {
 	case 64 * 1024 * 1024:
 		gmasize = AGP_I830_GCC1_GMASIZE_64;
 		break;
 	case 128 * 1024 * 1024:
 		gmasize = AGP_I830_GCC1_GMASIZE_128;
 		break;
 	default:
 		device_printf(dev, "bad aperture size %d\n", aperture);
 		return (EINVAL);
 	}
 
 	gcc1 = pci_read_config(sc->bdev, AGP_I830_GCC1, 2);
 	gcc1 &= ~AGP_I830_GCC1_GMASIZE;
 	gcc1 |= gmasize;
 	pci_write_config(sc->bdev, AGP_I830_GCC1, gcc1, 2);
 	return (0);
 }
 
 /*
  * i915 and later have no chip-specific aperture programming here; the
  * request is forwarded unchanged to agp_generic_set_aperture() and its
  * result returned.
  */
 static int
 agp_i915_set_aperture(device_t dev, u_int32_t aperture)
 {
 	int rv;
 
 	rv = agp_generic_set_aperture(dev, aperture);
 	return (rv);
 }
 
 /*
  * agp_set_aperture device method: dispatch to the set_aperture hook of the
  * chip description matched at attach time and return its result.
  */
 static int
 agp_i810_method_set_aperture(device_t dev, u_int32_t aperture)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	return (softc->match->driver->set_aperture(dev, aperture));
 }
 
 /**
  * Writes a GTT entry mapping the page at the given offset from the
  * beginning of the aperture to the given physical address.  Setup the
  * caching mode according to flags.
  *
  * For gen 1, 2 and 3, GTT start is located at AGP_I810_GTT offset
  * from corresponding BAR start. For gen 4, offset is 512KB +
  * AGP_I810_GTT, for gen 5 and 6 it is 2MB + AGP_I810_GTT.
  *
  * Also, the bits of the physical page address above 4GB need to be
  * placed into bits 40-32 of PTE.
  *
  * This i810 variant builds the entry from the low 32 bits of the address
  * plus the valid bit, adding the "local" attribute for AGP_DCACHE_MEMORY
  * or the "system cached" attribute for AGP_USER_CACHED_MEMORY.
  */
 static void
 agp_i810_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical,
     int flags)
 {
 	uint32_t attr = 0;
 
 	if (flags == AGP_DCACHE_MEMORY)
 		attr = I810_PTE_LOCAL;
 	else if (flags == AGP_USER_CACHED_MEMORY)
 		attr = I830_PTE_SYSTEM_CACHED;
 
 	agp_i810_write_gtt(dev, index,
 	    (u_int32_t)physical | I810_PTE_VALID | attr);
 }
 
 /*
  * Store a raw PTE into GTT slot 'index'.  The slot is reached through the
  * first mapped resource at AGP_I810_GTT plus four bytes per entry; the
  * write is also recorded with a KTR trace point.
  */
 static void
 agp_i810_write_gtt(device_t dev, u_int index, uint32_t pte)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	u_int off = AGP_I810_GTT + index * 4;
 
 	bus_write_4(softc->sc_res[0], off, pte);
 	CTR2(KTR_AGP_I810, "810_pte %x %x", index, pte);
 }
 
 /*
  * i830 variant of install_gtt_pte: the entry is the low 32 bits of the
  * physical address plus the valid bit, with the "system cached" attribute
  * added when flags is AGP_USER_CACHED_MEMORY.  Written via
  * agp_i810_write_gtt().
  */
 static void
 agp_i830_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical,
     int flags)
 {
 	uint32_t attr;
 
 	attr = (flags == AGP_USER_CACHED_MEMORY) ? I830_PTE_SYSTEM_CACHED : 0;
 	agp_i810_write_gtt(dev, index,
 	    (u_int32_t)physical | I810_PTE_VALID | attr);
 }
 
 /*
  * i915 variant of install_gtt_pte.  In addition to the low 32 address bits
  * and the valid bit, physical address bits 32-35 are folded into PTE bits
  * 4-7 (shift right by 28).  AGP_USER_CACHED_MEMORY adds the "system
  * cached" attribute.  Written via agp_i915_write_gtt().
  */
 static void
 agp_i915_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical,
     int flags)
 {
 	uint32_t entry;
 
 	entry = (u_int32_t)physical | I810_PTE_VALID;
 	entry |= (physical & 0x0000000f00000000ull) >> 28;
 	if (flags == AGP_USER_CACHED_MEMORY)
 		entry |= I830_PTE_SYSTEM_CACHED;
 	agp_i915_write_gtt(dev, index, entry);
 }
 
 /*
  * Store a raw PTE into GTT slot 'index' on i915.  The table is accessed
  * through the second mapped resource, starting at offset 0 with four bytes
  * per entry; the write is also recorded with a KTR trace point.
  */
 static void
 agp_i915_write_gtt(device_t dev, u_int index, uint32_t pte)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	u_int off = index * 4;
 
 	bus_write_4(softc->sc_res[1], off, pte);
 	CTR2(KTR_AGP_I810, "915_pte %x %x", index, pte);
 }
 
 /*
  * i965 variant of install_gtt_pte.  Builds the entry exactly like the i915
  * variant (low 32 address bits, valid bit, address bits 32-35 folded into
  * PTE bits 4-7, optional "system cached" attribute) and writes it via
  * agp_i965_write_gtt().
  */
 static void
 agp_i965_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical,
     int flags)
 {
 	uint32_t entry;
 
 	entry = (u_int32_t)physical | I810_PTE_VALID;
 	entry |= (physical & 0x0000000f00000000ull) >> 28;
 	if (flags == AGP_USER_CACHED_MEMORY)
 		entry |= I830_PTE_SYSTEM_CACHED;
 	agp_i965_write_gtt(dev, index, entry);
 }
 
 /*
  * Store a raw PTE into GTT slot 'index' on i965.  The table lives 512KB
  * into the first mapped resource, four bytes per entry; the write is also
  * recorded with a KTR trace point.
  */
 static void
 agp_i965_write_gtt(device_t dev, u_int index, uint32_t pte)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	u_int off = index * 4 + (512 * 1024);
 
 	bus_write_4(softc->sc_res[0], off, pte);
 	CTR2(KTR_AGP_I810, "965_pte %x %x", index, pte);
 }
 
 /*
  * G4X variant of install_gtt_pte.  Builds the entry exactly like the i915
  * variant (low 32 address bits, valid bit, address bits 32-35 folded into
  * PTE bits 4-7, optional "system cached" attribute) and writes it via
  * agp_g4x_write_gtt().
  */
 static void
 agp_g4x_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical,
     int flags)
 {
 	uint32_t entry;
 
 	entry = (u_int32_t)physical | I810_PTE_VALID;
 	entry |= (physical & 0x0000000f00000000ull) >> 28;
 	if (flags == AGP_USER_CACHED_MEMORY)
 		entry |= I830_PTE_SYSTEM_CACHED;
 	agp_g4x_write_gtt(dev, index, entry);
 }
 
 /*
  * Store a raw PTE into GTT slot 'index' on G4X.  The table lives 2MB into
  * the first mapped resource, four bytes per entry; the write is also
  * recorded with a KTR trace point.
  */
 static void
 agp_g4x_write_gtt(device_t dev, u_int index, uint32_t pte)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	u_int off = index * 4 + (2 * 1024 * 1024);
 
 	bus_write_4(softc->sc_res[0], off, pte);
 	CTR2(KTR_AGP_I810, "g4x_pte %x %x", index, pte);
 }
 
 /*
  * SNB variant of install_gtt_pte.
  *
  * The GFDT request bit is split off from 'flags'; what remains selects the
  * cache attribute: AGP_USER_MEMORY maps uncached (GFDT is ignored),
  * AGP_USER_CACHED_MEMORY_LLC_MLC selects LLC+MLC, and every other value
  * selects LLC; the two cached modes also carry the GFDT bit when it was
  * requested.  Physical address bits 32-39 are folded into PTE bits 4-11.
  */
 static void
 agp_sb_install_gtt_pte(device_t dev, u_int index, vm_offset_t physical,
     int flags)
 {
 	int cache_type, gfdt_bit;
 	uint32_t entry;
 
 	cache_type = flags & ~AGP_USER_CACHED_MEMORY_GFDT;
 	gfdt_bit = (flags & AGP_USER_CACHED_MEMORY_GFDT) != 0 ?
 	    GEN6_PTE_GFDT : 0;
 
 	entry = (u_int32_t)physical | I810_PTE_VALID;
 	if (cache_type == AGP_USER_MEMORY) {
 		entry |= GEN6_PTE_UNCACHED;
 	} else {
 		if (cache_type == AGP_USER_CACHED_MEMORY_LLC_MLC)
 			entry |= GEN6_PTE_LLC_MLC | gfdt_bit;
 		else
 			entry |= GEN6_PTE_LLC | gfdt_bit;
 	}
 
 	entry |= (physical & 0x000000ff00000000ull) >> 28;
 	agp_sb_write_gtt(dev, index, entry);
 }
 
 /*
  * Store a raw PTE into GTT slot 'index' on SNB.  The table lives 2MB into
  * the first mapped resource, four bytes per entry; the write is also
  * recorded with a KTR trace point.
  */
 static void
 agp_sb_write_gtt(device_t dev, u_int index, uint32_t pte)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	u_int off = index * 4 + (2 * 1024 * 1024);
 
 	bus_write_4(softc->sc_res[0], off, pte);
 	CTR2(KTR_AGP_I810, "sb_pte %x %x", index, pte);
 }
 
 /*
  * agp_bind_page device method: map the aperture page at byte 'offset' to
  * the physical address 'physical' using the per-chip install_gtt_pte hook
  * with flags 0.
  *
  * Returns EINVAL (after logging) when the offset lies beyond the GATT or
  * falls within the stolen-memory entries at the start of the table;
  * returns 0 otherwise.
  */
 static int
 agp_i810_bind_page(device_t dev, vm_offset_t offset, vm_offset_t physical)
 {
 	struct agp_i810_softc *sc;
 	u_int slot;
 
 	sc = device_get_softc(dev);
 	if (offset >= (sc->gatt->ag_entries << AGP_PAGE_SHIFT)) {
 		device_printf(dev, "failed: offset is 0x%08jx, "
 		    "shift is %d, entries is %d\n", (intmax_t)offset,
 		    AGP_PAGE_SHIFT, sc->gatt->ag_entries);
 		return (EINVAL);
 	}
 
 	slot = offset >> AGP_PAGE_SHIFT;
 	if (sc->stolen != 0 && slot < sc->stolen) {
 		device_printf(dev, "trying to bind into stolen memory\n");
 		return (EINVAL);
 	}
 
 	sc->match->driver->install_gtt_pte(dev, slot, physical, 0);
 	return (0);
 }
 
 /*
  * agp_unbind_page device method: replace the GTT entry for the aperture
  * page at byte 'offset' with one built from physical address 0 and flags
  * 0, using the per-chip install_gtt_pte hook.
  *
  * Returns EINVAL when the offset lies beyond the GATT (silently) or falls
  * within the stolen-memory entries (after logging); returns 0 otherwise.
  */
 static int
 agp_i810_unbind_page(device_t dev, vm_offset_t offset)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	u_int slot;
 
 	if (offset >= (sc->gatt->ag_entries << AGP_PAGE_SHIFT))
 		return (EINVAL);
 
 	slot = offset >> AGP_PAGE_SHIFT;
 	if (sc->stolen != 0 && slot < sc->stolen) {
 		device_printf(dev, "trying to unbind from stolen memory\n");
 		return (EINVAL);
 	}
 
 	sc->match->driver->install_gtt_pte(dev, slot, 0, 0);
 	return (0);
 }
 
 /*
  * Read back the raw PTE in GTT slot 'index': first mapped resource, offset
  * AGP_I810_GTT plus four bytes per entry.
  */
 static u_int32_t
 agp_i810_read_gtt_pte(device_t dev, u_int index)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	return (bus_read_4(softc->sc_res[0], AGP_I810_GTT + index * 4));
 }
 
 /*
  * Read back the raw PTE in GTT slot 'index' on i915: second mapped
  * resource, four bytes per entry starting at offset 0.
  */
 static u_int32_t
 agp_i915_read_gtt_pte(device_t dev, u_int index)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	return (bus_read_4(softc->sc_res[1], index * 4));
 }
 
 /*
  * Read back the raw PTE in GTT slot 'index' on i965: first mapped
  * resource, four bytes per entry starting 512KB in.
  */
 static u_int32_t
 agp_i965_read_gtt_pte(device_t dev, u_int index)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	return (bus_read_4(softc->sc_res[0], index * 4 + (512 * 1024)));
 }
 
 /*
  * Read back the raw PTE in GTT slot 'index' on G4X: first mapped
  * resource, four bytes per entry starting 2MB in.
  */
 static u_int32_t
 agp_g4x_read_gtt_pte(device_t dev, u_int index)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 
 	return (bus_read_4(softc->sc_res[0],
 	    index * 4 + (2 * 1024 * 1024)));
 }
 
 /*
  * Return the physical page address stored in GTT slot 'index': the PTE is
  * fetched through the per-chip read_gtt_pte hook and the bits covered by
  * PAGE_MASK are stripped.
  */
 static vm_paddr_t
 agp_i810_read_gtt_pte_paddr(device_t dev, u_int index)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	u_int32_t raw;
 	vm_paddr_t pa;
 
 	raw = softc->match->driver->read_gtt_pte(dev, index);
 	pa = raw & ~PAGE_MASK;
 	return (pa);
 }
 
 /*
  * Return the physical page address stored in GTT slot 'index'.
  *
  * The PTE is fetched through the per-chip read_gtt_pte hook.  The page
  * frame comes from the bits above PAGE_MASK; PTE bits 4-7 hold physical
  * address bits 32-35 (the inverse of the ">> 28" packing done by
  * agp_i915_install_gtt_pte()).  The PTE is only 32 bits wide, so the
  * masked field is widened to vm_paddr_t before the shift; shifting it as
  * a 32-bit value would discard exactly the bits being recovered.
  */
 static vm_paddr_t
 agp_i915_read_gtt_pte_paddr(device_t dev, u_int index)
 {
 	struct agp_i810_softc *sc;
 	u_int32_t pte;
 	vm_paddr_t res;
 
 	sc = device_get_softc(dev);
 	pte = sc->match->driver->read_gtt_pte(dev, index);
 	res = (vm_paddr_t)(pte & ~PAGE_MASK) |
 	    ((vm_paddr_t)(pte & 0xf0) << 28);
 	return (res);
 }
 
 /*
  * Return the physical page address stored in GTT slot 'index' on SNB.
  *
  * The PTE is fetched through the per-chip read_gtt_pte hook.  The page
  * frame comes from the bits above PAGE_MASK; PTE bits 4-11 hold physical
  * address bits 32-39 (the inverse of the ">> 28" packing done by
  * agp_sb_install_gtt_pte()).  The PTE is only 32 bits wide, so the masked
  * field is widened to vm_paddr_t before the shift; shifting it as a
  * 32-bit value would discard exactly the bits being recovered.
  */
 static vm_paddr_t
 agp_sb_read_gtt_pte_paddr(device_t dev, u_int index)
 {
 	struct agp_i810_softc *sc;
 	u_int32_t pte;
 	vm_paddr_t res;
 
 	sc = device_get_softc(dev);
 	pte = sc->match->driver->read_gtt_pte(dev, index);
 	res = (vm_paddr_t)(pte & ~PAGE_MASK) |
 	    ((vm_paddr_t)(pte & 0xff0) << 28);
 	return (res);
 }
 
 /*
  * agp_flush_tlb device method.  Intentionally empty:
  * writing via memory mapped registers already flushes all TLBs.
  */
 static void
 agp_i810_flush_tlb(device_t dev)
 {
 }
 
 /*
  * agp_enable device method.  Performs no work and ignores 'mode';
  * always returns 0.
  */
 static int
 agp_i810_enable(device_t dev, u_int32_t mode)
 {
 
 	return (0);
 }
 
 /*
  * agp_alloc_memory device method: create an AGP memory descriptor of
  * 'size' bytes.
  *
  * 'size' must be a multiple of AGP_PAGE_SIZE and must fit within the
  * remaining as_maxmem budget.  'type' selects the backing:
  *   1     - local DRAM (dcache); accepted only on CHIP_I810 and only when
  *           size equals sc->dcache_size.  No VM object is created.
  *   2     - memory whose physical address is handed back in am_physical.
  *           A single page is backed by a wired page in a new VM object;
  *           any other size uses the single contigmalloc'ed ARGB cursor
  *           buffer (only one may exist at a time).
  *   other - backed by a new OBJT_DEFAULT VM object, am_physical is 0.
  *
  * Returns the descriptor, appended to sc->agp.as_memory and accounted in
  * as_allocated, or 0 (a null pointer) when the request is rejected.
  */
 static struct agp_memory *
 agp_i810_alloc_memory(device_t dev, int type, vm_size_t size)
 {
 	struct agp_i810_softc *sc;
 	struct agp_memory *mem;
 	vm_page_t m;
 
 	sc = device_get_softc(dev);
 
 	/* Reject unaligned sizes and requests exceeding the budget. */
 	if ((size & (AGP_PAGE_SIZE - 1)) != 0 ||
 	    sc->agp.as_allocated + size > sc->agp.as_maxmem)
 		return (0);
 
 	if (type == 1) {
 		/*
 		 * Mapping local DRAM into GATT.
 		 */
 		if (sc->match->driver->chiptype != CHIP_I810)
 			return (0);
 		if (size != sc->dcache_size)
 			return (0);
 	} else if (type == 2) {
 		/*
 		 * Type 2 is the contiguous physical memory type, that hands
 		 * back a physical address.  This is used for cursors on i810.
 		 * Hand back as many single pages with physical as the user
 		 * wants, but only allow one larger allocation (ARGB cursor)
 		 * for simplicity.
 		 */
 		if (size != AGP_PAGE_SIZE) {
 			if (sc->argb_cursor != NULL)
 				return (0);
 
 			/* Allocate memory for ARGB cursor, if we can. */
 			sc->argb_cursor = contigmalloc(size, M_AGP,
 			   0, 0, ~0, PAGE_SIZE, 0);
 			if (sc->argb_cursor == NULL)
 				return (0);
 		}
 	}
 
 	mem = malloc(sizeof *mem, M_AGP, M_WAITOK);
 	mem->am_id = sc->agp.as_nextid++;
 	mem->am_size = size;
 	mem->am_type = type;
 	/*
 	 * A VM object backs everything except dcache (type 1) and the
 	 * multi-page type 2 buffer, which have their own backing.
 	 */
 	if (type != 1 && (type != 2 || size == AGP_PAGE_SIZE))
 		mem->am_obj = vm_object_allocate(OBJT_DEFAULT,
 		    atop(round_page(size)));
 	else
 		mem->am_obj = 0;
 
 	if (type == 2) {
 		if (size == AGP_PAGE_SIZE) {
 			/*
 			 * Allocate and wire down the page now so that we can
 			 * get its physical address.
 			 */
 			VM_OBJECT_WLOCK(mem->am_obj);
 			m = vm_page_grab(mem->am_obj, 0, VM_ALLOC_NOBUSY |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			VM_OBJECT_WUNLOCK(mem->am_obj);
 			mem->am_physical = VM_PAGE_TO_PHYS(m);
 		} else {
 			/* Our allocation is already nicely wired down for us.
 			 * Just grab the physical address.
 			 */
 			mem->am_physical = vtophys(sc->argb_cursor);
 		}
 	} else
 		mem->am_physical = 0;
 
 	/* New descriptors start out unbound. */
 	mem->am_offset = 0;
 	mem->am_is_bound = 0;
 	TAILQ_INSERT_TAIL(&sc->agp.as_memory, mem, am_link);
 	sc->agp.as_allocated += size;
 
 	return (mem);
 }
 
 /*
  * agp_free_memory device method: release a descriptor created by
  * agp_i810_alloc_memory().
  *
  * Returns EBUSY if the memory is still bound.  For type 2 memory the
  * single wired page is unwired, or the multi-page ARGB cursor buffer is
  * freed and its softc pointer cleared.  The descriptor is then removed
  * from the accounting and the list, its VM object (if any) is
  * dereferenced, and the descriptor itself is freed.  Returns 0.
  */
 static int
 agp_i810_free_memory(device_t dev, struct agp_memory *mem)
 {
 	struct agp_i810_softc *sc;
 	vm_page_t m;
 
 	if (mem->am_is_bound)
 		return (EBUSY);
 
 	sc = device_get_softc(dev);
 
 	if (mem->am_type == 2) {
 		if (mem->am_size == AGP_PAGE_SIZE) {
 			/*
 			 * Unwire the page which we wired in alloc_memory.
 			 */
 			VM_OBJECT_WLOCK(mem->am_obj);
 			m = vm_page_lookup(mem->am_obj, 0);
 			vm_page_lock(m);
-			vm_page_unwire(m, 0);
+			vm_page_unwire(m, PQ_INACTIVE);
 			vm_page_unlock(m);
 			VM_OBJECT_WUNLOCK(mem->am_obj);
 		} else {
 			/* Multi-page type 2 memory is the ARGB cursor buffer. */
 			contigfree(sc->argb_cursor, mem->am_size, M_AGP);
 			sc->argb_cursor = NULL;
 		}
 	}
 
 	sc->agp.as_allocated -= mem->am_size;
 	TAILQ_REMOVE(&sc->agp.as_memory, mem, am_link);
 	if (mem->am_obj)
 		vm_object_deallocate(mem->am_obj);
 	free(mem, M_AGP);
 	return (0);
 }
 
 /*
  * agp_bind_memory device method: map 'mem' into the aperture at byte
  * 'offset'.
  *
  * 'offset' must be AGP page aligned and the region must fit within the
  * aperture, otherwise EINVAL is returned after logging.  Three cases:
  *   - multi-page type 2 memory: under as_lock, one PTE per page is
  *     installed from am_physical; EINVAL if it is already bound.
  *   - type 1 memory (CHIP_I810 only, else EINVAL): entries are written
  *     starting at GTT slot 0, each holding the byte offset i with the low
  *     two bits set.  NOTE(review): this path neither uses 'offset' for
  *     the slot nor updates am_is_bound/am_offset -- confirm this is
  *     intended.
  *   - everything else is delegated to agp_generic_bind_memory().
  */
 static int
 agp_i810_bind_memory(device_t dev, struct agp_memory *mem, vm_offset_t offset)
 {
 	struct agp_i810_softc *sc;
 	vm_offset_t i;
 
 	/* Do some sanity checks first. */
 	if ((offset & (AGP_PAGE_SIZE - 1)) != 0 ||
 	    offset + mem->am_size > AGP_GET_APERTURE(dev)) {
 		device_printf(dev, "binding memory at bad offset %#x\n",
 		    (int)offset);
 		return (EINVAL);
 	}
 
 	sc = device_get_softc(dev);
 	if (mem->am_type == 2 && mem->am_size != AGP_PAGE_SIZE) {
 		mtx_lock(&sc->agp.as_lock);
 		if (mem->am_is_bound) {
 			mtx_unlock(&sc->agp.as_lock);
 			return (EINVAL);
 		}
 		/* The memory's already wired down, just stick it in the GTT. */
 		for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) {
 			sc->match->driver->install_gtt_pte(dev, (offset + i) >>
 			    AGP_PAGE_SHIFT, mem->am_physical + i, 0);
 		}
 		agp_flush_cache();
 		mem->am_offset = offset;
 		mem->am_is_bound = 1;
 		mtx_unlock(&sc->agp.as_lock);
 		return (0);
 	}
 
 	if (mem->am_type != 1)
 		return (agp_generic_bind_memory(dev, mem, offset));
 
 	/*
 	 * Mapping local DRAM into GATT.
 	 */
 	if (sc->match->driver->chiptype != CHIP_I810)
 		return (EINVAL);
 	for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE)
 		bus_write_4(sc->sc_res[0],
 		    AGP_I810_GTT + (i >> AGP_PAGE_SHIFT) * 4, i | 3);
 
 	return (0);
 }
 
 /*
  * agp_unbind_memory device method: remove 'mem' from the aperture.
  *
  * Three cases, mirroring agp_i810_bind_memory():
  *   - multi-page type 2 memory: under as_lock, every PTE covering the
  *     region at am_offset is rewritten with physical address 0 and flags
  *     0; EINVAL if it is not currently bound.
  *   - type 1 memory (CHIP_I810 only, else EINVAL): the entries starting at
  *     GTT slot 0 are rewritten the same way.
  *   - everything else is delegated to agp_generic_unbind_memory().
  */
 static int
 agp_i810_unbind_memory(device_t dev, struct agp_memory *mem)
 {
 	struct agp_i810_softc *sc;
 	vm_offset_t i;
 
 	sc = device_get_softc(dev);
 
 	if (mem->am_type == 2 && mem->am_size != AGP_PAGE_SIZE) {
 		mtx_lock(&sc->agp.as_lock);
 		if (!mem->am_is_bound) {
 			mtx_unlock(&sc->agp.as_lock);
 			return (EINVAL);
 		}
 
 		for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) {
 			sc->match->driver->install_gtt_pte(dev,
 			    (mem->am_offset + i) >> AGP_PAGE_SHIFT, 0, 0);
 		}
 		agp_flush_cache();
 		mem->am_is_bound = 0;
 		mtx_unlock(&sc->agp.as_lock);
 		return (0);
 	}
 
 	if (mem->am_type != 1)
 		return (agp_generic_unbind_memory(dev, mem));
 
 	/* Local DRAM (dcache) mapping: only exists on the i810. */
 	if (sc->match->driver->chiptype != CHIP_I810)
 		return (EINVAL);
 	for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE) {
 		sc->match->driver->install_gtt_pte(dev, i >> AGP_PAGE_SHIFT,
 		    0, 0);
 	}
 	return (0);
 }
 
 /*
  * Method table for the driver: the generic device interface entries
  * followed by the AGP interface entries.  Suspend and get_aperture use the
  * generic implementations; the remaining entries point at functions in
  * this file.  The table ends with an all-zero entry.
  */
 static device_method_t agp_i810_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_identify,	agp_i810_identify),
 	DEVMETHOD(device_probe,		agp_i810_probe),
 	DEVMETHOD(device_attach,	agp_i810_attach),
 	DEVMETHOD(device_detach,	agp_i810_detach),
 	DEVMETHOD(device_suspend,	bus_generic_suspend),
 	DEVMETHOD(device_resume,	agp_i810_resume),
 
 	/* AGP interface */
 	DEVMETHOD(agp_get_aperture,	agp_generic_get_aperture),
 	DEVMETHOD(agp_set_aperture,	agp_i810_method_set_aperture),
 	DEVMETHOD(agp_bind_page,	agp_i810_bind_page),
 	DEVMETHOD(agp_unbind_page,	agp_i810_unbind_page),
 	DEVMETHOD(agp_flush_tlb,	agp_i810_flush_tlb),
 	DEVMETHOD(agp_enable,		agp_i810_enable),
 	DEVMETHOD(agp_alloc_memory,	agp_i810_alloc_memory),
 	DEVMETHOD(agp_free_memory,	agp_i810_free_memory),
 	DEVMETHOD(agp_bind_memory,	agp_i810_bind_memory),
 	DEVMETHOD(agp_unbind_memory,	agp_i810_unbind_memory),
 	DEVMETHOD(agp_chipset_flush,	agp_intel_gtt_chipset_flush),
 
 	{ 0, 0 }
 };
 
 /*
  * Driver description: the driver name "agp", the method table above, and
  * the size of the per-device softc.
  */
 static driver_t agp_i810_driver = {
 	"agp",
 	agp_i810_methods,
 	sizeof(struct agp_i810_softc),
 };
 
 static devclass_t agp_devclass;
 
 DRIVER_MODULE(agp_i810, vgapci, agp_i810_driver, agp_devclass, 0, 0);
 MODULE_DEPEND(agp_i810, agp, 1, 1, 1);
 MODULE_DEPEND(agp_i810, pci, 1, 1, 1);
 
 extern vm_page_t bogus_page;
 
 /*
  * Point 'num_entries' consecutive GTT entries, starting at 'first_entry',
  * at the physical address of bogus_page (flags 0), then read the last
  * entry back through the per-chip read hook.
  *
  * An empty range is a no-op: without the early return the read-back index
  * would be first_entry - 1, which wraps around when first_entry is 0.
  */
 void
 agp_intel_gtt_clear_range(device_t dev, u_int first_entry, u_int num_entries)
 {
 	struct agp_i810_softc *sc;
 	u_int i;
 
 	if (num_entries == 0)
 		return;
 
 	sc = device_get_softc(dev);
 	for (i = 0; i < num_entries; i++)
 		sc->match->driver->install_gtt_pte(dev, first_entry + i,
 		    VM_PAGE_TO_PHYS(bogus_page), 0);
 	/* Read back the final entry written; the value is discarded. */
 	sc->match->driver->read_gtt_pte(dev, first_entry + num_entries - 1);
 }
 
 /*
  * Map 'num_entries' pages from the 'pages' array into consecutive GTT
  * entries starting at 'first_entry', passing 'flags' to the per-chip
  * install hook, then read the last entry back through the per-chip read
  * hook.  Each page is asserted (MPASS) to be fully valid and wired.
  *
  * An empty range is a no-op: without the early return the read-back index
  * would be first_entry - 1, which wraps around when first_entry is 0.
  */
 void
 agp_intel_gtt_insert_pages(device_t dev, u_int first_entry, u_int num_entries,
     vm_page_t *pages, u_int flags)
 {
 	struct agp_i810_softc *sc;
 	u_int i;
 
 	if (num_entries == 0)
 		return;
 
 	sc = device_get_softc(dev);
 	for (i = 0; i < num_entries; i++) {
 		MPASS(pages[i]->valid == VM_PAGE_BITS_ALL);
 		MPASS(pages[i]->wire_count > 0);
 		sc->match->driver->install_gtt_pte(dev, first_entry + i,
 		    VM_PAGE_TO_PHYS(pages[i]), flags);
 	}
 	/* Read back the final entry written; the value is discarded. */
 	sc->match->driver->read_gtt_pte(dev, first_entry + num_entries - 1);
 }
 
 /*
  * Return, by value, a snapshot of the GTT parameters held in the softc:
  * stolen size and total/mappable entry counts.  do_idle_maps is reported
  * as 0 and the scratch page is the physical address of bogus_page.
  */
 struct intel_gtt
 agp_intel_gtt_get(device_t dev)
 {
 	struct agp_i810_softc *softc = device_get_softc(dev);
 	struct intel_gtt gtt;
 
 	gtt.gtt_mappable_entries = softc->gtt_mappable_entries;
 	gtt.gtt_total_entries = softc->gtt_total_entries;
 	gtt.stolen_size = softc->stolen_size;
 	gtt.scratch_page_dma = VM_PAGE_TO_PHYS(bogus_page);
 	gtt.do_idle_maps = 0;
 	return (gtt);
 }
 
 /*
  * Chipset flush setup hook for i810: there is nothing to prepare, so it
  * always returns 0.
  */
 static int
 agp_i810_chipset_flush_setup(device_t dev)
 {
 
 	return (0);
 }
 
 /*
  * Chipset flush teardown hook for i810; the counterpart of the no-op setup
  * hook, so it is a no-op as well.
  */
 static void
 agp_i810_chipset_flush_teardown(device_t dev)
 {
 
 	/* Nothing to do. */
 }
 
 /*
  * Chipset flush hook for i810; performs no work.
  */
 static void
 agp_i810_chipset_flush(device_t dev)
 {
 
 	/* Nothing to do. */
 }
 
 /*
  * Chipset flush hook for i830: invalidate the CPU cache, set bit 31 of
  * the HIC register, then poll until the bit reads back as clear, giving up
  * silently after 20000 polls spaced 50 microseconds apart (about 1 sec).
  */
 static void
 agp_i830_chipset_flush(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t hic;
 	int tries;
 
 	pmap_invalidate_cache();
 	hic = bus_read_4(sc->sc_res[0], AGP_I830_HIC);
 	bus_write_4(sc->sc_res[0], AGP_I830_HIC, hic | (1U << 31));
 
 	for (tries = 20000 /* 1 sec */; tries > 0; tries--) {
 		hic = bus_read_4(sc->sc_res[0], AGP_I830_HIC);
 		if ((hic & (1U << 31)) == 0)
 			break;
 		DELAY(50);
 	}
 }
 
 /*
  * Allocate one page of memory resource within [start, end] to act as the
  * flush page.  The request is issued on the parent of the device's parent
  * with resource id 100 and RF_ACTIVE; on success the resource and its
  * kernel virtual address are stored in the softc.
  *
  * Returns 0 on success, or EINVAL (after logging) if the allocation fails.
  */
 static int
 agp_i915_chipset_flush_alloc_page(device_t dev, uint64_t start, uint64_t end)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	device_t vga = device_get_parent(dev);
 
 	sc->sc_flush_page_rid = 100;
 	sc->sc_flush_page_res = BUS_ALLOC_RESOURCE(device_get_parent(vga), dev,
 	    SYS_RES_MEMORY, &sc->sc_flush_page_rid, start, end, PAGE_SIZE,
 	    RF_ACTIVE);
 	if (sc->sc_flush_page_res == NULL) {
 		device_printf(dev, "Failed to allocate flush page at 0x%jx\n",
 		    (uintmax_t)start);
 		return (EINVAL);
 	}
 
 	sc->sc_flush_page_vaddr = rman_get_virtual(sc->sc_flush_page_res);
 	if (bootverbose)
 		device_printf(dev, "Allocated flush page phys 0x%jx virt %p\n",
 		    (uintmax_t)rman_get_start(sc->sc_flush_page_res),
 		    sc->sc_flush_page_vaddr);
 	return (0);
 }
 
 /*
  * Release the flush page resource obtained by
  * agp_i915_chipset_flush_alloc_page().  Does nothing when no resource is
  * held.  The resource is deactivated and released on the same bus it was
  * allocated from, and the softc pointer is then cleared so that the NULL
  * checks here and in the teardown hook keep a later call from releasing
  * the same resource twice.
  */
 static void
 agp_i915_chipset_flush_free_page(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	device_t vga;
 
 	sc = device_get_softc(dev);
 	vga = device_get_parent(dev);
 	if (sc->sc_flush_page_res == NULL)
 		return;
 	BUS_DEACTIVATE_RESOURCE(device_get_parent(vga), dev, SYS_RES_MEMORY,
 	    sc->sc_flush_page_rid, sc->sc_flush_page_res);
 	BUS_RELEASE_RESOURCE(device_get_parent(vga), dev, SYS_RES_MEMORY,
 	    sc->sc_flush_page_rid, sc->sc_flush_page_res);
 	/* Do not leave a dangling pointer behind. */
 	sc->sc_flush_page_res = NULL;
 }
 
 /*
  * Chipset flush setup hook for i915.
  *
  * Reads the IFPADDR config register.  If bit 0 is set the BIOS has already
  * configured a flush page: the page at that address is claimed and the
  * softc notes that it is BIOS-owned.  Otherwise a page is allocated
  * anywhere in the low 4GB and its address, with bit 0 set, is written to
  * IFPADDR.
  *
  * Returns 0 on success or the error from the page allocation.
  */
 static int
 agp_i915_chipset_flush_setup(device_t dev)
 {
 	struct agp_i810_softc *sc = device_get_softc(dev);
 	uint32_t ifp;
 	int error;
 
 	ifp = pci_read_config(sc->bdev, AGP_I915_IFPADDR, 4);
 	if ((ifp & 1) == 0) {
 		/* Nothing configured yet: pick a page ourselves. */
 		sc->sc_bios_allocated_flush_page = 0;
 		error = agp_i915_chipset_flush_alloc_page(dev, 0, 0xffffffff);
 		if (error != 0)
 			return (error);
 		ifp = rman_get_start(sc->sc_flush_page_res);
 		pci_write_config(sc->bdev, AGP_I915_IFPADDR, ifp | 1, 4);
 		return (0);
 	}
 
 	ifp &= ~1;
 	if (bootverbose)
 		device_printf(dev,
 		    "Found already configured flush page at 0x%jx\n",
 		    (uintmax_t)ifp);
 	sc->sc_bios_allocated_flush_page = 1;
 	/*
 	 * In the case BIOS initialized the flush pointer (?)
 	 * register, expect that BIOS also set up the resource
 	 * for the page.
 	 */
 	return (agp_i915_chipset_flush_alloc_page(dev, ifp,
 	    ifp + PAGE_SIZE - 1));
 }
 
 /*
  * Undo agp_i915_chipset_flush_setup(): clear bit 0 of AGP_I915_IFPADDR when
  * the register was programmed by this driver, then free the flush page.
  * Does nothing when no flush page resource is held.
  */
 static void
 agp_i915_chipset_flush_teardown(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t ifp;
 
 	sc = device_get_softc(dev);
 	if (sc->sc_flush_page_res != NULL) {
 		if (!sc->sc_bios_allocated_flush_page) {
 			ifp = pci_read_config(sc->bdev, AGP_I915_IFPADDR, 4);
 			ifp &= ~1;
 			pci_write_config(sc->bdev, AGP_I915_IFPADDR, ifp, 4);
 		}
 		agp_i915_chipset_flush_free_page(dev);
 	}
 }
 
 /*
  * Set up the flush page using the 64-bit AGP_I965_IFPADDR config register
  * pair (low dword at AGP_I965_IFPADDR, high dword at +4).
  *
  * If bit 0 of the low dword is already set, the address found there is
  * reused; otherwise a page is allocated and its address is written back,
  * high dword first, with bit 0 set in the low dword.  Returns 0 or the
  * allocation error.
  */
 static int
 agp_i965_chipset_flush_setup(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint64_t addr;
 	uint32_t hi, lo;
 	int error;
 
 	sc = device_get_softc(dev);
 
 	hi = pci_read_config(sc->bdev, AGP_I965_IFPADDR + 4, 4);
 	lo = pci_read_config(sc->bdev, AGP_I965_IFPADDR, 4);
 
 	if ((lo & 1) == 0) {
 		/* Nothing configured yet: allocate and program it. */
 		sc->sc_bios_allocated_flush_page = 0;
 		error = agp_i915_chipset_flush_alloc_page(dev, 0, ~0);
 		if (error != 0)
 			return (error);
 		addr = rman_get_start(sc->sc_flush_page_res);
 		pci_write_config(sc->bdev, AGP_I965_IFPADDR + 4,
 		    (addr >> 32) & UINT32_MAX, 4);
 		pci_write_config(sc->bdev, AGP_I965_IFPADDR,
 		    (addr & UINT32_MAX) | 1, 4);
 		return (0);
 	}
 
 	addr = ((uint64_t)hi << 32) | (lo & ~1);
 	if (bootverbose)
 		device_printf(dev,
 		    "Found already configured flush page at 0x%jx\n",
 		    (uintmax_t)addr);
 	sc->sc_bios_allocated_flush_page = 1;
 	/*
 	 * In the case BIOS initialized the flush pointer (?)
 	 * register, expect that BIOS also set up the resource
 	 * for the page.
 	 */
 	return (agp_i915_chipset_flush_alloc_page(dev, addr,
 	    addr + PAGE_SIZE - 1));
 }
 
 /*
  * Undo agp_i965_chipset_flush_setup(): clear bit 0 of the low dword of
  * AGP_I965_IFPADDR when the register was programmed by this driver, then
  * free the flush page.  Does nothing when no flush page resource is held.
  */
 static void
 agp_i965_chipset_flush_teardown(device_t dev)
 {
 	struct agp_i810_softc *sc;
 	uint32_t lo;
 
 	sc = device_get_softc(dev);
 	if (sc->sc_flush_page_res != NULL) {
 		if (!sc->sc_bios_allocated_flush_page) {
 			lo = pci_read_config(sc->bdev, AGP_I965_IFPADDR, 4);
 			lo &= ~1;
 			pci_write_config(sc->bdev, AGP_I965_IFPADDR, lo, 4);
 		}
 		agp_i915_chipset_flush_free_page(dev);
 	}
 }
 
 /*
  * Chipset flush for parts that use a flush page: stores the value 1 as a
  * 32-bit word through sc_flush_page_vaddr.
  */
 static void
 agp_i915_chipset_flush(device_t dev)
 {
 	struct agp_i810_softc *sc;
 
 	sc = device_get_softc(dev);
 	/*
 	 * NOTE(review): sc_flush_page_vaddr is not checked here; presumably
 	 * this is only reached after a successful flush page setup -- confirm.
 	 */
 	*(uint32_t *)sc->sc_flush_page_vaddr = 1;
 }
 
 /*
  * Invoke the chipset_flush method of the driver matched for dev.
  * Always returns 0.
  */
 int
 agp_intel_gtt_chipset_flush(device_t dev)
 {
 	struct agp_i810_softc *sc;
 
 	sc = device_get_softc(dev);
 	sc->match->driver->chipset_flush(dev);
 	return (0);
 }
 
 /*
  * Counterpart of agp_intel_gtt_map_memory().  Currently a no-op: the body
  * is empty, so the list passed in is not released here.
  */
 void
 agp_intel_gtt_unmap_memory(device_t dev, struct sglist *sg_list)
 {
 }
 
 /*
  * Build a scatter/gather list describing num_entries pages, one PAGE_SIZE
  * segment per page, and return it through *sg_list.
  *
  * If *sg_list is already non-NULL the function does nothing.  Always
  * returns 0 (the allocation uses M_WAITOK).
  */
 int
 agp_intel_gtt_map_memory(device_t dev, vm_page_t *pages, u_int num_entries,
     struct sglist **sg_list)
 {
 	struct agp_i810_softc *sc;
 	struct sglist *sg;
 	u_int i;
 #if 0
 	int error;
 	bus_dma_tag_t dmat;
 #endif
 
 	if (*sg_list != NULL)
 		return (0);
 	sc = device_get_softc(dev);
 	sg = sglist_alloc(num_entries, M_WAITOK /* XXXKIB */);
 	for (i = 0; i < num_entries; i++) {
 		sg->sg_segs[i].ss_paddr = VM_PAGE_TO_PHYS(pages[i]);
 		sg->sg_segs[i].ss_len = PAGE_SIZE;
 	}
 	/*
 	 * The segments above are stored directly rather than appended
 	 * through the sglist API, so the segment count has to be published
 	 * by hand; agp_intel_gtt_insert_sg_entries() walks sg_nseg entries
 	 * and would otherwise see an empty list.
 	 * NOTE(review): assumes num_entries fits the sg_nseg field -- confirm.
 	 */
 	sg->sg_nseg = num_entries;
 
 #if 0
 	error = bus_dma_tag_create(bus_get_dma_tag(dev),
 	    1 /* alignment */, 0 /* boundary */,
 	    1ULL << sc->match->busdma_addr_mask_sz /* lowaddr */,
 	    BUS_SPACE_MAXADDR /* highaddr */,
             NULL /* filtfunc */, NULL /* filtfuncarg */,
 	    BUS_SPACE_MAXADDR /* maxsize */,
 	    BUS_SPACE_UNRESTRICTED /* nsegments */,
 	    BUS_SPACE_MAXADDR /* maxsegsz */,
 	    0 /* flags */, NULL /* lockfunc */, NULL /* lockfuncarg */,
 	    &dmat);
 	if (error != 0) {
 		sglist_free(sg);
 		return (error);
 	}
 	/* XXXKIB */
 #endif
 	*sg_list = sg;
 	return (0);
 }
 
 /*
  * Install GTT entries for every AGP page covered by the segments of
  * sg_list, starting at GTT index first_entry, then read back the last
  * entry written.
  *
  * j walks the segments; i counts the GTT entries installed so far.
  */
 void
 agp_intel_gtt_insert_sg_entries(device_t dev, struct sglist *sg_list,
     u_int first_entry, u_int flags)
 {
 	struct agp_i810_softc *sc;
 	vm_paddr_t spaddr;
 	size_t slen;
 	u_int i, j;
 
 	sc = device_get_softc(dev);
 	for (i = j = 0; j < sg_list->sg_nseg; j++) {
 		/* Index segments by j; i is the running PTE count. */
 		spaddr = sg_list->sg_segs[j].ss_paddr;
 		slen = sg_list->sg_segs[j].ss_len;
 		for (; slen > 0; i++) {
 			sc->match->driver->install_gtt_pte(dev, first_entry + i,
 			    spaddr, flags);
 			spaddr += AGP_PAGE_SIZE;
 			/* Clamp so a partial trailing page cannot wrap slen. */
 			slen -= (slen < AGP_PAGE_SIZE) ? slen : AGP_PAGE_SIZE;
 		}
 	}
 	/* Only read back when at least one entry was actually written. */
 	if (i > 0)
 		sc->match->driver->read_gtt_pte(dev, first_entry + i - 1);
 }
 
 /*
  * Convenience wrapper: agp_intel_gtt_clear_range() on the global intel_agp
  * device.
  */
 void
 intel_gtt_clear_range(u_int first_entry, u_int num_entries)
 {
 
 	agp_intel_gtt_clear_range(intel_agp, first_entry, num_entries);
 }
 
 /*
  * Convenience wrapper: agp_intel_gtt_insert_pages() on the global intel_agp
  * device.
  */
 void
 intel_gtt_insert_pages(u_int first_entry, u_int num_entries, vm_page_t *pages,
     u_int flags)
 {
 
 	agp_intel_gtt_insert_pages(intel_agp, first_entry, num_entries,
 	    pages, flags);
 }
 
 /*
  * Convenience wrapper: returns, by value, the struct intel_gtt produced by
  * agp_intel_gtt_get() for the global intel_agp device.
  */
 struct intel_gtt
 intel_gtt_get(void)
 {
 
 	return (agp_intel_gtt_get(intel_agp));
 }
 
 /*
  * Convenience wrapper: agp_intel_gtt_chipset_flush() on the global
  * intel_agp device; its return value is passed through.
  */
 int
 intel_gtt_chipset_flush(void)
 {
 
 	return (agp_intel_gtt_chipset_flush(intel_agp));
 }
 
 /*
  * Convenience wrapper: agp_intel_gtt_unmap_memory() on the global intel_agp
  * device.
  */
 void
 intel_gtt_unmap_memory(struct sglist *sg_list)
 {
 
 	agp_intel_gtt_unmap_memory(intel_agp, sg_list);
 }
 
 /*
  * Convenience wrapper: agp_intel_gtt_map_memory() on the global intel_agp
  * device; its return value is passed through.
  */
 int
 intel_gtt_map_memory(vm_page_t *pages, u_int num_entries,
     struct sglist **sg_list)
 {
 
 	return (agp_intel_gtt_map_memory(intel_agp, pages, num_entries,
 	    sg_list));
 }
 
 /*
  * Convenience wrapper: agp_intel_gtt_insert_sg_entries() on the global
  * intel_agp device.
  */
 void
 intel_gtt_insert_sg_entries(struct sglist *sg_list, u_int first_entry,
     u_int flags)
 {
 
 	agp_intel_gtt_insert_sg_entries(intel_agp, sg_list, first_entry, flags);
 }
 
 /*
  * Return the bdev member of the softc of the global intel_agp device.
  */
 device_t
 intel_gtt_get_bridge_device(void)
 {
 	struct agp_i810_softc *sc;
 
 	sc = device_get_softc(intel_agp);
 	return (sc->bdev);
 }
 
 /*
  * Call the matched driver's read_gtt_pte_paddr method for GTT index entry
  * on the global intel_agp device and return its result.
  */
 vm_paddr_t
 intel_gtt_read_pte_paddr(u_int entry)
 {
 	struct agp_i810_softc *sc;
 
 	sc = device_get_softc(intel_agp);
 	return (sc->match->driver->read_gtt_pte_paddr(intel_agp, entry));
 }
 
 /*
  * Call the matched driver's read_gtt_pte method for GTT index entry on the
  * global intel_agp device and return its result.
  */
 u_int32_t
 intel_gtt_read_pte(u_int entry)
 {
 	struct agp_i810_softc *sc;
 
 	sc = device_get_softc(intel_agp);
 	return (sc->match->driver->read_gtt_pte(intel_agp, entry));
 }
 
 /*
  * Call the matched driver's write_gtt method to store val at GTT index
  * entry on the global intel_agp device.
  */
 void
 intel_gtt_write(u_int entry, uint32_t val)
 {
 	struct agp_i810_softc *sc;
 
 	sc = device_get_softc(intel_agp);
 	/* A void function must not return an expression; just call it. */
 	sc->match->driver->write_gtt(intel_agp, entry, val);
 }
Index: user/attilio/rm_vmobj_cache/sys/dev/cxgbe/tom/t4_ddp.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/dev/cxgbe/tom/t4_ddp.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/dev/cxgbe/tom/t4_ddp.c	(revision 267237)
@@ -1,1285 +1,1285 @@
 /*-
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/tcp_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/toecore.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_tcb.h"
 #include "tom/t4_tom.h"
 
 #define PPOD_SZ(n)	((n) * sizeof(struct pagepod))
 #define PPOD_SIZE	(PPOD_SZ(1))
 
 /*
  * DDP page sizes in bytes.  The index into this table is the page size
  * index carried in a buffer's tag (V_PPOD_PGSZ / G_PPOD_PGSZ).
  * XXX: must match A_ULP_RX_TDDP_PSZ
  */
 static int t4_ddp_pgsz[] = {4096, 4096 << 2, 4096 << 4, 4096 << 6};
 
 #if 0
 /*
  * Debug helper (compiled out): print the TCB of the given tid as four rows
  * of eight 32-bit words read through memory window 2.
  */
 static void
 t4_dump_tcb(struct adapter *sc, int tid)
 {
 	uint32_t tcb_base, off, i, j;
 
 	/* Dump TCB for the tid */
 	tcb_base = t4_read_reg(sc, A_TP_CMM_TCB_BASE);
 	/* Point the window at this tid's TCB, then read the register back. */
 	t4_write_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2),
 	    tcb_base + tid * TCB_SIZE);
 	t4_read_reg(sc, PCIE_MEM_ACCESS_REG(A_PCIE_MEM_ACCESS_OFFSET, 2));
 	off = 0;
 	printf("\n");
 	for (i = 0; i < 4; i++) {
 		uint32_t buf[8];
 		/* off keeps advancing across rows, 4 bytes per word. */
 		for (j = 0; j < 8; j++, off += 4)
 			buf[j] = htonl(t4_read_reg(sc, MEMWIN2_BASE + off));
 
 		printf("%08x %08x %08x %08x %08x %08x %08x %08x\n",
 		    buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6],
 		    buf[7]);
 	}
 }
 #endif
 
 #define MAX_DDP_BUFFER_SIZE		(M_TCB_RX_DDP_BUF0_LEN)
 /*
  * Reserve a run of n page pods for region pr and link pr into td->ppods.
  *
  * Free space is tracked as a count of free pods ahead of the first region
  * (nppods_free_head) plus a per-region count of free pods that follow each
  * region's used pods.  Returns the index of the first reserved pod, or -1
  * when no single free run of at least n pods exists.  td->ppod_lock is
  * taken and released internally.
  */
 static int
 alloc_ppods(struct tom_data *td, int n, struct ppod_region *pr)
 {
 	int ppod;
 
 	KASSERT(n > 0, ("%s: nonsense allocation (%d)", __func__, n));
 
 	mtx_lock(&td->ppod_lock);
 	/* Not enough free pods overall, whatever the fragmentation. */
 	if (n > td->nppods_free) {
 		mtx_unlock(&td->ppod_lock);
 		return (-1);
 	}
 
 	if (td->nppods_free_head >= n) {
 		/* Carve the last n pods out of the free run at the head. */
 		td->nppods_free_head -= n;
 		ppod = td->nppods_free_head;
 		TAILQ_INSERT_HEAD(&td->ppods, pr, link);
 	} else {
 		struct ppod_region *p;
 
 		/* ppod tracks the pod index just past the region visited. */
 		ppod = td->nppods_free_head;
 		TAILQ_FOREACH(p, &td->ppods, link) {
 			ppod += p->used + p->free;
 			if (n <= p->free) {
 				/* Take the tail end of p's free pods. */
 				ppod -= n;
 				p->free -= n;
 				TAILQ_INSERT_AFTER(&td->ppods, p, pr, link);
 				goto allocated;
 			}
 		}
 
 		/* After a full walk the running index must equal nppods. */
 		if (__predict_false(ppod != td->nppods)) {
 			panic("%s: ppods TAILQ (%p) corrupt."
 			    "  At %d instead of %d at the end of the queue.",
 			    __func__, &td->ppods, ppod, td->nppods);
 		}
 
 		/* Enough pods in total, but no single run is large enough. */
 		mtx_unlock(&td->ppod_lock);
 		return (-1);
 	}
 
 allocated:
 	pr->used = n;
 	pr->free = 0;
 	td->nppods_free -= n;
 	mtx_unlock(&td->ppod_lock);
 
 	return (ppod);
 }
 
 /*
  * Return the pods of region pr to the free pool and unlink pr from
  * td->ppods.  The pods (used plus trailing free) are folded into the free
  * count of the preceding region, or into the head free count when pr is
  * the first region.  td->ppod_lock is taken and released internally.
  */
 static void
 free_ppods(struct tom_data *td, struct ppod_region *pr)
 {
 	struct ppod_region *prev;
 
 	KASSERT(pr->used > 0, ("%s: nonsense free (%d)", __func__, pr->used));
 
 	mtx_lock(&td->ppod_lock);
 	prev = TAILQ_PREV(pr, ppod_head, link);
 	if (prev == NULL)
 		td->nppods_free_head += pr->used + pr->free;
 	else
 		prev->free += pr->used + pr->free;
 	td->nppods_free += pr->used;
 	KASSERT(td->nppods_free <= td->nppods,
 	    ("%s: nppods_free (%d) > nppods (%d).  %d freed this time.",
 	    __func__, td->nppods_free, td->nppods, pr->used));
 	TAILQ_REMOVE(&td->ppods, pr, link);
 	mtx_unlock(&td->ppod_lock);
 }
 
 /*
  * Number of page pods needed to describe npages VM pages when the DDP page
  * size is ddp_pgsz bytes; each pod covers PPOD_PAGES DDP pages.
  */
 static inline int
 pages_to_nppods(int npages, int ddp_pgsz)
 {
 	int ddp_pages;
 
 	/* Convert the VM page count into a count of DDP-sized pages. */
 	ddp_pages = npages * PAGE_SIZE / ddp_pgsz;
 	return (howmany(ddp_pages, PPOD_PAGES));
 }
 
 /*
  * Release a DDP buffer: its page array, its page pods (if it holds any)
  * and the buffer structure itself.  A NULL db is accepted and ignored.
  */
 static void
 free_ddp_buffer(struct tom_data *td, struct ddp_buffer *db)
 {
 
 	if (db != NULL) {
 		if (db->pages != NULL)
 			free(db->pages, M_CXGBE);
 		if (db->nppods > 0)
 			free_ppods(td, &db->ppod_region);
 		free(db, M_CXGBE);
 	}
 }
 
 /*
  * Free every DDP buffer still attached to the toep and clear its slot.
  */
 void
 release_ddp_resources(struct toepcb *toep)
 {
 	int slot;
 
 	for (slot = 0; slot < nitems(toep->db); slot++) {
 		if (toep->db[slot] == NULL)
 			continue;
 		free_ddp_buffer(toep->td, toep->db[slot]);
 		toep->db[slot] = NULL;
 	}
 }
 
 /* XXX: handle_ddp_data code duplication */
 /*
  * Account for n bytes of DDP data on the connection: advance rcv_nxt, shrink
  * the receive window (unless USE_DDP_RX_FLOW_CONTROL), update rx credits and
  * append the mbuf from get_ddp_mbuf(n) to the socket's receive buffer.
  *
  * The caller must hold the inpcb write lock and the receive sockbuf lock
  * (both are asserted).
  */
 void
 insert_ddp_data(struct toepcb *toep, uint32_t n)
 {
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp = intotcpcb(inp);
 	struct sockbuf *sb = &inp->inp_socket->so_rcv;
 	struct mbuf *m;
 
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK_ASSERT(sb);
 
 	m = get_ddp_mbuf(n);
 	tp->rcv_nxt += n;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= n, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= n;
 #endif
 
 	/* Bytes drained from the sockbuf since the last update become credits. */
 	KASSERT(toep->sb_cc >= sb->sb_cc,
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sb->sb_cc, toep->sb_cc));
 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits -= n;	/* adjust for F_RX_FC_DDP */
 #endif
 	sbappendstream_locked(sb, m);
 	/* Remember the new sockbuf byte count for the next credit update. */
 	toep->sb_cc = sb->sb_cc;
 }
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 /* RX_DATA_ACK sent as a ULP command looks like this */
 #define LEN__RX_DATA_ACK_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_rx_data_ack_core))
 
 /*
  * Write one SET_TCB_FIELD request, wrapped as a ULP_TX_PKT command with
  * immediate data, at ulpmc.  If the command length is not a multiple of 16
  * a NOOP sub-command is added after it.  Returns the address just past
  * what was written, where the next command can start.
  */
 static inline void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep,
     uint64_t word, uint64_t mask, uint64_t val)
 {
 	struct ulptx_idata *idata;
 	struct cpl_set_tcb_field_core *cpl;
 
 	/* ULP_TX_PKT header. */
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	/* Immediate-data sub-command carrying the CPL. */
 	idata = (struct ulptx_idata *)(ulpmc + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	idata->len = htobe32(sizeof(*cpl));
 
 	/* The CPL itself; no reply is requested. */
 	cpl = (struct cpl_set_tcb_field_core *)(idata + 1);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tid));
 	cpl->reply_ctrl = htobe16(V_NO_REPLY(1) |
 	    V_QUEUENO(toep->ofld_rxq->iq.abs_id));
 	cpl->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
 	cpl->mask = htobe64(mask);
 	cpl->val = htobe64(val);
 
 	idata = (struct ulptx_idata *)(cpl + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16 == 0)
 		return (idata);
 
 	/* Pad with a NOOP sub-command. */
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	idata->len = htobe32(0);
 	return (idata + 1);
 }
 
 /*
  * Write one RX_DATA_ACK request (with F_RX_MODULATE_RX set), wrapped as a
  * ULP_TX_PKT command with immediate data, at ulpmc.  If the command length
  * is not a multiple of 16 a NOOP sub-command is added after it.  Returns
  * the address just past what was written.
  */
 static inline void *
 mk_rx_data_ack_ulp(struct ulp_txpkt *ulpmc, struct toepcb *toep)
 {
 	struct ulptx_idata *idata;
 	struct cpl_rx_data_ack_core *cpl;
 
 	/* ULP_TX_PKT header. */
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__RX_DATA_ACK_ULP, 16));
 
 	/* Immediate-data sub-command carrying the CPL. */
 	idata = (struct ulptx_idata *)(ulpmc + 1);
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	idata->len = htobe32(sizeof(*cpl));
 
 	cpl = (struct cpl_rx_data_ack_core *)(idata + 1);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tid));
 	cpl->credit_dack = htobe32(F_RX_MODULATE_RX);
 
 	idata = (struct ulptx_idata *)(cpl + 1);
 	if (LEN__RX_DATA_ACK_ULP % 16 == 0)
 		return (idata);
 
 	/* Pad with a NOOP sub-command. */
 	idata->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 	idata->len = htobe32(0);
 	return (idata + 1);
 }
 
 /*
  * Compute the DDP flag bits for activating buffer db_idx (0 or 1).
  *
  * The selected buffer is marked valid and active.  MSG_WAITALL sets the
  * buffer's PUSH_DISABLE bit; otherwise the buffer's FLUSH bit is set only
  * for non-blocking receives.
  */
 static inline uint64_t
 select_ddp_flags(struct socket *so, int flags, int db_idx)
 {
 	uint64_t ddp_flags = V_TF_DDP_INDICATE_OUT(0);
 	int waitall = flags & MSG_WAITALL;
 	int nonblock = so->so_state & SS_NBIO ||
 	    flags & (MSG_DONTWAIT | MSG_NBIO);
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	if (db_idx != 0) {
 		ddp_flags |= V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1);
 		if (waitall)
 			ddp_flags |= V_TF_DDP_PUSH_DISABLE_1(1);
 		else
 			ddp_flags |= V_TF_DDP_BUF1_FLUSH(nonblock ? 1 : 0);
 		return (ddp_flags);
 	}
 
 	ddp_flags |= V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0);
 	if (waitall)
 		ddp_flags |= V_TF_DDP_PUSH_DISABLE_0(1);
 	else
 		ddp_flags |= V_TF_DDP_BUF0_FLUSH(nonblock ? 1 : 0);
 	return (ddp_flags);
 }
 
 /*
  * Build (but do not send) the compound work request that points the TCB at
  * DDP buffer db_idx: it sets the buffer's tag, its offset/length, the DDP
  * flags, and finishes with an RX_DATA_ACK.
  *
  * Returns the work request, or NULL if alloc_wrqe() fails.
  */
 static struct wrqe *
 mk_update_tcb_for_ddp(struct adapter *sc, struct toepcb *toep, int db_idx,
     int offset, uint64_t ddp_flags)
 {
 	struct ddp_buffer *db = toep->db[db_idx];
 	struct wrqe *wr;
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int len;
 
 	KASSERT(db_idx == 0 || db_idx == 1,
 	    ("%s: bad DDP buffer index %d", __func__, db_idx));
 
 	/*
 	 * We'll send a compound work request that has 3 SET_TCB_FIELDs and an
 	 * RX_DATA_ACK (with RX_MODULATE to speed up delivery).
 	 *
 	 * The work request header is 16B and always ends at a 16B boundary.
 	 * The ULPTX master commands that follow must all end at 16B boundaries
 	 * too so we round up the size to 16.
 	 */
 	len = sizeof(*wrh) + 3 * roundup2(LEN__SET_TCB_FIELD_ULP, 16) +
 	    roundup2(LEN__RX_DATA_ACK_ULP, 16);
 
 	wr = alloc_wrqe(len, toep->ctrlq);
 	if (wr == NULL)
 		return (NULL);
 	wrh = wrtod(wr);
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 
 	/* Write the buffer's tag */
 	/* The TCB word for buffer 1's tag is the buffer 0 word plus db_idx. */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 	    W_TCB_RX_DDP_BUF0_TAG + db_idx,
 	    V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
 	    V_TCB_RX_DDP_BUF0_TAG(db->tag));
 
 	/* Update the current offset in the DDP buffer and its total length */
 	if (db_idx == 0)
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF0_OFFSET,
 		    V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
 		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
 		    V_TCB_RX_DDP_BUF0_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF0_LEN(db->len));
 	else
 		/* For buffer 1 the length is shifted up by a further 32 bits. */
 		ulpmc = mk_set_tcb_field_ulp(ulpmc, toep,
 		    W_TCB_RX_DDP_BUF1_OFFSET,
 		    V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)M_TCB_RX_DDP_BUF1_LEN << 32),
 		    V_TCB_RX_DDP_BUF1_OFFSET(offset) |
 		    V_TCB_RX_DDP_BUF1_LEN((u64)db->len << 32));
 
 	/* Update DDP flags */
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, toep, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_BUF0_FLUSH(1) | V_TF_DDP_BUF1_FLUSH(1) |
 	    V_TF_DDP_PUSH_DISABLE_0(1) | V_TF_DDP_PUSH_DISABLE_1(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1) |
 	    V_TF_DDP_ACTIVE_BUF(1) | V_TF_DDP_INDICATE_OUT(1), ddp_flags);
 
 	/* Gratuitous RX_DATA_ACK with RX_MODULATE set to speed up delivery. */
 	/* The returned cursor is not used again after this last command. */
 	ulpmc = mk_rx_data_ack_ulp(ulpmc, toep);
 
 	return (wr);
 }
 
 /*
  * Lower the connection's DDP score by one.  When the score drops to zero,
  * clear DDP_OK and record the time in ddp_disabled.  A score that is
  * already zero is left alone.
  */
 static void
 discourage_ddp(struct toepcb *toep)
 {
 
 	if (toep->ddp_score == 0)
 		return;
 	if (--toep->ddp_score != 0)
 		return;
 
 	toep->ddp_flags &= ~DDP_OK;
 	toep->ddp_disabled = time_uptime;
 	CTR3(KTR_CXGBE, "%s: tid %u !DDP_OK @ %u",
 	    __func__, toep->tid, time_uptime);
 }
 
 /*
  * Process a DDP report for a connection: account for the data placed,
  * append a DDP mbuf to the receive sockbuf, clear the ACTIVE flag of the
  * buffer named in the report and wake up the receiver.
  *
  * ddp_report and rcv_nxt are big-endian; len is in host order.  Takes and
  * releases the inpcb write lock and the receive sockbuf lock internally.
  * Always returns 0.
  */
 static int
 handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len)
 {
 	uint32_t report = be32toh(ddp_report);
 	unsigned int db_flag;
 	struct inpcb *inp = toep->inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	struct sockbuf *sb;
 	struct mbuf *m;
 
 	/* Which buffer the report is about. */
 	db_flag = report & F_DDP_BUF_IDX ? DDP_BUF1_ACTIVE : DDP_BUF0_ACTIVE;
 
 	if (__predict_false(!(report & F_DDP_INV)))
 		CXGBE_UNIMPLEMENTED("DDP buffer still valid");
 
 	INP_WLOCK(inp);
 	so = inp_inpcbtosocket(inp);
 	sb = &so->so_rcv;
 	if (__predict_false(inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) {
 
 		/*
 		 * XXX: think a bit more.
 		 * tcpcb probably gone, but socket should still be around
 		 * because we always wait for DDP completion in soreceive no
 		 * matter what.  Just wake it up and let it clean up.
 		 */
 
 		CTR5(KTR_CXGBE, "%s: tid %u, seq 0x%x, len %d, inp_flags 0x%x",
 		    __func__, toep->tid, be32toh(rcv_nxt), len, inp->inp_flags);
 		SOCKBUF_LOCK(sb);
 		goto wakeup;
 	}
 
 	tp = intotcpcb(inp);
 	/* Add the gap between the reported sequence number and tp->rcv_nxt. */
 	len += be32toh(rcv_nxt) - tp->rcv_nxt;
 	tp->rcv_nxt += len;
 	tp->t_rcvtime = ticks;
 #ifndef USE_DDP_RX_FLOW_CONTROL
 	KASSERT(tp->rcv_wnd >= len, ("%s: negative window size", __func__));
 	tp->rcv_wnd -= len;
 #endif
 	m = get_ddp_mbuf(len);
 
 	SOCKBUF_LOCK(sb);
 	/* A completely filled buffer restores the score; otherwise lower it. */
 	if (report & F_DDP_BUF_COMPLETE)
 		toep->ddp_score = DDP_HIGH_SCORE;
 	else
 		discourage_ddp(toep);
 
 	/* Bytes drained from the sockbuf since the last update become credits. */
 	KASSERT(toep->sb_cc >= sb->sb_cc,
 	    ("%s: sb %p has more data (%d) than last time (%d).",
 	    __func__, sb, sb->sb_cc, toep->sb_cc));
 	toep->rx_credits += toep->sb_cc - sb->sb_cc;
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	toep->rx_credits -= len;	/* adjust for F_RX_FC_DDP */
 #endif
 	sbappendstream_locked(sb, m);
 	toep->sb_cc = sb->sb_cc;
 wakeup:
 	/* Reached with the sockbuf locked on both paths. */
 	KASSERT(toep->ddp_flags & db_flag,
 	    ("%s: DDP buffer not active. toep %p, ddp_flags 0x%x, report 0x%x",
 	    __func__, toep, toep->ddp_flags, report));
 	toep->ddp_flags &= ~db_flag;
 	/* The assertion below expects this call to drop the sockbuf lock. */
 	sorwakeup_locked(so);
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #define DDP_ERR (F_DDP_PPOD_MISMATCH | F_DDP_LLIMIT_ERR | F_DDP_ULIMIT_ERR |\
 	 F_DDP_PPOD_PARITY_ERR | F_DDP_PADDING_ERR | F_DDP_OFFSET_ERR |\
 	 F_DDP_INVALID_TAG | F_DDP_COLOR_ERR | F_DDP_TID_MISMATCH |\
 	 F_DDP_INVALID_PPOD | F_DDP_HDRCRC_ERR | F_DDP_DATACRC_ERR)
 
 /*
  * Handler for a cpl_rx_data_ddp message: panics if any DDP_ERR bit is set
  * in ddpvld, otherwise passes the report, sequence number and length to
  * handle_ddp_data().  No payload mbuf is expected.  Always returns 0.
  */
 static int
 do_rx_data_ddp(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	/* The CPL message immediately follows the RSS header. */
 	const struct cpl_rx_data_ddp *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	uint32_t vld;
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	vld = be32toh(cpl->ddpvld);
 	if (__predict_false(vld & DDP_ERR)) {
 		panic("%s: DDP error 0x%x (tid %d, toep %p)",
 		    __func__, vld, tid, toep);
 	}
 
 	handle_ddp_data(toep, cpl->u.ddp_report, cpl->seq, be16toh(cpl->len));
 
 	return (0);
 }
 
 /*
  * Handler for a cpl_rx_ddp_complete message: passes the report and rcv_nxt
  * to handle_ddp_data() with a length of 0.  No payload mbuf is expected.
  * Always returns 0.
  */
 static int
 do_rx_ddp_complete(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	/* The CPL message immediately follows the RSS header. */
 	const struct cpl_rx_ddp_complete *cpl = (const void *)(rss + 1);
 	unsigned int tid = GET_TID(cpl);
 	struct toepcb *toep = lookup_tid(sc, tid);
 
 	KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__));
 	KASSERT(toep->tid == tid, ("%s: toep tid/atid mismatch", __func__));
 	KASSERT(!(toep->flags & TPF_SYNQE),
 	    ("%s: toep %p claims to be a synq entry", __func__, toep));
 
 	handle_ddp_data(toep, cpl->ddp_report, cpl->rcv_nxt, 0);
 
 	return (0);
 }
 
 /*
  * Request that DDP be turned on for the connection.  Expects DDP_OK to be
  * set and DDP_ON / DDP_SC_REQ to be clear (asserted).  Marks the request
  * as pending with DDP_SC_REQ and issues two TCB updates.
  */
 void
 enable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_OK | DDP_SC_REQ)) == DDP_OK,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp_flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	toep->ddp_flags |= DDP_SC_REQ;
 	/*
 	 * NOTE(review): the trailing arguments are presumably (word, mask,
 	 * value), as in mk_set_tcb_field_ulp() -- confirm.  Read that way,
 	 * this sets both BUFn_INDICATE bits and clears DDP_OFF, INDICATE_OUT
 	 * and both BUFn_VALID bits.
 	 */
 	t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS,
 	    V_TF_DDP_OFF(1) | V_TF_DDP_INDICATE_OUT(1) |
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1) |
 	    V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_BUF1_VALID(1),
 	    V_TF_DDP_BUF0_INDICATE(1) | V_TF_DDP_BUF1_INDICATE(1));
 	/* Under the same reading, this clears RCV_COALESCE_ENABLE. */
 	t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), 0);
 }
 
 /*
  * Request that DDP be turned off for the connection.  Expects DDP_ON to be
  * set and DDP_SC_REQ to be clear (asserted).  Marks the request as pending
  * with DDP_SC_REQ and issues two TCB updates, in the reverse order of
  * enable_ddp().
  */
 static inline void
 disable_ddp(struct adapter *sc, struct toepcb *toep)
 {
 
 	KASSERT((toep->ddp_flags & (DDP_ON | DDP_SC_REQ)) == DDP_ON,
 	    ("%s: toep %p has bad ddp_flags 0x%x",
 	    __func__, toep, toep->ddp_flags));
 
 	CTR3(KTR_CXGBE, "%s: tid %u (time %u)",
 	    __func__, toep->tid, time_uptime);
 
 	toep->ddp_flags |= DDP_SC_REQ;
 	/*
 	 * NOTE(review): the trailing arguments are presumably (word, mask,
 	 * value) -- confirm.  Read that way, RCV_COALESCE_ENABLE is set first
 	 * and DDP_OFF is set second.
 	 */
 	t4_set_tcb_field(sc, toep, 1, W_TCB_T_FLAGS,
 	    V_TF_RCV_COALESCE_ENABLE(1), V_TF_RCV_COALESCE_ENABLE(1));
 	t4_set_tcb_field(sc, toep, 1, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
 	    V_TF_DDP_OFF(1));
 }
 
 /*
  * Fault in and hold the pages backing the single iovec of uio.
  *
  * On success returns 0 and hands back a malloc'ed array of held pages in
  * *ppages with its length in *pnpages.  Returns E2BIG if the page-aligned
  * span exceeds MAX_DDP_BUFFER_SIZE, ENOMEM if the array cannot be
  * allocated, or EFAULT if the pages cannot be held.
  */
 static int
 hold_uio(struct uio *uio, vm_page_t **ppages, int *pnpages)
 {
 	struct vm_map *map;
 	struct iovec *iov;
 	vm_offset_t first, last;
 	vm_page_t *held;
 	int npages;
 
 	KASSERT(uio->uio_iovcnt == 1,
 	    ("%s: uio_iovcnt %d", __func__, uio->uio_iovcnt));
 	KASSERT(uio->uio_td->td_proc == curproc,
 	    ("%s: uio proc (%p) is not curproc (%p)",
 	    __func__, uio->uio_td->td_proc, curproc));
 
 	map = &curproc->p_vmspace->vm_map;
 	iov = &uio->uio_iov[0];
 
 	/* Page-aligned span that covers the user buffer. */
 	first = trunc_page((uintptr_t)iov->iov_base);
 	last = round_page((vm_offset_t)iov->iov_base + iov->iov_len);
 	if (last - first > MAX_DDP_BUFFER_SIZE)
 		return (E2BIG);
 	npages = howmany(last - first, PAGE_SIZE);
 
 	held = malloc(npages * sizeof(vm_page_t), M_CXGBE, M_NOWAIT);
 	if (held == NULL)
 		return (ENOMEM);
 
 	if (vm_fault_quick_hold_pages(map, (vm_offset_t)iov->iov_base,
 	    iov->iov_len, VM_PROT_WRITE, held, npages) >= 0) {
 		*ppages = held;
 		*pnpages = npages;
 		return (0);
 	}
 
 	free(held, M_CXGBE);
 	return (EFAULT);
 }
 
 /*
  * Compare a DDP buffer against a page list, offset and length.
  *
  * Returns 0 when db is non-NULL, has the same page count, offset and
  * length, and every page has the same physical address; 1 otherwise.
  */
 static int
 bufcmp(struct ddp_buffer *db, vm_page_t *pages, int npages, int offset, int len)
 {
 	int pg;
 
 	if (db == NULL)
 		return (1);
 	if (db->npages != npages || db->offset != offset || db->len != len)
 		return (1);
 
 	pg = 0;
 	while (pg < npages) {
 		if (pages[pg]->phys_addr != db->pages[pg]->phys_addr)
 			return (1);
 		pg++;
 	}
 
 	return (0);
 }
 
 /*
  * Highest common factor of n1 and n2, computed by repeated remainder.
  * If one argument is 0 the other is returned.
  */
 static int
 calculate_hcf(int n1, int n2)
 {
 	int lo, hi, rem;
 
 	/* Start with lo as the smaller value and hi as the larger. */
 	lo = (n1 <= n2) ? n1 : n2;
 	hi = (n1 <= n2) ? n2 : n1;
 
 	for (; lo != 0; lo = rem) {
 		rem = hi % lo;
 		hi = lo;
 	}
 
 	return (hi);
 }
 
 /*
  * Create a DDP buffer describing the given page list: choose a DDP page
  * size, reserve the page pods needed and fill in the buffer's tag and
  * bookkeeping fields.  The pages array is stored in the buffer, not copied.
  *
  * Returns the new buffer, or NULL if no usable page size exists, the
  * buffer cannot be allocated, or no page pods are available.  On failure
  * the pages array is left untouched.
  */
 static struct ddp_buffer *
 alloc_ddp_buffer(struct tom_data *td, vm_page_t *pages, int npages, int offset,
     int len)
 {
 	int i, hcf, seglen, idx, ppod, nppods;
 	struct ddp_buffer *db;
 
 	/*
 	 * The DDP page size is unrelated to the VM page size.  We combine
 	 * contiguous physical pages into larger segments to get the best DDP
 	 * page size possible.  This is the largest of the four sizes in
 	 * A_ULP_RX_TDDP_PSZ that evenly divides the HCF of the segment sizes in
 	 * the page list.
 	 */
 	hcf = 0;
 	for (i = 0; i < npages; i++) {
 		/* Extend the segment while the next page is adjacent. */
 		seglen = PAGE_SIZE;
 		while (i < npages - 1 &&
 		    pages[i]->phys_addr + PAGE_SIZE == pages[i + 1]->phys_addr) {
 			seglen += PAGE_SIZE;
 			i++;
 		}
 
 		hcf = calculate_hcf(hcf, seglen);
 		if (hcf < t4_ddp_pgsz[1]) {
 			idx = 0;
 			goto have_pgsz;	/* give up, short circuit */
 		}
 	}
 
 	if (hcf % t4_ddp_pgsz[0] != 0) {
 		/* hmmm.  This could only happen when PAGE_SIZE < 4K */
 		KASSERT(PAGE_SIZE < 4096,
 		    ("%s: PAGE_SIZE %d, hcf %d", __func__, PAGE_SIZE, hcf));
 		CTR3(KTR_CXGBE, "%s: PAGE_SIZE %d, hcf %d",
 		    __func__, PAGE_SIZE, hcf);
 		return (NULL);
 	}
 
 	/* Largest table entry that divides hcf; falls through to index 0. */
 	for (idx = nitems(t4_ddp_pgsz) - 1; idx > 0; idx--) {
 		if (hcf % t4_ddp_pgsz[idx] == 0)
 			break;
 	}
 have_pgsz:
 
 	db = malloc(sizeof(*db), M_CXGBE, M_NOWAIT);
 	if (db == NULL) {
 		CTR1(KTR_CXGBE, "%s: malloc failed.", __func__);
 		return (NULL);
 	}
 
 	nppods = pages_to_nppods(npages, t4_ddp_pgsz[idx]);
 	ppod = alloc_ppods(td, nppods, &db->ppod_region);
 	if (ppod < 0) {
 		free(db, M_CXGBE);
 		CTR4(KTR_CXGBE, "%s: no pods, nppods %d, resid %d, pgsz %d",
 		    __func__, nppods, len, t4_ddp_pgsz[idx]);
 		return (NULL);
 	}
 
 	KASSERT(idx <= M_PPOD_PGSZ && ppod <= M_PPOD_TAG,
 	    ("%s: DDP pgsz_idx = %d, ppod = %d", __func__, idx, ppod));
 
 	/* The tag combines the page size index and the first pod index. */
 	db->tag = V_PPOD_PGSZ(idx) | V_PPOD_TAG(ppod);
 	db->nppods = nppods;
 	db->npages = npages;
 	db->pages = pages;
 	db->offset = offset;
 	db->len = len;
 
 	CTR6(KTR_CXGBE, "New DDP buffer.  "
 	    "ddp_pgsz %d, ppod 0x%x, npages %d, nppods %d, offset %d, len %d",
 	    t4_ddp_pgsz[idx], ppod, db->npages, db->nppods, db->offset,
 	    db->len);
 
 	return (db);
 }
 
 #define NUM_ULP_TX_SC_IMM_PPODS (256 / PPOD_SIZE)
 
 /*
  * Write the page pods of a DDP buffer to the adapter through ULP_TX memory
  * write work requests, at most NUM_ULP_TX_SC_IMM_PPODS pods per request.
  *
  * Returns 0 on success or ENOMEM if a work request cannot be allocated;
  * in that case requests built in earlier iterations have already been
  * handed to t4_wrq_tx().
  */
 static int
 write_page_pods(struct adapter *sc, struct toepcb *toep, struct ddp_buffer *db)
 {
 	struct wrqe *wr;
 	struct ulp_mem_io *ulpmc;
 	struct ulptx_idata *ulpsc;
 	struct pagepod *ppod;
 	int i, j, k, n, chunk, len, ddp_pgsz, idx, ppod_addr;
 	uint32_t cmd;
 
 	cmd = htobe32(V_ULPTX_CMD(ULP_TX_MEM_WRITE));
 	if (is_t4(sc))
 		cmd |= htobe32(F_ULP_MEMIO_ORDER);
 	else
 		cmd |= htobe32(F_T5_ULP_MEMIO_IMM);
 	/* The page size and first pod index are both recovered from the tag. */
 	ddp_pgsz = t4_ddp_pgsz[G_PPOD_PGSZ(db->tag)];
 	ppod_addr = sc->vres.ddp.start + G_PPOD_TAG(db->tag) * PPOD_SIZE;
 	/* i counts pods written so far; it is advanced by the inner loop. */
 	for (i = 0; i < db->nppods; ppod_addr += chunk) {
 
 		/* How many page pods are we writing in this cycle */
 		n = min(db->nppods - i, NUM_ULP_TX_SC_IMM_PPODS);
 		chunk = PPOD_SZ(n);
 		len = roundup2(sizeof(*ulpmc) + sizeof(*ulpsc) + chunk, 16);
 
 		wr = alloc_wrqe(len, toep->ctrlq);
 		if (wr == NULL)
 			return (ENOMEM);	/* ok to just bail out */
 		ulpmc = wrtod(wr);
 
 		INIT_ULPTX_WR(ulpmc, len, 0, 0);
 		ulpmc->cmd = cmd;
 		/* Data length and address are given in units of 32 bytes. */
 		ulpmc->dlen = htobe32(V_ULP_MEMIO_DATA_LEN(chunk / 32));
 		ulpmc->len16 = htobe32(howmany(len - sizeof(ulpmc->wr), 16));
 		ulpmc->lock_addr = htobe32(V_ULP_MEMIO_ADDR(ppod_addr >> 5));
 
 		ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 		ulpsc->len = htobe32(chunk);
 
 		ppod = (struct pagepod *)(ulpsc + 1);
 		for (j = 0; j < n; i++, j++, ppod++) {
 			ppod->vld_tid_pgsz_tag_color = htobe64(F_PPOD_VALID |
 			    V_PPOD_TID(toep->tid) | db->tag);
 			ppod->len_offset = htobe64(V_PPOD_LEN(db->len) |
 			    V_PPOD_OFST(db->offset));
 			ppod->rsvd = 0;
 			/* Index of the first VM page described by pod i. */
 			idx = i * PPOD_PAGES * (ddp_pgsz / PAGE_SIZE);
 			for (k = 0; k < nitems(ppod->addr); k++) {
 				if (idx < db->npages) {
 					ppod->addr[k] =
 					    htobe64(db->pages[idx]->phys_addr);
 					/* Step one DDP page, in VM pages. */
 					idx += ddp_pgsz / PAGE_SIZE;
 				} else
 					ppod->addr[k] = 0;
 #if 0
 				CTR5(KTR_CXGBE,
 				    "%s: tid %d ppod[%d]->addr[%d] = %p",
 				    __func__, toep->tid, i, k,
 				    htobe64(ppod->addr[k]));
 #endif
 			}
 
 		}
 
 		t4_wrq_tx(sc, wr);
 	}
 
 	return (0);
 }
 
 /*
  * Reuse, or allocate (and program the page pods for) a new DDP buffer.  The
  * "pages" array is handed over to this function and should not be used in any
  * way by the caller after that.
  */
 /*
  * Returns the index of the toep->db slot holding a buffer that matches the
  * request, or -1 on failure.  On every path the pages array is either
  * freed here or owned by a DDP buffer afterwards.
  */
 static int
 select_ddp_buffer(struct adapter *sc, struct toepcb *toep, vm_page_t *pages,
     int npages, int db_off, int db_len)
 {
 	struct ddp_buffer *db;
 	struct tom_data *td = sc->tom_softc;
 	int i, empty_slot = -1;
 
 	/* Try to reuse */
 	for (i = 0; i < nitems(toep->db); i++) {
 		if (bufcmp(toep->db[i], pages, npages, db_off, db_len) == 0) {
 			/* Only the array is freed; the pages are not unheld. */
 			free(pages, M_CXGBE);
 			return (i);	/* pages still held */
 		} else if (toep->db[i] == NULL && empty_slot < 0)
 			empty_slot = i;
 	}
 
 	/* Allocate new buffer, write its page pods. */
 	db = alloc_ddp_buffer(td, pages, npages, db_off, db_len);
 	if (db == NULL) {
 		vm_page_unhold_pages(pages, npages);
 		free(pages, M_CXGBE);
 		return (-1);
 	}
 	if (write_page_pods(sc, toep, db) != 0) {
 		vm_page_unhold_pages(pages, npages);
 		/* db owns the pages array now; this frees it as well. */
 		free_ddp_buffer(td, db);
 		return (-1);
 	}
 
 	i = empty_slot;
 	if (i < 0) {
 		/* No free slot: evict a randomly chosen existing buffer. */
 		i = arc4random() % nitems(toep->db);
 		free_ddp_buffer(td, toep->db[i]);
 	}
 	toep->db[i] = db;
 
 	CTR5(KTR_CXGBE, "%s: tid %d, DDP buffer[%d] = %p (tag 0x%x)",
 	    __func__, toep->tid, i, db, db->tag);
 
 	return (i);
 }
 
 /*
  * For every page of the DDP buffer, under the page lock: wire the page and
  * then drop the hold on it.
  */
 static void
 wire_ddp_buffer(struct ddp_buffer *db)
 {
 	vm_page_t pg;
 	int idx;
 
 	idx = 0;
 	while (idx < db->npages) {
 		pg = db->pages[idx++];
 		vm_page_lock(pg);
 		vm_page_wire(pg);
 		vm_page_unhold(pg);
 		vm_page_unlock(pg);
 	}
 }
 
 /*
  * Unwire every page of the DDP buffer, taking each page's lock around the
  * call.
  */
 static void
 unwire_ddp_buffer(struct ddp_buffer *db)
 {
 	int i;
 	vm_page_t p;
 
 	for (i = 0; i < db->npages; i++) {
 		p = db->pages[i];
 		vm_page_lock(p);
-		vm_page_unwire(p, 0);
+		vm_page_unwire(p, PQ_INACTIVE);
 		vm_page_unlock(p);
 	}
 }
 
 /*
  * Try to post the caller's uio buffer to the chip as a DDP buffer and wait
  * for the DDP operation to finish.
  *
  * Must be called with the receive sockbuf lock held; the lock is dropped and
  * reacquired while the uio pages are faulted in and held.
  *
  * Returns the result of sbwait() when a DDP operation was started.  On every
  * path where DDP is not attempted or cannot be set up, DDP is disabled and
  * discouraged for this connection, SB_DDP_INDICATE is cleared, and 0 is
  * returned.
  */
 static int
 handle_ddp(struct socket *so, struct uio *uio, int flags, int error)
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	vm_page_t *pages;
 	int npages, db_idx, rc, buf_flag;
 	struct ddp_buffer *db;
 	struct wrqe *wr;
 	uint64_t ddp_flags;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 
 #if 0
 	if (sb->sb_cc + sc->tt.ddp_thres > uio->uio_resid) {
 		CTR4(KTR_CXGBE, "%s: sb_cc %d, threshold %d, resid %d",
 		    __func__, sb->sb_cc, sc->tt.ddp_thres, uio->uio_resid);
 	}
 #endif
 
 	/* XXX: too eager to disable DDP, could handle NBIO better than this. */
 	if (sb->sb_cc >= uio->uio_resid || uio->uio_resid < sc->tt.ddp_thres ||
 	    uio->uio_resid > MAX_DDP_BUFFER_SIZE || uio->uio_iovcnt > 1 ||
 	    so->so_state & SS_NBIO || flags & (MSG_DONTWAIT | MSG_NBIO) ||
 	    error || so->so_error || sb->sb_state & SBS_CANTRCVMORE)
 		goto no_ddp;
 
 	/*
 	 * Fault in and then hold the pages of the uio buffers.  We'll wire them
 	 * a bit later if everything else works out.
 	 */
 	SOCKBUF_UNLOCK(sb);
 	if (hold_uio(uio, &pages, &npages) != 0) {
 		SOCKBUF_LOCK(sb);
 		goto no_ddp;
 	}
 	SOCKBUF_LOCK(sb);
 	/* The socket state may have changed while the lock was dropped. */
 	if (__predict_false(so->so_error || sb->sb_state & SBS_CANTRCVMORE)) {
 		vm_page_unhold_pages(pages, npages);
 		free(pages, M_CXGBE);
 		goto no_ddp;
 	}
 
 	/*
 	 * Figure out which one of the two DDP buffers to use this time.
 	 */
 	db_idx = select_ddp_buffer(sc, toep, pages, npages,
 	    (uintptr_t)uio->uio_iov->iov_base & PAGE_MASK, uio->uio_resid);
 	pages = NULL;	/* handed off to select_ddp_buffer */
 	if (db_idx < 0)
 		goto no_ddp;
 	db = toep->db[db_idx];
 	buf_flag = db_idx == 0 ? DDP_BUF0_ACTIVE : DDP_BUF1_ACTIVE;
 
 	/*
 	 * Build the compound work request that tells the chip where to DMA the
 	 * payload.
 	 */
 	ddp_flags = select_ddp_flags(so, flags, db_idx);
 	wr = mk_update_tcb_for_ddp(sc, toep, db_idx, sb->sb_cc, ddp_flags);
 	if (wr == NULL) {
 		/*
 		 * Just unhold the pages.  The DDP buffer's software state is
 		 * left as-is in the toep.  The page pods were written
 		 * successfully and we may have an opportunity to use it in the
 		 * future.
 		 */
 		vm_page_unhold_pages(db->pages, db->npages);
 		goto no_ddp;
 	}
 
 	/* Wire (and then unhold) the pages, and give the chip the go-ahead. */
 	wire_ddp_buffer(db);
 	t4_wrq_tx(sc, wr);
 	sb->sb_flags &= ~SB_DDP_INDICATE;
 	toep->ddp_flags |= buf_flag;
 
 	/*
 	 * Wait for the DDP operation to complete and then unwire the pages.
 	 * The return code from the sbwait will be the final return code of this
 	 * function.  But we do need to wait for DDP no matter what.
 	 */
 	rc = sbwait(sb);
 	/* Keep sleeping until this buffer's active flag has been cleared. */
 	while (toep->ddp_flags & buf_flag) {
 		sb->sb_flags |= SB_WAIT;
 		msleep(&sb->sb_cc, &sb->sb_mtx, PSOCK , "sbwait", 0);
 	}
 	unwire_ddp_buffer(db);
 	return (rc);
 no_ddp:
 	/* Fallback: turn DDP off for this connection and report success. */
 	disable_ddp(sc, toep);
 	discourage_ddp(toep);
 	sb->sb_flags &= ~SB_DDP_INDICATE;
 	return (0);
 }
 
 /*
  * Set up the page pod bookkeeping in the tom_data and register the two DDP
  * CPL handlers with the adapter.
  */
 void
 t4_init_ddp(struct adapter *sc, struct tom_data *td)
 {
 	int total_ppods;
 
 	/* Every page pod in the DDP region starts out free. */
 	total_ppods = sc->vres.ddp.size / PPOD_SIZE;
 	td->nppods = total_ppods;
 	td->nppods_free = total_ppods;
 	td->nppods_free_head = total_ppods;
 
 	TAILQ_INIT(&td->ppods);
 	mtx_init(&td->ppod_lock, "page pods", NULL, MTX_DEF);
 
 	t4_register_cpl_handler(sc, CPL_RX_DATA_DDP, do_rx_data_ddp);
 	t4_register_cpl_handler(sc, CPL_RX_DDP_COMPLETE, do_rx_ddp_complete);
 }
 
 /*
  * Tear down the page pod lock.  All page pods are expected to have been
  * returned by now.
  */
 void
 t4_uninit_ddp(struct adapter *sc __unused, struct tom_data *td)
 {
 
 	KASSERT(td->nppods == td->nppods_free,
 	    ("%s: page pods still in use, nppods = %d, free = %d",
 	    __func__, td->nppods, td->nppods_free));
 
 	/* Nothing to destroy if the lock was never initialized. */
 	if (!mtx_initialized(&td->ppod_lock))
 		return;
 
 	mtx_destroy(&td->ppod_lock);
 }
 
 #define	VNET_SO_ASSERT(so)						\
 	VNET_ASSERT(curvnet != NULL,					\
 	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 #define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
 /*
  * Out-of-band receive is not implemented for this path; the body is only the
  * CXGBE_UNIMPLEMENTED() marker and there is no return statement.
  */
 static int
 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
 {
 
 	CXGBE_UNIMPLEMENTED(__func__);
 }
 
 static char ddp_magic_str[] = "nothing to see here";
 
 /*
  * Allocate an mbuf whose data pointer is aimed at ddp_magic_str and whose
  * length is set to len.  No payload is copied into it.
  */
 struct mbuf *
 get_ddp_mbuf(int len)
 {
 	struct mbuf *mb = m_get(M_NOWAIT, MT_DATA);
 
 	if (mb == NULL)
 		CXGBE_UNIMPLEMENTED("mbuf alloc failure");
 
 	mb->m_len = len;
 	mb->m_data = ddp_magic_str;
 
 	return (mb);
 }
 
 /*
  * Returns non-zero if the mbuf's data pointer is the ddp_magic_str marker.
  */
 static inline int
 is_ddp_mbuf(struct mbuf *m)
 {
 
 	return (m->m_data == ddp_magic_str);
 }
 
 /*
  * Copy an mbuf chain into a uio limited by len if set.
  */
 static int
 m_mbuftouio_ddp(struct uio *uio, struct mbuf *m, int len)
 {
 	int error, length, total;
 	int progress = 0;
 
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
 
 	/* Fill the uio with data from the mbufs. */
 	for (; m != NULL; m = m->m_next) {
 		length = min(m->m_len, total - progress);
 
 		if (is_ddp_mbuf(m)) {
 			enum uio_seg segflag = uio->uio_segflg;
 
 			uio->uio_segflg	= UIO_NOCOPY;
 			error = uiomove(mtod(m, void *), length, uio);
 			uio->uio_segflg	= segflag;
 		} else
 			error = uiomove(mtod(m, void *), length, uio);
 		if (error)
 			return (error);
 
 		progress += length;
 	}
 
 	return (0);
 }
 
 /*
  * Based on soreceive_stream() in uipc_socket.c
  */
 int
 t4_soreceive_ddp(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int len = 0, error = 0, flags, oresid, ddp_handled = 0;
 	struct sockbuf *sb;
 	struct mbuf *m, *n = NULL;
 
 	/* We only do stream sockets. */
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		return (EINVAL);
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp0 != NULL)
 		*mp0 = NULL;
 
 	sb = &so->so_rcv;
 
 	/* Prevent other readers from entering the socket. */
 	error = sblock(sb, SBLOCKWAIT(flags));
 	if (error)
 		goto out;
 	SOCKBUF_LOCK(sb);
 
 	/* Easy one, no space to copyout anything. */
 	if (uio->uio_resid == 0) {
 		error = EINVAL;
 		goto out;
 	}
 	oresid = uio->uio_resid;
 
 	/* We will never ever get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		error = ENOTCONN;
 		goto out;
 	}
 
 restart:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
 
 		/* uio should be just as it was at entry */
 		KASSERT(oresid == uio->uio_resid,
 		    ("%s: oresid = %d, uio_resid = %zd, sb_cc = %d",
 		    __func__, oresid, uio->uio_resid, sb->sb_cc));
 
 		error = handle_ddp(so, uio, flags, 0);
 		ddp_handled = 1;
 		if (error)
 			goto out;
 	}
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
 		if (sb->sb_cc > 0)
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
 		error = so->so_error;
 		if (!(flags & MSG_PEEK))
 			so->so_error = 0;
 		goto out;
 	}
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
 		if (sb->sb_cc > 0)
 			goto deliver;
 		else
 			goto out;
 	}
 
 	/* Socket buffer is empty and we shall not block. */
 	if (sb->sb_cc == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sb->sb_cc > 0 && !(flags & MSG_WAITALL) &&
 	    ((sb->sb_flags & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sb->sb_cc >= sb->sb_lowat ||
 	     sb->sb_cc >= uio->uio_resid ||
 	     sb->sb_cc >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
 	    (sb->sb_cc >= uio->uio_resid || sb->sb_cc >= sb->sb_lowat))
 		goto deliver;
 
 	/*
 	 * Wait and block until (more) data comes in.
 	 * NB: Drops the sockbuf lock during wait.
 	 */
 	error = sbwait(sb);
 	if (error) {
 		if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled) {
 			(void) handle_ddp(so, uio, flags, 1);
 			ddp_handled = 1;
 		}
 		goto out;
 	}
 	goto restart;
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	KASSERT(sb->sb_cc > 0, ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	if (sb->sb_flags & SB_DDP_INDICATE && !ddp_handled)
 		goto restart;
 
 	/* Statistics. */
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
 	len = min(uio->uio_resid, sb->sb_cc);
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
 			for (*mp0 = m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
 				sbfree(sb, m);
 				n = m;
 			}
 			sb->sb_mb = m;
 			if (sb->sb_mb == NULL)
 				SB_EMPTY_FIXUP(sb);
 			n->m_next = NULL;
 		}
 		/* Copy the remainder. */
 		if (len > 0) {
 			KASSERT(sb->sb_mb != NULL,
 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
 
 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
 			if (m == NULL)
 				len = 0;	/* Don't flush data from sockbuf. */
 			else
 				uio->uio_resid -= m->m_len;
 			if (*mp0 != NULL)
 				n->m_next = m;
 			else
 				*mp0 = m;
 			if (*mp0 == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 	} else {
 		/* NB: Must unlock socket buffer as uiomove may sleep. */
 		SOCKBUF_UNLOCK(sb);
 		error = m_mbuftouio_ddp(uio, sb->sb_mb, len);
 		SOCKBUF_LOCK(sb);
 		if (error)
 			goto out;
 	}
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 
 	/*
 	 * Remove the delivered data from the socket buffer unless we
 	 * were only peeking.
 	 */
 	if (!(flags & MSG_PEEK)) {
 		if (len > 0)
 			sbdrop_locked(sb, len);
 
 		/* Notify protocol that we drained some data. */
 		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
 		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
 		     !(flags & MSG_SOCALLBCK))) {
 			SOCKBUF_UNLOCK(sb);
 			VNET_SO_ASSERT(so);
 			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
 			SOCKBUF_LOCK(sb);
 		}
 	}
 
 	/*
 	 * For MSG_WAITALL we may have to loop again and wait for
 	 * more data to come in.
 	 */
 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
 		goto restart;
 out:
 	SOCKBUF_LOCK_ASSERT(sb);
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 	SOCKBUF_UNLOCK(sb);
 	sbunlock(sb);
 	return (error);
 }
 
 #endif
Index: user/attilio/rm_vmobj_cache/sys/dev/drm/via_dmablit.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/dev/drm/via_dmablit.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/dev/drm/via_dmablit.c	(revision 267237)
@@ -1,790 +1,790 @@
 /* via_dmablit.c -- PCI DMA BitBlt support for the VIA Unichrome/Pro
  *
  * Copyright (C) 2005 Thomas Hellstrom, All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sub license,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  *
  * Authors:
  *    Thomas Hellstrom.
  *    Partially based on code obtained from Digeo Inc.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Unmaps the DMA mappings.
  * FIXME: Is this a NoOp on x86? Also
  * FIXME: What happens if this one is called and a pending blit has previously done
  * the same DMA mappings?
  */
 
 #include "dev/drm/drmP.h"
 #include "dev/drm/via_drm.h"
 #include "dev/drm/via_drv.h"
 #include "dev/drm/via_dmablit.h"
 
 #define VIA_PGDN(x)	(((unsigned long)(x)) & ~PAGE_MASK)
 #define VIA_PGOFF(x)	(((unsigned long)(x)) & PAGE_MASK)
 #define VIA_PFN(x)	((unsigned long)(x) >> PAGE_SHIFT)
 
 typedef struct _drm_via_descriptor {
 	uint32_t mem_addr;
 	uint32_t dev_addr;
 	uint32_t size;
 	uint32_t next;
 } drm_via_descriptor_t;
 
 static void via_dmablit_timer(void *arg);
 
 /*
  * Unmap a DMA mapping.
  */
 /*
  * Walks the descriptor chain backwards, one descriptor per iteration,
  * reading each descriptor's 'next' field.  In this version the walk makes
  * no unmapping calls; 'next' is assigned but not used for anything else.
  */
 static void
 via_unmap_blit_from_device(drm_via_sg_info_t *vsg)
 {
 	int num_desc = vsg->num_desc;
 	unsigned cur_descriptor_page = num_desc / vsg->descriptors_per_page;
 	unsigned descriptor_this_page = num_desc % vsg->descriptors_per_page;
 	/*
 	 * NOTE(review): when num_desc is an exact multiple of
 	 * descriptors_per_page, cur_descriptor_page equals the number of
 	 * descriptor pages, so this initial load appears to index one past
 	 * the end of desc_pages[] -- confirm.
 	 */
 	drm_via_descriptor_t *desc_ptr = vsg->desc_pages[cur_descriptor_page] +
 		descriptor_this_page;
 	dma_addr_t next = vsg->chain_start;
 
 	while(num_desc--) {
 		/* Crossed a page boundary: step to the last slot of the previous page. */
 		if (descriptor_this_page-- == 0) {
 			cur_descriptor_page--;
 			descriptor_this_page = vsg->descriptors_per_page - 1;
 			desc_ptr = vsg->desc_pages[cur_descriptor_page] +
 				descriptor_this_page;
 		}
 		next = (dma_addr_t) desc_ptr->next;
 		desc_ptr--;
 	}
 }
 
 
 /*
  * If mode = 0, count how many descriptors are needed.
  * If mode = 1, Map the DMA pages for the device, put together and map also the descriptors.
  * Descriptors are run in reverse order by the hardware because we are not allowed to update the
  * 'next' field without syncing calls when the descriptor is already mapped.
  */
 /*
  * xfer - blit request (addresses, strides, line length, line count).
  * vsg  - scatter/gather state; num_desc is always written, and in mode 1
  *        the descriptor pages, chain_start and state are written too.
  * mode - 0 to only count descriptors, 1 to also fill them in.
  */
 static void
 via_map_blit_for_device(const drm_via_dmablit_t *xfer,
 		   drm_via_sg_info_t *vsg, int mode)
 {
 	unsigned cur_descriptor_page = 0;
 	unsigned num_descriptors_this_page = 0;
 	unsigned char *mem_addr = xfer->mem_addr;
 	unsigned char *cur_mem;
 	unsigned char *first_addr = (unsigned char *)VIA_PGDN(mem_addr);
 	uint32_t fb_addr = xfer->fb_addr;
 	uint32_t cur_fb;
 	unsigned long line_len;
 	unsigned remaining_len;
 	int num_desc = 0;
 	int cur_line;
 	/* The first descriptor built gets this value as its 'next' link. */
 	dma_addr_t next = 0 | VIA_DMA_DPR_EC;
 	drm_via_descriptor_t *desc_ptr = NULL;
 
 	if (mode == 1)
 		desc_ptr = vsg->desc_pages[cur_descriptor_page];
 
 	for (cur_line = 0; cur_line < xfer->num_lines; ++cur_line) {
 
 		line_len = xfer->line_length;
 		cur_fb = fb_addr;
 		cur_mem = mem_addr;
 
 		/* Split the line into pieces that do not cross a page boundary. */
 		while (line_len > 0) {
 
 			remaining_len = min(PAGE_SIZE - VIA_PGOFF(cur_mem),
 			    line_len);
 			line_len -= remaining_len;
 
 			if (mode == 1) {
 				/* Physical address of this piece within its held page. */
 				desc_ptr->mem_addr =
 				    VM_PAGE_TO_PHYS(
 				    vsg->pages[VIA_PFN(cur_mem) -
 				    VIA_PFN(first_addr)]) + VIA_PGOFF(cur_mem);
 				desc_ptr->dev_addr = cur_fb;
 
 				desc_ptr->size = remaining_len;
 				desc_ptr->next = (uint32_t) next;
 
 				/* Each new descriptor links back to the one built before it. */
 				next = vtophys(desc_ptr);
 
 				desc_ptr++;
 				/*
 				 * NOTE(review): if the final descriptor fills
 				 * the last descriptor page exactly, this
 				 * appears to load desc_pages[] one past its
 				 * last entry -- confirm.
 				 */
 				if (++num_descriptors_this_page >= vsg->descriptors_per_page) {
 					num_descriptors_this_page = 0;
 					desc_ptr = vsg->desc_pages[++cur_descriptor_page];
 				}
 			}
 
 			num_desc++;
 			cur_mem += remaining_len;
 			cur_fb += remaining_len;
 		}
 
 		mem_addr += xfer->mem_stride;
 		fb_addr += xfer->fb_stride;
 	}
 
 	if (mode == 1) {
 		/* The chain starts at the descriptor that was built last. */
 		vsg->chain_start = next;
 		vsg->state = dr_via_device_mapped;
 	}
 	vsg->num_desc = num_desc;
 }
 
 
 /*
  * Function that frees up all resources for a blit. It is usable even if the
  * blit info has only been partially built as long as the status enum is consistent
  * with the actual status of the used resources.
  */
 /*
  * The switch cases fall through on purpose: starting at the case matching
  * vsg->state, each later case undoes one earlier setup step, and the state
  * is finally reset to dr_via_sg_init.
  */
 static void
 via_free_sg_info(drm_via_sg_info_t *vsg)
 {
 	vm_page_t page;
 	int i;
 
 	switch(vsg->state) {
 	case dr_via_device_mapped:
 		via_unmap_blit_from_device(vsg);
 		/* FALLTHROUGH */
 	case dr_via_desc_pages_alloc:
 		/* Free each descriptor page, then the array of page pointers. */
 		for (i=0; i<vsg->num_desc_pages; ++i) {
 			if (vsg->desc_pages[i] != NULL)
 			    free(vsg->desc_pages[i], DRM_MEM_PAGES);
 		}
 		free(vsg->desc_pages, DRM_MEM_DRIVER);
 		/* FALLTHROUGH */
 	case dr_via_pages_locked:
 		/* Unwire every user page, under its page lock. */
 		for (i=0; i < vsg->num_pages; ++i) {
 			page = vsg->pages[i];
 			vm_page_lock(page);
-			vm_page_unwire(page, 0);
+			vm_page_unwire(page, PQ_INACTIVE);
 			vm_page_unlock(page);
 		}
 		/* FALLTHROUGH */
 	case dr_via_pages_alloc:
 		free(vsg->pages, DRM_MEM_DRIVER);
 		/* FALLTHROUGH */
 	default:
 		vsg->state = dr_via_sg_init;
 	}
 	/* Done regardless of the state the structure was in. */
 	free(vsg->bounce_buffer, DRM_MEM_DRIVER);
 	vsg->bounce_buffer = NULL;
 	vsg->free_on_sequence = 0;
 }
 
 
 /*
  * Fire a blit engine.
  */
 /*
  * Programs the DMA registers of the given engine and starts the transfer
  * described by vsg->chain_start.  The register stride is 0x10 for the
  * MAR/DAR/BCR/DPR registers and 0x04 for CSR/MR, as written below.
  */
 static void
 via_fire_dmablit(struct drm_device *dev, drm_via_sg_info_t *vsg, int engine)
 {
 	/* presumably referenced by the VIA_WRITE/VIA_READ macros -- confirm */
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 
 	VIA_WRITE(VIA_PCI_DMA_MAR0 + engine*0x10, 0);
 	VIA_WRITE(VIA_PCI_DMA_DAR0 + engine*0x10, 0);
 	VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_DD | VIA_DMA_CSR_TD |
 		  VIA_DMA_CSR_DE);
 	VIA_WRITE(VIA_PCI_DMA_MR0  + engine*0x04, VIA_DMA_MR_CM | VIA_DMA_MR_TDIE);
 	VIA_WRITE(VIA_PCI_DMA_BCR0 + engine*0x10, 0);
 	/* Descriptor chain start address. */
 	VIA_WRITE(VIA_PCI_DMA_DPR0 + engine*0x10, vsg->chain_start);
 	/* Order the setup writes above before the write that starts the engine. */
 	DRM_WRITEMEMORYBARRIER();
 	VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_DE | VIA_DMA_CSR_TS);
 	/* Read back CSR; the value is discarded. */
 	(void)VIA_READ(VIA_PCI_DMA_CSR0 + engine*0x04);
 }
 
 
 /*
  * Obtain a page pointer array and lock all pages into system memory. A segmentation violation will
  * occur here if the calling user does not have access to the submitted address.
  */
 /*
  * vsg  - scatter/gather state; pages, num_pages and state are filled in.
  * xfer - blit request supplying the user address, stride and line count.
  *
  * Returns 0 on success, -ENOMEM if the page array cannot be allocated, or
  * -EACCES if the user pages cannot be faulted in and held.  On failure
  * vsg->state records how far setup got, so via_free_sg_info() can undo it.
  */
 static int
 via_lock_all_dma_pages(drm_via_sg_info_t *vsg,  drm_via_dmablit_t *xfer)
 {
 	unsigned long first_pfn = VIA_PFN(xfer->mem_addr);
 	vm_page_t m;
 	int i;
 
 	/* Number of pages touched by [mem_addr, mem_addr + lines * stride). */
 	vsg->num_pages = VIA_PFN(xfer->mem_addr +
 	    (xfer->num_lines * xfer->mem_stride -1)) - first_pfn + 1;
 
 	if (NULL == (vsg->pages = malloc(sizeof(vm_page_t) * vsg->num_pages,
 	    DRM_MEM_DRIVER, M_NOWAIT)))
 		return -ENOMEM;
 
 	vsg->state = dr_via_pages_alloc;
 
 	/*
 	 * Hold exactly num_pages pages, starting at the page that contains
 	 * mem_addr.  The start address is rounded down to a page boundary:
 	 * with an unaligned start, a length of num_pages * PAGE_SIZE would
 	 * reach into one more page than the pages[] array has room for.
 	 */
 	if (vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)(VIA_PGDN(xfer->mem_addr)),
 	    vsg->num_pages * PAGE_SIZE,
 	    VM_PROT_READ | VM_PROT_WRITE, vsg->pages, vsg->num_pages) < 0)
 		return -EACCES;
 
 	/* Convert each hold into a wiring. */
 	for (i = 0; i < vsg->num_pages; i++) {
 		m = vsg->pages[i];
 		vm_page_lock(m);
 		vm_page_wire(m);
 		vm_page_unhold(m);
 		vm_page_unlock(m);
 	}
 	vsg->state = dr_via_pages_locked;
 
 	DRM_DEBUG("DMA pages locked\n");
 
 	return 0;
 }
 
 
 /*
  * Allocate DMA capable memory for the blit descriptor chain, and an array that keeps track of the
  * pages we allocate. We don't want to use kmalloc for the descriptor chain because it may be
  * quite large for some blits, and pages don't need to be contiguous.
  */
 static int
 via_alloc_desc_pages(drm_via_sg_info_t *vsg)
 {
 	int pg;
 
 	/* Work out how many page-sized descriptor arrays num_desc needs. */
 	vsg->descriptors_per_page = PAGE_SIZE / sizeof(drm_via_descriptor_t);
 	vsg->num_desc_pages = (vsg->num_desc + vsg->descriptors_per_page - 1) /
 	    vsg->descriptors_per_page;
 
 	/* Zeroed table of page pointers, so cleanup can skip unset entries. */
 	vsg->desc_pages = malloc(vsg->num_desc_pages * sizeof(void *),
 	    DRM_MEM_DRIVER, M_NOWAIT | M_ZERO);
 	if (vsg->desc_pages == NULL)
 		return -ENOMEM;
 
 	vsg->state = dr_via_desc_pages_alloc;
 
 	for (pg = 0; pg < vsg->num_desc_pages; pg++) {
 		vsg->desc_pages[pg] = (drm_via_descriptor_t *)malloc(PAGE_SIZE,
 		    DRM_MEM_PAGES, M_NOWAIT | M_ZERO);
 		if (vsg->desc_pages[pg] == NULL)
 			return -ENOMEM;
 	}
 
 	DRM_DEBUG("Allocated %d pages for %d descriptors.\n",
 	    vsg->num_desc_pages, vsg->num_desc);
 
 	return 0;
 }
 
 
 /*
  * Writes VIA_DMA_CSR_TA to the engine's CSR register.
  */
 static void
 via_abort_dmablit(struct drm_device *dev, int engine)
 {
 	/* presumably referenced by the VIA_WRITE macro -- confirm */
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 
 	VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_TA);
 }
 
 
 /*
  * Writes VIA_DMA_CSR_TD | VIA_DMA_CSR_DD to the engine's CSR register.
  */
 static void
 via_dmablit_engine_off(struct drm_device *dev, int engine)
 {
 	/* presumably referenced by the VIA_WRITE macro -- confirm */
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 
 	VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04, VIA_DMA_CSR_TD | VIA_DMA_CSR_DD);
 }
 
 
 /*
  * The dmablit part of the IRQ handler. Trying to do only reasonably fast things here.
  * The rest, like unmapping and freeing memory for done blits is done in a separate workqueue
  * task. Basically the task of the interrupt handler is to submit a new blit to the engine, while
  * the workqueue task takes care of processing associated with the old blit.
  */
 void
 via_dmablit_handler(struct drm_device *dev, int engine, int from_irq)
 {
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 	drm_via_blitq_t *blitq = dev_priv->blit_queues + engine;
 	int cur;
 	int done_transfer;
 	uint32_t status = 0;
 
 	DRM_DEBUG("DMA blit handler called. engine = %d, from_irq = %d, blitq = 0x%lx\n",
 		  engine, from_irq, (unsigned long) blitq);
 
 	mtx_lock(&blitq->blit_lock);
 
 	/*
 	 * The current blit is finished if the engine is active and reports
 	 * transfer-done, or if an abort is in progress and the DE bit of the
 	 * last status read is clear.  The register is only read when the
 	 * queue is active; otherwise status stays 0.
 	 */
 	done_transfer = blitq->is_active &&
 	  (( status = VIA_READ(VIA_PCI_DMA_CSR0 + engine*0x04)) & VIA_DMA_CSR_TD);
 	done_transfer = done_transfer || ( blitq->aborting && !(status & VIA_DMA_CSR_DE));
 
 	cur = blitq->cur;
 	if (done_transfer) {
 
 		/* Record the outcome and wake anyone waiting on this slot. */
 		blitq->blits[cur]->aborted = blitq->aborting;
 		blitq->done_blit_handle++;
 		DRM_WAKEUP(&blitq->blit_queue[cur]);
 
 		/* Advance to the next slot, wrapping around the ring. */
 		cur++;
 		if (cur >= VIA_NUM_BLIT_SLOTS)
 			cur = 0;
 		blitq->cur = cur;
 
 		/*
 		 * Clear transfer done flag.
 		 */
 
 		VIA_WRITE(VIA_PCI_DMA_CSR0 + engine*0x04,  VIA_DMA_CSR_TD);
 
 		blitq->is_active = 0;
 		blitq->aborting = 0;
 
 		/* Cleanup of the finished blit is deferred to the task. */
 		taskqueue_enqueue(taskqueue_swi, &blitq->wq);
 
 	} else if (blitq->is_active && (ticks >= blitq->end)) {
 
 		/*
 		 * Abort transfer after one second.
 		 */
 
 		via_abort_dmablit(dev, engine);
 		blitq->aborting = 1;
 		blitq->end = ticks + DRM_HZ;
 	}
 
 	if (!blitq->is_active) {
 		if (blitq->num_outstanding) {
 			/* Engine idle and work queued: start the next blit. */
 			via_fire_dmablit(dev, blitq->blits[cur], engine);
 			blitq->is_active = 1;
 			blitq->cur = cur;
 			blitq->num_outstanding--;
 			blitq->end = ticks + DRM_HZ;
 
 			/* Make sure the polling timer is armed. */
 			if (!callout_pending(&blitq->poll_timer))
 				callout_reset(&blitq->poll_timer,
 				    1, (timeout_t *)via_dmablit_timer,
 				    (void *)blitq);
 		} else {
 			/* Nothing left to do: stop polling and turn the engine off. */
 			if (callout_pending(&blitq->poll_timer)) {
 				callout_stop(&blitq->poll_timer);
 			}
 			via_dmablit_engine_off(dev, engine);
 		}
 	}
 
 	mtx_unlock(&blitq->blit_lock);
 }
 
 
 /*
  * Check whether this blit is still active, performing necessary locking.
  */
 /*
  * blitq  - queue to inspect; its blit_lock is taken for the check.
  * engine - not used in this function.
  * handle - sync handle of the blit being asked about.
  * queue  - if non-NULL and the blit is active, receives the wait queue of
  *          the slot computed for the handle.
  *
  * Returns non-zero if the blit is considered still active.
  */
 static int
 via_dmablit_active(drm_via_blitq_t *blitq, int engine, uint32_t handle, wait_queue_head_t **queue)
 {
 	uint32_t slot;
 	int active;
 
 	mtx_lock(&blitq->blit_lock);
 
 	/*
 	 * Allow for handle wraparound: the two handle differences are
 	 * compared against a window of 1 << 23 instead of comparing the
 	 * handles directly.
 	 */
 	active = ((blitq->done_blit_handle - handle) > (1 << 23)) &&
 		((blitq->cur_blit_handle - handle) <= (1 << 23));
 
 	if (queue && active) {
 		/* Slot index relative to the current slot, wrapped into the ring. */
 		slot = handle - blitq->done_blit_handle + blitq->cur -1;
 		if (slot >= VIA_NUM_BLIT_SLOTS) {
 			slot -= VIA_NUM_BLIT_SLOTS;
 		}
 		*queue = blitq->blit_queue + slot;
 	}
 
 	mtx_unlock(&blitq->blit_lock);
 
 	return active;
 }
 
 
 /*
  * Sync. Wait for at least three seconds for the blit to be performed.
  */
 /*
  * Returns 0 if the blit is already inactive; otherwise returns whatever
  * DRM_WAIT_ON stores in ret.
  */
 static int
 via_dmablit_sync(struct drm_device *dev, uint32_t handle, int engine)
 {
 
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 	drm_via_blitq_t *blitq = dev_priv->blit_queues + engine;
 	wait_queue_head_t *queue;
 	int ret = 0;
 
 	/* Only wait if the blit is still active; 'queue' is set in that case. */
 	if (via_dmablit_active(blitq, engine, handle, &queue)) {
 		DRM_WAIT_ON(ret, *queue, 3 * DRM_HZ,
 			    !via_dmablit_active(blitq, engine, handle, NULL));
 	}
 	DRM_DEBUG("DMA blit sync handle 0x%x engine %d returned %d\n",
 		  handle, engine, ret);
 
 	return ret;
 }
 
 
 /*
  * A timer that regularly polls the blit engine in cases where we don't have interrupts:
  * a) Broken hardware (typically those that don't have any video capture facility).
  * b) Blit abort. The hardware doesn't send an interrupt when a blit is aborted.
  * The timer and hardware IRQ's can and do work in parallel. If the hardware has
  * irqs, it will shorten the latency somewhat.
  */
 /*
  * Callout function; arg is the drm_via_blitq_t being polled.
  */
 static void
 via_dmablit_timer(void *arg)
 {
 	drm_via_blitq_t *blitq = (drm_via_blitq_t *)arg;
 	struct drm_device *dev = blitq->dev;
 	/* The engine number is the queue's index within blit_queues[]. */
 	int engine = (int)
 		(blitq - ((drm_via_private_t *)dev->dev_private)->blit_queues);
 
 	DRM_DEBUG("Polling timer called for engine %d, jiffies %lu\n", engine,
 		  (unsigned long) jiffies);
 
 	via_dmablit_handler(dev, engine, 0);
 
 	/* Re-arm for the next tick unless the handler already did so. */
 	if (!callout_pending(&blitq->poll_timer)) {
 		callout_schedule(&blitq->poll_timer, 1);
 
 	       /*
 		* Rerun handler to delete timer if engines are off, and
 		* to shorten abort latency. This is a little nasty.
 		*/
 
 	       via_dmablit_handler(dev, engine, 0);
 
 	}
 }
 
 
 /*
  * Workqueue task that frees data and mappings associated with a blit.
  * Also wakes up waiting processes. Each of these tasks handles one
  * blit engine only and may not be called on each interrupt.
  */
 /*
  * arg     - the drm_via_blitq_t to service.
  * pending - not used in this function.
  */
 static void
 via_dmablit_workqueue(void *arg, int pending)
 {
 	drm_via_blitq_t *blitq = (drm_via_blitq_t *)arg;
 	struct drm_device *dev = blitq->dev;
 	drm_via_sg_info_t *cur_sg;
 	int cur_released;
 
 
 	DRM_DEBUG("task called for blit engine %ld\n",(unsigned long)
 		  (blitq - ((drm_via_private_t *)dev->dev_private)->blit_queues));
 
 	mtx_lock(&blitq->blit_lock);
 
 	/* Release every slot between the last serviced one and the current one. */
 	while(blitq->serviced != blitq->cur) {
 
 		cur_released = blitq->serviced++;
 
 		DRM_DEBUG("Releasing blit slot %d\n", cur_released);
 
 		if (blitq->serviced >= VIA_NUM_BLIT_SLOTS)
 			blitq->serviced = 0;
 
 		cur_sg = blitq->blits[cur_released];
 		blitq->num_free++;
 
 		/* The lock is dropped around the wakeup and the frees. */
 		mtx_unlock(&blitq->blit_lock);
 
 		DRM_WAKEUP(&blitq->busy_queue);
 
 		via_free_sg_info(cur_sg);
 		free(cur_sg, DRM_MEM_DRIVER);
 
 		mtx_lock(&blitq->blit_lock);
 	}
 
 	mtx_unlock(&blitq->blit_lock);
 }
 
 
 /*
  * Init all blit engines. Currently we use two, but some hardware have 4.
  */
 void
 via_init_dmablit(struct drm_device *dev)
 {
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 	drm_via_blitq_t *blitq;
 	int engine, slot;
 
 	for (engine = 0; engine < VIA_NUM_BLIT_ENGINES; engine++) {
 		blitq = &dev_priv->blit_queues[engine];
 
 		/* Reset the ring to empty, keeping one slot in reserve. */
 		blitq->dev = dev;
 		blitq->cur_blit_handle = 0;
 		blitq->done_blit_handle = 0;
 		blitq->head = 0;
 		blitq->cur = 0;
 		blitq->serviced = 0;
 		blitq->num_free = VIA_NUM_BLIT_SLOTS - 1;
 		blitq->num_outstanding = 0;
 		blitq->is_active = 0;
 		blitq->aborting = 0;
 
 		/* Lock, per-slot and busy wait queues, cleanup task, poll timer. */
 		mtx_init(&blitq->blit_lock, "via_blit_lk", NULL, MTX_DEF);
 		for (slot = 0; slot < VIA_NUM_BLIT_SLOTS; slot++) {
 			DRM_INIT_WAITQUEUE(&blitq->blit_queue[slot]);
 		}
 		DRM_INIT_WAITQUEUE(&blitq->busy_queue);
 		TASK_INIT(&blitq->wq, 0, via_dmablit_workqueue, blitq);
 		callout_init(&blitq->poll_timer, 0);
 	}
 }
 
 
 /*
  * Build all info and do all mappings required for a blit.
  */
 /*
  * dev  - DRM device (not referenced directly in this function).
  * vsg  - scatter/gather state to build; reset to dr_via_sg_init first.
  * xfer - blit request; its stride/length/line fields may be rewritten when
  *        the blit is collapsed into a single line below.
  *
  * Returns 0 on success, -EINVAL if the request fails validation, or the
  * error from page locking / descriptor allocation (after freeing what was
  * built so far).
  */
 static int
 via_build_sg_info(struct drm_device *dev, drm_via_sg_info_t *vsg,
     drm_via_dmablit_t *xfer)
 {
 	int ret = 0;
 
 	vsg->bounce_buffer = NULL;
 
 	vsg->state = dr_via_sg_init;
 
 	if (xfer->num_lines <= 0 || xfer->line_length <= 0) {
 		DRM_ERROR("Zero size bitblt.\n");
 		return -EINVAL;
 	}
 
 	/*
 	 * Below check is a driver limitation, not a hardware one. We
 	 * don't want to lock unused pages, and don't want to incorporate the
 	 * extra logic of avoiding them. Make sure there are no.
 	 * (Not a big limitation anyway.)
 	 */
 	if ((xfer->mem_stride - xfer->line_length) > 2 * PAGE_SIZE) {
 		DRM_ERROR("Too large system memory stride. Stride: %d, "
 			  "Length: %d\n", xfer->mem_stride, xfer->line_length);
 		return -EINVAL;
 	}
 
 	/*
 	 * Fully contiguous blit: collapse it into one long line.
 	 * NOTE(review): this multiplication and the one in the size check
 	 * below could wrap around if the fields are 32-bit -- confirm the
 	 * field types in drm_via_dmablit_t.
 	 */
 	if ((xfer->mem_stride == xfer->line_length) &&
 	    (xfer->fb_stride == xfer->line_length)) {
 		xfer->mem_stride *= xfer->num_lines;
 		xfer->line_length = xfer->mem_stride;
 		xfer->fb_stride = xfer->mem_stride;
 		xfer->num_lines = 1;
 	}
 
 	/*
 	 * Don't lock an arbitrary large number of pages, since that causes a
 	 * DOS security hole.
 	 */
 	if (xfer->num_lines > 2048 ||
 	    (xfer->num_lines*xfer->mem_stride > (2048*2048*4))) {
 		DRM_ERROR("Too large PCI DMA bitblt.\n");
 		return -EINVAL;
 	}
 
 	/*
 	 * we allow a negative fb stride to allow flipping of images in
 	 * transfer.
 	 */
 	if (xfer->mem_stride < xfer->line_length ||
 		abs(xfer->fb_stride) < xfer->line_length) {
 		DRM_ERROR("Invalid frame-buffer / memory stride.\n");
 		return -EINVAL;
 	}
 
 	/*
 	 * A hardware bug seems to be worked around if system memory addresses
 	 * start on 16 byte boundaries. This seems a bit restrictive however.
 	 * VIA is contacted about this. Meanwhile, impose the following
 	 * restrictions:
 	 */
 #ifdef VIA_BUGFREE
 	if ((((unsigned long)xfer->mem_addr & 3) !=
 	    ((unsigned long)xfer->fb_addr & 3)) ||
 	    ((xfer->num_lines > 1) && ((xfer->mem_stride & 3) !=
 	    (xfer->fb_stride & 3)))) {
 		DRM_ERROR("Invalid DRM bitblt alignment.\n");
 		return -EINVAL;
 	}
 #else
 	if ((((unsigned long)xfer->mem_addr & 15) ||
 	    ((unsigned long)xfer->fb_addr & 3)) ||
 	    ((xfer->num_lines > 1) &&
 	    ((xfer->mem_stride & 15) || (xfer->fb_stride & 3)))) {
 		DRM_ERROR("Invalid DRM bitblt alignment.\n");
 		return -EINVAL;
 	}
 #endif
 
 	if (0 != (ret = via_lock_all_dma_pages(vsg, xfer))) {
 		DRM_ERROR("Could not lock DMA pages.\n");
 		via_free_sg_info(vsg);
 		return ret;
 	}
 
 	/* First pass (mode 0) only counts descriptors; second pass fills them. */
 	via_map_blit_for_device(xfer, vsg, 0);
 	if (0 != (ret = via_alloc_desc_pages(vsg))) {
 		DRM_ERROR("Could not allocate DMA descriptor pages.\n");
 		via_free_sg_info(vsg);
 		return ret;
 	}
 	via_map_blit_for_device(xfer, vsg, 1);
 
 	return 0;
 }
 
 
 /*
  * Reserve one free slot in the blit queue. Will wait for one second for one
  * to become available. Otherwise -EBUSY is returned.
  */
 /*
  * blitq  - queue to take a slot from.
  * engine - not used in this function.
  *
  * Returns 0 once a slot has been reserved.  If the wait fails, -EINTR is
  * mapped to -EAGAIN and any other error is returned unchanged.
  */
 static int
 via_dmablit_grab_slot(drm_via_blitq_t *blitq, int engine)
 {
 	struct drm_device *dev = blitq->dev;
 	int ret=0;
 
 	/* Debug print only; num_free is read here without the lock held. */
 	DRM_DEBUG("Num free is %d\n", blitq->num_free);
 	mtx_lock(&blitq->blit_lock);
 	while(blitq->num_free == 0) {
 		/* Drop the lock while waiting for a slot to be released. */
 		mtx_unlock(&blitq->blit_lock);
 
 		DRM_WAIT_ON(ret, blitq->busy_queue, DRM_HZ,
 		    blitq->num_free > 0);
 		if (ret) {
 			/* The lock is not held on this return path. */
 			return (-EINTR == ret) ? -EAGAIN : ret;
 		}
 
 		mtx_lock(&blitq->blit_lock);
 	}
 
 	blitq->num_free--;
 	mtx_unlock(&blitq->blit_lock);
 
 	return 0;
 }
 
 
 /*
  * Hand back a free slot if we changed our mind.
  */
 /*
  * Increments num_free under the queue lock, then wakes waiters on
  * busy_queue after the lock has been dropped.
  */
 static void
 via_dmablit_release_slot(drm_via_blitq_t *blitq)
 {
 
 	mtx_lock(&blitq->blit_lock);
 	blitq->num_free++;
 	mtx_unlock(&blitq->blit_lock);
 	DRM_WAKEUP( &blitq->busy_queue );
 }
 
 
 /*
  * Grab a free slot. Build blit info and queue a blit.
  */
 /*
  * dev  - DRM device; must have dev_private set.
  * xfer - blit request; on success sync.sync_handle and sync.engine are
  *        filled in for a later sync call.
  *
  * Returns 0 on success or a negative errno value.
  */
 static int
 via_dmablit(struct drm_device *dev, drm_via_dmablit_t *xfer)
 {
 	drm_via_private_t *dev_priv = (drm_via_private_t *)dev->dev_private;
 	drm_via_sg_info_t *vsg;
 	drm_via_blitq_t *blitq;
 	int ret;
 	int engine;
 
 	if (dev_priv == NULL) {
 		DRM_ERROR("Called without initialization.\n");
 		return -EINVAL;
 	}
 
 	/* Engine 0 handles transfers to the frame buffer, engine 1 the rest. */
 	engine = (xfer->to_fb) ? 0 : 1;
 	blitq = dev_priv->blit_queues + engine;
 	if (0 != (ret = via_dmablit_grab_slot(blitq, engine))) {
 		return ret;
 	}
 	if (NULL == (vsg = malloc(sizeof(*vsg), DRM_MEM_DRIVER,
 	    M_NOWAIT | M_ZERO))) {
 		via_dmablit_release_slot(blitq);
 		return -ENOMEM;
 	}
 	if (0 != (ret = via_build_sg_info(dev, vsg, xfer))) {
 		via_dmablit_release_slot(blitq);
 		free(vsg, DRM_MEM_DRIVER);
 		return ret;
 	}
 	mtx_lock(&blitq->blit_lock);
 
 	/* Queue the blit at the head of the ring and assign its handle. */
 	blitq->blits[blitq->head++] = vsg;
 	if (blitq->head >= VIA_NUM_BLIT_SLOTS)
 		blitq->head = 0;
 	blitq->num_outstanding++;
 	xfer->sync.sync_handle = ++blitq->cur_blit_handle;
 
 	mtx_unlock(&blitq->blit_lock);
 	xfer->sync.engine = engine;
 
 	/* Kick the handler so the blit starts if the engine is idle. */
 	via_dmablit_handler(dev, engine, 0);
 
 	return 0;
 }
 
 
 /*
  * Sync on a previously submitted blit. Note that the X server uses signals
  * extensively, and that there is a very big probability that this IOCTL will
  * be interrupted by a signal. In that case it returns with -EAGAIN for the
  * signal to be delivered. The caller should then reissue the IOCTL. This is
  * similar to what is being done for drmGetLock().
  */
 int
 via_dma_blit_sync( struct drm_device *dev, void *data,
     struct drm_file *file_priv )
 {
 	drm_via_blitsync_t *sync = data;
 	int rc;
 
 	/* Reject engine numbers outside the supported range. */
 	if (sync->engine >= VIA_NUM_BLIT_ENGINES)
 		return -EINVAL;
 
 	rc = via_dmablit_sync(dev, sync->sync_handle, sync->engine);
 
 	/* An interrupted wait is reported as -EAGAIN. */
 	return (rc == -EINTR) ? -EAGAIN : rc;
 }
 
 
 /*
  * Queue a blit and hand back a handle to be used for sync. This IOCTL may be
  * interrupted by a signal while waiting for a free slot in the blit queue.
  * In that case it returns with -EAGAIN and should be reissued. See the above
  * IOCTL code.
  */
 int
 via_dma_blit( struct drm_device *dev, void *data, struct drm_file *file_priv )
 {
 	drm_via_dmablit_t *xfer = data;
 	int err;
 
 	err = via_dmablit(dev, xfer);
 
 	return err;
 }
Index: user/attilio/rm_vmobj_cache/sys/dev/drm2/i915/i915_gem.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/dev/drm2/i915/i915_gem.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/dev/drm2/i915/i915_gem.c	(revision 267237)
@@ -1,3785 +1,3785 @@
 /*-
  * Copyright © 2008 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  * IN THE SOFTWARE.
  *
  * Authors:
  *    Eric Anholt <eric@anholt.net>
  *
  * Copyright (c) 2011 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Konstantin Belousov under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <dev/drm2/drmP.h>
 #include <dev/drm2/drm.h>
 #include <dev/drm2/i915/i915_drm.h>
 #include <dev/drm2/i915/i915_drv.h>
 #include <dev/drm2/i915/intel_drv.h>
 #include <dev/drm2/i915/intel_ringbuffer.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 
 static void i915_gem_object_flush_cpu_write_domain(
     struct drm_i915_gem_object *obj);
 static uint32_t i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size,
     int tiling_mode);
 static uint32_t i915_gem_get_gtt_alignment(struct drm_device *dev,
     uint32_t size, int tiling_mode);
 static int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
     unsigned alignment, bool map_and_fenceable);
 static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj,
     int flags);
 static void i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj);
 static int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj,
     bool write);
 static void i915_gem_object_set_to_full_cpu_read_domain(
     struct drm_i915_gem_object *obj);
 static int i915_gem_object_set_cpu_read_domain_range(
     struct drm_i915_gem_object *obj, uint64_t offset, uint64_t size);
 static void i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj);
 static void i915_gem_object_truncate(struct drm_i915_gem_object *obj);
 static int i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj);
 static bool i915_gem_object_is_inactive(struct drm_i915_gem_object *obj);
 static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj);
 static vm_page_t i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex);
 static void i915_gem_process_flushing_list(struct intel_ring_buffer *ring,
     uint32_t flush_domains);
 static void i915_gem_clear_fence_reg(struct drm_device *dev,
     struct drm_i915_fence_reg *reg);
 static void i915_gem_reset_fences(struct drm_device *dev);
 static void i915_gem_retire_task_handler(void *arg, int pending);
 static int i915_gem_phys_pwrite(struct drm_device *dev,
     struct drm_i915_gem_object *obj, uint64_t data_ptr, uint64_t offset,
     uint64_t size, struct drm_file *file_priv);
 static void i915_gem_lowmem(void *arg);
 
 /* malloc(9) type used for the i915 GEM allocations made in this file. */
 MALLOC_DEFINE(DRM_I915_GEM, "i915gem", "Allocations from i915 gem");
 /*
  * Global counter adjusted with atomic_add_long(); it is decremented each
  * time i915_gem_swap_io() unwires a page.
  */
 long i915_gem_wired_pages_cnt;
 
 static void
 i915_gem_info_add_obj(struct drm_i915_private *dev_priv, size_t size)
 {
 
 	/* Account for one more GEM object of the given size. */
 	dev_priv->mm.object_memory += size;
 	dev_priv->mm.object_count++;
 }
 
 static void
 i915_gem_info_remove_obj(struct drm_i915_private *dev_priv, size_t size)
 {
 
 	/* Undo the accounting done by i915_gem_info_add_obj(). */
 	dev_priv->mm.object_memory -= size;
 	dev_priv->mm.object_count--;
 }
 
 /*
  * Wait until a pending GPU error has been signalled through
  * error_completion.
  *
  * Returns 0 immediately when mm.wedged is clear.  Otherwise sleeps
  * (interruptibly, PCATCH) until error_completion becomes non-zero.  A
  * non-zero msleep() result is negated and returned to the caller.
  */
 static int
 i915_gem_wait_for_error(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv;
 	int ret;
 
 	dev_priv = dev->dev_private;
 	/* Fast path: nothing to wait for when the device is not wedged. */
 	if (!atomic_load_acq_int(&dev_priv->mm.wedged))
 		return (0);
 
 	mtx_lock(&dev_priv->error_completion_lock);
 	while (dev_priv->error_completion == 0) {
 		ret = -msleep(&dev_priv->error_completion,
 		    &dev_priv->error_completion_lock, PCATCH, "915wco", 0);
 		if (ret != 0) {
 			/* Sleep was interrupted; drop the lock and bail out. */
 			mtx_unlock(&dev_priv->error_completion_lock);
 			return (ret);
 		}
 	}
 	mtx_unlock(&dev_priv->error_completion_lock);
 
 	/*
 	 * Still wedged after the wakeup: bump error_completion once more
 	 * under its lock.
 	 * NOTE(review): presumably keeps the completion signalled for
 	 * other waiters -- confirm against the error handler.
 	 */
 	if (atomic_load_acq_int(&dev_priv->mm.wedged)) {
 		mtx_lock(&dev_priv->error_completion_lock);
 		dev_priv->error_completion++;
 		mtx_unlock(&dev_priv->error_completion_lock);
 	}
 	return (0);
 }
 
 int
 i915_mutex_lock_interruptible(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv;
 	int ret;
 
 	dev_priv = dev->dev_private;
 	ret = i915_gem_wait_for_error(dev);
 	if (ret != 0)
 		return (ret);
 
 	/*
 	 * interruptible shall it be. might indeed be if dev_lock is
 	 * changed to sx
 	 */
 	ret = sx_xlock_sig(&dev->dev_struct_lock);
 	if (ret != 0)
 		return (-ret);
 
 	return (0);
 }
 
 
 /*
  * Final stage of object destruction: unbind the object, release its GEM
  * resources, update the accounting and free its memory.
  *
  * If the unbind returns -ERESTART the object is moved to
  * mm.deferred_free_list instead and nothing is freed here.
  */
 static void
 i915_gem_free_object_tail(struct drm_i915_gem_object *obj)
 {
 	struct drm_device *dev;
 	drm_i915_private_t *dev_priv;
 	int ret;
 
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 
 	ret = i915_gem_object_unbind(obj);
 	if (ret == -ERESTART) {
 		/* Could not unbind now; park the object for a later free. */
 		list_move(&obj->mm_list, &dev_priv->mm.deferred_free_list);
 		return;
 	}
 
 	CTR1(KTR_DRM, "object_destroy_tail %p", obj);
 	drm_gem_free_mmap_offset(&obj->base);
 	drm_gem_object_release(&obj->base);
 	i915_gem_info_remove_obj(dev_priv, obj->base.size);
 
 	/* Auxiliary arrays go first, then the object itself. */
 	free(obj->page_cpu_valid, DRM_I915_GEM);
 	free(obj->bit_17, DRM_I915_GEM);
 	free(obj, DRM_I915_GEM);
 }
 
 void
 i915_gem_free_object(struct drm_gem_object *gem_obj)
 {
 	struct drm_i915_gem_object *bo = to_intel_bo(gem_obj);
 	struct drm_device *drm = bo->base.dev;
 
 	/* Drop every outstanding pin before tearing the object down. */
 	while (bo->pin_count > 0) {
 		i915_gem_object_unpin(bo);
 	}
 
 	/* Detach from a physical object, if one is attached. */
 	if (bo->phys_obj != NULL) {
 		i915_gem_detach_phys_object(drm, bo);
 	}
 
 	i915_gem_free_object_tail(bo);
 }
 
 static void
 init_ring_lists(struct intel_ring_buffer *ring)
 {
 
 	/* Start the ring with empty tracking lists. */
 	INIT_LIST_HEAD(&ring->gpu_write_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	INIT_LIST_HEAD(&ring->active_list);
 }
 
 /*
  * One-time GEM setup for a device: initialise the memory-manager lists,
  * the per-ring and per-fence lists, the retire task, the fence register
  * bookkeeping and the low-memory event handler.
  */
 void
 i915_gem_load(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv;
 	int i;
 
 	dev_priv = dev->dev_private;
 
 	/* Object tracking lists of the memory manager. */
 	INIT_LIST_HEAD(&dev_priv->mm.active_list);
 	INIT_LIST_HEAD(&dev_priv->mm.flushing_list);
 	INIT_LIST_HEAD(&dev_priv->mm.inactive_list);
 	INIT_LIST_HEAD(&dev_priv->mm.pinned_list);
 	INIT_LIST_HEAD(&dev_priv->mm.fence_list);
 	INIT_LIST_HEAD(&dev_priv->mm.deferred_free_list);
 	INIT_LIST_HEAD(&dev_priv->mm.gtt_list);
 	for (i = 0; i < I915_NUM_RINGS; i++)
 		init_ring_lists(&dev_priv->rings[i]);
 	for (i = 0; i < I915_MAX_NUM_FENCES; i++)
 		INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list);
 	/* Retire task runs i915_gem_retire_task_handler on dev_priv->tq. */
 	TIMEOUT_TASK_INIT(dev_priv->tq, &dev_priv->mm.retire_task, 0,
 	    i915_gem_retire_task_handler, dev_priv);
 	dev_priv->error_completion = 0;
 
 	/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
 	if (IS_GEN3(dev)) {
 		u32 tmp = I915_READ(MI_ARB_STATE);
 		if (!(tmp & MI_ARB_C3_LP_WRITE_ENABLE)) {
 			/*
 			 * arb state is a masked write, so set bit +
 			 * bit in mask.
 			 */
 			tmp = MI_ARB_C3_LP_WRITE_ENABLE |
 			    (MI_ARB_C3_LP_WRITE_ENABLE << MI_ARB_MASK_SHIFT);
 			I915_WRITE(MI_ARB_STATE, tmp);
 		}
 	}
 
 	dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL;
 
 	/* Old X drivers will take 0-2 for front, back, depth buffers */
 	if (!drm_core_check_feature(dev, DRIVER_MODESET))
 		dev_priv->fence_reg_start = 3;
 
 	/* Gen4+, I945G/GM and G33 get 16 fence registers; the rest get 8. */
 	if (INTEL_INFO(dev)->gen >= 4 || IS_I945G(dev) || IS_I945GM(dev) ||
 	    IS_G33(dev))
 		dev_priv->num_fence_regs = 16;
 	else
 		dev_priv->num_fence_regs = 8;
 
 	/* Initialize fence registers to zero */
 	for (i = 0; i < dev_priv->num_fence_regs; i++) {
 		i915_gem_clear_fence_reg(dev, &dev_priv->fence_regs[i]);
 	}
 	i915_gem_detect_bit_6_swizzle(dev);
 	dev_priv->mm.interruptible = true;
 
 	/* Have i915_gem_lowmem() called on vm_lowmem events. */
 	dev_priv->mm.i915_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
 	    i915_gem_lowmem, dev, EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Set up the GTT range [start, end) for GEM use.
  *
  * Initialises the drm_mm allocator for the range, records the range
  * limits in dev_priv->mm, clears the covered GTT entries and registers
  * the mappable part of the aperture as a fictitious physical range.
  *
  * Returns the negated result of vm_phys_fictitious_reg_range().
  */
 int
 i915_gem_do_init(struct drm_device *dev, unsigned long start,
     unsigned long mappable_end, unsigned long end)
 {
 	drm_i915_private_t *dev_priv;
 	unsigned long mappable;
 	int error;
 
 	dev_priv = dev->dev_private;
 	/* Size of the mappable part: capped at whichever end comes first. */
 	mappable = min(end, mappable_end) - start;
 
 	drm_mm_init(&dev_priv->mm.gtt_space, start, end - start);
 
 	dev_priv->mm.gtt_start = start;
 	dev_priv->mm.gtt_mappable_end = mappable_end;
 	dev_priv->mm.gtt_end = end;
 	dev_priv->mm.gtt_total = end - start;
 	dev_priv->mm.mappable_gtt_total = mappable;
 
 	/* Take over this portion of the GTT */
 	intel_gtt_clear_range(start / PAGE_SIZE, (end-start) / PAGE_SIZE);
 	device_printf(dev->device,
 	    "taking over the fictitious range 0x%lx-0x%lx\n",
 	    dev->agp->base + start, dev->agp->base + start + mappable);
 	/* Register the aperture addresses as write-combining memory. */
 	error = -vm_phys_fictitious_reg_range(dev->agp->base + start,
 	    dev->agp->base + start + mappable, VM_MEMATTR_WRITE_COMBINING);
 	return (error);
 }
 
 int
 i915_gem_init_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_init *req = data;
 	drm_i915_private_t *priv = dev->dev_private;
 
 	/* The range must be non-empty ... */
 	if (req->gtt_start >= req->gtt_end)
 		return (-EINVAL);
 	/* ... and both ends must be page aligned. */
 	if (((req->gtt_end | req->gtt_start) & (PAGE_SIZE - 1)) != 0)
 		return (-EINVAL);
 
 	/* An initialised allocator lock means the GTT was already set up. */
 	if (mtx_initialized(&priv->mm.gtt_space.unused_lock))
 		return (-EBUSY);
 
 	/*
 	 * XXXKIB. The second-time initialization should be guarded
 	 * against.
 	 */
 	return (i915_gem_do_init(dev, req->gtt_start, req->gtt_end,
 	    req->gtt_end));
 }
 
 /*
  * Quiesce GEM: idle the GPU, reset the fences, mark the memory manager
  * suspended and tear down the ring buffers.
  *
  * Returns 0 when already suspended or on success; otherwise the error
  * from i915_gpu_idle() or i915_gem_evict_inactive().
  */
 int
 i915_gem_idle(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv;
 	int ret;
 
 	dev_priv = dev->dev_private;
 	/* Nothing to do when already suspended. */
 	if (dev_priv->mm.suspended)
 		return (0);
 
 	ret = i915_gpu_idle(dev, true);
 	if (ret != 0)
 		return (ret);
 
 	/* Under UMS, be paranoid and evict. */
 	if (!drm_core_check_feature(dev, DRIVER_MODESET)) {
 		ret = i915_gem_evict_inactive(dev, false);
 		if (ret != 0)
 			return ret;
 	}
 
 	i915_gem_reset_fences(dev);
 
 	/* Hack!  Don't let anybody do execbuf while we don't control the chip.
 	 * We need to replace this with a semaphore, or something.
 	 * And not confound mm.suspended!
 	 */
 	dev_priv->mm.suspended = 1;
 	callout_stop(&dev_priv->hangcheck_timer);
 
 	i915_kernel_lost_context(dev);
 	i915_gem_cleanup_ringbuffer(dev);
 
 	/* Cancel the retire work handler, which should be idle now. */
 	taskqueue_cancel_timeout(dev_priv->tq, &dev_priv->mm.retire_task, NULL);
 	return (ret);
 }
 
 /*
  * Program the swizzling control registers.
  *
  * Does nothing on hardware older than gen5 or when the detected bit-6
  * swizzle mode is I915_BIT_6_SWIZZLE_NONE.  Gen5 only gets the display
  * arbiter bit; later generations also get TILECTL and ARB_MODE.
  */
 void
 i915_gem_init_swizzling(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv;
 
 	dev_priv = dev->dev_private;
 
 	if (INTEL_INFO(dev)->gen < 5 ||
 	    dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE)
 		return;
 
 	/* Read-modify-write: set the tiled-surface swizzling bit. */
 	I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) |
 				 DISP_TILE_SURFACE_SWIZZLING);
 
 	if (IS_GEN5(dev))
 		return;
 
 	I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL);
 	/* Gen6 uses the SNB swizzle bit; everything else the IVB one. */
 	if (IS_GEN6(dev))
 		I915_WRITE(ARB_MODE, ARB_MODE_ENABLE(ARB_MODE_SWIZZLE_SNB));
 	else
 		I915_WRITE(ARB_MODE, ARB_MODE_ENABLE(ARB_MODE_SWIZZLE_IVB));
 }
 
 /*
  * Enable the aliasing PPGTT, if one was allocated.
  *
  * Writes one page-directory entry per page table into the global GTT,
  * programs the generation specific enable registers and then points
  * every ring at the page directory.  Returns silently when
  * mm.aliasing_ppgtt is NULL.
  */
 void
 i915_gem_init_ppgtt(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv;
 	struct i915_hw_ppgtt *ppgtt;
 	uint32_t pd_offset, pd_entry;
 	vm_paddr_t pt_addr;
 	struct intel_ring_buffer *ring;
 	u_int first_pd_entry_in_global_pt, i;
 
 	dev_priv = dev->dev_private;
 	ppgtt = dev_priv->mm.aliasing_ppgtt;
 	if (ppgtt == NULL)
 		return;
 
 	/*
 	 * The PDEs occupy the last I915_PPGTT_PD_ENTRIES slots below
 	 * global GTT index 512 * 1024.
 	 */
 	first_pd_entry_in_global_pt = 512 * 1024 - I915_PPGTT_PD_ENTRIES;
 	for (i = 0; i < ppgtt->num_pd_entries; i++) {
 		/* Encode the page table's physical address as a valid PDE. */
 		pt_addr = VM_PAGE_TO_PHYS(ppgtt->pt_pages[i]);
 		pd_entry = GEN6_PDE_ADDR_ENCODE(pt_addr);
 		pd_entry |= GEN6_PDE_VALID;
 		intel_gtt_write(first_pd_entry_in_global_pt + i, pd_entry);
 	}
 	/* NOTE(review): presumably a posting read to flush the writes. */
 	intel_gtt_read_pte(first_pd_entry_in_global_pt);
 
 	pd_offset = ppgtt->pd_offset;
 	pd_offset /= 64; /* in cachelines, */
 	pd_offset <<= 16;
 
 	if (INTEL_INFO(dev)->gen == 6) {
 		uint32_t ecochk = I915_READ(GAM_ECOCHK);
 		I915_WRITE(GAM_ECOCHK, ecochk | ECOCHK_SNB_BIT |
 				       ECOCHK_PPGTT_CACHE64B);
 		I915_WRITE(GFX_MODE, GFX_MODE_ENABLE(GFX_PPGTT_ENABLE));
 	} else if (INTEL_INFO(dev)->gen >= 7) {
 		I915_WRITE(GAM_ECOCHK, ECOCHK_PPGTT_CACHE64B);
 		/* GFX_MODE is per-ring on gen7+ */
 	}
 
 	/* Per-ring setup: enable (gen7+), then directory size and base. */
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		ring = &dev_priv->rings[i];
 
 		if (INTEL_INFO(dev)->gen >= 7)
 			I915_WRITE(RING_MODE_GEN7(ring),
 				   GFX_MODE_ENABLE(GFX_PPGTT_ENABLE));
 
 		I915_WRITE(RING_PP_DIR_DCLV(ring), PP_DIR_DCLV_2G);
 		I915_WRITE(RING_PP_DIR_BASE(ring), pd_offset);
 	}
 }
 
 /*
  * Bring up the GEM hardware state: swizzling, the render ring, the BSD
  * and BLT rings where the device has them, and the PPGTT.
  *
  * Returns 0 on success.  On a ring initialisation failure the rings
  * already set up are cleaned up in reverse order and the error is
  * returned.
  */
 int
 i915_gem_init_hw(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv;
 	int ret;
 
 	dev_priv = dev->dev_private;
 
 	i915_gem_init_swizzling(dev);
 
 	ret = intel_init_render_ring_buffer(dev);
 	if (ret != 0)
 		return (ret);
 
 	if (HAS_BSD(dev)) {
 		ret = intel_init_bsd_ring_buffer(dev);
 		if (ret != 0)
 			goto cleanup_render_ring;
 	}
 
 	if (HAS_BLT(dev)) {
 		ret = intel_init_blt_ring_buffer(dev);
 		if (ret != 0)
 			goto cleanup_bsd_ring;
 	}
 
 	/* Sequence numbers start at 1. */
 	dev_priv->next_seqno = 1;
 	i915_gem_init_ppgtt(dev);
 	return (0);
 
 	/* Error unwinding; the labels fall through into each other. */
 cleanup_bsd_ring:
 	intel_cleanup_ring_buffer(&dev_priv->rings[VCS]);
 cleanup_render_ring:
 	intel_cleanup_ring_buffer(&dev_priv->rings[RCS]);
 	return (ret);
 }
 
 int
 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_private *dev_priv;
 	struct drm_i915_gem_get_aperture *args;
 	struct drm_i915_gem_object *obj;
 	size_t pinned;
 
 	dev_priv = dev->dev_private;
 	args = data;
 
 	if (!(dev->driver->driver_features & DRIVER_GEM))
 		return (-ENODEV);
 
 	pinned = 0;
 	DRM_LOCK(dev);
 	list_for_each_entry(obj, &dev_priv->mm.pinned_list, mm_list)
 		pinned += obj->gtt_space->size;
 	DRM_UNLOCK(dev);
 
 	args->aper_size = dev_priv->mm.gtt_total;
 	args->aper_available_size = args->aper_size - pinned;
 
 	return (0);
 }
 
 /*
  * Pin an object into the GTT, binding it first when necessary.
  *
  * alignment: required GTT alignment, 0 for no constraint.
  * map_and_fenceable: the binding must be mappable and fenceable.
  *
  * An existing binding that violates either request is unbound and
  * redone.  Returns 0 on success or the error from the unbind or bind.
  */
 int
 i915_gem_object_pin(struct drm_i915_gem_object *obj, uint32_t alignment,
      bool map_and_fenceable)
 {
 	struct drm_device *dev;
 	struct drm_i915_private *dev_priv;
 	int ret;
 
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 
 	KASSERT(obj->pin_count != DRM_I915_GEM_OBJECT_MAX_PIN_COUNT,
 	    ("Max pin count"));
 
 	if (obj->gtt_space != NULL) {
 		/* Already bound: check the binding against the request. */
 		if ((alignment && obj->gtt_offset & (alignment - 1)) ||
 		    (map_and_fenceable && !obj->map_and_fenceable)) {
 			DRM_DEBUG("bo is already pinned with incorrect alignment:"
 			     " offset=%x, req.alignment=%x, req.map_and_fenceable=%d,"
 			     " obj->map_and_fenceable=%d\n",
 			     obj->gtt_offset, alignment,
 			     map_and_fenceable,
 			     obj->map_and_fenceable);
 			ret = i915_gem_object_unbind(obj);
 			if (ret != 0)
 				return (ret);
 		}
 	}
 
 	/* Bind when never bound, or when just unbound above. */
 	if (obj->gtt_space == NULL) {
 		ret = i915_gem_object_bind_to_gtt(obj, alignment,
 		    map_and_fenceable);
 		if (ret)
 			return (ret);
 	}
 
 	/* First pin of an inactive object moves it to the pinned list. */
 	if (obj->pin_count++ == 0 && !obj->active)
 		list_move_tail(&obj->mm_list, &dev_priv->mm.pinned_list);
 	obj->pin_mappable |= map_and_fenceable;
 
 #if 1
 	KIB_NOTYET();
 #else
 	WARN_ON(i915_verify_lists(dev));
 #endif
 	return (0);
 }
 
 /*
  * Drop one pin reference on an object.
  *
  * When the pin count reaches zero, an object that is not active is moved
  * to the inactive list and pin_mappable is cleared.  The object must be
  * pinned and bound (asserted).
  */
 void
 i915_gem_object_unpin(struct drm_i915_gem_object *obj)
 {
 	struct drm_device *dev;
 	drm_i915_private_t *dev_priv;
 
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 
 #if 1
 	KIB_NOTYET();
 #else
 	WARN_ON(i915_verify_lists(dev));
 #endif
 	
 	KASSERT(obj->pin_count != 0, ("zero pin count"));
 	KASSERT(obj->gtt_space != NULL, ("No gtt mapping"));
 
 	if (--obj->pin_count == 0) {
 		/* Last pin gone: the object may leave the pinned list. */
 		if (!obj->active)
 			list_move_tail(&obj->mm_list,
 			    &dev_priv->mm.inactive_list);
 		obj->pin_mappable = false;
 	}
 #if 1
 	KIB_NOTYET();
 #else
 	WARN_ON(i915_verify_lists(dev));
 #endif
 }
 
 /*
  * PIN ioctl: pin the object named by args->handle into the GTT on behalf
  * of the calling file and report its GTT offset in args->offset.
  *
  * Only the first user pin actually pins the object; later calls from the
  * same file just bump user_pin_count.  The user pin bookkeeping
  * (user_pin_count, pin_filp) is updated only after the pin succeeded, so
  * a failed pin leaves the object without a phantom user pin.
  *
  * Returns 0 on success, -ENOENT for an unknown handle, -EINVAL for a
  * purgeable object or one pinned by another file, or the error from
  * locking or pinning.
  */
 int
 i915_gem_pin_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_pin *args;
 	struct drm_i915_gem_object *obj;
 	struct drm_gem_object *gobj;
 	int ret;
 
 	args = data;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		return ret;
 
 	gobj = drm_gem_object_lookup(dev, file, args->handle);
 	if (gobj == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	obj = to_intel_bo(gobj);
 
 	if (obj->madv != I915_MADV_WILLNEED) {
 		DRM_ERROR("Attempting to pin a purgeable buffer\n");
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (obj->pin_filp != NULL && obj->pin_filp != file) {
 		DRM_ERROR("Already pinned in i915_gem_pin_ioctl(): %d\n",
 		    args->handle);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	/* Pin first; record the user pin only once that has succeeded. */
 	if (obj->user_pin_count == 0) {
 		ret = i915_gem_object_pin(obj, args->alignment, true);
 		if (ret != 0)
 			goto out;
 	}
 	obj->user_pin_count++;
 	obj->pin_filp = file;
 
 	/* XXX - flush the CPU caches for pinned objects
 	 * as the X server doesn't manage domains yet
 	 */
 	i915_gem_object_flush_cpu_write_domain(obj);
 	args->offset = obj->gtt_offset;
 out:
 	/* Drop the reference taken by the lookup. */
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (ret);
 }
 
 /*
  * UNPIN ioctl: drop one user pin held by the calling file on the object
  * named by args->handle.  The object is really unpinned when the user
  * pin count reaches zero.
  *
  * Returns 0 on success, -ENOENT for an unknown handle, -EINVAL when the
  * object is not pinned by this file, or the error from locking.
  */
 int
 i915_gem_unpin_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_pin *args;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	args = data;
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		return (ret);
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	/* A failed lookup shows up as a NULL base pointer. */
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	if (obj->pin_filp != file) {
 		DRM_ERROR("Not pinned by caller in i915_gem_pin_ioctl(): %d\n",
 		    args->handle);
 		ret = -EINVAL;
 		goto out;
 	}
 	obj->user_pin_count--;
 	if (obj->user_pin_count == 0) {
 		/* Last user pin: forget the owner and unpin for real. */
 		obj->pin_filp = NULL;
 		i915_gem_object_unpin(obj);
 	}
 
 out:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (ret);
 }
 
 /*
  * BUSY ioctl: report in args->busy whether the object named by
  * args->handle is still active.
  *
  * For an active object the pending work is pushed along first (ring
  * flush or a new request), requests are retired, and the active state is
  * sampled again.
  *
  * Returns 0 on success, -ENOENT for an unknown handle, or the error from
  * locking, flushing or adding the request.
  */
 int
 i915_gem_busy_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_busy *args;
 	struct drm_i915_gem_object *obj;
 	struct drm_i915_gem_request *request;
 	int ret;
 
 	args = data;
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		return ret;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	args->busy = obj->active;
 	if (args->busy) {
 		if (obj->base.write_domain & I915_GEM_GPU_DOMAINS) {
 			/* Unflushed GPU writes: flush the write domain. */
 			ret = i915_gem_flush_ring(obj->ring,
 			    0, obj->base.write_domain);
 		} else if (obj->ring->outstanding_lazy_request ==
 		    obj->last_rendering_seqno) {
 			/*
 			 * The object's last seqno is the ring's lazy request:
 			 * add a request now.  The request is freed here only
 			 * when i915_add_request() fails.
 			 */
 			request = malloc(sizeof(*request), DRM_I915_GEM,
 			    M_WAITOK | M_ZERO);
 			ret = i915_add_request(obj->ring, NULL, request);
 			if (ret != 0)
 				free(request, DRM_I915_GEM);
 		}
 
 		/* Retire completed requests, then re-sample the state. */
 		i915_gem_retire_requests_ring(obj->ring);
 		args->busy = obj->active;
 	}
 
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (ret);
 }
 
 /*
  * Throttle a client: wait for the newest of its requests that was
  * emitted before the "recent enough" cutoff (20ms worth of ticks ago).
  *
  * Returns 0 when there is nothing to wait for or the wait completed,
  * -EIO when the GPU is wedged, -EBUSY when the polled wait fails, or the
  * negated msleep() error when the sleep is interrupted.
  */
 static int
 i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_private *dev_priv;
 	struct drm_i915_file_private *file_priv;
 	unsigned long recent_enough;
 	struct drm_i915_gem_request *request;
 	struct intel_ring_buffer *ring;
 	u32 seqno;
 	int ret;
 
 	dev_priv = dev->dev_private;
 	if (atomic_load_acq_int(&dev_priv->mm.wedged))
 		return (-EIO);
 
 	file_priv = file->driver_priv;
 	recent_enough = ticks - (20 * hz / 1000);
 	ring = NULL;
 	seqno = 0;
 
 	/*
 	 * Walk the client's requests and remember the last one emitted
 	 * before the cutoff; stop at the first one at or after it.
 	 */
 	mtx_lock(&file_priv->mm.lck);
 	list_for_each_entry(request, &file_priv->mm.request_list, client_list) {
 		if (time_after_eq(request->emitted_jiffies, recent_enough))
 			break;
 		ring = request->ring;
 		seqno = request->seqno;
 	}
 	mtx_unlock(&file_priv->mm.lck);
 	/* seqno stays 0 when no request was old enough. */
 	if (seqno == 0)
 		return (0);
 
 	ret = 0;
 	mtx_lock(&ring->irq_lock);
 	if (!i915_seqno_passed(ring->get_seqno(ring), seqno)) {
 		if (ring->irq_get(ring)) {
 			/*
 			 * Sleep (interruptibly) until the seqno has passed
 			 * or the GPU is declared wedged.
 			 */
 			while (ret == 0 &&
 			    !(i915_seqno_passed(ring->get_seqno(ring), seqno) ||
 			    atomic_load_acq_int(&dev_priv->mm.wedged)))
 				ret = -msleep(ring, &ring->irq_lock, PCATCH,
 				    "915thr", 0);
 			ring->irq_put(ring);
 			if (ret == 0 && atomic_load_acq_int(&dev_priv->mm.wedged))
 				ret = -EIO;
 		} else if (_intel_wait_for(dev,
 		    i915_seqno_passed(ring->get_seqno(ring), seqno) ||
 		    atomic_load_acq_int(&dev_priv->mm.wedged), 3000, 0, "915rtr")) {
 			/* irq_get() failed: fall back to the polled wait. */
 			ret = -EBUSY;
 		}
 	}
 	mtx_unlock(&ring->irq_lock);
 
 	/* On success, schedule the retire task without delay. */
 	if (ret == 0)
 		taskqueue_enqueue_timeout(dev_priv->tq,
 		    &dev_priv->mm.retire_task, 0);
 
 	return (ret);
 }
 
 int
 i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file_priv)
 {
 	int error;
 
 	/* The ioctl carries no payload; throttle the calling file. */
 	error = i915_gem_ring_throttle(dev, file_priv);
 	return (error);
 }
 
 /*
  * MADVISE ioctl: record the caller's advice (I915_MADV_DONTNEED or
  * I915_MADV_WILLNEED) on the object named by args->handle and report in
  * args->retained whether its backing store is still there.
  *
  * Returns 0 on success, -EINVAL for unknown advice or a pinned object,
  * -ENOENT for an unknown handle, or the error from locking.
  */
 int
 i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file_priv)
 {
 	struct drm_i915_gem_madvise *args;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	args = data;
 	/* Validate the advice before taking any lock. */
 	switch (args->madv) {
 	case I915_MADV_DONTNEED:
 	case I915_MADV_WILLNEED:
 		break;
 	default:
 		return (-EINVAL);
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		return (ret);
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle));
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	if (obj->pin_count != 0) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	/* An already purged object keeps its purged state. */
 	if (obj->madv != I915_MADV_PURGED_INTERNAL)
 		obj->madv = args->madv;
 	/* Purgeable and unbound: discard the backing store right away. */
 	if (i915_gem_object_is_purgeable(obj) && obj->gtt_space == NULL)
 		i915_gem_object_truncate(obj);
 	args->retained = obj->madv != I915_MADV_PURGED_INTERNAL;
 
 out:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (ret);
 }
 
 void
 i915_gem_cleanup_ringbuffer(struct drm_device *dev)
 {
 	drm_i915_private_t *priv = dev->dev_private;
 	int ring_idx = 0;
 
 	/* Tear down every ring slot, in index order. */
 	while (ring_idx < I915_NUM_RINGS) {
 		intel_cleanup_ring_buffer(&priv->rings[ring_idx]);
 		ring_idx++;
 	}
 }
 
 /*
  * ENTERVT ioctl: take the hardware back.  Clears a wedged state,
  * reinitialises the GEM hardware state and installs the interrupt
  * handler.
  *
  * Does nothing (returns 0) when the driver runs with DRIVER_MODESET.
  * Returns 0 on success or the error from i915_gem_init_hw() or
  * drm_irq_install().
  */
 int
 i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file_priv)
 {
 	drm_i915_private_t *dev_priv;
 	int ret, i;
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return (0);
 	dev_priv = dev->dev_private;
 	if (atomic_load_acq_int(&dev_priv->mm.wedged) != 0) {
 		DRM_ERROR("Reenabling wedged hardware, good luck\n");
 		atomic_store_rel_int(&dev_priv->mm.wedged, 0);
 	}
 
 	dev_priv->mm.suspended = 0;
 
 	ret = i915_gem_init_hw(dev);
 	if (ret != 0) {
 		return (ret);
 	}
 
 	/* All object and request lists are expected to be empty here. */
 	KASSERT(list_empty(&dev_priv->mm.active_list), ("active list"));
 	KASSERT(list_empty(&dev_priv->mm.flushing_list), ("flushing list"));
 	KASSERT(list_empty(&dev_priv->mm.inactive_list), ("inactive list"));
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		KASSERT(list_empty(&dev_priv->rings[i].active_list),
 		    ("ring %d active list", i));
 		KASSERT(list_empty(&dev_priv->rings[i].request_list),
 		    ("ring %d request list", i));
 	}
 
 	/* The DRM lock is dropped around the interrupt installation. */
 	DRM_UNLOCK(dev);
 	ret = drm_irq_install(dev);
 	DRM_LOCK(dev);
 	if (ret)
 		goto cleanup_ringbuffer;
 
 	return (0);
 
 cleanup_ringbuffer:
 	/* Undo the ring setup and go back to the suspended state. */
 	i915_gem_cleanup_ringbuffer(dev);
 	dev_priv->mm.suspended = 1;
 
 	return (ret);
 }
 
 int
 i915_gem_leavevt_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file_priv)
 {
 
 	/* Without DRIVER_MODESET: remove the IRQ handler and idle GEM. */
 	if (!drm_core_check_feature(dev, DRIVER_MODESET)) {
 		drm_irq_uninstall(dev);
 		return (i915_gem_idle(dev));
 	}
 
 	/* With DRIVER_MODESET this ioctl does nothing. */
 	return 0;
 }
 
 /*
  * Allocate a GEM object of at least `size' bytes and create a handle for
  * it in `file'; the handle is stored in *handle_p.
  *
  * The size is rounded up to a multiple of PAGE_SIZE.  Returns 0 on
  * success, -EINVAL when the rounded size is zero, -ENOMEM when the
  * allocation fails, or the negated result of drm_gem_handle_create().
  */
 int
 i915_gem_create(struct drm_file *file, struct drm_device *dev, uint64_t size,
     uint32_t *handle_p)
 {
 	struct drm_i915_gem_object *obj;
 	uint32_t handle;
 	int ret;
 
 	size = roundup(size, PAGE_SIZE);
 	if (size == 0)
 		return (-EINVAL);
 
 	obj = i915_gem_alloc_object(dev, size);
 	if (obj == NULL)
 		return (-ENOMEM);
 
 	handle = 0;
 	ret = drm_gem_handle_create(file, &obj->base, &handle);
 	if (ret != 0) {
 		/* No handle: release the object, fix accounting, free it. */
 		drm_gem_object_release(&obj->base);
 		i915_gem_info_remove_obj(dev->dev_private, obj->base.size);
 		free(obj, DRM_I915_GEM);
 		return (-ret);
 	}
 
 	/* drop reference from allocate - handle holds it now */
 	drm_gem_object_unreference(&obj->base);
 	CTR2(KTR_DRM, "object_create %p %x", obj, size);
 	*handle_p = handle;
 	return (0);
 }
 
 /*
  * Create a "dumb" buffer: derive pitch and size from the requested
  * width, height and bits per pixel, write them back into `args' and
  * allocate the object through i915_gem_create().
  */
 int
 i915_gem_dumb_create(struct drm_file *file, struct drm_device *dev,
     struct drm_mode_create_dumb *args)
 {
 
 	/* have to work out size/pitch and return them */
 	/* Bytes per pixel is bpp rounded up; pitch is a multiple of 64. */
 	args->pitch = roundup2(args->width * ((args->bpp + 7) / 8), 64);
 	args->size = args->pitch * args->height;
 	return (i915_gem_create(file, dev, args->size, &args->handle));
 }
 
 int
 i915_gem_dumb_destroy(struct drm_file *file, struct drm_device *dev,
     uint32_t handle)
 {
 	int error;
 
 	/* Destroying a dumb buffer is just deleting its handle. */
 	error = drm_gem_handle_delete(file, handle);
 	return (error);
 }
 
 int
 i915_gem_create_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_create *args = data;
 
 	return (i915_gem_create(file, dev, args->size, &args->handle));
 }
 
 /*
  * Copy `size' bytes between the user buffer at data_ptr and the object's
  * backing VM pages, starting at byte `offset' within the object.
  *
  * rw == UIO_READ copies from the object out to the user buffer; any
  * other value copies from the user buffer into the object.  The copy
  * proceeds one page at a time through a temporary sf_buf mapping and
  * uses the _nofault copy routines.
  *
  * Returns 0 on success or the negated error of the failing copy.
  */
 static int
 i915_gem_swap_io(struct drm_device *dev, struct drm_i915_gem_object *obj,
     uint64_t data_ptr, uint64_t size, uint64_t offset, enum uio_rw rw,
     struct drm_file *file)
 {
 	vm_object_t vm_obj;
 	vm_page_t m;
 	struct sf_buf *sf;
 	vm_offset_t mkva;
 	vm_pindex_t obj_pi;
 	int cnt, do_bit17_swizzling, length, obj_po, ret, swizzled_po;
 
 	/* Swizzling is only considered for reads of a GTT-bound object. */
 	if (obj->gtt_offset != 0 && rw == UIO_READ)
 		do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 	else
 		do_bit17_swizzling = 0;
 
 	/* Note: the object is marked dirty for reads as well as writes. */
 	obj->dirty = 1;
 	vm_obj = obj->base.vm_obj;
 	ret = 0;
 
 	VM_OBJECT_WLOCK(vm_obj);
 	vm_object_pip_add(vm_obj, 1);
 	while (size > 0) {
 		/* Page index and offset within that page. */
 		obj_pi = OFF_TO_IDX(offset);
 		obj_po = offset & PAGE_MASK;
 
 		/* Wire the page, then drop the object lock for the copy. */
 		m = i915_gem_wire_page(vm_obj, obj_pi);
 		VM_OBJECT_WUNLOCK(vm_obj);
 
 		/* SFB_CPUPRIVATE mapping, so stay on this CPU while it lives. */
 		sched_pin();
 		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 		mkva = sf_buf_kva(sf);
 		/* Bytes to transfer from this page. */
 		length = min(size, PAGE_SIZE - obj_po);
 		while (length > 0) {
 			if (do_bit17_swizzling &&
 			    (VM_PAGE_TO_PHYS(m) & (1 << 17)) != 0) {
 				/*
 				 * Physical address bit 17 set: copy at most up
 				 * to the next 64-byte boundary and flip bit 6
 				 * of the in-page offset.
 				 */
 				cnt = roundup2(obj_po + 1, 64);
 				cnt = min(cnt - obj_po, length);
 				swizzled_po = obj_po ^ 64;
 			} else {
 				cnt = length;
 				swizzled_po = obj_po;
 			}
 			if (rw == UIO_READ)
 				ret = -copyout_nofault(
 				    (char *)mkva + swizzled_po,
 				    (void *)(uintptr_t)data_ptr, cnt);
 			else
 				ret = -copyin_nofault(
 				    (void *)(uintptr_t)data_ptr,
 				    (char *)mkva + swizzled_po, cnt);
 			if (ret != 0)
 				break;
 			data_ptr += cnt;
 			size -= cnt;
 			length -= cnt;
 			offset += cnt;
 			obj_po += cnt;
 		}
 		sf_buf_free(sf);
 		sched_unpin();
 		/* Retake the object lock before touching the page state. */
 		VM_OBJECT_WLOCK(vm_obj);
 		if (rw == UIO_WRITE)
 			vm_page_dirty(m);
 		vm_page_reference(m);
 		vm_page_lock(m);
-		vm_page_unwire(m, 1);
+		vm_page_unwire(m, PQ_ACTIVE);
 		vm_page_unlock(m);
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 
 		/* A failed copy ends the transfer after the page cleanup. */
 		if (ret != 0)
 			break;
 	}
 	vm_object_pip_wakeup(vm_obj);
 	VM_OBJECT_WUNLOCK(vm_obj);
 
 	return (ret);
 }
 
 /*
  * Copy `size' bytes from the user buffer at data_ptr into the object
  * through a temporary write-combining mapping of its GTT aperture range,
  * starting at byte `offset' within the object.
  *
  * The mapping starts at the page containing `offset', so it has to span
  * the in-page offset plus `size'; mapping only `size' bytes would let
  * the copy run past the end of the mapping whenever `offset' is not page
  * aligned.
  *
  * Returns 0 on success or the negated copyin_nofault() error.
  */
 static int
 i915_gem_gtt_write(struct drm_device *dev, struct drm_i915_gem_object *obj,
     uint64_t data_ptr, uint64_t size, uint64_t offset, struct drm_file *file)
 {
 	vm_offset_t mkva;
 	vm_pindex_t obj_pi;
 	vm_size_t map_len;
 	int obj_po, ret;
 
 	obj_pi = OFF_TO_IDX(offset);
 	obj_po = offset & PAGE_MASK;
 	/* The copy touches [obj_po, obj_po + size) of the mapping. */
 	map_len = obj_po + size;
 
 	mkva = (vm_offset_t)pmap_mapdev_attr(dev->agp->base + obj->gtt_offset +
 	    IDX_TO_OFF(obj_pi), map_len, PAT_WRITE_COMBINING);
 	ret = -copyin_nofault((void *)(uintptr_t)data_ptr, (char *)mkva +
 	    obj_po, size);
 	/* Unmap exactly what was mapped. */
 	pmap_unmapdev(mkva, map_len);
 	return (ret);
 }
 
 /*
  * Common implementation of the pread and pwrite ioctls: transfer `size'
  * bytes between the user buffer at data_ptr and the object named by
  * `handle', starting at byte `offset' within the object.
  *
  * The user pages are held for the duration of the transfer.  Reads go
  * through i915_gem_swap_io(); writes go to the physical object, through
  * the GTT, or through i915_gem_swap_io(), depending on the object state.
  *
  * Returns 0 on success (also for size == 0), -EFAULT when the user pages
  * cannot be held, -ENOENT for an unknown handle, -EINVAL when the range
  * lies outside the object, or the error of the failing step.
  */
 static int
 i915_gem_obj_io(struct drm_device *dev, uint32_t handle, uint64_t data_ptr,
     uint64_t size, uint64_t offset, enum uio_rw rw, struct drm_file *file)
 {
 	struct drm_i915_gem_object *obj;
 	vm_page_t *ma;
 	vm_offset_t start, end;
 	int npages, ret;
 
 	if (size == 0)
 		return (0);
 	/*
 	 * Number of pages spanned by the user buffer.
 	 * NOTE(review): npages is an int derived from the user-supplied
 	 * size, which is not range-checked before the allocation below --
 	 * confirm that callers bound it.
 	 */
 	start = trunc_page(data_ptr);
 	end = round_page(data_ptr + size);
 	npages = howmany(end - start, PAGE_SIZE);
 	ma = malloc(npages * sizeof(vm_page_t), DRM_I915_GEM, M_WAITOK |
 	    M_ZERO);
 	/*
 	 * Hold the user pages.  A read from the object writes to the user
 	 * buffer, so it also asks for VM_PROT_WRITE.
 	 */
 	npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)data_ptr, size,
 	    (rw == UIO_READ ? VM_PROT_WRITE : 0 ) | VM_PROT_READ, ma, npages);
 	if (npages == -1) {
 		ret = -EFAULT;
 		goto free_ma;
 	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		goto unlocked;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, handle));
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	/* Bounds check, written so that offset + size cannot wrap. */
 	if (offset > obj->base.size || size > obj->base.size - offset) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	if (rw == UIO_READ) {
 		CTR3(KTR_DRM, "object_pread %p %jx %jx", obj, offset, size);
 		ret = i915_gem_object_set_cpu_read_domain_range(obj,
 		    offset, size);
 		if (ret != 0)
 			goto out;
 		ret = i915_gem_swap_io(dev, obj, data_ptr, size, offset,
 		    UIO_READ, file);
 	} else {
 		if (obj->phys_obj) {
 			/* Write path 1: object backed by a physical object. */
 			CTR3(KTR_DRM, "object_phys_write %p %jx %jx", obj,
 			    offset, size);
 			ret = i915_gem_phys_pwrite(dev, obj, data_ptr, offset,
 			    size, file);
 		} else if (obj->gtt_space &&
 		    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 			/*
 			 * Write path 2: bound and not in the CPU write domain.
 			 * Pin, move to the GTT domain, drop the fence and
 			 * write through the aperture.
 			 */
 			CTR3(KTR_DRM, "object_gtt_write %p %jx %jx", obj,
 			    offset, size);
 			ret = i915_gem_object_pin(obj, 0, true);
 			if (ret != 0)
 				goto out;
 			ret = i915_gem_object_set_to_gtt_domain(obj, true);
 			if (ret != 0)
 				goto out_unpin;
 			ret = i915_gem_object_put_fence(obj);
 			if (ret != 0)
 				goto out_unpin;
 			ret = i915_gem_gtt_write(dev, obj, data_ptr, size,
 			    offset, file);
 out_unpin:
 			i915_gem_object_unpin(obj);
 		} else {
 			/* Write path 3: through the backing pages. */
 			CTR3(KTR_DRM, "object_pwrite %p %jx %jx", obj,
 			    offset, size);
 			ret = i915_gem_object_set_to_cpu_domain(obj, true);
 			if (ret != 0)
 				goto out;
 			ret = i915_gem_swap_io(dev, obj, data_ptr, size, offset,
 			    UIO_WRITE, file);
 		}
 	}
 	/* Cleanup in reverse order of acquisition; labels fall through. */
 out:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 unlocked:
 	vm_page_unhold_pages(ma, npages);
 free_ma:
 	free(ma, DRM_I915_GEM);
 	return (ret);
 }
 
 int
 i915_gem_pread_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct drm_i915_gem_pread *args;
 
 	args = data;
 	return (i915_gem_obj_io(dev, args->handle, args->data_ptr, args->size,
 	    args->offset, UIO_READ, file));
 }
 
 int
 i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 {
 	struct drm_i915_gem_pwrite *args;
 
 	args = data;
 	return (i915_gem_obj_io(dev, args->handle, args->data_ptr, args->size,
 	    args->offset, UIO_WRITE, file));
 }
 
 /*
  * SET_DOMAIN ioctl: move the object named by args->handle into the GTT
  * or the CPU domain, for reading or for writing.
  *
  * Returns 0 on success, -ENODEV when the driver has no GEM support,
  * -EINVAL for an invalid domain combination, -ENOENT for an unknown
  * handle, or the error from locking or from the domain change.
  */
 int
 i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_set_domain *args;
 	struct drm_i915_gem_object *obj;
 	uint32_t read_domains;
 	uint32_t write_domain;
 	int ret;
 
 	if ((dev->driver->driver_features & DRIVER_GEM) == 0)
 		return (-ENODEV);
 
 	args = data;
 	read_domains = args->read_domains;
 	write_domain = args->write_domain;
 
 	/*
 	 * GPU domains cannot be requested here, and a write domain must
 	 * be the same as the read domains.
 	 */
 	if ((write_domain & I915_GEM_GPU_DOMAINS) != 0 ||
 	    (read_domains & I915_GEM_GPU_DOMAINS) != 0 ||
 	    (write_domain != 0 && read_domains != write_domain))
 		return (-EINVAL);
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		return (ret);
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 
 	if ((read_domains & I915_GEM_DOMAIN_GTT) != 0) {
 		ret = i915_gem_object_set_to_gtt_domain(obj, write_domain != 0);
 		/* -EINVAL from the GTT domain change is not reported. */
 		if (ret == -EINVAL)
 			ret = 0;
 	} else
 		ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0);
 
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (ret);
 }
 
 /*
  * GEM sw_finish ioctl: if the object named by args->handle is pinned,
  * flush its CPU write domain.
  *
  * Returns 0 on success or a negative errno: -ENODEV when the driver has
  * no GEM support, -ENOENT when the handle does not resolve, or the error
  * from i915_mutex_lock_interruptible().  The no-GEM case used to return
  * a positive ENODEV, unlike every other ioctl in this file; it is now
  * negative like the rest.
  */
 int
 i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_sw_finish *args;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
 	args = data;
 	if ((dev->driver->driver_features & DRIVER_GEM) == 0)
 		return (-ENODEV);
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret != 0)
 		return (ret);
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base == NULL) {
 		ret = -ENOENT;
 		goto unlock;
 	}
 	/* Only pinned objects are flushed here. */
 	if (obj->pin_count != 0)
 		i915_gem_object_flush_cpu_write_domain(obj);
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (ret);
 }
 
 int
 i915_gem_mmap_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_mmap *args;
 	struct drm_gem_object *obj;
 	struct proc *p;
 	vm_map_t map;
 	vm_offset_t addr;
 	vm_size_t size;
 	int error, rv;
 
 	args = data;
 
 	if ((dev->driver->driver_features & DRIVER_GEM) == 0)
 		return (-ENODEV);
 
 	obj = drm_gem_object_lookup(dev, file, args->handle);
 	if (obj == NULL)
 		return (-ENOENT);
 	error = 0;
 	if (args->size == 0)
 		goto out;
 	p = curproc;
 	map = &p->p_vmspace->vm_map;
 	size = round_page(args->size);
 	PROC_LOCK(p);
 	if (map->size + size > lim_cur(p, RLIMIT_VMEM)) {
 		PROC_UNLOCK(p);
 		error = ENOMEM;
 		goto out;
 	}
 	PROC_UNLOCK(p);
 
 	addr = 0;
 	vm_object_reference(obj->vm_obj);
 	DRM_UNLOCK(dev);
 	rv = vm_map_find(map, obj->vm_obj, args->offset, &addr, args->size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, MAP_INHERIT_SHARE);
 	if (rv != KERN_SUCCESS) {
 		vm_object_deallocate(obj->vm_obj);
 		error = -vm_mmap_to_errno(rv);
 	} else {
 		args->addr_ptr = (uint64_t)addr;
 	}
 	DRM_LOCK(dev);
 out:
 	drm_gem_object_unreference(obj);
 	return (error);
 }
 
 /*
  * Constructor callback for the GEM cdev pager (.cdev_pg_ctor in
  * i915_gem_pager_ops).  Ignores every argument except color, which it
  * sets to 0, and always returns 0.
  */
 static int
 i915_gem_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred, u_short *color)
 {
 
 	*color = 0; /* XXXKIB */
 	return (0);
 }
 
 int i915_intr_pf;
 
 /*
  * Page fault callback for the GEM cdev pager (.cdev_pg_fault in
  * i915_gem_pager_ops).
  *
  * vm_obj->handle is the GEM object.  The handler makes sure the object
  * is bound into the GTT and has the fence state matching its tiling
  * mode, then looks up the fictitious page for
  * dev->agp->base + obj->gtt_offset + offset, inserts it into vm_obj and
  * returns it exclusively busied through *mres.
  *
  * The function unlocks vm_obj shortly after entry and holds the object
  * write lock again on both return paths.
  *
  * Returns VM_PAGER_OK on success and VM_PAGER_ERROR on a failure that is
  * not retried.  Failures of -EAGAIN, -EIO and -EINTR are retried after a
  * kern_yield().  'cause' is only a diagnostic code recorded in the
  * failure trace record.
  */
 static int
 i915_gem_pager_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot,
     vm_page_t *mres)
 {
 	struct drm_gem_object *gem_obj;
 	struct drm_i915_gem_object *obj;
 	struct drm_device *dev;
 	drm_i915_private_t *dev_priv;
 	vm_page_t m, oldm;
 	int cause, ret;
 	bool write;
 
 	gem_obj = vm_obj->handle;
 	obj = to_intel_bo(gem_obj);
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 	/* The access type in 'prot' is ignored; a write is always assumed. */
 #if 0
 	write = (prot & VM_PROT_WRITE) != 0;
 #else
 	write = true;
 #endif
 	/* Paired with vm_object_pip_wakeup() on both return paths. */
 	vm_object_pip_add(vm_obj, 1);
 
 	/*
 	 * Remove the placeholder page inserted by vm_fault() from the
 	 * object before dropping the object lock. If
 	 * i915_gem_release_mmap() is active in parallel on this gem
 	 * object, then it owns the drm device sx and might find the
 	 * placeholder already. Then, since the page is busy,
 	 * i915_gem_release_mmap() sleeps waiting for the busy state
 	 * of the page cleared. We will be not able to acquire drm
 	 * device lock until i915_gem_release_mmap() is able to make a
 	 * progress.
 	 */
 	if (*mres != NULL) {
 		oldm = *mres;
 		vm_page_lock(oldm);
 		vm_page_remove(oldm);
 		vm_page_unlock(oldm);
 		*mres = NULL;
 	} else
 		oldm = NULL;
 	VM_OBJECT_WUNLOCK(vm_obj);
 retry:
 	/* Every retry starts with neither the DRM nor the object lock held. */
 	cause = ret = 0;
 	m = NULL;
 
 	/* i915_intr_pf selects an interruptible DRM lock acquisition. */
 	if (i915_intr_pf) {
 		ret = i915_mutex_lock_interruptible(dev);
 		if (ret != 0) {
 			cause = 10;
 			goto out;
 		}
 	} else
 		DRM_LOCK(dev);
 
 	/*
 	 * Since the object lock was dropped, other thread might have
 	 * faulted on the same GTT address and instantiated the
 	 * mapping for the page.  Recheck.
 	 */
 	VM_OBJECT_WLOCK(vm_obj);
 	m = vm_page_lookup(vm_obj, OFF_TO_IDX(offset));
 	if (m != NULL) {
 		if (vm_page_busied(m)) {
 			/* Drop both locks, wait for the page, start over. */
 			DRM_UNLOCK(dev);
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(vm_obj);
 			vm_page_busy_sleep(m, "915pee");
 			goto retry;
 		}
 		/* Page already present: hand it back, object still locked. */
 		goto have_page;
 	} else
 		VM_OBJECT_WUNLOCK(vm_obj);
 
 	/* Now bind it into the GTT if needed */
 	if (!obj->map_and_fenceable) {
 		ret = i915_gem_object_unbind(obj);
 		if (ret != 0) {
 			cause = 20;
 			goto unlock;
 		}
 	}
 	if (!obj->gtt_space) {
 		ret = i915_gem_object_bind_to_gtt(obj, 0, true);
 		if (ret != 0) {
 			cause = 30;
 			goto unlock;
 		}
 
 		ret = i915_gem_object_set_to_gtt_domain(obj, write);
 		if (ret != 0) {
 			cause = 40;
 			goto unlock;
 		}
 	}
 
 	/* Untiled objects give up their fence; tiled ones acquire one. */
 	if (obj->tiling_mode == I915_TILING_NONE)
 		ret = i915_gem_object_put_fence(obj);
 	else
 		ret = i915_gem_object_get_fence(obj, NULL);
 	if (ret != 0) {
 		cause = 50;
 		goto unlock;
 	}
 
 	if (i915_gem_object_is_inactive(obj))
 		list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list);
 
 	/* From here on i915_gem_release_mmap() will scan this object. */
 	obj->fault_mappable = true;
 	VM_OBJECT_WLOCK(vm_obj);
 	m = vm_phys_fictitious_to_vm_page(dev->agp->base + obj->gtt_offset +
 	    offset);
 	if (m == NULL) {
 		VM_OBJECT_WUNLOCK(vm_obj);
 		cause = 60;
 		ret = -EFAULT;
 		goto unlock;
 	}
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("not fictitious %p", m));
 	KASSERT(m->wire_count == 1, ("wire_count not 1 %p", m));
 
 	if (vm_page_busied(m)) {
 		/* Drop both locks, wait for the page, start over. */
 		DRM_UNLOCK(dev);
 		vm_page_lock(m);
 		VM_OBJECT_WUNLOCK(vm_obj);
 		vm_page_busy_sleep(m, "915pbs");
 		goto retry;
 	}
 	if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
 		/* Insertion failed: drop both locks, VM_WAIT, start over. */
 		DRM_UNLOCK(dev);
 		VM_OBJECT_WUNLOCK(vm_obj);
 		VM_WAIT;
 		goto retry;
 	}
 	m->valid = VM_PAGE_BITS_ALL;
 have_page:
 	/* Reached with the object write lock and the DRM lock held. */
 	*mres = m;
 	vm_page_xbusy(m);
 
 	CTR4(KTR_DRM, "fault %p %jx %x phys %x", gem_obj, offset, prot,
 	    m->phys_addr);
 	DRM_UNLOCK(dev);
 	/* The placeholder removed at entry is freed only on success. */
 	if (oldm != NULL) {
 		vm_page_lock(oldm);
 		vm_page_free(oldm);
 		vm_page_unlock(oldm);
 	}
 	vm_object_pip_wakeup(vm_obj);
 	return (VM_PAGER_OK);
 
 unlock:
 	DRM_UNLOCK(dev);
 out:
 	KASSERT(ret != 0, ("i915_gem_pager_fault: wrong return"));
 	CTR5(KTR_DRM, "fault_fail %p %jx %x err %d %d", gem_obj, offset, prot,
 	    -ret, cause);
 	/* These three errors are retried instead of failing the fault. */
 	if (ret == -EAGAIN || ret == -EIO || ret == -EINTR) {
 		kern_yield(PRI_USER);
 		goto retry;
 	}
 	/* Re-take the object lock before returning the error. */
 	VM_OBJECT_WLOCK(vm_obj);
 	vm_object_pip_wakeup(vm_obj);
 	return (VM_PAGER_ERROR);
 }
 
 /*
  * Destructor callback for the GEM cdev pager (.cdev_pg_dtor in
  * i915_gem_pager_ops).  'handle' is the GEM object.  Under the DRM
  * lock, frees the object's mmap offset, releases its GTT mmap pages and
  * drops one reference on the object.
  */
 static void
 i915_gem_pager_dtor(void *handle)
 {
 	struct drm_gem_object *gem = handle;
 	struct drm_device *drm = gem->dev;
 
 	DRM_LOCK(drm);
 	drm_gem_free_mmap_offset(gem);
 	i915_gem_release_mmap(to_intel_bo(gem));
 	drm_gem_object_unreference(gem);
 	DRM_UNLOCK(drm);
 }
 
 /*
  * cdev pager operations for GEM objects: fault, constructor and
  * destructor callbacks defined above.
  */
 struct cdev_pager_ops i915_gem_pager_ops = {
 	.cdev_pg_fault	= i915_gem_pager_fault,
 	.cdev_pg_ctor	= i915_gem_pager_ctor,
 	.cdev_pg_dtor	= i915_gem_pager_dtor
 };
 
 /*
  * Produce the fake mmap offset through which the object named by
  * 'handle' can be mapped via the GTT, and store it in *offset.
  *
  * Returns 0 on success or a negative errno: -ENODEV without GEM
  * support, -ENOENT for an unknown handle, -E2BIG when the object is
  * larger than mm.gtt_mappable_end, -EINVAL when the object's madv is not
  * I915_MADV_WILLNEED, or the error from the lock acquisition or
  * drm_gem_create_mmap_offset().  *offset is written only on success.
  */
 int
 i915_gem_mmap_gtt(struct drm_file *file, struct drm_device *dev,
     uint32_t handle, uint64_t *offset)
 {
 	struct drm_i915_private *dev_priv;
 	struct drm_i915_gem_object *bo;
 	int error;
 
 	if ((dev->driver->driver_features & DRIVER_GEM) == 0)
 		return (-ENODEV);
 
 	dev_priv = dev->dev_private;
 
 	error = i915_mutex_lock_interruptible(dev);
 	if (error != 0)
 		return (error);
 
 	bo = to_intel_bo(drm_gem_object_lookup(dev, file, handle));
 	if (&bo->base == NULL) {
 		error = -ENOENT;
 		goto unlock;
 	}
 
 	if (bo->base.size > dev_priv->mm.gtt_mappable_end) {
 		error = -E2BIG;
 	} else if (bo->madv != I915_MADV_WILLNEED) {
 		DRM_ERROR("Attempting to mmap a purgeable buffer\n");
 		error = -EINVAL;
 	} else {
 		error = drm_gem_create_mmap_offset(&bo->base);
 		if (error == 0) {
 			*offset = DRM_GEM_MAPPING_OFF(bo->base.map_list.key) |
 			    DRM_GEM_MAPPING_KEY;
 		}
 	}
 
 	drm_gem_object_unreference(&bo->base);
 unlock:
 	DRM_UNLOCK(dev);
 	return (error);
 }
 
 /*
  * GEM mmap_gtt ioctl entry point.  Calls i915_gem_mmap_gtt() for
  * args->handle, which stores the resulting mmap offset in args->offset,
  * and returns its result.
  */
 int
 i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data,
     struct drm_file *file)
 {
 	struct drm_i915_gem_mmap_gtt *args;
 
 	args = data;
 
 	return (i915_gem_mmap_gtt(file, dev, args->handle, &args->offset));
 }
 
 /*
  * Allocate and initialise a GEM object of 'size' bytes.
  *
  * The new object starts in the CPU read and write domains, with no
  * fence register, madv set to I915_MADV_WILLNEED and all of its list
  * heads initialised.  Its cache level is I915_CACHE_LLC when
  * HAS_LLC(dev) is true and I915_CACHE_NONE otherwise.
  *
  * Returns the object, or NULL when drm_gem_object_init() fails.
  */
 struct drm_i915_gem_object *
 i915_gem_alloc_object(struct drm_device *dev, size_t size)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *bo;
 
 	bo = malloc(sizeof(*bo), DRM_I915_GEM, M_WAITOK | M_ZERO);
 	if (drm_gem_object_init(dev, &bo->base, size) != 0) {
 		free(bo, DRM_I915_GEM);
 		return (NULL);
 	}
 
 	/* Base GEM object state. */
 	bo->base.write_domain = I915_GEM_DOMAIN_CPU;
 	bo->base.read_domains = I915_GEM_DOMAIN_CPU;
 	bo->base.driver_private = NULL;
 
 	/* i915-specific state. */
 	bo->cache_level = HAS_LLC(dev) ? I915_CACHE_LLC : I915_CACHE_NONE;
 	bo->fence_reg = I915_FENCE_REG_NONE;
 	bo->madv = I915_MADV_WILLNEED;
 	/* Starting out "true" spares the first bind a pointless unbind. */
 	bo->map_and_fenceable = true;
 
 	INIT_LIST_HEAD(&bo->mm_list);
 	INIT_LIST_HEAD(&bo->gtt_list);
 	INIT_LIST_HEAD(&bo->ring_list);
 	INIT_LIST_HEAD(&bo->exec_list);
 	INIT_LIST_HEAD(&bo->gpu_write_list);
 
 	i915_gem_info_add_obj(dev_priv, size);
 
 	return (bo);
 }
 
 /*
  * Flush the CPU cache lines covering the object's pages.
  *
  * Nothing is done when the object has no page array, or when its cache
  * level is anything other than I915_CACHE_NONE.
  */
 void
 i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 {
 
 	/*
 	 * No page list means the object is not pinned to the GPU; the
 	 * flush will happen again at bind time, so it can be skipped now.
 	 *
 	 * With a snooped cache level the CPU cache does not have to be
 	 * cleared by hand: the caches are snooped whenever the render
 	 * cache is flushed/invalidated, and that is always done when
 	 * moving into and out of the RENDER domain, so the domain
 	 * tracking already gives correct snooping behaviour.
 	 */
 	if (obj->pages == NULL || obj->cache_level != I915_CACHE_NONE)
 		return;
 
 	CTR1(KTR_DRM, "object_clflush %p", obj);
 	drm_clflush_pages(obj->pages, obj->base.size / PAGE_SIZE);
 }
 
 /*
  * If the object is in the CPU write domain, flush the CPU caches and the
  * chipset and clear the write domain.  Otherwise do nothing.
  */
 static void
 i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj)
 {
 	uint32_t prev_wd;
 
 	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
 		i915_gem_clflush_object(obj);
 		intel_gtt_chipset_flush();
 
 		prev_wd = obj->base.write_domain;
 		obj->base.write_domain = 0;
 
 		CTR3(KTR_DRM, "object_change_domain flush_cpu_write %p %x %x",
 		    obj, obj->base.read_domains, prev_wd);
 	}
 }
 
 static int
 i915_gem_object_flush_gpu_write_domain(struct drm_i915_gem_object *obj)
 {
 
 	if ((obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0)
 		return (0);
 	return (i915_gem_flush_ring(obj->ring, 0, obj->base.write_domain));
 }
 
 /*
  * If the object is in the GTT write domain, issue a write barrier and
  * clear the write domain.  Otherwise do nothing.
  */
 static void
 i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj)
 {
 	uint32_t prev_wd;
 
 	if (obj->base.write_domain == I915_GEM_DOMAIN_GTT) {
 		wmb();
 
 		prev_wd = obj->base.write_domain;
 		obj->base.write_domain = 0;
 
 		CTR3(KTR_DRM, "object_change_domain flush gtt_write %p %x %x",
 		    obj, obj->base.read_domains, prev_wd);
 	}
 }
 
 /*
  * Move the object into the GTT read domain and, when 'write' is true,
  * into the GTT write domain as well.
  *
  * Returns -EINVAL if the object has no GTT space, 0 immediately if it is
  * already in the GTT write domain, the error from flushing the GPU write
  * domain or waiting for rendering, and 0 on success.
  */
 int
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	uint32_t old_write_domain, old_read_domains;
 	int ret;
 
 	if (obj->gtt_space == NULL)
 		return (-EINVAL);
 
 	if (obj->base.write_domain == I915_GEM_DOMAIN_GTT)
 		return 0;
 
 	ret = i915_gem_object_flush_gpu_write_domain(obj);
 	if (ret != 0)
 		return (ret);
 
 	/* Wait only for a pending GPU write or when we are about to write. */
 	if (obj->pending_gpu_write || write) {
 		ret = i915_gem_object_wait_rendering(obj);
 		if (ret != 0)
 			return (ret);
 	}
 
 	i915_gem_object_flush_cpu_write_domain(obj);
 
 	/* Saved only for the trace record below. */
 	old_write_domain = obj->base.write_domain;
 	old_read_domains = obj->base.read_domains;
 
 	KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) == 0,
 	    ("In GTT write domain"));
 	obj->base.read_domains |= I915_GEM_DOMAIN_GTT;
 	if (write) {
 		/* A writer makes GTT the only read domain and dirties obj. */
 		obj->base.read_domains = I915_GEM_DOMAIN_GTT;
 		obj->base.write_domain = I915_GEM_DOMAIN_GTT;
 		obj->dirty = 1;
 	}
 
 	CTR3(KTR_DRM, "object_change_domain set_to_gtt %p %x %x", obj,
 	    old_read_domains, old_write_domain);
 	return (0);
 }
 
 /*
  * Change the cache level of the object to 'cache_level'.
  *
  * Returns 0 if the level is already set or on success, -EBUSY if the
  * object is pinned, or the error from finishing GPU access or from
  * releasing the fence.  When the object is bound into the GTT it is
  * rebound with the new level (and its aliasing PPGTT mapping, if any,
  * is rebound too).
  */
 int
 i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
     enum i915_cache_level cache_level)
 {
 	struct drm_device *dev;
 	drm_i915_private_t *dev_priv;
 	int ret;
 
 	if (obj->cache_level == cache_level)
 		return 0;
 
 	if (obj->pin_count) {
 		DRM_DEBUG("can not change the cache level of pinned objects\n");
 		return (-EBUSY);
 	}
 
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 	/* Only a bound object needs its GTT entries rewritten. */
 	if (obj->gtt_space) {
 		ret = i915_gem_object_finish_gpu(obj);
 		if (ret != 0)
 			return (ret);
 
 		i915_gem_object_finish_gtt(obj);
 
 		/* Before SandyBridge, you could not use tiling or fence
 		 * registers with snooped memory, so relinquish any fences
 		 * currently pointing to our region in the aperture.
 		 */
 		if (INTEL_INFO(obj->base.dev)->gen < 6) {
 			ret = i915_gem_object_put_fence(obj);
 			if (ret != 0)
 				return (ret);
 		}
 
 		i915_gem_gtt_rebind_object(obj, cache_level);
 		if (obj->has_aliasing_ppgtt_mapping)
 			i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt,
 			    obj, cache_level);
 	}
 
 	if (cache_level == I915_CACHE_NONE) {
 		u32 old_read_domains, old_write_domain;
 
 		/* If we're coming from LLC cached, then we haven't
 		 * actually been tracking whether the data is in the
 		 * CPU cache or not, since we only allow one bit set
 		 * in obj->write_domain and have been skipping the clflushes.
 		 * Just set it to the CPU cache for now.
 		 */
 		KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) == 0,
 		    ("obj %p in CPU write domain", obj));
 		KASSERT((obj->base.read_domains & ~I915_GEM_DOMAIN_CPU) == 0,
 		    ("obj %p in CPU read domain", obj));
 
 		/* Saved only for the trace record below. */
 		old_read_domains = obj->base.read_domains;
 		old_write_domain = obj->base.write_domain;
 
 		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 
 		CTR3(KTR_DRM, "object_change_domain set_cache_level %p %x %x",
 		    obj, old_read_domains, old_write_domain);
 	}
 
 	obj->cache_level = cache_level;
 	return (0);
 }
 
 /*
  * Prepare the object for use as a display plane: flush GPU writes,
  * switch it to I915_CACHE_NONE, pin it (map_and_fenceable) with the
  * given alignment, flush the CPU write domain and add GTT to its read
  * domains.
  *
  * Returns 0 on success or the first error from the steps above.  From
  * the wait for rendering only -ERESTART and -EINTR are propagated; any
  * other result of that wait is overwritten by the following call.
  */
 int
 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
     u32 alignment, struct intel_ring_buffer *pipelined)
 {
 	u32 old_read_domains, old_write_domain;
 	int ret;
 
 	ret = i915_gem_object_flush_gpu_write_domain(obj);
 	if (ret != 0)
 		return (ret);
 
 	/* No wait when 'pipelined' is the ring the object is already on. */
 	if (pipelined != obj->ring) {
 		ret = i915_gem_object_wait_rendering(obj);
 		if (ret == -ERESTART || ret == -EINTR)
 			return (ret);
 	}
 
 	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
 	if (ret != 0)
 		return (ret);
 
 	ret = i915_gem_object_pin(obj, alignment, true);
 	if (ret != 0)
 		return (ret);
 
 	i915_gem_object_flush_cpu_write_domain(obj);
 
 	/* NOTE(review): old_write_domain is stored but never read. */
 	old_write_domain = obj->base.write_domain;
 	old_read_domains = obj->base.read_domains;
 
 	KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) == 0,
 	    ("obj %p in GTT write domain", obj));
 	obj->base.read_domains |= I915_GEM_DOMAIN_GTT;
 
 	CTR3(KTR_DRM, "object_change_domain pin_to_display_plan %p %x %x",
 	    obj, old_read_domains, obj->base.write_domain);
 	return (0);
 }
 
 int
 i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj)
 {
 	int ret;
 
 	if ((obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0)
 		return (0);
 
 	if (obj->base.write_domain & I915_GEM_GPU_DOMAINS) {
 		ret = i915_gem_flush_ring(obj->ring, 0, obj->base.write_domain);
 		if (ret != 0)
 			return (ret);
 	}
 
 	ret = i915_gem_object_wait_rendering(obj);
 	if (ret != 0)
 		return (ret);
 
 	obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS;
 
 	return (0);
 }
 
 /*
  * Move the object into the CPU read domain and, when 'write' is true,
  * into the CPU write domain as well.
  *
  * Returns 0 immediately if the object is already in the CPU write
  * domain, the error from flushing the GPU write domain or waiting for
  * rendering, and 0 on success.
  */
 static int
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	uint32_t old_write_domain, old_read_domains;
 	int ret;
 
 	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
 		return 0;
 
 	ret = i915_gem_object_flush_gpu_write_domain(obj);
 	if (ret != 0)
 		return (ret);
 
 	ret = i915_gem_object_wait_rendering(obj);
 	if (ret != 0)
 		return (ret);
 
 	i915_gem_object_flush_gtt_write_domain(obj);
 	/* Discards any per-page CPU validity map kept for partial reads. */
 	i915_gem_object_set_to_full_cpu_read_domain(obj);
 
 	/* Saved only for the trace record below. */
 	old_write_domain = obj->base.write_domain;
 	old_read_domains = obj->base.read_domains;
 
 	/* Entering the CPU read domain: flush the CPU cache lines first. */
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) {
 		i915_gem_clflush_object(obj);
 		obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
 	}
 
 	KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) == 0,
 	    ("In cpu write domain"));
 
 	if (write) {
 		/* A writer makes CPU the only read domain. */
 		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	}
 
 	CTR3(KTR_DRM, "object_change_domain set_to_cpu %p %x %x", obj,
 	    old_read_domains, old_write_domain);
 	return (0);
 }
 
 /*
  * Drop the per-page CPU validity map of the object, if it has one.
  *
  * When the object is in the CPU read domain, every page whose map entry
  * is still zero is cache-flushed first.  The map is then freed and the
  * pointer cleared.
  */
 static void
 i915_gem_object_set_to_full_cpu_read_domain(struct drm_i915_gem_object *obj)
 {
 	int pg;
 
 	if (obj->page_cpu_valid == NULL)
 		return;
 
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) != 0) {
 		for (pg = 0; pg <= (obj->base.size - 1) / PAGE_SIZE; pg++) {
 			if (obj->page_cpu_valid[pg] == 0)
 				drm_clflush_pages(&obj->pages[pg], 1);
 		}
 	}
 
 	free(obj->page_cpu_valid, DRM_I915_GEM);
 	obj->page_cpu_valid = NULL;
 }
 
 /*
  * Make the byte range [offset, offset + size) of the object readable by
  * the CPU, flushing only the pages in that range that are not already
  * marked valid in obj->page_cpu_valid.
  *
  * A request covering the whole object is delegated to
  * i915_gem_object_set_to_cpu_domain().  Returns 0 on success or the
  * error from flushing the GPU write domain or waiting for rendering.
  */
 static int
 i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
     uint64_t offset, uint64_t size)
 {
 	uint32_t old_read_domains;
 	int i, ret;
 
 	if (offset == 0 && size == obj->base.size)
 		return (i915_gem_object_set_to_cpu_domain(obj, 0));
 
 	ret = i915_gem_object_flush_gpu_write_domain(obj);
 	if (ret != 0)
 		return (ret);
 	ret = i915_gem_object_wait_rendering(obj);
 	if (ret != 0)
 		return (ret);
 
 	i915_gem_object_flush_gtt_write_domain(obj);
 
 	/* No map and already in the CPU read domain: nothing to do. */
 	if (obj->page_cpu_valid == NULL &&
 	    (obj->base.read_domains & I915_GEM_DOMAIN_CPU) != 0)
 		return (0);
 
 	/*
 	 * Create the validity map (zeroed, indexed by page below), or
 	 * reset an existing one when the object is not in the CPU read
 	 * domain.
 	 */
 	if (obj->page_cpu_valid == NULL) {
 		obj->page_cpu_valid = malloc(obj->base.size / PAGE_SIZE,
 		    DRM_I915_GEM, M_WAITOK | M_ZERO);
 	} else if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
 		memset(obj->page_cpu_valid, 0, obj->base.size / PAGE_SIZE);
 
 	/* Flush and mark every page touched by the requested range. */
 	for (i = offset / PAGE_SIZE; i <= (offset + size - 1) / PAGE_SIZE;
 	     i++) {
 		if (obj->page_cpu_valid[i])
 			continue;
 		drm_clflush_pages(obj->pages + i, 1);
 		obj->page_cpu_valid[i] = 1;
 	}
 
 	KASSERT((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) == 0,
 	    ("In gpu write domain"));
 
 	/* Saved only for the trace record below. */
 	old_read_domains = obj->base.read_domains;
 	obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
 
 	CTR3(KTR_DRM, "object_change_domain set_cpu_read %p %x %x", obj,
 	    old_read_domains, obj->base.write_domain);
 	return (0);
 }
 
 /*
  * Return the GTT size needed for an object of 'size' bytes with the
  * given tiling mode.
  *
  * On gen >= 4 hardware, or for untiled objects, this is 'size' itself.
  * Otherwise it is the smallest power-of-two multiple of the minimum
  * fence region (1MB on gen 3, 512KB on other chips) that is not smaller
  * than 'size'.
  */
 static uint32_t
 i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode)
 {
 	uint32_t fence_sz;
 
 	if (INTEL_INFO(dev)->gen >= 4 ||
 	    tiling_mode == I915_TILING_NONE)
 		return (size);
 
 	/* Older chips need a power-of-two fence region when tiling. */
 	fence_sz = (INTEL_INFO(dev)->gen == 3) ? 1024*1024 : 512*1024;
 	for (; fence_sz < size; fence_sz <<= 1)
 		continue;
 
 	return (fence_sz);
 }
 
 /**
  * i915_gem_get_gtt_alignment - return required GTT alignment for an object
  * @dev: drm device, used to determine the hardware generation
  * @size: object size in bytes
  * @tiling_mode: tiling mode of the object
  *
  * Return the required GTT alignment for an object, taking into account
  * potential fence register mapping.
  */
 static uint32_t
 i915_gem_get_gtt_alignment(struct drm_device *dev, uint32_t size,
      int tiling_mode)
 {
 
 	/*
 	 * Tiled objects on pre-gen4 chips must be aligned to the size of
 	 * the smallest fence register able to contain them.
 	 */
 	if (INTEL_INFO(dev)->gen < 4 &&
 	    tiling_mode != I915_TILING_NONE)
 		return (i915_gem_get_gtt_size(dev, size, tiling_mode));
 
 	/* Everything else only needs the 4k GTT page alignment. */
 	return (4096);
 }
 
 /*
  * Return the GTT alignment required for an object that will not be
  * accessed through a fence register.
  *
  * Untiled objects, and all objects on gen >= 4 or G33 hardware, need only
  * the 4k GTT page alignment.  Tiled objects on older hardware must be
  * aligned to a power-of-two tile height; the power-of-two fence size
  * computed by i915_gem_get_gtt_size() is reused for that.
  */
 uint32_t
 i915_gem_get_unfenced_gtt_alignment(struct drm_device *dev, uint32_t size,
     int tiling_mode)
 {
 
 	if (tiling_mode == I915_TILING_NONE ||
 	    INTEL_INFO(dev)->gen >= 4 || IS_G33(dev))
 		return (4096);
 
 	return (i915_gem_get_gtt_size(dev, size, tiling_mode));
 }
 
 /*
  * Find space for the object in the GTT, wire its backing pages and bind
  * them there.
  *
  * alignment: requested GTT alignment; 0 selects the fence alignment when
  *     map_and_fenceable is set and the unfenced alignment otherwise.
  * map_and_fenceable: when true the space is searched only below
  *     mm.gtt_mappable_end and is sized for a fence region.
  *
  * Returns 0 on success, -EINVAL for a purgeable object or an alignment
  * incompatible with the fence alignment, -E2BIG when the object exceeds
  * the relevant aperture size, or the error from eviction, page wiring or
  * GTT binding.  On success obj->gtt_space, obj->gtt_offset and
  * obj->map_and_fenceable are updated and the object is placed on the
  * gtt and inactive lists.
  */
 static int
 i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
     unsigned alignment, bool map_and_fenceable)
 {
 	struct drm_device *dev;
 	struct drm_i915_private *dev_priv;
 	struct drm_mm_node *free_space;
 	uint32_t size, fence_size, fence_alignment, unfenced_alignment;
 	bool mappable, fenceable;
 	int ret;
 
 	dev = obj->base.dev;
 	dev_priv = dev->dev_private;
 
 	if (obj->madv != I915_MADV_WILLNEED) {
 		DRM_ERROR("Attempting to bind a purgeable object\n");
 		return (-EINVAL);
 	}
 
 	fence_size = i915_gem_get_gtt_size(dev, obj->base.size,
 	    obj->tiling_mode);
 	fence_alignment = i915_gem_get_gtt_alignment(dev, obj->base.size,
 	    obj->tiling_mode);
 	unfenced_alignment = i915_gem_get_unfenced_gtt_alignment(dev,
 	    obj->base.size, obj->tiling_mode);
 	if (alignment == 0)
 		alignment = map_and_fenceable ? fence_alignment :
 		    unfenced_alignment;
 	if (map_and_fenceable && (alignment & (fence_alignment - 1)) != 0) {
 		DRM_ERROR("Invalid object alignment requested %u\n", alignment);
 		return (-EINVAL);
 	}
 
 	size = map_and_fenceable ? fence_size : obj->base.size;
 
 	/* If the object is bigger than the entire aperture, reject it early
 	 * before evicting everything in a vain attempt to find space.
 	 */
 	if (obj->base.size > (map_and_fenceable ?
 	    dev_priv->mm.gtt_mappable_end : dev_priv->mm.gtt_total)) {
 		DRM_ERROR(
 "Attempting to bind an object larger than the aperture\n");
 		return (-E2BIG);
 	}
 
 	/* Re-entered after each successful eviction or failed bind. */
  search_free:
 	if (map_and_fenceable)
 		free_space = drm_mm_search_free_in_range(
 		    &dev_priv->mm.gtt_space, size, alignment, 0,
 		    dev_priv->mm.gtt_mappable_end, 0);
 	else
 		free_space = drm_mm_search_free(&dev_priv->mm.gtt_space,
 		    size, alignment, 0);
 	if (free_space != NULL) {
 		if (map_and_fenceable)
 			obj->gtt_space = drm_mm_get_block_range_generic(
 			    free_space, size, alignment, 0,
 			    dev_priv->mm.gtt_mappable_end, 1);
 		else
 			obj->gtt_space = drm_mm_get_block_generic(free_space,
 			    size, alignment, 1);
 	}
 	/* No block obtained: evict something and search again. */
 	if (obj->gtt_space == NULL) {
 		ret = i915_gem_evict_something(dev, size, alignment,
 		    map_and_fenceable);
 		if (ret != 0)
 			return (ret);
 		goto search_free;
 	}
 	ret = i915_gem_object_get_pages_gtt(obj, 0);
 	if (ret != 0) {
 		drm_mm_put_block(obj->gtt_space);
 		obj->gtt_space = NULL;
 		/*
 		 * i915_gem_object_get_pages_gtt() cannot return
 		 * ENOMEM, since we use vm_page_grab().
 		 */
 		return (ret);
 	}
 
 	ret = i915_gem_gtt_bind_object(obj);
 	if (ret != 0) {
 		/* Undo the page wiring and the block, evict all, retry. */
 		i915_gem_object_put_pages_gtt(obj);
 		drm_mm_put_block(obj->gtt_space);
 		obj->gtt_space = NULL;
 		if (i915_gem_evict_everything(dev, false))
 			return (ret);
 		goto search_free;
 	}
 
 	list_add_tail(&obj->gtt_list, &dev_priv->mm.gtt_list);
 	list_add_tail(&obj->mm_list, &dev_priv->mm.inactive_list);
 
 	KASSERT((obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0,
 	    ("Object in gpu read domain"));
 	KASSERT((obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0,
 	    ("Object in gpu write domain"));
 
 	obj->gtt_offset = obj->gtt_space->start;
 
 	/* Fenceable: block has the fence size and fence alignment. */
 	fenceable =
 		obj->gtt_space->size == fence_size &&
 		(obj->gtt_space->start & (fence_alignment - 1)) == 0;
 
 	/* Mappable: the whole object lies below gtt_mappable_end. */
 	mappable =
 		obj->gtt_offset + obj->base.size <= dev_priv->mm.gtt_mappable_end;
 	obj->map_and_fenceable = mappable && fenceable;
 
 	CTR4(KTR_DRM, "object_bind %p %x %x %d", obj, obj->gtt_offset,
 	    obj->base.size, map_and_fenceable);
 	return (0);
 }
 
 /*
  * Finish all access to the object through the GTT: issue a full memory
  * barrier, tear down its user GTT mappings and, if it was in the GTT
  * read domain, clear the GTT bit from both its read and write domains.
  */
 static void
 i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj)
 {
 	u32 prev_rd, prev_wd;
 
 	/* Barrier for every access made through the GTT. */
 	mb();
 
 	/* The next user access must fault so the domain gets tracked. */
 	i915_gem_release_mmap(obj);
 
 	if ((obj->base.read_domains & I915_GEM_DOMAIN_GTT) != 0) {
 		prev_rd = obj->base.read_domains;
 		prev_wd = obj->base.write_domain;
 
 		obj->base.read_domains &= ~I915_GEM_DOMAIN_GTT;
 		obj->base.write_domain &= ~I915_GEM_DOMAIN_GTT;
 
 		CTR3(KTR_DRM, "object_change_domain finish gtt %p %x %x",
 		    obj, prev_rd, prev_wd);
 	}
 }
 
 /*
  * Remove the object from the GTT and release its wired pages.
  *
  * Returns 0 if the object is not bound, -EINVAL if it is pinned, and
  * -ERESTART/-EINTR when one of the waits was interrupted (in which case
  * the object stays bound).  Otherwise the object is unbound and the
  * value returned is the result of i915_gem_object_put_fence().
  */
 int
 i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv;
 	int ret;
 
 	dev_priv = obj->base.dev->dev_private;
 	ret = 0;
 	if (obj->gtt_space == NULL)
 		return (0);
 	if (obj->pin_count != 0) {
 		DRM_ERROR("Attempting to unbind pinned buffer\n");
 		return (-EINVAL);
 	}
 
 	ret = i915_gem_object_finish_gpu(obj);
 	if (ret == -ERESTART || ret == -EINTR)
 		return (ret);
 
 	i915_gem_object_finish_gtt(obj);
 
 	if (ret == 0)
 		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
 	if (ret == -ERESTART || ret == -EINTR)
 		return (ret);
 	/*
 	 * Any other failure is not propagated: flush the CPU caches and
 	 * force the object into the CPU domains, then carry on unbinding.
 	 */
 	if (ret != 0) {
 		i915_gem_clflush_object(obj);
 		obj->base.read_domains = obj->base.write_domain =
 		    I915_GEM_DOMAIN_CPU;
 	}
 
 	ret = i915_gem_object_put_fence(obj);
 	if (ret == -ERESTART)
 		return (ret);
 
 	i915_gem_gtt_unbind_object(obj);
 	if (obj->has_aliasing_ppgtt_mapping) {
 		i915_ppgtt_unbind_object(dev_priv->mm.aliasing_ppgtt, obj);
 		obj->has_aliasing_ppgtt_mapping = 0;
 	}
 	i915_gem_object_put_pages_gtt(obj);
 
 	list_del_init(&obj->gtt_list);
 	list_del_init(&obj->mm_list);
 	/* Reset to the unbound default so the next bind skips an unbind. */
 	obj->map_and_fenceable = true;
 
 	drm_mm_put_block(obj->gtt_space);
 	obj->gtt_space = NULL;
 	obj->gtt_offset = 0;
 
 	/* Objects marked I915_MADV_DONTNEED lose their backing pages now. */
 	if (i915_gem_object_is_purgeable(obj))
 		i915_gem_object_truncate(obj);
 	CTR1(KTR_DRM, "object_unbind %p", obj);
 
 	return (ret);
 }
 
 /*
  * Allocate obj->pages and fill it with one wired page per PAGE_SIZE of
  * the object, obtained from i915_gem_wire_page() under the VM object
  * write lock.
  *
  * Returns 0 on success.  If wiring any page fails, the pages wired so
  * far are unwired, the array is freed, obj->pages is reset to NULL and
  * -EIO is returned.  'flags' is not used.
  */
 static int
 i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj,
     int flags)
 {
 	struct drm_device *dev;
 	vm_object_t vm_obj;
 	vm_page_t m;
 	int page_count, i, j;
 
 	dev = obj->base.dev;
 	KASSERT(obj->pages == NULL, ("Obj already has pages"));
 	page_count = obj->base.size / PAGE_SIZE;
 	obj->pages = malloc(page_count * sizeof(vm_page_t), DRM_I915_GEM,
 	    M_WAITOK);
 	vm_obj = obj->base.vm_obj;
 	VM_OBJECT_WLOCK(vm_obj);
 	for (i = 0; i < page_count; i++) {
 		if ((obj->pages[i] = i915_gem_wire_page(vm_obj, i)) == NULL)
 			goto failed;
 	}
 	VM_OBJECT_WUNLOCK(vm_obj);
 	if (i915_gem_object_needs_bit17_swizzle(obj))
 		i915_gem_object_do_bit_17_swizzle(obj);
 	return (0);
 
 failed:
 	/* Undo pages [0, i); the VM object is still write-locked here. */
 	for (j = 0; j < i; j++) {
 		m = obj->pages[j];
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(vm_obj);
 	free(obj->pages, DRM_I915_GEM);
 	obj->pages = NULL;
 	return (-EIO);
 }
 
 /*
  * Set GEM_PARANOID_CHECK_GTT to 1 to compile in a debugging check that
  * panics if any page about to be released is still referenced by a GTT
  * pte.
  */
 #define	GEM_PARANOID_CHECK_GTT 0
 #if GEM_PARANOID_CHECK_GTT
 /*
  * Scan every GTT pte index between mm.gtt_start and mm.gtt_end and panic
  * if its physical address matches one of the page_count pages in ma.
  */
 static void
 i915_gem_assert_pages_not_mapped(struct drm_device *dev, vm_page_t *ma,
     int page_count)
 {
 	struct drm_i915_private *dev_priv;
 	vm_paddr_t pa;
 	unsigned long start, end;
 	u_int i;
 	int j;
 
 	dev_priv = dev->dev_private;
 	start = OFF_TO_IDX(dev_priv->mm.gtt_start);
 	end = OFF_TO_IDX(dev_priv->mm.gtt_end);
 	for (i = start; i < end; i++) {
 		pa = intel_gtt_read_pte_paddr(i);
 		for (j = 0; j < page_count; j++) {
 			if (pa == VM_PAGE_TO_PHYS(ma[j])) {
 				/*
 				 * Report the matching page ma[j]; i is a
 				 * GTT pte index, not an index into ma.
 				 */
 				panic("Page %p in GTT pte index %d pte %x",
 				    ma[j], i, intel_gtt_read_pte(i));
 			}
 		}
 	}
 }
 #endif
 
 /*
  * Release the wired backing pages of the object and free obj->pages.
  *
  * Under the VM object write lock each page is marked dirty if the object
  * is dirty, referenced if the object's madv is I915_MADV_WILLNEED, and
  * then unwired.  obj->dirty is cleared and obj->pages reset to NULL.
  */
 static void
 i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
 {
 	vm_page_t m;
 	int page_count, i;
 
 	KASSERT(obj->madv != I915_MADV_PURGED_INTERNAL, ("Purged object"));
 
 	if (obj->tiling_mode != I915_TILING_NONE)
 		i915_gem_object_save_bit_17_swizzle(obj);
 	/* Contents of a DONTNEED object are not worth writing back. */
 	if (obj->madv == I915_MADV_DONTNEED)
 		obj->dirty = 0;
 	page_count = obj->base.size / PAGE_SIZE;
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 #if GEM_PARANOID_CHECK_GTT
 	i915_gem_assert_pages_not_mapped(obj->base.dev, obj->pages, page_count);
 #endif
 	for (i = 0; i < page_count; i++) {
 		m = obj->pages[i];
 		if (obj->dirty)
 			vm_page_dirty(m);
 		if (obj->madv == I915_MADV_WILLNEED)
 			vm_page_reference(m);
 		vm_page_lock(m);
-		vm_page_unwire(obj->pages[i], 1);
+		vm_page_unwire(obj->pages[i], PQ_ACTIVE);
 		vm_page_unlock(m);
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 	obj->dirty = 0;
 	free(obj->pages, DRM_I915_GEM);
 	obj->pages = NULL;
 }
 
 /*
  * Tear down the user GTT mappings of the object.
  *
  * Does nothing unless obj->fault_mappable is set.  Otherwise every page
  * present in the object's cdev pager VM object is freed; whenever a busy
  * page had to be slept on, the scan restarts from the first page.
  * obj->fault_mappable is cleared at the end.
  */
 void
 i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 {
 	vm_object_t pager_obj;
 	vm_page_t pg;
 	int idx, npages;
 
 	if (!obj->fault_mappable)
 		return;
 
 	CTR3(KTR_DRM, "release_mmap %p %x %x", obj, obj->gtt_offset,
 	    OFF_TO_IDX(obj->base.size));
 	pager_obj = cdev_pager_lookup(obj);
 	if (pager_obj == NULL) {
 		obj->fault_mappable = false;
 		return;
 	}
 
 	npages = OFF_TO_IDX(obj->base.size);
 
 	VM_OBJECT_WLOCK(pager_obj);
 	idx = 0;
 	while (idx < npages) {
 		pg = vm_page_lookup(pager_obj, idx);
 		if (pg == NULL) {
 			idx++;
 			continue;
 		}
 		if (vm_page_sleep_if_busy(pg, "915unm")) {
 			/* Slept on a busy page: rescan from the start. */
 			idx = 0;
 			continue;
 		}
 		cdev_pager_free_page(pager_obj, pg);
 		idx++;
 	}
 	VM_OBJECT_WUNLOCK(pager_obj);
 	vm_object_deallocate(pager_obj);
 
 	obj->fault_mappable = false;
 }
 
 int
 i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj)
 {
 	int ret;
 
 	KASSERT((obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0,
 	    ("In GPU write domain"));
 
 	CTR5(KTR_DRM, "object_wait_rendering %p %s %x %d %d", obj,
 	    obj->ring != NULL ? obj->ring->name : "none", obj->gtt_offset,
 	    obj->active, obj->last_rendering_seqno);
 	if (obj->active) {
 		ret = i915_wait_request(obj->ring, obj->last_rendering_seqno,
 		    true);
 		if (ret != 0)
 			return (ret);
 	}
 	return (0);
 }
 
 /*
  * Mark the object as in use by 'ring' up to request 'seqno'.
  *
  * A reference is taken the first time the object becomes active.  The
  * object is moved to the tail of the device and ring active lists, and
  * when it has fenced GPU access its fence bookkeeping is updated and its
  * fence register (if any) is moved to the tail of the fence LRU list.
  */
 void
 i915_gem_object_move_to_active(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *ring, uint32_t seqno)
 {
 	struct drm_i915_private *dev_priv;
 	struct drm_i915_fence_reg *fence;
 
 	dev_priv = obj->base.dev->dev_private;
 
 	obj->ring = ring;
 	KASSERT(ring != NULL, ("NULL ring"));
 
 	/* Newly active objects hold an extra reference. */
 	if (!obj->active) {
 		drm_gem_object_reference(&obj->base);
 		obj->active = 1;
 	}
 
 	/* Whatever list we were on, we now go to the tail of execution. */
 	list_move_tail(&obj->mm_list, &dev_priv->mm.active_list);
 	list_move_tail(&obj->ring_list, &ring->active_list);
 
 	obj->last_rendering_seqno = seqno;
 	if (!obj->fenced_gpu_access)
 		return;
 
 	obj->last_fenced_seqno = seqno;
 	obj->last_fenced_ring = ring;
 
 	/* Bump the fence to MRU to account for the delayed flush. */
 	if (obj->fence_reg == I915_FENCE_REG_NONE)
 		return;
 	fence = &dev_priv->fence_regs[obj->fence_reg];
 	list_move_tail(&fence->lru_list, &dev_priv->mm.fence_list);
 }
 
 /*
  * Unlink the object from its ring list and zero its last rendering and
  * last fenced sequence numbers.
  */
 static void
 i915_gem_object_move_off_active(struct drm_i915_gem_object *obj)
 {
 
 	list_del_init(&obj->ring_list);
 	obj->last_fenced_seqno = 0;
 	obj->last_rendering_seqno = 0;
 }
 
 /*
  * Move an active object to the tail of the device flushing list and take
  * it off its ring's active list.
  */
 static void
 i915_gem_object_move_to_flushing(struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *dev_priv;
 
 	dev_priv = obj->base.dev->dev_private;
 
 	KASSERT(obj->active, ("Object not active"));
 	list_move_tail(&obj->mm_list, &dev_priv->mm.flushing_list);
 
 	i915_gem_object_move_off_active(obj);
 }
 
 /*
  * Retire an active object: queue it on the pinned list when it is
  * pinned and on the inactive list otherwise, clear its ring and fence
  * bookkeeping, and drop one reference on the object.
  */
 static void
 i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
 	/* Pinned objects are tracked on their own list. */
 	list_move_tail(&obj->mm_list, obj->pin_count != 0 ?
 	    &dev_priv->mm.pinned_list : &dev_priv->mm.inactive_list);
 
 	KASSERT(list_empty(&obj->gpu_write_list), ("On gpu_write_list"));
 	KASSERT(obj->active, ("Object not active"));
 
 	i915_gem_object_move_off_active(obj);
 
 	obj->ring = NULL;
 	obj->last_fenced_ring = NULL;
 	obj->fenced_gpu_access = false;
 	obj->pending_gpu_write = false;
 	obj->active = 0;
 
 	/* Must come last: obj is not touched after the reference drop. */
 	drm_gem_object_unreference(&obj->base);
 
 #if 1
 	KIB_NOTYET();
 #else
 	WARN_ON(i915_verify_lists(dev));
 #endif
 }
 
 /*
  * Remove the pages of the object's backing VM object (under its write
  * lock) and mark the GEM object as I915_MADV_PURGED_INTERNAL.
  */
 static void
 i915_gem_object_truncate(struct drm_i915_gem_object *obj)
 {
 	vm_object_t backing = obj->base.vm_obj;
 
 	VM_OBJECT_WLOCK(backing);
 	vm_object_page_remove(backing, 0, 0, false);
 	VM_OBJECT_WUNLOCK(backing);
 
 	obj->madv = I915_MADV_PURGED_INTERNAL;
 }
 
 /* Return non-zero when the object's madvise state is I915_MADV_DONTNEED. */
 static inline int
 i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj)
 {
 	int purgeable;
 
 	purgeable = (obj->madv == I915_MADV_DONTNEED);
 	return (purgeable);
 }
 
 /*
  * Walk the ring's gpu_write_list after a flush of flush_domains.  Each
  * object whose write domain intersects the flushed domains has its write
  * domain cleared, is removed from the list, and is queued as active
  * under the ring's next request seqno.
  */
 static void
 i915_gem_process_flushing_list(struct intel_ring_buffer *ring,
     uint32_t flush_domains)
 {
 	struct drm_i915_gem_object *cur, *tmp;
 	uint32_t prev_write_domain;
 
 	list_for_each_entry_safe(cur, tmp, &ring->gpu_write_list,
 	    gpu_write_list) {
 		/* Objects not written in a flushed domain stay queued. */
 		if ((cur->base.write_domain & flush_domains) == 0)
 			continue;
 
 		prev_write_domain = cur->base.write_domain;
 		cur->base.write_domain = 0;
 		list_del_init(&cur->gpu_write_list);
 		i915_gem_object_move_to_active(cur, ring,
 		    i915_gem_next_request_seqno(ring));
 
 		CTR3(KTR_DRM, "object_change_domain process_flush %p %x %x",
 		    cur, cur->base.read_domains, prev_write_domain);
 	}
 }
 
 /*
  * Return non-zero when the device uses the 9/10/17 bit-6 swizzle for X
  * tiling and the object is tiled.
  */
 static int
 i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
 {
 	drm_i915_private_t *priv = obj->base.dev->dev_private;
 
 	if (priv->mm.bit_6_swizzle_x != I915_BIT_6_SWIZZLE_9_10_17)
 		return (0);
 	return (obj->tiling_mode != I915_TILING_NONE);
 }
 
 /*
  * Grab the page at pindex of the given VM object, make sure its contents
  * are valid, wire it and account for it in i915_gem_wired_pages_cnt.
  *
  * The caller must hold the object's write lock (asserted below).
  * Returns the wired page, or NULL when the pager read fails or the page
  * is no longer present in the object after the read.
  */
 static vm_page_t
 i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 	int rv;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		if (vm_pager_has_page(object, pindex, NULL, NULL)) {
 			/* Backing store has the data: read it in. */
 			rv = vm_pager_get_pages(object, &m, 1, 0);
 			/* Look the page up again after the pager call. */
 			m = vm_page_lookup(object, pindex);
 			if (m == NULL)
 				return (NULL);
 			if (rv != VM_PAGER_OK) {
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				return (NULL);
 			}
 		} else {
 			/* No backing copy: hand out a zeroed, clean page. */
 			pmap_zero_page(m);
 			m->valid = VM_PAGE_BITS_ALL;
 			m->dirty = 0;
 		}
 	}
 	vm_page_lock(m);
 	vm_page_wire(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 	atomic_add_long(&i915_gem_wired_pages_cnt, 1);
 	return (m);
 }
 
 /*
  * Issue a flush on the ring when either domain mask touches a GPU
  * domain, then process the ring's gpu_write_list for the flushed
  * domains.  Returns 0 on success (or when there is nothing to do) and
  * the ring flush hook's error otherwise.
  */
 int
 i915_gem_flush_ring(struct intel_ring_buffer *ring, uint32_t invalidate_domains,
     uint32_t flush_domains)
 {
 	uint32_t touched;
 	int error;
 
 	touched = invalidate_domains | flush_domains;
 	if ((touched & I915_GEM_GPU_DOMAINS) == 0)
 		return (0);
 
 	CTR3(KTR_DRM, "ring_flush %s %x %x", ring->name, invalidate_domains,
 	    flush_domains);
 	error = ring->flush(ring, invalidate_domains, flush_domains);
 	if (error != 0)
 		return (error);
 
 	if ((flush_domains & I915_GEM_GPU_DOMAINS) != 0)
 		i915_gem_process_flushing_list(ring, flush_domains);
 
 	return (0);
 }
 
 /*
  * Wait for a single ring to drain.  A ring with nothing on its
  * gpu_write_list or active_list is already idle; otherwise pending
  * writes are flushed first and the ring's next request seqno is waited
  * for.
  */
 static int
 i915_ring_idle(struct intel_ring_buffer *ring, bool do_retire)
 {
 	bool writes_pending;
 	int error;
 
 	writes_pending = !list_empty(&ring->gpu_write_list);
 	if (!writes_pending && list_empty(&ring->active_list))
 		return (0);
 
 	if (writes_pending) {
 		error = i915_gem_flush_ring(ring, I915_GEM_GPU_DOMAINS,
 		    I915_GEM_GPU_DOMAINS);
 		if (error != 0)
 			return (error);
 	}
 
 	error = i915_wait_request(ring, i915_gem_next_request_seqno(ring),
 	    do_retire);
 	return (error);
 }
 
 /*
  * Idle every ring of the device in index order, stopping at and
  * returning the first error.
  */
 int
 i915_gpu_idle(struct drm_device *dev, bool do_retire)
 {
 	drm_i915_private_t *priv;
 	int error, ring_idx;
 
 	priv = dev->dev_private;
 
 	/* Flush everything onto the inactive list. */
 	for (ring_idx = 0; ring_idx < I915_NUM_RINGS; ring_idx++) {
 		error = i915_ring_idle(&priv->rings[ring_idx], do_retire);
 		if (error != 0)
 			return (error);
 	}
 
 	return (0);
 }
 
 /*
  * Wait until the ring has passed the given (non-zero) sequence number.
  *
  * If seqno is the ring's outstanding lazy request, a real request is
  * emitted first and its seqno is waited for instead.  When do_retire is
  * true and the wait succeeds, completed requests on the ring are retired
  * before returning.
  *
  * Returns 0 on success or a negative errno: -EIO or -EAGAIN when the GPU
  * is wedged on entry (depending on whether error recovery has
  * completed), -EAGAIN when it becomes wedged during the wait, -EBUSY
  * when the polled wait times out, or the negated msleep() /
  * i915_add_request() error.
  */
 int
 i915_wait_request(struct intel_ring_buffer *ring, uint32_t seqno, bool do_retire)
 {
 	drm_i915_private_t *dev_priv;
 	struct drm_i915_gem_request *request;
 	uint32_t ier;
 	int flags, ret;
 	bool recovery_complete;
 
 	KASSERT(seqno != 0, ("Zero seqno"));
 
 	dev_priv = ring->dev->dev_private;
 	ret = 0;
 
 	if (atomic_load_acq_int(&dev_priv->mm.wedged) != 0) {
 		/* Give the error handler a chance to run. */
 		mtx_lock(&dev_priv->error_completion_lock);
 		/*
 		 * Test the completion counter itself.  The previous code
 		 * compared the field's address against zero, which is always
 		 * true and made the -EAGAIN result unreachable.
 		 */
 		recovery_complete = dev_priv->error_completion > 0;
 		mtx_unlock(&dev_priv->error_completion_lock);
 		return (recovery_complete ? -EIO : -EAGAIN);
 	}
 
 	if (seqno == ring->outstanding_lazy_request) {
 		/* The seqno was only reserved; emit a request for it now. */
 		request = malloc(sizeof(*request), DRM_I915_GEM,
 		    M_WAITOK | M_ZERO);
 		if (request == NULL)
 			return (-ENOMEM);
 
 		ret = i915_add_request(ring, NULL, request);
 		if (ret != 0) {
 			free(request, DRM_I915_GEM);
 			return (ret);
 		}
 
 		seqno = request->seqno;
 	}
 
 	if (!i915_seqno_passed(ring->get_seqno(ring), seqno)) {
 		if (HAS_PCH_SPLIT(ring->dev))
 			ier = I915_READ(DEIER) | I915_READ(GTIER);
 		else
 			ier = I915_READ(IER);
 		if (!ier) {
 			DRM_ERROR("something (likely vbetool) disabled "
 				  "interrupts, re-enabling\n");
 			ring->dev->driver->irq_preinstall(ring->dev);
 			ring->dev->driver->irq_postinstall(ring->dev);
 		}
 
 		CTR2(KTR_DRM, "request_wait_begin %s %d", ring->name, seqno);
 
 		ring->waiting_seqno = seqno;
 		mtx_lock(&ring->irq_lock);
 		if (ring->irq_get(ring)) {
 			/* Interrupt-driven wait: sleep on the ring. */
 			flags = dev_priv->mm.interruptible ? PCATCH : 0;
 			while (!i915_seqno_passed(ring->get_seqno(ring), seqno)
 			    && !atomic_load_acq_int(&dev_priv->mm.wedged) &&
 			    ret == 0) {
 				ret = -msleep(ring, &ring->irq_lock, flags,
 				    "915gwr", 0);
 			}
 			ring->irq_put(ring);
 			mtx_unlock(&ring->irq_lock);
 		} else {
 			/* No interrupt reference available: poll instead. */
 			mtx_unlock(&ring->irq_lock);
 			if (_intel_wait_for(ring->dev,
 			    i915_seqno_passed(ring->get_seqno(ring), seqno) ||
 			    atomic_load_acq_int(&dev_priv->mm.wedged), 3000,
 			    0, "i915wrq") != 0)
 				ret = -EBUSY;
 		}
 		ring->waiting_seqno = 0;
 
 		CTR3(KTR_DRM, "request_wait_end %s %d %d", ring->name, seqno,
 		    ret);
 	}
 	if (atomic_load_acq_int(&dev_priv->mm.wedged))
 		ret = -EAGAIN;
 
 	/* Directly dispatch request retiring.  While we have the work queue
 	 * to handle this, the waiter on a request often wants an associated
 	 * buffer to have made it to the inactive list, and we would need
 	 * a separate wait queue to handle that.
 	 */
 	if (ret == 0 && do_retire)
 		i915_gem_retire_requests_ring(ring);
 
 	return (ret);
 }
 
 /*
  * Hand out the device's next sequence number and advance the counter,
  * skipping 0 on wrap-around because 0 is reserved for "no seqno".
  */
 static u32
 i915_gem_get_seqno(struct drm_device *dev)
 {
 	drm_i915_private_t *priv = dev->dev_private;
 	u32 allocated;
 
 	allocated = priv->next_seqno;
 
 	/* reserve 0 for non-seqno */
 	priv->next_seqno++;
 	if (priv->next_seqno == 0)
 		priv->next_seqno = 1;
 
 	return (allocated);
 }
 
 /*
  * Return the seqno the ring's next request will carry, reserving a new
  * one as the ring's outstanding lazy request when none is reserved yet.
  */
 u32
 i915_gem_next_request_seqno(struct intel_ring_buffer *ring)
 {
 
 	if (ring->outstanding_lazy_request != 0)
 		return (ring->outstanding_lazy_request);
 
 	ring->outstanding_lazy_request = i915_gem_get_seqno(ring->dev);
 	return (ring->outstanding_lazy_request);
 }
 
 int
 i915_add_request(struct intel_ring_buffer *ring, struct drm_file *file,
      struct drm_i915_gem_request *request)
 {
 	drm_i915_private_t *dev_priv;
 	struct drm_i915_file_private *file_priv;
 	uint32_t seqno;
 	u32 request_ring_position;
 	int was_empty;
 	int ret;
 
 	KASSERT(request != NULL, ("NULL request in add"));
 	DRM_LOCK_ASSERT(ring->dev);
 	dev_priv = ring->dev->dev_private;
 
 	seqno = i915_gem_next_request_seqno(ring);
 	request_ring_position = intel_ring_get_tail(ring);
 
 	ret = ring->add_request(ring, &seqno);
 	if (ret != 0)
 	    return ret;
 
 	CTR2(KTR_DRM, "request_add %s %d", ring->name, seqno);
 
 	request->seqno = seqno;
 	request->ring = ring;
 	request->tail = request_ring_position;
 	request->emitted_jiffies = ticks;
 	was_empty = list_empty(&ring->request_list);
 	list_add_tail(&request->list, &ring->request_list);
 
 	if (file != NULL) {
 		file_priv = file->driver_priv;
 
 		mtx_lock(&file_priv->mm.lck);
 		request->file_priv = file_priv;
 		list_add_tail(&request->client_list,
 		    &file_priv->mm.request_list);
 		mtx_unlock(&file_priv->mm.lck);
 	}
 
 	ring->outstanding_lazy_request = 0;
 
 	if (!dev_priv->mm.suspended) {
 		if (i915_enable_hangcheck) {
 			callout_schedule(&dev_priv->hangcheck_timer,
 			    DRM_I915_HANGCHECK_PERIOD);
 		}
 		if (was_empty)
 			taskqueue_enqueue_timeout(dev_priv->tq,
 			    &dev_priv->mm.retire_task, hz);
 	}
 	return (0);
 }
 
 /*
  * Detach a request from the client that submitted it, if any.  The
  * client pointer is re-checked under the client's mm lock before the
  * request is unlinked.
  */
 static inline void
 i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_file_private *client;
 
 	client = request->file_priv;
 	if (client == NULL)
 		return;
 
 	DRM_LOCK_ASSERT(request->ring->dev);
 
 	mtx_lock(&client->mm.lck);
 	if (request->file_priv != NULL) {
 		list_del(&request->client_list);
 		request->file_priv = NULL;
 	}
 	mtx_unlock(&client->mm.lck);
 }
 
 /*
  * Unlink every request still queued on a closing client's request list
  * and clear its back-pointer to the client.  The requests themselves are
  * not freed here.
  */
 void
 i915_gem_release(struct drm_device *dev, struct drm_file *file)
 {
 	struct drm_i915_file_private *client = file->driver_priv;
 	struct drm_i915_gem_request *req;
 
 	/* Clean up our request list when the client is going away, so that
 	 * later retire_requests won't dereference our soon-to-be-gone
 	 * file_priv.
 	 */
 	mtx_lock(&client->mm.lck);
 	for (;;) {
 		if (list_empty(&client->mm.request_list))
 			break;
 		req = list_first_entry(&client->mm.request_list,
 		    struct drm_i915_gem_request, client_list);
 		list_del(&req->client_list);
 		req->file_priv = NULL;
 	}
 	mtx_unlock(&client->mm.lck);
 }
 
 /*
  * Throw away all bookkeeping for one ring: free every queued request
  * and move every object on the ring's active list to the inactive
  * state with its write domain cleared.
  */
 static void
 i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv,
     struct intel_ring_buffer *ring)
 {
 	struct drm_i915_gem_request *req;
 	struct drm_i915_gem_object *bo;
 
 	if (ring->dev != NULL)
 		DRM_LOCK_ASSERT(ring->dev);
 
 	/* Drop all outstanding requests. */
 	while (!list_empty(&ring->request_list)) {
 		req = list_first_entry(&ring->request_list,
 		    struct drm_i915_gem_request, list);
 
 		list_del(&req->list);
 		i915_gem_request_remove_from_client(req);
 		free(req, DRM_I915_GEM);
 	}
 
 	/* Retire everything that was still considered active. */
 	while (!list_empty(&ring->active_list)) {
 		bo = list_first_entry(&ring->active_list,
 		    struct drm_i915_gem_object, ring_list);
 
 		bo->base.write_domain = 0;
 		list_del_init(&bo->gpu_write_list);
 		i915_gem_object_move_to_inactive(bo);
 	}
 }
 
 /*
  * Release every fence register that currently has an owner: drop the
  * owner's mmap when it is tiled, clear the owner's fence state, and
  * clear the register itself.
  */
 static void
 i915_gem_reset_fences(struct drm_device *dev)
 {
 	struct drm_i915_private *priv = dev->dev_private;
 	struct drm_i915_fence_reg *reg;
 	struct drm_i915_gem_object *owner;
 	int idx;
 
 	for (idx = 0; idx < priv->num_fence_regs; idx++) {
 		reg = &priv->fence_regs[idx];
 		owner = reg->obj;
 
 		/* Unowned registers need no work. */
 		if (owner == NULL)
 			continue;
 
 		if (owner->tiling_mode)
 			i915_gem_release_mmap(owner);
 
 		owner->fence_reg = I915_FENCE_REG_NONE;
 		owner->fenced_gpu_access = false;
 		owner->last_fenced_seqno = 0;
 		owner->last_fenced_ring = NULL;
 		i915_gem_clear_fence_reg(dev, reg);
 	}
 }
 
 void
 i915_gem_reset(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj;
 	int i;
 
 	for (i = 0; i < I915_NUM_RINGS; i++)
 		i915_gem_reset_ring_lists(dev_priv, &dev_priv->rings[i]);
 
 	/* Remove anything from the flushing lists. The GPU cache is likely
 	 * to be lost on reset along with the data, so simply move the
 	 * lost bo to the inactive list.
 	 */
 	while (!list_empty(&dev_priv->mm.flushing_list)) {
 		obj = list_first_entry(&dev_priv->mm.flushing_list,
 				      struct drm_i915_gem_object,
 				      mm_list);
 
 		obj->base.write_domain = 0;
 		list_del_init(&obj->gpu_write_list);
 		i915_gem_object_move_to_inactive(obj);
 	}
 
 	/* Move everything out of the GPU domains to ensure we do any
 	 * necessary invalidation upon reuse.
 	 */
 	list_for_each_entry(obj, &dev_priv->mm.inactive_list, mm_list) {
 		obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS;
 	}
 
 	/* The fence registers are invalidated so clear them out */
 	i915_gem_reset_fences(dev);
 }
 
 /**
  * This function clears the request list as sequence numbers are passed.
  */
 void
 i915_gem_retire_requests_ring(struct intel_ring_buffer *ring)
 {
 	uint32_t seqno;
 	int i;
 
 	if (list_empty(&ring->request_list))
 		return;
 
 	seqno = ring->get_seqno(ring);
 	CTR2(KTR_DRM, "retire_request_ring %s %d", ring->name, seqno);
 
 	for (i = 0; i < DRM_ARRAY_SIZE(ring->sync_seqno); i++)
 		if (seqno >= ring->sync_seqno[i])
 			ring->sync_seqno[i] = 0;
 
 	while (!list_empty(&ring->request_list)) {
 		struct drm_i915_gem_request *request;
 
 		request = list_first_entry(&ring->request_list,
 					   struct drm_i915_gem_request,
 					   list);
 
 		if (!i915_seqno_passed(seqno, request->seqno))
 			break;
 
 		CTR2(KTR_DRM, "retire_request_seqno_passed %s %d",
 		    ring->name, seqno);
 		ring->last_retired_head = request->tail;
 
 		list_del(&request->list);
 		i915_gem_request_remove_from_client(request);
 		free(request, DRM_I915_GEM);
 	}
 
 	/* Move any buffers on the active list that are no longer referenced
 	 * by the ringbuffer to the flushing/inactive lists as appropriate.
 	 */
 	while (!list_empty(&ring->active_list)) {
 		struct drm_i915_gem_object *obj;
 
 		obj = list_first_entry(&ring->active_list,
 				      struct drm_i915_gem_object,
 				      ring_list);
 
 		if (!i915_seqno_passed(seqno, obj->last_rendering_seqno))
 			break;
 
 		if (obj->base.write_domain != 0)
 			i915_gem_object_move_to_flushing(obj);
 		else
 			i915_gem_object_move_to_inactive(obj);
 	}
 
 	if (ring->trace_irq_seqno &&
 	    i915_seqno_passed(seqno, ring->trace_irq_seqno)) {
 		mtx_lock(&ring->irq_lock);
 		ring->irq_put(ring);
 		mtx_unlock(&ring->irq_lock);
 		ring->trace_irq_seqno = 0;
 	}
 }
 
 void
 i915_gem_retire_requests(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj, *next;
 	int i;
 
 	if (!list_empty(&dev_priv->mm.deferred_free_list)) {
 		list_for_each_entry_safe(obj, next,
 		    &dev_priv->mm.deferred_free_list, mm_list)
 			i915_gem_free_object_tail(obj);
 	}
 
 	for (i = 0; i < I915_NUM_RINGS; i++)
 		i915_gem_retire_requests_ring(&dev_priv->rings[i]);
 }
 
 static int
 sandybridge_write_fence_reg(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *pipelined)
 {
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 size = obj->gtt_space->size;
 	int regnum = obj->fence_reg;
 	uint64_t val;
 
 	val = (uint64_t)((obj->gtt_offset + size - 4096) &
 			 0xfffff000) << 32;
 	val |= obj->gtt_offset & 0xfffff000;
 	val |= (uint64_t)((obj->stride / 128) - 1) <<
 		SANDYBRIDGE_FENCE_PITCH_SHIFT;
 
 	if (obj->tiling_mode == I915_TILING_Y)
 		val |= 1 << I965_FENCE_TILING_Y_SHIFT;
 	val |= I965_FENCE_REG_VALID;
 
 	if (pipelined) {
 		int ret = intel_ring_begin(pipelined, 6);
 		if (ret)
 			return ret;
 
 		intel_ring_emit(pipelined, MI_NOOP);
 		intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(2));
 		intel_ring_emit(pipelined, FENCE_REG_SANDYBRIDGE_0 + regnum*8);
 		intel_ring_emit(pipelined, (u32)val);
 		intel_ring_emit(pipelined, FENCE_REG_SANDYBRIDGE_0 + regnum*8 + 4);
 		intel_ring_emit(pipelined, (u32)(val >> 32));
 		intel_ring_advance(pipelined);
 	} else
 		I915_WRITE64(FENCE_REG_SANDYBRIDGE_0 + regnum * 8, val);
 
 	return 0;
 }
 
 static int
 i965_write_fence_reg(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *pipelined)
 {
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 size = obj->gtt_space->size;
 	int regnum = obj->fence_reg;
 	uint64_t val;
 
 	val = (uint64_t)((obj->gtt_offset + size - 4096) &
 		    0xfffff000) << 32;
 	val |= obj->gtt_offset & 0xfffff000;
 	val |= ((obj->stride / 128) - 1) << I965_FENCE_PITCH_SHIFT;
 	if (obj->tiling_mode == I915_TILING_Y)
 		val |= 1 << I965_FENCE_TILING_Y_SHIFT;
 	val |= I965_FENCE_REG_VALID;
 
 	if (pipelined) {
 		int ret = intel_ring_begin(pipelined, 6);
 		if (ret)
 			return ret;
 
 		intel_ring_emit(pipelined, MI_NOOP);
 		intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(2));
 		intel_ring_emit(pipelined, FENCE_REG_965_0 + regnum*8);
 		intel_ring_emit(pipelined, (u32)val);
 		intel_ring_emit(pipelined, FENCE_REG_965_0 + regnum*8 + 4);
 		intel_ring_emit(pipelined, (u32)(val >> 32));
 		intel_ring_advance(pipelined);
 	} else
 		I915_WRITE64(FENCE_REG_965_0 + regnum * 8, val);
 
 	return 0;
 }
 
 static int
 i915_write_fence_reg(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *pipelined)
 {
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 size = obj->gtt_space->size;
 	u32 fence_reg, val, pitch_val;
 	int tile_width;
 
 	if ((obj->gtt_offset & ~I915_FENCE_START_MASK) ||
 	    (size & -size) != size || (obj->gtt_offset & (size - 1))) {
 		printf(
 "object 0x%08x [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n",
 		 obj->gtt_offset, obj->map_and_fenceable, size);
 		return -EINVAL;
 	}
 
 	if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))
 		tile_width = 128;
 	else
 		tile_width = 512;
 
 	/* Note: pitch better be a power of two tile widths */
 	pitch_val = obj->stride / tile_width;
 	pitch_val = ffs(pitch_val) - 1;
 
 	val = obj->gtt_offset;
 	if (obj->tiling_mode == I915_TILING_Y)
 		val |= 1 << I830_FENCE_TILING_Y_SHIFT;
 	val |= I915_FENCE_SIZE_BITS(size);
 	val |= pitch_val << I830_FENCE_PITCH_SHIFT;
 	val |= I830_FENCE_REG_VALID;
 
 	fence_reg = obj->fence_reg;
 	if (fence_reg < 8)
 		fence_reg = FENCE_REG_830_0 + fence_reg * 4;
 	else
 		fence_reg = FENCE_REG_945_8 + (fence_reg - 8) * 4;
 
 	if (pipelined) {
 		int ret = intel_ring_begin(pipelined, 4);
 		if (ret)
 			return ret;
 
 		intel_ring_emit(pipelined, MI_NOOP);
 		intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(1));
 		intel_ring_emit(pipelined, fence_reg);
 		intel_ring_emit(pipelined, val);
 		intel_ring_advance(pipelined);
 	} else
 		I915_WRITE(fence_reg, val);
 
 	return 0;
 }
 
 static int
 i830_write_fence_reg(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *pipelined)
 {
 	struct drm_device *dev = obj->base.dev;
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	u32 size = obj->gtt_space->size;
 	int regnum = obj->fence_reg;
 	uint32_t val;
 	uint32_t pitch_val;
 
 	if ((obj->gtt_offset & ~I830_FENCE_START_MASK) ||
 	    (size & -size) != size || (obj->gtt_offset & (size - 1))) {
 		printf(
 "object 0x%08x not 512K or pot-size 0x%08x aligned\n",
 		    obj->gtt_offset, size);
 		return -EINVAL;
 	}
 
 	pitch_val = obj->stride / 128;
 	pitch_val = ffs(pitch_val) - 1;
 
 	val = obj->gtt_offset;
 	if (obj->tiling_mode == I915_TILING_Y)
 		val |= 1 << I830_FENCE_TILING_Y_SHIFT;
 	val |= I830_FENCE_SIZE_BITS(size);
 	val |= pitch_val << I830_FENCE_PITCH_SHIFT;
 	val |= I830_FENCE_REG_VALID;
 
 	if (pipelined) {
 		int ret = intel_ring_begin(pipelined, 4);
 		if (ret)
 			return ret;
 
 		intel_ring_emit(pipelined, MI_NOOP);
 		intel_ring_emit(pipelined, MI_LOAD_REGISTER_IMM(1));
 		intel_ring_emit(pipelined, FENCE_REG_830_0 + regnum*4);
 		intel_ring_emit(pipelined, val);
 		intel_ring_advance(pipelined);
 	} else
 		I915_WRITE(FENCE_REG_830_0 + regnum * 4, val);
 
 	return 0;
 }
 
 /* Report whether the ring's current seqno has passed the given seqno. */
 static bool ring_passed_seqno(struct intel_ring_buffer *ring, u32 seqno)
 {
 	u32 current_seqno;
 
 	current_seqno = ring->get_seqno(ring);
 	return i915_seqno_passed(current_seqno, seqno);
 }
 
 /*
  * Quiesce fenced GPU access to obj before its fence is changed.
  *
  * Flushes outstanding GPU writes made through the fence, then, unless
  * the last fenced ring is the same as pipelined, waits for the last
  * fenced seqno and clears the fenced ring/seqno bookkeeping.  Returns 0
  * on success or the error from the flush or wait.
  */
 static int
 i915_gem_object_flush_fence(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *pipelined)
 {
 	int ret;
 
 	if (obj->fenced_gpu_access) {
 		/* Push out GPU writes that went through the fence. */
 		if (obj->base.write_domain & I915_GEM_GPU_DOMAINS) {
 			ret = i915_gem_flush_ring(obj->last_fenced_ring, 0,
 			    obj->base.write_domain);
 			if (ret)
 				return ret;
 		}
 
 		obj->fenced_gpu_access = false;
 	}
 
 	if (obj->last_fenced_seqno && pipelined != obj->last_fenced_ring) {
 		/* Only block when the seqno has not already been passed. */
 		if (!ring_passed_seqno(obj->last_fenced_ring,
 				       obj->last_fenced_seqno)) {
 			ret = i915_wait_request(obj->last_fenced_ring,
 						obj->last_fenced_seqno,
 						true);
 			if (ret)
 				return ret;
 		}
 
 		obj->last_fenced_seqno = 0;
 		obj->last_fenced_ring = NULL;
 	}
 
 	/* Ensure that all CPU reads are completed before installing a fence
 	 * and all writes before removing the fence.
 	 */
 	if (obj->base.read_domains & I915_GEM_DOMAIN_GTT)
 		mb();
 
 	return 0;
 }
 
 int
 i915_gem_object_put_fence(struct drm_i915_gem_object *obj)
 {
 	int ret;
 
 	if (obj->tiling_mode)
 		i915_gem_release_mmap(obj);
 
 	ret = i915_gem_object_flush_fence(obj, NULL);
 	if (ret)
 		return ret;
 
 	if (obj->fence_reg != I915_FENCE_REG_NONE) {
 		struct drm_i915_private *dev_priv = obj->base.dev->dev_private;
 
 		if (dev_priv->fence_regs[obj->fence_reg].pin_count != 0)
 			printf("%s: pin_count %d\n", __func__,
 			    dev_priv->fence_regs[obj->fence_reg].pin_count);
 		i915_gem_clear_fence_reg(obj->base.dev,
 					 &dev_priv->fence_regs[obj->fence_reg]);
 
 		obj->fence_reg = I915_FENCE_REG_NONE;
 	}
 
 	return 0;
 }
 
 static struct drm_i915_fence_reg *
 i915_find_fence_reg(struct drm_device *dev, struct intel_ring_buffer *pipelined)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_fence_reg *reg, *first, *avail;
 	int i;
 
 	/* First try to find a free reg */
 	avail = NULL;
 	for (i = dev_priv->fence_reg_start; i < dev_priv->num_fence_regs; i++) {
 		reg = &dev_priv->fence_regs[i];
 		if (!reg->obj)
 			return reg;
 
 		if (!reg->pin_count)
 			avail = reg;
 	}
 
 	if (avail == NULL)
 		return NULL;
 
 	/* None available, try to steal one or wait for a user to finish */
 	avail = first = NULL;
 	list_for_each_entry(reg, &dev_priv->mm.fence_list, lru_list) {
 		if (reg->pin_count)
 			continue;
 
 		if (first == NULL)
 			first = reg;
 
 		if (!pipelined ||
 		    !reg->obj->last_fenced_ring ||
 		    reg->obj->last_fenced_ring == pipelined) {
 			avail = reg;
 			break;
 		}
 	}
 
 	if (avail == NULL)
 		avail = first;
 
 	return avail;
 }
 
 /*
  * Make sure obj owns a correctly programmed fence register.
  *
  * If obj already has a register it is bumped to the tail of the fence
  * LRU list and only reprogrammed when its tiling changed.  Otherwise a
  * register is found (possibly stolen from another object, whose fence
  * state is flushed and cleared first) and programmed for obj through the
  * generation-specific write routine.
  *
  * Returns 0 on success, -EDEADLK when no register can be found, or the
  * error from a fence flush, wait, or register write.
  */
 int
 i915_gem_object_get_fence(struct drm_i915_gem_object *obj,
     struct intel_ring_buffer *pipelined)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct drm_i915_fence_reg *reg;
 	int ret;
 
 	/*
 	 * The caller's pipelined argument is overridden here, so every
 	 * "pipelined" test below sees NULL.
 	 */
 	pipelined = NULL;
 	ret = 0;
 
 	if (obj->fence_reg != I915_FENCE_REG_NONE) {
 		/* Already fenced: mark the register most recently used. */
 		reg = &dev_priv->fence_regs[obj->fence_reg];
 		list_move_tail(&reg->lru_list, &dev_priv->mm.fence_list);
 
 		if (obj->tiling_changed) {
 			ret = i915_gem_object_flush_fence(obj, pipelined);
 			if (ret)
 				return ret;
 
 			if (!obj->fenced_gpu_access && !obj->last_fenced_seqno)
 				pipelined = NULL;
 
 			if (pipelined) {
 				reg->setup_seqno =
 					i915_gem_next_request_seqno(pipelined);
 				obj->last_fenced_seqno = reg->setup_seqno;
 				obj->last_fenced_ring = pipelined;
 			}
 
 			goto update;
 		}
 
 		if (!pipelined) {
 			/* Wait for a pending pipelined setup to land. */
 			if (reg->setup_seqno) {
 				if (!ring_passed_seqno(obj->last_fenced_ring,
 				    reg->setup_seqno)) {
 					ret = i915_wait_request(
 					    obj->last_fenced_ring,
 					    reg->setup_seqno,
 					    true);
 					if (ret)
 						return ret;
 				}
 
 				reg->setup_seqno = 0;
 			}
 		} else if (obj->last_fenced_ring &&
 			   obj->last_fenced_ring != pipelined) {
 			ret = i915_gem_object_flush_fence(obj, pipelined);
 			if (ret)
 				return ret;
 		}
 
 		if (!obj->fenced_gpu_access && !obj->last_fenced_seqno)
 			pipelined = NULL;
 		KASSERT(pipelined || reg->setup_seqno == 0, ("!pipelined"));
 
 		if (obj->tiling_changed) {
 			if (pipelined) {
 				reg->setup_seqno =
 					i915_gem_next_request_seqno(pipelined);
 				obj->last_fenced_seqno = reg->setup_seqno;
 				obj->last_fenced_ring = pipelined;
 			}
 			goto update;
 		}
 
 		return 0;
 	}
 
 	/* obj has no fence yet: find a register for it. */
 	reg = i915_find_fence_reg(dev, pipelined);
 	if (reg == NULL)
 		return -EDEADLK;
 
 	ret = i915_gem_object_flush_fence(obj, pipelined);
 	if (ret)
 		return ret;
 
 	if (reg->obj) {
 		/* Steal the register from its current owner. */
 		struct drm_i915_gem_object *old = reg->obj;
 
 		/* Hold a reference on old while its fence state is torn down. */
 		drm_gem_object_reference(&old->base);
 
 		if (old->tiling_mode)
 			i915_gem_release_mmap(old);
 
 		ret = i915_gem_object_flush_fence(old, pipelined);
 		if (ret) {
 			drm_gem_object_unreference(&old->base);
 			return ret;
 		}
 
 		if (old->last_fenced_seqno == 0 && obj->last_fenced_seqno == 0)
 			pipelined = NULL;
 
 		old->fence_reg = I915_FENCE_REG_NONE;
 		old->last_fenced_ring = pipelined;
 		old->last_fenced_seqno =
 			pipelined ? i915_gem_next_request_seqno(pipelined) : 0;
 
 		drm_gem_object_unreference(&old->base);
 	} else if (obj->last_fenced_seqno == 0)
 		pipelined = NULL;
 
 	/* Record the new ownership on both the register and the object. */
 	reg->obj = obj;
 	list_move_tail(&reg->lru_list, &dev_priv->mm.fence_list);
 	obj->fence_reg = reg - dev_priv->fence_regs;
 	obj->last_fenced_ring = pipelined;
 
 	reg->setup_seqno =
 		pipelined ? i915_gem_next_request_seqno(pipelined) : 0;
 	obj->last_fenced_seqno = reg->setup_seqno;
 
 update:
 	/* Program the register using the layout for this generation. */
 	obj->tiling_changed = false;
 	switch (INTEL_INFO(dev)->gen) {
 	case 7:
 	case 6:
 		ret = sandybridge_write_fence_reg(obj, pipelined);
 		break;
 	case 5:
 	case 4:
 		ret = i965_write_fence_reg(obj, pipelined);
 		break;
 	case 3:
 		ret = i915_write_fence_reg(obj, pipelined);
 		break;
 	case 2:
 		ret = i830_write_fence_reg(obj, pipelined);
 		break;
 	}
 
 	return ret;
 }
 
 /*
  * Write zero to the hardware fence register backing reg, using the
  * register layout of the device generation, and reset the software
  * state of reg (LRU linkage, owner, setup seqno, pin count).
  */
 static void
 i915_gem_clear_fence_reg(struct drm_device *dev, struct drm_i915_fence_reg *reg)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	uint32_t fence_reg = reg - dev_priv->fence_regs;
 	uint32_t mmio;
 
 	switch (INTEL_INFO(dev)->gen) {
 	case 7:
 	case 6:
 		I915_WRITE64(FENCE_REG_SANDYBRIDGE_0 + fence_reg*8, 0);
 		break;
 	case 5:
 	case 4:
 		I915_WRITE64(FENCE_REG_965_0 + fence_reg*8, 0);
 		break;
 	case 3:
 		/* Gen3 registers 8 and up live in a second MMIO bank. */
 		if (fence_reg >= 8) {
 			mmio = FENCE_REG_945_8 + (fence_reg - 8) * 4;
 			I915_WRITE(mmio, 0);
 			break;
 		}
 		/* FALLTHROUGH */
 	case 2:
 		mmio = FENCE_REG_830_0 + fence_reg * 4;
 		I915_WRITE(mmio, 0);
 		break;
 	}
 
 	list_del_init(&reg->lru_list);
 	reg->obj = NULL;
 	reg->setup_seqno = 0;
 	reg->pin_count = 0;
 }
 
 /* Object init hook: only logs that it was called and reports success. */
 int
 i915_gem_init_object(struct drm_gem_object *obj)
 {
 	int error = 0;
 
 	printf("i915_gem_init_object called\n");
 	return (error);
 }
 
 static bool
 i915_gem_object_is_inactive(struct drm_i915_gem_object *obj)
 {
 
 	return (obj->gtt_space && !obj->active && obj->pin_count == 0);
 }
 
 static void
 i915_gem_retire_task_handler(void *arg, int pending)
 {
 	drm_i915_private_t *dev_priv;
 	struct drm_device *dev;
 	bool idle;
 	int i;
 
 	dev_priv = arg;
 	dev = dev_priv->dev;
 
 	/* Come back later if the device is busy... */
 	if (!sx_try_xlock(&dev->dev_struct_lock)) {
 		taskqueue_enqueue_timeout(dev_priv->tq,
 		    &dev_priv->mm.retire_task, hz);
 		return;
 	}
 
 	CTR0(KTR_DRM, "retire_task");
 
 	i915_gem_retire_requests(dev);
 
 	/* Send a periodic flush down the ring so we don't hold onto GEM
 	 * objects indefinitely.
 	 */
 	idle = true;
 	for (i = 0; i < I915_NUM_RINGS; i++) {
 		struct intel_ring_buffer *ring = &dev_priv->rings[i];
 
 		if (!list_empty(&ring->gpu_write_list)) {
 			struct drm_i915_gem_request *request;
 			int ret;
 
 			ret = i915_gem_flush_ring(ring,
 						  0, I915_GEM_GPU_DOMAINS);
 			request = malloc(sizeof(*request), DRM_I915_GEM,
 			    M_WAITOK | M_ZERO);
 			if (ret || request == NULL ||
 			    i915_add_request(ring, NULL, request))
 				free(request, DRM_I915_GEM);
 		}
 
 		idle &= list_empty(&ring->request_list);
 	}
 
 	if (!dev_priv->mm.suspended && !idle)
 		taskqueue_enqueue_timeout(dev_priv->tq,
 		    &dev_priv->mm.retire_task, hz);
 
 	DRM_UNLOCK(dev);
 }
 
 void
 i915_gem_lastclose(struct drm_device *dev)
 {
 	int ret;
 
 	if (drm_core_check_feature(dev, DRIVER_MODESET))
 		return;
 
 	ret = i915_gem_idle(dev);
 	if (ret != 0)
 		DRM_ERROR("failed to idle hardware: %d\n", ret);
 }
 
 static int
 i915_gem_init_phys_object(struct drm_device *dev, int id, int size, int align)
 {
 	drm_i915_private_t *dev_priv;
 	struct drm_i915_gem_phys_object *phys_obj;
 	int ret;
 
 	dev_priv = dev->dev_private;
 	if (dev_priv->mm.phys_objs[id - 1] != NULL || size == 0)
 		return (0);
 
 	phys_obj = malloc(sizeof(struct drm_i915_gem_phys_object), DRM_I915_GEM,
 	    M_WAITOK | M_ZERO);
 
 	phys_obj->id = id;
 
 	phys_obj->handle = drm_pci_alloc(dev, size, align, ~0);
 	if (phys_obj->handle == NULL) {
 		ret = -ENOMEM;
 		goto free_obj;
 	}
 	pmap_change_attr((vm_offset_t)phys_obj->handle->vaddr,
 	    size / PAGE_SIZE, PAT_WRITE_COMBINING);
 
 	dev_priv->mm.phys_objs[id - 1] = phys_obj;
 
 	return (0);
 
 free_obj:
 	free(phys_obj, DRM_I915_GEM);
 	return (ret);
 }
 
 /*
  * Destroy the physical object in slot id (1-based), detaching the GEM
  * object currently bound to it first.  Does nothing for an empty slot.
  */
 static void
 i915_gem_free_phys_object(struct drm_device *dev, int id)
 {
 	drm_i915_private_t *priv = dev->dev_private;
 	struct drm_i915_gem_phys_object *phys;
 
 	phys = priv->mm.phys_objs[id - 1];
 	if (phys == NULL)
 		return;
 
 	if (phys->cur_obj != NULL)
 		i915_gem_detach_phys_object(dev, phys->cur_obj);
 
 	drm_pci_free(dev, phys->handle);
 	free(phys, DRM_I915_GEM);
 	priv->mm.phys_objs[id - 1] = NULL;
 }
 
 /*
  * Free every physical object slot, from I915_GEM_PHYS_CURSOR_0 through
  * I915_MAX_PHYS_OBJECT inclusive.
  */
 void
 i915_gem_free_all_phys_object(struct drm_device *dev)
 {
 	int id;
 
 	id = I915_GEM_PHYS_CURSOR_0;
 	while (id <= I915_MAX_PHYS_OBJECT) {
 		i915_gem_free_phys_object(dev, id);
 		id++;
 	}
 }
 
 /*
  * Unbind obj from its physical object, copying the contents of the
  * physical backing store into the object's own pages first.
  *
  * Pages that cannot be wired are skipped (see the XXX below), and the
  * copy for a page is skipped when no sf_buf mapping is obtained.  Does
  * nothing when obj has no physical object.
  */
 void
 i915_gem_detach_phys_object(struct drm_device *dev,
     struct drm_i915_gem_object *obj)
 {
 	vm_page_t m;
 	struct sf_buf *sf;
 	char *vaddr, *dst;
 	int i, page_count;
 
 	if (obj->phys_obj == NULL)
 		return;
 	vaddr = obj->phys_obj->handle->vaddr;
 
 	page_count = obj->base.size / PAGE_SIZE;
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 	for (i = 0; i < page_count; i++) {
 		m = i915_gem_wire_page(obj->base.vm_obj, i);
 		if (m == NULL)
 			continue; /* XXX */
 
 		/* The object lock is dropped around the mapping and copy. */
 		VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 		sf = sf_buf_alloc(m, 0);
 		if (sf != NULL) {
 			dst = (char *)sf_buf_kva(sf);
 			memcpy(dst, vaddr + IDX_TO_OFF(i), PAGE_SIZE);
 			sf_buf_free(sf);
 		}
 		drm_clflush_pages(&m, 1);
 
 		/* Mark the page referenced and dirty, then drop the wiring. */
 		VM_OBJECT_WLOCK(obj->base.vm_obj);
 		vm_page_reference(m);
 		vm_page_lock(m);
 		vm_page_dirty(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 	intel_gtt_chipset_flush();
 
 	obj->phys_obj->cur_obj = NULL;
 	obj->phys_obj = NULL;
 }
 
 /*
  * Bind obj to the physical object in slot id (1-based), creating the
  * physical object on demand, and copy the object's current page
  * contents into the physical backing store.
  *
  * An object already bound to a different slot is detached first; one
  * already bound to the requested slot is left alone.
  *
  * Returns 0 on success, -EINVAL when id exceeds I915_MAX_PHYS_OBJECT,
  * the error from i915_gem_init_phys_object(), or -EIO when a page of the
  * object cannot be wired.  In the -EIO case the copy stops at the
  * failing page and obj stays bound to the physical object.
  */
 int
 i915_gem_attach_phys_object(struct drm_device *dev,
     struct drm_i915_gem_object *obj, int id, int align)
 {
 	drm_i915_private_t *dev_priv;
 	vm_page_t m;
 	struct sf_buf *sf;
 	char *dst, *src;
 	int i, page_count, ret;
 
 	if (id > I915_MAX_PHYS_OBJECT)
 		return (-EINVAL);
 
 	if (obj->phys_obj != NULL) {
 		if (obj->phys_obj->id == id)
 			return (0);
 		i915_gem_detach_phys_object(dev, obj);
 	}
 
 	dev_priv = dev->dev_private;
 	if (dev_priv->mm.phys_objs[id - 1] == NULL) {
 		ret = i915_gem_init_phys_object(dev, id, obj->base.size, align);
 		if (ret != 0) {
 			DRM_ERROR("failed to init phys object %d size: %zu\n",
 				  id, obj->base.size);
 			return (ret);
 		}
 	}
 
 	/* bind to the object */
 	obj->phys_obj = dev_priv->mm.phys_objs[id - 1];
 	obj->phys_obj->cur_obj = obj;
 
 	page_count = obj->base.size / PAGE_SIZE;
 
 	VM_OBJECT_WLOCK(obj->base.vm_obj);
 	ret = 0;
 	for (i = 0; i < page_count; i++) {
 		m = i915_gem_wire_page(obj->base.vm_obj, i);
 		if (m == NULL) {
 			ret = -EIO;
 			break;
 		}
 		/* The object lock is dropped around the mapping and copy. */
 		VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 		sf = sf_buf_alloc(m, 0);
 		src = (char *)sf_buf_kva(sf);
 		dst = (char *)obj->phys_obj->handle->vaddr + IDX_TO_OFF(i);
 		memcpy(dst, src, PAGE_SIZE);
 		sf_buf_free(sf);
 
 		VM_OBJECT_WLOCK(obj->base.vm_obj);
 
 		vm_page_reference(m);
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 		atomic_add_long(&i915_gem_wired_pages_cnt, -1);
 	}
 	VM_OBJECT_WUNLOCK(obj->base.vm_obj);
 
 	/*
 	 * Propagate a page-wiring failure; this used to return 0
 	 * unconditionally, hiding the -EIO set in the loop above.
 	 */
 	return (ret);
 }
 
 /*
  * Copy size bytes from the user buffer at data_ptr into the object's
  * physical backing store at offset, then flush the chipset.
  *
  * A non-faulting copy is tried first with the device lock held; if that
  * fails the lock is dropped for a regular copyin().  Returns 0 on
  * success or the negated copyin() error.
  */
 static int
 i915_gem_phys_pwrite(struct drm_device *dev, struct drm_i915_gem_object *obj,
     uint64_t data_ptr, uint64_t offset, uint64_t size,
     struct drm_file *file_priv)
 {
 	char *src, *dst;
 	int error;
 
 	dst = (char *)obj->phys_obj->handle->vaddr + offset;
 	src = (char *)(uintptr_t)data_ptr;
 
 	error = 0;
 	if (copyin_nofault(src, dst, size) != 0) {
 		/* The physical object once assigned is fixed for the lifetime
 		 * of the obj, so we can safely drop the lock and continue
 		 * to access vaddr.
 		 */
 		DRM_UNLOCK(dev);
 		error = -copyin(src, dst, size);
 		DRM_LOCK(dev);
 	}
 
 	if (error == 0)
 		intel_gtt_chipset_flush();
 	return (error);
 }
 
 /*
  * Return non-zero when any object is on the device's flushing or
  * active list.
  */
 static int
 i915_gpu_is_active(struct drm_device *dev)
 {
 	drm_i915_private_t *priv = dev->dev_private;
 
 	if (!list_empty(&priv->mm.flushing_list))
 		return (1);
 	return (!list_empty(&priv->mm.active_list));
 }
 
 /*
  * Low-memory handler; arg is the drm device.
  *
  * Gives up immediately when the device lock cannot be taken without
  * blocking.  Otherwise it retires requests, unbinds purgeable objects
  * on the inactive list, then tries to unbind everything still on that
  * list.  If too many unbinds failed while the GPU still has flushing or
  * active objects, it idles the GPU and scans again.
  */
 static void
 i915_gem_lowmem(void *arg)
 {
 	struct drm_device *dev;
 	struct drm_i915_private *dev_priv;
 	struct drm_i915_gem_object *obj, *next;
 	int cnt, cnt_fail, cnt_total;
 
 	dev = arg;
 	dev_priv = dev->dev_private;
 
 	if (!sx_try_xlock(&dev->dev_struct_lock))
 		return;
 
 	CTR0(KTR_DRM, "gem_lowmem");
 
 rescan:
 	/* first scan for clean buffers */
 	i915_gem_retire_requests(dev);
 
 	cnt_total = cnt_fail = cnt = 0;
 
 	/*
 	 * cnt_total counts the objects left after this pass: non-purgeable
 	 * ones and purgeable ones that failed to unbind.
 	 */
 	list_for_each_entry_safe(obj, next, &dev_priv->mm.inactive_list,
 	    mm_list) {
 		if (i915_gem_object_is_purgeable(obj)) {
 			if (i915_gem_object_unbind(obj) != 0)
 				cnt_total++;
 		} else
 			cnt_total++;
 	}
 
 	/* second pass, evict/count anything still on the inactive list */
 	list_for_each_entry_safe(obj, next, &dev_priv->mm.inactive_list,
 	    mm_list) {
 		if (i915_gem_object_unbind(obj) == 0)
 			cnt++;
 		else
 			cnt_fail++;
 	}
 
 	/* Retry only when more than 1% of the remaining objects failed. */
 	if (cnt_fail > cnt_total / 100 && i915_gpu_is_active(dev)) {
 		/*
 		 * We are desperate for pages, so as a last resort, wait
 		 * for the GPU to finish and discard whatever we can.
 		 * This has a dramatic impact to reduce the number of
 		 * OOM-killer events whilst running the GPU aggressively.
 		 */
 		if (i915_gpu_idle(dev, true) == 0)
 			goto rescan;
 	}
 	DRM_UNLOCK(dev);
 }
 
 /* Deregister the driver's vm_lowmem event handler. */
 void
 i915_gem_unload(struct drm_device *dev)
 {
 	struct drm_i915_private *priv = dev->dev_private;
 
 	EVENTHANDLER_DEREGISTER(vm_lowmem, priv->mm.i915_lowmem);
 }
Index: user/attilio/rm_vmobj_cache/sys/kern/uipc_syscalls.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/kern/uipc_syscalls.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/kern/uipc_syscalls.c	(revision 267237)
@@ -1,3668 +1,3668 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * sendfile(2) and related extensions:
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_sctp.h"
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/condvar.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/filedesc.h>
 #include <sys/event.h>
 #include <sys/proc.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/sf_sync.h>
 #include <sys/sf_base.h>
 #include <sys/sysent.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_util.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #if defined(INET) || defined(INET6)
 #ifdef SCTP
 #include <netinet/sctp.h>
 #include <netinet/sctp_peeloff.h>
 #endif /* SCTP */
 #endif /* INET || INET6 */
 
 /*
  * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC
  * and SOCK_NONBLOCK.
  */
 #define	ACCEPT4_INHERIT	0x1
 #define	ACCEPT4_COMPAT	0x2
 
 /* Forward declarations of helpers that are static to this file. */
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
 static int accept1(struct thread *td, int s, struct sockaddr *uname,
 		   socklen_t *anamelen, int flags);
 static int do_sendfile(struct thread *td, struct sendfile_args *uap,
 		   int compat);
 static int getsockname1(struct thread *td, struct getsockname_args *uap,
 			int compat);
 static int getpeername1(struct thread *td, struct getpeername_args *uap,
 			int compat);
 
 /*
  * sendfile statistics counters: one entry per uint64_t-sized slot of
  * struct sfstat.  Allocated by sfstat_init().
  */
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
 /* Handlers referenced by sendfile_filtops. */
 static int	filt_sfsync_attach(struct knote *kn);
 static void	filt_sfsync_detach(struct knote *kn);
 static int	filt_sfsync(struct knote *kn, long hint);
 
 /*
  * sendfile(2)-related variables and associated sysctls
  */
 static SYSCTL_NODE(_kern_ipc, OID_AUTO, sendfile, CTLFLAG_RW, 0,
     "sendfile(2) tunables");
 static int sfreadahead = 1;
 SYSCTL_INT(_kern_ipc_sendfile, OID_AUTO, readahead, CTLFLAG_RW,
     &sfreadahead, 0, "Number of sendfile(2) read-ahead MAXBSIZE blocks");
 
 /*
  * SFSYNC_DPRINTF(): printf()-style tracing.  It is compiled in only when
  * SFSYNC_DEBUG is defined and, even then, prints only while the
  * debug.sf_sync_debug sysctl is non-zero.  Without SFSYNC_DEBUG the macro
  * expands to nothing, so its arguments are not evaluated.
  */
 #ifdef	SFSYNC_DEBUG
 static int sf_sync_debug = 0;
 SYSCTL_INT(_debug, OID_AUTO, sf_sync_debug, CTLFLAG_RW,
     &sf_sync_debug, 0, "Output debugging during sf_sync lifecycle");
 #define	SFSYNC_DPRINTF(s, ...)				\
 		do {					\
 			if (sf_sync_debug)		\
 				printf((s), ##__VA_ARGS__); \
 		} while (0)
 #else
 #define	SFSYNC_DPRINTF(c, ...)
 #endif
 
 /* UMA zone for struct sendfile_sync; created in sf_sync_init(). */
 static uma_zone_t	zone_sfsync;
 
 /* Filter operations registered for EVFILT_SENDFILE in sf_sync_init(). */
 static struct filterops sendfile_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_sfsync_attach,
 	.f_detach = filt_sfsync_detach,
 	.f_event = filt_sfsync,
 };
 
 /*
  * SYSINIT hook: allocate one counter for every uint64_t-sized slot of
  * struct sfstat, passing M_WAITOK to the allocator.
  */
 static void
 sfstat_init(const void *unused)
 {
 	const int nstats = sizeof(struct sfstat) / sizeof(uint64_t);
 
 	COUNTER_ARRAY_ALLOC(sfstat, nstats, M_WAITOK);
 }
 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 
 static void
 sf_sync_init(const void *unused)
 {
 
 	zone_sfsync = uma_zcreate("sendfile_sync", sizeof(struct sendfile_sync),
 	    NULL, NULL,
 	    NULL, NULL,
 	    UMA_ALIGN_CACHE,
 	    0);
 	kqueue_add_filteropts(EVFILT_SENDFILE, &sendfile_filtops);
 }
 SYSINIT(sf_sync, SI_SUB_MBUF, SI_ORDER_FIRST, sf_sync_init, NULL);
 
 static int
 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct sfstat s;
 
 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 	if (req->newptr)
 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 	return (SYSCTL_OUT(req, &s, sizeof(s)));
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
 
 /*
  * Look up descriptor `fd' in `fdp', checking it against the capability
  * rights in `rightsp', and make sure it refers to a socket.
  *
  * On success *fpp holds a referenced file entry (the caller must fdrop()
  * it) and, if fflagp is non-NULL, *fflagp receives the file's f_flag.
  * Returns ENOTSOCK for a non-socket descriptor, or the lookup error.
  */
 static int
 getsock_cap(struct filedesc *fdp, int fd, cap_rights_t *rightsp,
     struct file **fpp, u_int *fflagp)
 {
 	struct file *sockfp;
 	int error;
 
 	error = fget_unlocked(fdp, fd, rightsp, 0, &sockfp, NULL);
 	if (error == 0 && sockfp->f_type != DTYPE_SOCKET) {
 		/* Not a socket: give back the reference we just took. */
 		fdrop(sockfp, curthread);
 		error = ENOTSOCK;
 	}
 	if (error != 0)
 		return (error);
 
 	if (fflagp != NULL)
 		*fflagp = sockfp->f_flag;
 	*fpp = sockfp;
 	return (0);
 }
 
 /*
  * System call interface to the socket abstraction.
  */
 #if defined(COMPAT_43)
 #define COMPAT_OLDSOCK
 #endif
 
 int
 sys_socket(td, uap)
 	struct thread *td;
 	struct socket_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
 	} */ *uap;
 {
 	struct socket *so;
 	struct file *fp;
 	int fd, error, type, oflag, fflag;
 
 	AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
 
 	type = uap->type;
 	oflag = 0;
 	fflag = 0;
 	if ((type & SOCK_CLOEXEC) != 0) {
 		type &= ~SOCK_CLOEXEC;
 		oflag |= O_CLOEXEC;
 	}
 	if ((type & SOCK_NONBLOCK) != 0) {
 		type &= ~SOCK_NONBLOCK;
 		fflag |= FNONBLOCK;
 	}
 
 #ifdef MAC
 	error = mac_socket_check_create(td->td_ucred, uap->domain, type,
 	    uap->protocol);
 	if (error != 0)
 		return (error);
 #endif
 	error = falloc(td, &fp, &fd, oflag);
 	if (error != 0)
 		return (error);
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	error = socreate(uap->domain, &so, type, uap->protocol,
 	    td->td_ucred, td);
 	if (error != 0) {
 		fdclose(td->td_proc->p_fd, fp, fd, td);
 	} else {
 		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
 		if ((fflag & FNONBLOCK) != 0)
 			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
 		td->td_retval[0] = fd;
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 sys_bind(td, uap)
 	struct thread *td;
 	struct bind_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_bind(td, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 static int
 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 {
 	struct socket *so;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 	error = getsock_cap(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_BIND), &fp, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(sa);
 #endif
 #ifdef MAC
 	error = mac_socket_check_bind(td->td_ucred, so, sa);
 	if (error == 0) {
 #endif
 		if (dirfd == AT_FDCWD)
 			error = sobind(so, sa, td);
 		else
 			error = sobindat(dirfd, so, sa, td);
 #ifdef MAC
 	}
 #endif
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 kern_bind(struct thread *td, int fd, struct sockaddr *sa)
 {
 
 	return (kern_bindat(td, AT_FDCWD, fd, sa));
 }
 
 /* ARGSUSED */
 int
 sys_bindat(td, uap)
 	struct thread *td;
 	struct bindat_args /* {
 		int	fd;
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_bindat(td, uap->fd, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 /* ARGSUSED */
 int
 sys_listen(td, uap)
 	struct thread *td;
 	struct listen_args /* {
 		int	s;
 		int	backlog;
 	} */ *uap;
 {
 	struct socket *so;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(uap->s);
 	error = getsock_cap(td->td_proc->p_fd, uap->s,
 	    cap_rights_init(&rights, CAP_LISTEN), &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
 		error = mac_socket_check_listen(td->td_ucred, so);
 		if (error == 0)
 #endif
 			error = solisten(so, uap->backlog, td);
 		fdrop(fp, td);
 	}
 	return(error);
 }
 
 /*
  * accept1()
  *
  * Common back end of accept(2), accept4(2) and the old-socket oaccept():
  * accepts a connection through kern_accept4() and, when the caller asked
  * for the peer address (uname != NULL), copies the address and its length
  * out to user space.
  *
  * uname/anamelen: user-space buffer and in/out length for the peer
  *	address; when uname is NULL no address is returned.
  * flags: ACCEPT4_* and SOCK_* bits, passed through to kern_accept4().
  *
  * Returns 0 or an errno value.  If copying the address out fails, the
  * newly created descriptor (td->td_retval[0]) is closed again.
  */
 static int
 accept1(td, s, uname, anamelen, flags)
 	struct thread *td;
 	int s;
 	struct sockaddr *uname;
 	socklen_t *anamelen;
 	int flags;
 {
 	struct sockaddr *name;
 	socklen_t namelen;
 	struct file *fp;
 	int error;
 
 	if (uname == NULL)
 		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
 
 	error = copyin(anamelen, &namelen, sizeof (namelen));
 	if (error != 0)
 		return (error);
 
 	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
 
 	if (error != 0)
 		return (error);
 
 #ifdef COMPAT_OLDSOCK
 	/*
 	 * kern_accept4() can succeed with no address at all (name == NULL,
 	 * namelen == 0), so only rewrite the family field when there is an
 	 * address to rewrite.
 	 */
 	if ((flags & ACCEPT4_COMPAT) != 0 && name != NULL)
 		((struct osockaddr *)name)->sa_family =
 		    name->sa_family;
 #endif
 	error = copyout(name, uname, namelen);
 	if (error == 0)
 		error = copyout(&namelen, anamelen,
 		    sizeof(namelen));
 	if (error != 0)
 		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
 	fdrop(fp, td);
 	free(name, M_SONAME);
 	return (error);
 }
 
 /*
  * Accept with ACCEPT4_INHERIT semantics; see kern_accept4() for the
  * meaning of the remaining arguments.
  */
 int
 kern_accept(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, struct file **fp)
 {
 	int error;
 
 	error = kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp);
 	return (error);
 }
 
 /*
  * Accept one connection from the listening socket behind descriptor `s'
  * and attach it to a newly allocated descriptor, whose number is stored in
  * td->td_retval[0].
  *
  * name/namelen: when name is non-NULL, *name receives the peer address
  *	from soaccept() (it stays NULL if there is none) and *namelen is
  *	clamped to the address length, or set to 0 when there is no
  *	address.  accept1() releases *name with free(name, M_SONAME).
  * flags: with ACCEPT4_INHERIT the new socket copies SS_NBIO and the SIGIO
  *	owner from the listening socket; otherwise SOCK_NONBLOCK selects
  *	non-blocking mode.  SOCK_CLOEXEC requests O_CLOEXEC on the new
  *	descriptor.
  * fp: when non-NULL, receives a referenced file entry for the new
  *	descriptor on success and NULL on failure.
  *
  * Returns 0 or an errno value; EINVAL if the socket is not listening and
  * EWOULDBLOCK if it is non-blocking with nothing queued.
  */
 int
 kern_accept4(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, int flags, struct file **fp)
 {
 	struct filedesc *fdp;
 	struct file *headfp, *nfp = NULL;
 	struct sockaddr *sa = NULL;
 	struct socket *head, *so;
 	cap_rights_t rights;
 	u_int fflag;
 	pid_t pgid;
 	int error, fd, tmp;
 
 	if (name != NULL)
 		*name = NULL;
 
 	AUDIT_ARG_FD(s);
 	fdp = td->td_proc->p_fd;
 	error = getsock_cap(fdp, s, cap_rights_init(&rights, CAP_ACCEPT),
 	    &headfp, &fflag);
 	if (error != 0)
 		return (error);
 	head = headfp->f_data;
 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
 		error = EINVAL;
 		goto done;
 	}
 #ifdef MAC
 	error = mac_socket_check_accept(td->td_ucred, head);
 	if (error != 0)
 		goto done;
 #endif
 	error = falloc(td, &nfp, &fd, (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0);
 	if (error != 0)
 		goto done;
 	ACCEPT_LOCK();
 	/* Non-blocking listener with an empty completed queue: don't wait. */
 	if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
 		ACCEPT_UNLOCK();
 		error = EWOULDBLOCK;
 		goto noconnection;
 	}
 	/* Sleep until a connection completes or the listener reports an error. */
 	while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
 		if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			head->so_error = ECONNABORTED;
 			break;
 		}
 		error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
 		    "accept", 0);
 		if (error != 0) {
 			ACCEPT_UNLOCK();
 			goto noconnection;
 		}
 	}
 	/* Report and clear a pending error on the listening socket. */
 	if (head->so_error) {
 		error = head->so_error;
 		head->so_error = 0;
 		ACCEPT_UNLOCK();
 		goto noconnection;
 	}
 	so = TAILQ_FIRST(&head->so_comp);
 	KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
 	KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
 
 	/*
 	 * Before changing the flags on the socket, we have to bump the
 	 * reference count.  Otherwise, if the protocol calls sofree(),
 	 * the socket will be released due to a zero refcount.
 	 */
 	SOCK_LOCK(so);			/* soref() and so_state update */
 	soref(so);			/* file descriptor reference */
 
 	/* Detach the connection from the listener's completed queue. */
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
 	if (flags & ACCEPT4_INHERIT)
 		so->so_state |= (head->so_state & SS_NBIO);
 	else
 		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
 
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	td->td_retval[0] = fd;
 
 	/* connection has been removed from the listen queue */
 	KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
 
 	if (flags & ACCEPT4_INHERIT) {
 		/* Inherit the listener's SIGIO owner, if it has one. */
 		pgid = fgetown(&head->so_sigio);
 		if (pgid != 0)
 			fsetown(pgid, &so->so_sigio);
 	} else {
 		/* Start from the listener's file flags minus FNONBLOCK/FASYNC. */
 		fflag &= ~(FNONBLOCK | FASYNC);
 		if (flags & SOCK_NONBLOCK)
 			fflag |= FNONBLOCK;
 	}
 
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	/* Sync socket nonblocking/async state with file flags */
 	tmp = fflag & FNONBLOCK;
 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
 	tmp = fflag & FASYNC;
 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
 	sa = 0;
 	error = soaccept(so, &sa);
 	if (error != 0)
 		goto noconnection;
 	/* No peer address available: report a zero-length name. */
 	if (sa == NULL) {
 		if (name)
 			*namelen = 0;
 		goto done;
 	}
 	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
 	if (name) {
 		/* check sa_len before it is destroyed */
 		if (*namelen > sa->sa_len)
 			*namelen = sa->sa_len;
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_STRUCT))
 			ktrsockaddr(sa);
 #endif
 		/* Ownership of the address moves to the caller. */
 		*name = sa;
 		sa = NULL;
 	}
 noconnection:
 	/* sa is NULL here whenever the address was handed to the caller. */
 	free(sa, M_SONAME);
 
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error != 0)
 		fdclose(fdp, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.  We return
 	 * a reference on nfp to the caller on success if they request it.
 	 */
 done:
 	if (fp != NULL) {
 		if (error == 0) {
 			*fp = nfp;
 			nfp = NULL;
 		} else
 			*fp = NULL;
 	}
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fdrop(headfp, td);
 	return (error);
 }
 
 int
 sys_accept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 
 	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
 }
 
 /*
  * accept4(2): only SOCK_CLOEXEC and SOCK_NONBLOCK are accepted in flags;
  * anything else yields EINVAL.
  */
 int
 sys_accept4(td, uap)
 	struct thread *td;
 	struct accept4_args *uap;
 {
 	const int allowed = SOCK_CLOEXEC | SOCK_NONBLOCK;
 
 	if ((uap->flags & ~allowed) != 0)
 		return (EINVAL);
 
 	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old-socket accept: accept1() with ACCEPT4_INHERIT and ACCEPT4_COMPAT, so
  * the returned address is rewritten into struct osockaddr form.
  */
 int
 oaccept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 	const int flags = ACCEPT4_INHERIT | ACCEPT4_COMPAT;
 
 	return (accept1(td, uap->s, uap->name, uap->anamelen, flags));
 }
 #endif /* COMPAT_OLDSOCK */
 
 /* ARGSUSED */
 int
 sys_connect(td, uap)
 	struct thread *td;
 	struct connect_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_connect(td, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 /*
  * Connect the socket behind descriptor `fd' to address `sa'.  dirfd picks
  * between soconnect() (AT_FDCWD) and soconnectat() (any other value).  The
  * descriptor must carry the CAP_CONNECT right.
  *
  * Returns EALREADY if a connection attempt is already in progress and
  * EINPROGRESS for a non-blocking socket that is still connecting.
  * Otherwise it sleeps until the attempt finishes and returns the socket's
  * so_error.  ERESTART from the sleep is turned into EINTR on return.
  */
 static int
 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 {
 	struct socket *so;
 	struct file *fp;
 	cap_rights_t rights;
 	int error, interrupted = 0;
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 	error = getsock_cap(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_CONNECT), &fp, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	/* A previous connect on this socket has not finished yet. */
 	if (so->so_state & SS_ISCONNECTING) {
 		error = EALREADY;
 		goto done1;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(sa);
 #endif
 #ifdef MAC
 	error = mac_socket_check_connect(td->td_ucred, so, sa);
 	if (error != 0)
 		goto bad;
 #endif
 	if (dirfd == AT_FDCWD)
 		error = soconnect(so, sa, td);
 	else
 		error = soconnectat(dirfd, so, sa, td);
 	if (error != 0)
 		goto bad;
 	/* Non-blocking socket: report progress instead of waiting. */
 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 		error = EINPROGRESS;
 		goto done1;
 	}
 	SOCK_LOCK(so);
 	/* Wait for the attempt to complete, fail, or be interrupted. */
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
 		    "connec", 0);
 		if (error != 0) {
 			if (error == EINTR || error == ERESTART)
 				interrupted = 1;
 			break;
 		}
 	}
 	/* The sleep ended normally: pick up and clear the socket error. */
 	if (error == 0) {
 		error = so->so_error;
 		so->so_error = 0;
 	}
 	SOCK_UNLOCK(so);
 bad:
 	/*
 	 * SS_ISCONNECTING stays set when the sleep was interrupted by a
 	 * signal; in every other case it is cleared here.
 	 */
 	if (!interrupted)
 		so->so_state &= ~SS_ISCONNECTING;
 	if (error == ERESTART)
 		error = EINTR;
 done1:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 kern_connect(struct thread *td, int fd, struct sockaddr *sa)
 {
 
 	return (kern_connectat(td, AT_FDCWD, fd, sa));
 }
 
 /* ARGSUSED */
 int
 sys_connectat(td, uap)
 	struct thread *td;
 	struct connectat_args /* {
 		int	fd;
 		int	s;
 		caddr_t	name;
 		int	namelen;
 	} */ *uap;
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_connectat(td, uap->fd, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 /*
  * Create two sockets of the given domain/type/protocol, connect them to
  * each other and attach them to two new descriptors, whose numbers are
  * returned in rsv[0] and rsv[1].  SOCK_CLOEXEC and SOCK_NONBLOCK may be
  * or'ed into `type'.
  *
  * Returns 0 or an errno value.  On failure everything set up so far is
  * undone through the free4..free1 labels, in reverse order of creation.
  */
 int
 kern_socketpair(struct thread *td, int domain, int type, int protocol,
     int *rsv)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp1, *fp2;
 	struct socket *so1, *so2;
 	int fd, error, oflag, fflag;
 
 	AUDIT_ARG_SOCKET(domain, type, protocol);
 
 	/* Strip the modifier bits from type, remembering what they asked for. */
 	oflag = 0;
 	fflag = 0;
 	if ((type & SOCK_CLOEXEC) != 0) {
 		type &= ~SOCK_CLOEXEC;
 		oflag |= O_CLOEXEC;
 	}
 	if ((type & SOCK_NONBLOCK) != 0) {
 		type &= ~SOCK_NONBLOCK;
 		fflag |= FNONBLOCK;
 	}
 #ifdef MAC
 	/* We might want to have a separate check for socket pairs. */
 	error = mac_socket_check_create(td->td_ucred, domain, type,
 	    protocol);
 	if (error != 0)
 		return (error);
 #endif
 	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
 	if (error != 0)
 		goto free1;
 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
 	error = falloc(td, &fp1, &fd, oflag);
 	if (error != 0)
 		goto free2;
 	rsv[0] = fd;
 	fp1->f_data = so1;	/* so1 already has ref count */
 	error = falloc(td, &fp2, &fd, oflag);
 	if (error != 0)
 		goto free3;
 	fp2->f_data = so2;	/* so2 already has ref count */
 	rsv[1] = fd;
 	error = soconnect2(so1, so2);
 	if (error != 0)
 		goto free4;
 	if (type == SOCK_DGRAM) {
 		/*
 		 * Datagram socket connection is asymmetric.
 		 */
 		 error = soconnect2(so2, so1);
 		 if (error != 0)
 			goto free4;
 	}
 	/* Both ends are connected: turn the file entries into sockets. */
 	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
 	    &socketops);
 	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
 	    &socketops);
 	if ((fflag & FNONBLOCK) != 0) {
 		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
 		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
 	}
 	fdrop(fp1, td);
 	fdrop(fp2, td);
 	return (0);
 	/* Error unwinding: each label undoes one step, then falls through. */
 free4:
 	fdclose(fdp, fp2, rsv[1], td);
 	fdrop(fp2, td);
 free3:
 	fdclose(fdp, fp1, rsv[0], td);
 	fdrop(fp1, td);
 free2:
 	if (so2 != NULL)
 		(void)soclose(so2);
 free1:
 	if (so1 != NULL)
 		(void)soclose(so1);
 	return (error);
 }
 
 int
 sys_socketpair(struct thread *td, struct socketpair_args *uap)
 {
 	int error, sv[2];
 
 	error = kern_socketpair(td, uap->domain, uap->type,
 	    uap->protocol, sv);
 	if (error != 0)
 		return (error);
 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
 	if (error != 0) {
 		(void)kern_close(td, sv[0]);
 		(void)kern_close(td, sv[1]);
 	}
 	return (error);
 }
 
 /*
  * Common front end of the send system calls: bring the destination address
  * and the control data described by the user-supplied msghdr into the
  * kernel, then pass everything to kern_sendit() with UIO_USERSPACE data.
  *
  * mp->msg_name is replaced by the kernel copy of the address, which is
  * freed before returning.  Returns 0 or an errno value.
  */
 static int
 sendit(td, s, mp, flags)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	int flags;
 {
 	struct mbuf *control;
 	struct sockaddr *to;
 	int error;
 
 #ifdef CAPABILITY_MODE
 	/* An explicit destination address is refused in capability mode. */
 	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
 		return (ECAPMODE);
 #endif
 
 	if (mp->msg_name != NULL) {
 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 		if (error != 0) {
 			/* Keep the free() at `bad' harmless. */
 			to = NULL;
 			goto bad;
 		}
 		mp->msg_name = to;
 	} else {
 		to = NULL;
 	}
 
 	if (mp->msg_control) {
 		/*
 		 * Control data shorter than a cmsghdr is rejected, except
 		 * for MSG_COMPAT callers when COMPAT_OLDSOCK is configured.
 		 */
 		if (mp->msg_controllen < sizeof(struct cmsghdr)
 #ifdef COMPAT_OLDSOCK
 		    && mp->msg_flags != MSG_COMPAT
 #endif
 		) {
 			error = EINVAL;
 			goto bad;
 		}
 		error = sockargs(&control, mp->msg_control,
 		    mp->msg_controllen, MT_CONTROL);
 		if (error != 0)
 			goto bad;
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags == MSG_COMPAT) {
 			struct cmsghdr *cm;
 
 			/* Prepend a SOL_SOCKET/SCM_RIGHTS header to the data. */
 			M_PREPEND(control, sizeof(*cm), M_WAITOK);
 			cm = mtod(control, struct cmsghdr *);
 			cm->cmsg_len = control->m_len;
 			cm->cmsg_level = SOL_SOCKET;
 			cm->cmsg_type = SCM_RIGHTS;
 		}
 #endif
 	} else {
 		control = NULL;
 	}
 
 	/* The control chain, if any, is handed over to kern_sendit(). */
 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 
 bad:
 	free(to, M_SONAME);
 	return (error);
 }
 
 /*
  * Send the data described by mp->msg_iov on the socket behind descriptor
  * `s'.  The number of bytes sent is stored in td->td_retval[0].
  *
  * mp: message header; msg_name, if non-NULL, is a kernel-space destination
  *	address (this additionally requires the CAP_CONNECT right).
  * flags: MSG_* flags passed to sosend().
  * control: optional control mbuf chain.  This function takes ownership of
  *	it: it is either passed to sosend() or freed here when an error
  *	occurs before sosend() is reached.
  * segflg: address space of the iovec data.
  *
  * Returns 0 or an errno value.  A partial send cut short by ERESTART,
  * EINTR or EWOULDBLOCK is reported as success.  On EPIPE a SIGPIPE is
  * posted to the thread unless SO_NOSIGPIPE or MSG_NOSIGNAL is set.
  */
 int
 kern_sendit(td, s, mp, flags, control, segflg)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	int flags;
 	struct mbuf *control;
 	enum uio_seg segflg;
 {
 	struct file *fp;
 	struct uio auio;
 	struct iovec *iov;
 	struct socket *so;
 	cap_rights_t rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	ssize_t len;
 	int i, error;
 
 	AUDIT_ARG_FD(s);
 	cap_rights_init(&rights, CAP_SEND);
 	if (mp->msg_name != NULL) {
 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
 		cap_rights_set(&rights, CAP_CONNECT);
 	}
 	error = getsock_cap(td->td_proc->p_fd, s, &rights, &fp, NULL);
 	if (error != 0) {
 		/* No file reference was taken; only the control chain is ours. */
 		if (control != NULL)
 			m_freem(control);
 		return (error);
 	}
 	so = (struct socket *)fp->f_data;
 
 #ifdef KTRACE
 	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(mp->msg_name);
 #endif
 #ifdef MAC
 	if (mp->msg_name != NULL) {
 		error = mac_socket_check_connect(td->td_ucred, so,
 		    mp->msg_name);
 		if (error != 0)
 			goto freectl;
 	}
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto freectl;
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	/* Sum the iovec lengths, rejecting a total that goes negative. */
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			error = EINVAL;
 			goto freectl;
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	/* From here on the control chain belongs to sosend(). */
 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 	if (error != 0) {
 		/* Some data went out before the interruption: not an error. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(s, UIO_WRITE, ktruio, error);
 	}
 #endif
 	goto bad;
 freectl:
 	/*
 	 * sosend() was never reached, so nobody else will release the
 	 * control chain; free it here to avoid leaking the mbufs.
 	 */
 	if (control != NULL)
 		m_freem(control);
 bad:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * sendto(2): describe the caller's single buffer and optional destination
  * address in a msghdr and pass it to sendit().
  */
 int
 sys_sendto(td, uap)
 	struct thread *td;
 	struct sendto_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
 		int	flags;
 		caddr_t	to;
 		int	tolen;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 
 	/* One iovec covering the user buffer. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_name = uap->to;
 	msg.msg_namelen = uap->tolen;
 	msg.msg_control = 0;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags = 0;
 #endif
 	return (sendit(td, uap->s, &msg, uap->flags));
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old send(): a single buffer with neither destination address nor control
  * data, passed to sendit().
  */
 int
 osend(td, uap)
 	struct thread *td;
 	struct osend_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 
 	/* One iovec covering the user buffer. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_control = 0;
 	msg.msg_flags = 0;
 	return (sendit(td, uap->s, &msg, uap->flags));
 }
 
 int
 osendmsg(td, uap)
 	struct thread *td;
 	struct osendmsg_args /* {
 		int	s;
 		caddr_t	msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_iov = iov;
 	msg.msg_flags = MSG_COMPAT;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 sys_sendmsg(td, uap)
 	struct thread *td;
 	struct sendmsg_args /* {
 		int	s;
 		caddr_t	msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_iov = iov;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags = 0;
 #endif
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 
 /*
  * Receive a message on the socket behind descriptor `s' into the iovecs
  * described by mp.  The number of bytes received is stored in
  * td->td_retval[0].
  *
  * mp: message header; msg_flags is passed to soreceive() by reference.
  *	msg_namelen and msg_controllen are updated to the lengths actually
  *	stored.
  * fromseg: how the source address is stored into mp->msg_name: copyout()
  *	for UIO_USERSPACE, bcopy() otherwise.
  * controlp: when non-NULL, receives the control mbuf chain on success.
  *	When NULL, control data is copied out to mp->msg_control (if set)
  *	and the chain is freed here.
  *
  * Returns 0 or an errno value.  A partial receive cut short by ERESTART,
  * EINTR or EWOULDBLOCK is reported as success.
  */
 int
 kern_recvit(td, s, mp, fromseg, controlp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	enum uio_seg fromseg;
 	struct mbuf **controlp;
 {
 	struct uio auio;
 	struct iovec *iov;
 	struct mbuf *m, *control = NULL;
 	caddr_t ctlbuf;
 	struct file *fp;
 	struct socket *so;
 	struct sockaddr *fromsa = NULL;
 	cap_rights_t rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	ssize_t len;
 	int error, i;
 
 	if (controlp != NULL)
 		*controlp = NULL;
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td->td_proc->p_fd, s,
 	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 
 #ifdef MAC
 	error = mac_socket_check_receive(td->td_ucred, so);
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	/* Sum the iovec lengths, rejecting a total that goes negative. */
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	/* Control data is requested only if somebody will consume it. */
 	error = soreceive(so, &fromsa, &auio, NULL,
 	    (mp->msg_control || controlp) ? &control : NULL,
 	    &mp->msg_flags);
 	if (error != 0) {
 		/* Some data arrived before the interruption: not an error. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 	if (fromsa != NULL)
 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = len - auio.uio_resid;
 		ktrgenio(s, UIO_READ, ktruio, error);
 	}
 #endif
 	if (error != 0)
 		goto out;
 	td->td_retval[0] = len - auio.uio_resid;
 	/* Hand back the source address, truncated to the caller's buffer. */
 	if (mp->msg_name) {
 		len = mp->msg_namelen;
 		if (len <= 0 || fromsa == NULL)
 			len = 0;
 		else {
 			/* save sa_len before it is destroyed by MSG_COMPAT */
 			len = MIN(len, fromsa->sa_len);
 #ifdef COMPAT_OLDSOCK
 			if (mp->msg_flags & MSG_COMPAT)
 				((struct osockaddr *)fromsa)->sa_family =
 				    fromsa->sa_family;
 #endif
 			if (fromseg == UIO_USERSPACE) {
 				error = copyout(fromsa, mp->msg_name,
 				    (unsigned)len);
 				if (error != 0)
 					goto out;
 			} else
 				bcopy(fromsa, mp->msg_name, len);
 		}
 		mp->msg_namelen = len;
 	}
 	/* Copy control data out only when the caller did not ask for the chain. */
 	if (mp->msg_control && controlp == NULL) {
 #ifdef COMPAT_OLDSOCK
 		/*
 		 * We assume that old recvmsg calls won't receive access
 		 * rights and other control info, esp. as control info
 		 * is always optional and those options didn't exist in 4.3.
 		 * If we receive rights, trim the cmsghdr; anything else
 		 * is tossed.
 		 */
 		if (control && mp->msg_flags & MSG_COMPAT) {
 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
 			    SOL_SOCKET ||
 			    mtod(control, struct cmsghdr *)->cmsg_type !=
 			    SCM_RIGHTS) {
 				mp->msg_controllen = 0;
 				goto out;
 			}
 			control->m_len -= sizeof (struct cmsghdr);
 			control->m_data += sizeof (struct cmsghdr);
 		}
 #endif
 		len = mp->msg_controllen;
 		m = control;
 		mp->msg_controllen = 0;
 		ctlbuf = mp->msg_control;
 
 		/* Walk the chain until it ends or the user buffer is full. */
 		while (m && len > 0) {
 			unsigned int tocopy;
 
 			if (len >= m->m_len)
 				tocopy = m->m_len;
 			else {
 				/* Buffer too small: flag the truncation. */
 				mp->msg_flags |= MSG_CTRUNC;
 				tocopy = len;
 			}
 
 			if ((error = copyout(mtod(m, caddr_t),
 					ctlbuf, tocopy)) != 0)
 				goto out;
 
 			ctlbuf += tocopy;
 			len -= tocopy;
 			m = m->m_next;
 		}
 		/* Report how many control bytes were actually stored. */
 		mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
 	}
 out:
 	fdrop(fp, td);
 #ifdef KTRACE
 	if (fromsa && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(fromsa);
 #endif
 	free(fromsa, M_SONAME);
 
 	/* Give the chain to the caller on success, otherwise release it. */
 	if (error == 0 && controlp != NULL)
 		*controlp = control;
 	else  if (control)
 		m_freem(control);
 
 	return (error);
 }
 
 /*
  * Common front end of the receive system calls: run kern_recvit() with
  * user-space addresses and, when namelenp is non-NULL, copy the resulting
  * address length out to it.  For MSG_COMPAT callers (COMPAT_OLDSOCK only)
  * a failure of that final copyout is ignored.
  */
 static int
 recvit(td, s, mp, namelenp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
 	void *namelenp;
 {
 	int error;
 
 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
 	if (error != 0 || namelenp == NULL)
 		return (error);
 
 	error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
 #ifdef COMPAT_OLDSOCK
 	if (mp->msg_flags & MSG_COMPAT)
 		error = 0;	/* old recvfrom didn't check */
 #endif
 	return (error);
 }
 
 /*
  * recvfrom(2): describe the caller's single buffer and optional source
  * address buffer in a msghdr and pass it to recvit().  The address buffer
  * length is read from uap->fromlenaddr when that pointer is set, and is 0
  * otherwise.
  */
 int
 sys_recvfrom(td, uap)
 	struct thread *td;
 	struct recvfrom_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
 		int	flags;
 		struct sockaddr * __restrict	from;
 		socklen_t * __restrict fromlenaddr;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	if (uap->fromlenaddr) {
 		error = copyin(uap->fromlenaddr,
 		    &msg.msg_namelen, sizeof (msg.msg_namelen));
 		if (error != 0)
 			return (error);
 	} else {
 		msg.msg_namelen = 0;
 	}
 
 	/* One iovec covering the user buffer. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_name = uap->from;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
 	return (recvit(td, uap->s, &msg, uap->fromlenaddr));
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old recvfrom(): sets MSG_COMPAT in the caller's flags and then runs the
  * regular sys_recvfrom().
  */
 int
 orecvfrom(td, uap)
 	struct thread *td;
 	struct recvfrom_args *uap;
 {
 	int error;
 
 	uap->flags |= MSG_COMPAT;
 	error = sys_recvfrom(td, uap);
 	return (error);
 }
 #endif
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old recv(): a single buffer with neither source address nor control
  * data, passed to recvit().
  */
 int
 orecv(td, uap)
 	struct thread *td;
 	struct orecv_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec aiov;
 
 	/* One iovec covering the user buffer. */
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
 	return (recvit(td, uap->s, &msg, NULL));
 }
 
 /*
  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
  * overlays the new one, missing only the flags, and with the (old) access
  * rights where the control fields are now.
  */
 int
 orecvmsg(td, uap)
 	struct thread *td;
 	struct orecvmsg_args /* {
 		int	s;
 		struct	omsghdr *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_flags = uap->flags | MSG_COMPAT;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
 	if (msg.msg_controllen && error == 0)
 		error = copyout(&msg.msg_controllen,
 		    &uap->msg->msg_accrightslen, sizeof (int));
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 sys_recvmsg(td, uap)
 	struct thread *td;
 	struct recvmsg_args /* {
 		int	s;
 		struct	msghdr *msg;
 		int	flags;
 	} */ *uap;
 {
 	struct msghdr msg;
 	struct iovec *uiov, *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_flags = uap->flags;
 #ifdef COMPAT_OLDSOCK
 	msg.msg_flags &= ~MSG_COMPAT;
 #endif
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, NULL);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 		error = copyout(&msg, uap->msg, sizeof(msg));
 	}
 	free(iov, M_IOV);
 	return (error);
 }
 
 /*
  * shutdown(2) system call.
  *
  * Arguments (struct shutdown_args):
  *	int	s;
  *	int	how;
  *
  * Looks up the socket with CAP_SHUTDOWN rights and calls soshutdown().
  */
 /* ARGSUSED */
 int
 sys_shutdown(struct thread *td, struct shutdown_args *uap)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	struct socket *so;
 	int error;
 
 	AUDIT_ARG_FD(uap->s);
 	error = getsock_cap(td->td_proc->p_fd, uap->s,
 	    cap_rights_init(&rights, CAP_SHUTDOWN), &fp, NULL);
 	if (error != 0)
 		return (error);
 
 	so = fp->f_data;
 	error = soshutdown(so, uap->how);
 	fdrop(fp, td);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 sys_setsockopt(td, uap)
 	struct thread *td;
 	struct setsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
 		caddr_t	val;
 		int	valsize;
 	} */ *uap;
 {
 
 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, uap->valsize));
 }
 
 /*
  * Kernel version of setsockopt.
  *
  * val may point into user or kernel space, as selected by valseg.
  * Returns EFAULT for a NULL value with a non-zero size, EINVAL when the
  * size is negative as an int, otherwise the result of the descriptor
  * lookup or of sosetopt().  Panics on an unknown valseg.
  */
 int
 kern_setsockopt(struct thread *td, int s, int level, int name, void *val,
     enum uio_seg valseg, socklen_t valsize)
 {
 	cap_rights_t rights;
 	struct sockopt sopt;
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	if (valsize != 0 && val == NULL)
 		return (EFAULT);
 	if ((int)valsize < 0)
 		return (EINVAL);
 
 	/* sopt_td is the thread for user-space values, NULL for kernel ones. */
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_setsockopt called with bad valseg");
 	}
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = valsize;
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td->td_proc->p_fd, s,
 	    cap_rights_init(&rights, CAP_SETSOCKOPT), &fp, NULL);
 	if (error != 0)
 		return (error);
 
 	so = fp->f_data;
 	error = sosetopt(so, &sopt);
 	fdrop(fp, td);
 	return (error);
 }
 
 /* ARGSUSED */
 int
 sys_getsockopt(td, uap)
 	struct thread *td;
 	struct getsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
 		void * __restrict	val;
 		socklen_t * __restrict avalsize;
 	} */ *uap;
 {
 	socklen_t valsize;
 	int error;
 
 	if (uap->val) {
 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
 		if (error != 0)
 			return (error);
 	}
 
 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, &valsize);
 
 	if (error == 0)
 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
 	return (error);
 }
 
 /*
  * Kernel version of getsockopt.
  *
  * val may be a user-space or a kernel-space pointer, as selected by
  * valseg; valsize is always a kernel pointer.  When val is NULL the
  * size is forced to 0.  After a successful descriptor lookup *valsize
  * is updated from the sockopt, whatever sogetopt() returned.
  */
 int
 kern_getsockopt(struct thread *td, int s, int level, int name, void *val,
     enum uio_seg valseg, socklen_t *valsize)
 {
 	cap_rights_t rights;
 	struct sockopt sopt;
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	if (val == NULL)
 		*valsize = 0;
 	if ((int)*valsize < 0)
 		return (EINVAL);
 
 	/* sopt_td is the thread for user-space values, NULL for kernel ones. */
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_getsockopt called with bad valseg");
 	}
 	sopt.sopt_dir = SOPT_GET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td->td_proc->p_fd, s,
 	    cap_rights_init(&rights, CAP_GETSOCKOPT), &fp, NULL);
 	if (error != 0)
 		return (error);
 
 	so = fp->f_data;
 	error = sogetopt(so, &sopt);
 	*valsize = sopt.sopt_valsize;
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * getsockname1() - Get socket name.
  */
 /* ARGSUSED */
 static int
 getsockname1(td, uap, compat)
 	struct thread *td;
 	struct getsockname_args /* {
 		int	fdes;
 		struct sockaddr * __restrict asa;
 		socklen_t * __restrict alen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof(len));
 	if (error != 0)
 		return (error);
 
 	error = kern_getsockname(td, uap->fdes, &sa, &len);
 	if (error != 0)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, uap->asa, (u_int)len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, uap->alen, sizeof(len));
 	return (error);
 }
 
 /*
  * Kernel backend for getsockname(2).
  *
  * Looks up descriptor fd with CAP_GETSOCKNAME rights and asks the
  * protocol's pru_sockaddr method for the local address.  On success
  * *sa holds the address returned by the protocol (the callers in this
  * file release it with free(..., M_SONAME)) and *alen is clamped to
  * MIN(*alen, sa_len), or set to 0 when the protocol returned no
  * address.  On failure after the lookup *sa is NULL and nothing needs
  * to be freed.
  */
 int
 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	cap_rights_t rights;
 	socklen_t len;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getsock_cap(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_GETSOCKNAME), &fp, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	*sa = NULL;
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
 	CURVNET_RESTORE();
 	if (error != 0)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 #ifdef KTRACE
 	/*
 	 * A successful lookup may legitimately leave *sa NULL (handled
 	 * above), so only trace an address that actually exists.
 	 */
 	if (*sa != NULL && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(*sa);
 #endif
 bad:
 	fdrop(fp, td);
 	if (error != 0 && *sa != NULL) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 	return (error);
 }
 
 /*
  * getsockname(2) system call: getsockname1() without the old-sockaddr
  * compatibility rewrite.
  */
 int
 sys_getsockname(struct thread *td, struct getsockname_args *uap)
 {
 	int error;
 
 	error = getsockname1(td, uap, 0);
 	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old (COMPAT_OLDSOCK) getsockname: getsockname1() with the
  * old-sockaddr compatibility rewrite enabled.
  */
 int
 ogetsockname(struct thread *td, struct getsockname_args *uap)
 {
 	int error;
 
 	error = getsockname1(td, uap, 1);
 	return (error);
 }
 #endif /* COMPAT_OLDSOCK */
 
 /*
  * getpeername1() - Get name of peer for connected socket.
  */
 /* ARGSUSED */
 static int
 getpeername1(td, uap, compat)
 	struct thread *td;
 	struct getpeername_args /* {
 		int	fdes;
 		struct sockaddr * __restrict	asa;
 		socklen_t * __restrict	alen;
 	} */ *uap;
 	int compat;
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(uap->alen, &len, sizeof (len));
 	if (error != 0)
 		return (error);
 
 	error = kern_getpeername(td, uap->fdes, &sa, &len);
 	if (error != 0)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat)
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, uap->asa, (u_int)len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, uap->alen, sizeof(len));
 	return (error);
 }
 
 /*
  * Kernel backend for getpeername(2).
  *
  * Looks up descriptor fd with CAP_GETPEERNAME rights, requires the
  * socket to be connected (or confirming) and asks the protocol's
  * pru_peeraddr method for the peer address.  On success *sa holds the
  * address returned by the protocol (the caller in this file releases it
  * with free(..., M_SONAME)) and *alen is clamped to MIN(*alen, sa_len),
  * or set to 0 when no address was returned.  On any failure after the
  * descriptor lookup, including ENOTCONN, *sa is NULL.
  */
 int
 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	cap_rights_t rights;
 	socklen_t len;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getsock_cap(td->td_proc->p_fd, fd,
 	    cap_rights_init(&rights, CAP_GETPEERNAME), &fp, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	/* Define *sa before the first exit that can follow the lookup. */
 	*sa = NULL;
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
 		error = ENOTCONN;
 		goto done;
 	}
 	CURVNET_SET(so->so_vnet);
 	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
 	CURVNET_RESTORE();
 	if (error != 0)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 #ifdef KTRACE
 	/*
 	 * A successful lookup may legitimately leave *sa NULL (handled
 	 * above), so only trace an address that actually exists.
 	 */
 	if (*sa != NULL && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(*sa);
 #endif
 bad:
 	if (error != 0 && *sa != NULL) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 done:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * getpeername(2) system call: getpeername1() without the old-sockaddr
  * compatibility rewrite.
  */
 int
 sys_getpeername(struct thread *td, struct getpeername_args *uap)
 {
 	int error;
 
 	error = getpeername1(td, uap, 0);
 	return (error);
 }
 
 #ifdef COMPAT_OLDSOCK
 /*
  * Old (COMPAT_OLDSOCK) getpeername: getpeername1() with the
  * old-sockaddr compatibility rewrite enabled.
  */
 int
 ogetpeername(struct thread *td, struct ogetpeername_args *uap)
 {
 	struct getpeername_args *args;
 
 	/* XXX uap should have type `getpeername_args *' to begin with. */
 	args = (struct getpeername_args *)uap;
 	return (getpeername1(td, args, 1));
 }
 #endif /* COMPAT_OLDSOCK */
 
 /*
  * Copy buflen bytes of user data at buf into a freshly allocated mbuf
  * of the given type and return it through *mp.
  *
  * Returns EINVAL when buflen is negative or larger than MCLBYTES (with
  * the COMPAT_OLDSOCK exception below), or the copyin() error; in those
  * cases *mp is left untouched.  For MT_SONAME the data is treated as a
  * struct sockaddr whose sa_len is overwritten with buflen.
  */
 int
 sockargs(struct mbuf **mp, caddr_t buf, int buflen, int type)
 {
 	struct sockaddr *sa;
 	struct mbuf *m;
 	int error;
 
 	/*
 	 * A negative length would slip past the upper-bound checks below,
 	 * end up as a negative m_len and, once cast to u_int, as a huge
 	 * copyin() length.  Reject it up front.
 	 */
 	if (buflen < 0)
 		return (EINVAL);
 	if (buflen > MLEN) {
 #ifdef COMPAT_OLDSOCK
 		if (type == MT_SONAME && buflen <= 112)
 			buflen = MLEN;		/* unix domain compat. hack */
 		else
 #endif
 			if (buflen > MCLBYTES)
 				return (EINVAL);
 	}
 	m = m_get2(buflen, M_WAITOK, type, 0);
 	m->m_len = buflen;
 	error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
 	if (error != 0)
 		(void) m_free(m);
 	else {
 		*mp = m;
 		if (type == MT_SONAME) {
 			sa = mtod(m, struct sockaddr *);
 
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 			/*
 			 * An old-style sockaddr has the family where sa_len
 			 * now is; move it into place.
 			 */
 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
 				sa->sa_family = sa->sa_len;
 #endif
 			sa->sa_len = buflen;
 		}
 	}
 	return (error);
 }
 
 /*
  * Copy a socket address of len bytes in from user space.
  *
  * Returns ENAMETOOLONG when len exceeds SOCK_MAXADDRLEN, EINVAL when it
  * is too short to reach sa_data, or the copyin() error.  On success
  * *namp points at an M_SONAME allocation whose sa_len has been set to
  * len.
  */
 int
 getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len)
 {
 	struct sockaddr *addr;
 	int error;
 
 	if (len > SOCK_MAXADDRLEN)
 		return (ENAMETOOLONG);
 	if (len < offsetof(struct sockaddr, sa_data[0]))
 		return (EINVAL);
 
 	addr = malloc(len, M_SONAME, M_WAITOK);
 	error = copyin(uaddr, addr, len);
 	if (error != 0) {
 		free(addr, M_SONAME);
 		return (error);
 	}
 
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 	/* An old-style sockaddr has the family where sa_len now is. */
 	if (addr->sa_family == 0 && addr->sa_len < AF_MAX)
 		addr->sa_family = addr->sa_len;
 #endif
 	addr->sa_len = len;
 	*namp = addr;
 	return (0);
 }
 
 static int
 filt_sfsync_attach(struct knote *kn)
 {
 	struct sendfile_sync *sfs = (struct sendfile_sync *) kn->kn_sdata;
 	struct knlist *knl = &sfs->klist;
 
 	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
 
 	/*
 	 * Validate that we actually received this via the kernel API.
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
 
 	kn->kn_ptr.p_v = sfs;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knl->kl_lock(knl->kl_lockarg);
 	/*
 	 * If we're in the "freeing" state,
 	 * don't allow the add.  That way we don't
 	 * end up racing with some other thread that
 	 * is trying to finish some setup.
 	 */
 	if (sfs->state == SF_STATE_FREEING) {
 		knl->kl_unlock(knl->kl_lockarg);
 		return (EINVAL);
 	}
 	knlist_add(&sfs->klist, kn, 1);
 	knl->kl_unlock(knl->kl_lockarg);
 
 	return (0);
 }
 
 /*
  * Called when a knote is being detached.
  *
  * Removes the knote from the sendfile_sync's list.  If that leaves the
  * list empty while the transaction is COMPLETED with no outstanding
  * references, this thread moves the sendfile_sync to FREEING under the
  * list lock and frees it after dropping the lock.
  */
 static void
 filt_sfsync_detach(struct knote *kn)
 {
 	struct sendfile_sync *sfs;
 	struct knlist *knl;
 	int we_free;
 
 	sfs = kn->kn_ptr.p_v;
 	knl = &sfs->klist;
 	we_free = 0;
 
 	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
 
 	knl->kl_lock(knl->kl_lockarg);
 	if (!knlist_empty(knl))
 		knlist_remove(knl, kn, 1);
 
 	/*
 	 * Empty list, COMPLETED state and a zero refcount together mean
 	 * nobody else will release the sendfile_sync.  Mark it FREEING
 	 * while the lock is still held; the actual free happens below,
 	 * after the knote has been fully divorced and the lock dropped.
 	 */
 	if (knlist_empty(knl) && sfs->state == SF_STATE_COMPLETED &&
 	    sfs->count == 0) {
 		SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
 		    "count==0, empty list: time to free!\n",
 		    __func__,
 		    (unsigned long long) curthread->td_tid,
 		    sfs);
 		sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
 		we_free = 1;
 	}
 	knl->kl_unlock(knl->kl_lockarg);
 
 	/*
 	 * Free only if this thread made the transition to FREEING;
 	 * otherwise another thread may be tearing things down already.
 	 */
 	if (we_free == 1) {
 		SFSYNC_DPRINTF("%s: (%llu) sfs=%p, %s:%d\n",
 		    __func__,
 		    (unsigned long long) curthread->td_tid,
 		    sfs,
 		    __FILE__,
 		    __LINE__);
 		sf_sync_free(sfs);
 	}
 }
 
 /*
  * Knote event filter: reports the event as ready (non-zero) once the
  * sendfile_sync is COMPLETED and has no outstanding references.  The
  * hint argument is not used.
  */
 static int
 filt_sfsync(struct knote *kn, long hint)
 {
 	struct sendfile_sync *sfs;
 
 	sfs = (struct sendfile_sync *) kn->kn_ptr.p_v;
 
 	SFSYNC_DPRINTF("%s: kn=%p, sfs=%p\n", __func__, kn, sfs);
 
 	/*
 	 * XXX add a lock assertion here!
 	 */
 	return (sfs->count == 0 && sfs->state == SF_STATE_COMPLETED);
 }
 
 
 /*
  * Detach mapped page and release resources back to the system.
  *
  * mb is not used.  addr, when non-NULL, is the sendfile_sync whose
  * reference is dropped once the page has been released.  args is the
  * sf_buf handle passed to sf_buf_page() and sf_buf_free().
  *
  * Always returns EXT_FREE_OK.
  */
 int
 sf_buf_mext(struct mbuf *mb, void *addr, void *args)
 {
 	vm_page_t m;
 	struct sendfile_sync *sfs;
 
 	/* Fetch the page before the sf_buf that maps it is released. */
 	m = sf_buf_page(args);
 	sf_buf_free(args);
 	vm_page_lock(m);
-	vm_page_unwire(m, 0);
+	vm_page_unwire(m, PQ_INACTIVE);
 	/*
 	 * Check for the object going away on us. This can
 	 * happen since we don't hold a reference to it.
 	 * If so, we're responsible for freeing the page.
 	 */
 	if (m->wire_count == 0 && m->object == NULL)
 		vm_page_free(m);
 	vm_page_unlock(m);
 	if (addr != NULL) {
 		sfs = addr;
 		sf_sync_deref(sfs);
 	}
 	/*
 	 * sfs may be invalid at this point, don't use it!
 	 */
 	return (EXT_FREE_OK);
 }
 
 /*
  * Called to remove a reference to a sf_sync object.
  *
  * This is generally done during the mbuf free path to signify
  * that one of the mbufs in the transaction has been completed.
  *
  * When the count reaches zero in the COMPLETED state:
  *  - with SF_SYNC, the condition variable is signalled;
  *  - with SF_KQUEUE, the knote list is fired;
  *  - without SF_SYNC and with an empty knote list, the object is moved
  *    to FREEING and freed here.
  *
  * A NULL sfs is ignored.
  */
 void
 sf_sync_deref(struct sendfile_sync *sfs)
 {
 	int self_free;
 
 	if (sfs == NULL)
 		return;
 
 	self_free = 0;
 
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
 	sfs->count--;
 
 	/*
 	 * Wakeups and kqueue notifications are only delivered once the
 	 * transaction has reached the COMPLETED state.
 	 */
 	if (sfs->state == SF_STATE_COMPLETED && sfs->count == 0) {
 		if ((sfs->flags & SF_SYNC) != 0)
 			cv_signal(&sfs->cv);
 
 		if ((sfs->flags & SF_KQUEUE) != 0) {
 			SFSYNC_DPRINTF("%s: (%llu) sfs=%p: knote!\n",
 			    __func__,
 			    (unsigned long long) curthread->td_tid,
 			    sfs);
 			KNOTE_LOCKED(&sfs->klist, 1);
 		}
 
 		/*
 		 * With SF_SYNC the waiting caller drops the last reference
 		 * and frees the object, so we must not.  Otherwise, if no
 		 * knote is attached either, nobody else will: transition
 		 * to FREEING and free below.
 		 *
 		 * XXX this should be codified in a "self-free" flag rather
 		 * than by testing SF_SYNC everywhere.
 		 */
 		if ((sfs->flags & SF_SYNC) == 0 &&
 		    knlist_empty(&sfs->klist)) {
 			SFSYNC_DPRINTF("%s: (%llu) sfs=%p; completed, "
 			    "count==0, empty list: time to free!\n",
 			    __func__,
 			    (unsigned long long) curthread->td_tid,
 			    sfs);
 			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
 			self_free = 1;
 		}
 	}
 	mtx_unlock(&sfs->mtx);
 
 	/*
 	 * The free is done outside of the lock because sf_sync_free() may
 	 * destroy the lock in question.
 	 *
 	 * XXX yes, we should make it a requirement to hold the
 	 * lock across sf_sync_free().
 	 */
 	if (self_free == 1) {
 		SFSYNC_DPRINTF("%s: (%llu) sfs=%p\n",
 		    __func__,
 		    (unsigned long long) curthread->td_tid,
 		    sfs);
 		sf_sync_free(sfs);
 	}
 }
 
 /*
  * Allocate a sendfile_sync state structure.
  *
  * The structure comes zeroed from zone_sfsync, gets its mutex,
  * condition variable and knote list initialised, records the given
  * flags and starts out in the SETUP state.
  */
 struct sendfile_sync *
 sf_sync_alloc(uint32_t flags)
 {
 	struct sendfile_sync *sync;
 
 	sync = uma_zalloc(zone_sfsync, M_WAITOK | M_ZERO);
 	sync->flags = flags;
 	sync->state = SF_STATE_SETUP;
 
 	/* The knote list shares the structure's own mutex. */
 	mtx_init(&sync->mtx, "sendfile", NULL, MTX_DEF);
 	cv_init(&sync->cv, "sendfile");
 	knlist_init_mtx(&sync->klist, &sync->mtx);
 
 	SFSYNC_DPRINTF("%s: sfs=%p, flags=0x%08x\n", __func__, sync,
 	    sync->flags);
 
 	return (sync);
 }
 
 /*
  * Take a reference to a sfsync instance.
  *
  * Each reference must be matched by one free call coming in via
  * sf_buf_mext(), so typically one is taken per mbuf allocated.
  * A NULL sfs is ignored.
  */
 void
 sf_sync_ref(struct sendfile_sync *sfs)
 {
 
 	if (sfs != NULL) {
 		mtx_lock(&sfs->mtx);
 		sfs->count++;
 		mtx_unlock(&sfs->mtx);
 	}
 }
 
 /*
  * Sleep, if SF_SYNC was requested, until the outstanding references of
  * a sendfile_sync have been dropped.
  *
  * The caller must hold sfs->mtx.  Nothing happens for a NULL sfs, when
  * SF_SYNC is not set, or when the state is neither RUNNING nor
  * COMPLETED.
  */
 void
 sf_sync_syscall_wait(struct sendfile_sync *sfs)
 {
 
 	if (sfs == NULL)
 		return;
 
 	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
 	    __func__,
 	    sfs));
 
 	/* No wait was requested for the duration of the syscall. */
 	if ((sfs->flags & SF_SYNC) == 0)
 		return;
 
 	/*
 	 * Ideally this waits until all pending mbuf transmit operations
 	 * are done.  Once sendfile becomes async it will move from RUNNING
 	 * to COMPLETED when it has finished acquiring new things to send;
 	 * COMPLETED together with a zero count then means no further work
 	 * is being done.
 	 *
 	 * So we sleep in both RUNNING and COMPLETED.  It is up to the
 	 * (in progress) async sendfile loop to make the transition to
 	 * COMPLETED so that the wakeup side actually issues cv_signal().
 	 */
 	if (sfs->state != SF_STATE_RUNNING &&
 	    sfs->state != SF_STATE_COMPLETED)
 		return;
 
 	if (sfs->count != 0)
 		cv_wait(&sfs->cv, &sfs->mtx);
 	KASSERT(sfs->count == 0, ("sendfile sync still busy"));
 }
 
 /*
  * Free an sf_sync if it's appropriate to.
  *
  * The object is only destroyed when it is in the FREEING state; a
  * caller that wants it gone must make that transition first.  In any
  * other state a complaint is printed and the object is left alone.
  * A NULL sfs is ignored.
  */
 void
 sf_sync_free(struct sendfile_sync *sfs)
 {
 
 	if (sfs == NULL)
 		return;
 
 	SFSYNC_DPRINTF("%s: (%lld) sfs=%p; called; state=%d, flags=0x%08x "
 	    "count=%d\n",
 	    __func__,
 	    (long long) curthread->td_tid,
 	    sfs,
 	    sfs->state,
 	    sfs->flags,
 	    sfs->count);
 
 	mtx_lock(&sfs->mtx);
 
 	if (sfs->state == SF_STATE_FREEING) {
 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
 		cv_destroy(&sfs->cv);
 		/*
 		 * This doesn't call knlist_detach() on each knote; it
 		 * just frees the entire list.
 		 */
 		knlist_delete(&sfs->klist, curthread, 1);
 		mtx_destroy(&sfs->mtx);
 		SFSYNC_DPRINTF("%s: (%llu) sfs=%p; freeing\n",
 		    __func__,
 		    (unsigned long long) curthread->td_tid,
 		    sfs);
 		uma_zfree(zone_sfsync, sfs);
 	} else {
 		/*
 		 * The sf_sync is kept around while it is active, e.g.
 		 * while kqueue notification is in use and knotes are
 		 * attached.  Complain loudly about callers that did not
 		 * move it to FREEING first.
 		 */
 		printf("%s: (%llu) sfs=%p; not freeing; let's wait!\n",
 		    __func__,
 		    (unsigned long long) curthread->td_tid,
 		    sfs);
 		mtx_unlock(&sfs->mtx);
 	}
 }
 
 /*
  * Setup a sf_sync to post a kqueue notification when things are complete.
  *
  * Marks sfs with SF_KQUEUE, then registers an EVFILT_SENDFILE knote on
  * the kqueue descriptor named in sfkq.  Returns EINVAL when sfkq asks
  * for flags other than EV_CLEAR, EV_DISPATCH or EV_ONESHOT, otherwise
  * the result of kqfd_register().
  */
 int
 sf_sync_kqueue_setup(struct sendfile_sync *sfs, struct sf_hdtr_kq *sfkq)
 {
 	struct kevent kev;
 	int error;
 
 	sfs->flags |= SF_KQUEUE;
 
 	/* Check the flags are valid */
 	if ((sfkq->kq_flags & ~(EV_CLEAR | EV_DISPATCH | EV_ONESHOT)) != 0)
 		return (EINVAL);
 
 	SFSYNC_DPRINTF("%s: sfs=%p: kqfd=%d, flags=0x%08x, ident=%p, udata=%p\n",
 	    __func__,
 	    sfs,
 	    sfkq->kq_fd,
 	    sfkq->kq_flags,
 	    (void *) sfkq->kq_ident,
 	    (void *) sfkq->kq_udata);
 
 	/*
 	 * Build the event.  EV_FLAG1 is what filt_sfsync_attach() checks
 	 * for, and the sendfile_sync itself travels in the data field.
 	 */
 	kev.filter = EVFILT_SENDFILE;
 	kev.ident = (uintptr_t) sfkq->kq_ident;
 	kev.udata = sfkq->kq_udata;
 	kev.data = (intptr_t) sfs;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1 | sfkq->kq_flags;
 
 	/* Register the knote on the given kqfd. */
 	error = kqfd_register(sfkq->kq_fd, &kev, curthread, 1);
 	if (error != 0) {
 		SFSYNC_DPRINTF("%s: returned %d\n", __func__, error);
 	}
 	return (error);
 }
 
 /*
  * Move a sendfile_sync to a new state.
  *
  * islocked tells whether the caller already holds sfs->mtx; when it is
  * zero the mutex is taken and released here.  A RUNNING -> COMPLETED
  * transition with a zero count and SF_KQUEUE set fires the knote list.
  */
 void
 sf_sync_set_state(struct sendfile_sync *sfs, sendfile_sync_state_t state,
     int islocked)
 {
 	sendfile_sync_state_t prev;
 
 	if (islocked == 0)
 		mtx_lock(&sfs->mtx);
 
 	/* Record the transition. */
 	prev = sfs->state;
 	sfs->state = state;
 	SFSYNC_DPRINTF("%s: (%llu) sfs=%p; going from %d to %d\n",
 	    __func__,
 	    (unsigned long long) curthread->td_tid,
 	    sfs,
 	    prev,
 	    state);
 
 	/*
 	 * The send may have finished before the state reached COMPLETED,
 	 * in which case nobody has posted the knote yet; do it now so the
 	 * completion is not lost.
 	 */
 	if (prev == SF_STATE_RUNNING && state == SF_STATE_COMPLETED &&
 	    sfs->count == 0 && (sfs->flags & SF_KQUEUE) != 0) {
 		SFSYNC_DPRINTF("%s: (%llu) sfs=%p: triggering knote!\n",
 		    __func__,
 		    (unsigned long long) curthread->td_tid,
 		    sfs);
 		KNOTE_LOCKED(&sfs->klist, 1);
 	}
 
 	if (islocked == 0)
 		mtx_unlock(&sfs->mtx);
 }
 
 /*
  * Set the retval/errno for the given transaction.
  *
  * This will eventually/ideally be used when the KNOTE is fired off
  * to signify the completion of this transaction.
  *
  * The sfsync lock should be held before entering this function.
  */
 void
 sf_sync_set_retval(struct sendfile_sync *sfs, off_t retval, int xerrno)
 {
 
 	KASSERT(mtx_owned(&sfs->mtx), ("%s: sfs=%p: not locked but should be!",
 	    __func__,
 	    sfs));
 
 	SFSYNC_DPRINTF("%s: (%llu) sfs=%p: errno=%d, retval=%jd\n",
 	    __func__,
 	    (unsigned long long) curthread->td_tid,
 	    sfs,
 	    xerrno,
 	    (intmax_t) retval);
 
 	/* Store the outcome in the sendfile_sync. */
 	sfs->xerrno = xerrno;
 	sfs->retval = retval;
 }
 
 /*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Send a file specified by 'fd' and starting at 'offset' to a socket
  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
  * 0.  Optionally add a header and/or trailer to the socket output.  If
  * specified, write the total number of bytes sent into *sbytes.
  *
  * This entry point runs do_sendfile() in non-compat mode.
  */
 int
 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 {
 	int error;
 
 	error = do_sendfile(td, uap, 0);
 	return (error);
 }
 
 /*
  * Common sendfile backend working on kernel-resident arguments.
  *
  * Looks up src_fd for reading (CAP_PREAD), sets up an optional
  * sendfile_sync for SF_SYNC / SF_KQUEUE, performs the transfer through
  * fo_sendfile() and then drives the sendfile_sync state machine.
  * hdtr_kq carries the kqueue registration data and must be non-NULL
  * when SF_KQUEUE is set in flags; otherwise EINVAL is returned.
  *
  * The reference taken on the source file is dropped on every path that
  * follows a successful lookup.
  *
  * Returns 0 or an errno value.
  */
 int
 _do_sendfile(struct thread *td, int src_fd, int sock_fd, int flags,
     int compat, off_t offset, size_t nbytes, off_t *sbytes,
     struct uio *hdr_uio,
     struct uio *trl_uio, struct sf_hdtr_kq *hdtr_kq)
 {
 	cap_rights_t rights;
 	struct sendfile_sync *sfs = NULL;
 	struct file *fp;
 	int error;
 	int do_kqueue = 0;
 	int do_free = 0;
 
 	AUDIT_ARG_FD(src_fd);
 
 	if (hdtr_kq != NULL)
 		do_kqueue = 1;
 
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
 	 */
 	if ((error = fget_read(td, src_fd,
 	    cap_rights_init(&rights, CAP_PREAD), &fp)) != 0) {
 		goto out;
 	}
 
 	/*
 	 * IF SF_KQUEUE is set but we haven't copied in anything for
 	 * kqueue data, error out.  This is a caller error, so report it
 	 * as such instead of returning success, and release the file
 	 * reference acquired above.
 	 */
 	if (flags & SF_KQUEUE && do_kqueue == 0) {
 		SFSYNC_DPRINTF("%s: SF_KQUEUE but no KQUEUE data!\n", __func__);
 		error = EINVAL;
 		goto out_fp;
 	}
 
 	/*
 	 * If we need to wait for completion, initialise the sfsync
 	 * state here.
 	 */
 	if (flags & (SF_SYNC | SF_KQUEUE))
 		sfs = sf_sync_alloc(flags & (SF_SYNC | SF_KQUEUE));
 
 	if (flags & SF_KQUEUE) {
 		error = sf_sync_kqueue_setup(sfs, hdtr_kq);
 		if (error) {
 			SFSYNC_DPRINTF("%s: (%llu) error; sfs=%p\n",
 			    __func__,
 			    (unsigned long long) curthread->td_tid,
 			    sfs);
 			sf_sync_set_state(sfs, SF_STATE_FREEING, 0);
 			sf_sync_free(sfs);
 			/* The source file reference must still be dropped. */
 			goto out_fp;
 		}
 	}
 
 	/*
 	 * Do the sendfile call.
 	 *
 	 * If this fails, it'll free the mbuf chain which will free up the
 	 * sendfile_sync references.
 	 */
 	error = fo_sendfile(fp, sock_fd, hdr_uio, trl_uio, offset,
 	    nbytes, sbytes, flags, compat ? SFK_COMPAT : 0, sfs, td);
 
 	/*
 	 * If the sendfile call succeeded, transition the sf_sync state
 	 * to RUNNING, then COMPLETED.
 	 *
 	 * If the sendfile call failed, then the sendfile call may have
 	 * actually sent some data first - so we check to see whether
 	 * any data was sent.  If some data was queued (ie, count > 0)
 	 * then we can't call free; we have to wait until the partial
 	 * transaction completes before we continue along.
 	 *
 	 * This has the side effect of firing off the knote
 	 * if the refcount has hit zero by the time we get here.
 	 */
 	if (sfs != NULL) {
 		mtx_lock(&sfs->mtx);
 		if (error == 0 || sfs->count > 0) {
 			/*
 			 * When it's time to do async sendfile, the transition
 			 * to RUNNING signifies that we're actually actively
 			 * adding and completing mbufs.  When the last disk
 			 * buffer is read (ie, when we're not doing any
 			 * further read IO and all subsequent stuff is mbuf
 			 * transmissions) we'll transition to COMPLETED
 			 * and when the final mbuf is freed, the completion
 			 * will be signaled.
 			 */
 			sf_sync_set_state(sfs, SF_STATE_RUNNING, 1);
 
 			/*
 			 * Set the retval before we signal completed.
 			 * If we do it the other way around then transitioning to
 			 * COMPLETED may post the knote before you set the return
 			 * status!
 			 *
 			 * XXX for now, errno is always 0, as we don't post
 			 * knotes if sendfile failed.  Maybe that'll change later.
 			 */
 			sf_sync_set_retval(sfs, *sbytes, error);
 
 			/*
 			 * And now transition to completed, which will kick off
 			 * the knote if required.
 			 */
 			sf_sync_set_state(sfs, SF_STATE_COMPLETED, 1);
 		} else {
 			/*
 			 * Error isn't zero, sfs_count is zero, so we
 			 * won't have some other thing to wake things up.
 			 * Thus free.
 			 */
 			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
 			do_free = 1;
 		}
 
 		/*
 		 * Next - wait if appropriate.
 		 */
 		sf_sync_syscall_wait(sfs);
 
 		/*
 		 * If we're not doing kqueue notifications, we can
 		 * transition this immediately to the freeing state.
 		 */
 		if ((sfs->flags & SF_KQUEUE) == 0) {
 			sf_sync_set_state(sfs, SF_STATE_FREEING, 1);
 			do_free = 1;
 		}
 
 		mtx_unlock(&sfs->mtx);
 	}
 
 	/*
 	 * If do_free is set, free here.
 	 *
 	 * If we're doing no-kqueue notification and it's just sleep notification,
 	 * we also do free; it's the only chance we have.
 	 */
 	if (sfs != NULL && do_free == 1) {
 		sf_sync_free(sfs);
 	}
 
 	/*
 	 * XXX Should we wait until the send has completed before freeing the source
 	 * file handle? It's the previous behaviour, sure, but is it required?
 	 * We've wired down the page references after all.
 	 */
 out_fp:
 	fdrop(fp, td);
 
 out:
 	/* Return error */
 	return (error);
 }
 
 
 /*
  * Shared body of sendfile(2) and its FreeBSD 4 compat variant.
  *
  * Copies the optional sf_hdtr (headers, trailers and, with SF_KQUEUE,
  * the kqueue data stored directly after it) in from user space, runs
  * _do_sendfile() and, when uap->sbytes is set, copies the byte count
  * out to the caller.
  *
  * Returns EINVAL for a negative offset or for SF_KQUEUE without kqueue
  * data; otherwise a copyin error or the result of _do_sendfile().
  */
 static int
 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct sf_hdtr_kq hdtr_kq;
 	struct uio *hdr_uio, *trl_uio;
 	int error;
 	/*
 	 * Start from zero: _do_sendfile() can fail before the transfer
 	 * routine has stored a count, and the value is copied out to user
 	 * space regardless.
 	 */
 	off_t sbytes = 0;
 	int do_kqueue = 0;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if (uap->offset < 0)
 		return (EINVAL);
 
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
 			if (error != 0)
 				goto out;
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
 			if (error != 0)
 				goto out;
 		}
 
 		/*
 		 * If SF_KQUEUE is set, then we need to also copy in
 		 * the kqueue data after the normal hdtr set and set
 		 * do_kqueue=1.
 		 */
 		if (uap->flags & SF_KQUEUE) {
 			error = copyin(((char *) uap->hdtr) + sizeof(hdtr),
 			    &hdtr_kq,
 			    sizeof(hdtr_kq));
 			if (error != 0)
 				goto out;
 			do_kqueue = 1;
 		}
 	}
 
 	/*
 	 * SF_KQUEUE without a hdtr leaves hdtr_kq unset; refuse the
 	 * request rather than register a knote from stack garbage.
 	 */
 	if ((uap->flags & SF_KQUEUE) != 0 && do_kqueue == 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	/* Call sendfile; hand over the kqueue data only if it was read. */
 	error = _do_sendfile(td, uap->fd, uap->s, uap->flags, compat,
 	    uap->offset, uap->nbytes, &sbytes, hdr_uio, trl_uio,
 	    do_kqueue ? &hdtr_kq : NULL);
 
 	if (uap->sbytes != NULL) {
 		/* Best effort, as before: a failed copyout is not reported. */
 		(void) copyout(&sbytes, uap->sbytes, sizeof(off_t));
 	}
 out:
 	free(hdr_uio, M_IOV);
 	free(trl_uio, M_IOV);
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args args;
 
 	args.fd = uap->fd;
 	args.s = uap->s;
 	args.offset = uap->offset;
 	args.nbytes = uap->nbytes;
 	args.hdtr = uap->hdtr;
 	args.sbytes = uap->sbytes;
 	args.flags = uap->flags;
 
 	return (do_sendfile(td, &args, 1));
 }
 #endif /* COMPAT_FREEBSD4 */
 
 /*
  * Obtain the wired page of obj that covers file offset off, reading it
  * from backing store if necessary.
  *
  * obj is the VM object to take the page from; vp is its vnode, or NULL
  * when the object is not vnode-backed.  A non-zero nd forbids I/O: if
  * the page is not already valid, nd itself is returned as the error.
  * xfsize is the number of bytes, starting at (off & PAGE_MASK) within
  * the page, that must be valid.  bsize is used to scale the sequential
  * read-ahead hint given to vn_rdwr().
  *
  * On success 0 is returned and *res is set to the page.  On failure an
  * errno value (or nd) is returned, *res is not written, and the page
  * has been unwired and, if nobody else can be using it, freed.
  */
 static int
 sendfile_readpage(vm_object_t obj, struct vnode *vp, int nd,
     off_t off, int xfsize, int bsize, struct thread *td, vm_page_t *res)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	ssize_t resid;
 	int error, readahead, rv;
 
 	pindex = OFF_TO_IDX(off);
 	VM_OBJECT_WLOCK(obj);
 	/*
 	 * For vnode-backed objects the page is grabbed with
 	 * VM_ALLOC_NOBUSY | VM_ALLOC_IGN_SBUSY; otherwise those flags are
 	 * omitted and the page is explicitly xunbusied on each exit below.
 	 */
 	m = vm_page_grab(obj, pindex, (vp != NULL ? VM_ALLOC_NOBUSY |
 	    VM_ALLOC_IGN_SBUSY : 0) | VM_ALLOC_WIRED | VM_ALLOC_NORMAL);
 
 	/*
 	 * Check if page is valid for what we need, otherwise initiate I/O.
 	 *
 	 * The non-zero nd argument prevents disk I/O, instead we
 	 * return the caller what he specified in nd.  In particular,
 	 * if we already turned some pages into mbufs, nd == EAGAIN
 	 * and the main function send them the pages before we come
 	 * here again and block.
 	 */
 	if (m->valid != 0 && vm_page_is_valid(m, off & PAGE_MASK, xfsize)) {
 		if (vp == NULL)
 			vm_page_xunbusy(m);
 		VM_OBJECT_WUNLOCK(obj);
 		*res = m;
 		return (0);
 	} else if (nd != 0) {
 		if (vp == NULL)
 			vm_page_xunbusy(m);
 		error = nd;
 		goto free_page;
 	}
 
 	/*
 	 * Get the page from backing store.
 	 */
 	error = 0;
 	if (vp != NULL) {
 		/* The object lock is dropped around the vnode read. */
 		VM_OBJECT_WUNLOCK(obj);
 		readahead = sfreadahead * MAXBSIZE;
 
 		/*
 		 * Use vn_rdwr() instead of the pager interface for
 		 * the vnode, to allow the read-ahead.
 		 *
 		 * XXXMAC: Because we don't have fp->f_cred here, we
 		 * pass in NOCRED.  This is probably wrong, but is
 		 * consistent with our original implementation.
 		 */
 		error = vn_rdwr(UIO_READ, vp, NULL, readahead, trunc_page(off),
 		    UIO_NOCOPY, IO_NODELOCKED | IO_VMIO | ((readahead /
 		    bsize) << IO_SEQSHIFT), td->td_ucred, NOCRED, &resid, td);
 		SFSTAT_INC(sf_iocnt);
 		VM_OBJECT_WLOCK(obj);
 	} else {
 		if (vm_pager_has_page(obj, pindex, NULL, NULL)) {
 			rv = vm_pager_get_pages(obj, &m, 1, 0);
 			SFSTAT_INC(sf_iocnt);
 			/* Look the page up again after the pager call. */
 			m = vm_page_lookup(obj, pindex);
 			if (m == NULL)
 				error = EIO;
 			else if (rv != VM_PAGER_OK) {
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				m = NULL;
 				error = EIO;
 			}
 		} else {
 			/* The pager has no copy: hand out a zero-filled page. */
 			pmap_zero_page(m);
 			m->valid = VM_PAGE_BITS_ALL;
 			m->dirty = 0;
 		}
 		if (m != NULL)
 			vm_page_xunbusy(m);
 	}
 	if (error == 0) {
 		*res = m;
 	} else if (m != NULL) {
 free_page:
 		/* Error exit: give back the wiring taken by vm_page_grab(). */
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 
 		/*
 		 * See if anyone else might know about this page.  If
 		 * not and it is not valid, then free it.
 		 */
 		if (m->wire_count == 0 && m->valid == 0 && !vm_page_busied(m))
 			vm_page_free(m);
 		vm_page_unlock(m);
 	}
 	/* On success the page must still be wired and valid for the range. */
 	KASSERT(error != 0 || (m->wire_count > 0 &&
 	    vm_page_is_valid(m, off & PAGE_MASK, xfsize)),
 	    ("wrong page state m %p off %#jx xfsize %d", m, (uintmax_t)off,
 	    xfsize));
 	VM_OBJECT_WUNLOCK(obj);
 	return (error);
 }
 
 /*
  * Resolve the source file of a sendfile request to its backing VM
  * object.
  *
  * fp must be a regular-file vnode with a VM object, or a shared memory
  * descriptor.  On success 0 is returned, *obj_res holds the object with
  * an extra reference taken, *obj_size holds the file/segment size and
  * exactly one of *vp_res / *shmfd_res is non-NULL; *bsize is the
  * filesystem I/O size for vnodes and 0 for shared memory.
  *
  * Returns EINVAL for unsupported descriptors, EBADF when the object is
  * already dead, or the VOP_GETATTR() error.  On failure *vp_res and
  * *shmfd_res are NULL and *obj_res is not written.
  */
 static int
 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
     int *bsize)
 {
 	struct vattr va;
 	vm_object_t obj;
 	struct vnode *vp;
 	struct shmfd *shmfd;
 	int error;
 
 	vp = *vp_res = NULL;
 	obj = NULL;
 	shmfd = *shmfd_res = NULL;
 	*bsize = 0;
 	/*
 	 * The shared-memory path below never assigns error, so give it a
 	 * defined success value before any path can reach the return.
 	 */
 	error = 0;
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			goto out;
 		}
 		*bsize = vp->v_mount->mnt_stat.f_iosize;
 		error = VOP_GETATTR(vp, &va, td->td_ucred);
 		if (error != 0)
 			goto out;
 		*obj_size = va.va_size;
 		obj = vp->v_object;
 		if (obj == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 	} else if (fp->f_type == DTYPE_SHM) {
 		shmfd = fp->f_data;
 		obj = shmfd->shm_object;
 		*obj_size = shmfd->shm_size;
 	} else {
 		error = EINVAL;
 		goto out;
 	}
 
 	VM_OBJECT_WLOCK(obj);
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		VM_OBJECT_WUNLOCK(obj);
 		error = EBADF;
 		goto out;
 	}
 
 	/*
 	 * Temporarily increase the backing VM object's reference
 	 * count so that a forced reclamation of its vnode does not
 	 * immediately destroy it.
 	 */
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 	*obj_res = obj;
 	*vp_res = vp;
 	*shmfd_res = shmfd;
 
 out:
 	/* vp is only non-NULL when the vnode lock was taken above. */
 	if (vp != NULL)
 		VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 static int
 kern_sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
     struct socket **so)
 {
 	cap_rights_t rights;
 	int error;
 
 	*sock_fp = NULL;
 	*so = NULL;
 
 	/*
 	 * The socket must be a stream socket and connected.
 	 */
 	error = getsock_cap(td->td_proc->p_fd, s, cap_rights_init(&rights,
 	    CAP_SEND), sock_fp, NULL);
 	if (error != 0)
 		return (error);
 	*so = (*sock_fp)->f_data;
 	if ((*so)->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (((*so)->so_state & SS_ISCONNECTED) == 0)
 		return (ENOTCONN);
 	return (0);
 }
 
 /*
  * Send the contents of the object backing fp to the socket sockfd.
  *
  * hdr_uio, if non-NULL and non-empty, is copied into mbufs and queued
  * ahead of the file data; trl_uio, if non-NULL, is written with
  * kern_writev() after the file data.  offset is the starting file offset
  * and nbytes the number of file bytes to send; when nbytes is 0 the
  * transfer runs up to the size of the object.  With SFK_COMPAT in kflags
  * the header length is subtracted from nbytes.  SF_MNOWAIT in flags
  * selects non-sleeping allocations, and SF_NODISKIO makes the page
  * lookup pass EBUSY as the nd argument of sendfile_readpage().
  *
  * If sent is non-NULL, *sent receives the running byte count (headers,
  * file data and trailers that were handed to the socket), also when an
  * error is returned.
  *
  * Returns 0 or an errno value; ERESTART is reported as EINTR.
  */
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     int kflags, struct sendfile_sync *sfs, struct thread *td)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj;
 	struct socket *so;
 	struct mbuf *m;
 	struct sf_buf *sf;
 	struct vm_page *pg;
 	struct shmfd *shmfd;
 	struct vattr va;
 	off_t off, xfsize, fsbytes, sbytes, rem, obj_size;
 	int error, bsize, nd, hdrlen, mnw;
 
 	pg = NULL;
 	obj = NULL;
 	so = NULL;
 	m = NULL;
 	fsbytes = sbytes = 0;
 	hdrlen = mnw = 0;
 	rem = nbytes;
 	obj_size = 0;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
 	/* nbytes == 0: start with the whole object as the remainder. */
 	if (rem == 0)
 		rem = obj_size;
 
 	error = kern_sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Do not wait on memory allocations but return ENOMEM for
 	 * caller to retry later.
 	 * XXX: Experimental.
 	 */
 	if (flags & SF_MNOWAIT)
 		mnw = 1;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	/* If headers are specified copy them into mbufs. */
 	if (hdr_uio != NULL) {
 		hdr_uio->uio_td = td;
 		hdr_uio->uio_rw = UIO_WRITE;
 		if (hdr_uio->uio_resid > 0) {
 			/*
 			 * In FBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (kflags & SFK_COMPAT) {
 				if (nbytes > hdr_uio->uio_resid)
 					nbytes -= hdr_uio->uio_resid;
 				else
 					nbytes = 0;
 			}
 			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
 			    0, 0, 0);
 			if (m == NULL) {
 				error = mnw ? EAGAIN : ENOBUFS;
 				goto out;
 			}
 			hdrlen = m_length(m, NULL);
 		}
 	}
 
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
 	 * XXXRW: Historically this has assumed non-interruptibility, so now
 	 * we implement that, but possibly shouldn't.
 	 */
 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; ; ) {
 		struct mbuf *mtail;
 		int loopbytes;
 		int space;
 		int done;
 
 		/* Stop once the requested number of file bytes has been sent. */
 		if ((nbytes != 0 && nbytes == fsbytes) ||
 		    (nbytes == 0 && obj_size == fsbytes))
 			break;
 
 		mtail = NULL;
 		loopbytes = 0;
 		space = 0;
 		done = 0;
 
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(&so->so_snd);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * Reduce space in the socket buffer by the size of
 		 * the header mbuf chain.
 		 * hdrlen is set to 0 after the first loop.
 		 */
 		space -= hdrlen;
 
 		/*
 		 * For vnode-backed files, refresh the size under the vnode
 		 * lock on every pass and stop if off is at or past it.
 		 */
 		if (vp != NULL) {
 			error = vn_lock(vp, LK_SHARED);
 			if (error != 0)
 				goto done;
 			error = VOP_GETATTR(vp, &va, td->td_ucred);
 			if (error != 0 || off >= va.va_size) {
 				VOP_UNLOCK(vp, 0);
 				goto done;
 			}
 			obj_size = va.va_size;
 		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		while (space > loopbytes) {
 			vm_offset_t pgoff;
 			struct mbuf *m0;
 
 			/*
 			 * Calculate the amount to transfer.
 			 * Not to exceed a page, the EOF,
 			 * or the passed in nbytes.
 			 */
 			pgoff = (vm_offset_t)(off & PAGE_MASK);
 			rem = obj_size - offset;
 			if (nbytes != 0)
 				rem = omin(rem, nbytes);
 			rem -= fsbytes + loopbytes;
 			xfsize = omin(PAGE_SIZE - pgoff, rem);
 			xfsize = omin(space - loopbytes, xfsize);
 			if (xfsize <= 0) {
 				done = 1;		/* all data sent */
 				break;
 			}
 
 			/*
 			 * Attempt to look up the page.  Allocate
 			 * if not found or wait and loop if busy.
 			 */
 			if (m != NULL)
 				nd = EAGAIN; /* send what we already got */
 			else if ((flags & SF_NODISKIO) != 0)
 				nd = EBUSY;
 			else
 				nd = 0;
 			error = sendfile_readpage(obj, vp, nd, off,
 			    xfsize, bsize, td, &pg);
 			if (error != 0) {
 				if (error == EAGAIN)
 					error = 0;	/* not a real error */
 				break;
 			}
 
 			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
 			 * can be interrupted.  For subsequent
 			 * buffers, do not sleep, since several
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
 			sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
 			    SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
 				vm_page_lock(pg);
-				vm_page_unwire(pg, 0);
+				vm_page_unwire(pg, PQ_INACTIVE);
 				KASSERT(pg->object != NULL,
 				    ("%s: object disappeared", __func__));
 				vm_page_unlock(pg);
 				if (m == NULL)
 					error = (mnw ? EAGAIN : EINTR);
 				break;
 			}
 
 			/*
 			 * Get an mbuf and set it up as having
 			 * external storage.
 			 */
 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
 			if (m0 == NULL) {
 				error = (mnw ? EAGAIN : ENOBUFS);
 				(void)sf_buf_mext(NULL, NULL, sf);
 				break;
 			}
 			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
 			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
 			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
 				error = (mnw ? EAGAIN : ENOBUFS);
 				(void)sf_buf_mext(NULL, NULL, sf);
 				m_freem(m0);
 				break;
 			}
 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
 			m0->m_len = xfsize;
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
 			else if (m != NULL)
 				m_last(m)->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 
 			/* Keep track of bits processed. */
 			loopbytes += xfsize;
 			off += xfsize;
 
 			/*
 			 * XXX eventually this should be a sfsync
 			 * method call!
 			 */
 			if (sfs != NULL)
 				sf_sync_ref(sfs);
 		}
 
 		if (vp != NULL)
 			VOP_UNLOCK(vp, 0);
 
 		/* Add the buffer chain to the socket buffer. */
 		if (m != NULL) {
 			int mlen, err;
 
 			mlen = m_length(m, NULL);
 			SOCKBUF_LOCK(&so->so_snd);
 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 				error = EPIPE;
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			SOCKBUF_UNLOCK(&so->so_snd);
 			CURVNET_SET(so->so_vnet);
 			/* Avoid error aliasing. */
 			err = (*so->so_proto->pr_usrreqs->pru_send)
 				    (so, 0, m, NULL, NULL, td);
 			CURVNET_RESTORE();
 			if (err == 0) {
 				/*
 				 * We need two counters to get the
 				 * file offset and nbytes to send
 				 * right:
 				 * - sbytes contains the total amount
 				 *   of bytes sent, including headers.
 				 * - fsbytes contains the total amount
 				 *   of bytes sent from the file.
 				 */
 				sbytes += mlen;
 				fsbytes += mlen;
 				if (hdrlen) {
 					fsbytes -= hdrlen;
 					hdrlen = 0;
 				}
 			} else if (error == 0)
 				error = err;
 			m = NULL;	/* pru_send always consumes */
 		}
 
 		/* Quit outer loop on error or when we're done. */
 		if (done)
 			break;
 		if (error != 0)
 			goto done;
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		/* Release the send buffer lock before kern_writev(). */
 		sbunlock(&so->so_snd);
 		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 	/* Exit path for code that still holds the sblock() lock. */
 done:
 	sbunlock(&so->so_snd);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (sent != NULL) {
 		(*sent) = sbytes;
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	/* so is non-NULL exactly when a reference on sock_fp was obtained. */
 	if (so)
 		fdrop(sock_fp, td);
 	/* Free any mbuf chain that was built but never handed to pru_send. */
 	if (m)
 		m_freem(m);
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 /*
  * SCTP syscalls.
  * Functionality only compiled in if SCTP is defined in the kernel Makefile,
  * otherwise all return EOPNOTSUPP.
  * XXX: We should make this loadable one day.
  */
 /*
  * sctp_peeloff system call: branch the association uap->name off the
  * SCTP socket uap->sd onto a newly created socket and descriptor.
  *
  * The descriptor needs CAP_PEELOFF and must refer to an IPPROTO_SCTP
  * socket (EOPNOTSUPP otherwise).  The new descriptor number is stored
  * in td->td_retval[0]; if a later step fails the descriptor is closed
  * again.  Returns 0 or an errno value, and EOPNOTSUPP when the kernel
  * is built without SCTP.
  */
 int
 sys_sctp_peeloff(td, uap)
 	struct thread *td;
 	struct sctp_peeloff_args /* {
 		int	sd;
 		caddr_t	name;
 	} */ *uap;
 {
 #if (defined(INET) || defined(INET6)) && defined(SCTP)
 	struct file *nfp = NULL;
 	struct socket *head, *so;
 	cap_rights_t rights;
 	u_int fflag;
 	int error, fd;
 
 	AUDIT_ARG_FD(uap->sd);
 	error = fgetsock(td, uap->sd, cap_rights_init(&rights, CAP_PEELOFF),
 	    &head, &fflag);
 	if (error != 0)
 		goto done2;
 	if (head->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto done;
 	}
 	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
 	if (error != 0)
 		goto done;
 	/*
 	 * At this point we know we do have an assoc to pull, so
 	 * we proceed to get the fd set up.  This may block,
 	 * but that is ok.
 	 */
 
 	error = falloc(td, &nfp, &fd, 0);
 	if (error != 0)
 		goto done;
 	td->td_retval[0] = fd;
 
 	CURVNET_SET(head->so_vnet);
 	so = sonewconn(head, SS_ISCONNECTED);
 	if (so == NULL) {
 		error = ENOMEM;
 		goto noconnection;
 	}
 	/*
 	 * Before changing the flags on the socket, we have to bump the
 	 * reference count.  Otherwise, if the protocol calls sofree(),
 	 * the socket will be released due to a zero refcount.
 	 */
         SOCK_LOCK(so);
         soref(so);                      /* file descriptor reference */
         SOCK_UNLOCK(so);
 
 	ACCEPT_LOCK();
 
 	/* Detach the new socket from head's completed-connection queue. */
 	TAILQ_REMOVE(&head->so_comp, so, so_list);
 	head->so_qlen--;
 	so->so_state |= (head->so_state & SS_NBIO);
 	so->so_state &= ~SS_NOFDREF;
 	so->so_qstate &= ~SQ_COMP;
 	so->so_head = NULL;
 	ACCEPT_UNLOCK();
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
 	if (error != 0)
 		goto noconnection;
 	if (head->so_sigio != NULL)
 		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
 
 noconnection:
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error != 0)
 		fdclose(td->td_proc->p_fd, nfp, fd, td);
 
 	/*
 	 * Release explicitly held references before returning.
 	 */
 	CURVNET_RESTORE();
 done:
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fputsock(head);
 done2:
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 /*
  * sctp_generic_sendmsg system call: send the uap->mlen bytes at uap->msg
  * on the SCTP socket uap->sd through sctp_lower_sosend().
  *
  * uap->to/uap->tolen optionally give a destination address (which also
  * requires CAP_CONNECT on the descriptor) and uap->sinfo optional
  * send information.  On success td->td_retval[0] is the number of bytes
  * consumed.  A transfer that moved some data before failing with
  * ERESTART, EINTR or EWOULDBLOCK is reported as success.  EPIPE raises
  * SIGPIPE unless SO_NOSIGPIPE or MSG_NOSIGNAL is set.
  *
  * Returns 0 or an errno value, and EOPNOTSUPP when the kernel is built
  * without SCTP or the socket is not an SCTP socket.
  */
 int
 sys_sctp_generic_sendmsg (td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_args /* {
 		int sd,
 		caddr_t msg,
 		int mlen,
 		caddr_t to,
 		__socklen_t tolen,
 		struct sctp_sndrcvinfo *sinfo,
 		int flags
 	} */ *uap;
 {
 #if (defined(INET) || defined(INET6)) && defined(SCTP)
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec iov[1];
 	cap_rights_t rights;
 	int error = 0, len;
 
 	if (uap->sinfo != NULL) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error != 0)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 
 	cap_rights_init(&rights, CAP_SEND);
 	if (uap->tolen != 0) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error != 0) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 		cap_rights_set(&rights, CAP_CONNECT);
 	}
 
 	AUDIT_ARG_FD(uap->sd);
 	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
 	if (error != 0)
 		goto sctp_bad;
 #ifdef KTRACE
 	if (to && (KTRPOINT(td, KTR_STRUCT)))
 		ktrsockaddr(to);
 #endif
 
 	iov[0].iov_base = uap->msg;
 	iov[0].iov_len = uap->mlen;
 
 	so = (struct socket *)fp->f_data;
 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto sctp_bad;
 	}
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov =  iov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	len = auio.uio_resid = uap->mlen;
 #ifdef KTRACE
 	/*
 	 * Snapshot the uio before the send consumes it so that the
 	 * ktrgenio() call below has something to log.  Without this
 	 * ktruio stays NULL and the trace block is dead code.
 	 */
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	CURVNET_SET(so->so_vnet);
 	error = sctp_lower_sosend(so, to, &auio, (struct mbuf *)NULL,
 	    (struct mbuf *)NULL, uap->flags, u_sinfo, td);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* A partial transfer that was interrupted counts as success. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket. */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	if (fp != NULL)
 		fdrop(fp, td);
 sctp_bad2:
 	free(to, M_SONAME);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 /*
  * sctp_generic_sendmsg_iov system call: like sctp_generic_sendmsg, but
  * the data is described by the uap->iovlen entries of the user iovec
  * array uap->iov.
  *
  * The iovec array is copied in (through the 32-bit variant for ILP32
  * processes under COMPAT_FREEBSD32) and the total length is summed;
  * a sum that becomes negative fails with EINVAL.  On success
  * td->td_retval[0] is the number of bytes consumed.  A transfer that
  * moved some data before failing with ERESTART, EINTR or EWOULDBLOCK is
  * reported as success.  EPIPE raises SIGPIPE unless SO_NOSIGPIPE or
  * MSG_NOSIGNAL is set.
  *
  * Returns 0 or an errno value, and EOPNOTSUPP when the kernel is built
  * without SCTP or the socket is not an SCTP socket.
  */
 int
 sys_sctp_generic_sendmsg_iov(td, uap)
 	struct thread *td;
 	struct sctp_generic_sendmsg_iov_args /* {
 		int sd,
 		struct iovec *iov,
 		int iovlen,
 		caddr_t to,
 		__socklen_t tolen,
 		struct sctp_sndrcvinfo *sinfo,
 		int flags
 	} */ *uap;
 {
 #if (defined(INET) || defined(INET6)) && defined(SCTP)
 	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *to = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	struct uio auio;
 	struct iovec *iov, *tiov;
 	cap_rights_t rights;
 	ssize_t len;
 	int error, i;
 
 	if (uap->sinfo != NULL) {
 		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
 		if (error != 0)
 			return (error);
 		u_sinfo = &sinfo;
 	}
 	cap_rights_init(&rights, CAP_SEND);
 	if (uap->tolen != 0) {
 		error = getsockaddr(&to, uap->to, uap->tolen);
 		if (error != 0) {
 			to = NULL;
 			goto sctp_bad2;
 		}
 		cap_rights_set(&rights, CAP_CONNECT);
 	}
 
 	AUDIT_ARG_FD(uap->sd);
 	error = getsock_cap(td->td_proc->p_fd, uap->sd, &rights, &fp, NULL);
 	if (error != 0)
 		goto sctp_bad1;
 
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
 		    uap->iovlen, &iov, EMSGSIZE);
 	else
 #endif
 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	/* iov was not allocated on failure, so skip the free at sctp_bad. */
 	if (error != 0)
 		goto sctp_bad1;
 #ifdef KTRACE
 	if (to && (KTRPOINT(td, KTR_STRUCT)))
 		ktrsockaddr(to);
 #endif
 
 	so = (struct socket *)fp->f_data;
 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto sctp_bad;
 	}
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto sctp_bad;
 #endif /* MAC */
 
 	auio.uio_iov = iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto sctp_bad;
 		}
 	}
 	len = auio.uio_resid;
 #ifdef KTRACE
 	/*
 	 * Snapshot the uio before the send consumes it so that the
 	 * ktrgenio() call below has something to log.  Without this
 	 * ktruio stays NULL and the trace block is dead code.  The
 	 * clone is taken after the last early exit above so that it
 	 * always reaches ktrgenio().
 	 */
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	CURVNET_SET(so->so_vnet);
 	error = sctp_lower_sosend(so, to, &auio,
 	    (struct mbuf *)NULL, (struct mbuf *)NULL,
 	    uap->flags, u_sinfo, td);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* A partial transfer that was interrupted counts as success. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(uap->flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
 	}
 #endif /* KTRACE */
 sctp_bad:
 	free(iov, M_IOV);
 sctp_bad1:
 	if (fp != NULL)
 		fdrop(fp, td);
 sctp_bad2:
 	free(to, M_SONAME);
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
 
 /*
  * sctp_generic_recvmsg system call: receive a message from the SCTP
  * socket uap->sd into the uap->iovlen entries of the user iovec array
  * uap->iov through sctp_sorecvmsg().
  *
  * The descriptor needs CAP_RECV.  uap->fromlenaddr and uap->msg_flags
  * are optional in/out values; uap->from and uap->sinfo are optional
  * outputs.  On success td->td_retval[0] is the number of bytes
  * received.  A transfer that moved some data before failing with
  * ERESTART, EINTR or EWOULDBLOCK is reported as success.
  *
  * Returns 0 or an errno value, and EOPNOTSUPP when the kernel is built
  * without SCTP or the socket is not an SCTP socket.
  */
 int
 sys_sctp_generic_recvmsg(td, uap)
 	struct thread *td;
 	struct sctp_generic_recvmsg_args /* {
 		int sd,
 		struct iovec *iov,
 		int iovlen,
 		struct sockaddr *from,
 		__socklen_t *fromlenaddr,
 		struct sctp_sndrcvinfo *sinfo,
 		int *msg_flags
 	} */ *uap;
 {
 #if (defined(INET) || defined(INET6)) && defined(SCTP)
 	uint8_t sockbufstore[256];
 	struct uio auio;
 	struct iovec *iov, *tiov;
 	struct sctp_sndrcvinfo sinfo;
 	struct socket *so;
 	struct file *fp = NULL;
 	struct sockaddr *fromsa;
 	cap_rights_t rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	ssize_t len;
 	socklen_t slen;
 	int error, fromlen, i, msg_flags;
 
 	AUDIT_ARG_FD(uap->sd);
 	error = getsock_cap(td->td_proc->p_fd, uap->sd,
 	    cap_rights_init(&rights, CAP_RECV), &fp, NULL);
 	if (error != 0)
 		return (error);
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
 		    uap->iovlen, &iov, EMSGSIZE);
 	else
 #endif
 		error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
 	/* iov was not allocated on failure, so skip the free at out. */
 	if (error != 0)
 		goto out1;
 
 	so = fp->f_data;
 	if (so->so_proto->pr_protocol != IPPROTO_SCTP) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 #ifdef MAC
 	error = mac_socket_check_receive(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif /* MAC */
 
 	if (uap->fromlenaddr != NULL) {
 		error = copyin(uap->fromlenaddr, &fromlen, sizeof (fromlen));
 		if (error != 0)
 			goto out;
 	} else {
 		fromlen = 0;
 	}
 	if (uap->msg_flags) {
 		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
 		if (error != 0)
 			goto out;
 	} else {
 		msg_flags = 0;
 	}
 	auio.uio_iov = iov;
 	auio.uio_iovcnt = uap->iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	tiov = iov;
 	for (i = 0; i <uap->iovlen; i++, tiov++) {
 		if ((auio.uio_resid += tiov->iov_len) < 0) {
 			error = EINVAL;
 			goto out;
 		}
 	}
 	len = auio.uio_resid;
 	fromsa = (struct sockaddr *)sockbufstore;
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif /* KTRACE */
 	memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
 	CURVNET_SET(so->so_vnet);
 	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
 	    fromsa, fromlen, &msg_flags,
 	    (struct sctp_sndrcvinfo *)&sinfo, 1);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* A partial transfer that was interrupted counts as success. */
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	} else {
 		if (uap->sinfo)
 			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
 	}
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = len - auio.uio_resid;
 		ktrgenio(uap->sd, UIO_READ, ktruio, error);
 	}
 #endif /* KTRACE */
 	if (error != 0)
 		goto out;
 	td->td_retval[0] = len - auio.uio_resid;
 
 	if (fromlen && uap->from) {
 		len = fromlen;
 		if (len <= 0 || fromsa == 0)
 			len = 0;
 		else {
 			len = MIN(len, fromsa->sa_len);
 			error = copyout(fromsa, uap->from, (size_t)len);
 			if (error != 0)
 				goto out;
 		}
 		/*
 		 * Copy the length out through a real socklen_t.  Copying
 		 * sizeof (socklen_t) bytes from the start of the wider
 		 * ssize_t only yields the right value on little-endian
 		 * machines.
 		 */
 		slen = (socklen_t)len;
 		error = copyout(&slen, uap->fromlenaddr, sizeof (slen));
 		if (error != 0)
 			goto out;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(fromsa);
 #endif
 	if (uap->msg_flags) {
 		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
 		if (error != 0)
 			goto out;
 	}
 out:
 	free(iov, M_IOV);
 out1:
 	if (fp != NULL)
 		fdrop(fp, td);
 
 	return (error);
 #else  /* SCTP */
 	return (EOPNOTSUPP);
 #endif /* SCTP */
 }
Index: user/attilio/rm_vmobj_cache/sys/kern/vfs_bio.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/kern/vfs_bio.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/kern/vfs_bio.c	(revision 267237)
@@ -1,4607 +1,4607 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include "opt_compat.h"
 #include "opt_swap.h"
 
 /* Malloc type "biobuf" ("BIO buffer"), private to this file. */
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 /*
  * Buffer operations vector named "buf_ops_bio".  It routes the write,
  * strategy, sync and bdflush operations to bufwrite(), bufstrategy(),
  * bufsync() and bufbdflush() respectively.
  */
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 /*
  * XXX buf is global because kern_shutdown.c and ffs_checkoverlap have
  * carnal knowledge of buffers.  This knowledge should be moved to vfs_bio.c.
  */
 struct buf *buf;		/* buffer header pool */
 /*
  * Address that stands in for b_kvabase/b_data of unmapped buffers.  It is
  * assigned in bufinit() from kva_alloc(MAXPHYS) and is what the INVARIANTS
  * checks vfs_buf_check_mapped()/vfs_buf_check_unmapped() compare against.
  */
 caddr_t unmapped_buf;
 
 /*
  * NOTE(review): presumably the process of the buffer daemon; it is not
  * assigned in this part of the file -- confirm against buf_daemon().
  */
 static struct proc *bufdaemonproc;
 
 /* Forward declarations of file-local helpers defined later in this file. */
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static int buf_flush(int);
 static int flushbufqueues(int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 /* sysctl_bufspace() only exists when an old-ABI compat option is configured. */
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 #endif
 
 /*
  * Tunables and statistics exported under the "vfs" sysctl node.  Each
  * variable is declared immediately before the SYSCTL_* entry that exports
  * it; CTLFLAG_RD entries are read-only and CTLFLAG_RW entries are writable.
  * The description string of each entry documents its meaning.
  */
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 static long bufspace;
 /*
  * With an old-ABI compat option configured, vfs.bufspace goes through
  * sysctl_bufspace() so that readers supplying only an int-sized buffer
  * can still be served; otherwise it is a plain long.
  */
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
     &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
 #else
 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "Virtual memory used for buffers");
 #endif
 static long unmapped_bufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
     &unmapped_bufspace, 0,
     "Amount of unmapped buffers, inclusive in the bufspace");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including buf_daemon)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 /*
  * Note that the exported name is "maxmallocbufspace" while the variable
  * is spelled maxbufmallocspace.
  */
 static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace, 0,
     "Maximum amount of malloced memory for buffers");
 static long lobufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding buf_daemon)");
 static int bufreusecnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RW, &bufreusecnt, 0,
     "Number of times we have reused a buffer");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
 static int bufdefragcnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
     "Number of times we have had to repeat buffer allocation to defragment");
 /*
  * lorunningspace and hirunningspace share the sysctl_runningspace()
  * handler, which rejects writes that would make lo exceed hi.
  */
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
     "Minimum preferred space used for in-progress I/O");
 static long hirunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
 static int numdirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "XXX Unused");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "XXX Complicatedly unused");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer aquisition");
 static int mappingrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
 static long notbufdflushes;
 SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
 /* unmapped_buf_allowed is not declared in this file section. */
 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
     &unmapped_buf_allowed, 0,
     "Permit the use of the unmapped i/o");
 
 /*
  * Lock for the non-dirty bufqueues.  bqlock() returns this lock for every
  * queue index other than QUEUE_DIRTY.
  */
 static struct mtx_padalign bqclean;
 
 /*
  * Lock for the dirty queue (QUEUE_DIRTY).
  */
 static struct mtx_padalign bqdirty;
 
 /*
  * This lock synchronizes access to bd_request and bd_speedupreq.
  */
 static struct mtx_padalign bdlock;
 
 /*
  * This lock protects the runningbufreq and synchronizes runningbufwakeup and
  * waitrunningbufspace().
  */
 static struct mtx_padalign rbreqlock;
 
 /*
  * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
  */
 static struct mtx_padalign nblock;
 
 /*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign bdirtylock;
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuffers.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
  * buffers is insufficient to characterize the demand for flushing them.
  */
 static int bd_speedupreq;
 
 /*
  * bogus page -- for I/O to/from partially complete buffers.
  * This is a temporary solution to the problem, but it is not
  * really that bad.  It would be better to split the buffer
  * for input in the case of buffers partially already in memory,
  * but the code is intricate enough already.
  */
 vm_page_t bogus_page;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
  * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static int needsbuffer;
 
 /*
  * Synchronization for bwillwrite() waiters.  Cleared, followed by a
  * wakeup(), in bdirtywakeup().
  */
 static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.  The QUEUE_* values below
  * QUEUE_SENTINEL are indices into bufqueues[] and are stored in b_qindex.
  */
 #define BUFFER_QUEUES	5	/* number of free buffer queues */
 
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
 #define QUEUE_EMPTY	4	/* empty buffer headers */
 #define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
 /* Per-queue element counts, maintained only for the INVARIANTS checks. */
 static int bq_len[BUFFER_QUEUES];
 #endif
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 /* Flag bits kept in needsbuffer. */
 #define VFS_BIO_NEED_ANY	0x01	/* any freeable buffer */
 #define VFS_BIO_NEED_FREE	0x04	/* wait for free bufs, hi hysteresis */
 #define VFS_BIO_NEED_BUFSPACE	0x08	/* wait for buf space, lo hysteresis */
 
 /*
  *	sysctl_runningspace:
  *
  *	Handler shared by the lorunningspace and hirunningspace sysctls;
  *	arg1 points at the variable being accessed.  The current value is
  *	passed through sysctl_handle_long().  When a new value is supplied
  *	it is stored under rbreqlock, but only if doing so keeps
  *	lorunningspace <= hirunningspace; otherwise EINVAL is returned.
  */
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
 	long newval;
 	int rc;
 
 	newval = *(long *)arg1;
 	rc = sysctl_handle_long(oidp, &newval, 0, req);
 	/* Nothing more to do on failure or for a pure read. */
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	mtx_lock(&rbreqlock);
 	if (arg1 != &hirunningspace) {
 		KASSERT(arg1 == &lorunningspace,
 		    ("%s: unknown arg1", __func__));
 		/* The low mark may not rise above the high mark. */
 		if (newval <= hirunningspace)
 			lorunningspace = newval;
 		else
 			rc = EINVAL;
 	} else {
 		/* The high mark may not drop below the low mark. */
 		if (newval >= lorunningspace)
 			hirunningspace = newval;
 		else
 			rc = EINVAL;
 	}
 	mtx_unlock(&rbreqlock);
 	return (rc);
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 /*
  *	sysctl_bufspace:
  *
  *	Compatibility handler for a long-valued sysctl.  When int and long
  *	differ in size and the caller's old-value buffer is smaller than a
  *	long, the value is reported as an int as long as it fits.  In every
  *	other case the value is handled as a long.
  */
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long wide;
 	int narrow;
 
 	if (sizeof(int) != sizeof(long) && req->oldlen < sizeof(long)) {
 		wide = *(long *)arg1;
 		if (wide <= INT_MAX) {
 			narrow = wide;
 			return (sysctl_handle_int(oidp, &narrow, 0, req));
 		}
 		/* On overflow, still write out a long to trigger ENOMEM. */
 		return (sysctl_handle_long(oidp, &wide, 0, req));
 	}
 	return (sysctl_handle_long(oidp, arg1, arg2, req));
 }
 #endif
 
 /*
  *	bqlock:
  *
  *	Map a queue index to the mutex covering that queue: bqdirty for
  *	QUEUE_DIRTY, bqclean for every other index.
  */
 static inline struct mtx *
 bqlock(int qindex)
 {
 
 	return ((struct mtx *)(qindex == QUEUE_DIRTY ? &bqdirty : &bqclean));
 }
 
 /*
  *	bdirtywakeup:
  *
  *	If bdirtywait is set, clear it and wake the threads sleeping on it
  *	(the bwillwrite() waiters).  Done under bdirtylock.
  */
 static void
 bdirtywakeup(void)
 {
 	mtx_lock(&bdirtylock);
 	if (bdirtywait == 0) {
 		/* Nobody is waiting. */
 		mtx_unlock(&bdirtylock);
 		return;
 	}
 	bdirtywait = 0;
 	wakeup(&bdirtywait);
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bdirtysub:
  *
  *	Atomically drop numdirtybuffers by one.  When the count seen just
  *	before the decrement equals the midpoint of lodirtybuffers and
  *	hidirtybuffers, wake threads blocked in bwillwrite().
  */
 static void
 bdirtysub(void)
 {
 	int before, midpoint;
 
 	before = atomic_fetchadd_int(&numdirtybuffers, -1);
 	midpoint = (lodirtybuffers + hidirtybuffers) / 2;
 	if (before == midpoint)
 		bdirtywakeup();
 }
 
 /*
  *	bdirtyadd:
  *
  *	Atomically raise numdirtybuffers by one and kick the buf daemon
  *	when the count seen just before the increment equals the midpoint
  *	of lodirtybuffers and hidirtybuffers.
  */
 static void
 bdirtyadd(void)
 {
 	int before, midpoint;
 
 	before = atomic_fetchadd_int(&numdirtybuffers, 1);
 	midpoint = (lodirtybuffers + hidirtybuffers) / 2;
 	/*
 	 * The wakeup is issued once, as the boundary is crossed; the
 	 * buf daemon keeps running until the condition clears.
 	 */
 	if (before == midpoint)
 		bd_wakeup();
 }
 
 /*
  *	bufspacewakeup:
  *
  *	Called when buffer space may have become recoverable, i.e. when
  *	bp's are placed back on the queues.  getnewbuf() blocks on the
  *	VFS_BIO_NEED_BUFSPACE flag when it cannot free enough buffer
  *	space; this clears that flag and wakes the sleepers.
  */
 
 static __inline void
 bufspacewakeup(void)
 {
 
 	mtx_lock(&nblock);
 	/*
 	 * Wake anybody waiting for BUF space.  The kva space may not
 	 * have been freed yet, but the waiter will now be able to do so.
 	 */
 	if ((needsbuffer & VFS_BIO_NEED_BUFSPACE) == 0) {
 		mtx_unlock(&nblock);
 		return;
 	}
 	needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
 	wakeup(&needsbuffer);
 	mtx_unlock(&nblock);
 }
 
 /*
  *	runningwakeup:
  *
  *	Wake up the threads that sleep on runningbufreq, i.e. those waiting
  *	in waitrunningbufspace() for asynchronous writes to drain.  The
  *	request flag is cleared before the wakeup, under rbreqlock.
  */
 static void
 runningwakeup(void)
 {
 
 	mtx_lock(&rbreqlock);
 	if (runningbufreq == 0) {
 		/* No sleeper registered. */
 		mtx_unlock(&rbreqlock);
 		return;
 	}
 	runningbufreq = 0;
 	wakeup(&runningbufreq);
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	runningbufwakeup:
  *
  *	Subtract bp's b_runningbufspace from the global runningbufspace
  *	count and zero the per-buffer field.  Does nothing for a buffer
  *	that accounts for no running space.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 	long before, released;
 
 	released = bp->b_runningbufspace;
 	if (released == 0)
 		return;
 	before = atomic_fetchadd_long(&runningbufspace, -released);
 	KASSERT(before >= released, ("runningbufspace underflow %ld %ld",
 	    before, released));
 	bp->b_runningbufspace = 0;
 	/*
 	 * Take the lock and issue the wakeup only when this release moved
 	 * the total from at-or-above lorunningspace down to at-or-below it.
 	 */
 	if (before >= lorunningspace && before - released <= lorunningspace)
 		runningwakeup();
 }
 
 /*
  *	bufcountadd:
  *
  *	Account for a buffer that has been added to one of the free queues:
  *	mark it B_INFREECNT, bump numfreebuffers and wake anyone sleeping
  *	on needsbuffer.  This typically matters when large amounts of
  *	metadata are being handled by the buffer cache (otherwise buffer
  *	space usually runs out first).
  */
 static __inline void
 bufcountadd(struct buf *bp)
 {
 	int prev;
 
 	KASSERT((bp->b_flags & B_INFREECNT) == 0,
 	    ("buf %p already counted as free", bp));
 	bp->b_flags |= B_INFREECNT;
 	prev = atomic_fetchadd_int(&numfreebuffers, 1);
 	KASSERT(prev >= 0 && prev < nbuf,
 	    ("numfreebuffers climbed to %d", prev + 1));
 	mtx_lock(&nblock);
 	if (needsbuffer != 0) {
 		/*
 		 * VFS_BIO_NEED_ANY is always satisfied; VFS_BIO_NEED_FREE
 		 * only once the free count reached hifreebuffers.
 		 */
 		if (numfreebuffers >= hifreebuffers)
 			needsbuffer &=
 			    ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_FREE);
 		else
 			needsbuffer &= ~VFS_BIO_NEED_ANY;
 		wakeup(&needsbuffer);
 	}
 	mtx_unlock(&nblock);
 }
 
 /*
  *	bufcountsub:
  *
  *	Remove bp from the numfreebuffers accounting if it was counted.
  */
 static void
 bufcountsub(struct buf *bp)
 {
 	int prev;
 
 	/*
 	 * Only buffers that are invalid or not delayed-write were counted
 	 * as free; a valid delayed-write buffer needs no fixup.
 	 */
 	if ((bp->b_flags & B_INVAL) == 0 && (bp->b_flags & B_DELWRI) != 0)
 		return;
 
 	KASSERT((bp->b_flags & B_INFREECNT) != 0,
 	    ("buf %p not counted in numfreebuffers", bp));
 	bp->b_flags &= ~B_INFREECNT;
 	prev = atomic_fetchadd_int(&numfreebuffers, -1);
 	KASSERT(prev > 0, ("numfreebuffers dropped to %d", prev - 1));
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace measures the amount of I/O currently running.
  *	This routine is used in async-write situations to keep huge
  *	backlogs of pending writes from building up on a device; only
  *	asynchronous writes are governed by it.
  *
  *	It does NOT turn an async write into a sync write.  It waits for
  *	earlier writes to complete and generally returns before the
  *	caller's own write has reached the device.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	for (;;) {
 		if (runningbufspace <= hirunningspace)
 			break;
 		/* Register as a sleeper, then wait for runningwakeup(). */
 		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  If the buffer is marked B_CACHE
  *	but the range of page m starting at the in-page offset of
  *	(foff + off) and spanning size bytes is not valid, B_CACHE is
  *	cleared.  The page's object must be locked.
  */
 static __inline
 void
 vfs_buf_test_cache(struct buf *bp,
 		  vm_ooffset_t foff, vm_offset_t off, vm_offset_t size,
 		  vm_page_t m)
 {
 	int pgoff;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	/* Nothing to invalidate when the buffer is not cached. */
 	if ((bp->b_flags & B_CACHE) == 0)
 		return;
 	pgoff = (foff + off) & PAGE_MASK;
 	if (vm_page_is_valid(m, pgoff, size) == 0)
 		bp->b_flags &= ~B_CACHE;
 }
 
 /*
  * Wake up the buffer daemon unless a request is already pending
  * (bd_request != 0).  Serialized by bdlock.
  */
 static __inline void
 bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
 	if (bd_request != 0) {
 		/* A request is already posted. */
 		mtx_unlock(&bdlock);
 		return;
 	}
 	bd_request = 1;
 	wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 /*
  * bd_speedup - speed up the buffer cache flushing code.
  *
  * Posts both a speedup request and a regular request for the buffer
  * daemon, and wakes it if either of the two was not already pending.
  */
 void
 bd_speedup(void)
 {
 	int wake;
 
 	mtx_lock(&bdlock);
 	/* Sample the old state before overwriting both flags. */
 	wake = (bd_speedupreq == 0 || bd_request == 0);
 	bd_speedupreq = 1;
 	bd_request = 1;
 	if (wake)
 		wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 /*
  * Denominator of the fraction of the buffer map that is handed to the
  * transient bio map when the buffer map has to be clipped: 1/5 (20%) on
  * i386, 1/10 (10%) elsewhere.
  */
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
 #define	TRANSIENT_DENOM 10
 #endif
 
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more than once.  We CANNOT write to the memory area
  * being reserved at this time.
  *
  * v is the start of the area being carved up and physmem_est is the
  * estimated amount of physical memory in pages.  The function sizes
  * nbuf (unless it is already non-zero), bio_transient_maxcnt (unless it
  * is already non-zero or unmapped buffers are not allowed) and nswbuf,
  * points swbuf and buf into the area, and returns the address just past
  * the nswbuf + nbuf buffer headers.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
 	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 		/* Remember that nbuf was auto-sized rather than preset. */
 		tuned_nbuf = 1;
 	} else
 		tuned_nbuf = 0;
 
 	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
 	maxbuf = (LONG_MAX / 3) / BKVASIZE;
 	if (nbuf > maxbuf) {
 		/* Only warn when a preset nbuf is being overridden. */
 		if (!tuned_nbuf)
 			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 			    maxbuf);
 		nbuf = maxbuf;
 	}
 
 	/*
 	 * Ideal allocation size for the transient bio submap is 10%
 	 * of the maximal space buffer map.  This roughly corresponds
 	 * to the amount of the buffer mapped for typical UFS load.
 	 *
 	 * Clip the buffer map to reserve space for the transient
 	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
 	 * maximum buffer map extent on the platform.
 	 *
 	 * The fall-back to the maxbuf in case of maxbcache unset,
 	 * allows to not trim the buffer KVA for the architectures
 	 * with ample KVA space.
 	 */
 	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 		buf_sz = (long)nbuf * BKVASIZE;
 		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 		    (TRANSIENT_DENOM - 1)) {
 			/*
 			 * There is more KVA than memory.  Do not
 			 * adjust buffer map size, and assign the rest
 			 * of maxbuf to transient map.
 			 */
 			biotmap_sz = maxbuf_sz - buf_sz;
 		} else {
 			/*
 			 * Buffer map spans all KVA we could afford on
 			 * this platform.  Give 10% (20% on i386) of
 			 * the buffer map to the transient bio map.
 			 */
 			biotmap_sz = buf_sz / TRANSIENT_DENOM;
 			buf_sz -= biotmap_sz;
 		}
 		/* Clamp so that the count fits in an int. */
 		if (biotmap_sz / INT_MAX > MAXPHYS)
 			bio_transient_maxcnt = INT_MAX;
 		else
 			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 		/*
 		 * Artificially limit to 1024 simultaneous in-flight I/Os
 		 * using the transient mapping.
 		 */
 		if (bio_transient_maxcnt > 1024)
 			bio_transient_maxcnt = 1024;
 		/* An auto-sized nbuf follows the (possibly clipped) map size. */
 		if (tuned_nbuf)
 			nbuf = buf_sz / BKVASIZE;
 	}
 
 	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * We have no less than 16 and no more than 256.
 	 */
 	nswbuf = max(min(nbuf/4, 256), 16);
 #ifdef NSWBUF_MIN
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
 #endif
 
 	/*
 	 * Reserve space for the buffer cache buffers: nswbuf swap buffer
 	 * headers first, followed by nbuf buffer headers.
 	 */
 	swbuf = (void *)v;
 	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /*
  * Initialize the buffer subsystem.  Called before use of any buffers.
  *
  * Sets up the file's mutexes and free queues, places all nbuf buffer
  * headers on QUEUE_EMPTY, derives the space and count thresholds from
  * nbuf, and allocates bogus_page and unmapped_buf.
  */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
 	mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		/* Every header starts out invalid and counted as free. */
 		bp->b_flags = B_INVAL | B_INFREECNT;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_xflags = 0;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 #ifdef INVARIANTS
 		bq_len[QUEUE_EMPTY]++;
 #endif
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are 
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by buf_daemon.  hibufspace is the nominal maximum
 	 * used by most other processes.  The differential is required to 
 	 * ensure that buf_daemon is able to run when other processes might 
 	 * be blocked waiting for buffer space.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
 	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
 	lobufspace = hibufspace - MAXBSIZE;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
 	 * arbitrarily and may need further tuning. It corresponds to
 	 * 128 outstanding write IO requests (if IO size is 128 KiB),
 	 * which fits with many RAID controllers' tagged queuing limits.
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 */
 	hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
 	    16 * 1024 * 1024), 1024 * 1024);
 	lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
 
 /*
  * Limit the amount of malloc memory since it is wired permanently into
  * the kernel space.  Even though this is accounted for in the buffer
  * allocation, we don't want the malloced region to grow uncontrolled.
  * The malloc scheme improves memory utilization significantly on average
  * (small) directories.
  */
 	maxbufmallocspace = hibufspace / 20;
 
 /*
  * Reduce the chance of a deadlock occurring by limiting the number
  * of delayed-write dirty buffers we allow to stack up.
  */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	numdirtybuffers = 0;
 /*
  * To support extreme low-memory systems, make sure hidirtybuffers cannot
  * eat up all available buffer space.  This occurs when our minimum cannot
  * be met.  We try to size hidirtybuffers to 3/4 our buffer space assuming
  * BKVASIZE'd buffers.
  */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 /*
  * Try to keep the number of free buffers in the specified range,
  * and give special processes (e.g. like buf_daemon) access to an 
  * emergency reserve.
  */
 	lofreebuffers = nbuf / 18 + 5;
 	hifreebuffers = 2 * lofreebuffers;
 	/* All headers were queued as free in the loop above. */
 	numfreebuffers = nbuf;
 
 	/*
 	 * NOTE(review): the vm_page_alloc() result is stored without a
 	 * NULL check here.
 	 */
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 	/* KVA range whose address marks unmapped buffers. */
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 }
 
 #ifdef INVARIANTS
 /*
  * Assert that bp is a mapped buffer: B_UNMAPPED is clear and neither
  * b_kvabase nor b_data still holds the unmapped_buf placeholder.
  */
 static inline void
 vfs_buf_check_mapped(struct buf *bp)
 {
 
 	KASSERT((bp->b_flags & B_UNMAPPED) == 0,
 	    ("mapped buf %p %x", bp, bp->b_flags));
 	KASSERT(bp->b_kvabase != unmapped_buf,
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(bp->b_data != unmapped_buf,
 	    ("mapped buf: b_data was not updated %p", bp));
 }
 
 /*
  * Assert that bp is an unmapped buffer: B_UNMAPPED is set and both
  * b_kvabase and b_data hold the unmapped_buf placeholder.
  */
 static inline void
 vfs_buf_check_unmapped(struct buf *bp)
 {
 
 	KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
 	    ("unmapped buf %p %x", bp, bp->b_flags));
 	KASSERT(bp->b_kvabase == unmapped_buf,
 	    ("unmapped buf: corrupted b_kvabase %p", bp));
 	KASSERT(bp->b_data == unmapped_buf,
 	    ("unmapped buf: corrupted b_data %p", bp));
 }
 
 /* The checks are real under INVARIANTS and expand to nothing otherwise. */
 #define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
 #define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
 #else
 #define	BUF_CHECK_MAPPED(bp) do {} while (0)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
 static void
 bpmap_qenter(struct buf *bp)
 {
 
 	BUF_CHECK_MAPPED(bp);
 
 	/*
 	 * bp->b_data is relative to bp->b_offset, but
 	 * bp->b_offset may be offset into the first page.
 	 */
 	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 /*
  * bfreekva() - release the kva allocation of a buffer.
  *
  *	A buffer with b_kvasize == 0 is left alone.  Otherwise the
  *	bufspace accounting is reduced, the KVA is returned to
  *	buffer_arena and, because buffer space has been freed,
  *	bufspacewakeup() is called.
  */
 static void
 bfreekva(struct buf *bp)
 {
 
 	if (bp->b_kvasize == 0)
 		return;
 
 	atomic_add_int(&buffreekvacnt, 1);
 	atomic_subtract_long(&bufspace, bp->b_kvasize);
 	if ((bp->b_flags & B_UNMAPPED) != 0) {
 		BUF_CHECK_UNMAPPED(bp);
 		/* Only B_KVAALLOC buffers have KVA (b_kvaalloc) to free. */
 		if ((bp->b_flags & B_KVAALLOC) != 0) {
 			vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
 			    bp->b_kvasize);
 		}
 		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
 		bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
 	} else {
 		BUF_CHECK_MAPPED(bp);
 		vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
 		    bp->b_kvasize);
 	}
 	bp->b_kvasize = 0;
 	bufspacewakeup();
 }
 
 /*
  *	binsfree:
  *
  *	Insert the buffer into the free list selected by qindex.  The
  *	buffer must be exclusively locked and, once any delayed removal
  *	has been processed, must not be on a queue.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
 	struct mtx *olock, *nlock;
 
 	BUF_ASSERT_XLOCKED(bp);
 
 	/* Lock of the queue bp is recorded on, and of the target queue. */
 	olock = bqlock(bp->b_qindex);
 	nlock = bqlock(qindex);
 	mtx_lock(olock);
 	/* Handle delayed bremfree() processing. */
 	if (bp->b_flags & B_REMFREE)
 		bremfreel(bp);
 
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("binsfree: free buffer onto another queue???");
 
 	bp->b_qindex = qindex;
 	/* Switch locks only when source and target queues use different ones. */
 	if (olock != nlock) {
 		mtx_unlock(olock);
 		mtx_lock(nlock);
 	}
 	/* B_AGE buffers go to the head of the queue, all others to the tail. */
 	if (bp->b_flags & B_AGE)
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 	else
 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	bq_len[bp->b_qindex]++;
 #endif
 	mtx_unlock(nlock);
 
 	/*
 	 * Something we can maybe free or reuse.
 	 */
 	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
 		bufspacewakeup();
 
 	/* Same condition as in bufcountsub(): invalid or not delayed-write. */
 	if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
 		bufcountadd(bp);
 }
 
 /*
  *	bremfree:
  *
  *	Mark the buffer for removal from the appropriate free list.
  *	The buffer stays linked on its queue; only B_REMFREE is set and
  *	the free-buffer accounting is adjusted here.  The actual unlink
  *	is performed later by bremfreel().  The buffer must be exclusively
  *	locked, be on a queue, and not already be marked.
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT((bp->b_flags & B_REMFREE) == 0,
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfree: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags |= B_REMFREE;
 	/* Accounting is done now, so bremfreel() skips it for B_REMFREE. */
 	bufcountsub(bp);
 }
 
 /*
  *	bremfreef:
  *
  *	Force an immediate removal from a free list.  Used only in nfs when
  *	it abuses the b_freelist pointer.
  */
 void
 bremfreef(struct buf *bp)
 {
 	struct mtx *held;
 
 	/*
 	 * Look the lock up once: bremfreel() resets b_qindex to
 	 * QUEUE_NONE, so it cannot be derived again for the unlock.
 	 */
 	held = bqlock(bp->b_qindex);
 	mtx_lock(held);
 	bremfreel(bp);
 	mtx_unlock(held);
 }
 
 /*
  *	bremfreel:
  *
  *	Unlink a buffer from the free list it is on and mark it as being
  *	on no queue.  The caller must hold the lock of that queue and the
  *	buffer must be exclusively locked.
  */
 static void
 bremfreel(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfreel: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
 
 	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
 	    bp->b_qindex));
 	bq_len[bp->b_qindex]--;
 #endif
 	bp->b_qindex = QUEUE_NONE;
 
 	if ((bp->b_flags & B_REMFREE) != 0) {
 		/*
 		 * Completion of a delayed bremfree(): the statistics were
 		 * already adjusted there, so only the mark is cleared.
 		 */
 		bp->b_flags &= ~B_REMFREE;
 	} else {
 		bufcountsub(bp);
 	}
 }
 
 /*
  * Start asynchronous reads for the given read-ahead blocks.  Blocks that
  * are already in memory are skipped, and a buffer that comes back from
  * getblk() with B_CACHE set is valid and is simply released.  For all
  * others BIO_ERROR and B_INVAL are cleared before the I/O is issued.
  */
 void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
     int cnt, struct ucred * cred)
 {
 	struct buf *bp;
 	int idx;
 
 	for (idx = 0; idx < cnt; idx++) {
 		if (inmem(vp, rablkno[idx]))
 			continue;
 		bp = getblk(vp, rablkno[idx], rabsize[idx], 0, 0, 0);
 
 		/* Already valid, there is nothing to read. */
 		if ((bp->b_flags & B_CACHE) != 0) {
 			brelse(bp);
 			continue;
 		}
 
 		if (!TD_IS_IDLETHREAD(curthread))
 			curthread->td_ru.ru_inblock++;
 		bp->b_flags |= B_ASYNC;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		bp->b_iocmd = BIO_READ;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		BUF_KERNPROC(bp);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 	}
 }
 
 /*
  * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Obtain a buffer holding the requested block, consulting the cache
  * first.  When the buffer is not B_CACHE a read is started, after
  * BIO_ERROR and B_INVAL have been cleared; see getblk().  Read-ahead is
  * started for the rablkno[] blocks in either case, and only then is the
  * primary read waited for.
  *
  * The buffer is returned through *bpp.  The return value is EBUSY when
  * getblk() yields no buffer, the result of bufwait() when a read was
  * started, and 0 otherwise.
  */
 int
 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
     int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	int error, started;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	/*
 	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
 	 */
 	bp = getblk(vp, blkno, size, 0, 0, flags);
 	*bpp = bp;
 	if (bp == NULL)
 		return (EBUSY);
 
 	/* Not in the cache: issue the read for the primary block. */
 	started = 0;
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (!TD_IS_IDLETHREAD(curthread))
 			curthread->td_ru.ru_inblock++;
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		started = 1;
 	}
 
 	breada(vp, rablkno, rabsize, cnt, cred);
 
 	error = 0;
 	if (started)
 		error = bufwait(bp);
 	return (error);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here 
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	/* Invalid buffers are released without being written. */
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (bp->b_flags & B_BARRIER)
 		barrierwrites++;
 
 	/*
 	 * Snapshot the flags.  B_ASYNC is tested on this copy once the
 	 * I/O has been started, rather than on bp->b_flags.
 	 * NOTE(review): presumably because an async buffer may no longer
 	 * be ours to inspect after bstrategy() -- confirm.
 	 */
 	oldflags = bp->b_flags;
 
 	BUF_ASSERT_HELD(bp);
 
 	/* bunpin_wait() is called while the buffer has a pin count. */
 	if (bp->b_pin_count > 0)
 		bunpin_wait(bp);
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	/*
 	 * Remember whether the vnode (if any) is flagged VV_MD; this is
 	 * used at the end to skip waitrunningbufspace().
 	 */
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/*
 	 * Mark the buffer clean.  Increment the bufobj write count
 	 * before bundirty() call, to prevent other thread from seeing
 	 * empty dirty list and zero counter for writes in progress,
 	 * falsely indicating that the bufobj is clean.
 	 */
 	bufobj_wref(bp->b_bufobj);
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes
 	 *
 	 * atomic_fetchadd_long() returns the value runningbufspace had
 	 * before this buffer was added; that is what gets compared
 	 * against hirunningspace below.
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 	if (!TD_IS_IDLETHREAD(curthread))
 		curthread->td_ru.ru_oublock++;
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		/*
 		 * Synchronous write: wait for completion, release the
 		 * buffer and return the result of bufwait().
 		 */
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	/* Asynchronous writes always report success to the caller. */
 	return (0);
 }
 
 /*
  * Called with the bufobj and the buffer about to be dirtied (bdwrite()
  * invokes it through BO_BDFLUSH()).  Depending on how many dirty
  * buffers the bufobj holds:
  *  - more than dirtybufthresh + 10: VOP_FSYNC() the buffer's vnode with
  *    MNT_NOWAIT;
  *  - more than dirtybufthresh: try to start a write on one other dirty
  *    buffer of the same bufobj;
  *  - otherwise: do nothing.
  */
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			/*
 			 * Skip buffers with a background write in progress
 			 * and buffers whose lock cannot be taken without
 			 * waiting.
 			 */
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				/* Retake the bo lock before moving on. */
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			/* One write started; the bo lock is not held here. */
 			break;
 		}
 		/*
 		 * nbp is NULL only when the loop ran off the end of the
 		 * list, in which case the bo lock is still held.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write: the buffer is marked dirty and requeued instead of
  * having I/O started on it.  Buffers marked invalid are not written at
  * all, they are released through brelse().
  *
  * The buffer must be completely valid, so B_CACHE can safely be set.
  * It has to be set here rather than in biodone() to keep getblk() from
  * writing the buffer out synchronously.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	struct thread *td;
 	struct vnode *vp;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_BARRIER) == 0,
 	    ("Barrier request in delayed write %p", bp));
 	BUF_ASSERT_HELD(bp);
 
 	if ((bp->b_flags & B_INVAL) != 0) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * Do not pile up more dirty buffers when there are too many
 	 * already: BO_BDFLUSH() forces a complete cleanup when we are far
 	 * over the limit and otherwise just keeps things from getting out
 	 * of control.  TDP_INBDFLUSH guards against recursing into the
 	 * cleanup from within our own cleanup.
 	 */
 	td = curthread;
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS | TDP_INBDFLUSH)) != 0) {
 		recursiveflushes++;
 	} else {
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	}
 
 	bdirty(bp);
 	/*
 	 * The buffer is fully valid, which B_CACHE records.  This holds
 	 * even for NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * Do the bmap now so that it is not needed later, for instance
 	 * while the system is attempting a sync.  The indirect block, or
 	 * whatever other data structure the filesystem needs, is likely
 	 * still in memory at this point.  Also, if the pageout daemon is
 	 * the one requesting the sync there might not be enough memory
 	 * left to do the bmap then, so this is important to do.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno)
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 
 	/*
 	 * Derive the buffer's *dirty* range from the VM system's dirty
 	 * pages and mark the buffer pages clean.  The vnode_pager and the
 	 * pageout daemon need to see the pages as "cleaned"; since they
 	 * sit in a delayed write buffer the VFS layer "will" get them
 	 * written on the next sync, or the cluster may be completed.
 	 */
 	vfs_clean_pages_dirty_buf(bp);
 	bqrelse(bp);
 
 	/*
 	 * note: I/O cannot be initiated from a bdwrite even if we wanted
 	 * to, due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Turn the buffer into a delayed write request: B_RELBUF is cleared,
  *	the I/O command becomes BIO_WRITE and, unless it is set already,
  *	B_DELWRI is set.  When B_DELWRI is newly set the buffer is
  *	reassigned to itself, to properly update it in the dirty/clean
  *	lists, and bdirtyadd() is called.
  *
  *	bdirty() is kinda like bdwrite(), but unlike bwrite/bdwrite it
  *	should only be called if the buffer is known-good.
  *
  *	Since the buffer is not on a queue, we do not update the
  *	numfreebuffers count.
  *
  *	The buffer must be on QUEUE_NONE or be marked B_REMFREE.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	bp->b_flags &= ~(B_RELBUF);
 	bp->b_iocmd = BIO_WRITE;
 
 	/* Already a delayed write, the lists are up to date. */
 	if ((bp->b_flags & B_DELWRI) != 0)
 		return;
 
 	/* XXX B_DONE is not set along with B_DELWRI. */
 	bp->b_flags |= B_DELWRI;
 	reassignbuf(bp);
 	bdirtyadd();
 }
 
 /*
  *	bundirty:
  *
  *	Take the buffer out of the delayed write state.  If B_DELWRI is
  *	set it is cleared, the buffer is reassigned to itself and
  *	bdirtysub() is called.  B_DEFERRED is cleared in every case.
  *
  *	Since the buffer is not on a queue, we do not update the
  *	numfreebuffers count.
  *
  *	The buffer must be on QUEUE_NONE or be marked B_REMFREE.
  */
 
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	if ((bp->b_flags & B_DELWRI) != 0) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		bdirtysub();
 	}
 
 	/* The buffer is being written now, so it is no longer deferred. */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Asynchronous write: output is started on the buffer without
  *	waiting for it to finish.  The buffer is released when the output
  *	completes.
  *
  *	Dealing with B_INVAL buffers is left to bwrite() ( or the VOP
  *	routine anyway ), it is not done here.
  */
 void
 bawrite(struct buf *bp)
 {
 
 	/* The result of bwrite() is deliberately ignored. */
 	bp->b_flags = bp->b_flags | B_ASYNC;
 	(void)bwrite(bp);
 }
 
 /*
  *	babarrierwrite:
  *
  *	Asynchronous barrier write: output is started on the buffer
  *	without waiting for it to finish.  A write barrier is placed
  *	after this write, so that this buffer and all buffers written
  *	before it are committed to the disk before any buffer written
  *	after it.  The buffer is released when the output completes.
  */
 void
 babarrierwrite(struct buf *bp)
 {
 
 	/* The result of bwrite() is deliberately ignored. */
 	bp->b_flags = bp->b_flags | B_ASYNC | B_BARRIER;
 	(void)bwrite(bp);
 }
 
 /*
  *	bbarrierwrite:
  *
  *	Synchronous barrier write.  Start output on a buffer and wait for
  *	it to complete.  Place a write barrier after this write so that
  *	this buffer and all buffers written before it are committed to 
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 int
 bbarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER;
 	return (bwrite(bp));
 }
 
 /*
  *	bwillwrite:
  *
  *	To be called before any vnodes are locked when a write is
  *	expected.  Sleeps for as long as the number of dirty buffers is at
  *	or above hidirtybuffers, so that the buffer cache is not starved
  *	by too many dirty buffers.  Blocking before any vnode is locked is
  *	an attempt to avoid the situation where a locked vnode keeps the
  *	various system daemons from flushing related buffers.
  */
 void
 bwillwrite(void)
 {
 
 	/* Unlocked check first: usually there is nothing to wait for. */
 	if (numdirtybuffers < hidirtybuffers)
 		return;
 
 	mtx_lock(&bdirtylock);
 	while (numdirtybuffers >= hidirtybuffers) {
 		bdirtywait = 1;
 		msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4), "flswai", 0);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  * Tell whether there are too many dirty buffers: returns 1 when
  * numdirtybuffers has reached hidirtybuffers and 0 otherwise.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	if (numdirtybuffers >= hidirtybuffers)
 		return (1);
 	return (0);
 }
 
 /*
  * Wrapper around vm_page_count_severe() used by the buffer release
  * paths.  The buf_pressure fail point (under DEBUG_FP) carries a
  * "return 1" action, which allows the severe condition to be simulated.
  */
 static __noinline int
 buf_vm_page_count_severe(void)
 {
 	int severe;
 
 	KFAIL_POINT_CODE(DEBUG_FP, buf_pressure, return 1);
 
 	severe = vm_page_count_severe();
 	return (severe);
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  */
 void
 brelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
 		 * Do not process, in particular, do not handle the
 		 * B_INVAL/B_RELBUF and do not release to free list.
 		 */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	/*
 	 * Managed buffers are handed to bqrelse(), which does not place
 	 * them on a free queue.
 	 */
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  Must clear BIO_ERROR to prevent
 		 * pages from being scrapped.  If the error is anything
 		 * other than an I/O error (EIO), assume that retrying
 		 * is futile.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed I/O or we were asked to free or not
 		 * cache the buffer.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI)
 			bdirtysub();
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		/* Non-VMIO buffers give up their storage and vnode here. */
 		if ((bp->b_flags & B_VMIO) == 0) {
 			if (bp->b_bufsize)
 				allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_release()
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 *
 	 * We still allow the B_INVAL case to call vfs_vmio_release(), even
 	 * if B_DELWRI is set.
 	 *
 	 * If B_DELWRI is not set we may have to set B_RELBUF if we are low
 	 * on pages to return pages to the VM page queues.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 	else if (buf_vm_page_count_severe()) {
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if (!(bp->b_vflags & BV_BKGRDINPROG))
 			bp->b_flags |= B_RELBUF;
 	}
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags effect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 	if ((bp->b_flags & B_VMIO)
 	    && !(bp->b_vp->v_mount != NULL &&
 		 (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 		 !vn_isdisk(bp->b_vp, NULL) &&
 		 (bp->b_flags & B_DELWRI))
 	    ) {
 
 		int i, j, resid;
 		vm_page_t m;
 		off_t foff;
 		vm_pindex_t poff;
 		vm_object_t obj;
 
 		obj = bp->b_bufobj->bo_object;
 
 		/*
 		 * Get the base offset and length of the buffer.  Note that
 		 * in the VMIO case if the buffer block size is not
 		 * page-aligned then b_data pointer may not be page-aligned.
 		 * But our b_pages[] array *IS* page aligned.
 		 *
 		 * block sizes less then DEV_BSIZE (usually 512) are not
 		 * supported due to the page granularity bits (m->valid,
 		 * m->dirty, etc...).
 		 *
 		 * See man buf(9) for more information
 		 */
 		resid = bp->b_bufsize;
 		foff = bp->b_offset;
 		for (i = 0; i < bp->b_npages; i++) {
 			int had_bogus = 0;
 
 			m = bp->b_pages[i];
 
 			/*
 			 * If we hit a bogus page, fixup *all* the bogus pages
 			 * now.
 			 */
 			if (m == bogus_page) {
 				poff = OFF_TO_IDX(bp->b_offset);
 				had_bogus = 1;
 
 				/*
 				 * Replace each bogus_page entry from index i
 				 * on with the page found in the object at
 				 * the matching index.
 				 */
 				VM_OBJECT_RLOCK(obj);
 				for (j = i; j < bp->b_npages; j++) {
 					vm_page_t mtmp;
 					mtmp = bp->b_pages[j];
 					if (mtmp == bogus_page) {
 						mtmp = vm_page_lookup(obj, poff + j);
 						if (!mtmp) {
 							panic("brelse: page missing\n");
 						}
 						bp->b_pages[j] = mtmp;
 					}
 				}
 				VM_OBJECT_RUNLOCK(obj);
 
 				/* Re-enter the mapping for mapped, valid bufs. */
 				if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
 					BUF_CHECK_MAPPED(bp);
 					pmap_qenter(
 					    trunc_page((vm_offset_t)bp->b_data),
 					    bp->b_pages, bp->b_npages);
 				}
 				m = bp->b_pages[i];
 			}
 			if ((bp->b_flags & B_NOCACHE) ||
 			    (bp->b_ioflags & BIO_ERROR &&
 			     bp->b_iocmd == BIO_READ)) {
 				/*
 				 * poffset is the buffer's offset within this
 				 * page, presid the number of buffer bytes
 				 * that fall into it.
 				 */
 				int poffset = foff & PAGE_MASK;
 				int presid = resid > (PAGE_SIZE - poffset) ?
 					(PAGE_SIZE - poffset) : resid;
 
 				KASSERT(presid >= 0, ("brelse: extra page"));
 				VM_OBJECT_WLOCK(obj);
 				/*
 				 * Sleep, with the object lock dropped, for as
 				 * long as the page is exclusively busied.
 				 */
 				while (vm_page_xbusied(m)) {
 					vm_page_lock(m);
 					VM_OBJECT_WUNLOCK(obj);
 					vm_page_busy_sleep(m, "mbncsh");
 					VM_OBJECT_WLOCK(obj);
 				}
 				if (pmap_page_wired_mappings(m) == 0)
 					vm_page_set_invalid(m, poffset, presid);
 				VM_OBJECT_WUNLOCK(obj);
 				if (had_bogus)
 					printf("avoided corruption bug in bogus_page/brelse code\n");
 			}
 			/* Advance to the start of the next page. */
 			resid -= PAGE_SIZE - (foff & PAGE_MASK);
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		}
 		if (bp->b_flags & (B_INVAL | B_RELBUF))
 			vfs_vmio_release(bp);
 
 	} else if (bp->b_flags & B_VMIO) {
 
 		if (bp->b_flags & (B_INVAL | B_RELBUF)) {
 			vfs_vmio_release(bp);
 		}
 
 	} else if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) {
 		if (bp->b_bufsize != 0)
 			allocbuf(bp, 0);
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 			
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and diassociate the vnode so that gbincore()
 	 * doesn't find it.
 	 */
 	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
 	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
 		bp->b_flags |= B_INVAL;
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
 	/*
 	 * Pick the destination queue.
 	 */
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 1");
 		if (bp->b_kvasize)
 			qindex = QUEUE_EMPTYKVA;
 		else
 			qindex = QUEUE_EMPTY;
 		bp->b_flags |= B_AGE;
 	/* buffers with junk contents */
 	} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		qindex = QUEUE_CLEAN;
 		bp->b_flags |= B_AGE;
 	/* remaining buffers */
 	} else if (bp->b_flags & B_DELWRI)
 		qindex = QUEUE_DIRTY;
 	else
 		qindex = QUEUE_CLEAN;
 
 	binsfree(bp, qindex);
 
 	/* The per-release hint flags are cleared once the buffer is queued. */
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
 
 /*
  * Put a buffer back on the appropriate queue without trying to free it;
  * the expectation is that it will be used again soon.
  *
  * bdwrite() uses bqrelse() to requeue a delayed write and biodone() uses
  * it to requeue an async I/O on completion.  It is also the call to use
  * when known good buffers have to be requeued but the data will likely
  * be needed again soon.
  *
  * B_MANAGED buffers are not placed on any queue; at most a pending
  * delayed removal is carried out for them.
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/* Only drop one level of the lock, keep off the queues. */
 		BUF_UNLOCK(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
 	if ((bp->b_flags & B_MANAGED) != 0) {
 		if ((bp->b_flags & B_REMFREE) != 0)
 			bremfreef(bp);
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	/* buffers with stale but valid contents */
 	if ((bp->b_flags & B_DELWRI) != 0) {
 		qindex = QUEUE_DIRTY;
 	} else {
 		/* Not a delayed write, so it must not be on a dirty list. */
 		if ((bp->b_xflags & BX_VNDIRTY) != 0)
 			panic("bqrelse: not dirty");
 		/*
 		 * When memory is too low the buffer has to be freed
 		 * *now*, most importantly the wired pages that make up
 		 * its backing store, so it goes through brelse() instead.
 		 *
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  A race to clear it is tolerated here.
 		 */
 		if (buf_vm_page_count_severe() &&
 		    (bp->b_vflags & BV_BKGRDINPROG) == 0) {
 			brelse(bp);
 			return;
 		}
 		qindex = QUEUE_CLEAN;
 	}
 	binsfree(bp, qindex);
 
 	/* unlock */
 	BUF_UNLOCK(bp);
 }
 
 /*
  * Give pages used by the bp back to the VM system (where possible).
  *
  * The buffer's kernel mapping is removed unless the buffer is
  * B_UNMAPPED.  Every page in b_pages[] is unwired and its slot is
  * cleared.  Afterwards b_bufsize and b_npages are reset to zero,
  * B_VMIO is cleared and a vnode association, if any, is dropped
  * through brelvp().
  */
 static void
 vfs_vmio_release(struct buf *bp)
 {
 	int i;
 	vm_page_t m;
 
 	if ((bp->b_flags & B_UNMAPPED) == 0) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 	/* The object stays write-locked for the whole page walk. */
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		bp->b_pages[i] = NULL;
 		/*
 		 * In order to keep page LRU ordering consistent, put
 		 * everything on the inactive queue.
 		 */
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 
 		/*
 		 * Might as well free the page if we can and it has
 		 * no valid data.  We also free the page if the
 		 * buffer was used for direct I/O
 		 *
 		 * The first case only applies to buffers without B_ASYNC
 		 * and requires the page to be neither wired nor busied.
 		 * Under severe page shortage vm_page_try_to_cache() is
 		 * tried for the remaining pages.
 		 */
 		if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
 			if (m->wire_count == 0 && !vm_page_busied(m))
 				vm_page_free(m);
 		} else if (bp->b_flags & B_DIRECT)
 			vm_page_try_to_free(m);
 		else if (buf_vm_page_count_severe())
 			vm_page_try_to_cache(m);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	
 	/* The buffer no longer holds any space; notify via bufspacewakeup(). */
 	if (bp->b_bufsize) {
 		bufspacewakeup();
 		bp->b_bufsize = 0;
 	}
 	bp->b_npages = 0;
 	bp->b_flags &= ~B_VMIO;
 	if (bp->b_vp)
 		brelvp(bp);
 }
 
 /*
  * Decide whether the block at logical block number lblkno of vp can take
  * part in a clustered write.  Returns 1 if it can and 0 otherwise.
  *
  * The block qualifies when its buffer is in core, can be locked without
  * waiting, is a valid clusterable delayed write of exactly "size" bytes,
  * has been mapped, and sits at disk block blkno.  The buffer is unlocked
  * again before returning.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *cand;
 	int match;
 
 	/* If the buf isn't in core skip it */
 	cand = gbincore(&vp->v_bufobj, lblkno);
 	if (cand == NULL)
 		return (0);
 
 	/* If the buf is busy we don't want to wait for it */
 	if (BUF_LOCK(cand, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	/*
 	 * Only valid clusterable delayed write buffers of the right size
 	 * are considered, and only if the block has been mapped
 	 * (b_blkno differs from b_lblkno) and is in the expected place
 	 * on disk.
 	 */
 	match = 0;
 	if ((cand->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 	    (B_DELWRI | B_CLUSTEROK) &&
 	    cand->b_bufsize == size &&
 	    cand->b_blkno != cand->b_lblkno &&
 	    cand->b_blkno == blkno)
 		match = 1;
 
 	BUF_UNLOCK(cand);
 	return (match);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  *	This is much better then the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the 
  *	correct order, so we search for the cluster in both directions.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 	int gbflags;
 
 	bo = &vp->v_bufobj;
 	/* An unmapped buffer asks for an unmapped cluster as well. */
 	gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather then at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 
 		/* maxcl is the number of f_iosize blocks fitting in MAXPHYS. */
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		BO_RLOCK(bo);
 		/*
 		 * Scan forward.  i starts at 1 to account for bp itself, so
 		 * on exit it is bp plus the matching blocks that follow it.
 		 */
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		/*
 		 * Scan backward, without going below logical block 0 and
 		 * without letting the total exceed maxcl.
 		 */
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 		BO_RUNLOCK(bo);
 		/*
 		 * The loop leaves j one past the last matching block, so
 		 * after the decrement it is the count of matching blocks
 		 * that precede bp.
 		 */
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 *
 		 * bp is unlocked and cluster_wbuild() is asked to write
 		 * ncl blocks starting at lblkno - j; its result is
 		 * returned.
 		 */
 		if (ncl != 1) {
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
 			    gbflags);
 			return (nwritten);
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return (nwritten);
 }
 
 /*
  * Record KVA of maxsize bytes at addr in the buffer and set b_kvasize.
  *
  * Without GB_UNMAPPED the address becomes b_kvabase.  With both
  * GB_UNMAPPED and GB_KVAALLOC it becomes b_kvaalloc, the buffer is
  * flagged B_UNMAPPED | B_KVAALLOC and maxsize is added to
  * unmapped_bufspace.  With GB_UNMAPPED alone only b_kvasize is set.
  *
  * The buffer must not own any KVA on entry: b_kvasize has to be zero and
  * neither B_UNMAPPED nor B_KVAALLOC may be set.
  */
 static void
 setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
 {
 
 	KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
 	    bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
 	if ((gbflags & GB_UNMAPPED) == 0) {
 		bp->b_kvabase = (caddr_t)addr;
 	} else if ((gbflags & GB_KVAALLOC) != 0) {
 		KASSERT((gbflags & GB_UNMAPPED) != 0,
 		    ("GB_KVAALLOC without GB_UNMAPPED"));
 		bp->b_kvaalloc = (caddr_t)addr;
 		bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
 		/*
 		 * Account for the size being installed.  b_kvasize is
 		 * still zero at this point (see the assertion above), so
 		 * adding it instead of maxsize would leave the counter
 		 * untouched.
 		 */
 		atomic_add_long(&unmapped_bufspace, maxsize);
 	}
 	bp->b_kvasize = maxsize;
 }
 
 /*
  * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
  * needed.
  */
 static int
 allocbufkva(struct buf *bp, int maxsize, int gbflags)
 {
 	vm_offset_t addr;
 
 	bfreekva(bp);
 	addr = 0;
 
 	if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
 		/*
 		 * Buffer map is too fragmented.  Request the caller
 		 * to defragment the map.
 		 */
 		atomic_add_int(&bufdefragcnt, 1);
 		return (1);
 	}
 	setbufkva(bp, addr, maxsize, gbflags);
 	atomic_add_long(&bufspace, bp->b_kvasize);
 	return (0);
 }
 
 /*
  * Ask the bufdaemon for help, or act as bufdaemon itself, when a
  * locked vnode is supplied.
  */
 static void
 getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
     int defrag)
 {
 	struct thread *td;
 	char *waitmsg;
 	int cnt, error, flags, norunbuf, wait;
 
 	/* Entered with bqclean held; it is dropped below and not retaken. */
 	mtx_assert(&bqclean, MA_OWNED);
 
 	/*
 	 * Pick the needsbuffer bits to wait on and the matching sleep
 	 * message.
 	 */
 	if (defrag) {
 		flags = VFS_BIO_NEED_BUFSPACE;
 		waitmsg = "nbufkv";
 	} else if (bufspace >= hibufspace) {
 		waitmsg = "nbufbs";
 		flags = VFS_BIO_NEED_BUFSPACE;
 	} else {
 		waitmsg = "newbuf";
 		flags = VFS_BIO_NEED_ANY;
 	}
 	mtx_lock(&nblock);
 	needsbuffer |= flags;
 	mtx_unlock(&nblock);
 	mtx_unlock(&bqclean);
 
 	bd_speedup();	/* heeeelp */
 	/* GB_NOWAIT_BD callers only kick the daemon and do not sleep. */
 	if ((gbflags & GB_NOWAIT_BD) != 0)
 		return;
 
 	td = curthread;
 	cnt = 0;
 	wait = MNT_NOWAIT;
 	mtx_lock(&nblock);
 	/* Loop until the bits we asked for have been cleared. */
 	while (needsbuffer & flags) {
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
 			mtx_unlock(&nblock);
 
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
 			 * well belong to the vnode.  Flushing the
 			 * buffers there would make a progress that
 			 * cannot be achieved by the buf_daemon, that
 			 * cannot lock the vnode.
 			 */
 			/* Switch to a waiting fsync after a few rounds. */
 			if (cnt++ > 2)
 				wait = MNT_WAIT;
 			ASSERT_VOP_LOCKED(vp, "bufd_helper");
 			/*
 			 * The fsync is only done with the vnode lock held
 			 * exclusively; an upgrade is tried otherwise.
 			 */
 			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
 			    vn_lock(vp, LK_TRYUPGRADE);
 			if (error == 0) {
 				/* play bufdaemon */
 				norunbuf = curthread_pflags_set(TDP_BUFNEED |
 				    TDP_NORUNNINGBUF);
 				VOP_FSYNC(vp, wait, td);
 				atomic_add_long(&notbufdflushes, 1);
 				curthread_pflags_restore(norunbuf);
 			}
 			mtx_lock(&nblock);
 			/* Recheck: the need may have been satisfied meanwhile. */
 			if ((needsbuffer & flags) == 0)
 				break;
 		}
 		/* A non-zero msleep() result ends the wait. */
 		if (msleep(&needsbuffer, &nblock, (PRIBIO + 4) | slpflag,
 		    waitmsg, slptimeo))
 			break;
 	}
 	mtx_unlock(&nblock);
 }
 
 /*
  * Strip a buffer taken from queue qindex of its previous identity so it
  * can be reused: VM pages and the vnode association are released (for
  * QUEUE_CLEAN buffers), credentials are freed, dependencies are
  * deallocated, storage is given back and the bookkeeping fields are
  * reset.  Only the B_UNMAPPED and B_KVAALLOC flags survive.
  * Must be called without bqclean held.
  */
 static void
 getnewbuf_reuse_bp(struct buf *bp, int qindex)
 {
 
 	CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
 	    "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
 	     bp->b_kvasize, bp->b_bufsize, qindex);
 	mtx_assert(&bqclean, MA_NOTOWNED);
 
 	/*
 	 * Note: we no longer distinguish between VMIO and non-VMIO
 	 * buffers.
 	 */
 	KASSERT((bp->b_flags & B_DELWRI) == 0,
 	    ("delwri buffer %p found in queue %d", bp, qindex));
 
 	if (qindex == QUEUE_CLEAN) {
 		if (bp->b_flags & B_VMIO) {
 			/*
 			 * B_ASYNC is cleared first; vfs_vmio_release()
 			 * tests it when deciding whether to free pages.
 			 */
 			bp->b_flags &= ~B_ASYNC;
 			vfs_vmio_release(bp);
 		}
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 
 	/*
 	 * Get the rest of the buffer freed up.  b_kva* is still valid
 	 * after this operation.
 	 */
 
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	if (bp->b_vflags & BV_BKGRDINPROG)
 		panic("losing buffer 3");
 	KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p.  qindex: %d",
 	    bp, bp->b_vp, qindex));
 	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
 
 	if (bp->b_bufsize)
 		allocbuf(bp, 0);
 
 	/* Keep only the KVA-related flags, everything else is dropped. */
 	bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
 	/*
 	 * NOTE(review): b_flags was masked down to B_UNMAPPED | B_KVAALLOC
 	 * just above, so unless B_INFREECNT is one of those bits this
 	 * assertion cannot fire; it may have been meant to precede the
 	 * mask -- confirm.
 	 */
 	KASSERT((bp->b_flags & B_INFREECNT) == 0,
 	    ("buf %p still counted as free?", bp));
 	bp->b_vflags = 0;
 	bp->b_vp = NULL;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_offset = NOOFFSET;
 	bp->b_iodone = 0;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_bufobj = NULL;
 	bp->b_pin_count = 0;
 	bp->b_fsprivate1 = NULL;
 	bp->b_fsprivate2 = NULL;
 	bp->b_fsprivate3 = NULL;
 
 	LIST_INIT(&bp->b_dep);
 }
 
 /*
  * NOTE(review): no function fully visible in this part of the file
  * references flushingbufs; presumably it is state kept by the
  * getnewbuf scan code that follows -- confirm against its users.
  */
 static int flushingbufs;
 
 static struct buf *
 getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
 {
 	struct buf *bp, *nbp;
 	int nqindex, qindex, pass;
 
 	KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
 
 	pass = 1;
 restart:
 	atomic_add_int(&getnewbufrestarts, 1);
 
 	/*
 	 * Setup for scan.  If we do not have enough free buffers,
 	 * we setup a degenerate case that immediately fails.  Note
 	 * that if we are specially marked process, we are allowed to
 	 * dip into our reserves.
 	 *
 	 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
 	 * for the allocation of the mapped buffer.  For unmapped, the
 	 * easiest is to start with EMPTY outright.
 	 *
 	 * We start with EMPTYKVA.  If the list is empty we backup to EMPTY.
 	 * However, there are a number of cases (defragging, reusing, ...)
 	 * where we cannot backup.
 	 */
 	nbp = NULL;
 	mtx_lock(&bqclean);
 	if (!defrag && unmapped) {
 		nqindex = QUEUE_EMPTY;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 	}
 	if (nbp == NULL) {
 		nqindex = QUEUE_EMPTYKVA;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 	}
 
 	/*
 	 * If no EMPTYKVA buffers and we are either defragging or
 	 * reusing, locate a CLEAN buffer to free or reuse.  If
 	 * bufspace useage is low skip this step so we can allocate a
 	 * new buffer.
 	 */
 	if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
 		nqindex = QUEUE_CLEAN;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 	}
 
 	/*
 	 * If we could not find or were not allowed to reuse a CLEAN
 	 * buffer, check to see if it is ok to use an EMPTY buffer.
 	 * We can only use an EMPTY buffer if allocating its KVA would
 	 * not otherwise run us out of buffer space.  No KVA is needed
 	 * for the unmapped allocation.
 	 */
 	if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
 	    metadata)) {
 		nqindex = QUEUE_EMPTY;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
 	}
 
 	/*
 	 * All available buffers might be clean, retry ignoring the
 	 * lobufspace as the last resort.
 	 */
 	if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
 		nqindex = QUEUE_CLEAN;
 		nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 	}
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 	while ((bp = nbp) != NULL) {
 		qindex = nqindex;
 
 		/*
 		 * Calculate next bp (we can only use it if we do not
 		 * block or do other fancy things).
 		 */
 		if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
 			switch (qindex) {
 			case QUEUE_EMPTY:
 				nqindex = QUEUE_EMPTYKVA;
 				nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
 				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_EMPTYKVA:
 				nqindex = QUEUE_CLEAN;
 				nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
 				if (nbp != NULL)
 					break;
 				/* FALLTHROUGH */
 			case QUEUE_CLEAN:
 				if (metadata && pass == 1) {
 					pass = 2;
 					nqindex = QUEUE_EMPTY;
 					nbp = TAILQ_FIRST(
 					    &bufqueues[QUEUE_EMPTY]);
 				}
 				/*
 				 * nbp is NULL. 
 				 */
 				break;
 			}
 		}
 		/*
 		 * If we are defragging then we need a buffer with 
 		 * b_kvasize != 0.  XXX this situation should no longer
 		 * occur, if defrag is non-zero the buffer's b_kvasize
 		 * should also be non-zero at this point.  XXX
 		 */
 		if (defrag && bp->b_kvasize == 0) {
 			printf("Warning: defrag empty buffer %p\n", bp);
 			continue;
 		}
 
 		/*
 		 * Start freeing the bp.  This is somewhat involved.  nbp
 		 * remains valid only for QUEUE_EMPTY[KVA] bp's.
 		 */
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if (bp->b_vflags & BV_BKGRDINPROG) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		KASSERT(bp->b_qindex == qindex,
 		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 
 		bremfreel(bp);
 		mtx_unlock(&bqclean);
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
 
 		getnewbuf_reuse_bp(bp, qindex);
 		mtx_assert(&bqclean, MA_NOTOWNED);
 
 		/*
 		 * If we are defragging then free the buffer.
 		 */
 		if (defrag) {
 			bp->b_flags |= B_INVAL;
 			bfreekva(bp);
 			brelse(bp);
 			defrag = 0;
 			goto restart;
 		}
 
 		/*
 		 * Notify any waiters for the buffer lock about
 		 * identity change by freeing the buffer.
 		 */
 		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp)) {
 			bp->b_flags |= B_INVAL;
 			bfreekva(bp);
 			brelse(bp);
 			goto restart;
 		}
 
 		if (metadata)
 			break;
 
 		/*
 		 * If we are overcomitted then recover the buffer and its
 		 * KVM space.  This occurs in rare situations when multiple
 		 * processes are blocked in getnewbuf() or allocbuf().
 		 */
 		if (bufspace >= hibufspace)
 			flushingbufs = 1;
 		if (flushingbufs && bp->b_kvasize != 0) {
 			bp->b_flags |= B_INVAL;
 			bfreekva(bp);
 			brelse(bp);
 			goto restart;
 		}
 		if (bufspace < lobufspace)
 			flushingbufs = 0;
 		break;
 	}
 	return (bp);
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	Important:  B_INVAL is not set.  If the caller wishes to throw the
  *	buffer away, the caller must set B_INVAL prior to calling brelse().
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_arena is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	vp selects metadata treatment: a NULL vp, a vnode flagged VV_MD or
  *	VV_SYSTEM, or a VCHR vnode is scanned for as metadata.  slpflag and
  *	slptimeo are only forwarded to getnewbuf_bufd_help().  maxsize is
  *	the amount of KVA wanted for the buffer; size is not referenced by
  *	this function.  gbflags carries the GB_* request flags.
  *
  *	Returns NULL when the scan found no buffer, after
  *	getnewbuf_bufd_help() has been called.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
     int gbflags)
 {
 	struct buf *bp;
 	int defrag, metadata;
 
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	/* Unmapped requests silently become mapped when not allowed. */
 	if (!unmapped_buf_allowed)
 		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	defrag = 0;
 	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
 	    vp->v_type == VCHR)
 		metadata = 1;
 	else
 		metadata = 0;
 	/*
 	 * We can't afford to block since we might be holding a vnode lock,
 	 * which may prevent system daemons from running.  We deal with
 	 * low-memory situations by proactively returning memory and running
 	 * async I/O rather than sync I/O.
 	 */
 	atomic_add_int(&getnewbufcalls, 1);
 	/*
 	 * getnewbuf_scan() increments getnewbufrestarts on every entry;
 	 * offset the increment made by the first, non-restart, call.
 	 */
 	atomic_subtract_int(&getnewbufrestarts, 1);
 restart:
 	bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
 	    GB_KVAALLOC)) == GB_UNMAPPED, metadata);
 	if (bp != NULL)
 		defrag = 0;
 
 	/*
 	 * If we exhausted our list, sleep as appropriate.  We may have to
 	 * wakeup various daemons and write out some dirty buffers.
 	 *
 	 * Generally we are sleeping due to insufficient buffer space.
 	 */
 	if (bp == NULL) {
 		/* A failed scan returns with bqclean still held. */
 		mtx_assert(&bqclean, MA_OWNED);
 		getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
 		mtx_assert(&bqclean, MA_NOTOWNED);
 	} else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
 		mtx_assert(&bqclean, MA_NOTOWNED);
 
 		/*
 		 * Purely unmapped request: drop whatever KVA the buffer
 		 * had and point it at the unmapped_buf placeholder.  The
 		 * space is still charged to bufspace and, in addition, to
 		 * unmapped_bufspace.
 		 */
 		bfreekva(bp);
 		bp->b_flags |= B_UNMAPPED;
 		bp->b_kvabase = bp->b_data = unmapped_buf;
 		bp->b_kvasize = maxsize;
 		atomic_add_long(&bufspace, bp->b_kvasize);
 		atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
 		atomic_add_int(&bufreusecnt, 1);
 	} else {
 		mtx_assert(&bqclean, MA_NOTOWNED);
 
 		/*
 		 * We finally have a valid bp.  We aren't quite out of the
 		 * woods, we still have to reserve kva space.  In order
 		 * to keep fragmentation sane we only allocate kva in
 		 * BKVASIZE chunks.
 		 */
 		maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
 		if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
 		    B_KVAALLOC)) == B_UNMAPPED) {
 			/*
 			 * The buffer's KVA is the wrong size or missing.
 			 * If a new reservation fails, release the buffer
 			 * and rescan in defrag mode.
 			 */
 			if (allocbufkva(bp, maxsize, gbflags)) {
 				defrag = 1;
 				bp->b_flags |= B_INVAL;
 				brelse(bp);
 				goto restart;
 			}
 			atomic_add_int(&bufreusecnt, 1);
 		} else if ((bp->b_flags & B_KVAALLOC) != 0 &&
 		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
 			/*
 			 * If the reused buffer has KVA allocated,
 			 * reassign b_kvaalloc to b_kvabase.
 			 */
 			bp->b_kvabase = bp->b_kvaalloc;
 			bp->b_flags &= ~B_KVAALLOC;
 			atomic_subtract_long(&unmapped_bufspace,
 			    bp->b_kvasize);
 			atomic_add_int(&bufreusecnt, 1);
 		} else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
 		    (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
 		    GB_KVAALLOC)) {
 			/*
 			 * The reused buffer already has KVA mapped, but
 			 * the request is for an unmapped buffer with KVA
 			 * allocated.
 			 */
 			bp->b_kvaalloc = bp->b_kvabase;
 			bp->b_data = bp->b_kvabase = unmapped_buf;
 			bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
 			atomic_add_long(&unmapped_bufspace,
 			    bp->b_kvasize);
 			atomic_add_int(&bufreusecnt, 1);
 		}
 		/* A mapped request: make b_data address the buffer's KVA. */
 		if ((gbflags & GB_UNMAPPED) == 0) {
 			bp->b_saveaddr = bp->b_kvabase;
 			bp->b_data = bp->b_saveaddr;
 			bp->b_flags &= ~B_UNMAPPED;
 			BUF_CHECK_MAPPED(bp);
 		}
 	}
 	return (bp);
 }
 
 /*
  *	buf_daemon:
  *
  *	Buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon, but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  */
 
 /*
  * Descriptor handed to kproc_start() by the SYSINIT below: the process
  * name, its entry point and the address of bufdaemonproc.
  * NOTE(review): presumably kproc_start() stores the created process in
  * bufdaemonproc -- confirm against kproc(9).
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 /*
  * Flush up to target dirty buffers and return how many were flushed.
  * Buffers with rollback dependencies are only considered when a first
  * attempt that excludes them flushed nothing.
  */
 static int
 buf_flush(int target)
 {
 	int nflushed;
 
 	nflushed = flushbufqueues(target, 0);
 	if (nflushed != 0)
 		return (nflushed);
 
 	/*
 	 * Could not find any buffers without rollback dependencies,
 	 * so just write the first one in the hopes of eventually
 	 * making progress.
 	 */
 	return (flushbufqueues(target, 1));
 }
 
 /*
  * Main loop of the buffer flushing daemon; never returns.  Each
  * iteration flushes dirty buffers until numdirtybuffers is no greater
  * than the current target (lodirtybuffers, or half of the current dirty
  * count when a speedup was requested) or nothing more can be flushed,
  * then sleeps on bd_request.
  */
 static void
 buf_daemon(void)
 {
 	int lodirty;
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
 	    SHUTDOWN_PRI_LAST);
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		/* bdlock is held here on every iteration. */
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
 		lodirty = lodirtybuffers;
 		if (bd_speedupreq) {
 			lodirty = numdirtybuffers / 2;
 			bd_speedupreq = 0;
 		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
 		 * the I/O system.
 		 */
 		while (numdirtybuffers > lodirty) {
 			if (buf_flush(numdirtybuffers - lodirty) == 0)
 				break;
 			kern_yield(PRI_USER);
 		}
 
 		/*
 		 * Decide how long to sleep.  The buf_daemon normally waits
 		 * 1 second and then incrementally flushes any dirty buffers
 		 * that have built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep for a short period
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			/*
 			 * Do an extra wakeup in case dirty threshold
 			 * changed via sysctl and the explicit transition
 			 * out of shortfall was missed.
 			 */
 			bdirtywakeup();
 			if (runningbufspace <= lorunningspace)
 				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush buffers from the dirty queue.  We must be careful to
  *	free up B_INVAL buffers instead of writing them, which NFS is
  *	particularly sensitive to.
  *
  *	The scan stops once target buffers have been flushed or the end of
  *	the queue is reached.  Buffers that have dependencies reported by
  *	buf_countdeps() are skipped unless flushdeps is non-zero.  Returns
  *	the number of buffers flushed, which includes B_INVAL buffers that
  *	were released rather than written.
  */
 /* Running count of flushed buffers that had such dependencies. */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
 flushbufqueues(int target, int flushdeps)
 {
 	struct buf *sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int queue;
 	int error;
 
 	flushed = 0;
 	queue = QUEUE_DIRTY;
 	bp = NULL;
 	/*
 	 * A dummy buffer is linked into the dirty queue to remember the
 	 * scan position, so that bqdirty can be dropped while a buffer is
 	 * being processed.
 	 */
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
 	mtx_lock(&bqdirty);
 	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqdirty);
 	while (flushed != target) {
 		maybe_yield();
 		mtx_lock(&bqdirty);
 		/* Take the buffer after the sentinel and step past it. */
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
 			    b_freelist);
 		} else {
 			mtx_unlock(&bqdirty);
 			break;
 		}
 		KASSERT(bp->b_qindex != QUEUE_SENTINEL,
 		    ("parallel calls to flushbufqueues() bp %p", bp));
 		/* Try-lock only; a busy buffer is skipped. */
 		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
 		mtx_unlock(&bqdirty);
 		if (error != 0)
 			continue;
 		if (bp->b_pin_count > 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/* Invalid buffers are released, not written. */
 		if (bp->b_flags & B_INVAL) {
 			bremfreef(bp);
 			brelse(bp);
 			flushed++;
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			/*
 			 * No BUF_UNLOCK() on this path.
 			 * NOTE(review): presumably vfs_bio_awrite() takes
 			 * over the buffer lock -- confirm.
 			 */
 			vfs_bio_awrite(bp);
 			vn_finished_write(mp);
 			VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
 			if (runningbufspace > hirunningspace)
 				waitrunningbufspace();
 			continue;
 		}
 		/* Could not lock the vnode: back out and move on. */
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	/* Unlink and free the sentinel. */
 	mtx_lock(&bqdirty);
 	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqdirty);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
 
 /*
  * Look up the buffer for blkno in the given buffer object and return it,
  * or NULL if there is none.  The lookup runs under the bufobj read lock,
  * which is dropped before returning; the buffer itself is neither locked
  * nor referenced by this function.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	struct buf *found;
 
 	BO_RLOCK(bo);
 	found = gbincore(bo, blkno);
 	BO_RUNLOCK(bo);
 
 	return (found);
 }
 
 /*
  * Report whether a block can be accessed without issuing I/O.  Returns 1
  * if a buffer for the block is already in core, or if every page that
  * backs the block in the vnode's VM object is present and valid over the
  * block's range; returns 0 otherwise.  This is like incore() except that
  * it also hunts around in the VM system for the data.
  */
 static int
 inmem(struct vnode *vp, daddr_t blkno)
 {
 	vm_object_t object;
 	vm_offset_t chunk, pos, step;
 	vm_ooffset_t base;
 	vm_page_t pg;
 	int resident;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	if (incore(&vp->v_bufobj, blkno))
 		return (1);
 	if (vp->v_mount == NULL)
 		return (0);
 	object = vp->v_object;
 	if (object == NULL)
 		return (0);
 
 	/* Validity is checked in pieces of at most one page. */
 	chunk = PAGE_SIZE;
 	if (chunk > vp->v_mount->mnt_stat.f_iosize)
 		chunk = vp->v_mount->mnt_stat.f_iosize;
 	base = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	resident = 1;
 	pos = 0;
 	VM_OBJECT_RLOCK(object);
 	while (pos < vp->v_mount->mnt_stat.f_iosize) {
 		pg = vm_page_lookup(object, OFF_TO_IDX(base + pos));
 		if (pg == NULL) {
 			resident = 0;
 			break;
 		}
 		/* Never let one check run past the end of the page. */
 		step = chunk;
 		if (step > PAGE_SIZE - ((pos + base) & PAGE_MASK))
 			step = PAGE_SIZE - ((pos + base) & PAGE_MASK);
 		if (vm_page_is_valid(pg,
 		    (vm_offset_t) ((pos + base) & PAGE_MASK), step) == 0) {
 			resident = 0;
 			break;
 		}
 		pos += step;
 	}
 	VM_OBJECT_RUNLOCK(object);
 
 	return (resident);
 }
 
 /*
  * Set the dirty range for a buffer based on the status of the dirty
  * bits in the pages comprising the buffer.  The range is limited
  * to the size of the buffer.
  *
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intervention.
  *
  * Nothing is done for a non-VMIO buffer or a buffer with no storage.
  * The buffer must have a valid b_offset.  The backing VM object is
  * write-locked for the duration of the page walk.
  */
 static void
 vfs_clean_pages_dirty_buf(struct buf *bp)
 {
 	vm_ooffset_t foff, noff;
 	vm_page_t m;
 	int i;
 
 	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	vfs_drain_busy_pages(bp);
 	vfs_setdirty_locked_object(bp);
 	/*
 	 * Walk the pages; foff is the buffer-relative file offset at which
 	 * the current page's share of the buffer begins, noff the start of
 	 * the following page.
 	 */
 	for (i = 0; i < bp->b_npages; i++) {
 		noff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		m = bp->b_pages[i];
 		vfs_page_set_validclean(bp, foff, m);
 		foff = noff;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  * Widen the buffer's dirty range (b_dirtyoff, b_dirtyend) so that it
  * covers every page of the buffer that the VM system reports as dirty.
  * The caller must hold the write lock on the buffer's VM object.  The
  * existing range is only ever extended, never shrunk.
  */
 static void
 vfs_setdirty_locked_object(struct buf *bp)
 {
 	vm_object_t object;
 	int i;
 
 	object = bp->b_bufobj->bo_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * We qualify the scan for modified pages on whether the
 	 * object has been flushed yet.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vm_offset_t boffset;
 		vm_offset_t eoffset;
 
 		/*
 		 * test the pages to see if they have been modified directly
 		 * by users through the VM system.
 		 */
 		for (i = 0; i < bp->b_npages; i++)
 			vm_page_test_dirty(bp->b_pages[i]);
 
 		/*
 		 * Calculate the encompassing dirty range, boffset and eoffset,
 		 * (eoffset - boffset) bytes.
 		 */
 
 		/* First dirty page; i == b_npages if there is none. */
 		for (i = 0; i < bp->b_npages; i++) {
 			if (bp->b_pages[i]->dirty)
 				break;
 		}
 		/*
 		 * NOTE(review): if vm_offset_t is unsigned, this wraps when
 		 * i == 0 and b_offset is not page aligned, and the range
 		 * test below then rejects the range -- confirm intended.
 		 */
 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/* Last dirty page; i == -1 if there is none. */
 		for (i = bp->b_npages - 1; i >= 0; --i) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/*
 		 * Fit it to the buffer.
 		 */
 
 		if (eoffset > bp->b_bcount)
 			eoffset = bp->b_bcount;
 
 		/*
 		 * If we have a good dirty range, merge with the existing
 		 * dirty range.
 		 */
 
 		if (boffset < eoffset) {
 			if (bp->b_dirtyoff > boffset)
 				bp->b_dirtyoff = boffset;
 			if (bp->b_dirtyend < eoffset)
 				bp->b_dirtyend = eoffset;
 		}
 	}
 }
 
 /*
  * Allocate the KVA mapping for an existing buffer.  It handles the
  * cases of both B_UNMAPPED buffer, and buffer with the preallocated
  * KVA which is not mapped (B_KVAALLOC).
  *
  * blkno and size are only used to compute how much address space to
  * reserve.  gbflags are the caller's GB_* flags: the buffer gets mapped
  * when it is B_UNMAPPED and GB_UNMAPPED is not requested, and gets KVA
  * reserved when it is B_UNMAPPED without B_KVAALLOC and GB_KVAALLOC is
  * requested.  Returns without doing anything when neither applies.
  */
 static void
 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
 {
 	struct buf *scratch_bp;
 	int bsize, maxsize, need_mapping, need_kva;
 	off_t offset;
 
 	need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
 	    (gbflags & GB_UNMAPPED) == 0;
 	need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
 	    (gbflags & GB_KVAALLOC) != 0;
 	if (!need_mapping && !need_kva)
 		return;
 
 	BUF_CHECK_UNMAPPED(bp);
 
 	if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
 		/*
 		 * Buffer is not mapped, but the KVA was already
 		 * reserved at the time of the instantiation.  Use the
 		 * allocated space.
 		 */
 		bp->b_flags &= ~B_KVAALLOC;
 		KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
 		bp->b_kvabase = bp->b_kvaalloc;
 		atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
 		goto has_addr;
 	}
 
 	/*
 	 * Calculate the amount of the address space we would reserve
 	 * if the buffer was mapped.
 	 */
 	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
 	offset = blkno * bsize;
 	maxsize = size + (offset & PAGE_MASK);
 	maxsize = imax(maxsize, bsize);
 
 mapping_loop:
 	if (allocbufkva(bp, maxsize, gbflags)) {
 		/*
 		 * Request defragmentation. getnewbuf() returns us the
 		 * allocated space by the scratch buffer KVA.
 		 */
 		scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
 		    (GB_UNMAPPED | GB_KVAALLOC));
 		if (scratch_bp == NULL) {
 			if ((gbflags & GB_NOWAIT_BD) != 0) {
 				/*
 				 * XXXKIB: defragmentation cannot
 				 * succeed, not sure what else to do.
 				 */
 				panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
 			}
 			/* Count the retry and try the reservation again. */
 			atomic_add_int(&mappingrestarts, 1);
 			goto mapping_loop;
 		}
 		KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
 		    ("scratch bp !B_KVAALLOC %p", scratch_bp));
 		/* Hand the scratch buffer's reserved KVA over to bp. */
 		setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
 		    scratch_bp->b_kvasize, gbflags);
 
 		/*
 		 * Get rid of the scratch buffer.  Its KVA size and flags
 		 * are cleared first, as the KVA now belongs to bp.
 		 */
 		scratch_bp->b_kvasize = 0;
 		scratch_bp->b_flags |= B_INVAL;
 		scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
 		brelse(scratch_bp);
 	}
 	/* Only a KVA reservation was requested; leave bp unmapped. */
 	if (!need_mapping)
 		return;
 
 has_addr:
 	bp->b_saveaddr = bp->b_kvabase;
 	bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
 	bp->b_flags &= ~B_UNMAPPED;
 	BUF_CHECK_MAPPED(bp);
 	bpmap_qenter(bp);
 }
 
 /*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffers B_DONE bit will be cleared on return, making it almost
  * 	ready for an I/O initiation.  B_INVAL may or may not be set on
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
  *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
  *	B_CACHE bit is clear.
  *
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O,
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  *
  *	vp must be locked by the caller and size may not exceed MAXBSIZE.
  *	slpflag and slptimeo are passed on to BUF_TIMELOCK() and
  *	getnewbuf(); flags carries the GB_* options.
  *
  *	Returns NULL if locking an existing buffer timed out or was
  *	interrupted, if GB_LOCK_NOWAIT is set and a pinned delayed-write
  *	buffer of the wrong size was found, if GB_NOCREAT is set and the
  *	block is not in core, if the idle thread asks while no buffers are
  *	free, or if getnewbuf() fails while slpflag or slptimeo is set.
  */
 struct buf *
 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > MAXBSIZE)
 		panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
 	if (!unmapped_buf_allowed)
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
 		 * Buffer is in-core.  If the buffer is not busy nor managed,
 		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
 		if (flags & GB_LOCK_NOWAIT)
 			lockflags |= LK_NOWAIT;
 
 		/* The bufobj lock is passed in as the interlock. */
 		error = BUF_TIMELOCK(bp, lockflags,
 		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error)
 			return (NULL);
 		/* If recursed, assume caller knows the rules. */
 		else if (BUF_LOCKRECURSED(bp))
 			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		if (bp->b_flags & B_MANAGED)
 			MPASS(bp->b_qindex == QUEUE_NONE);
 		else
 			bremfree(bp);
 
 		/*
 		 * check for size inconsistencies for non-VMIO case.
 		 */
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					/*
 					 * If the buffer is pinned and the
 					 * caller does not want to sleep
 					 * waiting for it to be unpinned,
 					 * bail out.
 					 */
 					if (bp->b_pin_count > 0) {
 						if (flags & GB_LOCK_NOWAIT) {
 							bqrelse(bp);
 							return (NULL);
 						} else {
 							bunpin_wait(bp);
 						}
 					}
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				/* The buffer was disposed of; start over. */
 				goto loop;
 			}
 		}
 
 		/*
 		 * Handle the case of unmapped buffer which should
 		 * become mapped, or the buffer for which KVA
 		 * reservation is requested.
 		 */
 		bp_unmapped_get_kva(bp, blkno, size, flags);
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 		if (bp->b_bcount != size)
 			allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET, 
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_RUNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
 		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
 			return NULL;
 
 		/*
 		 * Compute the byte offset of the block and the amount of
 		 * KVA to ask for.  A VMIO buffer also needs room for the
 		 * part of the first page that precedes the block.
 		 */
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		if (vmio) {
 			maxsize = size + (offset & PAGE_MASK);
 		} else {
 			maxsize = size;
 			/* Do not allow non-VMIO notmapped buffers. */
 			flags &= ~GB_UNMAPPED;
 		}
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	BUF_ASSERT_HELD(bp);
 end:
 	/* Also reached directly when the buffer lock was recursed. */
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	return (bp);
 }
 
 /*
  * Obtain an empty buffer of the given size that is not associated with
  * any vnode.  The buffer is handed back marked B_INVAL.  The request is
  * retried until it succeeds, except that NULL is returned when
  * GB_NOWAIT_BD is set in flags and the current thread has TDP_BUFNEED
  * set.
  */
 struct buf *
 geteblk(int size, int flags)
 {
 	struct buf *nbp;
 	int kvasize;
 
 	/* Round the request up to a multiple of the KVA chunk size. */
 	kvasize = (size + BKVAMASK) & ~BKVAMASK;
 	for (;;) {
 		nbp = getnewbuf(NULL, 0, 0, size, kvasize, flags);
 		if (nbp != NULL)
 			break;
 		if ((flags & GB_NOWAIT_BD) == 0)
 			continue;
 		if ((curthread->td_pflags & TDP_BUFNEED) != 0)
 			return (NULL);
 	}
 	allocbuf(nbp, size);
 	nbp->b_flags |= B_INVAL;	/* b_dep cleared by getnewbuf() */
 	BUF_ASSERT_HELD(nbp);
 	return (nbp);
 }
 
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!!
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by 
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  */
 
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize, mbsize;
 	int i;
 
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	if ((bp->b_flags & B_VMIO) == 0) {
 		caddr_t origbuf;
 		int origbufsize;
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 		if (bp->b_flags & B_MALLOC)
 			newbsize = mbsize;
 		else
 			newbsize = round_page(size);
 
 		if (newbsize < bp->b_bufsize) {
 			/*
 			 * malloced buffers are not shrunk
 			 */
 			if (bp->b_flags & B_MALLOC) {
 				if (newbsize) {
 					bp->b_bcount = size;
 				} else {
 					free(bp->b_data, M_BIOBUF);
 					if (bp->b_bufsize) {
 						atomic_subtract_long(
 						    &bufmallocspace,
 						    bp->b_bufsize);
 						bufspacewakeup();
 						bp->b_bufsize = 0;
 					}
 					bp->b_saveaddr = bp->b_kvabase;
 					bp->b_data = bp->b_saveaddr;
 					bp->b_bcount = 0;
 					bp->b_flags &= ~B_MALLOC;
 				}
 				return 1;
 			}		
 			vm_hold_free_pages(bp, newbsize);
 		} else if (newbsize > bp->b_bufsize) {
 			/*
 			 * We only use malloced memory on the first allocation.
 			 * and revert to page-allocated memory when the buffer
 			 * grows.
 			 */
 			/*
 			 * There is a potential smp race here that could lead
 			 * to bufmallocspace slightly passing the max.  It
 			 * is probably extremely rare and not worth worrying
 			 * over.
 			 */
 			if ( (bufmallocspace < maxbufmallocspace) &&
 				(bp->b_bufsize == 0) &&
 				(mbsize <= PAGE_SIZE/2)) {
 
 				bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
 				bp->b_bufsize = mbsize;
 				bp->b_bcount = size;
 				bp->b_flags |= B_MALLOC;
 				atomic_add_long(&bufmallocspace, mbsize);
 				return 1;
 			}
 			origbuf = NULL;
 			origbufsize = 0;
 			/*
 			 * If the buffer is growing on its other-than-first allocation,
 			 * then we revert to the page-allocation scheme.
 			 */
 			if (bp->b_flags & B_MALLOC) {
 				origbuf = bp->b_data;
 				origbufsize = bp->b_bufsize;
 				bp->b_data = bp->b_kvabase;
 				if (bp->b_bufsize) {
 					atomic_subtract_long(&bufmallocspace,
 					    bp->b_bufsize);
 					bufspacewakeup();
 					bp->b_bufsize = 0;
 				}
 				bp->b_flags &= ~B_MALLOC;
 				newbsize = round_page(newbsize);
 			}
 			vm_hold_load_pages(
 			    bp,
 			    (vm_offset_t) bp->b_data + bp->b_bufsize,
 			    (vm_offset_t) bp->b_data + newbsize);
 			if (origbuf) {
 				bcopy(origbuf, bp->b_data, origbufsize);
 				free(origbuf, M_BIOBUF);
 			}
 		}
 	} else {
 		int desiredpages;
 
 		newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
 		desiredpages = (size == 0) ? 0 :
 			num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize) {
 			/*
 			 * DEV_BSIZE aligned new buffer size is less then the
 			 * DEV_BSIZE aligned existing buffer size.  Figure out
 			 * if we have to remove any pages.
 			 */
 			if (desiredpages < bp->b_npages) {
 				vm_page_t m;
 
 				if ((bp->b_flags & B_UNMAPPED) == 0) {
 					BUF_CHECK_MAPPED(bp);
 					pmap_qremove((vm_offset_t)trunc_page(
 					    (vm_offset_t)bp->b_data) +
 					    (desiredpages << PAGE_SHIFT),
 					    (bp->b_npages - desiredpages));
 				} else
 					BUF_CHECK_UNMAPPED(bp);
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 				for (i = desiredpages; i < bp->b_npages; i++) {
 					/*
 					 * the page is not freed here -- it
 					 * is the responsibility of 
 					 * vnode_pager_setsize
 					 */
 					m = bp->b_pages[i];
 					KASSERT(m != bogus_page,
 					    ("allocbuf: bogus page found"));
 					while (vm_page_sleep_if_busy(m,
 					    "biodep"))
 						continue;
 
 					bp->b_pages[i] = NULL;
 					vm_page_lock(m);
-					vm_page_unwire(m, 0);
+					vm_page_unwire(m, PQ_INACTIVE);
 					vm_page_unlock(m);
 				}
 				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				bp->b_npages = desiredpages;
 			}
 		} else if (size > bp->b_bcount) {
 			/*
 			 * We are growing the buffer, possibly in a 
 			 * byte-granular fashion.
 			 */
 			vm_object_t obj;
 			vm_offset_t toff;
 			vm_offset_t tinc;
 
 			/*
 			 * Step 1, bring in the VM pages from the object, 
 			 * allocating them if necessary.  We must clear
 			 * B_CACHE if these pages are not valid for the 
 			 * range covered by the buffer.
 			 */
 
 			obj = bp->b_bufobj->bo_object;
 
 			VM_OBJECT_WLOCK(obj);
 			while (bp->b_npages < desiredpages) {
 				vm_page_t m;
 
 				/*
 				 * We must allocate system pages since blocking
 				 * here could interfere with paging I/O, no
 				 * matter which process we are.
 				 *
 				 * Only exclusive busy can be tested here.
 				 * Blocking on shared busy might lead to
 				 * deadlocks once allocbuf() is called after
 				 * pages are vfs_busy_pages().
 				 */
 				m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
 				    bp->b_npages, VM_ALLOC_NOBUSY |
 				    VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
 				    VM_ALLOC_IGN_SBUSY |
 				    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
 				if (m->valid == 0)
 					bp->b_flags &= ~B_CACHE;
 				bp->b_pages[bp->b_npages] = m;
 				++bp->b_npages;
 			}
 
 			/*
 			 * Step 2.  We've loaded the pages into the buffer,
 			 * we have to figure out if we can still have B_CACHE
 			 * set.  Note that B_CACHE is set according to the
 			 * byte-granular range ( bcount and size ), new the
 			 * aligned range ( newbsize ).
 			 *
 			 * The VM test is against m->valid, which is DEV_BSIZE
 			 * aligned.  Needless to say, the validity of the data
 			 * needs to also be DEV_BSIZE aligned.  Note that this
 			 * fails with NFS if the server or some other client
 			 * extends the file's EOF.  If our buffer is resized, 
 			 * B_CACHE may remain set! XXX
 			 */
 
 			toff = bp->b_bcount;
 			tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 
 			while ((bp->b_flags & B_CACHE) && toff < size) {
 				vm_pindex_t pi;
 
 				if (tinc > (size - toff))
 					tinc = size - toff;
 
 				pi = ((bp->b_offset & PAGE_MASK) + toff) >> 
 				    PAGE_SHIFT;
 
 				vfs_buf_test_cache(
 				    bp, 
 				    bp->b_offset,
 				    toff, 
 				    tinc, 
 				    bp->b_pages[pi]
 				);
 				toff += tinc;
 				tinc = PAGE_SIZE;
 			}
 			VM_OBJECT_WUNLOCK(obj);
 
 			/*
 			 * Step 3, fixup the KVM pmap.
 			 */
 			if ((bp->b_flags & B_UNMAPPED) == 0)
 				bpmap_qenter(bp);
 			else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 	}
 	if (newbsize < bp->b_bufsize)
 		bufspacewakeup();
 	bp->b_bufsize = newbsize;	/* actual buffer allocation	*/
 	bp->b_bcount = size;		/* requested buffer size	*/
 	return 1;
 }
 
 extern int inflight_transient_maps;
 
 /*
  *	biodone:
  *
  *	Mark a bio as finished.  If the request was given a transient
  *	kernel mapping, that mapping is torn down first.  When the bio
  *	carries a completion callback, BIO_DONE is set and the callback is
  *	invoked; otherwise BIO_DONE is set and sleepers on the bio are woken
  *	while holding the pool mutex associated with the bio.
  */
 void
 biodone(struct bio *bp)
 {
 	void (*callback)(struct bio *);
 	struct mtx *lock;
 	vm_offset_t va_start, va_end;
 
 	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
 		/* The bio goes from "transiently mapped" to "unmapped". */
 		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
 		bp->bio_flags |= BIO_UNMAPPED;
 
 		/* Unmap the page-aligned range and return it to the arena. */
 		va_start = trunc_page((vm_offset_t)bp->bio_data);
 		va_end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
 		pmap_qremove(va_start, OFF_TO_IDX(va_end - va_start));
 		vmem_free(transient_arena, va_start, va_end - va_start);
 		atomic_add_int(&inflight_transient_maps, -1);
 	}
 
 	callback = bp->bio_done;
 	if (callback != NULL) {
 		bp->bio_flags |= BIO_DONE;
 		callback(bp);
 		return;
 	}
 
 	/* No callback: flag completion and wake any sleeper on the bio. */
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->bio_flags |= BIO_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  *	biowait:
  *
  *	Sleep until BIO_DONE is set on the bio, using `wchan' as the sleep
  *	message.  Returns bio_error when it is non-zero, EIO when only the
  *	BIO_ERROR flag is set, and 0 otherwise.
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	while (!(bp->bio_flags & BIO_DONE))
 		msleep(bp, lock, PRIBIO, wchan, 0);
 	mtx_unlock(lock);
 
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	return ((bp->bio_flags & BIO_ERROR) != 0 ? EIO : 0);
 }
 
 /*
  *	biofinish:
  *
  *	Record an optional error on the bio, close the devstat transaction
  *	when a devstat structure is supplied, and complete the bio through
  *	biodone().
  */
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 
 	if (error != 0) {
 		bp->bio_error = error;
 		bp->bio_flags |= BIO_ERROR;
 	}
 
 	if (stat != NULL) {
 		devstat_end_transaction_bio(stat, bp);
 	}
 
 	biodone(bp);
 }
 
 /*
  *	bufwait:
  *
  *	Block in bwait() until the buffer's I/O has completed and report
  *	the outcome.  A pending B_EINTR is cleared and turned into EINTR.
  *	With BIO_ERROR set the result is b_error, or EIO when b_error is
  *	zero.  Otherwise the result is 0.
  */
 int
 bufwait(struct buf *bp)
 {
 	const char *wmesg;
 
 	/* Reads and writes sleep under different wait messages. */
 	wmesg = (bp->b_iocmd == BIO_READ) ? "biord" : "biowr";
 	bwait(bp, PRIBIO, wmesg);
 
 	if ((bp->b_flags & B_EINTR) != 0) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		return (0);
 	return (bp->b_error != 0 ? bp->b_error : EIO);
 }
 
  /*
   * Call back function from struct bio back up to struct buf.
   */
 static void
 bufdonebio(struct bio *bip)
 {
 	struct buf *bp;
 
 	bp = bip->bio_caller2;
 	bp->b_resid = bip->bio_resid;
 	bp->b_ioflags = bip->bio_flags;
 	bp->b_error = bip->bio_error;
 	if (bp->b_error)
 		bp->b_ioflags |= BIO_ERROR;
 	bufdone(bp);
 	g_destroy_bio(bip);
 }
 
 /*
  *	dev_strategy:
  *
  *	Issue the buffer's I/O to a character device.  The cdevsw is
  *	obtained with dev_refthread() for the duration of the call to
  *	dev_strategy_csw() and released with dev_relthread() afterwards.
  */
 void
 dev_strategy(struct cdev *dev, struct buf *bp)
 {
 	struct cdevsw *devsw;
 	int held;
 
 	KASSERT(dev->si_refcount > 0,
 	    ("dev_strategy on un-referenced struct cdev *(%s) %p",
 	    devtoname(dev), dev));
 
 	devsw = dev_refthread(dev, &held);
 	dev_strategy_csw(dev, devsw, bp);
 	dev_relthread(dev, held);
 }
 
 /*
  *	dev_strategy_csw:
  *
  *	Wrap the buffer in a freshly allocated struct bio and hand it to
  *	the d_strategy method of `csw'.  With a NULL csw the buffer is
  *	completed immediately with ENXIO.  bufdonebio() is installed as the
  *	bio's completion routine.
  */
 void
 dev_strategy_csw(struct cdev *dev, struct cdevsw *csw, struct buf *bp)
 {
 	struct bio *bip;
 
 	KASSERT(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE,
 	    ("b_iocmd botch"));
 	KASSERT(((dev->si_flags & SI_ETERNAL) != 0 && csw != NULL) ||
 	    dev->si_threadcount > 0,
 	    ("dev_strategy_csw threadcount cdev *(%s) %p", devtoname(dev),
 	    dev));
 
 	if (csw == NULL) {
 		bp->b_error = ENXIO;
 		bp->b_ioflags = BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 
 	/* Keep retrying the bio allocation, napping hz/10 between tries. */
 	while ((bip = g_new_bio()) == NULL)
 		tsleep(&bp, PRIBIO, "dev_strat", hz/10);
 
 	/* Describe the transfer. */
 	bip->bio_cmd = bp->b_iocmd;
 	bip->bio_offset = bp->b_iooffset;
 	bip->bio_length = bp->b_bcount;
 	bip->bio_bcount = bp->b_bcount;	/* XXX: remove */
 	bdata2bio(bp, bip);
 
 	/* Wire up completion and dispatch. */
 	bip->bio_done = bufdonebio;
 	bip->bio_caller2 = bp;
 	bip->bio_dev = dev;
 	(*csw->d_strategy)(bip);
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer, optionally calling a completion function.
  *	This is usually called from an interrupt so process blocking is
  *	not allowed.
  *
  *	biodone is also responsible for setting B_CACHE in a B_VMIO bp.
  *	In a non-VMIO bp, B_CACHE will be set on the next getblk()
  *	assuming B_INVAL is clear.
  *
  *	For the VMIO case, we set B_CACHE if the op was a read and no
  *	read error occurred, or if the op was a write.  B_CACHE is never
  *	set if the buffer is invalid or otherwise uncacheable.
  *
  *	biodone does not mess with B_INVAL, allowing the I/O routine or the
  *	initiator to leave B_INVAL set to brelse the buffer out of existence
  *	in the biodone routine.
  */
 void
 bufdone(struct buf *bp)
 {
 	void (*iodone_fn)(struct buf *);
 	struct bufobj *dropobj;
 
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 	BUF_ASSERT_HELD(bp);
 
 	runningbufwakeup(bp);
 
 	/*
 	 * Only writes drop a bufobj output reference.  The bufobj is
 	 * captured now because bp is not touched again after completion.
 	 */
 	dropobj = (bp->b_iocmd == BIO_WRITE) ? bp->b_bufobj : NULL;
 
 	/* Prefer the caller-supplied completion routine, one-shot. */
 	iodone_fn = bp->b_iodone;
 	if (iodone_fn != NULL) {
 		bp->b_iodone = NULL;
 		(*iodone_fn)(bp);
 	} else {
 		bufdone_finish(bp);
 	}
 
 	if (dropobj != NULL)
 		bufobj_wdrop(dropobj);
 }
 
 /*
  *	bufdone_finish:
  *
  *	Default completion work for a buffer whose I/O has ended.  Runs
  *	buf_complete() when the buffer's b_dep list is not empty.  For a
  *	VMIO buffer it walks every page: bogus_page placeholders are
  *	replaced by the real page looked up in the object, pages are marked
  *	valid after a read, each page is shared-unbusied and one
  *	paging-in-progress count is subtracted from the object.  Finally an
  *	asynchronous buffer is released (brelse() or bqrelse()) and a
  *	synchronous one is completed with bdone().
  *
  *	The buffer must be held (asserted).
  */
 void
 bufdone_finish(struct buf *bp)
 {
 	BUF_ASSERT_HELD(bp);
 
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 
 	if (bp->b_flags & B_VMIO) {
 		vm_ooffset_t foff;
 		vm_page_t m;
 		vm_object_t obj;
 		struct vnode *vp;
 		int bogus, i, iosize;
 
 		obj = bp->b_bufobj->bo_object;
 		KASSERT(obj->paging_in_progress >= bp->b_npages,
 		    ("biodone_finish: paging in progress(%d) < b_npages(%d)",
 		    obj->paging_in_progress, bp->b_npages));
 
 		vp = bp->b_vp;
 		KASSERT(vp->v_holdcnt > 0,
 		    ("biodone_finish: vnode %p has zero hold count", vp));
 		KASSERT(vp->v_object != NULL,
 		    ("biodone_finish: vnode %p has no vm_object", vp));
 
 		foff = bp->b_offset;
 		KASSERT(bp->b_offset != NOOFFSET,
 		    ("biodone_finish: bp %p has no buffer offset", bp));
 
 		/*
 		 * Set B_CACHE if the op was a normal read and no error
 		 * occurred.  B_CACHE is set for writes in the b*write()
 		 * routines.
 		 */
 		/* iosize: bytes actually transferred (request minus residual). */
 		iosize = bp->b_bcount - bp->b_resid;
 		if (bp->b_iocmd == BIO_READ &&
 		    !(bp->b_flags & (B_INVAL|B_NOCACHE)) &&
 		    !(bp->b_ioflags & BIO_ERROR)) {
 			bp->b_flags |= B_CACHE;
 		}
 		bogus = 0;
 		VM_OBJECT_WLOCK(obj);
 		for (i = 0; i < bp->b_npages; i++) {
 			int bogusflag = 0;
 			int resid;
 
 			/*
 			 * resid: bytes from foff to the next page boundary,
 			 * clipped to what is left of the transfer.
 			 */
 			resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 			if (resid > iosize)
 				resid = iosize;
 
 			/*
 			 * cleanup bogus pages, restoring the originals
 			 */
 			m = bp->b_pages[i];
 			if (m == bogus_page) {
 				bogus = bogusflag = 1;
 				m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 				if (m == NULL)
 					panic("biodone: page disappeared!");
 				bp->b_pages[i] = m;
 			}
 			KASSERT(OFF_TO_IDX(foff) == m->pindex,
 			    ("biodone_finish: foff(%jd)/pindex(%ju) mismatch",
 			    (intmax_t)foff, (uintmax_t)m->pindex));
 
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we
 			 * only need to do this here in the read case.
 			 */
 			/*
 			 * Pages that were swapped for bogus_page are skipped
 			 * here, as are pages that received no data.
 			 */
 			if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) {
 				KASSERT((m->dirty & vm_page_bits(foff &
 				    PAGE_MASK, resid)) == 0, ("bufdone_finish:"
 				    " page %p has unexpected dirty bits", m));
 				vfs_page_set_valid(bp, foff, m);
 			}
 
 			vm_page_sunbusy(m);
 			vm_object_pip_subtract(obj, 1);
 			/* Advance to the start of the next page. */
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 			iosize -= resid;
 		}
 		vm_object_pip_wakeupn(obj, 0);
 		VM_OBJECT_WUNLOCK(obj);
 		/*
 		 * If any placeholder was replaced, re-enter the whole page
 		 * array into the buffer's KVA so that a mapped buffer refers
 		 * to the real pages.
 		 */
 		if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
 			BUF_CHECK_MAPPED(bp);
 			pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 			    bp->b_pages, bp->b_npages);
 		}
 	}
 
 	/*
 	 * For asynchronous completions, release the buffer now. The brelse
 	 * will do a wakeup there if necessary - so no need to do a wakeup
 	 * here in the async case. The sync case always needs to do a wakeup.
 	 */
 
 	if (bp->b_flags & B_ASYNC) {
 		if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) || (bp->b_ioflags & BIO_ERROR))
 			brelse(bp);
 		else
 			bqrelse(bp);
 	} else
 		bdone(bp);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistant.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			if ((bp->b_flags & B_UNMAPPED) == 0) {
 				BUF_CHECK_MAPPED(bp);
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			} else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_object_pip_subtract(obj, 1);
 		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, 0);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Mark as valid the part of page `m' that starts at file offset
  *	`off' and runs to the next page boundary, clipped to the end of
  *	the buffer's data (b_offset + b_bcount).  Nothing is done when
  *	that range is empty.
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, eoff;
 
 	/*
 	 * [off, eoff) must stay inside one page and inside the buffer.
 	 * The end of the buffer here is b_bcount (our file EOF), not the
 	 * allocation size of the buffer.
 	 */
 	eoff = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
 	bufend = bp->b_offset + bp->b_bcount;
 	if (eoff > bufend)
 		eoff = bufend;
 
 	/* An empty range leaves the page untouched. */
 	if (eoff <= off)
 		return;
 
 	/* Typically this covers the whole buffer and thus the whole page. */
 	vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
 }
 
 /*
  * vfs_page_set_validclean:
  *
  *	Set the valid bits and clear the dirty bits of page `m' for the
  *	range that starts at file offset `off' and runs to the next page
  *	boundary, clipped to the end of the buffer's data
  *	(b_offset + b_bcount).  Nothing is done when that range is empty.
  */
 static void
 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, eoff;
 
 	/*
 	 * The range may not cross a page boundary or the end of the
 	 * buffer.  The end of the buffer, in this case, is our file EOF,
 	 * not the allocation size of the buffer.
 	 */
 	eoff = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	bufend = bp->b_offset + bp->b_bcount;
 	if (eoff > bufend)
 		eoff = bufend;
 
 	if (eoff <= off)
 		return;
 
 	/* Typically this covers the whole buffer and thus the whole page. */
 	vm_page_set_validclean(m, (vm_offset_t)(off & PAGE_MASK),
 	    (vm_offset_t)(eoff - off));
 }
 
 /*
  * Ensure that all buffer pages are not exclusive busied.  If any page is
  * exclusive busy, drain it.
  *
  * The caller must hold the write lock of the buffer's VM object
  * (asserted).  The lock is dropped and re-taken around each sleep, and it
  * is held again on return.
  */
 void
 vfs_drain_busy_pages(struct buf *bp)
 {
 	vm_page_t m;
 	int i, last_busied;
 
 	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
 	/* Pages [0, last_busied) have been shared-busied by this routine. */
 	last_busied = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (vm_page_xbusied(m)) {
 			/*
 			 * Before sleeping, shared-busy every earlier page
 			 * that has not been shared-busied yet.
 			 */
 			for (; last_busied < i; last_busied++)
 				vm_page_sbusy(bp->b_pages[last_busied]);
 			/*
 			 * Sleep with the page locked and the object lock
 			 * released, then re-check.  No vm_page_unlock()
 			 * appears here, so vm_page_busy_sleep() presumably
 			 * releases the page lock -- confirm against its
 			 * definition.
 			 */
 			while (vm_page_xbusied(m)) {
 				vm_page_lock(m);
 				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				vm_page_busy_sleep(m, "vbpage");
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 			}
 		}
 	}
 	/* Undo the temporary shared-busy taken on the leading pages. */
 	for (i = 0; i < last_busied; i++)
 		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  *
  *	bp		- the buffer about to undergo I/O; non-VMIO buffers
  *			  are ignored.
  *	clear_modify	- non-zero: each page is write-protected with
  *			  pmap_remove_write() and marked valid and clean
  *			  for the buffer's range.  Zero: fully valid pages
  *			  are replaced by bogus_page when B_CACHE is clear.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	int i, bogus;
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	VM_OBJECT_WLOCK(obj);
 	/* Wait out exclusive busy holders before taking shared busy below. */
 	vfs_drain_busy_pages(bp);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
 	bogus = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		/*
 		 * Except for B_CLUSTER buffers, account one paging
 		 * operation on the object and shared-busy the page.
 		 */
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
 			vm_page_sbusy(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in
 		 * partially instantiated buffers.  Partially
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		if (clear_modify) {
 			pmap_remove_write(m);
 			vfs_page_set_validclean(bp, foff, m);
 		} else if (m->valid == VM_PAGE_BITS_ALL &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus++;
 		}
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	/*
 	 * If any page was swapped for bogus_page, re-enter the page array
 	 * into the KVA of a mapped buffer so the mapping matches b_pages.
 	 */
 	if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  *	vfs_bio_set_valid:
  *
  *	Mark `size' bytes starting at `base' as valid in the pages that
  *	back a VMIO buffer.  `base' is relative to the beginning of the
  *	buffer, b_offset, which may itself lie past the start of the first
  *	page.  Non-VMIO buffers are left alone, and the walk stops at the
  *	buffer's last page.
  */
 void
 vfs_bio_set_valid(struct buf *bp, int base, int size)
 {
 	int chunk, pidx;
 
 	if ((bp->b_flags & B_VMIO) == 0)
 		return;
 
 	/*
 	 * Rebase onto the start of the first page.  The first chunk is
 	 * whatever remains of the page that `base' falls in.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	pidx = base / PAGE_SIZE;
 	while (size > 0 && pidx < bp->b_npages) {
 		if (chunk > size)
 			chunk = size;
 		vm_page_set_valid_range(bp->b_pages[pidx], base & PAGE_MASK,
 		    chunk);
 		base += chunk;
 		size -= chunk;
 		/* Every page after the first is covered from its start. */
 		chunk = PAGE_SIZE;
 		pidx++;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	If the specified buffer is a non-VMIO buffer, clear the entire
  *	buffer.  If the specified buffer is a VMIO buffer, clear and
  *	validate only the previously invalid portions of the buffer.
  *	This routine essentially fakes an I/O, so we need to clear
  *	BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  *
  *	A buffer with B_MALLOC set is handled by clrbuf() as well.  On the
  *	VMIO path b_resid is reset to 0 before returning.
  */
 void
 vfs_bio_clrbuf(struct buf *bp) 
 {
 	int i, j, mask, sa, ea, slide;
 
 	/* Anything other than "VMIO and not malloc-backed" is cleared whole. */
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	/*
 	 * Fast path: a single page, less than a page of data, starting at
 	 * the page boundary.  mask has one bit per DEV_BSIZE block of the
 	 * buffer.  If the blocks are all valid there is nothing to do; if
 	 * none is valid the whole area is zeroed at once.  A partly valid
 	 * page falls through to the general loop below.
 	 */
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		if ((bp->b_pages[0]->valid & mask) == 0) {
 			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
 			bp->b_pages[0]->valid |= mask;
 			goto unlock;
 		}
 	}
 	/*
 	 * General case.  For each page, [sa, ea) is the byte range within
 	 * the page that is examined; sa is non-zero only for the first
 	 * page.  bogus_page placeholders are skipped.
 	 */
 	sa = bp->b_offset & PAGE_MASK;
 	slide = 0;
 	for (i = 0; i < bp->b_npages; i++, sa = 0) {
 		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 		ea = slide & PAGE_MASK;
 		if (ea == 0)
 			ea = PAGE_SIZE;
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		/* j: index of the first DEV_BSIZE block; mask: blocks in range. */
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
 			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
 		else {
 			/* Mixed validity: zero only the invalid blocks. */
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
 					pmap_zero_page_area(bp->b_pages[i],
 					    sa, DEV_BSIZE);
 				}
 			}
 		}
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
 /*
  *	vfs_bio_bzero_buf:
  *
  *	Zero `size' bytes of the buffer starting at byte `base'.  A mapped
  *	buffer is cleared through b_data with bzero().  An unmapped buffer
  *	is cleared page by page with pmap_zero_page_area(), stopping at the
  *	buffer's last page.
  */
 void
 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
 {
 	int chunk, pidx;
 
 	if ((bp->b_flags & B_UNMAPPED) == 0) {
 		BUF_CHECK_MAPPED(bp);
 		bzero(bp->b_data + base, size);
 		return;
 	}
 
 	BUF_CHECK_UNMAPPED(bp);
 	/* The first chunk is what remains of the page `base' falls in. */
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 	pidx = base / PAGE_SIZE;
 	while (size > 0 && pidx < bp->b_npages) {
 		if (chunk > size)
 			chunk = size;
 		pmap_zero_page_area(bp->b_pages[pidx], base & PAGE_MASK,
 		    chunk);
 		base += chunk;
 		size -= chunk;
 		chunk = PAGE_SIZE;
 		pidx++;
 	}
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages get pages into
  * a buffers address space.  The pages are anonymous and are
  * not associated with a file object.
  *
  * vm_hold_load_pages() allocates one wired page for every page of the
  * kernel virtual range [from, to) (both ends rounded up to a page
  * boundary), enters it into the buffer's mapping and records it in
  * b_pages.  b_npages is set to the index following the last page loaded.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t va;
 	vm_page_t page;
 	int slot;
 
 	BUF_CHECK_MAPPED(bp);
 
 	to = round_page(to);
 	from = round_page(from);
 	slot = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	for (va = from; va < to; va += PAGE_SIZE, slot++) {
 		/*
 		 * note: must allocate system pages since blocking here
 		 * could interfere with paging I/O, no matter which
 		 * process we are.  Retry after VM_WAIT until a page is
 		 * obtained.
 		 */
 		for (;;) {
 			page = vm_page_alloc(NULL, 0,
 			    VM_ALLOC_SYSTEM | VM_ALLOC_NOOBJ |
 			    VM_ALLOC_WIRED |
 			    VM_ALLOC_COUNT((to - va) >> PAGE_SHIFT));
 			if (page != NULL)
 				break;
 			VM_WAIT;
 		}
 		pmap_qenter(va, &page, 1);
 		bp->b_pages[slot] = page;
 	}
 	bp->b_npages = slot;
 }
 
 /*
  * Return pages associated with this buf to the vm system.
  *
  * Every page beyond the first `newbsize' bytes (rounded up to a page
  * boundary) is unmapped from the buffer's KVA, removed from b_pages and
  * freed.  b_npages is lowered to the number of pages kept.
  */
 static void
 vm_hold_free_pages(struct buf *bp, int newbsize)
 {
 	vm_offset_t cut;
 	vm_page_t page;
 	int idx, keep;
 
 	BUF_CHECK_MAPPED(bp);
 
 	cut = round_page((vm_offset_t)bp->b_data + newbsize);
 	keep = (cut - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	/* Drop the mappings of the trailing pages in one call. */
 	if (keep < bp->b_npages)
 		pmap_qremove(cut, bp->b_npages - keep);
 
 	for (idx = keep; idx < bp->b_npages; idx++) {
 		page = bp->b_pages[idx];
 		bp->b_pages[idx] = NULL;
 		/* A shared-busy page is reported but still freed. */
 		if (vm_page_sbusied(page)) {
 			printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
 			    (intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
 		}
 		vm_page_free(page);
 	}
 	bp->b_npages = keep;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * All requests are (re)mapped into kernel VA space.
  * Notice that we use b_bufsize for the size of the buffer
  * to be mapped.  b_bcount might be modified by the driver.
  *
  * When `mapbuf' is zero and unmapped buffers are allowed, the pages are
  * only held and the buffer is flagged B_UNMAPPED instead of being mapped.
  * In both cases the original b_data is remembered in b_saveaddr.
  *
  * Returns 0 on success and -1 when b_bufsize is negative or the pages
  * cannot be held.
  *
  * Note that even if the caller determines that the address space should
  * be valid, a race or a smaller-file mapped into a larger space may
  * actually cause vmapbuf() to fail, so all callers of vmapbuf() MUST
  * check the return value.
  */
 int
 vmapbuf(struct buf *bp, int mapbuf)
 {
 	vm_prot_t prot;
 	caddr_t kva;
 	int npages;
 
 	if (bp->b_bufsize < 0)
 		return (-1);
 
 	/* A device read stores into the pages, hence write access. */
 	prot = VM_PROT_READ;
 	if (bp->b_iocmd == BIO_READ)
 		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
 
 	npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
 	    btoc(MAXPHYS));
 	if (npages < 0)
 		return (-1);
 	bp->b_npages = npages;
 
 	if (!mapbuf && unmapped_buf_allowed) {
 		/* Leave the pages unmapped; record the in-page offset. */
 		bp->b_flags |= B_UNMAPPED;
 		bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
 		bp->b_saveaddr = bp->b_data;
 		bp->b_data = unmapped_buf;
 	} else {
 		/*
 		 * Map the pages at the KVA kept in b_saveaddr, then swap:
 		 * b_saveaddr takes the original address and b_data points
 		 * into the new mapping at the same in-page offset.
 		 */
 		pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, npages);
 		kva = bp->b_saveaddr;
 		bp->b_saveaddr = bp->b_data;
 		bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
 		bp->b_flags &= ~B_UNMAPPED;
 	}
 	return (0);
 }
 
 /*
  * Undo vmapbuf(): free the io map PTEs associated with this IO operation
  * (for a mapped buffer) or clear B_UNMAPPED (for an unmapped one), release
  * the hold on the pages and restore the original b_data from b_saveaddr.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int count;
 
 	count = bp->b_npages;
 	if ((bp->b_flags & B_UNMAPPED) == 0)
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), count);
 	else
 		bp->b_flags &= ~B_UNMAPPED;
 	vm_page_unhold_pages(bp->b_pages, count);
 
 	bp->b_data = bp->b_saveaddr;
 }
 
 /*
  *	bdone:
  *
  *	Set B_DONE on the buffer and wake up threads sleeping on it, all
  *	under the pool mutex associated with the buffer.
  */
 void
 bdone(struct buf *bp)
 {
 	struct mtx *lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  *	bwait:
  *
  *	Sleep at priority `pri' with wait message `wchan' until B_DONE is
  *	set on the buffer.  The flag is tested under the pool mutex
  *	associated with the buffer, which is the mutex bdone() sets it
  *	under.
  */
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 	struct mtx *lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	while (!(bp->b_flags & B_DONE))
 		msleep(bp, lock, pri, wchan, 0);
 	mtx_unlock(lock);
 }
 
 int
 bufsync(struct bufobj *bo, int waitfor)
 {
 
 	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
 }
 
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	int i = 0;
 	struct vnode *vp;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 	i = VOP_STRATEGY(vp, bp);
 	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  *	bufobj_wrefl:
  *
  *	Increment bo_numoutput, the buffer object's count of outstanding
  *	output operations.  This is the "already locked" variant: the
  *	caller must hold the bufobj lock (asserted).  bufobj_wref() is the
  *	variant that takes the lock itself.
  */
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	/* The message names this function so a panic points at the right place. */
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wrefl"));
 	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 /*
  *	bufobj_wref:
  *
  *	Increment bo_numoutput, the buffer object's count of outstanding
  *	output operations, taking and releasing the bufobj lock around the
  *	update.
  */
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 
 	BO_LOCK(bo);
 	bo->bo_numoutput += 1;
 	BO_UNLOCK(bo);
 }
 
 /*
  *	bufobj_wdrop:
  *
  *	Decrement bo_numoutput under the bufobj lock.  When the count
  *	reaches zero and BO_WWAIT is set, the flag is cleared and sleepers
  *	on &bo->bo_numoutput are woken up (see bufobj_wwait()).
  */
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	bo->bo_numoutput--;
 	if (bo->bo_numoutput == 0 && (bo->bo_flag & BO_WWAIT) != 0) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  *	bpin:
  *
  *	Increment the buffer's pin count under the pool mutex associated
  *	with the buffer.
  */
 void
 bpin(struct buf *bp)
 {
 	struct mtx *lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	bp->b_pin_count += 1;
 	mtx_unlock(lock);
 }
 
 /*
  *	bunpin:
  *
  *	Decrement the buffer's pin count under the pool mutex associated
  *	with the buffer.  When the count reaches zero, threads sleeping on
  *	the buffer are woken up (see bunpin_wait()).
  */
 void
 bunpin(struct buf *bp)
 {
 	struct mtx *lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	bp->b_pin_count--;
 	if (bp->b_pin_count == 0)
 		wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  *	bunpin_wait:
  *
  *	Sleep until the buffer's pin count is no longer positive.  The
  *	count is tested under the pool mutex associated with the buffer,
  *	which is the mutex bunpin() updates it under.
  */
 void
 bunpin_wait(struct buf *bp)
 {
 	struct mtx *lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	while (bp->b_pin_count > 0) {
 		msleep(bp, lock, PRIBIO, "bwunpin", 0);
 	}
 	mtx_unlock(lock);
 }
 
 /*
  * Set bio_data or bio_ma for struct bio from the struct buf.
  *
  * A mapped buffer passes its b_data pointer and a NULL bio_ma.  An
  * unmapped buffer passes its page array, the page count and the offset
  * within the first page, and sets BIO_UNMAPPED on the bio.
  */
 void
 bdata2bio(struct buf *bp, struct bio *bip)
 {
 
 	if ((bp->b_flags & B_UNMAPPED) == 0) {
 		bip->bio_data = bp->b_data;
 		bip->bio_ma = NULL;
 		return;
 	}
 
 	KASSERT(unmapped_buf_allowed, ("unmapped"));
 	bip->bio_ma = bp->b_pages;
 	bip->bio_ma_n = bp->b_npages;
 	bip->bio_data = unmapped_buf;
 	bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 	bip->bio_flags |= BIO_UNMAPPED;
 
 	/* The page array must cover exactly the range being transferred. */
 	KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
 	    PAGE_SIZE == bp->b_npages,
 	    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
 	    (long long)bip->bio_length, bip->bio_ma_n));
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /*
  * DDB command "show buffer <addr>": print the flags, sizes, addresses,
  * block numbers, page list and lock state of the struct buf at <addr>.
  */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	struct buf *bp;
 	vm_page_t m;
 	int i;
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 	bp = (struct buf *)addr;
 
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
 	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
 	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
 	    "b_dep = %p\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
 	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
 
 	/* One comma-separated (object, index, physical address) per page. */
 	if (bp->b_npages != 0) {
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			m = bp->b_pages[i];
 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 			if (i + 1 < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
 
 	db_printf(" ");
 	BUF_LOCKPRINTINFO(bp);
 }
 
 /*
  * DDB command "show lockedbufs": run the "show buffer" output for every
  * entry of the buf[] array that is currently locked.
  */
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int idx;
 
 	for (idx = 0; idx < nbuf; idx++) {
 		bp = &buf[idx];
 		if (!BUF_ISLOCKED(bp))
 			continue;
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
 {
 	struct vnode *vp;
 	struct buf *bp;
 
 	if (!have_addr) {
 		db_printf("usage: show vnodebufs <addr>\n");
 		return;
 	}
 	vp = (struct vnode *)addr;
 	db_printf("Clean buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 	db_printf("Dirty buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 	}
 }
 
 DB_COMMAND(countfreebufs, db_coundfreebufs)
 {
 	struct buf *bp;
 	int i, used = 0, nfree = 0;
 
 	if (have_addr) {
 		db_printf("usage: countfreebufs\n");
 		return;
 	}
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if ((bp->b_flags & B_INFREECNT) != 0)
 			nfree++;
 		else
 			used++;
 	}
 
 	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
 	    nfree + used);
 	db_printf("numfreebuffers is %d\n", numfreebuffers);
 }
 #endif /* DDB */
Index: user/attilio/rm_vmobj_cache/sys/net/bpf_zerocopy.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/net/bpf_zerocopy.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/net/bpf_zerocopy.c	(revision 267237)
@@ -1,599 +1,599 @@
 /*-
  * Copyright (c) 2007 Seccuris Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract to
  * Seccuris Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_bpf.h"
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/uio.h>
 
 #include <machine/atomic.h>
 
 #include <net/if.h>
 #include <net/bpf.h>
 #include <net/bpf_zerocopy.h>
 #include <net/bpfdesc.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 /*
  * Zero-copy buffer scheme for BPF: user space "donates" two buffers, which
  * are mapped into the kernel address space using sf_bufs and used directly
  * by BPF.  Memory is wired since page faults cannot be tolerated in the
  * contexts where the buffers are copied to (locks held, interrupt context,
  * etc).  Access to shared memory buffers is synchronized using a header on
  * each buffer, allowing the number of system calls to go to zero as BPF
  * reaches saturation (buffers filled as fast as they can be drained by the
  * user process).  Full details of the protocol for communicating between the
  * user process and BPF may be found in bpf(4).
  */
 
 /*
  * Maximum number of pages per buffer.  Since all BPF devices use two, the
  * maximum per device is 2*BPF_MAX_PAGES.  Resource limits on the number of
  * sf_bufs may be an issue, so do not set this too high.  On older systems,
  * kernel address space limits may also be an issue.
  */
 #define	BPF_MAX_PAGES	512
 
 /*
  * struct zbuf describes a memory buffer loaned by a user process to the
  * kernel.  We represent this as a series of pages managed using an array of
  * sf_bufs.  Even though the memory is contiguous in user space, it may not
  * be mapped contiguously in the kernel (i.e., a set of physically
  * non-contiguous pages in the direct map region) so we must implement
  * scatter-gather copying.  One significant mitigating factor is that on
  * systems with a direct memory map, we can avoid TLB misses.
  *
  * At the front of the shared memory region is a bpf_zbuf_header, which
  * contains shared control data to allow user space and the kernel to
  * synchronize; this is included in zb_size, but not bpf_bufsize, so that BPF
  * knows that the space is not available.
  */
 struct zbuf {
 	vm_offset_t	 zb_uaddr;	/* User address at time of setup. */
 	size_t		 zb_size;	/* Size of buffer, incl. header. */
 	u_int		 zb_numpages;	/* Number of entries in zb_pages. */
 	int		 zb_flags;	/* ZBUF_FLAG_* bits. */
 	struct sf_buf	**zb_pages;	/* One sf_buf per page. */
 	struct bpf_zbuf_header	*zb_header;	/* Shared header (page 0). */
 };
 
 /*
  * When a buffer has been assigned to userspace, flag it as such, as the
  * buffer may remain in the store position as a result of the user process
  * not yet having acknowledged the buffer in the hold position.
  */
 #define	ZBUF_FLAG_ASSIGNED	0x00000001	/* Set when owned by user. */
 
 /*
  * Release a page we've previously wired.
  *
  * With the page lock held, drop one wiring.  If the wire count is then
  * zero and the page is not attached to any VM object, free the page
  * before dropping the lock.
  */
 static void
 zbuf_page_free(vm_page_t pp)
 {
 
 	vm_page_lock(pp);
-	vm_page_unwire(pp, 0);
+	vm_page_unwire(pp, PQ_INACTIVE);
 	if (pp->wire_count == 0 && pp->object == NULL)
 		vm_page_free(pp);
 	vm_page_unlock(pp);
 }
 
 /*
  * Tear down one sf_buf and then release the page that was attached to it.
  * The page is fetched from the sf_buf before the sf_buf itself is freed.
  */
 static void
 zbuf_sfbuf_free(struct sf_buf *sf)
 {
 	vm_page_t backing = sf_buf_page(sf);
 
 	sf_buf_free(sf);
 	zbuf_page_free(backing);
 }
 
 /*
  * Free a zbuf, including its page array, sbufs, and pages.  Allow partially
  * allocated zbufs to be freed so that it may be used even during a zbuf
  * setup.
  */
 static void
 zbuf_free(struct zbuf *zb)
 {
 	int i;
 
 	for (i = 0; i < zb->zb_numpages; i++) {
 		if (zb->zb_pages[i] != NULL)
 			zbuf_sfbuf_free(zb->zb_pages[i]);
 	}
 	free(zb->zb_pages, M_BPF);
 	free(zb, M_BPF);
 }
 
 /*
  * Given a user pointer to a page of user memory, return an sf_buf for the
  * page, or NULL if the page cannot be held or no sf_buf is available.
  * Because we may be requesting quite a few sf_bufs, prefer failure to
  * deadlock and use SFB_NOWAIT.
  */
 static struct sf_buf *
 zbuf_sfbuf_get(struct vm_map *map, vm_offset_t uaddr)
 {
 	struct sf_buf *mapping;
 	vm_page_t pg;
 	int held;
 
 	held = vm_fault_quick_hold_pages(map, uaddr, PAGE_SIZE,
 	    VM_PROT_READ | VM_PROT_WRITE, &pg, 1);
 	if (held < 0)
 		return (NULL);
 
 	/* Swap the hold taken above for a wiring, under the page lock. */
 	vm_page_lock(pg);
 	vm_page_wire(pg);
 	vm_page_unhold(pg);
 	vm_page_unlock(pg);
 
 	mapping = sf_buf_alloc(pg, SFB_NOWAIT);
 	if (mapping == NULL)
 		zbuf_page_free(pg);	/* Undo the wiring on failure. */
 	return (mapping);
 }
 
 /*
  * Create a zbuf describing a range of user address space memory.
  *
  * uaddr must be page-aligned and len must be a non-zero multiple of
  * PAGE_SIZE covering at most BPF_MAX_PAGES pages; otherwise EINVAL is
  * returned.  If any page cannot be held and mapped, everything acquired
  * so far is released and EFAULT is returned.  On success 0 is returned
  * and *zbp points to the new zbuf, whose shared header (at the start of
  * the first page) has been zeroed.  On any failure *zbp is NULL.
  */
 static int
 zbuf_setup(struct thread *td, vm_offset_t uaddr, size_t len,
     struct zbuf **zbp)
 {
 	struct zbuf *zb;
 	struct vm_map *map;
 	u_int i;
 
 	*zbp = NULL;
 
 	/*
 	 * User address must be page-aligned.
 	 */
 	if (uaddr & PAGE_MASK)
 		return (EINVAL);
 
 	/*
 	 * Length must be an integer number of full pages, and there must be
 	 * at least one page: the shared header lives in page 0, which is
 	 * dereferenced unconditionally below.
 	 */
 	if (len == 0 || (len & PAGE_MASK) != 0)
 		return (EINVAL);
 
 	/*
 	 * Length must not exceed per-buffer resource limit.
 	 */
 	if ((len / PAGE_SIZE) > BPF_MAX_PAGES)
 		return (EINVAL);
 
 	/*
 	 * Allocate the buffer and set up each page with its own sf_buf.
 	 */
 	zb = malloc(sizeof(*zb), M_BPF, M_ZERO | M_WAITOK);
 	zb->zb_uaddr = uaddr;
 	zb->zb_size = len;
 	zb->zb_numpages = len / PAGE_SIZE;
 	zb->zb_pages = malloc(sizeof(struct sf_buf *) *
 	    zb->zb_numpages, M_BPF, M_ZERO | M_WAITOK);
 	map = &td->td_proc->p_vmspace->vm_map;
 	for (i = 0; i < zb->zb_numpages; i++) {
 		zb->zb_pages[i] = zbuf_sfbuf_get(map,
 		    uaddr + ((vm_offset_t)i * PAGE_SIZE));
 		if (zb->zb_pages[i] == NULL) {
 			/* zbuf_free() copes with the unfilled slots. */
 			zbuf_free(zb);
 			return (EFAULT);
 		}
 	}
 	zb->zb_header =
 	    (struct bpf_zbuf_header *)sf_buf_kva(zb->zb_pages[0]);
 	bzero(zb->zb_header, sizeof(*zb->zb_header));
 	*zbp = zb;
 	return (0);
 }
 
 /*
  * Copy len bytes from src into the zbuf passed as buf, starting offset
  * bytes past the shared header.  The destination is addressed one sf_buf
  * page at a time, so the copy proceeds in chunks that never cross a page
  * boundary.  The caller is responsible for performing bounds checking;
  * only KASSERTs guard the page index here.
  */
 void
 bpf_zerocopy_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
     void *src, u_int len)
 {
 	u_int chunk, pg, pgoff;
 	u_char *from;
 	struct zbuf *zb;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_append_bytes: not in zbuf mode"));
 	KASSERT(buf != NULL, ("bpf_zerocopy_append_bytes: NULL buf"));
 
 	zb = (struct zbuf *)buf;
 	from = (u_char *)src;
 
 	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
 	    ("bpf_zerocopy_append_bytes: ZBUF_FLAG_ASSIGNED"));
 
 	/*
 	 * Step over the header, split the position into a page index and an
 	 * offset within that page, then copy up to a page at a time.
 	 */
 	offset += sizeof(struct bpf_zbuf_header);
 	pg = offset / PAGE_SIZE;
 	pgoff = offset % PAGE_SIZE;
 	for (; len > 0; len -= chunk, from += chunk) {
 		KASSERT(pg < zb->zb_numpages, ("bpf_zerocopy_append_bytes:"
 		   " page overflow (%d p %d np)\n", pg, zb->zb_numpages));
 
 		chunk = min(len, PAGE_SIZE - pgoff);
 		bcopy(from, (u_char *)sf_buf_kva(zb->zb_pages[pg]) + pgoff,
 		    chunk);
 		pgoff += chunk;
 		if (pgoff == PAGE_SIZE) {
 			/* This page is full; continue at the next one. */
 			pgoff = 0;
 			pg++;
 		}
 		KASSERT(pgoff < PAGE_SIZE,
 		    ("bpf_zerocopy_append_bytes: page offset overflow (%d)",
 		    pgoff));
 	}
 }
 
 /*
  * Copy len bytes from the mbuf chain passed as src into the zbuf passed
  * as buf, starting offset bytes past the shared header.  Both sides are
  * scattered: the source across mbufs, the destination across sf_buf
  * pages, so each chunk is limited by what is left in the current mbuf,
  * what is left in the current page, and what is left to copy.  As with
  * bpf_zerocopy_append_bytes(), the caller is responsible for checking
  * that this will not exceed the buffer limit.
  */
 void
 bpf_zerocopy_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset,
     void *src, u_int len)
 {
 	u_int chunk, moff, pg, pgoff;
 	const struct mbuf *cur;
 	struct zbuf *zb;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_append_mbuf not in zbuf mode"));
 	KASSERT(buf != NULL, ("bpf_zerocopy_append_mbuf: NULL buf"));
 
 	zb = (struct zbuf *)buf;
 	cur = (struct mbuf *)src;
 
 	KASSERT((zb->zb_flags & ZBUF_FLAG_ASSIGNED) == 0,
 	    ("bpf_zerocopy_append_mbuf: ZBUF_FLAG_ASSIGNED"));
 
 	/*
 	 * Step over the header and split the position into a page index and
 	 * an offset within that page.
 	 */
 	offset += sizeof(struct bpf_zbuf_header);
 	pg = offset / PAGE_SIZE;
 	pgoff = offset % PAGE_SIZE;
 	for (moff = 0; len > 0; len -= chunk) {
 		KASSERT(pg < zb->zb_numpages,
 		    ("bpf_zerocopy_append_mbuf: page overflow (%d p %d "
 		    "np)\n", pg, zb->zb_numpages));
 		KASSERT(cur != NULL,
 		    ("bpf_zerocopy_append_mbuf: end of mbuf chain"));
 
 		chunk = min(cur->m_len - moff, len);
 		chunk = min(chunk, PAGE_SIZE - pgoff);
 		bcopy(mtod(cur, u_char *) + moff,
 		    (u_char *)sf_buf_kva(zb->zb_pages[pg]) + pgoff, chunk);
 
 		/* Advance the destination, moving to the next page if full. */
 		pgoff += chunk;
 		if (pgoff == PAGE_SIZE) {
 			pgoff = 0;
 			pg++;
 		}
 		KASSERT(pgoff < PAGE_SIZE,
 		    ("bpf_zerocopy_append_mbuf: page offset overflow (%d)",
 		    pgoff));
 
 		/* Advance the source, moving to the next mbuf if drained. */
 		moff += chunk;
 		if (moff == cur->m_len) {
 			cur = cur->m_next;
 			moff = 0;
 		}
 	}
 }
 
 /*
  * Notification from the BPF framework that the buffer in the store
  * position is rejecting packets and may be considered full.  Unless it is
  * already assigned, flag it as assigned, publish the store length in the
  * shared header, and bump the kernel generation number with release
  * semantics so that user space can pick the buffer up.
  */
 void
 bpf_zerocopy_buffull(struct bpf_d *d)
 {
 	struct zbuf *store;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_buffull: not in zbuf mode"));
 
 	store = (struct zbuf *)d->bd_sbuf;
 	KASSERT(store != NULL, ("bpf_zerocopy_buffull: zb == NULL"));
 
 	/* Already handed to user space: nothing more to publish. */
 	if ((store->zb_flags & ZBUF_FLAG_ASSIGNED) != 0)
 		return;
 
 	store->zb_flags |= ZBUF_FLAG_ASSIGNED;
 	store->zb_header->bzh_kernel_len = d->bd_slen;
 	atomic_add_rel_int(&store->zb_header->bzh_kernel_gen, 1);
 }
 
 /*
  * Notification from the BPF framework that a buffer has moved into the
  * held slot on a descriptor.  Unless the buffer was already flagged as
  * assigned (which happens if it filled while in the store position), flag
  * it, publish the held length in the shared header, and bump the kernel
  * generation number with release semantics.
  *
  * Note: same logic as bpf_zerocopy_buffull(), except that this operates
  * on bd_hbuf and bd_hlen.
  */
 void
 bpf_zerocopy_bufheld(struct bpf_d *d)
 {
 	struct zbuf *held;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_bufheld: not in zbuf mode"));
 
 	held = (struct zbuf *)d->bd_hbuf;
 	KASSERT(held != NULL, ("bpf_zerocopy_bufheld: zb == NULL"));
 
 	/* Already handed to user space: nothing more to publish. */
 	if ((held->zb_flags & ZBUF_FLAG_ASSIGNED) != 0)
 		return;
 
 	held->zb_flags |= ZBUF_FLAG_ASSIGNED;
 	held->zb_header->bzh_kernel_len = d->bd_hlen;
 	atomic_add_rel_int(&held->zb_header->bzh_kernel_gen, 1);
 }
 
 /*
  * Notification from the BPF framework that a buffer has been rotated out
  * of the held position to the free position.  This happens when the user
  * acknowledges the held buffer.  The buffer now in bd_fbuf has its
  * ZBUF_FLAG_ASSIGNED bit cleared, marking it as no longer owned by user
  * space.
  */
 void
 bpf_zerocopy_buf_reclaimed(struct bpf_d *d)
 {
 	struct zbuf *zb;
 
 	/* Assertion messages name this function so a panic is traceable. */
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_buf_reclaimed: not in zbuf mode"));
 
 	KASSERT(d->bd_fbuf != NULL,
 	    ("bpf_zerocopy_buf_reclaimed: NULL free buf"));
 	zb = (struct zbuf *)d->bd_fbuf;
 	zb->zb_flags &= ~ZBUF_FLAG_ASSIGNED;
 }
 
 /*
  * Query from the BPF framework regarding whether the buffer currently in
  * the held position can be moved to the free position.  The user process
  * signals this by making its generation number equal to the kernel
  * generation number.  Returns 1 if they match, 0 if they differ or there
  * is no held buffer.
  */
 int
 bpf_zerocopy_canfreebuf(struct bpf_d *d)
 {
 	struct zbuf *held;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_canfreebuf: not in zbuf mode"));
 
 	held = (struct zbuf *)d->bd_hbuf;
 	if (held == NULL)
 		return (0);
 
 	/* The user generation is read with acquire semantics. */
 	return (held->zb_header->bzh_kernel_gen ==
 	    atomic_load_acq_int(&held->zb_header->bzh_user_gen));
 }
 
 /*
  * Query from the BPF framework as to whether or not the buffer currently
  * in the store position can actually be written to.  Returns 0 while the
  * store buffer is flagged as assigned to userspace (which can be the case
  * if it was assigned before the hold buffer was acknowledged), and 1
  * otherwise.
  */
 int
 bpf_zerocopy_canwritebuf(struct bpf_d *d)
 {
 	struct zbuf *store;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_canwritebuf: not in zbuf mode"));
 
 	store = (struct zbuf *)d->bd_sbuf;
 	KASSERT(store != NULL, ("bpf_zerocopy_canwritebuf: bd_sbuf NULL"));
 
 	return ((store->zb_flags & ZBUF_FLAG_ASSIGNED) == 0);
 }
 
 /*
  * Free zero copy buffers at request of descriptor.
  */
 void
 bpf_zerocopy_free(struct bpf_d *d)
 {
 	struct zbuf *zb;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_free: not in zbuf mode"));
 
 	zb = (struct zbuf *)d->bd_sbuf;
 	if (zb != NULL)
 		zbuf_free(zb);
 	zb = (struct zbuf *)d->bd_hbuf;
 	if (zb != NULL)
 		zbuf_free(zb);
 	zb = (struct zbuf *)d->bd_fbuf;
 	if (zb != NULL)
 		zbuf_free(zb);
 }
 
 /*
  * Ioctl to return the maximum buffer size, which is BPF_MAX_PAGES pages
  * expressed in bytes.  Stores the value through i and returns 0.
  */
 int
 bpf_zerocopy_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
 {
 	size_t zmax;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_ioctl_getzmax: not in zbuf mode"));
 
 	zmax = BPF_MAX_PAGES * PAGE_SIZE;
 	*i = zmax;
 	return (0);
 }
 
 /*
  * Ioctl to force rotation of the two buffers, if there's any data
  * available.  This can be used by user space to implement timeouts when
  * waiting for a buffer to fill.
  *
  * bz is zeroed first.  With the descriptor locked, if there is no held
  * buffer and the store buffer is non-empty, the buffers are rotated and
  * bz is filled in with the user address and length of the new held
  * buffer; otherwise bz is left zeroed.  Always returns 0.
  */
 int
 bpf_zerocopy_ioctl_rotzbuf(struct thread *td, struct bpf_d *d,
     struct bpf_zbuf *bz)
 {
 	struct zbuf *held;
 
 	bzero(bz, sizeof(*bz));
 
 	BPFD_LOCK(d);
 	if (d->bd_hbuf == NULL && d->bd_slen != 0) {
 		ROTATE_BUFFERS(d);
 		held = (struct zbuf *)d->bd_hbuf;
 		bz->bz_bufa = (void *)held->zb_uaddr;
 		bz->bz_buflen = d->bd_hlen;
 	}
 	BPFD_UNLOCK(d);
 
 	return (0);
 }
 
 /*
  * Ioctl to configure zero-copy buffers -- may be done only once.
  */
 int
 bpf_zerocopy_ioctl_setzbuf(struct thread *td, struct bpf_d *d,
     struct bpf_zbuf *bz)
 {
 	struct zbuf *zba, *zbb;
 	int error;
 
 	KASSERT(d->bd_bufmode == BPF_BUFMODE_ZBUF,
 	    ("bpf_zerocopy_ioctl_setzbuf: not in zbuf mode"));
 
 	/*
 	 * Must set both buffers.  Cannot clear them.
 	 */
 	if (bz->bz_bufa == NULL || bz->bz_bufb == NULL)
 		return (EINVAL);
 
 	/*
 	 * Buffers must have a size greater than 0.  Alignment and other size
 	 * validity checking is done in zbuf_setup().
 	 */
 	if (bz->bz_buflen == 0)
 		return (EINVAL);
 
 	/*
 	 * Allocate new buffers.
 	 */
 	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufa, bz->bz_buflen,
 	    &zba);
 	if (error)
 		return (error);
 	error = zbuf_setup(td, (vm_offset_t)bz->bz_bufb, bz->bz_buflen,
 	    &zbb);
 	if (error) {
 		zbuf_free(zba);
 		return (error);
 	}
 
 	/*
 	 * We only allow buffers to be installed once, so atomically check
 	 * that no buffers are currently installed and install new buffers.
 	 */
 	BPFD_LOCK(d);
 	if (d->bd_hbuf != NULL || d->bd_sbuf != NULL || d->bd_fbuf != NULL ||
 	    d->bd_bif != NULL) {
 		BPFD_UNLOCK(d);
 		zbuf_free(zba);
 		zbuf_free(zbb);
 		return (EINVAL);
 	}
 
 	/*
 	 * Point BPF descriptor at buffers; initialize sbuf as zba so that
 	 * it is always filled first in the sequence, per bpf(4).
 	 */
 	d->bd_fbuf = (caddr_t)zbb;
 	d->bd_sbuf = (caddr_t)zba;
 	d->bd_slen = 0;
 	d->bd_hlen = 0;
 
 	/*
 	 * We expose only the space left in the buffer after the size of the
 	 * shared management region.
 	 */
 	d->bd_bufsize = bz->bz_buflen - sizeof(struct bpf_zbuf_header);
 	BPFD_UNLOCK(d);
 	return (0);
 }
Index: user/attilio/rm_vmobj_cache/sys/vm/vm_fault.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/vm/vm_fault.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/vm/vm_fault.c	(revision 267237)
@@ -1,1565 +1,1565 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  *
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_fault.c	8.4 (Berkeley) 1/12/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	Page fault handling module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 
 #define PFBAK 4
 #define PFFOR 4
 
 static int vm_fault_additional_pages(vm_page_t, int, int, vm_page_t *, int *);
 
 #define	VM_FAULT_READ_BEHIND	8
 #define	VM_FAULT_READ_MAX	(1 + VM_FAULT_READ_AHEAD_MAX)
 #define	VM_FAULT_NINCR		(VM_FAULT_READ_MAX / VM_FAULT_READ_BEHIND)
 #define	VM_FAULT_SUM		(VM_FAULT_NINCR * (VM_FAULT_NINCR + 1) / 2)
 #define	VM_FAULT_CACHE_BEHIND	(VM_FAULT_READ_BEHIND * VM_FAULT_SUM)
 
 /*
  * Working state for one page fault, shared between vm_fault_hold() and
  * the helper routines below.
  */
 struct faultstate {
 	/* Page at (object, pindex), once looked up or allocated. */
 	vm_page_t m;
 	/* Object currently being searched; starts out as first_object. */
 	vm_object_t object;
 	/* Page index being searched for within object. */
 	vm_pindex_t pindex;
 	/*
 	 * Page associated with first_object; freed by
 	 * unlock_and_deallocate() when object != first_object.
 	 */
 	vm_page_t first_m;
 	/* Object reported by vm_map_lookup() for the faulting address. */
 	vm_object_t	first_object;
 	/* Page index reported by vm_map_lookup() within first_object. */
 	vm_pindex_t first_pindex;
 	/* Map the lookup was performed on. */
 	vm_map_t map;
 	/* Map entry reported by vm_map_lookup(). */
 	vm_map_entry_t entry;
 	/* TRUE from a successful lookup until unlock_map() ends it. */
 	int lookup_still_valid;
 	/* Vnode obtained with vget(), released with vput(); else NULL. */
 	struct vnode *vp;
 };
 
 static void vm_fault_cache_behind(const struct faultstate *fs, int distance);
 static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
 	    int faultcount, int reqpage);
 
 static inline void
 release_page(struct faultstate *fs)
 {
 
 	vm_page_xunbusy(fs->m);
 	vm_page_lock(fs->m);
 	vm_page_deactivate(fs->m);
 	vm_page_unlock(fs->m);
 	fs->m = NULL;
 }
 
 /*
  * End the map lookup recorded in fs, if one is still outstanding, and
  * note that it is no longer valid.  Does nothing otherwise.
  */
 static inline void
 unlock_map(struct faultstate *fs)
 {
 
 	if (!fs->lookup_still_valid)
 		return;
 	vm_map_lookup_done(fs->map, fs->entry);
 	fs->lookup_still_valid = FALSE;
 }
 
 static void
 unlock_and_deallocate(struct faultstate *fs)
 {
 
 	vm_object_pip_wakeup(fs->object);
 	VM_OBJECT_WUNLOCK(fs->object);
 	if (fs->object != fs->first_object) {
 		VM_OBJECT_WLOCK(fs->first_object);
 		vm_page_lock(fs->first_m);
 		vm_page_free(fs->first_m);
 		vm_page_unlock(fs->first_m);
 		vm_object_pip_wakeup(fs->first_object);
 		VM_OBJECT_WUNLOCK(fs->first_object);
 		fs->first_m = NULL;
 	}
 	vm_object_deallocate(fs->first_object);
 	unlock_map(fs);	
 	if (fs->vp != NULL) { 
 		vput(fs->vp);
 		fs->vp = NULL;
 	}
 }
 
 /*
  * TRYPAGER - used by vm_fault to calculate whether the pager for the
  *	      current object *might* contain the page.
  *
  *	      default objects are zero-fill, there is no real pager.
  */
 #define TRYPAGER	(fs.object->type != OBJT_DEFAULT && \
 			((fault_flags & VM_FAULT_CHANGE_WIRING) == 0 || wired))
 
 /*
  *	vm_fault:
  *
  *	Handle a page fault occurring at the given address,
  *	requiring the given permissions, in the map specified.
  *	If successful, the page is inserted into the
  *	associated physical map.
  *
  *	The address is truncated to a page boundary here before
  *	being handed to vm_fault_hold(), which does the work.
  *
  *	If the current thread has TDP_NOFAULTING set, no fault is
  *	attempted and KERN_PROTECTION_FAILURE is returned at once.
  *	Otherwise KERN_SUCCESS is returned if the page fault is
  *	handled, or a standard error specifying why the fault is
  *	fatal.
  *
  *	With KTRACE, faults on maps other than kernel_map are
  *	traced on entry and on exit.
  *
  *	The map in question must be referenced, and remains so.
  *	Caller may hold no locks.
  */
 int
 vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags)
 {
 	struct thread *td = curthread;
 	int rv;
 
 	if ((td->td_pflags & TDP_NOFAULTING) != 0)
 		return (KERN_PROTECTION_FAILURE);
 
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(td, KTR_FAULT))
 		ktrfault(vaddr, fault_type);
 #endif
 
 	/* NULL m_hold: the caller does not want the page returned held. */
 	rv = vm_fault_hold(map, trunc_page(vaddr), fault_type, fault_flags,
 	    NULL);
 
 #ifdef KTRACE
 	if (map != kernel_map && KTRPOINT(td, KTR_FAULTEND))
 		ktrfaultend(rv);
 #endif
 	return (rv);
 }
 
 int
 vm_fault_hold(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
     int fault_flags, vm_page_t *m_hold)
 {
 	vm_prot_t prot;
 	long ahead, behind;
 	int alloc_req, era, faultcount, nera, reqpage, result;
 	boolean_t growstack, is_first_object_locked, wired;
 	int map_generation;
 	vm_object_t next_object;
 	vm_page_t marray[VM_FAULT_READ_MAX];
 	int hardfault;
 	struct faultstate fs;
 	struct vnode *vp;
 	int locked, error;
 
 	hardfault = 0;
 	growstack = TRUE;
 	PCPU_INC(cnt.v_vm_faults);
 	fs.vp = NULL;
 	faultcount = reqpage = 0;
 
 RetryFault:;
 
 	/*
 	 * Find the backing store object and offset into it to begin the
 	 * search.
 	 */
 	fs.map = map;
 	result = vm_map_lookup(&fs.map, vaddr, fault_type, &fs.entry,
 	    &fs.first_object, &fs.first_pindex, &prot, &wired);
 	if (result != KERN_SUCCESS) {
 		if (growstack && result == KERN_INVALID_ADDRESS &&
 		    map != kernel_map) {
 			result = vm_map_growstack(curproc, vaddr);
 			if (result != KERN_SUCCESS)
 				return (KERN_FAILURE);
 			growstack = FALSE;
 			goto RetryFault;
 		}
 		return (result);
 	}
 
 	map_generation = fs.map->timestamp;
 
 	if (fs.entry->eflags & MAP_ENTRY_NOFAULT) {
 		if ((curthread->td_pflags & TDP_DEVMEMIO) != 0) {
 			vm_map_unlock_read(fs.map);
 			return (KERN_FAILURE);
 		}
 		panic("vm_fault: fault on nofault entry, addr: %lx",
 		    (u_long)vaddr);
 	}
 
 	if (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION &&
 	    fs.entry->wiring_thread != curthread) {
 		vm_map_unlock_read(fs.map);
 		vm_map_lock(fs.map);
 		if (vm_map_lookup_entry(fs.map, vaddr, &fs.entry) &&
 		    (fs.entry->eflags & MAP_ENTRY_IN_TRANSITION)) {
 			fs.entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP;
 			vm_map_unlock_and_wait(fs.map, 0);
 		} else
 			vm_map_unlock(fs.map);
 		goto RetryFault;
 	}
 
 	/*
 	 * Make a reference to this object to prevent its disposal while we
 	 * are messing with it.  Once we have the reference, the map is free
 	 * to be diddled.  Since objects reference their shadows (and copies),
 	 * they will stay around as well.
 	 *
 	 * Bump the paging-in-progress count to prevent size changes (e.g. 
 	 * truncation operations) during I/O.  This must be done after
 	 * obtaining the vnode lock in order to avoid possible deadlocks.
 	 */
 	VM_OBJECT_WLOCK(fs.first_object);
 	vm_object_reference_locked(fs.first_object);
 	vm_object_pip_add(fs.first_object, 1);
 
 	fs.lookup_still_valid = TRUE;
 
 	if (wired)
 		fault_type = prot | (fault_type & VM_PROT_COPY);
 
 	fs.first_m = NULL;
 
 	/*
 	 * Search for the page at object/offset.
 	 */
 	fs.object = fs.first_object;
 	fs.pindex = fs.first_pindex;
 	while (TRUE) {
 		/*
 		 * If the object is dead, we stop here
 		 */
 		if (fs.object->flags & OBJ_DEAD) {
 			unlock_and_deallocate(&fs);
 			return (KERN_PROTECTION_FAILURE);
 		}
 
 		/*
 		 * See if page is resident
 		 */
 		fs.m = vm_page_lookup(fs.object, fs.pindex);
 		if (fs.m != NULL) {
 			/*
 			 * Wait/Retry if the page is busy.  We have to do this
 			 * if the page is either exclusive or shared busy
 			 * because the vm_pager may be using read busy for
 			 * pageouts (and even pageins if it is the vnode
 			 * pager), and we could end up trying to pagein and
 			 * pageout the same page simultaneously.
 			 *
 			 * We can theoretically allow the busy case on a read
 			 * fault if the page is marked valid, but since such
 			 * pages are typically already pmap'd, putting that
 			 * special case in might be more effort then it is 
 			 * worth.  We cannot under any circumstances mess
 			 * around with a shared busied page except, perhaps,
 			 * to pmap it.
 			 */
 			if (vm_page_busied(fs.m)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it. 
 				 */
 				vm_page_aflag_set(fs.m, PGA_REFERENCED);
 				if (fs.object != fs.first_object) {
 					if (!VM_OBJECT_TRYWLOCK(
 					    fs.first_object)) {
 						VM_OBJECT_WUNLOCK(fs.object);
 						VM_OBJECT_WLOCK(fs.first_object);
 						VM_OBJECT_WLOCK(fs.object);
 					}
 					vm_page_lock(fs.first_m);
 					vm_page_free(fs.first_m);
 					vm_page_unlock(fs.first_m);
 					vm_object_pip_wakeup(fs.first_object);
 					VM_OBJECT_WUNLOCK(fs.first_object);
 					fs.first_m = NULL;
 				}
 				unlock_map(&fs);
 				if (fs.m == vm_page_lookup(fs.object,
 				    fs.pindex)) {
 					vm_page_sleep_if_busy(fs.m, "vmpfw");
 				}
 				vm_object_pip_wakeup(fs.object);
 				VM_OBJECT_WUNLOCK(fs.object);
 				PCPU_INC(cnt.v_intrans);
 				vm_object_deallocate(fs.first_object);
 				goto RetryFault;
 			}
 			vm_page_lock(fs.m);
 			vm_page_remque(fs.m);
 			vm_page_unlock(fs.m);
 
 			/*
 			 * Mark page busy for other processes, and the 
 			 * pagedaemon.  If it still isn't completely valid
 			 * (readable), jump to readrest, else break-out ( we
 			 * found the page ).
 			 */
 			vm_page_xbusy(fs.m);
 			if (fs.m->valid != VM_PAGE_BITS_ALL)
 				goto readrest;
 			break;
 		}
 
 		/*
 		 * Page is not resident, If this is the search termination
 		 * or the pager might contain the page, allocate a new page.
 		 */
 		if (TRYPAGER || fs.object == fs.first_object) {
 			if (fs.pindex >= fs.object->size) {
 				unlock_and_deallocate(&fs);
 				return (KERN_PROTECTION_FAILURE);
 			}
 
 			/*
 			 * Allocate a new page for this object/offset pair.
 			 *
 			 * Unlocked read of the p_flag is harmless. At
 			 * worst, the P_KILLED might be not observed
 			 * there, and allocation can fail, causing
 			 * restart and new reading of the p_flag.
 			 */
 			fs.m = NULL;
 			if (!vm_page_count_severe() || P_KILLED(curproc)) {
 #if VM_NRESERVLEVEL > 0
 				if ((fs.object->flags & OBJ_COLORED) == 0) {
 					fs.object->flags |= OBJ_COLORED;
 					fs.object->pg_color = atop(vaddr) -
 					    fs.pindex;
 				}
 #endif
 				alloc_req = P_KILLED(curproc) ?
 				    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL;
 				if (fs.object->type != OBJT_VNODE &&
 				    fs.object->backing_object == NULL)
 					alloc_req |= VM_ALLOC_ZERO;
 				if ((fs.object->flags & OBJ_UNMANAGED) != 0)
 					alloc_req |= VM_ALLOC_WIRED;
 				fs.m = vm_page_alloc(fs.object, fs.pindex,
 				    alloc_req);
 			}
 			if (fs.m == NULL) {
 				unlock_and_deallocate(&fs);
 				VM_WAITPFAULT;
 				goto RetryFault;
 			} else if (fs.m->valid == VM_PAGE_BITS_ALL)
 				break;
 		}
 
 readrest:
 		/*
 		 * We have found a valid page or we have allocated a new page.
 		 * The page thus may not be valid or may not be entirely 
 		 * valid.
 		 *
 		 * Attempt to fault-in the page if there is a chance that the
 		 * pager has it, and potentially fault in additional pages
 		 * at the same time.
 		 */
 		if (TRYPAGER) {
 			int rv;
 			u_char behavior = vm_map_entry_behavior(fs.entry);
 
 			if (behavior == MAP_ENTRY_BEHAV_RANDOM ||
 			    P_KILLED(curproc)) {
 				behind = 0;
 				ahead = 0;
 			} else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) {
 				behind = 0;
 				ahead = atop(fs.entry->end - vaddr) - 1;
 				if (ahead > VM_FAULT_READ_AHEAD_MAX)
 					ahead = VM_FAULT_READ_AHEAD_MAX;
 				if (fs.pindex == fs.entry->next_read)
 					vm_fault_cache_behind(&fs,
 					    VM_FAULT_READ_MAX);
 			} else {
 				/*
 				 * If this is a sequential page fault, then
 				 * arithmetically increase the number of pages
 				 * in the read-ahead window.  Otherwise, reset
 				 * the read-ahead window to its smallest size.
 				 */
 				behind = atop(vaddr - fs.entry->start);
 				if (behind > VM_FAULT_READ_BEHIND)
 					behind = VM_FAULT_READ_BEHIND;
 				ahead = atop(fs.entry->end - vaddr) - 1;
 				era = fs.entry->read_ahead;
 				if (fs.pindex == fs.entry->next_read) {
 					nera = era + behind;
 					if (nera > VM_FAULT_READ_AHEAD_MAX)
 						nera = VM_FAULT_READ_AHEAD_MAX;
 					behind = 0;
 					if (ahead > nera)
 						ahead = nera;
 					if (era == VM_FAULT_READ_AHEAD_MAX)
 						vm_fault_cache_behind(&fs,
 						    VM_FAULT_CACHE_BEHIND);
 				} else if (ahead > VM_FAULT_READ_AHEAD_MIN)
 					ahead = VM_FAULT_READ_AHEAD_MIN;
 				if (era != ahead)
 					fs.entry->read_ahead = ahead;
 			}
 
 			/*
 			 * Call the pager to retrieve the data, if any, after
 			 * releasing the lock on the map.  We hold a ref on
 			 * fs.object and the pages are exclusive busied.
 			 */
 			unlock_map(&fs);
 
 			if (fs.object->type == OBJT_VNODE) {
 				vp = fs.object->handle;
 				if (vp == fs.vp)
 					goto vnode_locked;
 				else if (fs.vp != NULL) {
 					vput(fs.vp);
 					fs.vp = NULL;
 				}
 				locked = VOP_ISLOCKED(vp);
 
 				if (locked != LK_EXCLUSIVE)
 					locked = LK_SHARED;
 				/* Do not sleep for vnode lock while fs.m is busy */
 				error = vget(vp, locked | LK_CANRECURSE |
 				    LK_NOWAIT, curthread);
 				if (error != 0) {
 					vhold(vp);
 					release_page(&fs);
 					unlock_and_deallocate(&fs);
 					error = vget(vp, locked | LK_RETRY |
 					    LK_CANRECURSE, curthread);
 					vdrop(vp);
 					fs.vp = vp;
 					KASSERT(error == 0,
 					    ("vm_fault: vget failed"));
 					goto RetryFault;
 				}
 				fs.vp = vp;
 			}
 vnode_locked:
 			KASSERT(fs.vp == NULL || !fs.map->system_map,
 			    ("vm_fault: vnode-backed object mapped by system map"));
 
 			/*
 			 * now we find out if any other pages should be paged
 			 * in at this time this routine checks to see if the
 			 * pages surrounding this fault reside in the same
 			 * object as the page for this fault.  If they do,
 			 * then they are faulted in also into the object.  The
 			 * array "marray" returned contains an array of
 			 * vm_page_t structs where one of them is the
 			 * vm_page_t passed to the routine.  The reqpage
 			 * return value is the index into the marray for the
 			 * vm_page_t passed to the routine.
 			 *
 			 * fs.m plus the additional pages are exclusive busied.
 			 */
 			faultcount = vm_fault_additional_pages(
 			    fs.m, behind, ahead, marray, &reqpage);
 
 			rv = faultcount ?
 			    vm_pager_get_pages(fs.object, marray, faultcount,
 				reqpage) : VM_PAGER_FAIL;
 
 			if (rv == VM_PAGER_OK) {
 				/*
 				 * Found the page. Leave it busy while we play
 				 * with it.
 				 */
 
 				/*
 				 * Relookup in case pager changed page. Pager
 				 * is responsible for disposition of old page
 				 * if moved.
 				 */
 				fs.m = vm_page_lookup(fs.object, fs.pindex);
 				if (!fs.m) {
 					unlock_and_deallocate(&fs);
 					goto RetryFault;
 				}
 
 				hardfault++;
 				break; /* break to PAGE HAS BEEN FOUND */
 			}
 			/*
 			 * Remove the bogus page (which does not exist at this
 			 * object/offset); before doing so, we must get back
 			 * our object lock to preserve our invariant.
 			 *
 			 * Also wake up any other process that may want to bring
 			 * in this page.
 			 *
 			 * If this is the top-level object, we must leave the
 			 * busy page to prevent another process from rushing
 			 * past us, and inserting the page in that object at
 			 * the same time that we are.
 			 */
 			if (rv == VM_PAGER_ERROR)
 				printf("vm_fault: pager read error, pid %d (%s)\n",
 				    curproc->p_pid, curproc->p_comm);
 			/*
 			 * Data outside the range of the pager or an I/O error
 			 */
 			/*
 			 * XXX - the check for kernel_map is a kludge to work
 			 * around having the machine panic on a kernel space
 			 * fault w/ I/O error.
 			 */
 			if (((fs.map != kernel_map) && (rv == VM_PAGER_ERROR)) ||
 				(rv == VM_PAGER_BAD)) {
 				vm_page_lock(fs.m);
 				vm_page_free(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
 				unlock_and_deallocate(&fs);
 				return ((rv == VM_PAGER_ERROR) ? KERN_FAILURE : KERN_PROTECTION_FAILURE);
 			}
 			if (fs.object != fs.first_object) {
 				vm_page_lock(fs.m);
 				vm_page_free(fs.m);
 				vm_page_unlock(fs.m);
 				fs.m = NULL;
 				/*
 				 * XXX - we cannot just fall out at this
 				 * point, m has been freed and is invalid!
 				 */
 			}
 		}
 
 		/*
 		 * We get here if the object has default pager (or unwiring) 
 		 * or the pager doesn't have the page.
 		 */
 		if (fs.object == fs.first_object)
 			fs.first_m = fs.m;
 
 		/*
 		 * Move on to the next object.  Lock the next object before
 		 * unlocking the current one.
 		 */
 		fs.pindex += OFF_TO_IDX(fs.object->backing_object_offset);
 		next_object = fs.object->backing_object;
 		if (next_object == NULL) {
 			/*
 			 * If there's no object left, fill the page in the top
 			 * object with zeros.
 			 */
 			if (fs.object != fs.first_object) {
 				vm_object_pip_wakeup(fs.object);
 				VM_OBJECT_WUNLOCK(fs.object);
 
 				fs.object = fs.first_object;
 				fs.pindex = fs.first_pindex;
 				fs.m = fs.first_m;
 				VM_OBJECT_WLOCK(fs.object);
 			}
 			fs.first_m = NULL;
 
 			/*
 			 * Zero the page if necessary and mark it valid.
 			 */
 			if ((fs.m->flags & PG_ZERO) == 0) {
 				pmap_zero_page(fs.m);
 			} else {
 				PCPU_INC(cnt.v_ozfod);
 			}
 			PCPU_INC(cnt.v_zfod);
 			fs.m->valid = VM_PAGE_BITS_ALL;
 			/* Don't try to prefault neighboring pages. */
 			faultcount = 1;
 			break;	/* break to PAGE HAS BEEN FOUND */
 		} else {
 			KASSERT(fs.object != next_object,
 			    ("object loop %p", next_object));
 			VM_OBJECT_WLOCK(next_object);
 			vm_object_pip_add(next_object, 1);
 			if (fs.object != fs.first_object)
 				vm_object_pip_wakeup(fs.object);
 			VM_OBJECT_WUNLOCK(fs.object);
 			fs.object = next_object;
 		}
 	}
 
 	vm_page_assert_xbusied(fs.m);
 
 	/*
 	 * PAGE HAS BEEN FOUND. [Loop invariant still holds -- the object lock
 	 * is held.]
 	 */
 
 	/*
 	 * If the page is being written, but isn't already owned by the
 	 * top-level object, we have to copy it into a new page owned by the
 	 * top-level object.
 	 */
 	if (fs.object != fs.first_object) {
 		/*
 		 * We only really need to copy if we want to write it.
 		 */
 		if ((fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) {
 			/*
 			 * This allows pages to be virtually copied from a 
 			 * backing_object into the first_object, where the 
 			 * backing object has no other refs to it, and cannot
 			 * gain any more refs.  Instead of a bcopy, we just 
 			 * move the page from the backing object to the 
 			 * first object.  Note that we must mark the page 
 			 * dirty in the first object so that it will go out 
 			 * to swap when needed.
 			 */
 			is_first_object_locked = FALSE;
 			if (
 				/*
 				 * Only one shadow object
 				 */
 				(fs.object->shadow_count == 1) &&
 				/*
 				 * No COW refs, except us
 				 */
 				(fs.object->ref_count == 1) &&
 				/*
 				 * No one else can look this object up
 				 */
 				(fs.object->handle == NULL) &&
 				/*
 				 * No other ways to look the object up
 				 */
 				((fs.object->type == OBJT_DEFAULT) ||
 				 (fs.object->type == OBJT_SWAP)) &&
 			    (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs.first_object)) &&
 				/*
 				 * We don't chase down the shadow chain
 				 */
 			    fs.object == fs.first_object->backing_object) {
 				/*
 				 * get rid of the unnecessary page
 				 */
 				vm_page_lock(fs.first_m);
 				vm_page_free(fs.first_m);
 				vm_page_unlock(fs.first_m);
 				/*
 				 * grab the page and put it into the 
 				 * process'es object.  The page is 
 				 * automatically made dirty.
 				 */
 				if (vm_page_rename(fs.m, fs.first_object,
 				    fs.first_pindex)) {
 					unlock_and_deallocate(&fs);
 					goto RetryFault;
 				}
 				vm_page_xbusy(fs.m);
 				fs.first_m = fs.m;
 				fs.m = NULL;
 				PCPU_INC(cnt.v_cow_optim);
 			} else {
 				/*
 				 * Oh, well, lets copy it.
 				 */
 				pmap_copy_page(fs.m, fs.first_m);
 				fs.first_m->valid = VM_PAGE_BITS_ALL;
 				if (wired && (fault_flags &
 				    VM_FAULT_CHANGE_WIRING) == 0) {
 					vm_page_lock(fs.first_m);
 					vm_page_wire(fs.first_m);
 					vm_page_unlock(fs.first_m);
 					
 					vm_page_lock(fs.m);
-					vm_page_unwire(fs.m, FALSE);
+					vm_page_unwire(fs.m, PQ_INACTIVE);
 					vm_page_unlock(fs.m);
 				}
 				/*
 				 * We no longer need the old page or object.
 				 */
 				release_page(&fs);
 			}
 			/*
 			 * fs.object != fs.first_object due to above 
 			 * conditional
 			 */
 			vm_object_pip_wakeup(fs.object);
 			VM_OBJECT_WUNLOCK(fs.object);
 			/*
 			 * Only use the new page below...
 			 */
 			fs.object = fs.first_object;
 			fs.pindex = fs.first_pindex;
 			fs.m = fs.first_m;
 			if (!is_first_object_locked)
 				VM_OBJECT_WLOCK(fs.object);
 			PCPU_INC(cnt.v_cow_faults);
 			curthread->td_cow++;
 		} else {
 			prot &= ~VM_PROT_WRITE;
 		}
 	}
 
 	/*
 	 * We must verify that the maps have not changed since our last
 	 * lookup.
 	 */
 	if (!fs.lookup_still_valid) {
 		vm_object_t retry_object;
 		vm_pindex_t retry_pindex;
 		vm_prot_t retry_prot;
 
 		if (!vm_map_trylock_read(fs.map)) {
 			release_page(&fs);
 			unlock_and_deallocate(&fs);
 			goto RetryFault;
 		}
 		fs.lookup_still_valid = TRUE;
 		if (fs.map->timestamp != map_generation) {
 			result = vm_map_lookup_locked(&fs.map, vaddr, fault_type,
 			    &fs.entry, &retry_object, &retry_pindex, &retry_prot, &wired);
 
 			/*
 			 * If we don't need the page any longer, put it on the inactive
 			 * list (the easiest thing to do here).  If no one needs it,
 			 * pageout will grab it eventually.
 			 */
 			if (result != KERN_SUCCESS) {
 				release_page(&fs);
 				unlock_and_deallocate(&fs);
 
 				/*
 				 * If retry of map lookup would have blocked then
 				 * retry fault from start.
 				 */
 				if (result == KERN_FAILURE)
 					goto RetryFault;
 				return (result);
 			}
 			if ((retry_object != fs.first_object) ||
 			    (retry_pindex != fs.first_pindex)) {
 				release_page(&fs);
 				unlock_and_deallocate(&fs);
 				goto RetryFault;
 			}
 
 			/*
 			 * Check whether the protection has changed or the object has
 			 * been copied while we left the map unlocked. Changing from
 			 * read to write permission is OK - we leave the page
 			 * write-protected, and catch the write fault. Changing from
 			 * write to read permission means that we can't mark the page
 			 * write-enabled after all.
 			 */
 			prot &= retry_prot;
 		}
 	}
 	/*
 	 * If the page was filled by a pager, update the map entry's
 	 * last read offset.  Since the pager does not return the
 	 * actual set of pages that it read, this update is based on
 	 * the requested set.  Typically, the requested and actual
 	 * sets are the same.
 	 *
 	 * XXX The following assignment modifies the map
 	 * without holding a write lock on it.
 	 */
 	if (hardfault)
 		fs.entry->next_read = fs.pindex + faultcount - reqpage;
 
 	if ((prot & VM_PROT_WRITE) != 0 ||
 	    (fault_flags & VM_FAULT_DIRTY) != 0) {
 		vm_object_set_writeable_dirty(fs.object);
 
 		/*
 		 * If this is a NOSYNC mmap we do not want to set VPO_NOSYNC
 		 * if the page is already dirty to prevent data written with
 		 * the expectation of being synced from not being synced.
 		 * Likewise if this entry does not request NOSYNC then make
 		 * sure the page isn't marked NOSYNC.  Applications sharing
 		 * data should use the same flags to avoid ping ponging.
 		 */
 		if (fs.entry->eflags & MAP_ENTRY_NOSYNC) {
 			if (fs.m->dirty == 0)
 				fs.m->oflags |= VPO_NOSYNC;
 		} else {
 			fs.m->oflags &= ~VPO_NOSYNC;
 		}
 
 		/*
 		 * If the fault is a write, we know that this page is being
 		 * written NOW so dirty it explicitly to save on 
 		 * pmap_is_modified() calls later.
 		 *
 		 * Also tell the backing pager, if any, that it should remove
 		 * any swap backing since the page is now dirty.
 		 */
 		if (((fault_type & VM_PROT_WRITE) != 0 &&
 		    (fault_flags & VM_FAULT_CHANGE_WIRING) == 0) ||
 		    (fault_flags & VM_FAULT_DIRTY) != 0) {
 			vm_page_dirty(fs.m);
 			vm_pager_page_unswapped(fs.m);
 		}
 	}
 
 	vm_page_assert_xbusied(fs.m);
 
 	/*
 	 * Page must be completely valid or it is not fit to
 	 * map into user space.  vm_pager_get_pages() ensures this.
 	 */
 	KASSERT(fs.m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_fault: page %p partially invalid", fs.m));
 	VM_OBJECT_WUNLOCK(fs.object);
 
 	/*
 	 * Put this page into the physical map.  We had to do the unlock above
 	 * because pmap_enter() may sleep.  We don't put the page
 	 * back on the active queue until later so that the pageout daemon
 	 * won't find it (yet).
 	 */
 	pmap_enter(fs.map->pmap, vaddr, fault_type, fs.m, prot, wired);
 	if (faultcount != 1 && (fault_flags & VM_FAULT_CHANGE_WIRING) == 0 &&
 	    wired == 0)
 		vm_fault_prefault(&fs, vaddr, faultcount, reqpage);
 	VM_OBJECT_WLOCK(fs.object);
 	vm_page_lock(fs.m);
 
 	/*
 	 * If the page is not wired down, then put it where the pageout daemon
 	 * can find it.
 	 */
 	if (fault_flags & VM_FAULT_CHANGE_WIRING) {
 		if (wired)
 			vm_page_wire(fs.m);
 		else
-			vm_page_unwire(fs.m, 1);
+			vm_page_unwire(fs.m, PQ_ACTIVE);
 	} else
 		vm_page_activate(fs.m);
 	if (m_hold != NULL) {
 		*m_hold = fs.m;
 		vm_page_hold(fs.m);
 	}
 	vm_page_unlock(fs.m);
 	vm_page_xunbusy(fs.m);
 
 	/*
 	 * Unlock everything, and return
 	 */
 	unlock_and_deallocate(&fs);
 	if (hardfault) {
 		PCPU_INC(cnt.v_io_faults);
 		curthread->td_ru.ru_majflt++;
 	} else 
 		curthread->td_ru.ru_minflt++;
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Speed up the reclamation of up to "distance" pages that precede the
  * faulting pindex within the first object of the shadow chain.
  *
  * The caller must hold the write lock on fs->object (asserted below).
  * When fs->first_object is a different object it is write-locked for the
  * duration of the scan and unlocked again before returning; the lock on
  * fs->object is still held on return.  Note that fs->object's lock may be
  * dropped and reacquired in the process.
  */
 static void
 vm_fault_cache_behind(const struct faultstate *fs, int distance)
 {
 	vm_object_t first_object, object;
 	vm_page_t m, m_prev;
 	vm_pindex_t pindex;
 
 	object = fs->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	first_object = fs->first_object;
 	if (first_object != object) {
 		/*
 		 * Try to take the first object's lock without blocking.  If
 		 * that fails, release the lock on "object" and then acquire
 		 * both locks, first_object first (presumably the required
 		 * lock order -- confirm).
 		 */
 		if (!VM_OBJECT_TRYWLOCK(first_object)) {
 			VM_OBJECT_WUNLOCK(object);
 			VM_OBJECT_WLOCK(first_object);
 			VM_OBJECT_WLOCK(object);
 		}
 	}
 	/* Neither fictitious nor unmanaged pages can be cached. */
 	if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) {
 		/*
 		 * Compute the lowest pindex to examine: "distance" pages
 		 * before the faulting index, clamped at zero and at the
 		 * index corresponding to the map entry's starting offset.
 		 */
 		if (fs->first_pindex < distance)
 			pindex = 0;
 		else
 			pindex = fs->first_pindex - distance;
 		if (pindex < OFF_TO_IDX(fs->entry->offset))
 			pindex = OFF_TO_IDX(fs->entry->offset);
 		/* Start from the faulting page that lives in first_object. */
 		m = first_object != object ? fs->first_m : fs->m;
 		vm_page_assert_xbusied(m);
 		m_prev = vm_page_prev(m);
 		/*
 		 * Walk backward page by page.  The walk ends at the first
 		 * page that is missing, lies below the window, or is not
 		 * fully valid.
 		 */
 		while ((m = m_prev) != NULL && m->pindex >= pindex &&
 		    m->valid == VM_PAGE_BITS_ALL) {
 			/* Fetch the predecessor before possibly skipping m. */
 			m_prev = vm_page_prev(m);
 			if (vm_page_busied(m))
 				continue;
 			vm_page_lock(m);
 			/* Held or wired pages are left untouched. */
 			if (m->hold_count == 0 && m->wire_count == 0) {
 				pmap_remove_all(m);
 				vm_page_aflag_clear(m, PGA_REFERENCED);
 				/* Dirty pages are deactivated, clean ones cached. */
 				if (m->dirty != 0)
 					vm_page_deactivate(m);
 				else
 					vm_page_cache(m);
 			}
 			vm_page_unlock(m);
 		}
 	}
 	if (first_object != object)
 		VM_OBJECT_WUNLOCK(first_object);
 }
 
 /*
  * vm_fault_prefault provides a quick way of clustering
  * pagefaults into a processes address space.  It is a "cousin"
  * of vm_map_pmap_enter, except it runs at page fault time instead
  * of mmap time.
  *
  * "fs" supplies the map and map entry of the fault and "addra" is the
  * faulting virtual address.  When "faultcount" is positive, up to
  * "reqpage" pages before and "faultcount" - "reqpage" - 1 pages after
  * "addra" are considered; otherwise the PFBAK/PFFOR defaults are used.
  * Nothing is done unless the faulting pmap belongs to the current
  * process.
  */
 static void
 vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra,
     int faultcount, int reqpage)
 {
 	pmap_t pmap;
 	vm_map_entry_t entry;
 	vm_object_t backing_object, lobject;
 	vm_offset_t addr, starta;
 	vm_pindex_t pindex;
 	vm_page_t m;
 	int backward, forward, i;
 
 	pmap = fs->map->pmap;
 	if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace))
 		return;
 
 	if (faultcount > 0) {
 		backward = reqpage;
 		forward = faultcount - reqpage - 1;
 	} else {
 		backward = PFBAK;
 		forward = PFFOR;
 	}
 	entry = fs->entry;
 
 	/*
 	 * Lowest candidate address: never below the start of the map
 	 * entry.  A result greater than "addra" means the subtraction
 	 * wrapped around, in which case 0 is used.
 	 */
 	starta = addra - backward * PAGE_SIZE;
 	if (starta < entry->start) {
 		starta = entry->start;
 	} else if (starta > addra) {
 		starta = 0;
 	}
 
 	/*
 	 * Generate the sequence of virtual addresses that are candidates for
 	 * prefaulting in an outward spiral from the faulting virtual address,
 	 * "addra".  Specifically, the sequence is "addra - PAGE_SIZE", "addra
 	 * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ...
 	 * If the candidate address doesn't have a backing physical page, then
 	 * the loop immediately terminates.
 	 */
 	for (i = 0; i < 2 * imax(backward, forward); i++) {
 		/* Even i steps backward, odd i steps forward. */
 		addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE :
 		    PAGE_SIZE);
 		/*
 		 * A candidate beyond the forward window is replaced by 0,
 		 * presumably so that the range check below rejects it.
 		 */
 		if (addr > addra + forward * PAGE_SIZE)
 			addr = 0;
 
 		if (addr < starta || addr >= entry->end)
 			continue;
 
 		if (!pmap_is_prefaultable(pmap, addr))
 			continue;
 
 		/*
 		 * Look the page up in the entry's object and, while it is
 		 * missing and the current object is of type OBJT_DEFAULT,
 		 * descend into the backing object.  The next object is
 		 * read-locked before the current one is unlocked.
 		 */
 		pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 		lobject = entry->object.vm_object;
 		VM_OBJECT_RLOCK(lobject);
 		while ((m = vm_page_lookup(lobject, pindex)) == NULL &&
 		    lobject->type == OBJT_DEFAULT &&
 		    (backing_object = lobject->backing_object) != NULL) {
 			KASSERT((lobject->backing_object_offset & PAGE_MASK) ==
 			    0, ("vm_fault_prefault: unaligned object offset"));
 			pindex += lobject->backing_object_offset >> PAGE_SHIFT;
 			VM_OBJECT_RLOCK(backing_object);
 			VM_OBJECT_RUNLOCK(lobject);
 			lobject = backing_object;
 		}
 		/* No resident page: stop prefaulting altogether. */
 		if (m == NULL) {
 			VM_OBJECT_RUNLOCK(lobject);
 			break;
 		}
 		/* Only fully valid, non-fictitious pages are mapped. */
 		if (m->valid == VM_PAGE_BITS_ALL &&
 		    (m->flags & PG_FICTITIOUS) == 0)
 			pmap_enter_quick(pmap, addr, m, entry->protection);
 		VM_OBJECT_RUNLOCK(lobject);
 	}
 }
 
 /*
  * Hold the physical pages that back the virtual address range
  * ["addr", "addr" + "len") of "map", provided the mappings are valid and
  * permit access of type "prot".
  *
  * On success the held pages are stored in "ma" and their number is
  * returned; a zero "len" yields 0.  If the range is illegal or any page
  * cannot be held, all pages held so far are released and -1 is returned.
  * The function panics when the page-aligned range needs more than
  * "max_count" array slots.
  */
 int
 vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len,
     vm_prot_t prot, vm_page_t *ma, int max_count)
 {
 	vm_offset_t cur, limit;
 	boolean_t need_fault;
 	int idx, npages;
 
 	if (len == 0)
 		return (0);
 	limit = round_page(addr + len);
 	addr = trunc_page(addr);
 
 	/*
 	 * Refuse a range that lies outside the map or whose end precedes
 	 * its start.
 	 */
 	if (addr < vm_map_min(map) || addr > limit ||
 	    limit > vm_map_max(map))
 		return (-1);
 
 	if (atop(limit - addr) > max_count)
 		panic("vm_fault_quick_hold_pages: count > max_count");
 	npages = atop(limit - addr);
 
 	/*
 	 * Fast pass: the pages are most likely resident in the pmap, so ask
 	 * pmap_extract_and_hold() for each of them first.
 	 */
 	need_fault = FALSE;
 	idx = 0;
 	cur = addr;
 	while (cur < limit) {
 		ma[idx] = pmap_extract_and_hold(map->pmap, cur, prot);
 		if (ma[idx] == NULL) {
 			need_fault = TRUE;
 		} else if ((prot & VM_PROT_WRITE) != 0 &&
 		    ma[idx]->dirty != VM_PAGE_BITS_ALL) {
 			/*
 			 * Dirty the page explicitly; the caller may modify
 			 * it through an unmanaged mapping or by DMA, which
 			 * would otherwise go unnoticed.
 			 *
 			 * The object lock is not held here.
 			 * See vm_page_clear_dirty_mask().
 			 */
 			vm_page_dirty(ma[idx]);
 		}
 		idx++;
 		cur += PAGE_SIZE;
 	}
 	if (!need_fault)
 		return (npages);
 
 	/*
 	 * Slow pass: at least one address had no mapping or a mapping with
 	 * insufficient permissions.  Fault in and hold each missing page.
 	 */
 	for (idx = 0, cur = addr; cur < limit; idx++, cur += PAGE_SIZE) {
 		if (ma[idx] != NULL)
 			continue;
 		if (vm_fault_hold(map, cur, prot, VM_FAULT_NORMAL,
 		    &ma[idx]) == KERN_SUCCESS)
 			continue;
 
 		/* Failure: drop the hold on every page obtained so far. */
 		for (idx = 0; idx < npages; idx++) {
 			if (ma[idx] == NULL)
 				continue;
 			vm_page_lock(ma[idx]);
 			vm_page_unhold(ma[idx]);
 			vm_page_unlock(ma[idx]);
 		}
 		return (-1);
 	}
 	return (npages);
 }
 
 /*
  *	vm_fault_wire:
  *
  *	Wire down the virtual address range ["start", "end") of "map",
  *	one page at a time.  Returns KERN_SUCCESS when every page was
  *	wired.  On the first failure the pages wired so far are unwired
  *	again and the failing status is returned.
  */
 int
 vm_fault_wire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     boolean_t fictitious)
 {
 	vm_offset_t cur;
 	int status;
 
 	/*
 	 * Each page is obtained and entered in the physical map by
 	 * simulating a fault on it with VM_FAULT_CHANGE_WIRING.
 	 */
 	cur = start;
 	while (cur < end) {
 		status = vm_fault(map, cur, VM_PROT_NONE,
 		    VM_FAULT_CHANGE_WIRING);
 		if (!status) {
 			cur += PAGE_SIZE;
 			continue;
 		}
 		/* Roll back whatever was wired before the failing page. */
 		if (cur != start)
 			vm_fault_unwire(map, start, cur, fictitious);
 		return (status);
 	}
 	return (KERN_SUCCESS);
 }
 
 /*
  *	vm_fault_unwire:
  *
  *	Unwire a range of virtual addresses in a map.
  *
  *	Every page-sized step of ["start", "end") that has a physical
  *	translation in the map's pmap has its mapping marked unwired.
  *	Unless "fictitious" is set, the backing vm_page is unwired as well.
  *	Addresses without a translation are skipped.
  */
 void
 vm_fault_unwire(vm_map_t map, vm_offset_t start, vm_offset_t end,
     boolean_t fictitious)
 {
 	vm_paddr_t pa;
 	vm_offset_t va;
 	vm_page_t m;
 	pmap_t pmap;
 
 	pmap = vm_map_pmap(map);
 
 	/*
 	 * Since the pages are wired down, we must be able to get their
 	 * mappings from the physical map system.
 	 */
 	for (va = start; va < end; va += PAGE_SIZE) {
 		pa = pmap_extract(pmap, va);
 		/* A zero physical address is treated as "no mapping". */
 		if (pa != 0) {
 			pmap_change_wiring(pmap, va, FALSE);
 			if (!fictitious) {
 				/* The page lock is held around the unwire. */
 				m = PHYS_TO_VM_PAGE(pa);
 				vm_page_lock(m);
-				vm_page_unwire(m, TRUE);
+				vm_page_unwire(m, PQ_ACTIVE);
 				vm_page_unlock(m);
 			}
 		}
 	}
 }
 
 /*
  *	Routine:
  *		vm_fault_copy_entry
  *	Function:
  *		Create new shadow object backing dst_entry with private copy of
  *		all underlying pages. When src_entry is equal to dst_entry,
  *		function implements COW for wired-down map entry. Otherwise,
  *		it forks wired entry into dst_map.
  *
  *	In/out conditions:
  *		The source and destination maps must be locked for write.
  *		The source map entry must be wired down (or be a sharing map
  *		entry corresponding to a main map entry that is wired down).
  *
  *	Parameters:
  *		dst_map		map whose pmap receives the new mappings
  *		src_map		not used by the code (only touched under lint)
  *		dst_entry	entry that receives the private pages
  *		src_entry	wired entry providing the source pages
  *		fork_charge	if not NULL, the destination object is
  *				charged to the current thread's credentials
  *				and its charge is added to *fork_charge
  */
 void
 vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
     vm_map_entry_t dst_entry, vm_map_entry_t src_entry,
     vm_ooffset_t *fork_charge)
 {
 	vm_object_t backing_object, dst_object, object, src_object;
 	vm_pindex_t dst_pindex, pindex, src_pindex;
 	vm_prot_t access, prot;
 	vm_offset_t vaddr;
 	vm_page_t dst_m;
 	vm_page_t src_m;
 	boolean_t upgrade;
 
 #ifdef	lint
 	src_map++;
 #endif	/* lint */
 
 	/* "upgrade" means the entry is being converted in place. */
 	upgrade = src_entry == dst_entry;
 	access = prot = dst_entry->protection;
 
 	src_object = src_entry->object.vm_object;
 	src_pindex = OFF_TO_IDX(src_entry->offset);
 
 	if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) {
 		/* Reuse the source object, taking an extra reference. */
 		dst_object = src_object;
 		vm_object_reference(dst_object);
 	} else {
 		/*
 		 * Create the top-level object for the destination entry. (Doesn't
 		 * actually shadow anything - we copy the pages directly.)
 		 */
 		dst_object = vm_object_allocate(OBJT_DEFAULT,
 		    OFF_TO_IDX(dst_entry->end - dst_entry->start));
 #if VM_NRESERVLEVEL > 0
 		dst_object->flags |= OBJ_COLORED;
 		dst_object->pg_color = atop(dst_entry->start);
 #endif
 	}
 
 	VM_OBJECT_WLOCK(dst_object);
 	KASSERT(upgrade || dst_entry->object.vm_object == NULL,
 	    ("vm_fault_copy_entry: vm_object not NULL"));
 	if (src_object != dst_object) {
 		/* Attach the new object to the destination entry. */
 		dst_entry->object.vm_object = dst_object;
 		dst_entry->offset = 0;
 		dst_object->charge = dst_entry->end - dst_entry->start;
 	}
 	if (fork_charge != NULL) {
 		KASSERT(dst_entry->cred == NULL,
 		    ("vm_fault_copy_entry: leaked swp charge"));
 		dst_object->cred = curthread->td_ucred;
 		crhold(dst_object->cred);
 		*fork_charge += dst_object->charge;
 	} else if (dst_object->cred == NULL) {
 		/* Move the credential from the entry to the object. */
 		KASSERT(dst_entry->cred != NULL, ("no cred for entry %p",
 		    dst_entry));
 		dst_object->cred = dst_entry->cred;
 		dst_entry->cred = NULL;
 	}
 
 	/*
 	 * If not an upgrade, then enter the mappings in the pmap as
 	 * read and/or execute accesses.  Otherwise, enter them as
 	 * write accesses.
 	 *
 	 * A writeable large page mapping is only created if all of
 	 * the constituent small page mappings are modified. Marking
 	 * PTEs as modified on inception allows promotion to happen
 	 * without taking potentially large number of soft faults.
 	 */
 	if (!upgrade)
 		access &= ~VM_PROT_WRITE;
 
 	/*
 	 * Loop through all of the virtual pages within the entry's
 	 * range, copying each page from the source object to the
 	 * destination object.  Since the source is wired, those pages
 	 * must exist.  In contrast, the destination is pageable.
 	 * Since the destination object does not share any backing
 	 * storage with the source object, all of its pages must be
 	 * dirtied, regardless of whether they can be written.
 	 */
 	for (vaddr = dst_entry->start, dst_pindex = 0;
 	    vaddr < dst_entry->end;
 	    vaddr += PAGE_SIZE, dst_pindex++) {
 again:
 		/*
 		 * Find the page in the source object, and copy it in.
 		 * Because the source is wired down, the page will be
 		 * in memory.
 		 */
 		if (src_object != dst_object)
 			VM_OBJECT_RLOCK(src_object);
 		object = src_object;
 		pindex = src_pindex + dst_pindex;
 		while ((src_m = vm_page_lookup(object, pindex)) == NULL &&
 		    (backing_object = object->backing_object) != NULL) {
 			/*
 			 * Unless the source mapping is read-only or
 			 * it is presently being upgraded from
 			 * read-only, the first object in the shadow
 			 * chain should provide all of the pages.  In
 			 * other words, this loop body should never be
 			 * executed when the source mapping is already
 			 * read/write.
 			 */
 			KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 ||
 			    upgrade,
 			    ("vm_fault_copy_entry: main object missing page"));
 
 			/*
 			 * Lock the backing object before releasing the
 			 * current one; dst_object's write lock is kept.
 			 */
 			VM_OBJECT_RLOCK(backing_object);
 			pindex += OFF_TO_IDX(object->backing_object_offset);
 			if (object != dst_object)
 				VM_OBJECT_RUNLOCK(object);
 			object = backing_object;
 		}
 		KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing"));
 
 		if (object != dst_object) {
 			/*
 			 * Allocate a page in the destination object.
 			 */
 			dst_m = vm_page_alloc(dst_object, (src_object ==
 			    dst_object ? src_pindex : 0) + dst_pindex,
 			    VM_ALLOC_NORMAL);
 			if (dst_m == NULL) {
 				/*
 				 * No page available: drop both locks, wait,
 				 * and redo this page from the lookup.
 				 */
 				VM_OBJECT_WUNLOCK(dst_object);
 				VM_OBJECT_RUNLOCK(object);
 				VM_WAIT;
 				VM_OBJECT_WLOCK(dst_object);
 				goto again;
 			}
 			pmap_copy_page(src_m, dst_m);
 			VM_OBJECT_RUNLOCK(object);
 			dst_m->valid = VM_PAGE_BITS_ALL;
 			dst_m->dirty = VM_PAGE_BITS_ALL;
 		} else {
 			/*
 			 * The page already lives in the destination
 			 * object; use it in place once it is not busy.
 			 */
 			dst_m = src_m;
 			if (vm_page_sleep_if_busy(dst_m, "fltupg"))
 				goto again;
 			vm_page_xbusy(dst_m);
 			KASSERT(dst_m->valid == VM_PAGE_BITS_ALL,
 			    ("invalid dst page %p", dst_m));
 		}
 		/* The object lock is not held across pmap_enter(). */
 		VM_OBJECT_WUNLOCK(dst_object);
 
 		/*
 		 * Enter it in the pmap. If a wired, copy-on-write
 		 * mapping is being replaced by a write-enabled
 		 * mapping, then wire that new mapping.
 		 */
 		pmap_enter(dst_map->pmap, vaddr, access, dst_m, prot, upgrade);
 
 		/*
 		 * Mark it no longer busy, and put it on the active list.
 		 */
 		VM_OBJECT_WLOCK(dst_object);
 		
 		if (upgrade) {
 			if (src_m != dst_m) {
 				/* Move the wiring from the old page to the copy. */
 				vm_page_lock(src_m);
-				vm_page_unwire(src_m, 0);
+				vm_page_unwire(src_m, PQ_INACTIVE);
 				vm_page_unlock(src_m);
 				vm_page_lock(dst_m);
 				vm_page_wire(dst_m);
 				vm_page_unlock(dst_m);
 			} else {
 				KASSERT(dst_m->wire_count > 0,
 				    ("dst_m %p is not wired", dst_m));
 			}
 		} else {
 			vm_page_lock(dst_m);
 			vm_page_activate(dst_m);
 			vm_page_unlock(dst_m);
 		}
 		vm_page_xunbusy(dst_m);
 	}
 	VM_OBJECT_WUNLOCK(dst_object);
 	if (upgrade) {
 		/*
 		 * The entry no longer needs copy-on-write handling;
 		 * vm_object_deallocate() presumably drops the reference
 		 * this entry held on the source object.
 		 */
 		dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY);
 		vm_object_deallocate(src_object);
 	}
 }
 
 
 /*
  * This routine checks around the requested page for other pages that
  * might be able to be faulted in.  This routine brackets the viable
  * pages for the pages to be paged in.
  *
  * Inputs:
  *	m	the requested page; its object must be write-locked
  *	rbehind	maximum number of pages to read behind "m"
  *	rahead	maximum number of pages to read ahead of "m"
  *
  * Outputs:
  *	marray	array of vm_page_t, in ascending pindex order, holding
  *		the newly allocated neighbours together with "m"
  *	reqpage	index of "m" within marray (not written when 0 is
  *		returned)
  *
  * Return value:
  *	number of pages in marray, or 0 if the pager does not have the
  *	requested page
  */
 static int
 vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
     vm_page_t *marray, int *reqpage)
 {
 	int i, j;
 	vm_object_t object;
 	vm_pindex_t pindex, startpindex, endpindex, tpindex;
 	vm_page_t rtm;
 	int alloc_req, cbehind, cahead;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	object = m->object;
 	pindex = m->pindex;
 	cbehind = cahead = 0;
 
 	/*
 	 * if the requested page is not available, then give up now
 	 */
 	if (!vm_pager_has_page(object, pindex, &cbehind, &cahead)) {
 		return (0);
 	}
 
 	/* The pager offers no neighbours: return just the requested page. */
 	if ((cbehind == 0) && (cahead == 0)) {
 		*reqpage = 0;
 		marray[0] = m;
 		return (1);
 	}
 
 	/* Never ask for more than the pager reported in either direction. */
 	if (rahead > cahead) {
 		rahead = cahead;
 	}
 
 	if (rbehind > cbehind) {
 		rbehind = cbehind;
 	}
 
 	alloc_req = VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED;
 	if ((object->flags & OBJ_UNMANAGED) != 0)
 		alloc_req |= VM_ALLOC_WIRED;
 
 	/*
 	 * scan backward for the read behind pages -- in memory 
 	 */
 	if (pindex > 0) {
 		if (rbehind > pindex) {
 			rbehind = pindex;
 			startpindex = 0;
 		} else {
 			startpindex = pindex - rbehind;
 		}
 
 		/*
 		 * Do not reach back past the nearest page that precedes
 		 * "m" on the object's page list.
 		 */
 		if ((rtm = TAILQ_PREV(m, pglist, listq)) != NULL &&
 		    rtm->pindex >= startpindex)
 			startpindex = rtm->pindex + 1;
 
 		/* tpindex is unsigned; beware of numeric underflow. */
 		for (i = 0, tpindex = pindex - 1; tpindex >= startpindex &&
 		    tpindex < pindex; i++, tpindex--) {
 
 			rtm = vm_page_alloc(object, tpindex, alloc_req);
 			if (rtm == NULL) {
 				/*
 				 * Shift the allocated pages to the
 				 * beginning of the array.
 				 */
 				for (j = 0; j < i; j++) {
 					marray[j] = marray[j + tpindex + 1 -
 					    startpindex];
 				}
 				break;
 			}
 
 			marray[tpindex - startpindex] = rtm;
 		}
 	} else {
 		startpindex = 0;
 		i = 0;
 	}
 
 	/* "i" read-behind pages precede the requested page in marray. */
 	marray[i] = m;
 	/* page offset of the required page */
 	*reqpage = i;
 
 	tpindex = pindex + 1;
 	i++;
 
 	/*
 	 * scan forward for the read ahead pages
 	 */
 	endpindex = tpindex + rahead;
 	/*
 	 * Stop before the next page that follows "m" on the object's page
 	 * list and before the end of the object.
 	 */
 	if ((rtm = TAILQ_NEXT(m, listq)) != NULL && rtm->pindex < endpindex)
 		endpindex = rtm->pindex;
 	if (endpindex > object->size)
 		endpindex = object->size;
 
 	for (; tpindex < endpindex; i++, tpindex++) {
 
 		rtm = vm_page_alloc(object, tpindex, alloc_req);
 		if (rtm == NULL) {
 			break;
 		}
 
 		marray[i] = rtm;
 	}
 
 	/* return number of pages */
 	return (i);
 }
 
 /*
  * Prevent the calling thread from entering the machine-independent page
  * fault handler: later vm_fault() calls made by this thread will return
  * KERN_PROTECTION_FAILURE.  Machine-dependent handling of spurious page
  * faults is enabled as well.
  *
  * The value handed back by curthread_pflags_set() is returned so that it
  * can be passed to vm_fault_enable_pagefaults() later.
  */
 int
 vm_fault_disable_pagefaults(void)
 {
 	int saved;
 
 	saved = curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR);
 	return (saved);
 }
 
 /*
  * Restore the calling thread's private flags from "save", which is
  * expected to be the value returned by an earlier call to
  * vm_fault_disable_pagefaults().
  */
 void
 vm_fault_enable_pagefaults(int save)
 {
 
 	curthread_pflags_restore(save);
 }
Index: user/attilio/rm_vmobj_cache/sys/vm/vm_glue.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/vm/vm_glue.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/vm/vm_glue.c	(revision 267237)
@@ -1,1044 +1,1044 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/vmmeter.h>
 #include <sys/vmem.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/_kstack_cache.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 /*
  * Forward declarations for the swap helpers defined further down in this
  * file; like their definitions, they are compiled only when NO_SWAPPING is
  * not defined.
  */
 #ifndef NO_SWAPPING
 static int swapout(struct proc *);
 static void swapclear(struct proc *);
 static void vm_thread_swapin(struct thread *td);
 static void vm_thread_swapout(struct thread *td);
 #endif
 
 /*
  * MPSAFE
  *
  * Check whether the kernel map permits the access described by "rw" (a
  * mask limited to VM_PROT_ALL bits) over the byte range [addr, addr + len).
  * The range is widened to page boundaries before the map is consulted.
  * Returns non-zero when vm_map_check_protection() reports TRUE, and FALSE
  * when the range wraps around or ends beyond kernel_map->max_offset.
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  In most cases
  * just checking the vm_map_entry is sufficient within the kernel's address
  * space.
  */
 int
 kernacc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
 	vm_prot_t prot;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
 	/* Reject ranges that end past the kernel map or wrap around. */
 	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
 	prot = rw;
 	saddr = trunc_page((vm_offset_t)addr);
 	eaddr = round_page((vm_offset_t)addr + len);
 	/* The map is only read-locked for the duration of the check. */
 	vm_map_lock_read(kernel_map);
 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
 	vm_map_unlock_read(kernel_map);
 	return (rv == TRUE);
 }
 
 /*
  * MPSAFE
  *
  * Check whether the current process's address map permits the access
  * described by "rw" (a mask limited to VM_PROT_ALL bits) over the byte
  * range [addr, addr + len).  The range is widened to page boundaries
  * before the map is consulted.  Returns non-zero when
  * vm_map_check_protection() reports TRUE, and FALSE when the range wraps
  * around or ends beyond vm_map_max() of that map.
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
  * used in conjunction with this call.
  */
 int
 useracc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_prot_t prot;
 	vm_map_t map;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
 	prot = rw;
 	map = &curproc->p_vmspace->vm_map;
 	/* Reject ranges that end past the user map or wrap around. */
 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
 		return (FALSE);
 	}
 	/* The map is only read-locked for the duration of the check. */
 	vm_map_lock_read(map);
 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
 	    round_page((vm_offset_t)addr + len), prot);
 	vm_map_unlock_read(map);
 	return (rv == TRUE);
 }
 
 /*
  * Wire the pages of the current process's map that cover
  * [addr, addr + len).
  *
  * Returns 0 on success, EINVAL if the range wraps around, ENOMEM if it
  * spans more than vm_page_max_wired pages, and EFAULT if vm_map_wire()
  * does not report KERN_SUCCESS.
  */
 int
 vslock(void *addr, size_t len)
 {
 	vm_offset_t base, first, limit, stop;
 	vm_size_t npages;
 	int rv;
 
 	base = (vm_offset_t)addr;
 	limit = base + len;
 	first = trunc_page(base);
 	stop = round_page(limit);
 	/* Either the raw end or the page-rounded end wrapped around. */
 	if (limit < base || stop < base)
 		return (EINVAL);
 	npages = atop(stop - first);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
 #if 0
 	/*
 	 * XXX - not yet
 	 *
 	 * The limit for transient usage of wired pages should be
 	 * larger than for "permanent" wired pages (mlock()).
 	 *
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	rv = vm_map_wire(&curproc->p_vmspace->vm_map, first, stop,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 	/*
 	 * Any failure is reported as EFAULT, matching copy{in,out}(),
 	 * instead of the ENOMEM that mlock() would give.
 	 */
 	if (rv != KERN_SUCCESS)
 		return (EFAULT);
 	return (0);
 }
 
 /*
  * Unwire the pages of the current process's map that cover
  * [addr, addr + len).  The result of vm_map_unwire() is discarded.
  */
 void
 vsunlock(void *addr, size_t len)
 {
 	vm_offset_t first, stop;
 
 	/* No argument checks here: vslock() is relied upon for those. */
 	first = trunc_page((vm_offset_t)addr);
 	stop = round_page((vm_offset_t)addr + len);
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map, first, stop,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 }
 
 /*
  * Hold (pin) the page of "object" that contains byte "offset".  A page that
  * is not fully valid is first filled through the object's pager.  Returns
  * the held page, or NULL when the page is gone after the pager call or the
  * pager did not return VM_PAGER_OK.  The object is write-locked for the
  * duration of the call.
  */
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t pg, run[1];
 	vm_pindex_t idx;
 	int status;
 
 	idx = OFF_TO_IDX(offset);
 	VM_OBJECT_WLOCK(object);
 	pg = vm_page_grab(object, idx, VM_ALLOC_NORMAL);
 	if (pg->valid != VM_PAGE_BITS_ALL) {
 		run[0] = pg;
 		status = vm_pager_get_pages(object, run, 1, 0);
 		/* Look the page up again after the pager call. */
 		pg = vm_page_lookup(object, idx);
 		if (pg != NULL && status != VM_PAGER_OK) {
 			/* The read failed: discard the page. */
 			vm_page_lock(pg);
 			vm_page_free(pg);
 			vm_page_unlock(pg);
 			pg = NULL;
 		}
 	}
 	if (pg != NULL) {
 		vm_page_xunbusy(pg);
 		vm_page_lock(pg);
 		vm_page_hold(pg);
 		vm_page_unlock(pg);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	return (pg);
 }
 
 /*
  * Hold the page at "offset" within "object" and return a CPU private
  * sf_buf mapping of it.  Returns NULL, without pinning the thread, when
  * the page cannot be held.  On the success path the thread is pinned with
  * sched_pin() before the mapping is allocated.
  */
 struct sf_buf *
 vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
 {
 	struct sf_buf *mapping;
 	vm_page_t pg;
 
 	pg = vm_imgact_hold_page(object, offset);
 	if (pg != NULL) {
 		sched_pin();
 		mapping = sf_buf_alloc(pg, SFB_CPUPRIVATE);
 	} else
 		mapping = NULL;
 	return (mapping);
 }
 
 /*
  * Undo vm_imgact_map_page(): free the CPU private mapping, unpin the
  * thread, and drop the hold on the page that was mapped.
  */
 void
 vm_imgact_unmap_page(struct sf_buf *sf)
 {
 	vm_page_t pg;
 
 	/* Fetch the page before the sf_buf is released. */
 	pg = sf_buf_page(sf);
 	sf_buf_free(sf);
 	sched_unpin();
 
 	vm_page_lock(pg);
 	vm_page_unhold(pg);
 	vm_page_unlock(pg);
 }
 
 /*
  * Forward an instruction cache synchronization request for "sz" bytes at
  * "va" to pmap_sync_icache(), using the pmap of the given map.
  */
 void
 vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
 {
 
 	pmap_sync_icache(map->pmap, va, sz);
 }
 
 /*
  * Cache of free kernel stacks.  kstack_cache is the head of a singly linked
  * list threaded through next_ks_entry: vm_thread_dispose() pushes stacks
  * onto it and vm_thread_new() pops them, both under kstack_cache_mtx.
  */
 struct kstack_cache_entry *kstack_cache;
 /* vm_thread_dispose() caches a stack only while kstacks <= this value. */
 static int kstack_cache_size = 128;
 /*
  * Count of kernel stacks: incremented when vm_thread_new() allocates a
  * fresh stack and decremented by vm_thread_stack_dispose().
  */
 static int kstacks;
 static struct mtx kstack_cache_mtx;
 MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
 
 /* Export both values as vm.kstack_cache_size (RW) and vm.kstacks (RD). */
 SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
     "");
 SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
     "");
 
 /*
  * Fallback upper bound on kernel stack pages, used when the build options
  * do not define one.  It also sizes the on-stack page arrays below.
  */
 #ifndef KSTACK_MAX_PAGES
 #define KSTACK_MAX_PAGES 32
 #endif
 
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
  * create performance for a thread.
  *
  * "pages" is the requested stack size in pages: values <= 1 select
  * KSTACK_PAGES and values above KSTACK_MAX_PAGES are clamped.  Returns 1
  * on success and 0 if no kernel virtual address range could be allocated.
  */
 int
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	vm_page_t m, ma[KSTACK_MAX_PAGES];
 	struct kstack_cache_entry *ks_ce;
 	int i;
 
 	/* Bounds check */
 	if (pages <= 1)
 		pages = KSTACK_PAGES;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
 	/*
 	 * Fast path: a default-sized stack can be taken from the cache.
 	 * The cache entry is stored at the stack's own address, so the
 	 * entry pointer doubles as td_kstack.
 	 */
 	if (pages == KSTACK_PAGES) {
 		mtx_lock(&kstack_cache_mtx);
 		if (kstack_cache != NULL) {
 			ks_ce = kstack_cache;
 			kstack_cache = ks_ce->next_ks_entry;
 			mtx_unlock(&kstack_cache_mtx);
 
 			td->td_kstack_obj = ks_ce->ksobj;
 			td->td_kstack = (vm_offset_t)ks_ce;
 			td->td_kstack_pages = KSTACK_PAGES;
 			return (1);
 		}
 		mtx_unlock(&kstack_cache_mtx);
 	}
 
 	/*
 	 * Allocate an object for the kstack.
 	 */
 	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
 	
 	/*
 	 * Get a kernel virtual address for this thread's kstack.
 	 */
 #if defined(__mips__)
 	/*
 	 * We need to align the kstack's mapped address to fit within
 	 * a single TLB entry.
 	 */
 	if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE,
 	    PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
 	    M_BESTFIT | M_NOWAIT, &ks)) {
 		ks = 0;
 	}
 #else
 	ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 #endif
 	if (ks == 0) {
 		printf("vm_thread_new: kstack allocation failed\n");
 		vm_object_deallocate(ksobj);
 		return (0);
 	}
 
 	atomic_add_int(&kstacks, 1);
 	/*
 	 * The guard pages occupy the low end of the range: remove any
 	 * mapping there and start the usable stack just above them.
 	 */
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
 	td->td_kstack_obj = ksobj;
 	td->td_kstack = ks;
 	/*
 	 * Knowing the number of pages allocated is useful when you
 	 * want to deallocate them.
 	 */
 	td->td_kstack_pages = pages;
 	/* 
 	 * For the length of the stack, link in a real page of ram for each
 	 * page of stack.
 	 */
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		/*
 		 * Get a kernel stack page.
 		 */
 		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 		ma[i] = m;
 		m->valid = VM_PAGE_BITS_ALL;
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	/* Map all of the grabbed pages at the stack address in one call. */
 	pmap_qenter(ks, ma, pages);
 	return (1);
 }
 
 /*
  * Tear down a kernel stack of "pages" pages mapped at "ks": remove its
  * mappings, unwire and free each page of "ksobj", call
  * vm_object_deallocate() on the object, and release the kernel virtual
  * address range, which includes the guard pages below "ks".
  */
 static void
 vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages)
 {
 	vm_page_t m;
 	int i;
 
 	atomic_add_int(&kstacks, -1);
 	pmap_qremove(ks, pages);
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_free(m);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	vm_object_deallocate(ksobj);
 	/* The allocation began KSTACK_GUARD_PAGES pages below "ks". */
 	kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 }
 
 /*
  * Dispose of a thread's kernel stack.
  */
 void
 vm_thread_dispose(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	struct kstack_cache_entry *ks_ce;
 	int pages;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	ks = td->td_kstack;
 	td->td_kstack = 0;
 	td->td_kstack_pages = 0;
 	if (pages == KSTACK_PAGES && kstacks <= kstack_cache_size) {
 		ks_ce = (struct kstack_cache_entry *)ks;
 		ks_ce->ksobj = ksobj;
 		mtx_lock(&kstack_cache_mtx);
 		ks_ce->next_ks_entry = kstack_cache;
 		kstack_cache = ks_ce;
 		mtx_unlock(&kstack_cache_mtx);
 		return;
 	}
 	vm_thread_stack_dispose(ksobj, ks, pages);
 }
 
 static void
 vm_thread_stack_lowmem(void *nulll)
 {
 	struct kstack_cache_entry *ks_ce, *ks_ce1;
 
 	mtx_lock(&kstack_cache_mtx);
 	ks_ce = kstack_cache;
 	kstack_cache = NULL;
 	mtx_unlock(&kstack_cache_mtx);
 
 	while (ks_ce != NULL) {
 		ks_ce1 = ks_ce;
 		ks_ce = ks_ce->next_ks_entry;
 
 		vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
 		    KSTACK_PAGES);
 	}
 }
 
 /*
  * Register vm_thread_stack_lowmem() as a handler for the vm_lowmem event.
  * The argument is unused.
  */
 static void
 kstack_cache_init(void *nulll)
 {
 
 	EVENTHANDLER_REGISTER(vm_lowmem, vm_thread_stack_lowmem, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 /* Run kstack_cache_init() via SYSINIT at SI_SUB_KTHREAD_INIT. */
 SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL);
 
 #ifndef NO_SWAPPING
 /*
  * Allow a thread's kernel stack to be paged out.
  *
  * The stack's mappings are removed, then every page of the stack object is
  * marked dirty and unwired.  Panics if a stack page is missing from the
  * object.
  */
 static void
 vm_thread_swapout(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t m;
 	int i, pages;
 
 	/* Machine-dependent hook runs before the stack is unmapped. */
 	cpu_thread_swapout(td);
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	pmap_qremove(td->td_kstack, pages);
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_swapout: kstack already missing?");
 		vm_page_dirty(m);
 		vm_page_lock(m);
-		vm_page_unwire(m, 0);
+		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 }
 
 /*
  * Bring the kernel stack for a specified thread back in.
  *
  * Every stack page is grabbed wired.  Pages that are not fully valid are
  * read through the object's pager in runs of consecutive invalid pages;
  * a pager failure is fatal.  Finally the pages are mapped at td_kstack and
  * the machine-dependent swapin hook is called.
  */
 static void
 vm_thread_swapin(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t ma[KSTACK_MAX_PAGES];
 	int i, j, k, pages, rv;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++)
 		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
 		    VM_ALLOC_WIRED);
 	for (i = 0; i < pages; i++) {
 		if (ma[i]->valid != VM_PAGE_BITS_ALL) {
 			vm_page_assert_xbusied(ma[i]);
 			vm_object_pip_add(ksobj, 1);
 			/*
 			 * Extend the run [i, j) up to, but not including,
 			 * the next fully valid page.
 			 */
 			for (j = i + 1; j < pages; j++) {
 				if (ma[j]->valid != VM_PAGE_BITS_ALL)
 					vm_page_assert_xbusied(ma[j]);
 				if (ma[j]->valid == VM_PAGE_BITS_ALL)
 					break;
 			}
 			rv = vm_pager_get_pages(ksobj, ma + i, j - i, 0);
 			if (rv != VM_PAGER_OK)
 	panic("vm_thread_swapin: cannot get kstack for proc: %d",
 				    td->td_proc->p_pid);
 			vm_object_pip_wakeup(ksobj);
 			/* Refresh the array entries for the run just read. */
 			for (k = i; k < j; k++)
 				ma[k] = vm_page_lookup(ksobj, k);
 			/*
 			 * Only ma[i] is unbusied here; the remaining pages
 			 * of the run are visited by later iterations of
 			 * the outer loop.
 			 */
 			vm_page_xunbusy(ma[i]);
 		} else if (vm_page_xbusied(ma[i]))
 			vm_page_xunbusy(ma[i]);
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(td->td_kstack, ma, pages);
 	cpu_thread_swapin(td);
 }
 #endif /* !NO_SWAPPING */
 
 /*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
  * machine-dependent layer to fill those in and make the new process
  * ready to run.  The new process is set up so that it returns directly
  * to user mode to avoid stack copying and relocation problems.
  *
  * "flags" carries the RF* bits examined below (RFPROC, RFMEM).  Returns 0
  * on success, or the error from vmspace_unshare() when no new process is
  * being created and the address space could not be unshared.
  */
 int
 vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
     struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
 	int error;
 
 	if ((flags & RFPROC) == 0) {
 		/*
 		 * Divorce the memory, if it is shared, essentially
 		 * this changes shared memory amongst threads, into
 		 * COW locally.
 		 */
 		if ((flags & RFMEM) == 0) {
 			if (p1->p_vmspace->vm_refcnt > 1) {
 				error = vmspace_unshare(p1);
 				if (error)
 					return (error);
 			}
 		}
 		cpu_fork(td, p2, td2, flags);
 		return (0);
 	}
 
 	/* Sharing the address space: take another reference on it. */
 	if (flags & RFMEM) {
 		p2->p_vmspace = p1->p_vmspace;
 		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
 	}
 
 	/* Wait while vm_page_count_severe() reports a page shortage. */
 	while (vm_page_count_severe()) {
 		VM_WAIT;
 	}
 
 	/* Not sharing: install the vmspace supplied by the caller. */
 	if ((flags & RFMEM) == 0) {
 		p2->p_vmspace = vm2;
 		if (p1->p_vmspace->vm_shm)
 			shmfork(p1, p2);
 	}
 
 	/*
 	 * cpu_fork will copy and update the pcb, set up the kernel stack,
 	 * and make the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
 	return (0);
 }
 
 /*
  * Called after process has been wait(2)'ed upon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  */
 void
 vm_waitproc(struct proc *p)
 {
 
 	vmspace_exitfree(p);		/* and clean-out the vmspace */
 }
 
 /*
  * Make sure process "p" is resident.  The caller must hold the process
  * lock (asserted below).  With NO_SWAPPING the function only panics if
  * the process is not marked P_INMEM.  Otherwise, if another thread is
  * already swapping the process in, wait for it to finish; if the process
  * is swapped out, swap in every thread's kernel stack with the process
  * lock dropped, then retake the lock before returning.
  */
 void
 faultin(struct proc *p)
 {
 #ifdef NO_SWAPPING
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_INMEM) == 0)
 		panic("faultin: proc swapped out with NO_SWAPPING!");
 #else /* !NO_SWAPPING */
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * If another process is swapping in this process,
 	 * just wait until it finishes.
 	 */
 	if (p->p_flag & P_SWAPPINGIN) {
 		while (p->p_flag & P_SWAPPINGIN)
 			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
 		return;
 	}
 	if ((p->p_flag & P_INMEM) == 0) {
 		/*
 		 * Don't let another thread swap process p out while we are
 		 * busy swapping it in.
 		 */
 		++p->p_lock;
 		p->p_flag |= P_SWAPPINGIN;
 		PROC_UNLOCK(p);
 
 		/*
 		 * We hold no lock here because the list of threads
 		 * can not change while all threads in the process are
 		 * swapped out.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td)
 			vm_thread_swapin(td);
 		PROC_LOCK(p);
 		swapclear(p);
 		p->p_swtick = ticks;
 
 		/* Release any threads sleeping in the wait loop above. */
 		wakeup(&p->p_flag);
 
 		/* Allow other threads to swap p out now. */
 		--p->p_lock;
 	}
 #endif /* NO_SWAPPING */
 }
 
 /*
  * This swapin algorithm attempts to swap-in processes only if there
  * is enough space for them.  Of course, if a process waits for a long
  * time, it will be swapped in anyway.
  *
  * Giant is held on entry.
  *
  * The function never returns: every path ends in "goto loop".
  */
 void
 swapper(void)
 {
 	struct proc *p;
 	struct thread *td;
 	struct proc *pp;
 	int slptime;
 	int swtime;
 	int ppri;
 	int pri;
 
 loop:
 	/* Do not swap anything in while vm_page_count_min() is true. */
 	if (vm_page_count_min()) {
 		VM_WAIT;
 		goto loop;
 	}
 
 	/* Pick the swapped-out process with the highest priority value. */
 	pp = NULL;
 	ppri = INT_MIN;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW ||
 		    p->p_flag & (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/* Seconds since p_swtick was last updated. */
 		swtime = (ticks - p->p_swtick) / hz;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			/*
 			 * An otherwise runnable thread of a process
 			 * swapped out has only the TDI_SWAPPED bit set.
 			 * 
 			 */
 			thread_lock(td);
 			if (td->td_inhibitors == TDI_SWAPPED) {
 				slptime = (ticks - td->td_slptick) / hz;
 				pri = swtime + slptime;
 				/*
 				 * The nice penalty is skipped for threads
 				 * flagged TDF_SWAPINREQ.
 				 */
 				if ((td->td_flags & TDF_SWAPINREQ) == 0)
 					pri -= p->p_nice * 8;
 				/*
 				 * if this thread is higher priority
 				 * and there is enough space, then select
 				 * this process instead of the previous
 				 * selection.
 				 */
 				if (pri > ppri) {
 					pp = p;
 					ppri = pri;
 				}
 			}
 			thread_unlock(td);
 		}
 		PROC_UNLOCK(p);
 	}
 	sx_sunlock(&allproc_lock);
 
 	/*
 	 * Nothing to do, back to sleep.
 	 */
 	if ((p = pp) == NULL) {
 		tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
 		goto loop;
 	}
 	PROC_LOCK(p);
 
 	/*
 	 * Another process may be bringing or may have already
 	 * brought this process in while we traverse all threads.
 	 * Or, this process may even be being swapped out again.
 	 */
 	if (p->p_flag & (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) {
 		PROC_UNLOCK(p);
 		goto loop;
 	}
 
 	/*
 	 * We would like to bring someone in. (only if there is space).
 	 * [What checks the space? ]
 	 */
 	faultin(p);
 	PROC_UNLOCK(p);
 	goto loop;
 }
 
 /*
  * Wake up any thread sleeping on &proc0, the channel swapper() sleeps on
  * when it finds nothing to swap in.
  */
 void
 kick_proc0(void)
 {
 
 	wakeup(&proc0);
 }
 
 #ifndef NO_SWAPPING
 
 /*
  * Swap_idle_threshold1 is the guaranteed swapped in time for a process.
  * swapout_procs() compares it against a thread's sleep time expressed in
  * seconds (ticks / hz).
  */
 static int swap_idle_threshold1 = 2;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
     &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
 
 /*
  * Swap_idle_threshold2 is the time that a process can be idle before
  * it will be swapped out, if idle swapping is enabled.  Like threshold1,
  * it is compared against sleep times in seconds.
  */
 static int swap_idle_threshold2 = 10;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
     &swap_idle_threshold2, 0, "Time before a process will be swapped out");
 
 /*
  * First, if any processes have been sleeping or stopped for at least
  * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
  * no such processes exist, then the longest-sleeping or stopped
  * process is swapped out.  Finally, and only as a last resort, if
  * there are no sleeping or stopped processes, the longest-resident
  * process is swapped out.
  *
  * "action" is a mask of VM_SWAP_NORMAL and VM_SWAP_IDLE.  After each
  * swapout attempt the scan of the process list restarts from the top,
  * because allproc_lock is dropped.
  */
 void
 swapout_procs(int action)
 {
 	struct proc *p;
 	struct thread *td;
 	int didswap = 0;
 
 retry:
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		struct vmspace *vm;
 		int minslptime = 100000;
 		int slptime;
 		
 		/*
 		 * Watch out for a process in
 		 * creation.  It may have no
 		 * address space or lock yet.
 		 */
 		if (p->p_state == PRS_NEW)
 			continue;
 		/*
 		 * An aio daemon switches its
 		 * address space while running.
 		 * Perform a quick check whether
 		 * a process has P_SYSTEM.
 		 */
 		if ((p->p_flag & P_SYSTEM) != 0)
 			continue;
 		/*
 		 * Do not swapout a process that
 		 * is waiting for VM data
 		 * structures as there is a possible
 		 * deadlock.  Test this first as
 		 * this may block.
 		 *
 		 * Lock the map until swapout
 		 * finishes, or a thread of this
 		 * process may attempt to alter
 		 * the map.
 		 */
 		vm = vmspace_acquire_ref(p);
 		if (vm == NULL)
 			continue;
 		if (!vm_map_trylock(&vm->vm_map))
 			goto nextproc1;
 
 		PROC_LOCK(p);
 		if (p->p_lock != 0 ||
 		    (p->p_flag & (P_STOPPED_SINGLE|P_TRACED|P_SYSTEM|P_WEXIT)
 		    ) != 0) {
 			goto nextproc;
 		}
 		/*
 		 * only aiod changes vmspace, however it will be
 		 * skipped because of the if statement above checking 
 		 * for P_SYSTEM
 		 */
 		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
 			goto nextproc;
 
 		switch (p->p_state) {
 		default:
 			/* Don't swap out processes in any sort
 			 * of 'special' state. */
 			break;
 
 		case PRS_NORMAL:
 			/*
 			 * do not swapout a realtime process
 			 * Check all the thread groups..
 			 */
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (PRI_IS_REALTIME(td->td_pri_class)) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 				slptime = (ticks - td->td_slptick) / hz;
 				/*
 				 * Guarantee swap_idle_threshold1
 				 * time in memory.
 				 */
 				if (slptime < swap_idle_threshold1) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 
 				/*
 				 * Do not swapout a process if it is
 				 * waiting on a critical event of some
 				 * kind or there is a thread whose
 				 * pageable memory may be accessed.
 				 *
 				 * This could be refined to support
 				 * swapping out a thread.
 				 */
 				if (!thread_safetoswapout(td)) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 				/*
 				 * If the system is under memory stress,
 				 * or if we are swapping
 				 * idle processes >= swap_idle_threshold2,
 				 * then swap the process out.
 				 */
 				if (((action & VM_SWAP_NORMAL) == 0) &&
 				    (((action & VM_SWAP_IDLE) == 0) ||
 				    (slptime < swap_idle_threshold2))) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 
 				if (minslptime > slptime)
 					minslptime = slptime;
 				thread_unlock(td);
 			}
 
 			/*
 			 * If the pageout daemon didn't free enough pages,
 			 * or if this process is idle and the system is
 			 * configured to swap proactively, swap it out.
 			 */
 			if ((action & VM_SWAP_NORMAL) ||
 				((action & VM_SWAP_IDLE) &&
 				 (minslptime > swap_idle_threshold2))) {
 				if (swapout(p) == 0)
 					didswap++;
 				PROC_UNLOCK(p);
 				vm_map_unlock(&vm->vm_map);
 				vmspace_free(vm);
 				sx_sunlock(&allproc_lock);
 				goto retry;
 			}
 		}
 nextproc:
 		PROC_UNLOCK(p);
 		vm_map_unlock(&vm->vm_map);
 nextproc1:
 		vmspace_free(vm);
 		continue;
 	}
 	sx_sunlock(&allproc_lock);
 	/*
 	 * If we swapped something out, and another process needed memory,
 	 * then wakeup the sched process.
 	 */
 	if (didswap)
 		wakeup(&proc0);
 }
 
 /*
  * Mark process "p" and all of its threads as resident.  The caller must
  * hold the process lock (asserted below).  Each thread gets TDF_INMEM set
  * and TDF_SWAPINREQ and the swapped inhibitor cleared, and is made
  * runnable if it can run.  The process then loses P_SWAPPINGIN and
  * P_SWAPPINGOUT and gains P_INMEM.
  */
 static void
 swapclear(struct proc *p)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		td->td_flags |= TDF_INMEM;
 		td->td_flags &= ~TDF_SWAPINREQ;
 		TD_CLR_SWAPPED(td);
 		if (TD_CAN_RUN(td))
 			if (setrunnable(td)) {
 #ifdef INVARIANTS
 				/*
 				 * XXX: We just cleared TDI_SWAPPED
 				 * above and set TDF_INMEM, so this
 				 * should never happen.
 				 */
 				panic("not waking up swapper");
 #endif
 			}
 		thread_unlock(td);
 	}
 	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
 	p->p_flag |= P_INMEM;
 }
 
 /*
  * Swap out process "p".  The caller must hold the process lock (asserted
  * below); it is dropped while the threads' kernel stacks are released for
  * paging and is held again on return.
  *
  * Returns 0 on success, or EBUSY when some thread is not safe to swap
  * out, in which case swapclear() has already put the process back into
  * the in-memory state.
  */
 static int
 swapout(struct proc *p)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 #if defined(SWAP_DEBUG)
 	printf("swapping out %d\n", p->p_pid);
 #endif
 
 	/*
 	 * The states of this process and its threads may have changed
 	 * by now.  Assuming that there is only one pageout daemon thread,
 	 * this process should still be in memory.
 	 */
 	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
 		("swapout: lost a swapout race?"));
 
 	/*
 	 * remember the process resident count
 	 */
 	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
 	/*
 	 * Check and mark all threads before we proceed.
 	 */
 	p->p_flag &= ~P_INMEM;
 	p->p_flag |= P_SWAPPINGOUT;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (!thread_safetoswapout(td)) {
 			thread_unlock(td);
 			swapclear(p);
 			return (EBUSY);
 		}
 		td->td_flags &= ~TDF_INMEM;
 		TD_SET_SWAPPED(td);
 		thread_unlock(td);
 	}
 	/* The swap is counted against the first thread's usage record. */
 	td = FIRST_THREAD_IN_PROC(p);
 	++td->td_ru.ru_nswap;
 	PROC_UNLOCK(p);
 
 	/*
 	 * This list is stable because all threads are now prevented from
 	 * running.  The list is only modified in the context of a running
 	 * thread in this process.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td)
 		vm_thread_swapout(td);
 
 	PROC_LOCK(p);
 	p->p_flag &= ~P_SWAPPINGOUT;
 	p->p_swtick = ticks;
 	return (0);
 }
 #endif /* !NO_SWAPPING */
Index: user/attilio/rm_vmobj_cache/sys/vm/vm_page.c
===================================================================
--- user/attilio/rm_vmobj_cache/sys/vm/vm_page.c	(revision 267236)
+++ user/attilio/rm_vmobj_cache/sys/vm/vm_page.c	(revision 267237)
@@ -1,3210 +1,3212 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
  */
 
 /*-
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *			GENERAL RULES ON VM_PAGE MANIPULATION
  *
  *	- A page queue lock is required when adding or removing a page from a
  *	  page queue regardless of other locks or the busy state of a page.
  *
  *		* In general, no thread besides the page daemon can acquire or
  *		  hold more than one page queue lock at a time.
  *
  *		* The page daemon can acquire and hold any pair of page queue
  *		  locks in any order.
  *
  *	- The object lock is required when inserting or removing
  *	  pages from an object (vm_page_insert() or vm_page_remove()).
  *
  */
 
 /*
  *	Resident memory management module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <machine/md_var.h>
 
 /*
  *	Associated with page of user-allocatable memory is a
  *	page structure.
  */
 
 struct vm_domain vm_dom[MAXMEMDOM];
 struct mtx_padalign vm_page_queue_free_mtx;
 
 struct mtx_padalign pa_lock[PA_LOCK_COUNT];
 
 vm_page_t vm_page_array;
 long vm_page_array_size;
 long first_page;
 int vm_page_zero_count;
 
 static int boot_pages = UMA_BOOT_PAGES;
 TUNABLE_INT("vm.boot_pages", &boot_pages);
 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RD, &boot_pages, 0,
 	"number of pages allocated for bootstrapping the VM system");
 
 static int pa_tryrelock_restart;
 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 
 static uma_zone_t fakepg_zone;
 
 static struct vnode *vm_page_alloc_init(vm_page_t m);
 static void vm_page_cache_turn_free(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
-static void vm_page_enqueue(int queue, vm_page_t m);
+static void vm_page_enqueue(uint8_t queue, vm_page_t m);
 static void vm_page_init_fakepg(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
     vm_page_t mpred);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 
 /*
  * SYSINIT hook that creates "fakepg_zone", the UMA zone from which
  * vm_page_getfake() allocates fictitious page structures.  The "dummy"
  * argument is unused.
  */
 static void
 vm_page_init_fakepg(void *dummy)
 {
 
 	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM); 
 }
 
 /* Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K. */
 #if PAGE_SIZE == 32768
 #ifdef CTASSERT
 CTASSERT(sizeof(u_long) >= 8);
 #endif
 #endif
 
 /*
  * Try to acquire a physical address lock while a pmap is locked.  If we
  * fail to trylock we unlock and lock the pmap directly and cache the
  * locked pa in *locked.  The caller should then restart their loop in case
  * the virtual to physical mapping has changed.
  *
  * On entry *locked is either 0 or a physical address whose lock the
  * caller owns; on return *locked is "pa" and the lock covering "pa" is
  * owned.
  *
  * Returns 0 if the lock was obtained without releasing the pmap lock, or
  * EAGAIN if the pmap lock had to be dropped and reacquired.
  */
 int
 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
 {
 	vm_paddr_t lockpa;
 
 	lockpa = *locked;
 	*locked = pa;
 	if (lockpa) {
 		PA_LOCK_ASSERT(lockpa, MA_OWNED);
 		/* Both addresses map to the same lock: nothing to do. */
 		if (PA_LOCKPTR(pa) == PA_LOCKPTR(lockpa))
 			return (0);
 		PA_UNLOCK(lockpa);
 	}
 	if (PA_TRYLOCK(pa))
 		return (0);
 	/*
 	 * Contended: release the pmap lock before blocking on the pa lock,
 	 * then retake the pmap lock, and count the restart.
 	 */
 	PMAP_UNLOCK(pmap);
 	atomic_add_int(&pa_tryrelock_restart, 1);
 	PA_LOCK(pa);
 	PMAP_LOCK(pmap);
 	return (EAGAIN);
 }
 
 /*
  *	vm_set_page_size:
  *
  *	Establish vm_cnt.v_page_size, defaulting it to PAGE_SIZE when it
  *	has not been set yet, and panic unless the result is a power of
  *	two.  Must be called before any use of page-size dependent
  *	functions.
  */
 void
 vm_set_page_size(void)
 {
 
 	if (vm_cnt.v_page_size == 0)
 		vm_cnt.v_page_size = PAGE_SIZE;
 	/* A power of two shares no bits with its predecessor. */
 	if ((vm_cnt.v_page_size & (vm_cnt.v_page_size - 1)) == 0)
 		return;
 	panic("vm_set_page_size: page size not a power of two");
 }
 
 /*
  *	vm_page_blacklist_lookup:
  *
  *	See if a physical address in this page has been listed
  *	in the blacklist tunable.  Entries in the tunable are
  *	separated by spaces or commas.  Empty entries (leading or
  *	consecutive separators) are ignored.  If an invalid integer is
  *	encountered then the rest of the string is skipped.
  *
  *	"list" is the NUL-terminated tunable string and "pa" the page
  *	address to look for.  Returns 1 if some entry, truncated to a
  *	page boundary, equals "pa", and 0 otherwise.
  */
 static int
 vm_page_blacklist_lookup(char *list, vm_paddr_t pa)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
 	for (pos = list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
 		if (*cp != '\0') {
 			/* Anything but a separator ends the scan. */
 			if (*cp != ' ' && *cp != ',')
 				break;
 			/*
 			 * No digits were consumed before the separator, so
 			 * this entry is empty.  The test must be made
 			 * before stepping over the separator; otherwise the
 			 * empty entry is treated as address 0.
 			 */
 			if (cp == pos) {
 				cp++;
 				continue;
 			}
 			cp++;
 		}
 		if (pa == trunc_page(bad))
 			return (1);
 	}
 	return (0);
 }
 
 static void
 vm_page_domain_init(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
 	int i;
 
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
 	    "vm inactive pagequeue";
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
 	    &vm_cnt.v_inactive_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
 	    "vm active pagequeue";
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
 	    &vm_cnt.v_active_count;
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
 	vmd->vmd_oom = FALSE;
 	vmd->vmd_pass = 0;
 	for (i = 0; i < PQ_COUNT; i++) {
 		pq = &vmd->vmd_pagequeues[i];
 		TAILQ_INIT(&pq->pq_pl);
 		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
 		    MTX_DEF | MTX_DUPOK);
 	}
 }
 
 /*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.
  *
  *	Carves the UMA bootstrap pages, the minidump bitmap (on some
  *	platforms), and the vm_page array out of the top of the largest
  *	phys_avail[] range, initializes the page, free-queue and per-domain
  *	locks, and hands every remaining page that is not blacklisted to
  *	the physical memory allocator.
  *
  *	"vaddr" is the kernel virtual address at which mappings may be
  *	created; the (page rounded, possibly advanced) value is returned.
  */
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
 	vm_offset_t mapped;
 	vm_paddr_t page_range;
 	vm_paddr_t new_end;
 	int i;
 	vm_paddr_t pa;
 	vm_paddr_t last_pa;
 	char *list;
 
 	/* the biggest memory array is the second group of pages */
 	vm_paddr_t end;
 	vm_paddr_t biggestsize;
 	vm_paddr_t low_water, high_water;
 	int biggestone;
 
 	biggestsize = 0;
 	biggestone = 0;
 	vaddr = round_page(vaddr);
 
 	/* Shrink every range to whole pages: round starts up, ends down. */
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		phys_avail[i] = round_page(phys_avail[i]);
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
 
 	low_water = phys_avail[0];
 	high_water = phys_avail[1];
 
 	/*
 	 * Find the largest range and the lowest and highest addresses
 	 * covered by any range.
 	 */
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		vm_paddr_t size = phys_avail[i + 1] - phys_avail[i];
 
 		if (size > biggestsize) {
 			biggestone = i;
 			biggestsize = size;
 		}
 		if (phys_avail[i] < low_water)
 			low_water = phys_avail[i];
 		if (phys_avail[i + 1] > high_water)
 			high_water = phys_avail[i + 1];
 	}
 
 #ifdef XEN
 	low_water = 0;
 #endif	
 
 	/* Boot-time allocations below are taken from the end of this range. */
 	end = phys_avail[biggestone+1];
 
 	/*
 	 * Initialize the page and queue locks.
 	 */
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
 	for (i = 0; i < vm_ndomains; i++)
 		vm_page_domain_init(&vm_dom[i]);
 
 	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
 	 */
 	new_end = end - (boot_pages * UMA_SLAB_SIZE);
 	new_end = trunc_page(new_end);
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
 #if defined(__amd64__) || defined(__i386__) || defined(__arm__) || \
     defined(__mips__)
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
 	 *
 	 * The amd64 port needs this to indicate which direct map pages
 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 	 *
 	 * However, i386 still needs this workspace internally within the
 	 * minidump code.  In theory, they are not needed on i386, but are
 	 * included should the sf_buf code decide to use them.
 	 */
 	last_pa = 0;
 	for (i = 0; dump_avail[i + 1] != 0; i += 2)
 		if (dump_avail[i + 1] > last_pa)
 			last_pa = dump_avail[i + 1];
 	page_range = last_pa / PAGE_SIZE;
 	/* One bit per page up to last_pa, rounded up to whole pages. */
 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
 	new_end -= vm_page_dump_size;
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
 #endif
 #ifdef __amd64__
 	/*
 	 * Request that the physical pages underlying the message buffer be
 	 * included in a crash dump.  Since the message buffer is accessed
 	 * through the direct map, they are not automatically included.
 	 */
 	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
 	last_pa = pa + round_page(msgbufsize);
 	while (pa < last_pa) {
 		dump_add_page(pa);
 		pa += PAGE_SIZE;
 	}
 #endif
 	/*
 	 * Compute the number of pages of memory that will be available for
 	 * use (taking into account the overhead of a page structure per
 	 * page).
 	 */
 	first_page = low_water / PAGE_SIZE;
 #ifdef VM_PHYSSEG_SPARSE
 	/* Sparse: count only the pages inside the available ranges. */
 	page_range = 0;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		page_range += atop(phys_avail[i + 1] - phys_avail[i]);
 #elif defined(VM_PHYSSEG_DENSE)
 	/* Dense: cover the whole span from low_water to high_water. */
 	page_range = high_water / PAGE_SIZE - first_page;
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 	end = new_end;
 
 	/*
 	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
 	 */
 	vaddr += PAGE_SIZE;
 
 	/*
 	 * Initialize the mem entry structures now, and put them in the free
 	 * queue.
 	 */
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	vm_page_array = (vm_page_t) mapped;
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Allocate memory for the reservation management system's data
 	 * structures.
 	 */
 	new_end = vm_reserv_startup(&vaddr, new_end, high_water);
 #endif
 #if defined(__amd64__) || defined(__mips__)
 	/*
 	 * pmap_map on amd64 and mips can come out of the direct-map, not kvm
 	 * like i386, so the pages must be tracked for a crashdump to include
 	 * this data.  This includes the vm_page_array and the early UMA
 	 * bootstrap pages.
 	 */
 	for (pa = new_end; pa < phys_avail[biggestone + 1]; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif	
 	/* Everything from new_end upward is now in use; trim the range. */
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
 	 * Clear all of the page structures
 	 */
 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
 	for (i = 0; i < page_range; i++)
 		vm_page_array[i].order = VM_NFREEORDER;
 	vm_page_array_size = page_range;
 
 	/*
 	 * Initialize the physical memory allocator.
 	 */
 	vm_phys_init();
 
 	/*
 	 * Add every available physical page that is not blacklisted to
 	 * the free lists.
 	 */
 	vm_cnt.v_page_count = 0;
 	vm_cnt.v_free_count = 0;
 	list = getenv("vm.blacklist");
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		pa = phys_avail[i];
 		last_pa = phys_avail[i + 1];
 		while (pa < last_pa) {
 			if (list != NULL &&
 			    vm_page_blacklist_lookup(list, pa))
 				printf("Skipping page with pa 0x%jx\n",
 				    (uintmax_t)pa);
 			else
 				vm_phys_add_page(pa);
 			pa += PAGE_SIZE;
 		}
 	}
 	freeenv(list);
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Initialize the reservation management system.
 	 */
 	vm_reserv_init();
 #endif
 	return (vaddr);
 }
 
 /*
  *	vm_page_reference:
  *
  *	Mark the page as referenced by setting PGA_REFERENCED through
  *	vm_page_aflag_set().
  */
 void
 vm_page_reference(vm_page_t m)
 {
 
 	vm_page_aflag_set(m, PGA_REFERENCED);
 }
 
 /*
  *	vm_page_busy_downgrade:
  *
  *	Downgrade an exclusive busy page into a single shared busy page.
  *	The page must be exclusive busied.  The waiters bit, if set, is
  *	carried over to the new state.
  */
 void
 vm_page_busy_downgrade(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_assert_xbusied(m);
 
 	for (;;) {
 		/*
 		 * Only the waiters bit may differ between attempts; sample
 		 * it and retry the compare-and-set until it succeeds.
 		 */
 		x = m->busy_lock;
 		x &= VPB_BIT_WAITERS;
 		if (atomic_cmpset_rel_int(&m->busy_lock,
 		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1) | x))
 			break;
 	}
 }
 
 /*
  *	vm_page_sbusied:
  *
  *	Report whether the page is shared busied: 1 if the busy word has
  *	the shared bit set and is not the unbusied value, 0 otherwise.
  */
 int
 vm_page_sbusied(vm_page_t m)
 {
 	u_int state;
 
 	state = m->busy_lock;
 	if (state == VPB_UNBUSIED)
 		return (0);
 	return ((state & VPB_BIT_SHARED) != 0);
 }
 
 /*
  *	vm_page_sunbusy:
  *
  *	Shared unbusy a page.
  *
  *	Drops one shared busy reference.  When the last sharer leaves and
  *	the waiters bit is set, the page is unbusied and the waiters are
  *	woken up under the page lock.  The page must be shared busied.
  */
 void
 vm_page_sunbusy(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_assert_sbusied(m);
 
 	for (;;) {
 		x = m->busy_lock;
 		/* Other sharers remain: just decrement the sharer count. */
 		if (VPB_SHARERS(x) > 1) {
 			if (atomic_cmpset_int(&m->busy_lock, x,
 			    x - VPB_ONE_SHARER))
 				break;
 			continue;
 		}
 		/* Last sharer and nobody waiting: go straight to unbusied. */
 		if ((x & VPB_BIT_WAITERS) == 0) {
 			KASSERT(x == VPB_SHARERS_WORD(1),
 			    ("vm_page_sunbusy: invalid lock state"));
 			if (atomic_cmpset_int(&m->busy_lock,
 			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
 				break;
 			continue;
 		}
 		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
 		    ("vm_page_sunbusy: invalid lock state for waiters"));
 
 		/*
 		 * Last sharer with waiters: the state change and the
 		 * wakeup are both done while holding the page lock.
 		 */
 		vm_page_lock(m);
 		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
 			vm_page_unlock(m);
 			continue;
 		}
 		wakeup(m);
 		vm_page_unlock(m);
 		break;
 	}
 }
 
 /*
  *	vm_page_busy_sleep:
  *
  *	Sleep and release the page lock, using the page pointer as wchan.
  *	This is used to implement the hard-path of busying mechanism.
  *
  *	The given page must be locked.  The page lock is released on every
  *	path: explicitly when no sleep is needed, otherwise by msleep()
  *	through PDROP.  "wmesg" is passed to msleep() as the wait message.
  */
 void
 vm_page_busy_sleep(vm_page_t m, const char *wmesg)
 {
 	u_int x;
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	x = m->busy_lock;
 	/* Not busy any more: nothing to wait for. */
 	if (x == VPB_UNBUSIED) {
 		vm_page_unlock(m);
 		return;
 	}
 	/*
 	 * Make sure the waiters bit is set before sleeping.  If the busy
 	 * word changed under us, return without sleeping.
 	 */
 	if ((x & VPB_BIT_WAITERS) == 0 &&
 	    !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS)) {
 		vm_page_unlock(m);
 		return;
 	}
 	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
 }
 
 /*
  *	vm_page_trysbusy:
  *
  *	Attempt to take a shared busy reference on the page without
  *	sleeping.  Returns 1 when the reference was obtained and 0 when
  *	the busy word does not have the shared bit set.
  */
 int
 vm_page_trysbusy(vm_page_t m)
 {
 	u_int cur;
 
 	do {
 		cur = m->busy_lock;
 		if ((cur & VPB_BIT_SHARED) == 0)
 			return (0);
 		/* Retry if the busy word changed before we could bump it. */
 	} while (!atomic_cmpset_acq_int(&m->busy_lock, cur,
 	    cur + VPB_ONE_SHARER));
 	return (1);
 }
 
 /*
  *	vm_page_xunbusy_hard:
  *
  *	Called after the first try the exclusive unbusy of a page failed.
  *	It is assumed that the waiters bit is on.
  *
  *	Takes the page lock, stores the unbusied state and wakes up the
  *	threads sleeping on the page.  The page must be exclusive busied.
  */
 void
 vm_page_xunbusy_hard(vm_page_t m)
 {
 
 	vm_page_assert_xbusied(m);
 
 	vm_page_lock(m);
 	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
 	wakeup(m);
 	vm_page_unlock(m);
 }
 
 /*
  *	vm_page_flash:
  *
  *	If the waiters bit is set in the busy word, clear it and wake up
  *	the threads sleeping on the page; otherwise do nothing.  The
  *	ownership bits are left untouched.
  *
  *	The given page must be locked.
  */
 void
 vm_page_flash(vm_page_t m)
 {
 	u_int cur;
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	do {
 		cur = m->busy_lock;
 		if ((cur & VPB_BIT_WAITERS) == 0)
 			return;
 	} while (!atomic_cmpset_int(&m->busy_lock, cur,
 	    cur & (~VPB_BIT_WAITERS)));
 	wakeup(m);
 }
 
 /*
  * Take a hold on the page by incrementing its hold count.  Holding keeps
  * the page from being freed by the page daemon, with much the same effect
  * as wiring but at much lower overhead; it should be used only for *very*
  * temporary holding.
  *
  * The page must be locked.
  */
 void
 vm_page_hold(vm_page_t mem)
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
 	++mem->hold_count;
 }
 
 /*
  * Drop one hold on the page.  When the hold count reaches zero and the
  * page is flagged PG_UNHOLDFREE, the page is released through
  * vm_page_free_toq().
  *
  * The page must be locked.
  */
 void
 vm_page_unhold(vm_page_t mem)
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
 	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
 	--mem->hold_count;
 	if (mem->hold_count != 0)
 		return;
 	if ((mem->flags & PG_UNHOLDFREE) != 0)
 		vm_page_free_toq(mem);
 }
 
 /*
  *	vm_page_unhold_pages:
  *
  *	Call vm_page_unhold() on each of the "count" pages in the array
  *	"ma", taking the page lock each one needs.
  */
 void
 vm_page_unhold_pages(vm_page_t *ma, int count)
 {
 	struct mtx *held, *wanted;
 	int i;
 
 	held = NULL;
 	for (i = 0; i != count; i++) {
 		/*
 		 * Switch locks only when this page is covered by a
 		 * different mutex than the one currently held.
 		 */
 		wanted = vm_page_lockptr(ma[i]);
 		if (wanted != held) {
 			if (held != NULL)
 				mtx_unlock(held);
 			held = wanted;
 			mtx_lock(held);
 		}
 		vm_page_unhold(ma[i]);
 	}
 	if (held != NULL)
 		mtx_unlock(held);
 }
 
 /*
  * Translate the physical address "pa" into its vm_page.  Addresses not
  * found through the regular lookup are passed on to
  * vm_phys_fictitious_to_vm_page().
  */
 vm_page_t
 PHYS_TO_VM_PAGE(vm_paddr_t pa)
 {
 #ifdef VM_PHYSSEG_SPARSE
 	vm_page_t m;
 
 	m = vm_phys_paddr_to_vm_page(pa);
 	if (m != NULL)
 		return (m);
 	return (vm_phys_fictitious_to_vm_page(pa));
 #elif defined(VM_PHYSSEG_DENSE)
 	long pi;
 
 	/* Dense layout: index vm_page_array by page number. */
 	pi = atop(pa);
 	if (pi < first_page || (pi - first_page) >= vm_page_array_size)
 		return (vm_phys_fictitious_to_vm_page(pa));
 	return (&vm_page_array[pi - first_page]);
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 }
 
 /*
  *	vm_page_getfake:
  *
  *	Allocate a zeroed fictitious page from fakepg_zone and initialize
  *	it with the specified physical address and memory attribute.  The
  *	memory attribute is the only machine-dependent aspect of a
  *	fictitious page that must be initialized.
  */
 vm_page_t
 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
 {
 	vm_page_t fake;
 
 	fake = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
 	vm_page_initfake(fake, paddr, memattr);
 	return (fake);
 }
 
 /*
  * Initialize "m" as a fictitious page at "paddr" with memory attribute
  * "memattr".  A page that is already marked PG_FICTITIOUS only has its
  * memory attribute updated.
  */
 void
 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		m->phys_addr = paddr;
 		m->queue = PQ_NONE;
 		/* Fictitious pages don't use "segind". */
 		m->flags = PG_FICTITIOUS;
 		/* Fictitious pages don't use "order" or "pool". */
 		m->oflags = VPO_UNMANAGED;
 		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 		m->wire_count = 1;
 		pmap_page_init(m);
 	}
 	/*
 	 * For an already initialized page the memattr might have changed
 	 * since the previous initialization, so it is always (re)applied.
 	 */
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_putfake:
  *
  *	Release a fictitious page back to fakepg_zone.  The page must be
  *	both unmanaged and fictitious.
  */
 void
 vm_page_putfake(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_putfake: bad page %p", m));
 	uma_zfree(fakepg_zone, m);
 }
 
 /*
  *	vm_page_updatefake:
  *
  *	Update the given fictitious page to the specified physical address and
  *	memory attribute.  The page must already be marked PG_FICTITIOUS.
  */
 void
 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_updatefake: bad page %p", m));
 	m->phys_addr = paddr;
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_free:
  *
  *	Free a page.  PG_ZERO is cleared first, so the page is not
  *	reported as zero-filled when it is handed to vm_page_free_toq().
  */
 void
 vm_page_free(vm_page_t m)
 {
 
 	m->flags &= ~PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  *	vm_page_free_zero:
  *
  *	Free a page, marking it as zero-filled: PG_ZERO is set before the
  *	page is handed to vm_page_free_toq().
  */
 void
 vm_page_free_zero(vm_page_t m)
 {
 
 	m->flags |= PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  * Finish with a page from the VOP_GETPAGES() array that is not the
  * requested page: a page with no valid bits is freed, any other page is
  * queued and then exclusive unbusied.
  */
 void
 vm_page_readahead_finish(vm_page_t m)
 {
 
 	if (m->valid == 0) {
 		/*
 		 * Free the completely invalid page.  Such a page state
 		 * occurs after a short read that did not cover the page
 		 * at all, or when a read error happened.
 		 */
 		vm_page_lock(m);
 		vm_page_free(m);
 		vm_page_unlock(m);
 		return;
 	}
 
 	/*
 	 * Since the page is not the requested page, whether it should be
 	 * activated or deactivated is not obvious.  Empirical results
 	 * have shown that deactivating the page is usually the best
 	 * choice, unless the page is wanted by another thread.
 	 */
 	vm_page_lock(m);
 	if ((m->busy_lock & VPB_BIT_WAITERS) == 0)
 		vm_page_deactivate(m);
 	else
 		vm_page_activate(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 }
 
 /*
  *	vm_page_sleep_if_busy:
  *
  *	If the page is busied, release the object lock, sleep through
  *	vm_page_busy_sleep() using "msg" as the wait message, reacquire
  *	the object lock and return TRUE.  Otherwise return FALSE without
  *	sleeping.
  *
  *	The given page must be unlocked and object containing it must
  *	be locked.
  */
 int
 vm_page_sleep_if_busy(vm_page_t m, const char *msg)
 {
 	vm_object_t saved_obj;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	if (!vm_page_busied(m))
 		return (FALSE);
 
 	/*
 	 * The page-specific object must be cached because page
 	 * identity can change during the sleep, causing the
 	 * re-lock of a different object.
 	 * It is assumed that a reference to the object is already
 	 * held by the callers.
 	 */
 	saved_obj = m->object;
 	vm_page_lock(m);
 	VM_OBJECT_WUNLOCK(saved_obj);
 	vm_page_busy_sleep(m, msg);
 	VM_OBJECT_WLOCK(saved_obj);
 	return (TRUE);
 }
 
 /*
  *	vm_page_dirty_KBI:		[ internal use only ]
  *
  *	Set all bits in the page's dirty field.
  *
  *	The page must not be cached and must be fully valid; both
  *	conditions are asserted.
  *
  *	The object containing the specified page must be locked if the
  *	call is made from the machine-independent layer.
  *
  *	See vm_page_clear_dirty_mask().
  *
  *	This function should only be called by vm_page_dirty().
  */
 void
 vm_page_dirty_KBI(vm_page_t m)
 {
 
 	/* These assertions refer to this operation by its public name. */
 	KASSERT((m->flags & PG_CACHED) == 0,
 	    ("vm_page_dirty: page in cache!"));
 	KASSERT(m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_page_dirty: page is invalid!"));
 	m->dirty = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_insert:		[ internal use only ]
  *
  *	Insert the page "m" into "object" at offset "pindex".  The
  *	predecessor is looked up in the object's radix trie and the work
  *	is delegated to vm_page_insert_after(), whose return value is
  *	passed back.
  *
  *	The object must be locked.
  */
 int
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	return (vm_page_insert_after(m, object, pindex,
 	    vm_radix_lookup_le(&object->rtree, pindex)));
 }
 
 /*
  *	vm_page_insert_after:
  *
  *	Insert the page "m" into the specified object at offset "pindex".
  *
  *	The page "mpred", if not NULL, must immediately precede the offset
  *	"pindex" within the specified object.
  *
  *	Returns 0 on success.  If the radix trie insertion fails, the
  *	page's object and pindex fields are restored and 1 is returned.
  *
  *	The object must be locked.
  */
 static int
 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mpred)
 {
 	vm_object_t old_object;
 	vm_pindex_t old_pindex;
 	vm_page_t msucc;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(m->object == NULL,
 	    ("vm_page_insert_after: page already inserted"));
 	if (mpred == NULL)
 		msucc = TAILQ_FIRST(&object->memq);
 	else {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_after: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < pindex,
 		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 		msucc = TAILQ_NEXT(mpred, listq);
 	}
 	if (msucc != NULL)
 		KASSERT(msucc->pindex > pindex,
 		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
 
 	/*
 	 * Remember the previous identity, then record the new
 	 * object/offset pair in the page.
 	 */
 	old_object = m->object;
 	old_pindex = m->pindex;
 	m->object = object;
 	m->pindex = pindex;
 
 	/*
 	 * Hook the page into the radix trie; on failure undo the identity
 	 * change.  On success finish linking it into the object.
 	 */
 	if (vm_radix_insert(&object->rtree, m) != 0) {
 		m->object = old_object;
 		m->pindex = old_pindex;
 		return (1);
 	}
 	vm_page_insert_radixdone(m, object, mpred);
 	return (0);
 }
 
 /*
  *	vm_page_insert_radixdone:
  *
  *	Complete page "m" insertion into the specified object after the
  *	radix trie hooking: link the page into the object's page list,
  *	account for it in the resident page count, hold the vnode when
  *	this is the first resident page of a vnode object, and mark the
  *	object as possibly dirty if the page is write mapped.
  *
  *	The page "mpred", if not NULL, must precede the offset "m->pindex"
  *	within the specified object.
  *
  *	The object must be locked.
  */
 static void
 vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object != NULL && m->object == object,
 	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
 
 	/*
 	 * Link the page after its predecessor, or at the head of the
 	 * object's list when there is none.
 	 */
 	if (mpred != NULL) {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_radixdone: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < m->pindex,
 		    ("vm_page_insert_radixdone: mpred doesn't precede pindex"));
 		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
 	} else
 		TAILQ_INSERT_HEAD(&object->memq, m, listq);
 
 	/*
 	 * Show that the object has one more resident page.
 	 */
 	object->resident_page_count++;
 
 	/*
 	 * Hold the vnode until the last page is released.
 	 */
 	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 		vhold(object->handle);
 
 	/*
 	 * Since we are inserting a new and possibly dirty page,
 	 * update the object's OBJ_MIGHTBEDIRTY flag.
 	 */
 	if (pmap_page_is_write_mapped(m))
 		vm_object_set_writeable_dirty(object);
 }
 
 /*
  *	vm_page_remove:
  *
  *	Removes the given mem entry from the object/offset-page
  *	table and the object page list, but do not invalidate/terminate
  *	the backing store.
  *
  *	A page that does not belong to any object is left untouched.  An
  *	exclusive busied page has its waiters woken up and is unbusied
  *	before it is removed.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
 void
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
 	boolean_t lockacq;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_lock_assert(m, MA_OWNED);
 	if ((object = m->object) == NULL)
 		return;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (vm_page_xbusied(m)) {
 		/*
 		 * vm_page_flash() requires the page lock.  An unmanaged
 		 * page may arrive here unlocked, so take the lock for the
 		 * duration of the wakeup if it is not already owned.
 		 */
 		lockacq = FALSE;
 		if ((m->oflags & VPO_UNMANAGED) != 0 &&
 		    !mtx_owned(vm_page_lockptr(m))) {
 			lockacq = TRUE;
 			vm_page_lock(m);
 		}
 		vm_page_flash(m);
 		atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
 		if (lockacq)
 			vm_page_unlock(m);
 	}
 
 	/*
 	 * Now remove from the object's list of backed pages.
 	 */
 	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/*
 	 * And show that the object has one fewer resident page.
 	 */
 	object->resident_page_count--;
 
 	/*
 	 * The vnode may now be recycled.
 	 */
 	if (object->resident_page_count == 0 && object->type == OBJT_VNODE)
 		vdrop(object->handle);
 
 	m->object = NULL;
 }
 
 /*
  *	vm_page_lookup:
  *
  *	Returns the page associated with the object/offset
  *	pair specified; if none is found, NULL is returned.
  *	The lookup is done in the object's radix trie.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	return (vm_radix_lookup(&object->rtree, pindex));
 }
 
 /*
  *	vm_page_find_least:
  *
  *	Return the resident page of the object that has the smallest
  *	pindex greater than or equal to the parameter pindex, or NULL
  *	when there is no such page.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t first;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	first = TAILQ_FIRST(&object->memq);
 	/* An empty list or a qualifying first page needs no trie lookup. */
 	if (first == NULL || first->pindex >= pindex)
 		return (first);
 	return (vm_radix_lookup_ge(&object->rtree, pindex));
 }
 
 /*
  * Return the page that follows "m" in its object's page list, provided its
  * pindex is exactly one greater than that of "m"; otherwise return NULL.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_next(vm_page_t m)
 {
 	vm_page_t succ;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	succ = TAILQ_NEXT(m, listq);
 	if (succ == NULL || succ->pindex == m->pindex + 1)
 		return (succ);
 	return (NULL);
 }
 
 /*
  * Return the page that precedes "m" in its object's page list, provided
  * its pindex is exactly one less than that of "m"; otherwise return NULL.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_prev(vm_page_t m)
 {
 	vm_page_t pred;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	pred = TAILQ_PREV(m, pglist, listq);
 	if (pred == NULL || pred->pindex == m->pindex - 1)
 		return (pred);
 	return (NULL);
 }
 
 /*
  * Uses the page mnew as a replacement for an existing page at index
  * pindex which must be already present in the object.
  *
  * The old page is detached from the object, exclusive unbusied and
  * returned to the caller.  The object's resident page count is left
  * unchanged.
  *
  * The existing page must not be on a paging queue.  The object must be
  * locked.
  */
 vm_page_t
 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t mold, mpred;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * This function mostly follows vm_page_insert() and
 	 * vm_page_remove() without the radix, object count and vnode
 	 * dance.  Double check such functions for more comments.
 	 */
 	mpred = vm_radix_lookup(&object->rtree, pindex);
 	KASSERT(mpred != NULL,
 	    ("vm_page_replace: replacing page not present with pindex"));
 	/* Step back from the page being replaced to its list predecessor. */
 	mpred = TAILQ_PREV(mpred, respgs, listq);
 	if (mpred != NULL)
 		KASSERT(mpred->pindex < pindex,
 		    ("vm_page_replace: mpred doesn't precede pindex"));
 
 	mnew->object = object;
 	mnew->pindex = pindex;
 	mold = vm_radix_replace(&object->rtree, mnew);
 	KASSERT(mold->queue == PQ_NONE,
 	    ("vm_page_replace: mold is on a paging queue"));
 
 	/* Detach the old page from the resident tailq. */
 	TAILQ_REMOVE(&object->memq, mold, listq);
 
 	mold->object = NULL;
 	vm_page_xunbusy(mold);
 
 	/* Insert the new page in the resident tailq. */
 	if (mpred != NULL)
 		TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
 	else
 		TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
 	if (pmap_page_is_write_mapped(mnew))
 		vm_object_set_writeable_dirty(object);
 	return (mold);
 }
 
 /*
  *	vm_page_rename:
  *
  *	Move the given memory entry from its
  *	current object to the specified target object/offset.
  *
  *	Returns 0 on success.  If the page cannot be inserted into the new
  *	object's radix trie, the page is left in its current object and 1
  *	is returned.
  *
  *	Note: swap associated with the page must be invalidated by the move.  We
  *	      have to do this for several reasons:  (1) we aren't freeing the
  *	      page, (2) we are dirtying the page, (3) the VM system is probably
  *	      moving the page from object A to B, and will then later move
  *	      the backing store from A to B and we can't have a conflict.
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
  *	      swap.  If the page is on the cache, we have to deactivate it
  *	      or vm_page_dirty() will panic.  Dirty pages are not allowed
  *	      on the cache.
  *
  *	The objects must be locked.
  */
 int
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
 	vm_page_t mpred;
 	vm_pindex_t opidx;
 
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 
 	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
 	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
 	    ("vm_page_rename: pindex already renamed"));
 
 	/*
 	 * This is a custom version of vm_page_insert(): the insertion into
 	 * the new object's radix trie, the only step that can fail, is
 	 * done first, while the page is still linked into its old object.
 	 * The page temporarily carries the new pindex for that insertion.
 	 */
 	opidx = m->pindex;
 	m->pindex = new_pindex;
 	if (vm_radix_insert(&new_object->rtree, m)) {
 		m->pindex = opidx;
 		return (1);
 	}
 
 	/*
 	 * The operation cannot fail anymore.  The removal must happen before
 	 * the listq iterator is tainted.  The old pindex is restored first
 	 * because vm_page_remove() uses it to find the page in the old
 	 * object's radix trie.
 	 */
 	m->pindex = opidx;
 	vm_page_lock(m);
 	vm_page_remove(m);
 
 	/* Return back to the new pindex to complete vm_page_insert(). */
 	m->pindex = new_pindex;
 	m->object = new_object;
 	vm_page_unlock(m);
 	vm_page_insert_radixdone(m, new_object, mpred);
 	vm_page_dirty(m);
 	return (0);
 }
 
 /*
  *	Convert all of the given object's cached pages that have a
  *	pindex within the given range into free pages.  If the value
  *	zero is given for "end", then the range's upper bound is
  *	infinity.  If the given object is backed by a vnode and it
  *	transitions from having one or more cached pages to none, the
  *	vnode's hold count is reduced. 
  */
 void
 vm_page_cache_free(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
 {
 	vm_page_t m;
 	boolean_t empty;
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (__predict_false(vm_radix_is_empty(&object->cache))) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return;
 	}
 	while ((m = vm_radix_lookup_ge(&object->cache, start)) != NULL) {
 		if (end != 0 && m->pindex >= end)
 			break;
 		vm_radix_remove(&object->cache, m->pindex);
 		vm_page_cache_turn_free(m);
 	}
 	empty = vm_radix_is_empty(&object->cache);
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (object->type == OBJT_VNODE && empty)
 		vdrop(object->handle);
 }
 
 /*
  *	Returns the cached page that is associated with the given
  *	object and offset.  If, however, none exists, returns NULL.
  *
  *	The free page queue must be locked.
  */
 static inline vm_page_t
 vm_page_cache_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	return (vm_radix_lookup(&object->cache, pindex));
 }
 
 /*
  *	Remove the given cached page from its containing object's
  *	collection of cached pages.
  *
  *	The free page queue must be locked.
  */
 static void
 vm_page_cache_remove(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	KASSERT((m->flags & PG_CACHED) != 0,
 	    ("vm_page_cache_remove: page %p is not cached", m));
 	vm_radix_remove(&m->object->cache, m->pindex);
 	m->object = NULL;
 	vm_cnt.v_cache_count--;
 }
 
 /*
  *	Transfer all of the cached pages with offset greater than or
  *	equal to 'offidxstart' from the original object's cache to the
  *	new object's cache.  However, any cached pages with offset
  *	greater than or equal to the new object's size are kept in the
  *	original object.  Initially, the new object's cache must be
  *	empty.  Offset 'offidxstart' in the original object must
  *	correspond to offset zero in the new object.
  *
  *	The new object must be locked.
  */
 void
 vm_page_cache_transfer(vm_object_t orig_object, vm_pindex_t offidxstart,
     vm_object_t new_object)
 {
 	vm_page_t m;
 
 	/*
 	 * Insertion into an object's collection of cached pages
 	 * requires the object to be locked.  In contrast, removal does
 	 * not.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 	KASSERT(vm_radix_is_empty(&new_object->cache),
 	    ("vm_page_cache_transfer: object %p has cached pages",
 	    new_object));
 	mtx_lock(&vm_page_queue_free_mtx);
 	while ((m = vm_radix_lookup_ge(&orig_object->cache,
 	    offidxstart)) != NULL) {
 		/*
 		 * Transfer all of the pages with offset greater than or
 		 * equal to 'offidxstart' from the original object's
 		 * cache to the new object's cache.
 		 */
 		if ((m->pindex - offidxstart) >= new_object->size)
 			break;
 		vm_radix_remove(&orig_object->cache, m->pindex);
 		/* Update the page's object and offset. */
 		m->object = new_object;
 		m->pindex -= offidxstart;
 		if (vm_radix_insert(&new_object->cache, m))
 			vm_page_cache_turn_free(m);
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  *	Returns TRUE if a cached page is associated with the given object and
  *	offset, and FALSE otherwise.
  *
  *	The object must be locked.
  */
 boolean_t
 vm_page_is_cached(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 
 	/*
 	 * Insertion into an object's collection of cached pages requires the
 	 * object to be locked.  Therefore, if the object is locked and the
 	 * object's collection is empty, there is no need to acquire the free
 	 * page queues lock in order to prove that the specified page doesn't
 	 * exist.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (__predict_true(vm_object_cache_is_empty(object)))
 		return (FALSE);
 	mtx_lock(&vm_page_queue_free_mtx);
 	m = vm_page_cache_lookup(object, pindex);
 	mtx_unlock(&vm_page_queue_free_mtx);
 	return (m != NULL);
 }
 
 /*
  *	vm_page_alloc:
  *
  *	Allocate and return a page that is associated with the specified
  *	object and offset pair.  By default, this page is exclusive busied.
  *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_IFCACHED	return page only if it is cached
  *	VM_ALLOC_IFNOTCACHED	return NULL, do not reactivate if the page
  *				is cached
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy 
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
 	struct vnode *vp = NULL;
 	vm_object_t m_object;
 	vm_page_t m, mpred;
 	int flags, req_class, unmanaged;
 
 	mpred = 0;	/* XXX: pacify gcc */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
 	    req));
 	if (object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 
 	unmanaged = (object == NULL || (object->flags & OBJ_UNMANAGED) != 0);
 	KASSERT(unmanaged == 0 || (req & VM_ALLOC_WIRED) != 0,
 	    ("vm_page_alloc: unmanaged but unwired request req(%x)", req));
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	if (object != NULL) {
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 		KASSERT(mpred == NULL || mpred->pindex != pindex,
 		   ("vm_page_alloc: pindex already allocated"));
 	}
 
 	/*
 	 * The page allocation request can came from consumers which already
 	 * hold the free page queue mutex, like vm_page_insert() in
 	 * vm_page_cache().
 	 */
 	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
 	if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count > 0)) {
 		/*
 		 * Allocate from the free queue if the number of free pages
 		 * exceeds the minimum for the request class.
 		 */
 		if (object != NULL &&
 		    (m = vm_page_cache_lookup(object, pindex)) != NULL) {
 			if ((req & VM_ALLOC_IFNOTCACHED) != 0) {
 				mtx_unlock(&vm_page_queue_free_mtx);
 				return (NULL);
 			}
 			if (vm_phys_unfree_page(m))
 				vm_phys_set_pool(VM_FREEPOOL_DEFAULT, m, 0);
 #if VM_NRESERVLEVEL > 0
 			else if (!vm_reserv_reactivate_page(m))
 #else
 			else
 #endif
 				panic("vm_page_alloc: cache page %p is missing"
 				    " from the free queue", m);
 		} else if ((req & VM_ALLOC_IFCACHED) != 0) {
 			mtx_unlock(&vm_page_queue_free_mtx);
 			return (NULL);
 #if VM_NRESERVLEVEL > 0
 		} else if (object == NULL || (object->flags & (OBJ_COLORED |
 		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
 		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL) {
 #else
 		} else {
 #endif
 			m = vm_phys_alloc_pages(object != NULL ?
 			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 #if VM_NRESERVLEVEL > 0
 			if (m == NULL && vm_reserv_reclaim_inactive()) {
 				m = vm_phys_alloc_pages(object != NULL ?
 				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
 				    0);
 			}
 #endif
 		}
 	} else {
 		/*
 		 * Not allocatable, give up.
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
 		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 
 	/*
 	 *  At this point we had better have found a good page.
 	 */
 	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
 	KASSERT(m->queue == PQ_NONE,
 	    ("vm_page_alloc: page %p has unexpected queue %d", m, m->queue));
 	KASSERT(m->wire_count == 0, ("vm_page_alloc: page %p is wired", m));
 	KASSERT(m->hold_count == 0, ("vm_page_alloc: page %p is held", m));
 	KASSERT(!vm_page_sbusied(m), 
 	    ("vm_page_alloc: page %p is busy", m));
 	KASSERT(m->dirty == 0, ("vm_page_alloc: page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 	    ("vm_page_alloc: page %p has unexpected memattr %d", m,
 	    pmap_page_get_memattr(m)));
 	if ((m->flags & PG_CACHED) != 0) {
 		KASSERT((m->flags & PG_ZERO) == 0,
 		    ("vm_page_alloc: cached page %p is PG_ZERO", m));
 		KASSERT(m->valid != 0,
 		    ("vm_page_alloc: cached page %p is invalid", m));
 		if (m->object == object && m->pindex == pindex)
 			vm_cnt.v_reactivated++;
 		else
 			m->valid = 0;
 		m_object = m->object;
 		vm_page_cache_remove(m);
 		if (m_object->type == OBJT_VNODE &&
 		    vm_object_cache_is_empty(m_object))
 			vp = m_object->handle;
 	} else {
 		KASSERT(m->valid == 0,
 		    ("vm_page_alloc: free page %p is valid", m));
 		vm_phys_freecnt_adj(m, -1);
 		if ((m->flags & PG_ZERO) != 0)
 			vm_page_zero_count--;
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	flags &= m->flags;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	m->flags = flags;
 	m->aflags = 0;
 	m->oflags = (unmanaged != 0) ? VPO_UNMANAGED : 0;
 	m->busy_lock = VPB_UNBUSIED;
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 	if ((req & VM_ALLOC_SBUSY) != 0)
 		m->busy_lock = VPB_SHARERS_WORD(1);
 	if (req & VM_ALLOC_WIRED) {
 		/*
 		 * The page lock is not required for wiring a page until that
 		 * page is inserted into the object.
 		 */
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	m->act_count = 0;
 
 	if (object != NULL) {
 		if (vm_page_insert_after(m, object, pindex, mpred)) {
 			/* See the comment below about hold count. */
 			if (vp != NULL)
 				vdrop(vp);
 			pagedaemon_wakeup();
 			if ((req & VM_ALLOC_WIRED) != 0 && unmanaged == 0) {
 				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 				m->wire_count = 0;
 			}
 			m->object = NULL;
 			vm_page_free(m);
 			return (NULL);
 		}
 
 		/* Ignore device objects; the pager sets "memattr" for them. */
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    (object->flags & OBJ_FICTITIOUS) == 0)
 			pmap_page_set_memattr(m, object->memattr);
 	} else
 		m->pindex = pindex;
 
 	/*
 	 * The following call to vdrop() must come after the above call
 	 * to vm_page_insert() in case both affect the same object and
 	 * vnode.  Otherwise, the affected vnode's hold count could
 	 * temporarily become zero.
 	 */
 	if (vp != NULL)
 		vdrop(vp);
 
 	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 
 	return (m);
 }
 
 static void
 vm_page_alloc_contig_vdrop(struct spglist *lst)
 {
 
 	while (!SLIST_EMPTY(lst)) {
 		vdrop((struct vnode *)SLIST_FIRST(lst)-> plinks.s.pv);
 		SLIST_REMOVE_HEAD(lst, plinks.s.ss);
 	}
 }
 
 /*
  *	vm_page_alloc_contig:
  *
  *	Allocate a contiguous set of physical pages of the given size "npages"
  *	from the free lists.  All of the physical pages must be at or above
  *	the given physical address "low" and below the given physical address
  *	"high".  The given value "alignment" determines the alignment of the
  *	first physical page in the set.  If the given value "boundary" is
  *	non-zero, then the set of physical pages cannot cross any physical
  *	address boundary that is a multiple of that value.  Both "alignment"
  *	and "boundary" must be a power of two.
  *
  *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
  *	then the memory attribute setting for the physical pages is configured
  *	to the object's memory attribute setting.  Otherwise, the memory
  *	attribute setting for the physical pages is configured to "memattr",
  *	overriding the object's memory attribute setting.  However, if the
  *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
  *	memory attribute setting for the physical pages cannot be configured
  *	to VM_MEMATTR_DEFAULT.
  *
  *	The caller must always specify an allocation class.
  *
  *	The returned pages will all be wired.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy 
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
 	struct vnode *drop;
 	struct spglist deferred_vdrop_list;
 	vm_page_t m, m_tmp, m_ret;
 	u_int flags;
 	int req_class;
 
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", (void *)object,
 	    req));
 	KASSERT((req & VM_ALLOC_WIRED) == 0,
 	    ("vm_page_alloc_contig: VM_ALLOC_WIRED passed in req (%x)", req));
 	if (object != NULL) {
 		VM_OBJECT_ASSERT_WLOCKED(object);
 		KASSERT(object->type == OBJT_PHYS,
 		    ("vm_page_alloc_contig: object %p isn't OBJT_PHYS",
 		    object));
 	}
 	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	SLIST_INIT(&deferred_vdrop_list);
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages +
 	    vm_cnt.v_free_reserved || (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages +
 	    vm_cnt.v_interrupt_free_min) || (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count >= npages)) {
 #if VM_NRESERVLEVEL > 0
 retry:
 		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
 		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
 		    low, high, alignment, boundary)) == NULL)
 #endif
 			m_ret = vm_phys_alloc_contig(npages, low, high,
 			    alignment, boundary);
 	} else {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit, npages);
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 	if (m_ret != NULL)
 		for (m = m_ret; m < &m_ret[npages]; m++) {
 			drop = vm_page_alloc_init(m);
 			if (drop != NULL) {
 				/*
 				 * Enqueue the vnode for deferred vdrop().
 				 */
 				m->plinks.s.pv = drop;
 				SLIST_INSERT_HEAD(&deferred_vdrop_list, m,
 				    plinks.s.ss);
 			}
 		}
 	else {
 #if VM_NRESERVLEVEL > 0
 		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
 		    boundary))
 			goto retry;
 #endif
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (m_ret == NULL)
 		return (NULL);
 
 	/*
 	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	atomic_add_int(&vm_cnt.v_wire_count, npages);
 	if (object != NULL) {
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    memattr == VM_MEMATTR_DEFAULT)
 			memattr = object->memattr;
 	}
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		m->aflags = 0;
 		m->flags = (m->flags | PG_NODUMP) & flags;
 		m->busy_lock = VPB_UNBUSIED;
 		if (object != NULL) {
 			if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
 				m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 			if ((req & VM_ALLOC_SBUSY) != 0)
 				m->busy_lock = VPB_SHARERS_WORD(1);
 		}
 		m->wire_count = 1;
 
 		/* Unmanaged pages don't use "act_count". */
 		m->oflags = VPO_UNMANAGED;
 		if (object != NULL) {
 			if (vm_page_insert(m, object, pindex)) {
 				vm_page_alloc_contig_vdrop(
 				    &deferred_vdrop_list);
 				if (vm_paging_needed())
 					pagedaemon_wakeup();
 				for (m_tmp = m, m = m_ret;
 				    m < &m_ret[npages]; m++) {
 					m->wire_count = 1;
 					m->oflags = VPO_UNMANAGED;
 					if (m >= m_tmp)
 						m->object = NULL;
 					vm_page_free(m);
 				}
 				return (NULL);
 			}
 		} else
 			m->pindex = pindex;
 		if (memattr != VM_MEMATTR_DEFAULT)
 			pmap_page_set_memattr(m, memattr);
 		pindex++;
 	}
 	vm_page_alloc_contig_vdrop(&deferred_vdrop_list);
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 	return (m_ret);
 }
 
 /*
  * Initialize a page that has been freshly dequeued from a freelist.
  * The caller has to drop the vnode returned, if it is not NULL.
  *
  * This function may only be used to initialize unmanaged pages.
  *
  * To be called with vm_page_queue_free_mtx held.
  */
 static struct vnode *
 vm_page_alloc_init(vm_page_t m)
 {
 	struct vnode *drop;
 	vm_object_t m_object;
 
 	KASSERT(m->queue == PQ_NONE,
 	    ("vm_page_alloc_init: page %p has unexpected queue %d",
 	    m, m->queue));
 	KASSERT(m->wire_count == 0,
 	    ("vm_page_alloc_init: page %p is wired", m));
 	KASSERT(m->hold_count == 0,
 	    ("vm_page_alloc_init: page %p is held", m));
 	KASSERT(!vm_page_sbusied(m),
 	    ("vm_page_alloc_init: page %p is busy", m));
 	KASSERT(m->dirty == 0,
 	    ("vm_page_alloc_init: page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 	    ("vm_page_alloc_init: page %p has unexpected memattr %d",
 	    m, pmap_page_get_memattr(m)));
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	drop = NULL;
 	if ((m->flags & PG_CACHED) != 0) {
 		KASSERT((m->flags & PG_ZERO) == 0,
 		    ("vm_page_alloc_init: cached page %p is PG_ZERO", m));
 		m->valid = 0;
 		m_object = m->object;
 		vm_page_cache_remove(m);
 		if (m_object->type == OBJT_VNODE &&
 		    vm_object_cache_is_empty(m_object))
 			drop = m_object->handle;
 	} else {
 		KASSERT(m->valid == 0,
 		    ("vm_page_alloc_init: free page %p is valid", m));
 		vm_phys_freecnt_adj(m, -1);
 		if ((m->flags & PG_ZERO) != 0)
 			vm_page_zero_count--;
 	}
 	return (drop);
 }
 
 /*
  * 	vm_page_alloc_freelist:
  *
  *	Allocate a physical page from the specified free page list.
  *
  *	The caller must always specify an allocation class.
  *
  *	The returned page will be wired.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
 {
 	struct vnode *drop;
 	vm_page_t m;
 	u_int flags;
 	int req_class;
 
 	KASSERT((req & VM_ALLOC_WIRED) == 0,
 	    ("vm_page_alloc_freelist: VM_ALLOC_WIRED passed in req (%x)",
 	    req));
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Do not allocate reserved pages unless the req has asked for it.
 	 */
 	mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
 	if (vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count + vm_cnt.v_cache_count > 0))
 		m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
 	else {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
 		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 	if (m == NULL) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return (NULL);
 	}
 	drop = vm_page_alloc_init(m);
 	mtx_unlock(&vm_page_queue_free_mtx);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited.
 	 */
 	m->aflags = 0;
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	m->flags &= flags;
 
 	/*
 	 * The page lock is not required for wiring a page that does
 	 * not belong to an object.
 	 */
 	atomic_add_int(&vm_cnt.v_wire_count, 1);
 	m->wire_count = 1;
 
 	/* Unmanaged pages don't use "act_count". */
 	m->oflags = VPO_UNMANAGED;
 	if (drop != NULL)
 		vdrop(drop);
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 	return (m);
 }
 
 /*
  *	vm_wait:	(also see VM_WAIT macro)
  *
  *	Sleep until free pages are available for allocation.
  *	- Called in various places before memory allocations.
  */
 void
 vm_wait(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (curproc == pageproc) {
 		vm_pageout_pages_needed = 1;
 		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 		    PDROP | PSWP, "VMWait", 0);
 	} else {
 		if (!vm_pages_needed) {
 			vm_pages_needed = 1;
 			wakeup(&vm_pages_needed);
 		}
 		msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
 		    "vmwait", 0);
 	}
 }
 
 /*
  *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
  *
  *	Sleep until free pages are available for allocation.
  *	- Called only in vm_fault so that processes page faulting
  *	  can be easily tracked.
  *	- Sleeps at a lower priority than vm_wait() so that vm_wait()ing
  *	  processes will be able to grab memory first.  Do not change
  *	  this balance without careful testing first.
  */
 void
 vm_waitpfault(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (!vm_pages_needed) {
 		vm_pages_needed = 1;
 		wakeup(&vm_pages_needed);
 	}
 	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PUSER,
 	    "pfault", 0);
 }
 
 struct vm_pagequeue *
 vm_page_pagequeue(vm_page_t m)
 {
 
 	return (&vm_phys_domain(m)->vmd_pagequeues[m->queue]);
 }
 
 /*
  *	vm_page_dequeue:
  *
  *	Remove the given page from its current page queue.
  *
  *	The page must be locked.
  */
 void
 vm_page_dequeue(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_assert_locked(m);
-	KASSERT(m->queue == PQ_ACTIVE || m->queue == PQ_INACTIVE,
-	    ("vm_page_dequeue: page %p is not queued", m));
+	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
+	    m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pq);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  *	vm_page_dequeue_locked:
  *
  *	Remove the given page from its current page queue.
  *
  *	The page and page queue must be locked.
  */
 void
 vm_page_dequeue_locked(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pq);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pq);
 }
 
 /*
  *	vm_page_enqueue:
  *
  *	Add the given page to the specified page queue.
  *
  *	The page must be locked.
  */
 static void
-vm_page_enqueue(int queue, vm_page_t m)
+vm_page_enqueue(uint8_t queue, vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
+	KASSERT(queue < PQ_COUNT,
+	    ("vm_page_enqueue: invalid queue %u request for page %p",
+	    queue, m));
+
 	pq = &vm_phys_domain(m)->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	m->queue = queue;
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_inc(pq);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  *	vm_page_requeue:
  *
  *	Move the given page to the tail of its current page queue.
  *
  *	The page must be locked.
  */
 void
 vm_page_requeue(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue: page %p is not queued", m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_lock(pq);
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  *	vm_page_requeue_locked:
  *
  *	Move the given page to the tail of its current page queue.
  *
  *	The page queue must be locked.
  */
 void
 vm_page_requeue_locked(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue_locked: page %p is not queued", m));
 	pq = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pq);
 	TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 	TAILQ_INSERT_TAIL(&pq->pq_pl, m, plinks.q);
 }
 
 /*
  *	vm_page_activate:
  *
  *	Put the specified page on the active list (if appropriate).
  *	Ensure that act_count is at least ACT_INIT but do not otherwise
  *	mess with it.
  *
  *	The page must be locked.
  */
 void
 vm_page_activate(vm_page_t m)
 {
 	int queue;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	if ((queue = m->queue) != PQ_ACTIVE) {
 		if (m->wire_count == 0 && (m->oflags & VPO_UNMANAGED) == 0) {
 			if (m->act_count < ACT_INIT)
 				m->act_count = ACT_INIT;
 			if (queue != PQ_NONE)
 				vm_page_dequeue(m);
 			vm_page_enqueue(PQ_ACTIVE, m);
 		} else
 			KASSERT(queue == PQ_NONE,
 			    ("vm_page_activate: wired page %p is queued", m));
 	} else {
 		if (m->act_count < ACT_INIT)
 			m->act_count = ACT_INIT;
 	}
 }
 
 /*
  *	vm_page_free_wakeup:
  *
  *	Helper routine for vm_page_free_toq() and vm_page_cache().  This
  *	routine is called when a page has been added to the cache or free
  *	queues.
  *
  *	The page queues must be locked.
  */
 static inline void
 vm_page_free_wakeup(void)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 	/*
 	 * if pageout daemon needs pages, then tell it that there are
 	 * some free.
 	 */
 	if (vm_pageout_pages_needed &&
 	    vm_cnt.v_cache_count + vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
 		wakeup(&vm_pageout_pages_needed);
 		vm_pageout_pages_needed = 0;
 	}
 	/*
 	 * wakeup processes that are waiting on memory if we hit a
 	 * high water mark. And wakeup scheduler process if we have
 	 * lots of memory. this process will swapin processes.
 	 */
 	if (vm_pages_needed && !vm_page_count_min()) {
 		vm_pages_needed = 0;
 		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
 /*
  *	Turn a cached page into a free page, by changing its attributes.
  *	Keep the statistics up-to-date.
  *
  *	The free page queue must be locked.
  */
 static void
 vm_page_cache_turn_free(vm_page_t m)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
 	m->object = NULL;
 	m->valid = 0;
 	KASSERT((m->flags & PG_CACHED) != 0,
 	    ("vm_page_cache_turn_free: page %p is not cached", m));
 	m->flags &= ~PG_CACHED;
 	vm_cnt.v_cache_count--;
 	vm_phys_freecnt_adj(m, 1);
 }
 
 /*
  *	vm_page_free_toq:
  *
  *	Returns the given page to the free list,
  *	disassociating it with any VM object.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
 void
 vm_page_free_toq(vm_page_t m)
 {
 
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		vm_page_lock_assert(m, MA_OWNED);
 		KASSERT(!pmap_page_is_mapped(m),
 		    ("vm_page_free_toq: freeing mapped page %p", m));
 	} else {
 		KASSERT(m->queue == PQ_NONE,
 		    ("vm_page_free_toq: unmanaged page %p is queued", m));
 		KASSERT(m->wire_count == 1,
 	    ("vm_page_free_toq: invalid wired count %u for unmanaged page %p",
 		    m->wire_count, m));
 		m->wire_count--;
 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 	}
 	PCPU_INC(cnt.v_tfree);
 
 	if (vm_page_sbusied(m))
 		panic("vm_page_free: freeing busy page %p", m);
 
 	/*
 	 * Unqueue, then remove page.  Note that we cannot destroy
 	 * the page here because we do not want to call the pager's
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
 	vm_page_remque(m);
 	vm_page_remove(m);
 
 	/*
 	 * If fictitious remove object association and
 	 * return, otherwise delay object association removal.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		return;
 	}
 
 	m->valid = 0;
 	vm_page_undirty(m);
 
 	if (m->wire_count != 0)
 		panic("vm_page_free: freeing wired page %p", m);
 	if (m->hold_count != 0) {
 		m->flags &= ~PG_ZERO;
 		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
 		m->flags |= PG_UNHOLDFREE;
 	} else {
 		/*
 		 * Restore the default memory attribute to the page.
 		 */
 		if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 			pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 
 		/*
 		 * Insert the page into the physical memory allocator's
 		 * cache/free page queues.
 		 */
 		mtx_lock(&vm_page_queue_free_mtx);
 		vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 		if (!vm_reserv_free_page(m))
 #else
 		if (TRUE)
 #endif
 			vm_phys_free_pages(m, 0);
 		if ((m->flags & PG_ZERO) != 0)
 			++vm_page_zero_count;
 		else
 			vm_page_zero_idle_wakeup();
 		vm_page_free_wakeup();
 		mtx_unlock(&vm_page_queue_free_mtx);
 	}
 }
 
 /*
  *	vm_page_wire:
  *
  *	Add one wiring to the page.  The first wiring of a page takes it
  *	off whatever paging queue it is on and bumps the global wired
  *	page counter; later wirings only increment the per-page count.
  *
  *	A fictitious page is left untouched; its wire count must already
  *	be one and stays that way.
  *
  *	The page must be locked.
  */
 void
 vm_page_wire(vm_page_t m)
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	/*
 	 * Fictitious pages carry a permanent wire count of one.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 		    ("vm_page_wire: fictitious page %p's wire count isn't one",
 		    m));
 		return;
 	}
 
 	if (m->wire_count != 0) {
 		/*
 		 * Already wired: the statistics and the queue state were
 		 * taken care of by the first wiring.
 		 */
 		m->wire_count++;
 	} else {
 		/*
 		 * First wiring.  An unmanaged page is expected to be off
 		 * the queues already; vm_page_remque() is called in any
 		 * case before the global counter is bumped.
 		 */
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
 		    m->queue == PQ_NONE,
 		    ("vm_page_wire: unmanaged page %p is queued", m));
 		vm_page_remque(m);
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
 }
 
 /*
  * vm_page_unwire:
  *
  * Release one wiring of the specified page, potentially enabling it to be
  * paged again.  If paging is enabled, then the value of the parameter
- * "activate" determines to which queue the page is added.  If "activate" is
- * non-zero, then the page is added to the active queue.  Otherwise, it is
- * added to the inactive queue.
+ * "queue" determines to which queue the page is added.
  *
- * However, unless the page belongs to an object, it is not enqueued because
- * it cannot be paged out.
+ * If a page is fictitious or managed, then its wire count must always be one.
  *
- * If a page is fictitious, then its wire count must always be one.
- *
  * A managed page must be locked.
  */
 void
-vm_page_unwire(vm_page_t m, int activate)
+vm_page_unwire(vm_page_t m, uint8_t queue)
 {
 
+	/* "queue" is handed to vm_page_enqueue() and must be below PQ_COUNT. */
+	KASSERT(queue < PQ_COUNT,
+	    ("vm_page_unwire: invalid queue %u request for page %p",
+	    queue, m));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_lock_assert(m, MA_OWNED);
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
 		return;
 	}
 	if (m->wire_count > 0) {
 		m->wire_count--;
 		if (m->wire_count == 0) {
+			/*
+			 * Dropping the last wiring of an unmanaged or
+			 * object-less page is treated as fatal here.
+			 */
 			if ((m->oflags & VPO_UNMANAGED) != 0 ||
 			    m->object == NULL)
 		panic("vm_page_unwire: unmanaged page %p's wire count is one",
 				    m);
 			atomic_subtract_int(&vm_cnt.v_wire_count, 1);
-			if (!activate)
+			if (queue == PQ_INACTIVE)
 				m->flags &= ~PG_WINATCFLS;
-			vm_page_enqueue(activate ? PQ_ACTIVE : PQ_INACTIVE, m);
+			vm_page_enqueue(queue, m);
 		}
 	} else
 		panic("vm_page_unwire: page %p's wire count is zero", m);
 }
 
 /*
  * Place the specified page on the inactive queue.
  *
  * Nothing is done if the page is already on the inactive queue, is
  * wired, or is unmanaged.  Otherwise the page is taken off its current
  * queue (if any), PG_WINATCFLS is cleared, and the page is inserted
  * into the inactive queue of its physical memory domain.
  *
  * "athead" selects the insertion point.  Zero gives the normal LRU
  * behaviour (tail insertion).  Non-zero inserts at the head, which is
  * used for pages that should be treated 'as if they were placed in the
  * cache' and reclaimed sooner, without unmapping them from the process
  * address space.  This keeps one-time-use data such as filesystem
  * meta-data from causing an unnecessary paging load on memory bound
  * processes.
  *
  * The page must be locked.
  */
 static inline void
 _vm_page_deactivate(vm_page_t m, int athead)
 {
 	struct vm_pagequeue *inactq;
 	int oldq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	/*
 	 * Already inactive: nothing to do.
 	 */
 	oldq = m->queue;
 	if (oldq == PQ_INACTIVE)
 		return;
 
 	/*
 	 * Wired and unmanaged pages are never placed on the queue.
 	 */
 	if (m->wire_count != 0 || (m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
 	if (oldq != PQ_NONE)
 		vm_page_dequeue(m);
 	m->flags &= ~PG_WINATCFLS;
 
 	inactq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
 	vm_pagequeue_lock(inactq);
 	m->queue = PQ_INACTIVE;
 	if (athead != 0)
 		TAILQ_INSERT_HEAD(&inactq->pq_pl, m, plinks.q);
 	else
 		TAILQ_INSERT_TAIL(&inactq->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_inc(inactq);
 	vm_pagequeue_unlock(inactq);
 }
 
 /*
  * Move the specified page to the inactive queue.
  *
  * This is the public entry point.  It calls _vm_page_deactivate() with
  * athead == 0, so the page is appended at the tail of the queue (plain
  * LRU ordering).
  *
  * The page must be locked.
  */
 void
 vm_page_deactivate(vm_page_t m)
 {
 
 	_vm_page_deactivate(m, 0);
 }
 
 /*
  * vm_page_try_to_cache:
  *
  * Attempt to move the page to the cache via vm_page_cache().  The
  * attempt is abandoned if the page is dirty, held, wired, unmanaged or
  * busied, or if it turns out to be dirty once all of its mappings have
  * been removed.
  *
  * Returns 0 on failure, 1 on success.
  *
  * The page and its object must be locked.
  */
 int
 vm_page_try_to_cache(vm_page_t m)
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0)
 		return (0);
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
 		return (0);
 
 	/*
 	 * The dirty field is checked a second time after the mappings
 	 * have been torn down.
 	 */
 	pmap_remove_all(m);
 	if (m->dirty != 0)
 		return (0);
 
 	vm_page_cache(m);
 	return (1);
 }
 
 /*
  * vm_page_try_to_free()
  *
  *	Attempt to free the page.  Nothing is done if the page is dirty,
  *	held, wired, unmanaged or busied, or if it turns out to be dirty
  *	once all of its mappings have been removed.
  *
  *	1 is returned on success, 0 on failure.
  *
  *	The page must be locked; if it belongs to an object, that object
  *	must be write locked as well.
  */
 int
 vm_page_try_to_free(vm_page_t m)
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
 	if (m->object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0)
 		return (0);
 	if ((m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
 		return (0);
 
 	/*
 	 * The dirty field is checked a second time after the mappings
 	 * have been torn down.
 	 */
 	pmap_remove_all(m);
 	if (m->dirty != 0)
 		return (0);
 
 	vm_page_free(m);
 	return (1);
 }
 
 /*
  * vm_page_cache
  *
  * Put the specified page onto the page cache queue (if appropriate).
  *
  * The page is freed instead of cached when it holds no valid data, when
  * it belongs to a default object, when it belongs to a swap object that
  * has no backing store for it, or when inserting it into the object's
  * collection of cached pages fails.
  *
  * The page must not be busied, unmanaged, held or wired (this panics),
  * and must be neither mapped nor dirty.
  *
  * The object and page must be locked.
  */
 void
 vm_page_cache(vm_page_t m)
 {
 	vm_object_t object;
 	boolean_t cache_was_empty;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (vm_page_busied(m) || (m->oflags & VPO_UNMANAGED) ||
 	    m->hold_count || m->wire_count)
 		panic("vm_page_cache: attempting to cache busy page");
 	KASSERT(!pmap_page_is_mapped(m),
 	    ("vm_page_cache: page %p is mapped", m));
 	KASSERT(m->dirty == 0, ("vm_page_cache: page %p is dirty", m));
 	if (m->valid == 0 || object->type == OBJT_DEFAULT ||
 	    (object->type == OBJT_SWAP &&
 	    !vm_pager_has_page(object, m->pindex, NULL, NULL))) {
 		/*
 		 * Hypothesis: A cache-eligible page belonging to a
 		 * default object or swap object but without a backing
 		 * store must be zero filled.
 		 */
 		vm_page_free(m);
 		return;
 	}
 	KASSERT((m->flags & PG_CACHED) == 0,
 	    ("vm_page_cache: page %p is already cached", m));
 
 	/*
 	 * Remove the page from the paging queues.
 	 */
 	vm_page_remque(m);
 
 	/*
 	 * Remove the page from the object's collection of resident
 	 * pages.
 	 */
 	vm_radix_remove(&object->rtree, m->pindex);
 	TAILQ_REMOVE(&object->memq, m, listq);
 	object->resident_page_count--;
 
 	/*
 	 * Restore the default memory attribute to the page.
 	 */
 	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 
 	/*
 	 * Insert the page into the object's collection of cached pages
 	 * and the physical memory allocator's cache/free page queues.
 	 */
 	m->flags &= ~PG_ZERO;
 	mtx_lock(&vm_page_queue_free_mtx);
 	cache_was_empty = vm_radix_is_empty(&object->cache);
 	if (vm_radix_insert(&object->cache, m)) {
 		/*
 		 * The insertion failed, so the page is freed rather than
 		 * cached.  It has already left the resident set above.
 		 * Only a vnode object's handle is a vnode, so the hold
 		 * count is dropped for that object type alone, matching
 		 * the vhold()/vdrop() accounting at the end of this
 		 * function.
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		if (object->type == OBJT_VNODE &&
 		    object->resident_page_count == 0)
 			vdrop(object->handle);
 		m->object = NULL;
 		vm_page_free(m);
 		return;
 	}
 
 	/*
 	 * The above call to vm_radix_insert() could reclaim the one pre-
 	 * existing cached page from this object, resulting in a call to
 	 * vdrop().
 	 */
 	if (!cache_was_empty)
 		cache_was_empty = vm_radix_is_singleton(&object->cache);
 
 	m->flags |= PG_CACHED;
 	vm_cnt.v_cache_count++;
 	PCPU_INC(cnt.v_tcached);
 #if VM_NRESERVLEVEL > 0
 	if (!vm_reserv_free_page(m)) {
 #else
 	if (TRUE) {
 #endif
 		vm_phys_set_pool(VM_FREEPOOL_CACHE, m, 0);
 		vm_phys_free_pages(m, 0);
 	}
 	vm_page_free_wakeup();
 	mtx_unlock(&vm_page_queue_free_mtx);
 
 	/*
 	 * Increment the vnode's hold count if this is the object's only
 	 * cached page.  Decrement the vnode's hold count if this was
 	 * the object's only resident page.
 	 */
 	if (object->type == OBJT_VNODE) {
 		if (cache_was_empty && object->resident_page_count != 0)
 			vhold(object->handle);
 		else if (!cache_was_empty && object->resident_page_count == 0)
 			vdrop(object->handle);
 	}
 }
 
 /*
  * vm_page_advise
  *
  *	Apply madvise() advice to a page: leave it alone, deactivate it
  *	at the tail of the inactive queue, or deactivate it at the head
  *	of the inactive queue (the stand-in for caching it).  Advice
  *	other than MADV_FREE and MADV_DONTNEED is ignored.
  *
  *	Always moving the page towards the cache makes it recycle too
  *	quickly, and small objects would never be fully cached.  Always
  *	moving it to the inactive queue lets very large objects blow away
  *	the inactive and cache queues.  The choice is therefore made by a
  *	fixed weighting driven by the per-CPU "dnweight" counter, with the
  *	cache-like placement weighted highest.  Occasionally forcing pages
  *	into the other queues makes the system rebalance them, potentially
  *	recovering unrelated space from the active queue, without doing
  *	so too often.
  *
  *	The object and page must be locked.
  */
 void
 vm_page_advise(vm_page_t m, int advice)
 {
 	int head, weight;
 
 	vm_page_assert_locked(m);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	switch (advice) {
 	case MADV_FREE:
 		/*
 		 * Mark the page clean so that the system may free it.
 		 * Such pages are often reused quickly by malloc(), so
 		 * nothing is done here that would cause a page fault on
 		 * reuse: the page is neither freed nor cached, only made
 		 * as freeable as possible without unmapping it.
 		 */
 		m->dirty = 0;
 		m->act_count = 0;
 		break;
 	case MADV_DONTNEED:
 		break;
 	default:
 		return;
 	}
 
 	weight = PCPU_GET(dnweight);
 	PCPU_INC(dnweight);
 
 	/*
 	 * Occasionally leave the page where it is, only aging it.  A page
 	 * that is already inactive is handled the same way.
 	 */
 	if ((weight & 0x01F0) == 0 || m->queue == PQ_INACTIVE) {
 		if (m->act_count >= ACT_INIT)
 			--m->act_count;
 		return;
 	}
 
 	/*
 	 * Drop the reference flag; otherwise the page daemon would
 	 * reactivate the page immediately.
 	 */
 	vm_page_aflag_clear(m, PGA_REFERENCED);
 
 	/*
 	 * Only MADV_DONTNEED can get here with advice other than
 	 * MADV_FREE; pick up a modification recorded in the pmap.
 	 */
 	if (advice == MADV_DONTNEED && m->dirty == 0 && pmap_is_modified(m))
 		vm_page_dirty(m);
 
 	/*
 	 * A dirty page, or 3 of every 32 weight patterns, is deactivated
 	 * at the tail of the queue.  The remaining 28 of 32 patterns
 	 * "cache" a clean page by deactivating it at the head instead.
 	 */
 	head = (m->dirty == 0 && (weight & 0x0070) != 0);
 	_vm_page_deactivate(m, head);
 }
 
 /*
  * Grab a page, waiting until we are woken up due to the page
  * changing state.  We keep on waiting, if the page continues
  * to be in the object.  If the page doesn't exist, first allocate it
  * and then conditionally zero it.
  *
  * "allocflags" is a mask of VM_ALLOC_* flags.  VM_ALLOC_SBUSY may only
  * be passed together with VM_ALLOC_IGN_SBUSY (asserted below).
  *
  * Returns the page found at, or newly allocated for, "pindex".  The
  * function retries instead of returning NULL.
  *
  * This routine may sleep.
  *
  * The object must be locked on entry.  The lock will, however, be released
  * and reacquired if the routine sleeps.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t m;
 	int sleep;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 retrylookup:
 	if ((m = vm_page_lookup(object, pindex)) != NULL) {
 		/*
 		 * With VM_ALLOC_IGN_SBUSY only an exclusive busy state
 		 * forces a sleep; otherwise any busy state does.
 		 */
 		sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
 		    vm_page_xbusied(m) : vm_page_busied(m);
 		if (sleep) {
 			/*
 			 * Reference the page before unlocking and
 			 * sleeping so that the page daemon is less
 			 * likely to reclaim it.
 			 */
 			vm_page_aflag_set(m, PGA_REFERENCED);
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(object);
 			vm_page_busy_sleep(m, "pgrbwt");
 			VM_OBJECT_WLOCK(object);
 			/* The object lock was dropped; look the page up again. */
 			goto retrylookup;
 		} else {
 			if ((allocflags & VM_ALLOC_WIRED) != 0) {
 				vm_page_lock(m);
 				vm_page_wire(m);
 				vm_page_unlock(m);
 			}
 			/*
 			 * Exclusive busy is the default; it is skipped when
 			 * the caller asked for no busy or for shared busy.
 			 */
 			if ((allocflags &
 			    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
 				vm_page_xbusy(m);
 			if ((allocflags & VM_ALLOC_SBUSY) != 0)
 				vm_page_sbusy(m);
 			return (m);
 		}
 	}
 	/*
 	 * No resident page at this index: allocate one.  VM_ALLOC_IGN_SBUSY
 	 * is masked off before the flags reach vm_page_alloc().
 	 */
 	m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_IGN_SBUSY);
 	if (m == NULL) {
 		/*
 		 * Allocation failed: drop the object lock around VM_WAIT
 		 * and start over with a fresh lookup.
 		 */
 		VM_OBJECT_WUNLOCK(object);
 		VM_WAIT;
 		VM_OBJECT_WLOCK(object);
 		goto retrylookup;
 	} else if (m->valid != 0)
 		/* A page that already has valid bits set is not zeroed. */
 		return (m);
 	/* Zero only on request, and only if not already marked PG_ZERO. */
 	if (allocflags & VM_ALLOC_ZERO && (m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 	return (m);
 }
 
 /*
  * Map a byte range within a page to the mask of valid/dirty bits that
  * covers it.  One bit stands for one DEV_BSIZE chunk, and every chunk
  * touched by [base, base + size) is included in the result.
  *
  * Inputs are required to range within a page.  An empty range yields
  * an empty mask.
  */
 vm_page_bits_t
 vm_page_bits(int base, int size)
 {
 	int hi, lo;
 
 	KASSERT(base + size <= PAGE_SIZE,
 	    ("vm_page_bits: illegal base/size %d/%d", base, size));
 
 	/*
 	 * Degenerate case: nothing is covered.
 	 */
 	if (size == 0)
 		return (0);
 
 	/*
 	 * Indexes of the chunks holding the first and the last byte.
 	 */
 	lo = base >> DEV_BSHIFT;
 	hi = (base + size - 1) >> DEV_BSHIFT;
 
 	/*
 	 * (2 << hi) is the bit just above the range, so the difference
 	 * sets bits lo through hi inclusive.
 	 */
 	return (((vm_page_bits_t)2 << hi) - ((vm_page_bits_t)1 << lo));
 }
 
 /*
  *	vm_page_set_valid_range:
  *
  *	Sets portions of a page valid.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zeroed.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  *
  *	The page's object must be write locked.
  */
 void
 vm_page_set_valid_range(vm_page_t m, int base, int size)
 {
 	int endoff, frag;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 *
 	 * The bit is built as a vm_page_bits_t, as is done in
 	 * vm_page_set_validclean(), so that the shift stays well defined
 	 * when the valid field is wider than an int.
 	 */
 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Assert that no previously invalid block that is now being validated
 	 * is already dirty.
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 	    ("vm_page_set_valid_range: page %p is dirty", m));
 
 	/*
 	 * Set valid bits inclusive of any overlap.
 	 */
 	m->valid |= vm_page_bits(base, size);
 }
 
 /*
  * Clear the given bits from the specified page's dirty field.
  *
  * "pagebits" is the mask of dirty bits to clear.  A plain store is used
  * when no concurrent pmap update of the field is possible; otherwise the
  * bits are cleared with an atomic operation whose width is selected at
  * compile time from PAGE_SIZE.
  *
  * The page's object must be write locked.
  */
 static __inline void
 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
 {
 	uintptr_t addr;
 #if PAGE_SIZE < 16384
 	int shift;
 #endif
 
 	/*
 	 * If the object is locked and the page is neither exclusive busy nor
 	 * write mapped, then the page's dirty field cannot possibly be
 	 * set by a concurrent pmap operation.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
 		m->dirty &= ~pagebits;
 	else {
 		/*
 		 * The pmap layer can call vm_page_dirty() without
 		 * holding a distinguished lock.  The combination of
 		 * the object's lock and an atomic operation suffice
 		 * to guarantee consistency of the page dirty field.
 		 *
 		 * For PAGE_SIZE == 32768 case, compiler already
 		 * properly aligns the dirty field, so no forcible
 		 * alignment is needed. Only require existence of
 		 * atomic_clear_64 when page size is 32768.
 		 */
 		addr = (uintptr_t)&m->dirty;
 #if PAGE_SIZE == 32768
 		atomic_clear_64((uint64_t *)addr, pagebits);
 #elif PAGE_SIZE == 16384
 		atomic_clear_32((uint32_t *)addr, pagebits);
 #else		/* PAGE_SIZE <= 8192 */
 		/*
 		 * Use a trick to perform a 32-bit atomic on the
 		 * containing aligned word, to not depend on the existence
 		 * of atomic_clear_{8, 16}.
 		 */
 		/* Byte offset of the dirty field within its 32-bit word. */
 		shift = addr & (sizeof(uint32_t) - 1);
 #if BYTE_ORDER == BIG_ENDIAN
 		/* On big-endian the offset counts from the other end. */
 		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
 #else
 		shift *= NBBY;
 #endif
 		/* Round the address down to the containing aligned word. */
 		addr &= ~(sizeof(uint32_t) - 1);
 		atomic_clear_32((uint32_t *)addr, pagebits << shift);
 #endif		/* PAGE_SIZE */
 	}
 }
 
 /*
  *	vm_page_set_validclean:
  *
  *	Sets portions of a page valid and clean.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zeroed.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  *
  *	The page's object must be write locked.
  */
 void
 vm_page_set_validclean(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t oldvalid, pagebits;
 	int endoff, frag;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
 	if ((frag = base & ~(DEV_BSIZE - 1)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the 
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = endoff & ~(DEV_BSIZE - 1)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
 	 * page we can safely clear the pmap modify bit.  We also
 	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
 	 *
 	 * We set valid bits inclusive of any overlap, but we can only
 	 * clear dirty bits for DEV_BSIZE chunks that are fully within
 	 * the range.
 	 */
 	/* Remember the valid mask as it was before this call. */
 	oldvalid = m->valid;
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
 #if 0	/* NOT YET */
 	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 		frag = DEV_BSIZE - frag;
 		base += frag;
 		size -= frag;
 		if (size < 0)
 			size = 0;
 	}
 	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
 #endif
 	if (base == 0 && size == PAGE_SIZE) {
 		/*
 		 * The page can only be modified within the pmap if it is
 		 * mapped, and it can only be mapped if it was previously
 		 * fully valid.
 		 */
 		if (oldvalid == VM_PAGE_BITS_ALL)
 			/*
 			 * Perform the pmap_clear_modify() first.  Otherwise,
 			 * a concurrent pmap operation, such as
 			 * pmap_protect(), could clear a modification in the
 			 * pmap and set the dirty field on the page before
 			 * pmap_clear_modify() had begun and after the dirty
 			 * field was cleared here.
 			 */
 			pmap_clear_modify(m);
 		m->dirty = 0;
 		m->oflags &= ~VPO_NOSYNC;
 	} else if (oldvalid != VM_PAGE_BITS_ALL)
 		/*
 		 * Partial range on a page that was not fully valid: the
 		 * dirty bits are cleared with a plain store.
 		 */
 		m->dirty &= ~pagebits;
 	else
 		/*
 		 * Partial range on a previously fully valid page: let
 		 * vm_page_clear_dirty_mask() choose how to clear the bits.
 		 */
 		vm_page_clear_dirty_mask(m, pagebits);
 }
 
 /*
  *	vm_page_clear_dirty:
  *
  *	Clears the dirty bits covering the byte range [base, base + size)
  *	of the page.  The range is converted to a bit mask by
  *	vm_page_bits() and cleared by vm_page_clear_dirty_mask().
  */
 void
 vm_page_clear_dirty(vm_page_t m, int base, int size)
 {
 
 	vm_page_clear_dirty_mask(m, vm_page_bits(base, size));
 }
 
 /*
  *	vm_page_set_invalid:
  *
  *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
  *	valid and dirty bits for the affected areas are cleared.
  *
  *	For a vnode object, a range that starts at the beginning of the
  *	page and reaches the recorded vnode size or beyond invalidates the
  *	whole page.
  *
  *	The page's object must be write locked.
  */
 void
 vm_page_set_invalid(vm_page_t m, int base, int size)
 {
 	vm_object_t obj;
 	vm_page_bits_t mask;
 
 	obj = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(obj);
 
 	/*
 	 * Select the bits to clear: either just the chunks covering the
 	 * range, or everything in the end-of-vnode case described above.
 	 */
 	if (obj->type != OBJT_VNODE || base != 0 ||
 	    IDX_TO_OFF(m->pindex) + size < obj->un_pager.vnp.vnp_size)
 		mask = vm_page_bits(base, size);
 	else
 		mask = VM_PAGE_BITS_ALL;
 
 	/*
 	 * A fully valid page may be mapped; remove the mappings before
 	 * any part of it becomes invalid.
 	 */
 	if (mask != 0 && m->valid == VM_PAGE_BITS_ALL)
 		pmap_remove_all(m);
 	KASSERT((mask == 0 && m->valid == VM_PAGE_BITS_ALL) ||
 	    !pmap_page_is_mapped(m),
 	    ("vm_page_set_invalid: page %p is mapped", m));
 
 	m->valid &= ~mask;
 	m->dirty &= ~mask;
 }
 
 /*
  * vm_page_zero_invalid()
  *
  *	The kernel assumes that the invalid portions of a page contain 
  *	garbage, but such pages can be mapped into memory by user code.
  *	When this occurs, we must zero out the non-valid portions of the
  *	page so user code sees what it expects.
  *
  *	Pages are most often semi-valid when the end of a file is mapped 
  *	into memory and the file's size is not page aligned.
  *
  *	If "setvalid" is TRUE the whole page is marked valid afterwards.
  *
  *	The page's object must be write locked.
  */
 void
 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 {
 	int b;
 	int i;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	/*
 	 * Scan the valid bits looking for invalid sections that
 	 * must be zeroed.  Invalid sub-DEV_BSIZE'd areas ( where the
 	 * valid bit may be set ) have already been zeroed by
 	 * vm_page_set_validclean().
 	 *
 	 * "b" is the index of the first block of the current run of
 	 * invalid blocks.  The extra iteration with
 	 * i == PAGE_SIZE / DEV_BSIZE acts as a terminator so that a run
 	 * reaching the end of the page is zeroed as well.
 	 */
 	for (b = i = 0; i <= PAGE_SIZE / DEV_BSIZE; ++i) {
 		if (i == (PAGE_SIZE / DEV_BSIZE) || 
 		    (m->valid & ((vm_page_bits_t)1 << i))) {
 			/* Blocks b .. i - 1 were invalid: zero them. */
 			if (i > b) {
 				pmap_zero_page_area(m, 
 				    b << DEV_BSHIFT, (i - b) << DEV_BSHIFT);
 			}
 			/* The next run can start after this valid block. */
 			b = i + 1;
 		}
 	}
 
 	/*
 	 * setvalid is TRUE when we can safely set the zeroed areas
 	 * as being valid.  We can do this if there are no cache consistency
 	 * issues.  e.g. it is ok to do with UFS, but not ok to do with NFS.
 	 */
 	if (setvalid)
 		m->valid = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_is_valid:
  *
  *	Reports whether every DEV_BSIZE chunk touched by the byte range
  *	[base, base + size) of the page is valid.  With size == 0 the
  *	result is FALSE only in the degenerate case of an entirely
  *	invalid page, and TRUE otherwise.
  *
  *	The page's object must be locked.
  */
 int
 vm_page_is_valid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t wanted;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	wanted = vm_page_bits(base, size);
 
 	/*
 	 * An entirely invalid page never qualifies, not even for an
 	 * empty range.
 	 */
 	if (m->valid == 0)
 		return (0);
 	return ((m->valid & wanted) == wanted);
 }
 
 /*
  *	vm_page_ps_is_valid:
  *
  *	Returns TRUE if the entire (super)page is valid and FALSE otherwise.
  *
  *	The page's object must be locked.
  */
 boolean_t
 vm_page_ps_is_valid(vm_page_t m)
 {
 	vm_page_t end, p;
 	int npages;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	npages = atop(pagesizes[m->psind]);
 
 	/*
 	 * The physically contiguous pages that make up a superpage, i.e., a
 	 * page with a page size index ("psind") greater than zero, will
 	 * occupy adjacent entries in vm_page_array[].  Walk them in order
 	 * and stop at the first one that is not fully valid.
 	 */
 	end = m + npages;
 	for (p = m; p < end; p++) {
 		if (p->valid != VM_PAGE_BITS_ALL)
 			return (FALSE);
 	}
 	return (TRUE);
 }
 
 /*
  * Set the page's dirty bits if the page is modified.
  *
  * The pmap is consulted only when the page is not already fully dirty.
  * The page's object must be write locked.
  */
 void
 vm_page_test_dirty(vm_page_t m)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * Every bit is already set; nothing more can be recorded.
 	 */
 	if (m->dirty == VM_PAGE_BITS_ALL)
 		return;
 	if (pmap_is_modified(m))
 		vm_page_dirty(m);
 }
 
 /*
  * Acquire the mutex returned by vm_page_lockptr() for the page.  "file"
  * and "line" are passed through to mtx_lock_flags_().
  */
 void
 vm_page_lock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 /*
  * Release the mutex returned by vm_page_lockptr() for the page.  "file"
  * and "line" are passed through to mtx_unlock_flags_().
  */
 void
 vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 /*
  * Try to acquire the mutex returned by vm_page_lockptr() for the page.
  * Returns the result of mtx_trylock_flags_() unchanged.
  */
 int
 vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
 }
 
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 /*
  * Assert that the page's lock is owned (MA_OWNED); forwards to
  * vm_page_lock_assert_KBI().
  */
 void
 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
 {
 
 	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
 }
 
 /*
  * Check assertion "a" against the mutex returned by vm_page_lockptr()
  * for the page.  "file" and "line" are passed through to mtx_assert_().
  */
 void
 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
 {
 
 	mtx_assert_(vm_page_lockptr(m), a, file, line);
 }
 #endif
 
 #ifdef INVARIANTS
 /*
  * Assert that the object containing the page is write locked, unless the
  * page has no object or is exclusive busied.
  */
 void
 vm_page_object_lock_assert(vm_page_t m)
 {
 
 	/*
 	 * Certain of the page's fields may only be modified by the
 	 * holder of the containing object's lock or the exclusive busy
 	 * holder.  Unfortunately, the holder of the write busy is
 	 * not recorded, and thus cannot be checked here.
 	 */
 	if (m->object == NULL || vm_page_xbusied(m))
 		return;
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 }
 #endif
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 /*
  * DDB "show page": print the global page counters and thresholds kept
  * in vm_cnt.
  */
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
 	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
 	db_printf("vm_cnt.v_cache_count: %d\n", vm_cnt.v_cache_count);
 	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
 	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
 	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
 	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
 	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
 	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
 	db_printf("vm_cnt.v_cache_min: %d\n", vm_cnt.v_cache_min);
 	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 }
 
 /*
  * DDB "show pageq": print the global free and cache page counts, then
  * one line per memory domain with its page count, free count, active
  * and inactive queue lengths, and vmd_pass value.
  */
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
 	int dom;
 
 	db_printf("pq_free %d pq_cache %d\n",
 	    vm_cnt.v_free_count, vm_cnt.v_cache_count);
 	/* One summary line for each of the vm_ndomains domains. */
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf(
 	"dom %d page_cnt %d free %d pq_act %d pq_inact %d pass %d\n",
 		    dom,
 		    vm_dom[dom].vmd_page_count,
 		    vm_dom[dom].vmd_free_count,
 		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pass);
 	}
 }
 
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 {
 	vm_page_t m;
 	boolean_t phys;
 
 	if (!have_addr) {
 		db_printf("show pginfo addr\n");
 		return;
 	}
 
 	phys = strchr(modif, 'p') != NULL;
 	if (phys)
 		m = PHYS_TO_VM_PAGE(addr);
 	else
 		m = (vm_page_t)addr;
 	db_printf(
     "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
     "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%x dirty 0x%x\n",
 	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
 	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
 	    m->flags, m->act_count, m->busy_lock, m->valid, m->dirty);
 }
 #endif /* DDB */