Index: stable/11/sys/kern/vfs_bio.c
===================================================================
--- stable/11/sys/kern/vfs_bio.c	(revision 323661)
+++ stable/11/sys/kern/vfs_bio.c	(revision 323662)
@@ -1,5003 +1,5001 @@
 /*-
  * Copyright (c) 2004 Poul-Henning Kamp
  * Copyright (c) 1994,1997 John S. Dyson
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * this file contains a new buffer I/O scheme implementing a coherent
  * VM object and buffer cache scheme.  Pains have been taken to make
  * sure that the performance degradation associated with schemes such
  * as this is not realized.
  *
  * Author:  John S. Dyson
  * Significant help during the development and debugging phases
  * had been provided by David Greenman, also of the FreeBSD core team.
  *
  * see man buf(9) for more info.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
 #include <sys/fail.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/watchdog.h>
 #include <geom/geom.h>
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/swap_pager.h>
 #include "opt_compat.h"
 #include "opt_swap.h"
 
 static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
 struct	buf_ops buf_ops_bio = {
 	.bop_name	=	"buf_ops_bio",
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
 	.bop_bdflush	=	bufbdflush,
 };
 
 static struct buf *buf;		/* buffer header pool */
 extern struct buf *swbuf;	/* Swap buffer header pool. */
 caddr_t unmapped_buf;
 
 /* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
 struct proc *bufdaemonproc;
 struct proc *bufspacedaemonproc;
 
 static int inmem(struct vnode *vp, daddr_t blkno);
 static void vm_hold_free_pages(struct buf *bp, int newbsize);
 static void vm_hold_load_pages(struct buf *bp, vm_offset_t from,
 		vm_offset_t to);
 static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
 static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
 		vm_page_t m);
 static void vfs_clean_pages_dirty_buf(struct buf *bp);
 static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_invalidate(struct buf *bp);
 static void vfs_vmio_truncate(struct buf *bp, int npages);
 static void vfs_vmio_extend(struct buf *bp, int npages, int size);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
 static int buf_flush(struct vnode *vp, int);
 static int buf_recycle(bool);
 static int buf_scan(bool);
 static int flushbufqueues(struct vnode *, int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 static __inline void bd_wakeup(void);
 static int sysctl_runningspace(SYSCTL_HANDLER_ARGS);
 static void bufkva_reclaim(vmem_t *, int);
 static void bufkva_free(struct buf *);
 static int buf_import(void *, void **, int, int);
 static void buf_release(void *, void **, int);
 static void maxbcachebuf_adjust(void);
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
 #endif
 
 int vmiodirenable = TRUE;
 SYSCTL_INT(_vfs, OID_AUTO, vmiodirenable, CTLFLAG_RW, &vmiodirenable, 0,
     "Use the VM system for directory writes");
 long runningbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0,
     "Amount of presently outstanding async buffer io");
 static long bufspace;
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 SYSCTL_PROC(_vfs, OID_AUTO, bufspace, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RD,
     &bufspace, 0, sysctl_bufspace, "L", "Virtual memory used for buffers");
 #else
 SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
     "Physical memory used for buffers");
 #endif
 static long bufkvaspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufkvaspace, CTLFLAG_RD, &bufkvaspace, 0,
     "Kernel virtual memory used for buffers");
 static long maxbufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, &maxbufspace, 0,
     "Maximum allowed value of bufspace (including metadata)");
 static long bufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0,
     "Amount of malloced memory for buffers");
 static long maxbufmallocspace;
 SYSCTL_LONG(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, &maxbufmallocspace,
     0, "Maximum amount of malloced memory for buffers");
 static long lobufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, lobufspace, CTLFLAG_RW, &lobufspace, 0,
     "Minimum amount of buffers we want to have");
 long hibufspace;
 SYSCTL_LONG(_vfs, OID_AUTO, hibufspace, CTLFLAG_RW, &hibufspace, 0,
     "Maximum allowed value of bufspace (excluding metadata)");
 long bufspacethresh;
 SYSCTL_LONG(_vfs, OID_AUTO, bufspacethresh, CTLFLAG_RW, &bufspacethresh,
     0, "Bufspace consumed before waking the daemon to free some");
 static int buffreekvacnt;
 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RW, &buffreekvacnt, 0,
     "Number of times we have freed the KVA space from some buffer");
 static int bufdefragcnt;
 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RW, &bufdefragcnt, 0,
     "Number of times we have had to repeat buffer allocation to defragment");
 static long lorunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, lorunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &lorunningspace, 0, sysctl_runningspace, "L",
     "Minimum preferred space used for in-progress I/O");
 static long hirunningspace;
 SYSCTL_PROC(_vfs, OID_AUTO, hirunningspace, CTLTYPE_LONG | CTLFLAG_MPSAFE |
     CTLFLAG_RW, &hirunningspace, 0, sysctl_runningspace, "L",
     "Maximum amount of space to use for in-progress I/O");
 int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
 int bdwriteskip;
 SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
     0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
 int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
 SYSCTL_INT(_vfs, OID_AUTO, recursiveflushes, CTLFLAG_RW, &recursiveflushes,
     0, "Number of flushes skipped due to being recursive");
 static int numdirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, &numdirtybuffers, 0,
     "Number of buffers that are dirty (has unwritten changes) at the moment");
 static int lodirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, &lodirtybuffers, 0,
     "How many buffers we want to have free before bufdaemon can sleep");
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
 int dirtybufthresh;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, &numfreebuffers, 0,
     "Number of free buffers");
 static int lofreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, &lofreebuffers, 0,
    "Target number of free buffers");
 static int hifreebuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, &hifreebuffers, 0,
    "Threshold for clean buffer recycling");
 static int getnewbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RW, &getnewbufcalls, 0,
    "Number of calls to getnewbuf");
 static int getnewbufrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
     "Number of times getnewbuf has had to restart a buffer acquisition");
 static int mappingrestarts;
 SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
     "Number of times getblk has had to restart a buffer mapping for "
     "unmapped buffer");
 static int numbufallocfails;
 SYSCTL_INT(_vfs, OID_AUTO, numbufallocfails, CTLFLAG_RW, &numbufallocfails, 0,
     "Number of times buffer allocations failed");
 static int flushbufqtarget = 100;
 SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
     "Amount of work to do in flushbufqueues when helping bufdaemon");
 static long notbufdflushes;
 SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
     "Number of dirty buffer flushes done by the bufdaemon helpers");
 static long barrierwrites;
 SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
     "Number of barrier writes");
 SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
     &unmapped_buf_allowed, 0,
     "Permit the use of the unmapped i/o");
 int maxbcachebuf = MAXBCACHEBUF;
 SYSCTL_INT(_vfs, OID_AUTO, maxbcachebuf, CTLFLAG_RDTUN, &maxbcachebuf, 0,
     "Maximum size of a buffer cache block");
 
 /*
  * This lock synchronizes access to bd_request.
  */
 static struct mtx_padalign bdlock;
 
 /*
  * This lock protects the runningbufreq and synchronizes runningbufwakeup and
  * waitrunningbufspace().
  */
 static struct mtx_padalign rbreqlock;
 
 /*
  * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
  */
 static struct rwlock_padalign nblock;
 
 /*
  * Lock that protects bdirtywait.
  */
 static struct mtx_padalign bdirtylock;
 
 /*
  * Wakeup point for bufdaemon, as well as indicator of whether it is already
  * active.  Set to 1 when the bufdaemon is already "on" the queue, 0 when it
  * is idling.
  */
 static int bd_request;
 
 /*
  * Request/wakeup point for the bufspace daemon.
  */
 static int bufspace_request;
 
 /*
  * Request for the buf daemon to write more buffers than is indicated by
  * lodirtybuffers.  This may be necessary to push out excess dependencies or
  * defragment the address space where a simple count of the number of dirty
  * buffers is insufficient to characterize the demand for flushing them.
  */
 static int bd_speedupreq;
 
 /*
  * bogus page -- for I/O to/from partially complete buffers
  * this is a temporary solution to the problem, but it is not
  * really that bad.  it would be better to split the buffer
  * for input in the case of buffers partially already in memory,
  * but the code is intricate enough already.
  */
 vm_page_t bogus_page;
 
 /*
  * Synchronization (sleep/wakeup) variable for active buffer space requests.
  * Set when wait starts, cleared prior to wakeup().
  * Used in runningbufwakeup() and waitrunningbufspace().
  */
 static int runningbufreq;
 
 /* 
  * Synchronization (sleep/wakeup) variable for buffer requests.
  * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
  * by and/or.
  * Used in numdirtywakeup(), bufspace_wakeup(), bwillwrite(),
  * getnewbuf(), and getblk().
  */
 static volatile int needsbuffer;
 
 /*
  * Synchronization for bwillwrite() waiters.
  */
 static int bdirtywait;
 
 /*
  * Definitions for the buffer free lists.
  */
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_EMPTY	1	/* empty buffer headers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
 #define QUEUE_CLEAN	3	/* non-B_DELWRI buffers */
 #define QUEUE_SENTINEL	1024	/* not a queue index, but mark for sentinel */
 
 /* Maximum number of clean buffer queues. */
 #define	CLEAN_QUEUES	16
 
 /* Configured number of clean queues. */
 static int clean_queues;
 
 /* Maximum number of buffer queues. */
 #define BUFFER_QUEUES	(QUEUE_CLEAN + CLEAN_QUEUES)
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
 #ifdef INVARIANTS
 static int bq_len[BUFFER_QUEUES];
 #endif
 
 /*
  * Lock for each bufqueue
  */
 static struct mtx_padalign bqlocks[BUFFER_QUEUES];
 
 /*
  * per-cpu empty buffer cache.
  */
 uma_zone_t buf_zone;
 
 /*
  * Single global constant for BUF_WMESG, to avoid getting multiple references.
  * buf_wmesg is referred from macros.
  */
 const char *buf_wmesg = BUF_WMESG;
 
 /*
  * Sysctl handler shared by vfs.lorunningspace and vfs.hirunningspace.
  *
  * A read reports the current value.  A write is accepted only when it
  * keeps lorunningspace <= hirunningspace; otherwise EINVAL is returned
  * and the variable is left unchanged.  Updates are made under rbreqlock.
  */
 static int
 sysctl_runningspace(SYSCTL_HANDLER_ARGS)
 {
 	long newval;
 	int rv;
 
 	newval = *(long *)arg1;
 	rv = sysctl_handle_long(oidp, &newval, 0, req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 
 	mtx_lock(&rbreqlock);
 	if (arg1 != &hirunningspace) {
 		KASSERT(arg1 == &lorunningspace,
 		    ("%s: unknown arg1", __func__));
 		/* The low mark may not rise above the high mark. */
 		if (newval <= hirunningspace)
 			lorunningspace = newval;
 		else
 			rv = EINVAL;
 	} else {
 		/* The high mark may not drop below the low mark. */
 		if (newval >= lorunningspace)
 			hirunningspace = newval;
 		else
 			rv = EINVAL;
 	}
 	mtx_unlock(&rbreqlock);
 	return (rv);
 }
 
 #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
 /*
  * Compatibility handler for vfs.bufspace.  Callers that supply a buffer
  * smaller than a long are served an int when the value fits.
  */
 static int
 sysctl_bufspace(SYSCTL_HANDLER_ARGS)
 {
 	long lvalue;
 	int ivalue;
 
 	/* Same width, or the caller has room for a long: pass it through. */
 	if (sizeof(int) == sizeof(long) || req->oldlen >= sizeof(long))
 		return (sysctl_handle_long(oidp, arg1, arg2, req));
 
 	lvalue = *(long *)arg1;
 	if (lvalue <= INT_MAX) {
 		ivalue = lvalue;
 		return (sysctl_handle_int(oidp, &ivalue, 0, req));
 	}
 
 	/* On overflow, still write out a long to trigger ENOMEM. */
 	return (sysctl_handle_long(oidp, &lvalue, 0, req));
 }
 #endif
 
 /*
  * Pick a clean queue index, rotating through the configured clean
  * queues with an atomically incremented counter.
  */
 static int
 bqcleanq(void)
 {
 	static int nextq;
 	int slot;
 
 	slot = atomic_fetchadd_int(&nextq, 1) % clean_queues;
 	return (slot + QUEUE_CLEAN);
 }
 
 /*
  * Return non-zero if qindex falls within the range of clean queue
  * indices, [QUEUE_CLEAN, QUEUE_CLEAN + CLEAN_QUEUES).
  */
 static int
 bqisclean(int qindex)
 {
 
 	if (qindex < QUEUE_CLEAN)
 		return (0);
 	return (qindex < QUEUE_CLEAN + CLEAN_QUEUES);
 }
 
 /*
  *	bqlock:
  *
  *	Map a buffer queue index to the mutex guarding that queue.
  */
 static inline struct mtx *
 bqlock(int qindex)
 {
 	struct mtx_padalign *padded;
 
 	padded = &bqlocks[qindex];
 	return ((struct mtx *)padded);
 }
 
 /*
  *	bdirtywakeup:
  *
  *	If any thread has flagged itself in bdirtywait, clear the flag
  *	and wake the sleepers.  Done under bdirtylock.
  */
 static void
 bdirtywakeup(void)
 {
 	mtx_lock(&bdirtylock);
 	if (bdirtywait == 0) {
 		/* Nobody is waiting. */
 		mtx_unlock(&bdirtylock);
 		return;
 	}
 	bdirtywait = 0;
 	wakeup(&bdirtywait);
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  *	bdirtysub:
  *
  *	Drop numdirtybuffers by one.  When the count leaves the midpoint
  *	between lodirtybuffers and hidirtybuffers, wake any threads
  *	blocked in bwillwrite().
  */
 static void
 bdirtysub(void)
 {
 	int before;
 
 	before = atomic_fetchadd_int(&numdirtybuffers, -1);
 	if (before == (lodirtybuffers + hidirtybuffers) / 2)
 		bdirtywakeup();
 }
 
 /*
  *	bdirtyadd:
  *
  *	Bump numdirtybuffers by one and, if needed, kick the buf daemon.
  */
 static void
 bdirtyadd(void)
 {
 	int before;
 
 	/*
 	 * The wakeup is issued only at the moment the count passes the
 	 * midpoint between lodirtybuffers and hidirtybuffers; once
 	 * running, the buf daemon keeps going until the condition clears.
 	 */
 	before = atomic_fetchadd_int(&numdirtybuffers, 1);
 	if (before == (lodirtybuffers + hidirtybuffers) / 2)
 		bd_wakeup();
 }
 
 /*
  *	bufspace_wakeup:
  *
  *	Called when buffer space may have become recoverable, which
  *	happens when bp's are placed back on the queues.  getnewbuf()
  *	blocks on needsbuffer when it cannot free enough buffer space;
  *	this routine clears the flag and wakes those sleepers.
  */
 static void
 bufspace_wakeup(void)
 {
 
 	/*
 	 * Unlocked fast path: nobody is waiting for bufspace.
 	 *
 	 * Testing the flag before taking the lock is safe because
 	 * needsbuffer is set before a waiter performs its additional
 	 * queue scan.  A thread that is about to scan again before
 	 * blocking would discover the buf we released.
 	 */
 	if (needsbuffer == 0)
 		return;
 
 	rw_rlock(&nblock);
 	if (atomic_cmpset_int(&needsbuffer, 1, 0) == 1)
 		wakeup(__DEVOLATILE(void *, &needsbuffer));
 	rw_runlock(&nblock);
 }
 
 /*
  *	bufspace_daemonwakeup:
  *
  *	Post a request to the daemon that frees clean bufs and wake it,
  *	unless a request is already pending.
  */
 static void
 bufspace_daemonwakeup(void)
 {
 	rw_rlock(&nblock);
 	if (bufspace_request != 0) {
 		/* A request is already outstanding. */
 		rw_runlock(&nblock);
 		return;
 	}
 	bufspace_request = 1;
 	wakeup(&bufspace_request);
 	rw_runlock(&nblock);
 }
 
 /*
  *	bufspace_adjust:
  *
  *	Adjust the reported bufspace for a KVA managed buffer, possibly
  * 	waking any waiters.
  */
 static void
 bufspace_adjust(struct buf *bp, int bufsize)
 {
 	long space;
 	int diff;
 
 	KASSERT((bp->b_flags & B_MALLOC) == 0,
 	    ("bufspace_adjust: malloc buf %p", bp));
 	diff = bufsize - bp->b_bufsize;
 	if (diff < 0) {
 		atomic_subtract_long(&bufspace, -diff);
 		bufspace_wakeup();
 	} else {
 		space = atomic_fetchadd_long(&bufspace, diff);
 		/* Wake up the daemon on the transition. */
 		if (space < bufspacethresh && space + diff >= bufspacethresh)
 			bufspace_daemonwakeup();
 	}
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	bufspace_reserve:
  *
  *	Reserve bufspace before calling allocbuf().  Metadata is limited
  *	by maxbufspace, everything else by hibufspace.  Returns 0 on
  *	success, or ENOSPC if adding size would exceed the limit.
  */
 static int
 bufspace_reserve(int size, bool metadata)
 {
 	long limit;
 	long space;
 
 	limit = metadata ? maxbufspace : hibufspace;
 
 	/* Retry until the compare-and-set of bufspace succeeds. */
 	for (;;) {
 		space = bufspace;
 		if (space + size > limit)
 			return (ENOSPC);
 		if (atomic_cmpset_long(&bufspace, space, space + size) != 0)
 			break;
 	}
 
 	/* Wake up the daemon on the transition. */
 	if (space < bufspacethresh && space + size >= bufspacethresh)
 		bufspace_daemonwakeup();
 
 	return (0);
 }
 
 /*
  *	bufspace_release:
  *
  *	Release reserved bufspace after bufspace_adjust() has consumed it.
  *	Subtracts size from the global bufspace counter and then wakes any
  *	thread waiting for buffer space.
  */
 static void
 bufspace_release(int size)
 {
 	atomic_subtract_long(&bufspace, size);
 	bufspace_wakeup();
 }
 
 /*
  *	bufspace_wait:
  *
  *	Wait for bufspace, acting as the buf daemon if a locked vnode is
  *	supplied.  needsbuffer must be set in a safe fashion prior to
  *	polling for space.  The operation must be re-tried on return.
  *
  *	vp may be NULL.  If gbflags contains GB_NOWAIT_BD the function
  *	returns immediately without waiting.  slpflag is or'ed into the
  *	sleep priority and slptimeo is the sleep timeout; both are passed
  *	to rw_sleep().
  */
 static void
 bufspace_wait(struct vnode *vp, int gbflags, int slpflag, int slptimeo)
 {
 	struct thread *td;
 	int error, fl, norunbuf;
 
 	if ((gbflags & GB_NOWAIT_BD) != 0)
 		return;
 
 	td = curthread;
 	rw_wlock(&nblock);
 	while (needsbuffer != 0) {
 		/*
 		 * TDP_BUFNEED marks a thread that is already flushing on
 		 * this path, so the flush below is not entered recursively.
 		 */
 		if (vp != NULL && vp->v_type != VCHR &&
 		    (td->td_pflags & TDP_BUFNEED) == 0) {
 			rw_wunlock(&nblock);
 			/*
 			 * getblk() is called with a vnode locked, and
 			 * some majority of the dirty buffers may as
 			 * well belong to the vnode.  Flushing the
 			 * buffers there would make a progress that
 			 * cannot be achieved by the buf_daemon, that
 			 * cannot lock the vnode.
 			 */
 			/*
 			 * Mask used to restore td_pflags afterwards: it
 			 * clears TDP_BUFNEED, and clears TDP_NORUNNINGBUF
 			 * unless that flag was already set on entry.
 			 */
 			norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
 			    (td->td_pflags & TDP_NORUNNINGBUF);
 
 			/*
 			 * Play bufdaemon.  The getnewbuf() function
 			 * may be called while the thread owns lock
 			 * for another dirty buffer for the same
 			 * vnode, which makes it impossible to use
 			 * VOP_FSYNC() there, due to the buffer lock
 			 * recursion.
 			 */
 			td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
 			fl = buf_flush(vp, flushbufqtarget);
 			td->td_pflags &= norunbuf;
 			rw_wlock(&nblock);
 			/* Something was flushed: re-check without sleeping. */
 			if (fl != 0)
 				continue;
 			if (needsbuffer == 0)
 				break;
 		}
 		error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
 		    (PRIBIO + 4) | slpflag, "newbuf", slptimeo);
 		/* A non-zero return from rw_sleep() ends the wait. */
 		if (error != 0)
 			break;
 	}
 	rw_wunlock(&nblock);
 }
 
 
 /*
  *	bufspace_daemon:
  *
  *	buffer space management daemon.  Tries to maintain some marginal
  *	amount of free buffer space so that requesting processes neither
  *	block nor work to reclaim buffers.
  *
  *	Runs forever as the body of the bufspacedaemon kernel process.
  */
 static void
 bufspace_daemon(void)
 {
 	for (;;) {
 		kproc_suspend_check(bufspacedaemonproc);
 
 		/*
 		 * Free buffers from the clean queue until we meet our
 		 * targets.
 		 *
 		 * Theory of operation:  The buffer cache is most efficient
 		 * when some free buffer headers and space are always
 		 * available to getnewbuf().  This daemon attempts to prevent
 		 * the excessive blocking and synchronization associated
 		 * with shortfall.  It goes through three phases according
 		 * to demand:
 		 *
 		 * 1)	The daemon wakes up voluntarily once per-second
 		 *	during idle periods when the counters are below
 		 *	the wakeup thresholds (bufspacethresh, lofreebuffers).
 		 *
 		 * 2)	The daemon wakes up as we cross the thresholds
 		 *	ahead of any potential blocking.  This may bounce
 		 *	slightly according to the rate of consumption and
 		 *	release.
 		 *
 		 * 3)	The daemon and consumers are starved for working
 		 *	clean buffers.  This is the 'bufspace' sleep below
 		 *	which will inefficiently trade bufs with bqrelse
 		 *	until we return to condition 2.
 		 */
 		while (bufspace > lobufspace ||
 		    numfreebuffers < hifreebuffers) {
 			if (buf_recycle(false) != 0) {
 				/*
 				 * Nothing could be recycled.  Set
 				 * needsbuffer before trying once more, then
 				 * sleep (for at most hz/10 ticks) only if
 				 * the flag is still set.
 				 */
 				atomic_set_int(&needsbuffer, 1);
 				if (buf_recycle(false) != 0) {
 					rw_wlock(&nblock);
 					if (needsbuffer)
 						rw_sleep(__DEVOLATILE(void *,
 						    &needsbuffer), &nblock,
 						    PRIBIO|PDROP, "bufspace",
 						    hz/10);
 					else
 						rw_wunlock(&nblock);
 				}
 			}
 			maybe_yield();
 		}
 
 		/*
 		 * Re-check our limits under the exclusive nblock.
 		 * If they are still met, clear the request flag and sleep
 		 * for up to hz ticks; PDROP means nblock is not held when
 		 * rw_sleep() returns.
 		 */
 		rw_wlock(&nblock);
 		if (bufspace < bufspacethresh &&
 		    numfreebuffers > lofreebuffers) {
 			bufspace_request = 0;
 			rw_sleep(&bufspace_request, &nblock, PRIBIO|PDROP,
 			    "-", hz);
 		} else
 			rw_wunlock(&nblock);
 	}
 }
 
 /*
  * Start-up descriptor for the bufspace daemon: its process name, its
  * entry point, and the address of its struct proc pointer.  It is handed
  * to kproc_start() through SYSINIT at SI_SUB_KTHREAD_BUF.
  */
 static struct kproc_desc bufspace_kp = {
 	"bufspacedaemon",
 	bufspace_daemon,
 	&bufspacedaemonproc
 };
 SYSINIT(bufspacedaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start,
     &bufspace_kp);
 
 /*
  *	bufmallocadjust:
  *
  *	Resize the accounting for a malloc managed buffer: apply the
  *	change from bp->b_bufsize to bufsize to bufmallocspace and record
  *	the new size in the buffer.
  */
 static void
 bufmallocadjust(struct buf *bp, int bufsize)
 {
 	int delta;
 
 	KASSERT((bp->b_flags & B_MALLOC) != 0,
 	    ("bufmallocadjust: non-malloc buf %p", bp));
 	delta = bufsize - bp->b_bufsize;
 	if (delta >= 0)
 		atomic_add_long(&bufmallocspace, delta);
 	else
 		atomic_subtract_long(&bufmallocspace, -delta);
 	bp->b_bufsize = bufsize;
 }
 
 /*
  *	runningwakeup:
  *
  *	Wake up processes that are waiting on asynchronous writes to fall
  *	below lorunningspace.  Nothing is done unless a waiter has set
  *	runningbufreq.
  */
 static void
 runningwakeup(void)
 {
 
 	mtx_lock(&rbreqlock);
 	if (runningbufreq == 0) {
 		/* No one is sleeping on runningbufreq. */
 		mtx_unlock(&rbreqlock);
 		return;
 	}
 	runningbufreq = 0;
 	wakeup(&runningbufreq);
 	mtx_unlock(&rbreqlock);
 }
 
 /*
  *	runningbufwakeup:
  *
  *	Subtract the buffer's b_runningbufspace from the global
  *	runningbufspace count, zero it in the buffer, and wake waiters
  *	when the count drops to or below lorunningspace.
  */
 void
 runningbufwakeup(struct buf *bp)
 {
 	long before, released;
 
 	released = bp->b_runningbufspace;
 	if (released == 0)
 		return;
 	before = atomic_fetchadd_long(&runningbufspace, -released);
 	KASSERT(before >= released, ("runningbufspace underflow %ld %ld",
 	    before, released));
 	bp->b_runningbufspace = 0;
 	/*
 	 * Only acquire the lock and wakeup on the transition from exceeding
 	 * the threshold to falling below it.
 	 */
 	if (before >= lorunningspace && before - released <= lorunningspace)
 		runningwakeup();
 }
 
 /*
  *	waitrunningbufspace()
  *
  *	runningbufspace measures the amount of I/O currently in flight.
  *	This routine is used in async-write situations to keep huge
  *	backlogs of pending writes from building up against a device;
  *	only asynchronous writes are governed by it.
  *
  *	This does NOT turn an async write into a sync write.  It waits
  *	for earlier writes to complete and generally returns before the
  *	caller's write has reached the device.
  */
 void
 waitrunningbufspace(void)
 {
 
 	mtx_lock(&rbreqlock);
 	for (;;) {
 		if (runningbufspace <= hirunningspace)
 			break;
 		/* Flag that a waiter exists, then sleep until woken. */
 		runningbufreq = 1;
 		msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
 	}
 	mtx_unlock(&rbreqlock);
 }
 
 
 /*
  *	vfs_buf_test_cache:
  *
  *	Called when a buffer is extended.  If the buffer is marked
  *	B_CACHE but the newly extended portion does not hold valid data
  *	in page m, the B_CACHE bit is cleared.
  */
 static __inline void
 vfs_buf_test_cache(struct buf *bp, vm_ooffset_t foff, vm_offset_t off,
     vm_offset_t size, vm_page_t m)
 {
 	int base;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	if ((bp->b_flags & B_CACHE) == 0)
 		return;
 
 	/* Offset of the extended region within its page. */
 	base = (foff + off) & PAGE_MASK;
 	if (vm_page_is_valid(m, base, size) == 0)
 		bp->b_flags &= ~B_CACHE;
 }
 
 /*
  * Post a request to the buffer daemon and wake it, unless a request is
  * already pending.
  */
 static __inline void
 bd_wakeup(void)
 {
 
 	mtx_lock(&bdlock);
 	if (bd_request != 0) {
 		/* The daemon has already been asked to run. */
 		mtx_unlock(&bdlock);
 		return;
 	}
 	bd_request = 1;
 	wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 /*
  * Adjust the maxbcachebuf tunable.
  *
  * Rounds maxbcachebuf down to a power of 2, raises it to at least
  * MAXBSIZE and then caps it at MAXPHYS.  When booting verbosely, the
  * final value is printed if it differs from the compiled-in MAXBCACHEBUF.
  */
 static void
 maxbcachebuf_adjust(void)
 {
 	int i;
 
 	/*
 	 * maxbcachebuf must be a power of 2 >= MAXBSIZE.
 	 *
 	 * Find the largest power of 2 not exceeding maxbcachebuf (minimum
 	 * 2).  The test compares against maxbcachebuf / 2 instead of
 	 * doubling i: the tunable can be set as high as INT_MAX, and
 	 * "i * 2" would then overflow a signed int.
 	 */
 	i = 2;
 	while (i <= maxbcachebuf / 2)
 		i *= 2;
 	maxbcachebuf = i;
 	if (maxbcachebuf < MAXBSIZE)
 		maxbcachebuf = MAXBSIZE;
 	if (maxbcachebuf > MAXPHYS)
 		maxbcachebuf = MAXPHYS;
 	if (bootverbose != 0 && maxbcachebuf != MAXBCACHEBUF)
 		printf("maxbcachebuf=%d\n", maxbcachebuf);
 }
 
 /*
  * bd_speedup - ask the buffer daemon to flush more aggressively
  *
  * Sets both bd_speedupreq and bd_request.  The daemon is woken only if
  * at least one of the two was not already set.
  */
 void
 bd_speedup(void)
 {
 	int wake;
 
 	mtx_lock(&bdlock);
 	wake = (bd_speedupreq == 0 || bd_request == 0);
 	bd_speedupreq = 1;
 	bd_request = 1;
 	if (wake)
 		wakeup(&bd_request);
 	mtx_unlock(&bdlock);
 }
 
 /* Lower bound applied to nswbuf; may be overridden at build time. */
 #ifndef NSWBUF_MIN
 #define	NSWBUF_MIN	16
 #endif
 
 /*
  * Denominator of the fraction of buffer map KVA given to the transient
  * bio map: 1/5 on i386, 1/10 elsewhere.
  */
 #ifdef __i386__
 #define	TRANSIENT_DENOM	5
 #else
 #define	TRANSIENT_DENOM 10
 #endif
 
 /*
  * Calculating buffer cache scaling values and reserve space for buffer
  * headers.  This is called during low level kernel initialization and
  * may be called more than once.  We CANNOT write to the memory area
  * being reserved at this time.
  *
  * v is the start of the area being reserved and physmem_est is the
  * estimated amount of physical memory, in pages.  The swap buffer
  * headers (swbuf) are placed at v, followed by the buffer headers (buf);
  * the address just past them is returned.
  */
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
 	int tuned_nbuf;
 	long maxbuf, maxbuf_sz, buf_sz,	biotmap_sz;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
 	 * PAGE_SIZE is >= 1K)
 	 */
 	physmem_est = physmem_est * (PAGE_SIZE / 1024);
 
 	maxbcachebuf_adjust();
 	/*
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
 	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
 	 * factor represents the 1/4 x ram conversion.
 	 *
 	 * tuned_nbuf records whether nbuf was auto-sized here (1) or was
 	 * already non-zero on entry (0).
 	 */
 	if (nbuf == 0) {
 		int factor = 4 * BKVASIZE / 1024;
 
 		nbuf = 50;
 		if (physmem_est > 4096)
 			nbuf += min((physmem_est - 4096) / factor,
 			    65536 / factor);
 		if (physmem_est > 65536)
 			nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
 			    32 * 1024 * 1024 / (factor * 5));
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
 		tuned_nbuf = 1;
 	} else
 		tuned_nbuf = 0;
 
 	/* XXX Avoid unsigned long overflows later on with maxbufspace. */
 	maxbuf = (LONG_MAX / 3) / BKVASIZE;
 	if (nbuf > maxbuf) {
 		/* Only warn when the value being lowered was not auto-sized. */
 		if (!tuned_nbuf)
 			printf("Warning: nbufs lowered from %d to %ld\n", nbuf,
 			    maxbuf);
 		nbuf = maxbuf;
 	}
 
 	/*
 	 * Ideal allocation size for the transient bio submap is 10%
 	 * of the maximal space buffer map.  This roughly corresponds
 	 * to the amount of the buffer mapped for typical UFS load.
 	 *
 	 * Clip the buffer map to reserve space for the transient
 	 * BIOs, if its extent is bigger than 90% (80% on i386) of the
 	 * maximum buffer map extent on the platform.
 	 *
 	 * The fall-back to the maxbuf in case of maxbcache unset,
 	 * allows to not trim the buffer KVA for the architectures
 	 * with ample KVA space.
 	 */
 	if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
 		maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
 		buf_sz = (long)nbuf * BKVASIZE;
 		if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
 		    (TRANSIENT_DENOM - 1)) {
 			/*
 			 * There is more KVA than memory.  Do not
 			 * adjust buffer map size, and assign the rest
 			 * of maxbuf to transient map.
 			 */
 			biotmap_sz = maxbuf_sz - buf_sz;
 		} else {
 			/*
 			 * Buffer map spans all KVA we could afford on
 			 * this platform.  Give 10% (20% on i386) of
 			 * the buffer map to the transient bio map.
 			 */
 			biotmap_sz = buf_sz / TRANSIENT_DENOM;
 			buf_sz -= biotmap_sz;
 		}
 		/*
 		 * The count is biotmap_sz / MAXPHYS, clamped to INT_MAX
 		 * (presumably because bio_transient_maxcnt is an int).
 		 */
 		if (biotmap_sz / INT_MAX > MAXPHYS)
 			bio_transient_maxcnt = INT_MAX;
 		else
 			bio_transient_maxcnt = biotmap_sz / MAXPHYS;
 		/*
 		 * Artificially limit to 1024 simultaneous in-flight I/Os
 		 * using the transient mapping.
 		 */
 		if (bio_transient_maxcnt > 1024)
 			bio_transient_maxcnt = 1024;
 		/* Shrink an auto-sized nbuf to match the clipped buffer map. */
 		if (tuned_nbuf)
 			nbuf = buf_sz / BKVASIZE;
 	}
 
 	/*
 	 * swbufs are used as temporary holders for I/O, such as paging I/O.
 	 * The default is nbuf / 4, capped at 256; the kern.nswbuf tunable
 	 * may replace that value.  We never have fewer than NSWBUF_MIN.
 	 */
 	nswbuf = min(nbuf / 4, 256);
 	TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
 	if (nswbuf < NSWBUF_MIN)
 		nswbuf = NSWBUF_MIN;
 
 	/*
 	 * Reserve space for the buffer cache buffers
 	 */
 	swbuf = (void *)v;
 	v = (caddr_t)(swbuf + nswbuf);
 	buf = (void *)v;
 	v = (caddr_t)(buf + nbuf);
 
 	return(v);
 }
 
 /*
  * bufinit:
  *
  *	Initialize the buffer subsystem.  Called before use of any buffers.
  *
  *	Sets up the queue and daemon locks, places every header of buf[] on
  *	QUEUE_EMPTY, derives the buffer-space, running-space, dirty-buffer
  *	and free-buffer thresholds from nbuf, and creates the buf zone cache
  *	that buf_alloc() and buf_free() allocate from and release to.
  */
 void
 bufinit(void)
 {
 	struct buf *bp;
 	int i;
 
 	KASSERT(maxbcachebuf >= MAXBSIZE,
 	    ("maxbcachebuf (%d) must be >= MAXBSIZE (%d)\n", maxbcachebuf,
 	    MAXBSIZE));
 	/* One mutex per buffer queue; all clean queues share a lock name. */
 	mtx_init(&bqlocks[QUEUE_DIRTY], "bufq dirty lock", NULL, MTX_DEF);
 	mtx_init(&bqlocks[QUEUE_EMPTY], "bufq empty lock", NULL, MTX_DEF);
 	for (i = QUEUE_CLEAN; i < QUEUE_CLEAN + CLEAN_QUEUES; i++)
 		mtx_init(&bqlocks[i], "bufq clean lock", NULL, MTX_DEF);
 	mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
 	rw_init(&nblock, "needsbuffer lock");
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
 		TAILQ_INIT(&bufqueues[i]);
 
 	/*
 	 * unmapped_buf is the placeholder stored in b_data and b_kvabase of
 	 * every buffer that owns no KVA of its own.
 	 */
 	unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
 
 	/* finally, initialize each buffer header and stick on empty q */
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		bzero(bp, sizeof *bp);
 		bp->b_flags = B_INVAL;
 		bp->b_rcred = NOCRED;
 		bp->b_wcred = NOCRED;
 		bp->b_qindex = QUEUE_EMPTY;
 		bp->b_xflags = 0;
 		bp->b_data = bp->b_kvabase = unmapped_buf;
 		LIST_INIT(&bp->b_dep);
 		BUF_LOCKINIT(bp);
 		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
 #ifdef INVARIANTS
 		bq_len[QUEUE_EMPTY]++;
 #endif
 	}
 
 	/*
 	 * maxbufspace is the absolute maximum amount of buffer space we are
 	 * allowed to reserve in KVM and in real terms.  The absolute maximum
 	 * is nominally used by metadata.  hibufspace is the nominal maximum
 	 * used by most other requests.  The differential is required to
 	 * ensure that metadata deadlocks don't occur.
 	 *
 	 * maxbufspace is based on BKVASIZE.  Allocating buffers larger than
 	 * this may result in KVM fragmentation which is not handled optimally
 	 * by the system. XXX This is less true with vmem.  We could use
 	 * PAGE_SIZE.
 	 *
 	 * lobufspace is 95% of hibufspace and bufspacethresh sits halfway
 	 * between the two.
 	 */
 	maxbufspace = (long)nbuf * BKVASIZE;
 	hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - maxbcachebuf * 10);
 	lobufspace = (hibufspace / 20) * 19; /* 95% */
 	bufspacethresh = lobufspace + (hibufspace - lobufspace) / 2;
 
 	/*
 	 * Note: The 16 MiB upper limit for hirunningspace was chosen
 	 * arbitrarily and may need further tuning. It corresponds to
 	 * 128 outstanding write IO requests (if IO size is 128 KiB),
 	 * which fits with many RAID controllers' tagged queuing limits.
 	 * The lower 1 MiB limit is the historical upper limit for
 	 * hirunningspace.
 	 *
 	 * lorunningspace is two thirds of hirunningspace, rounded up to a
 	 * multiple of maxbcachebuf.
 	 */
 	hirunningspace = lmax(lmin(roundup(hibufspace / 64, maxbcachebuf),
 	    16 * 1024 * 1024), 1024 * 1024);
 	lorunningspace = roundup((hirunningspace * 2) / 3, maxbcachebuf);
 
 	/*
 	 * Limit the amount of malloc memory since it is wired permanently into
 	 * the kernel space.  Even though this is accounted for in the buffer
 	 * allocation, we don't want the malloced region to grow uncontrolled.
 	 * The malloc scheme improves memory utilization significantly on
 	 * average (small) directories.
 	 */
 	maxbufmallocspace = hibufspace / 20;
 
 	/*
 	 * Reduce the chance of a deadlock occurring by limiting the number
 	 * of delayed-write dirty buffers we allow to stack up.
 	 *
 	 * NOTE(review): dirtybufthresh is derived from hidirtybuffers before
 	 * the loop below may halve hidirtybuffers, and it is not recomputed
 	 * afterwards -- confirm this is intended for low-memory systems.
 	 */
 	hidirtybuffers = nbuf / 4 + 20;
 	dirtybufthresh = hidirtybuffers * 9 / 10;
 	numdirtybuffers = 0;
 	/*
 	 * To support extreme low-memory systems, make sure hidirtybuffers
 	 * cannot eat up all available buffer space.  This occurs when our
 	 * minimum cannot be met.  We try to size hidirtybuffers to 3/4 our
 	 * buffer space assuming BKVASIZE'd buffers.
 	 */
 	while ((long)hidirtybuffers * BKVASIZE > 3 * hibufspace / 4) {
 		hidirtybuffers >>= 1;
 	}
 	lodirtybuffers = hidirtybuffers / 2;
 
 	/*
 	 * lofreebuffers should be sufficient to avoid stalling waiting on
 	 * buf headers under heavy utilization.  The bufs in per-cpu caches
 	 * are counted as free but will be unavailable to threads executing
 	 * on other cpus.
 	 *
 	 * hifreebuffers is the free target for the bufspace daemon.  This
 	 * should be set appropriately to limit work per-iteration.
 	 */
 	lofreebuffers = MIN((nbuf / 25) + (20 * mp_ncpus), 128 * mp_ncpus);
 	hifreebuffers = (3 * lofreebuffers) / 2;
 	numfreebuffers = nbuf;
 
 	/* NOTE(review): the result of vm_page_alloc() is not checked here. */
 	bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
 
 	/* Setup the kva and free list allocators. */
 	vmem_set_reclaim(buffer_arena, bufkva_reclaim);
 	buf_zone = uma_zcache_create("buf free cache", sizeof(struct buf),
 	    NULL, NULL, NULL, NULL, buf_import, buf_release, NULL, 0);
 
 	/*
 	 * Size the clean queue according to the amount of buffer space.
 	 * One queue per-256mb up to the max.  More queues gives better
 	 * concurrency but less accurate LRU.
 	 */
 	clean_queues = MIN(howmany(maxbufspace, 256*1024*1024), CLEAN_QUEUES);
 
 }
 
 #ifdef INVARIANTS
 /*
  * Sanity check for a buffer that is expected to be mapped: neither
  * b_kvabase nor b_data may still hold the unmapped_buf placeholder, and
  * b_data must not point into the MAXPHYS bytes starting at unmapped_buf.
  */
 static inline void
 vfs_buf_check_mapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_kvabase != unmapped_buf,
 	    ("mapped buf: b_kvabase was not updated %p", bp));
 	KASSERT(bp->b_data != unmapped_buf,
 	    ("mapped buf: b_data was not updated %p", bp));
 	KASSERT(!(bp->b_data >= unmapped_buf &&
 	    bp->b_data < unmapped_buf + MAXPHYS),
 	    ("b_data + b_offset unmapped %p", bp));
 }
 
 /*
  * Sanity check for a buffer that is expected to be unmapped: b_data must
  * be exactly the unmapped_buf placeholder.
  */
 static inline void
 vfs_buf_check_unmapped(struct buf *bp)
 {
 
 	KASSERT(bp->b_data == unmapped_buf,
 	    ("unmapped buf: corrupted b_data %p", bp));
 }
 
 /* The checks are compiled in only for INVARIANTS kernels. */
 #define	BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
 #define	BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
 #else
 #define	BUF_CHECK_MAPPED(bp) do {} while (0)
 #define	BUF_CHECK_UNMAPPED(bp) do {} while (0)
 #endif
 
 /*
  * Return 1 when the buffer still counts as busy for shutdown purposes:
  * it is not invalid and is either locked or a pending delayed write.
  * Return 0 otherwise.
  */
 static int
 isbufbusy(struct buf *bp)
 {
 
 	/* Invalid buffers are never considered busy. */
 	if ((bp->b_flags & B_INVAL) != 0)
 		return (0);
 	if (BUF_ISLOCKED(bp))
 		return (1);
 	return ((bp->b_flags & B_DELWRI) == B_DELWRI ? 1 : 0);
 }
 
 /*
  * bufshutdown:
  *
  *	Shutdown the system cleanly to prepare for reboot, halt, or power off.
  *
  *	The filesystems are synced and the buffer array is then polled
  *	repeatedly; the loop gives up after 20 consecutive passes that make
  *	no progress (the pass counter restarts whenever the number of busy
  *	buffers drops).  If busy buffers remain afterwards the filesystems
  *	are left mounted, otherwise they are unmounted unless the system
  *	has panicked.  Swap is turned off in either case.
  *
  *	show_busybufs > 0 prints every buffer that is still busy after the
  *	sync loop; show_busybufs > 1 additionally prints its vnode.
  */
 void
 bufshutdown(int show_busybufs)
 {
 	static int first_buf_printf = 1;
 	struct buf *bp;
 	int i, iter, nbusy, pbusy;
 #ifndef PREEMPTION
 	int subiter;
 #endif
 
 	/*
 	 * Sync filesystems for shutdown
 	 */
 	wdog_kern_pat(WD_LASTVAL);
 	sys_sync(curthread, NULL);
 
 	/*
 	 * With soft updates, some buffers that are
 	 * written will be remarked as dirty until other
 	 * buffers are written.
 	 */
 	for (iter = pbusy = 0; iter < 20; iter++) {
 		nbusy = 0;
 		/*
 		 * Walk buf[] from the last header down to the first by
 		 * index, so that no pointer below the start of the array
 		 * is ever formed.
 		 */
 		for (i = nbuf - 1; i >= 0; i--)
 			if (isbufbusy(&buf[i]))
 				nbusy++;
 		if (nbusy == 0) {
 			if (first_buf_printf)
 				printf("All buffers synced.");
 			break;
 		}
 		if (first_buf_printf) {
 			printf("Syncing disks, buffers remaining... ");
 			first_buf_printf = 0;
 		}
 		printf("%d ", nbusy);
 		/* Progress was made: restart the give-up counter. */
 		if (nbusy < pbusy)
 			iter = 0;
 		pbusy = nbusy;
 
 		wdog_kern_pat(WD_LASTVAL);
 		sys_sync(curthread, NULL);
 
 #ifdef PREEMPTION
 		/*
 		 * Drop Giant and spin for a while to allow
 		 * interrupt threads to run.
 		 */
 		DROP_GIANT();
 		DELAY(50000 * iter);
 		PICKUP_GIANT();
 #else
 		/*
 		 * Drop Giant and context switch several times to
 		 * allow interrupt threads to run.
 		 */
 		DROP_GIANT();
 		for (subiter = 0; subiter < 50 * iter; subiter++) {
 			thread_lock(curthread);
 			mi_switch(SW_VOL, NULL);
 			thread_unlock(curthread);
 			DELAY(1000);
 		}
 		PICKUP_GIANT();
 #endif
 	}
 	printf("\n");
 	/*
 	 * Count only busy local buffers to prevent forcing
 	 * a fsck if we're just a client of a wedged NFS server
 	 */
 	nbusy = 0;
 	for (i = nbuf - 1; i >= 0; i--) {
 		bp = &buf[i];
 		if (isbufbusy(bp)) {
 #if 0
 /* XXX: This is bogus.  We should probably have a BO_REMOTE flag instead */
 			if (bp->b_dev == NULL) {
 				TAILQ_REMOVE(&mountlist,
 				    bp->b_vp->v_mount, mnt_list);
 				continue;
 			}
 #endif
 			nbusy++;
 			if (show_busybufs > 0) {
 				printf(
 	    "%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
 				    nbusy, bp, bp->b_vp, bp->b_flags,
 				    (intmax_t)bp->b_blkno,
 				    (intmax_t)bp->b_lblkno);
 				BUF_LOCKPRINTINFO(bp);
 				if (show_busybufs > 1)
 					vn_printf(bp->b_vp,
 					    "vnode content: ");
 			}
 		}
 	}
 	if (nbusy) {
 		/*
 		 * Failed to sync all blocks. Indicate this and don't
 		 * unmount filesystems (thus forcing an fsck on reboot).
 		 */
 		printf("Giving up on %d buffers\n", nbusy);
 		DELAY(5000000);	/* 5 seconds */
 	} else {
 		if (!first_buf_printf)
 			printf("Final sync complete\n");
 		/*
 		 * Unmount filesystems
 		 */
 		if (panicstr == NULL)
 			vfs_unmountall();
 	}
 	swapoff_all();
 	DELAY(100000);		/* wait for console output to finish */
 }
 
 static void
 bpmap_qenter(struct buf *bp)
 {
 
 	BUF_CHECK_MAPPED(bp);
 
 	/*
 	 * bp->b_data is relative to bp->b_offset, but
 	 * bp->b_offset may be offset into the first page.
 	 */
 	bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
 	pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
 	    (vm_offset_t)(bp->b_offset & PAGE_MASK));
 }
 
 /*
  *	binsfree:
  *
  *	Insert the buffer into the free list selected by qindex.
  *
  *	Unless the target is QUEUE_EMPTY the buffer lock must be held
  *	exclusively.  A request for QUEUE_CLEAN is resolved to a specific
  *	clean queue.  Any removal deferred via B_REMFREE is carried out
  *	first; after that the buffer must be on QUEUE_NONE or we panic.
  *	Buffers flagged B_AGE go to the head of the queue, all others to
  *	the tail.
  */
 static void
 binsfree(struct buf *bp, int qindex)
 {
 	struct mtx *olock, *nlock;
 
 	if (qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 
 	/*
 	 * Stick to the same clean queue for the lifetime of the buf to
 	 * limit locking below.  Otherwise pick one sequentially.
 	 */
 	if (qindex == QUEUE_CLEAN) {
 		if (bqisclean(bp->b_qindex))
 			qindex = bp->b_qindex;
 		else
 			qindex = bqcleanq();
 	}
 
 	/*
 	 * Handle delayed bremfree() processing.  The buffer is removed
 	 * under the lock of the queue it is currently on (olock); that
 	 * lock is swapped for the target queue's lock (nlock) only when
 	 * the two differ, so nlock is held on exit from this block.
 	 */
 	nlock = bqlock(qindex);
 	if (bp->b_flags & B_REMFREE) {
 		olock = bqlock(bp->b_qindex);
 		mtx_lock(olock);
 		bremfreel(bp);
 		if (olock != nlock) {
 			mtx_unlock(olock);
 			mtx_lock(nlock);
 		}
 	} else
 		mtx_lock(nlock);
 
 	if (bp->b_qindex != QUEUE_NONE)
 		panic("binsfree: free buffer onto another queue???");
 
 	bp->b_qindex = qindex;
 	if (bp->b_flags & B_AGE)
 		TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
 	else
 		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	bq_len[bp->b_qindex]++;
 #endif
 	mtx_unlock(nlock);
 }
 
 /*
  * buf_free:
  *
  *	Free a buffer to the buf zone once it no longer has valid contents.
  *
  *	Completes any deferred queue removal, drops the read and write
  *	credentials, deallocates remaining dependencies, releases the
  *	buffer's KVA, unlocks it and hands it back to buf_zone.  The free
  *	buffer count is then incremented and bufspace_wakeup() is called.
  *	Panics if a background write is still in progress on the buffer.
  */
 static void
 buf_free(struct buf *bp)
 {
 
 	if (bp->b_flags & B_REMFREE)
 		bremfreef(bp);
 	if (bp->b_vflags & BV_BKGRDINPROG)
 		panic("losing buffer 1");
 	if (bp->b_rcred != NOCRED) {
 		crfree(bp->b_rcred);
 		bp->b_rcred = NOCRED;
 	}
 	if (bp->b_wcred != NOCRED) {
 		crfree(bp->b_wcred);
 		bp->b_wcred = NOCRED;
 	}
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_deallocate(bp);
 	bufkva_free(bp);
 	/* The buffer lock is dropped before the header is given back. */
 	BUF_UNLOCK(bp);
 	uma_zfree(buf_zone, bp);
 	atomic_add_int(&numfreebuffers, 1);
 	bufspace_wakeup();
 }
 
 /*
  * buf_import:
  *
  *	Import bufs into the uma cache from the buf list.  The system still
  *	expects a static array of bufs and much of the synchronization
  *	around bufs assumes type stable storage.  As a result, UMA is used
  *	only as a per-cpu cache of bufs still maintained on a global list.
  *
  *	Up to cnt headers are taken off QUEUE_EMPTY and stored in store[];
  *	the number actually stored is returned.  arg and flags are unused.
  */
 static int
 buf_import(void *arg, void **store, int cnt, int flags)
 {
 	struct buf *bp;
 	int n;
 
 	n = 0;
 	mtx_lock(&bqlocks[QUEUE_EMPTY]);
 	/* Stop early when the empty queue runs dry. */
 	while (n < cnt &&
 	    (bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY])) != NULL) {
 		bremfreel(bp);
 		store[n++] = bp;
 	}
 	mtx_unlock(&bqlocks[QUEUE_EMPTY]);
 
 	return (n);
 }
 
 /*
  * buf_release:
  *
  *	Release bufs from the uma cache back to the buffer queues.  Each of
  *	the cnt headers in store[] is put on QUEUE_EMPTY.  arg is unused.
  */
 static void
 buf_release(void *arg, void **store, int cnt)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < cnt) {
 		binsfree(store[idx], QUEUE_EMPTY);
 		idx++;
 	}
 }
 
 /*
  * buf_alloc:
  *
  *	Allocate an empty buffer header.
  *
  *	Returns a header taken from buf_zone, exclusively locked and with
  *	its fields reset, or NULL when the zone has none available.  On
  *	failure the bufspace daemon is woken and numbufallocfails is
  *	incremented.  Panics if the header taken from the zone is already
  *	locked.
  */
 static struct buf *
 buf_alloc(void)
 {
 	struct buf *bp;
 
 	bp = uma_zalloc(buf_zone, M_NOWAIT);
 	if (bp == NULL) {
 		bufspace_daemonwakeup();
 		atomic_add_int(&numbufallocfails, 1);
 		return (NULL);
 	}
 
 	/*
 	 * Wake-up the bufspace daemon on transition, i.e. when the free
 	 * count seen before our decrement equals lofreebuffers.
 	 */
 	if (atomic_fetchadd_int(&numfreebuffers, -1) == lofreebuffers)
 		bufspace_daemonwakeup();
 
 	/* Report the function that actually detected the problem. */
 	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		panic("%s: Locked buf %p on free queue.", __func__, bp);
 
 	/* A header coming off the free list must hold no resources. */
 	KASSERT(bp->b_vp == NULL,
 	    ("bp: %p still has vnode %p.", bp, bp->b_vp));
 	KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0,
 	    ("invalid buffer %p flags %#x", bp, bp->b_flags));
 	KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
 	    ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
 	KASSERT(bp->b_npages == 0,
 	    ("bp: %p still has %d vm pages\n", bp, bp->b_npages));
 	KASSERT(bp->b_kvasize == 0, ("bp: %p still has kva\n", bp));
 	KASSERT(bp->b_bufsize == 0, ("bp: %p still has bufspace\n", bp));
 
 	/* Reset the header to a pristine state for the new owner. */
 	bp->b_flags = 0;
 	bp->b_ioflags = 0;
 	bp->b_xflags = 0;
 	bp->b_vflags = 0;
 	bp->b_vp = NULL;
 	bp->b_blkno = bp->b_lblkno = 0;
 	bp->b_offset = NOOFFSET;
 	bp->b_iodone = 0;
 	bp->b_error = 0;
 	bp->b_resid = 0;
 	bp->b_bcount = 0;
 	bp->b_npages = 0;
 	bp->b_dirtyoff = bp->b_dirtyend = 0;
 	bp->b_bufobj = NULL;
 	bp->b_pin_count = 0;
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_fsprivate1 = NULL;
 	bp->b_fsprivate2 = NULL;
 	bp->b_fsprivate3 = NULL;
 	LIST_INIT(&bp->b_dep);
 
 	return (bp);
 }
 
 /*
  *	buf_qrecycle:
  *
  *	Free a buffer from the given bufqueue.  kva controls whether the
  *	freed buf must own some kva resources.  This is used for
  *	defragmenting.
  *
  *	Returns 0 once one buffer has been invalidated and released, or
  *	ENOBUFS when the scan reaches the end of the queue without finding
  *	a candidate.  Buffers that cannot be locked without waiting, and
  *	buffers with a background write in progress, are skipped.
  */
 static int
 buf_qrecycle(int qindex, bool kva)
 {
 	struct buf *bp, *nbp;
 
 	if (kva)
 		atomic_add_int(&bufdefragcnt, 1);
 	/* NOTE(review): this store is overwritten below before any use. */
 	nbp = NULL;
 	mtx_lock(&bqlocks[qindex]);
 	nbp = TAILQ_FIRST(&bufqueues[qindex]);
 
 	/*
 	 * Run scan, possibly freeing data and/or kva mappings on the fly
 	 * depending.
 	 */
 	while ((bp = nbp) != NULL) {
 		/*
 		 * Calculate next bp (we can only use it if we do not
 		 * release the bqlock).
 		 */
 		nbp = TAILQ_NEXT(bp, b_freelist);
 
 		/*
 		 * If we are defragging then we need a buffer with
 		 * some kva to reclaim.
 		 */
 		if (kva && bp->b_kvasize == 0)
 			continue;
 
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
 
 		/*
 		 * Skip buffers with background writes in progress.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 
 		KASSERT(bp->b_qindex == qindex,
 		    ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
 		/*
 		 * NOTE:  nbp is now entirely invalid.  We can only restart
 		 * the scan from this point on.
 		 */
 		bremfreel(bp);
 		mtx_unlock(&bqlocks[qindex]);
 
 		/*
 		 * Requeue the background write buffer with error and
 		 * restart the scan.  The queue lock was dropped above, so
 		 * it is retaken and the scan starts over from the head.
 		 */
 		if ((bp->b_vflags & BV_BKGRDERR) != 0) {
 			bqrelse(bp);
 			mtx_lock(&bqlocks[qindex]);
 			nbp = TAILQ_FIRST(&bufqueues[qindex]);
 			continue;
 		}
 		/* Invalidate the victim and release it; the lock is gone. */
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 		return (0);
 	}
 	mtx_unlock(&bqlocks[qindex]);
 
 	return (ENOBUFS);
 }
 
 /*
  *	buf_recycle:
  *
  *	Visit each clean queue once, starting with the one picked by
  *	bqcleanq() and wrapping around, until buf_qrecycle() manages to
  *	recycle a buf.  Returns 0 on success and ENOBUFS when every queue
  *	has been tried without success.
  */
 static int
 buf_recycle(bool kva)
 {
 	int start, q;
 
 	start = bqcleanq();
 	q = start;
 	for (;;) {
 		if (buf_qrecycle(q, kva) == 0)
 			return (0);
 		/* Advance, wrapping back to the first clean queue. */
 		q++;
 		if (q == QUEUE_CLEAN + clean_queues)
 			q = QUEUE_CLEAN;
 		if (q == start)
 			break;
 	}
 
 	return (ENOBUFS);
 }
 
 /*
  *	buf_scan:
  *
  *	Scan the clean queues looking for a buffer to recycle.  needsbuffer
  *	is set on failure so that the caller may optionally bufspace_wait()
  *	in a race-free fashion.
  *
  *	Returns 0 when a buffer was recycled (getnewbufrestarts is bumped),
  *	otherwise the error from the second buf_recycle() attempt.
  */
 static int
 buf_scan(bool defrag)
 {
 	int error;
 
 	error = buf_recycle(defrag);
 	if (error != 0) {
 		/*
 		 * To avoid heavy synchronization and wakeup races we set
 		 * needsbuffer and re-poll before failing.  This ensures
 		 * that no frees can be missed between an unsuccessful poll
 		 * and going to sleep in a synchronized fashion.
 		 */
 		atomic_set_int(&needsbuffer, 1);
 		bufspace_daemonwakeup();
 		error = buf_recycle(defrag);
 		if (error != 0)
 			return (error);
 	}
 	atomic_add_int(&getnewbufrestarts, 1);
 	return (0);
 }
 
 /*
  *	bremfree:
  *
  *	Mark the buffer for removal from the free list it is on.  Only the
  *	B_REMFREE flag is set here; the list itself is not touched.  The
  *	buffer must be exclusively locked, must be on a queue and must not
  *	already be marked.
  */
 void
 bremfree(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & B_REMFREE),
 	    ("bremfree: buffer %p already marked for delayed removal.", bp));
 	KASSERT(!(bp->b_qindex == QUEUE_NONE),
 	    ("bremfree: buffer %p not on a queue.", bp));
 	BUF_ASSERT_XLOCKED(bp);
 
 	bp->b_flags = bp->b_flags | B_REMFREE;
 }
 
 /*
  *	bremfreef:
  *
  *	Force an immediate removal from a free list: take the lock of the
  *	queue the buffer is on, unlink it and drop the lock again.  Used
  *	in nfs when it abuses the b_freelist pointer, and by buf_free() to
  *	complete a removal deferred with B_REMFREE.
  */
 void
 bremfreef(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = bqlock(bp->b_qindex);
 
 	mtx_lock(lock);
 	bremfreel(bp);
 	mtx_unlock(lock);
 }
 
 /*
  *	bremfreel:
  *
  *	Removes a buffer from the free list, must be called with the
  *	correct qlock held.
  *
  *	On return the buffer is on QUEUE_NONE and B_REMFREE is cleared.
  *	Unless the buffer is on QUEUE_EMPTY, its lock must be held
  *	exclusively.
  */
 static void
 bremfreel(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_qindex != QUEUE_NONE,
 	    ("bremfreel: buffer %p not on a queue.", bp));
 	if (bp->b_qindex != QUEUE_EMPTY) {
 		BUF_ASSERT_XLOCKED(bp);
 	}
 	mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
 
 	TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
 #ifdef INVARIANTS
 	/* Keep the per-queue length counter in step with the list. */
 	KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
 	    bp->b_qindex));
 	bq_len[bp->b_qindex]--;
 #endif
 	bp->b_qindex = QUEUE_NONE;
 	bp->b_flags &= ~B_REMFREE;
 }
 
 /*
  *	bufkva_free:
  *
  *	Free the kva allocation for a buffer.
  *
  *	A buffer with b_kvasize == 0 owns no KVA and is left untouched.
  *	Otherwise the range is returned to buffer_arena, the bufkvaspace
  *	and buffreekvacnt counters are updated, and b_data/b_kvabase are
  *	reset to the unmapped_buf placeholder.
  */
 static void
 bufkva_free(struct buf *bp)
 {
 
 #ifdef INVARIANTS
 	/*
 	 * A buffer without KVA must point at unmapped_buf; a buffer with
 	 * KVA must pass the mapped or unmapped consistency check.
 	 */
 	if (bp->b_kvasize == 0) {
 		KASSERT(bp->b_kvabase == unmapped_buf &&
 		    bp->b_data == unmapped_buf,
 		    ("Leaked KVA space on %p", bp));
 	} else if (buf_mapped(bp))
 		BUF_CHECK_MAPPED(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 #endif
 	if (bp->b_kvasize == 0)
 		return;
 
 	vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
 	atomic_subtract_long(&bufkvaspace, bp->b_kvasize);
 	atomic_add_int(&buffreekvacnt, 1);
 	bp->b_data = bp->b_kvabase = unmapped_buf;
 	bp->b_kvasize = 0;
 }
 
 /*
  *	bufkva_alloc:
  *
  *	Allocate the buffer KVA and set b_kvasize and b_kvabase.
  *
  *	Any KVA the buffer already owns is freed first.  maxsize is the
  *	size of the range requested from buffer_arena.  With GB_UNMAPPED
  *	(which is only valid together with GB_KVAALLOC) b_data is left at
  *	unmapped_buf; otherwise b_data is set to the new b_kvabase.
  *
  *	Returns 0 on success or the error from vmem_alloc(), in which case
  *	the buffer is left without KVA.
  */
 static int
 bufkva_alloc(struct buf *bp, int maxsize, int gbflags)
 {
 	vm_offset_t addr;
 	int error;
 
 	KASSERT((gbflags & GB_UNMAPPED) == 0 || (gbflags & GB_KVAALLOC) != 0,
 	    ("Invalid gbflags 0x%x in %s", gbflags, __func__));
 
 	bufkva_free(bp);
 
 	addr = 0;
 	error = vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr);
 	if (error != 0) {
 		/*
 		 * Buffer map is too fragmented.  Request the caller
 		 * to defragment the map.
 		 */
 		return (error);
 	}
 	bp->b_kvabase = (caddr_t)addr;
 	bp->b_kvasize = maxsize;
 	atomic_add_long(&bufkvaspace, bp->b_kvasize);
 	if ((gbflags & GB_UNMAPPED) != 0) {
 		bp->b_data = unmapped_buf;
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 	}
 	return (0);
 }
 
 /*
  *	bufkva_reclaim:
  *
  *	Reclaim buffer kva by freeing buffers holding kva.  This is a vmem
  *	callback that fires to avoid returning failure.  At most five
  *	buffers are recycled per call, and the attempt stops at the first
  *	buf_scan() failure.  Neither argument is used.
  */
 static void
 bufkva_reclaim(vmem_t *vmem, int flags)
 {
 	int attempt;
 
 	for (attempt = 0; attempt < 5 && buf_scan(true) == 0; attempt++)
 		continue;
 }
 
 
 /*
  * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
  * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
  * the buffer is valid and we do not have to do anything.
  *
  * rablkno and rabsize are parallel arrays of cnt block numbers and sizes
  * on vnode vp.  Blocks for which inmem() is true are skipped.  cred, when
  * not NOCRED, is held as the read credential of buffers that have none.
  * Nothing is returned and the reads are not waited for.
  */
 void
 breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
     int cnt, struct ucred * cred)
 {
 	struct buf *rabp;
 	int i;
 
 	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
 		if (inmem(vp, *rablkno))
 			continue;
 		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
 
 		if ((rabp->b_flags & B_CACHE) == 0) {
 			/* Charge the read to the current thread. */
 			if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 				if (racct_enable) {
 					PROC_LOCK(curproc);
 					racct_add_buf(curproc, rabp, 0);
 					PROC_UNLOCK(curproc);
 				}
 #endif /* RACCT */
 				curthread->td_ru.ru_inblock++;
 			}
 			rabp->b_flags |= B_ASYNC;
 			rabp->b_flags &= ~B_INVAL;
 			rabp->b_ioflags &= ~BIO_ERROR;
 			rabp->b_iocmd = BIO_READ;
 			if (rabp->b_rcred == NOCRED && cred != NOCRED)
 				rabp->b_rcred = crhold(cred);
 			vfs_busy_pages(rabp, 0);
 			BUF_KERNPROC(rabp);
 			rabp->b_iooffset = dbtob(rabp->b_blkno);
 			bstrategy(rabp);
 		} else {
 			/* Already valid: nothing to read, just release. */
 			brelse(rabp);
 		}
 	}
 }
 
 /*
  * Entry point for bread() and breadn() via #defines in sys/buf.h.
  *
  * Get a buffer with the specified data.  Look in the cache first.  We
  * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
  * is set, the buffer is valid and we do not have to do anything, see
  * getblk(). Also starts asynchronous I/O on read-ahead blocks.
  *
  * Always return a NULL buffer pointer (in bpp) when returning an error.
  *
  * Returns EBUSY when getblk() yields no buffer, the bufwait() result when
  * a read had to be issued for the requested block, and 0 otherwise.
  * rablkno, rabsize and cnt describe the read-ahead blocks handed to
  * breada(); flags is passed through to getblk().
  */
 int
 breadn_flags(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablkno,
     int *rabsize, int cnt, struct ucred *cred, int flags, struct buf **bpp)
 {
 	struct buf *bp;
 	int rv = 0, readwait = 0;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
 	/*
 	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
 	 */
 	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
 	if (bp == NULL)
 		return (EBUSY);
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
 		if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(curproc);
 				racct_add_buf(curproc, bp, 0);
 				PROC_UNLOCK(curproc);
 			}
 #endif /* RACCT */
 			curthread->td_ru.ru_inblock++;
 		}
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
 		if (bp->b_rcred == NOCRED && cred != NOCRED)
 			bp->b_rcred = crhold(cred);
 		vfs_busy_pages(bp, 0);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
 		++readwait;
 	}
 
 	/* Start the read-ahead before waiting for the primary read. */
 	breada(vp, rablkno, rabsize, cnt, cred);
 
 	if (readwait) {
 		rv = bufwait(bp);
 		if (rv != 0) {
 			brelse(bp);
 			*bpp = NULL;
 		}
 	}
 	return (rv);
 }
 
 /*
  * Write, release buffer on completion.  (Done by iodone
  * if async).  Do not bother writing anything if the buffer
  * is invalid.
  *
  * Note that we set B_CACHE here, indicating that buffer is
  * fully valid and thus cacheable.  This is true even of NFS
  * now so we set it generally.  This could be set either here
  * or in biodone() since the I/O is synchronous.  We put it
  * here.
  *
  * Returns ENXIO (after releasing the buffer) when the bufobj is marked
  * BO_DEAD, 0 for an invalid buffer or an asynchronous write, and the
  * bufwait() result for a synchronous write.
  */
 int
 bufwrite(struct buf *bp)
 {
 	int oldflags;
 	struct vnode *vp;
 	long space;
 	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
 		bp->b_flags |= B_INVAL | B_RELBUF;
 		bp->b_flags &= ~B_CACHE;
 		brelse(bp);
 		return (ENXIO);
 	}
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return (0);
 	}
 
 	if (bp->b_flags & B_BARRIER)
 		barrierwrites++;
 
 	/* Remember the caller's flags; B_ASYNC is tested after the I/O. */
 	oldflags = bp->b_flags;
 
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_pin_count > 0)
 		bunpin_wait(bp);
 
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
 	/* vp_md records whether the vnode carries the VV_MD flag. */
 	vp = bp->b_vp;
 	if (vp)
 		vp_md = vp->v_vflag & VV_MD;
 	else
 		vp_md = 0;
 
 	/*
 	 * Mark the buffer clean.  Increment the bufobj write count
 	 * before bundirty() call, to prevent other thread from seeing
 	 * empty dirty list and zero counter for writes in progress,
 	 * falsely indicating that the bufobj is clean.
 	 */
 	bufobj_wref(bp->b_bufobj);
 	bundirty(bp);
 
 	bp->b_flags &= ~B_DONE;
 	bp->b_ioflags &= ~BIO_ERROR;
 	bp->b_flags |= B_CACHE;
 	bp->b_iocmd = BIO_WRITE;
 
 	vfs_busy_pages(bp, 1);
 
 	/*
 	 * Normal bwrites pipeline writes.  space holds the value of
 	 * runningbufspace from before this buffer was added.
 	 */
 	bp->b_runningbufspace = bp->b_bufsize;
 	space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
 
 	if (!TD_IS_IDLETHREAD(curthread)) {
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(curproc);
 			racct_add_buf(curproc, bp, 1);
 			PROC_UNLOCK(curproc);
 		}
 #endif /* RACCT */
 		curthread->td_ru.ru_oublock++;
 	}
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 
 	if ((oldflags & B_ASYNC) == 0) {
 		int rtval = bufwait(bp);
 		brelse(bp);
 		return (rtval);
 	} else if (space > hirunningspace) {
 		/*
 		 * don't allow the async write to saturate the I/O
 		 * system.  We will not deadlock here because
 		 * we are blocking waiting for I/O that is already in-progress
 		 * to complete. We do not block here if it is the update
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
 		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
 /*
  * Limit the number of dirty buffers held by a bufobj.
  *
  * When the dirty count of bo exceeds dirtybufthresh by more than 10, a
  * non-waiting VOP_FSYNC() is issued on bp's vnode.  When it merely
  * exceeds dirtybufthresh, the dirty list is searched for one buffer
  * that can be locked without waiting, has no background write in
  * progress and no dependencies, and that buffer is written
  * asynchronously.  bp is the buffer being delay-written; finding it on
  * the dirty list with its lock available is treated as fatal.
  */
 void
 bufbdflush(struct bufobj *bo, struct buf *bp)
 {
 	struct buf *nbp;
 
 	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
 		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
 		altbufferflushes++;
 	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
 		BO_LOCK(bo);
 		/*
 		 * Try to find a buffer to flush.
 		 */
 		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
 			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
 			    BUF_LOCK(nbp,
 				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
 				continue;
 			if (bp == nbp)
 				panic("bdwrite: found ourselves");
 			BO_UNLOCK(bo);
 			/* Don't countdeps with the bo lock held. */
 			if (buf_countdeps(nbp, 0)) {
 				/* Retake the bo lock to go on walking. */
 				BO_LOCK(bo);
 				BUF_UNLOCK(nbp);
 				continue;
 			}
 			if (nbp->b_flags & B_CLUSTEROK) {
 				vfs_bio_awrite(nbp);
 			} else {
 				bremfree(nbp);
 				bawrite(nbp);
 			}
 			dirtybufferflushes++;
 			/* Leave the loop with the bo lock already dropped. */
 			break;
 		}
 		/*
 		 * nbp is NULL only when the walk ran off the end of the
 		 * list, in which case the bo lock is still held.
 		 */
 		if (nbp == NULL)
 			BO_UNLOCK(bo);
 	}
 }
 
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
  *
  * Note that since the buffer must be completely valid, we can safely
  * set B_CACHE.  In fact, we have to set B_CACHE here rather than in
  * biodone() in order to prevent getblk from writing the buffer
  * out synchronously.
  *
  * The buffer must be held by the caller, must have a bufobj and must not
  * be a barrier request.  It is released (brelse() when invalid, bqrelse()
  * otherwise) before returning.
  */
 void
 bdwrite(struct buf *bp)
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT((bp->b_flags & B_BARRIER) == 0,
 	    ("Barrier request in delayed write %p", bp));
 	BUF_ASSERT_HELD(bp);
 
 	if (bp->b_flags & B_INVAL) {
 		brelse(bp);
 		return;
 	}
 
 	/*
 	 * If we have too many dirty buffers, don't create any more.
 	 * If we are wildly over our limit, then force a complete
 	 * cleanup. Otherwise, just keep the situation from getting
 	 * out of control. Note that we have to avoid a recursive
 	 * disaster and not try to clean up after our own cleanup!
 	 *
 	 * TDP_INBDFLUSH marks the thread for the duration of the flush so
 	 * that a nested bdwrite() only counts a recursive flush.
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
 	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
 		td->td_pflags |= TDP_INBDFLUSH;
 		BO_BDFLUSH(bo, bp);
 		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
 	bdirty(bp);
 	/*
 	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
 	 * true even of NFS now.
 	 */
 	bp->b_flags |= B_CACHE;
 
 	/*
 	 * This bmap keeps the system from needing to do the bmap later,
 	 * perhaps when the system is attempting to do a sync.  Since it
 	 * is likely that the indirect block -- or whatever other datastructure
 	 * that the filesystem needs is still in memory now, it is a good
 	 * thing to do this.  Note also, that if the pageout daemon is
 	 * requesting a sync -- there might not be enough memory to do
 	 * the bmap then...  So, this is important to do.
 	 */
 	if (vp->v_type != VCHR && bp->b_lblkno == bp->b_blkno) {
 		VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
 	}
 
 	/*
 	 * Set the *dirty* buffer range based upon the VM system dirty
 	 * pages.
 	 *
 	 * Mark the buffer pages as clean.  We need to do this here to
 	 * satisfy the vnode_pager and the pageout daemon, so that it
 	 * thinks that the pages have been "cleaned".  Note that since
 	 * the pages are in a delayed write buffer -- the VFS layer
 	 * "will" see that the pages get written out on the next sync,
 	 * or perhaps the cluster will be completed.
 	 */
 	vfs_clean_pages_dirty_buf(bp);
 	bqrelse(bp);
 
 	/*
 	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
 	 * due to the softdep code.
 	 */
 }
 
 /*
  *	bdirty:
  *
  *	Turn buffer into delayed write request.  B_RELBUF is cleared, the
  *	I/O command is set to BIO_WRITE and, if it was not already set,
  *	B_DELWRI is set.  In that case we also reassign the buffer to
  *	itself to properly update it in the dirty/clean lists and account
  *	for the new dirty buffer.  Note that B_DONE is not set here (see
  *	the XXX below).
  *
  *	bdirty() is kinda like bdwrite() - we have to clear B_INVAL which
  *	might have been set pre-getblk().  Unlike bwrite/bdwrite, bdirty()
  *	should only be called if the buffer is known-good.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be held, and must be on QUEUE_NONE or be marked
  *	B_REMFREE.
  */
 void
 bdirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bdirty(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 	bp->b_flags &= ~(B_RELBUF);
 	bp->b_iocmd = BIO_WRITE;
 
 	/* Only a clean -> dirty transition touches lists and counters. */
 	if ((bp->b_flags & B_DELWRI) == 0) {
 		bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
 		reassignbuf(bp);
 		bdirtyadd();
 	}
 }
 
 /*
  *	bundirty:
  *
  *	Clear B_DELWRI for buffer.  If the flag was set, the buffer is
  *	reassigned to update the dirty/clean lists and the dirty buffer
  *	accounting is adjusted.  B_DEFERRED is cleared unconditionally.
  *
  *	Since the buffer is not on a queue, we do not update the numfreebuffers
  *	count.
  *
  *	The buffer must be held, and must be on QUEUE_NONE or be marked
  *	B_REMFREE.
  */
 
 void
 bundirty(struct buf *bp)
 {
 
 	CTR3(KTR_BUF, "bundirty(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
 	KASSERT(bp->b_flags & B_REMFREE || bp->b_qindex == QUEUE_NONE,
 	    ("bundirty: buffer %p still on queue %d", bp, bp->b_qindex));
 	BUF_ASSERT_HELD(bp);
 
 	/* Only a dirty -> clean transition touches lists and counters. */
 	if (bp->b_flags & B_DELWRI) {
 		bp->b_flags &= ~B_DELWRI;
 		reassignbuf(bp);
 		bdirtysub();
 	}
 	/*
 	 * Since it is now being written, we can clear its deferred write flag.
 	 */
 	bp->b_flags &= ~B_DEFERRED;
 }
 
 /*
  *	bawrite:
  *
  *	Asynchronous write.  Flags the buffer B_ASYNC and starts output on
  *	it through bwrite() without waiting for completion; the result of
  *	bwrite() is discarded.  The buffer is released when the output
  *	completes.
  *
  *	Handling of B_INVAL buffers is left to bwrite() ( or the VOP
  *	routine anyway ), it is not done here.
  */
 void
 bawrite(struct buf *bp)
 {
 
 	bp->b_flags = bp->b_flags | B_ASYNC;
 	(void)bwrite(bp);
 }
 
 /*
  *	babarrierwrite:
  *
  *	Asynchronous barrier write.  Flags the buffer B_ASYNC and B_BARRIER
  *	and starts output on it without waiting for completion.  The write
  *	barrier placed after this write means that this buffer and all
  *	buffers written before it are committed to the disk before any
  *	buffers written after it.  The buffer is released when the output
  *	completes.
  */
 void
 babarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags = bp->b_flags | (B_ASYNC | B_BARRIER);
 	(void)bwrite(bp);
 }
 
 /*
  *	bbarrierwrite:
  *
  *	Synchronous barrier write.  Start output on a buffer and wait for
  *	it to complete.  Place a write barrier after this write so that
  *	this buffer and all buffers written before it are committed to 
  *	the disk before any buffers written after this write are committed
  *	to the disk.  The buffer is released when the output completes.
  */
 int
 bbarrierwrite(struct buf *bp)
 {
 
 	bp->b_flags |= B_BARRIER;
 	return (bwrite(bp));
 }
 
 /*
  *	bwillwrite:
  *
  *	Called prior to the locking of any vnodes when we are expecting to
  *	write.  We do not want to starve the buffer cache with too many
  *	dirty buffers so we block here.  By blocking prior to the locking
  *	of any vnodes we attempt to avoid the situation where a locked vnode
  *	prevents the various system daemons from flushing related buffers.
  *
  *	Sleeps on bdirtywait until numdirtybuffers falls below
  *	hidirtybuffers; returns at once when it already is below.
  */
 void
 bwillwrite(void)
 {
 
 	/* Unlocked fast path: nothing to wait for. */
 	if (numdirtybuffers < hidirtybuffers)
 		return;
 
 	mtx_lock(&bdirtylock);
 	while (numdirtybuffers >= hidirtybuffers) {
 		bdirtywait = 1;
 		msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
 		    "flswai", 0);
 	}
 	mtx_unlock(&bdirtylock);
 }
 
 /*
  * Report whether the number of dirty buffers has reached hidirtybuffers.
  * Returns 1 if so and 0 otherwise.
  */
 int
 buf_dirty_count_severe(void)
 {
 
 	if (numdirtybuffers >= hidirtybuffers)
 		return (1);
 	return (0);
 }
 
 /*
  *	brelse:
  *
  *	Release a busy buffer and, if requested, free its resources.  The
  *	buffer will be stashed in the appropriate bufqueue[] allowing it
  *	to be accessed later as a cache entity or reused for other purposes.
  */
 void
 brelse(struct buf *bp)
 {
 	int qindex;
 
 	/*
 	 * Overview of the stages below:
 	 *   1. early outs (NULL bp, recursed lock, B_MANAGED);
 	 *   2. redirty after a background-write error or a failed write;
 	 *   3. invalidate buffers that must not be cached;
 	 *   4. VMIO page rundown and release of backing storage;
 	 *   5. pick the destination queue, requeue, unlock.
 	 */
 
 	/*
 	 * Many functions erroneously call brelse with a NULL bp under rare
 	 * error conditions. Simply return when called with a NULL bp.
 	 */
 	if (bp == NULL)
 		return;
 	CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
 	    bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 	KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0,
 	    ("brelse: non-VMIO buffer marked NOREUSE"));
 
 	if (BUF_LOCKRECURSED(bp)) {
 		/*
 		 * Do not process, in particular, do not handle the
 		 * B_INVAL/B_RELBUF and do not release to free list.
 		 */
 		BUF_UNLOCK(bp);
 		return;
 	}
 
 	/* Managed buffers are handed to bqrelse() unchanged. */
 	if (bp->b_flags & B_MANAGED) {
 		bqrelse(bp);
 		return;
 	}
 
 	/*
 	 * BV_BKGRDERR is set and no background write is in progress:
 	 * clear the error flag under the bufobj lock and redirty the
 	 * buffer.
 	 */
 	if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		bdirty(bp);
 	}
 	if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
 	    (bp->b_error != ENXIO || !LIST_EMPTY(&bp->b_dep)) &&
 	    !(bp->b_flags & B_INVAL)) {
 		/*
 		 * Failed write, redirty.  All errors except ENXIO (which
 		 * means the device is gone) are expected to be potentially
 		 * transient - underlying media might work if tried again
 		 * after EIO, and memory might be available after an ENOMEM.
 		 *
 		 * Do this also for buffers that failed with ENXIO, but have
 		 * non-empty dependencies - the soft updates code might need
 		 * to access the buffer to untangle them.
 		 *
 		 * Must clear BIO_ERROR to prevent pages from being scrapped.
 		 */
 		bp->b_ioflags &= ~BIO_ERROR;
 		bdirty(bp);
 	} else if ((bp->b_flags & (B_NOCACHE | B_INVAL)) ||
 	    (bp->b_ioflags & BIO_ERROR) || (bp->b_bufsize <= 0)) {
 		/*
 		 * Either a failed read I/O, or we were asked to free or not
 		 * cache the buffer, or we failed to write to a device that's
 		 * no longer present.
 		 */
 		bp->b_flags |= B_INVAL;
 		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		/* Drop the dirty accounting before clearing B_DELWRI. */
 		if (bp->b_flags & B_DELWRI)
 			bdirtysub();
 		bp->b_flags &= ~(B_DELWRI | B_CACHE);
 		/* Non-VMIO buffers give up storage and vnode right away. */
 		if ((bp->b_flags & B_VMIO) == 0) {
 			allocbuf(bp, 0);
 			if (bp->b_vp)
 				brelvp(bp);
 		}
 	}
 
 	/*
 	 * We must clear B_RELBUF if B_DELWRI is set.  If vfs_vmio_truncate()
 	 * is called with B_DELWRI set, the underlying pages may wind up
 	 * getting freed causing a previous write (bdwrite()) to get 'lost'
 	 * because pages associated with a B_DELWRI bp are marked clean.
 	 *
 	 * We still allow the B_INVAL case to call vfs_vmio_truncate(), even
 	 * if B_DELWRI is set.
 	 */
 	if (bp->b_flags & B_DELWRI)
 		bp->b_flags &= ~B_RELBUF;
 
 	/*
 	 * VMIO buffer rundown.  It is not very necessary to keep a VMIO buffer
 	 * constituted, not even NFS buffers now.  Two flags affect this.  If
 	 * B_INVAL, the struct buf is invalidated but the VM object is kept
 	 * around ( i.e. so it is trivial to reconstitute the buffer later ).
 	 *
 	 * If BIO_ERROR or B_NOCACHE is set, pages in the VM object will be
 	 * invalidated.  BIO_ERROR cannot be set for a failed write unless the
 	 * buffer is also B_INVAL because it hits the re-dirtying code above.
 	 *
 	 * Normally we can do this whether a buffer is B_DELWRI or not.  If
 	 * the buffer is an NFS buffer, it is tracking piecemeal writes or
 	 * the commit state and we cannot afford to lose the buffer. If the
 	 * buffer has a background write in progress, we need to keep it
 	 * around to prevent it from being reconstituted and starting a second
 	 * background write.
 	 */
 	if ((bp->b_flags & B_VMIO) && (bp->b_flags & B_NOCACHE ||
 	    (bp->b_ioflags & BIO_ERROR && bp->b_iocmd == BIO_READ)) &&
 	    !(bp->b_vp->v_mount != NULL &&
 	    (bp->b_vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    !vn_isdisk(bp->b_vp, NULL) && (bp->b_flags & B_DELWRI))) {
 		vfs_vmio_invalidate(bp);
 		allocbuf(bp, 0);
 	}
 
 	/*
 	 * Release the backing storage and the vnode association when the
 	 * buffer is invalid or marked B_RELBUF, or when it is B_NOREUSE
 	 * and not a delayed write.  B_NOREUSE is consumed here.
 	 */
 	if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) {
 		allocbuf(bp, 0);
 		bp->b_flags &= ~B_NOREUSE;
 		if (bp->b_vp != NULL)
 			brelvp(bp);
 	}
 			
 	/*
 	 * If the buffer has junk contents signal it and eventually
 	 * clean up B_DELWRI and disassociate the vnode so that gbincore()
 	 * doesn't find it.
 	 */
 	if (bp->b_bufsize == 0 || (bp->b_ioflags & BIO_ERROR) != 0 ||
 	    (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) != 0)
 		bp->b_flags |= B_INVAL;
 	if (bp->b_flags & B_INVAL) {
 		if (bp->b_flags & B_DELWRI)
 			bundirty(bp);
 		if (bp->b_vp)
 			brelvp(bp);
 	}
 
 	/* buffers with no memory */
 	if (bp->b_bufsize == 0) {
 		buf_free(bp);
 		return;
 	}
 	/* buffers with junk contents */
 	if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
 	    (bp->b_ioflags & BIO_ERROR)) {
 		bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 2");
 		qindex = QUEUE_CLEAN;
 		bp->b_flags |= B_AGE;
 	/* remaining buffers */
 	} else if (bp->b_flags & B_DELWRI)
 		qindex = QUEUE_DIRTY;
 	else
 		qindex = QUEUE_CLEAN;
 
 	binsfree(bp, qindex);
 
 	/* These flags are cleared only after the buffer is requeued. */
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
 	if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
 		panic("brelse: not dirty");
 	/* unlock */
 	BUF_UNLOCK(bp);
 	/* A buffer placed on the clean queue triggers a bufspace wakeup. */
 	if (qindex == QUEUE_CLEAN)
 		bufspace_wakeup();
 }
 
 /*
  * Release a buffer back to the appropriate queue but do not try to free
  * it.  The buffer is expected to be used again soon.
  *
  * bqrelse() is used by bdwrite() to requeue a delayed write, and used by
  * biodone() to requeue an async I/O on completion.  It is also used when
  * known good buffers need to be requeued but we think we may need the data
  * again soon.
  *
  * XXX we should be able to leave the B_RELBUF hint set on completion.
  */
 void
 bqrelse(struct buf *bp)
 {
 	int qindex;
 
 	CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
 	/*
 	 * QUEUE_NONE is what the "out" path sees for managed buffers, so
 	 * no bufspace wakeup is issued for them.
 	 */
 	qindex = QUEUE_NONE;
 	if (BUF_LOCKRECURSED(bp)) {
 		/* do not release to free list */
 		BUF_UNLOCK(bp);
 		return;
 	}
 	bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
 
 	/*
 	 * Managed buffers are not inserted on a queue here; a pending
 	 * B_REMFREE removal is completed and the buffer is only unlocked.
 	 */
 	if (bp->b_flags & B_MANAGED) {
 		if (bp->b_flags & B_REMFREE)
 			bremfreef(bp);
 		goto out;
 	}
 
 	/* buffers with stale but valid contents */
 	if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
 	    BV_BKGRDERR)) == BV_BKGRDERR) {
 		/* BV_BKGRDERR is cleared under the bufobj lock. */
 		BO_LOCK(bp->b_bufobj);
 		bp->b_vflags &= ~BV_BKGRDERR;
 		BO_UNLOCK(bp->b_bufobj);
 		qindex = QUEUE_DIRTY;
 	} else {
 		/*
 		 * B_DELWRI is known to be clear on this branch, so the
 		 * check below reduces to a test of BX_VNDIRTY.
 		 */
 		if ((bp->b_flags & B_DELWRI) == 0 &&
 		    (bp->b_xflags & BX_VNDIRTY))
 			panic("bqrelse: not dirty");
 		/* B_NOREUSE buffers take the full brelse() path instead. */
 		if ((bp->b_flags & B_NOREUSE) != 0) {
 			brelse(bp);
 			return;
 		}
 		qindex = QUEUE_CLEAN;
 	}
 	binsfree(bp, qindex);
 
 out:
 	/* unlock */
 	BUF_UNLOCK(bp);
 	if (qindex == QUEUE_CLEAN)
 		bufspace_wakeup();
 }
 
 /*
  * Complete I/O to a VMIO backed page.  Validate the pages as appropriate,
  * restore bogus pages.
  */
 static void
 vfs_vmio_iodone(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	vm_object_t obj;
 	struct vnode *vp;
 	int i, iosize, resid;
 	bool bogus;
 
 	obj = bp->b_bufobj->bo_object;
 	KASSERT(obj->paging_in_progress >= bp->b_npages,
 	    ("vfs_vmio_iodone: paging in progress(%d) < b_npages(%d)",
 	    obj->paging_in_progress, bp->b_npages));
 
 	/* vp is consulted only by the assertions below. */
 	vp = bp->b_vp;
 	KASSERT(vp->v_holdcnt > 0,
 	    ("vfs_vmio_iodone: vnode %p has zero hold count", vp));
 	KASSERT(vp->v_object != NULL,
 	    ("vfs_vmio_iodone: vnode %p has no vm_object", vp));
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_vmio_iodone: bp %p has no buffer offset", bp));
 
 	bogus = false;
 	/* Bytes actually transferred by the I/O. */
 	iosize = bp->b_bcount - bp->b_resid;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		/*
 		 * resid is the number of bytes from foff to the next page
 		 * boundary, clamped to what is left of the transfer.
 		 */
 		resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff;
 		if (resid > iosize)
 			resid = iosize;
 
 		/*
 		 * cleanup bogus pages, restoring the originals
 		 */
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			bogus = true;
 			m = vm_page_lookup(obj, OFF_TO_IDX(foff));
 			if (m == NULL)
 				panic("biodone: page disappeared!");
 			bp->b_pages[i] = m;
 		} else if ((bp->b_iocmd == BIO_READ) && resid > 0) {
 			/*
 			 * In the write case, the valid and clean bits are
 			 * already changed correctly ( see bdwrite() ), so we
 			 * only need to do this here in the read case.
 			 */
 			KASSERT((m->dirty & vm_page_bits(foff & PAGE_MASK,
 			    resid)) == 0, ("vfs_vmio_iodone: page %p "
 			    "has unexpected dirty bits", m));
 			vfs_page_set_valid(bp, foff, m);
 		}
 		KASSERT(OFF_TO_IDX(foff) == m->pindex,
 		    ("vfs_vmio_iodone: foff(%jd)/pindex(%ju) mismatch",
 		    (intmax_t)foff, (uintmax_t)m->pindex));
 
 		vm_page_sunbusy(m);
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 		iosize -= resid;
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 	/*
 	 * If any bogus page was swapped back for the real one, re-enter
 	 * the whole page array into the buffer's KVA mapping.
 	 */
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  * Unwire a page held by a buf and place it on the appropriate vm queue.
  */
 static void
 vfs_vmio_unwire(struct buf *bp, vm_page_t m)
 {
 	bool released;
 
 	vm_page_lock(m);
 
 	/* Nothing more to do unless vm_page_unwire() reports true. */
 	if (!vm_page_unwire(m, PQ_NONE))
 		goto out;
 
 	/*
 	 * Try to free the page outright: a page with no valid data is
 	 * freed unless it is busied, and a valid page of a B_DIRECT
 	 * buffer is offered to vm_page_try_to_free().
 	 */
 	released = false;
 	if (m->valid == 0) {
 		if (!vm_page_busied(m)) {
 			vm_page_free(m);
 			released = true;
 		}
 	} else if ((bp->b_flags & B_DIRECT) != 0) {
 		released = vm_page_try_to_free(m);
 	}
 	if (released)
 		goto out;
 
 	/*
 	 * The page survived.  Tell the VM when it is unlikely to be
 	 * reused; otherwise keep LRU ordering by placing it at the tail
 	 * of the inactive queue.
 	 */
 	if ((bp->b_flags & B_NOREUSE) != 0)
 		vm_page_deactivate_noreuse(m);
 	else
 		vm_page_deactivate(m);
 out:
 	vm_page_unlock(m);
 }
 
 /*
  * Perform page invalidation when a buffer is released.  The fully invalid
  * pages will be reclaimed later in vfs_vmio_truncate().
  */
 static void
 vfs_vmio_invalidate(struct buf *bp)
 {
 	vm_object_t obj;
 	vm_page_t m;
 	int i, resid, poffset, presid;
 
 	/* Tear down the buffer's KVA mapping before releasing the pages. */
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
 	} else
 		BUF_CHECK_UNMAPPED(bp);
 	/*
 	 * Get the base offset and length of the buffer.  Note that
 	 * in the VMIO case if the buffer block size is not
 	 * page-aligned then b_data pointer may not be page-aligned.
 	 * But our b_pages[] array *IS* page aligned.
 	 *
 	 * block sizes less than DEV_BSIZE (usually 512) are not
 	 * supported due to the page granularity bits (m->valid,
 	 * m->dirty, etc...).
 	 *
 	 * See man buf(9) for more information
 	 */
 	obj = bp->b_bufobj->bo_object;
 	resid = bp->b_bufsize;
 	poffset = bp->b_offset & PAGE_MASK;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page)
 			panic("vfs_vmio_invalidate: Unexpected bogus page.");
 		bp->b_pages[i] = NULL;
 
 		/*
 		 * presid: bytes of this page covered by the buffer, i.e.
 		 * the lesser of the remaining length and the room left in
 		 * the page after poffset.
 		 */
 		presid = resid > (PAGE_SIZE - poffset) ?
 		    (PAGE_SIZE - poffset) : resid;
 		KASSERT(presid >= 0, ("brelse: extra page"));
 		/*
 		 * Wait out any exclusive busy holder.  The object lock is
 		 * dropped around the sleep and retaken afterwards, hence
 		 * the re-check in the loop condition.
 		 */
 		while (vm_page_xbusied(m)) {
 			vm_page_lock(m);
 			VM_OBJECT_WUNLOCK(obj);
 			vm_page_busy_sleep(m, "mbncsh", true);
 			VM_OBJECT_WLOCK(obj);
 		}
 		/* Invalidate only when the page has no wired mappings. */
 		if (pmap_page_wired_mappings(m) == 0)
 			vm_page_set_invalid(m, poffset, presid);
 		vfs_vmio_unwire(bp, m);
 		resid -= presid;
 		/* Only the first page can start at a non-zero offset. */
 		poffset = 0;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	bp->b_npages = 0;
 }
 
 /*
  * Page-granular truncation of an existing VMIO buffer.
  */
 static void
 vfs_vmio_truncate(struct buf *bp, int desiredpages)
 {
 	vm_object_t object;
 	vm_page_t pg;
 	int idx;
 
 	/* Already at the requested page count. */
 	if (desiredpages == bp->b_npages)
 		return;
 
 	/* Unmap only the tail of the KVA range that is being dropped. */
 	if (!buf_mapped(bp)) {
 		BUF_CHECK_UNMAPPED(bp);
 	} else {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qremove((vm_offset_t)trunc_page((vm_offset_t)bp->b_data) +
 		    (desiredpages << PAGE_SHIFT), bp->b_npages - desiredpages);
 	}
 
 	/* The object lock is taken only when the bufobj has an object. */
 	object = bp->b_bufobj->bo_object;
 	if (object != NULL)
 		VM_OBJECT_WLOCK(object);
 	idx = desiredpages;
 	while (idx < bp->b_npages) {
 		pg = bp->b_pages[idx];
 		KASSERT(pg != bogus_page, ("allocbuf: bogus page found"));
 		bp->b_pages[idx] = NULL;
 		vfs_vmio_unwire(bp, pg);
 		idx++;
 	}
 	if (object != NULL)
 		VM_OBJECT_WUNLOCK(object);
 	bp->b_npages = desiredpages;
 }
 
 /*
  * Byte granular extension of VMIO buffers.
  */
 static void
 vfs_vmio_extend(struct buf *bp, int desiredpages, int size)
 {
 	/*
 	 * We are growing the buffer, possibly in a
 	 * byte-granular fashion.
 	 *
 	 * desiredpages is the page count the buffer should end up with
 	 * and size is the new byte length used for the B_CACHE check in
 	 * step 2.
 	 */
 	vm_object_t obj;
 	vm_offset_t toff;
 	vm_offset_t tinc;
 	vm_page_t m;
 
 	/*
 	 * Step 1, bring in the VM pages from the object, allocating
 	 * them if necessary.  We must clear B_CACHE if these pages
 	 * are not valid for the range covered by the buffer.
 	 *
 	 * NOTE(review): the replacement (+) lines below no longer test
 	 * m->valid per page here; B_CACHE is only re-evaluated by
 	 * vfs_buf_test_cache() in step 2 -- confirm this is intended.
 	 */
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
-	while (bp->b_npages < desiredpages) {
+	if (bp->b_npages < desiredpages) {
 		/*
 		 * We must allocate system pages since blocking
 		 * here could interfere with paging I/O, no
 		 * matter which process we are.
 		 *
 		 * Only exclusive busy can be tested here.
 		 * Blocking on shared busy might lead to
 		 * deadlocks once allocbuf() is called after
 		 * pages are vfs_busy_pages().
 		 */
-		m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) + bp->b_npages,
-		    VM_ALLOC_NOBUSY | VM_ALLOC_SYSTEM |
-		    VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
-		    VM_ALLOC_COUNT(desiredpages - bp->b_npages));
-		if (m->valid == 0)
-			bp->b_flags &= ~B_CACHE;
-		bp->b_pages[bp->b_npages] = m;
-		++bp->b_npages;
+		vm_page_grab_pages(obj,
+		    OFF_TO_IDX(bp->b_offset) + bp->b_npages,
+		    VM_ALLOC_SYSTEM | VM_ALLOC_IGN_SBUSY |
+		    VM_ALLOC_NOBUSY | VM_ALLOC_WIRED,
+		    &bp->b_pages[bp->b_npages], desiredpages - bp->b_npages);
+		bp->b_npages = desiredpages;
 	}
 
 	/*
 	 * Step 2.  We've loaded the pages into the buffer,
 	 * we have to figure out if we can still have B_CACHE
 	 * set.  Note that B_CACHE is set according to the
 	 * byte-granular range ( bcount and size ), not the
 	 * aligned range ( newbsize ).
 	 *
 	 * The VM test is against m->valid, which is DEV_BSIZE
 	 * aligned.  Needless to say, the validity of the data
 	 * needs to also be DEV_BSIZE aligned.  Note that this
 	 * fails with NFS if the server or some other client
 	 * extends the file's EOF.  If our buffer is resized,
 	 * B_CACHE may remain set! XXX
 	 */
 	toff = bp->b_bcount;
 	/* First step runs only up to the next page boundary. */
 	tinc = PAGE_SIZE - ((bp->b_offset + toff) & PAGE_MASK);
 	while ((bp->b_flags & B_CACHE) && toff < size) {
 		vm_pindex_t pi;
 
 		if (tinc > (size - toff))
 			tinc = size - toff;
 		/* Index into b_pages[] of the page containing toff. */
 		pi = ((bp->b_offset & PAGE_MASK) + toff) >> PAGE_SHIFT;
 		m = bp->b_pages[pi];
 		vfs_buf_test_cache(bp, bp->b_offset, toff, tinc, m);
 		toff += tinc;
 		tinc = PAGE_SIZE;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 
 	/*
 	 * Step 3, fixup the KVA pmap.
 	 */
 	if (buf_mapped(bp))
 		bpmap_qenter(bp);
 	else
 		BUF_CHECK_UNMAPPED(bp);
 }
 
 /*
  * Check to see if a block at a particular lbn is available for a clustered
  * write.
  */
 static int
 vfs_bio_clcheck(struct vnode *vp, int size, daddr_t lblkno, daddr_t blkno)
 {
 	struct buf *cand;
 	int usable;
 
 	/* A block that is not in core cannot join the cluster. */
 	cand = gbincore(&vp->v_bufobj, lblkno);
 	if (cand == NULL)
 		return (0);
 
 	/* Never wait for a busy buffer. */
 	if (BUF_LOCK(cand, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 		return (0);
 
 	/*
 	 * The candidate qualifies when it is a valid, clusterable delayed
 	 * write of the expected size, has been mapped (b_blkno differs
 	 * from b_lblkno) and sits at the expected place on disk.
 	 */
 	usable = (cand->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
 	    (B_DELWRI | B_CLUSTEROK) &&
 	    cand->b_bufsize == size &&
 	    cand->b_blkno != cand->b_lblkno &&
 	    cand->b_blkno == blkno;
 
 	BUF_UNLOCK(cand);
 	return (usable);
 }
 
 /*
  *	vfs_bio_awrite:
  *
  *	Implement clustered async writes for clearing out B_DELWRI buffers.
  *	This is much better than the old way of writing only one buffer at
  *	a time.  Note that we may not be presented with the buffers in the 
  *	correct order, so we search for the cluster in both directions.
  */
 int
 vfs_bio_awrite(struct buf *bp)
 {
 	struct bufobj *bo;
 	int i;
 	int j;
 	daddr_t lblkno = bp->b_lblkno;
 	struct vnode *vp = bp->b_vp;
 	int ncl;
 	int nwritten;
 	int size;
 	int maxcl;
 	int gbflags;
 
 	bo = &vp->v_bufobj;
 	/* Pass GB_UNMAPPED on to the cluster code for unmapped buffers. */
 	gbflags = (bp->b_data == unmapped_buf) ? GB_UNMAPPED : 0;
 	/*
 	 * right now we support clustered writing only to regular files.  If
 	 * we find a clusterable block we could be in the middle of a cluster
 	 * rather than at the beginning.
 	 */
 	if ((vp->v_type == VREG) && 
 	    (vp->v_mount != 0) && /* Only on nodes that have the size info */
 	    (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
 
 		size = vp->v_mount->mnt_stat.f_iosize;
 		maxcl = MAXPHYS / size;
 
 		BO_RLOCK(bo);
 		/* Scan forward: i ends as 1 + number of following matches. */
 		for (i = 1; i < maxcl; i++)
 			if (vfs_bio_clcheck(vp, size, lblkno + i,
 			    bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
 				break;
 
 		/*
 		 * Scan backward, bounded by the cluster limit and by block
 		 * 0.  j overshoots by one and is corrected below.
 		 */
 		for (j = 1; i + j <= maxcl && j <= lblkno; j++) 
 			if (vfs_bio_clcheck(vp, size, lblkno - j,
 			    bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
 				break;
 		BO_RUNLOCK(bo);
 		--j;
 		ncl = i + j;
 		/*
 		 * this is a possible cluster write
 		 */
 		if (ncl != 1) {
 			/* bp is unlocked before cluster_wbuild() is called. */
 			BUF_UNLOCK(bp);
 			nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
 			    gbflags);
 			return (nwritten);
 		}
 	}
 	bremfree(bp);
 	bp->b_flags |= B_ASYNC;
 	/*
 	 * default (old) behavior, writing out only one block
 	 *
 	 * XXX returns b_bufsize instead of b_bcount for nwritten?
 	 */
 	nwritten = bp->b_bufsize;
 	(void) bwrite(bp);
 
 	return (nwritten);
 }
 
 /*
  *	getnewbuf_kva:
  *
  *	Allocate KVA for an empty buf header according to gbflags.
  */
 static int
 getnewbuf_kva(struct buf *bp, int gbflags, int maxsize)
 {
 	int kvasize;
 
 	/* GB_UNMAPPED without GB_KVAALLOC: no KVA is wanted at all. */
 	if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED)
 		return (0);
 
 	/*
 	 * In order to keep fragmentation sane we only allocate kva
 	 * in BKVASIZE chunks.  XXX with vmem we can do page size.
 	 */
 	kvasize = (maxsize + BKVAMASK) & ~BKVAMASK;
 
 	/* KVA of exactly the rounded size is already attached. */
 	if (kvasize == bp->b_kvasize)
 		return (0);
 	if (bufkva_alloc(bp, kvasize, gbflags) != 0)
 		return (ENOSPC);
 	return (0);
 }
 
 /*
  *	getnewbuf:
  *
  *	Find and initialize a new buffer header, freeing up existing buffers
  *	in the bufqueues as necessary.  The new buffer is returned locked.
  *
  *	We block if:
  *		We have insufficient buffer headers
  *		We have insufficient buffer space
  *		buffer_arena is too fragmented ( space reservation fails )
  *		If we have to flush dirty buffers ( but we try to avoid this )
  *
  *	The caller is responsible for releasing the reserved bufspace after
  *	allocbuf() is called.
  */
 static struct buf *
 getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int maxsize, int gbflags)
 {
 	struct buf *bp;
 	bool metadata, reserved;
 
 	bp = NULL;
 	KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	/* Unmapped buffers disabled: drop the unmapped-related flags. */
 	if (!unmapped_buf_allowed)
 		gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	/*
 	 * Requests with no vnode, for VV_MD/VV_SYSTEM vnodes, or for
 	 * character devices are passed to bufspace_reserve() as metadata.
 	 */
 	if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
 	    vp->v_type == VCHR)
 		metadata = true;
 	else
 		metadata = false;
 	atomic_add_int(&getnewbufcalls, 1);
 	reserved = false;
 	/*
 	 * Each "continue" jumps to the loop condition, so a failed step
 	 * is retried only while buf_scan(false) returns 0.  A KVA failure
 	 * leaves the loop at once with bp still held.
 	 */
 	do {
 		if (reserved == false &&
 		    bufspace_reserve(maxsize, metadata) != 0)
 			continue;
 		reserved = true;
 		if ((bp = buf_alloc()) == NULL)
 			continue;
 		if (getnewbuf_kva(bp, gbflags, maxsize) == 0)
 			return (bp);
 		break;
 	} while(buf_scan(false) == 0);
 
 	/*
 	 * Failure path: give back the space reservation, release any
 	 * half-built buffer as invalid, wait in bufspace_wait() and
 	 * return NULL to the caller.
 	 */
 	if (reserved)
 		atomic_subtract_long(&bufspace, maxsize);
 	if (bp != NULL) {
 		bp->b_flags |= B_INVAL;
 		brelse(bp);
 	}
 	bufspace_wait(vp, gbflags, slpflag, slptimeo);
 
 	return (NULL);
 }
 
 /*
  *	buf_daemon:
  *
  *	buffer flushing daemon.  Buffers are normally flushed by the
  *	update daemon but if it cannot keep up this process starts to
  *	take the load in an attempt to prevent getnewbuf() from blocking.
  */
 /*
  * Descriptor for the buffer daemon kernel process, initialized with the
  * string "bufdaemon", the buf_daemon() function and the address of
  * bufdaemonproc.
  * NOTE(review): field meanings follow struct kproc_desc, which is
  * declared elsewhere -- confirm against its definition.
  */
 static struct kproc_desc buf_kp = {
 	"bufdaemon",
 	buf_daemon,
 	&bufdaemonproc
 };
 /* Have kproc_start() invoked with &buf_kp at SI_SUB_KTHREAD_BUF. */
 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
 
 static int
 buf_flush(struct vnode *vp, int target)
 {
 	int nflushed;
 
 	/* First pass: skip buffers whose dependencies need rollbacks. */
 	nflushed = flushbufqueues(vp, target, 0);
 	if (nflushed != 0)
 		return (nflushed);
 
 	/*
 	 * Could not find any buffers without rollback
 	 * dependencies, so just write the first one
 	 * in the hopes of eventually making progress.
 	 * The result of this second pass is not reported; the
 	 * caller still sees zero.
 	 */
 	if (vp != NULL && target > 2)
 		target /= 2;
 	(void)flushbufqueues(vp, target, 1);
 	return (0);
 }
 
 static void
 buf_daemon(void)
 {
 	int lodirty;
 
 	/*
 	 * Main loop of the buffer daemon.  It never returns: each
 	 * iteration flushes dirty buffers down towards a low-water mark
 	 * and then sleeps on bd_request under bdlock.
 	 */
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
 	 */
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, bufdaemonproc,
 	    SHUTDOWN_PRI_LAST);
 
 	/*
 	 * This process is allowed to take the buffer cache to the limit
 	 */
 	curthread->td_pflags |= TDP_NORUNNINGBUF | TDP_BUFNEED;
 	mtx_lock(&bdlock);
 	for (;;) {
 		bd_request = 0;
 		mtx_unlock(&bdlock);
 
 		kproc_suspend_check(bufdaemonproc);
 		/*
 		 * A pending speedup request lowers this pass's target to
 		 * half of the current dirty count and is consumed here.
 		 */
 		lodirty = lodirtybuffers;
 		if (bd_speedupreq) {
 			lodirty = numdirtybuffers / 2;
 			bd_speedupreq = 0;
 		}
 		/*
 		 * Do the flush.  Limit the amount of in-transit I/O we
 		 * allow to build up, otherwise we would completely saturate
 		 * the I/O system.
 		 */
 		while (numdirtybuffers > lodirty) {
 			if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
 				break;
 			kern_yield(PRI_USER);
 		}
 
 		/*
 		 * Only clear bd_request if we have reached our low water
 		 * mark.  The buf_daemon normally waits 1 second and
 		 * then incrementally flushes any dirty buffers that have
 		 * built up, within reason.
 		 *
 		 * If we were unable to hit our low water mark and couldn't
 		 * find any flushable buffers, we sleep for a short period
 		 * to avoid endless loops on unlockable buffers.
 		 */
 		mtx_lock(&bdlock);
 		if (numdirtybuffers <= lodirtybuffers) {
 			/*
 			 * We reached our low water mark, reset the
 			 * request and sleep until we are needed again.
 			 * The sleep is just so the suspend code works.
 			 */
 			bd_request = 0;
 			/*
 			 * Do an extra wakeup in case dirty threshold
 			 * changed via sysctl and the explicit transition
 			 * out of shortfall was missed.
 			 */
 			bdirtywakeup();
 			if (runningbufspace <= lorunningspace)
 				runningwakeup();
 			msleep(&bd_request, &bdlock, PVM, "psleep", hz);
 		} else {
 			/*
 			 * We couldn't find any flushable dirty buffers but
 			 * still have too many dirty buffers, we
 			 * have to sleep and try again.  (rare)
 			 */
 			msleep(&bd_request, &bdlock, PVM, "qsleep", hz / 10);
 		}
 	}
 }
 
 /*
  *	flushbufqueues:
  *
  *	Try to flush a buffer in the dirty queue.  We must be careful to
  *	free up B_INVAL buffers instead of writing them, which NFS is
  *	particularly sensitive to.
  */
 /*
  * Exported read-write as the vfs.flushwithdeps sysctl; flushbufqueues()
  * adds to it for each buffer it writes that still has dependencies
  * needing rollbacks.
  */
 static int flushwithdeps = 0;
 SYSCTL_INT(_vfs, OID_AUTO, flushwithdeps, CTLFLAG_RW, &flushwithdeps,
     0, "Number of buffers flushed with dependencies that require rollbacks");
 
 static int
 flushbufqueues(struct vnode *lvp, int target, int flushdeps)
 {
 	struct buf *sentinel;
 	struct vnode *vp;
 	struct mount *mp;
 	struct buf *bp;
 	int hasdeps;
 	int flushed;
 	int queue;
 	int error;
 	bool unlock;
 
 	/*
 	 * lvp:       when non-NULL, only buffers of this vnode are flushed.
 	 * target:    stop once this many buffers have been flushed.
 	 * flushdeps: when zero, buffers for which buf_countdeps() reports
 	 *            dependencies are skipped.
 	 * Returns the number of buffers flushed (written or released).
 	 */
 	flushed = 0;
 	queue = QUEUE_DIRTY;
 	bp = NULL;
 	/*
 	 * A private sentinel entry marks our position in the dirty queue,
 	 * so the queue lock can be dropped while each buffer is processed.
 	 */
 	sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
 	sentinel->b_qindex = QUEUE_SENTINEL;
 	mtx_lock(&bqlocks[queue]);
 	TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqlocks[queue]);
 	while (flushed != target) {
 		maybe_yield();
 		mtx_lock(&bqlocks[queue]);
 		/*
 		 * Step the sentinel past the next entry; stop when the
 		 * sentinel is already last in the queue.
 		 */
 		bp = TAILQ_NEXT(sentinel, b_freelist);
 		if (bp != NULL) {
 			TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 			TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
 			    b_freelist);
 		} else {
 			mtx_unlock(&bqlocks[queue]);
 			break;
 		}
 		/*
 		 * Skip sentinels inserted by other invocations of the
 		 * flushbufqueues(), taking care to not reorder them.
 		 *
 		 * Only flush the buffers that belong to the
 		 * vnode locked by the curthread.
 		 */
 		if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
 		    bp->b_vp != lvp)) {
 			mtx_unlock(&bqlocks[queue]);
  			continue;
 		}
 		/* Try-lock the buffer while the queue lock is still held. */
 		error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
 		mtx_unlock(&bqlocks[queue]);
 		if (error != 0)
 			continue;
 		/* Buffers with a non-zero pin count are skipped. */
 		if (bp->b_pin_count > 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/*
 		 * BKGRDINPROG can only be set with the buf and bufobj
 		 * locks both held.  We tolerate a race to clear it here.
 		 */
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		/* Invalid buffers are released rather than written. */
 		if (bp->b_flags & B_INVAL) {
 			bremfreef(bp);
 			brelse(bp);
 			flushed++;
 			continue;
 		}
 
 		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
 			}
 			hasdeps = 1;
 		} else
 			hasdeps = 0;
 		/*
 		 * We must hold the lock on a vnode before writing
 		 * one of its buffers. Otherwise we may confuse, or
 		 * in the case of a snapshot vnode, deadlock the
 		 * system.
 		 *
 		 * The lock order here is the reverse of the normal
 		 * of vnode followed by buf lock.  This is ok because
 		 * the NOWAIT will prevent deadlock.
 		 */
 		vp = bp->b_vp;
 		if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			BUF_UNLOCK(bp);
 			continue;
 		}
 		if (lvp == NULL) {
 			/* We take the vnode lock here and drop it below. */
 			unlock = true;
 			error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 		} else {
 			/*
 			 * The vnode is asserted to be locked already; try
 			 * an upgrade if it is not held exclusively, and
 			 * leave the lock with its owner afterwards.
 			 */
 			ASSERT_VOP_LOCKED(vp, "getbuf");
 			unlock = false;
 			error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
 			    vn_lock(vp, LK_TRYUPGRADE);
 		}
 		if (error == 0) {
 			CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
 			    bp, bp->b_vp, bp->b_flags);
 			/*
 			 * The buffer daemon uses vfs_bio_awrite(); any
 			 * other caller does a plain bwrite().
 			 */
 			if (curproc == bufdaemonproc) {
 				vfs_bio_awrite(bp);
 			} else {
 				bremfree(bp);
 				bwrite(bp);
 				notbufdflushes++;
 			}
 			vn_finished_write(mp);
 			if (unlock)
 				VOP_UNLOCK(vp, 0);
 			flushwithdeps += hasdeps;
 			flushed++;
 
 			/*
 			 * Sleeping on runningbufspace while holding
 			 * vnode lock leads to deadlock.
 			 */
 			if (curproc == bufdaemonproc &&
 			    runningbufspace > hirunningspace)
 				waitrunningbufspace();
 			continue;
 		}
 		/* Vnode lock not obtained: back out and move on. */
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
 	/* Remove and free the sentinel before returning. */
 	mtx_lock(&bqlocks[queue]);
 	TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
 	mtx_unlock(&bqlocks[queue]);
 	free(sentinel, M_TEMP);
 	return (flushed);
 }
 
 /*
  * Check to see if a block is currently memory resident.
  */
 struct buf *
 incore(struct bufobj *bo, daddr_t blkno)
 {
 	struct buf *found;
 
 	/*
 	 * Look the block up under the bufobj read lock.  The buffer is
 	 * returned without being locked.
 	 */
 	BO_RLOCK(bo);
 	found = gbincore(bo, blkno);
 	BO_RUNLOCK(bo);
 
 	return (found);
 }
 
 /*
  * Returns true if no I/O is needed to access the
  * associated VM object.  This is like incore except
  * it also hunts around in the VM system for the data.
  */
 
 static int
 inmem(struct vnode * vp, daddr_t blkno)
 {
 	vm_object_t obj;
 	vm_offset_t toff, tinc, size;
 	vm_page_t pg;
 	vm_ooffset_t off;
 	int resident;
 
 	ASSERT_VOP_LOCKED(vp, "inmem");
 
 	/* A buffer already in core answers the question at once. */
 	if (incore(&vp->v_bufobj, blkno) != NULL)
 		return (1);
 	if (vp->v_mount == NULL)
 		return (0);
 	obj = vp->v_object;
 	if (obj == NULL)
 		return (0);
 
 	/* Step size: the smaller of a page and the filesystem I/O size. */
 	size = PAGE_SIZE;
 	if (size > vp->v_mount->mnt_stat.f_iosize)
 		size = vp->v_mount->mnt_stat.f_iosize;
 	off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
 
 	/*
 	 * Walk the block; every piece must be backed by a page of the
 	 * object that is valid over that piece.
 	 */
 	resident = 1;
 	VM_OBJECT_RLOCK(obj);
 	toff = 0;
 	while (toff < vp->v_mount->mnt_stat.f_iosize) {
 		pg = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
 		if (pg == NULL) {
 			resident = 0;
 			break;
 		}
 		/* Never let a step cross a page boundary. */
 		tinc = size;
 		if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK))
 			tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK);
 		if (vm_page_is_valid(pg,
 		    (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) {
 			resident = 0;
 			break;
 		}
 		toff += tinc;
 	}
 	VM_OBJECT_RUNLOCK(obj);
 
 	return (resident);
 }
 
 /*
  * Set the dirty range for a buffer based on the status of the dirty
  * bits in the pages comprising the buffer.  The range is limited
  * to the size of the buffer.
  *
  * Tell the VM system that the pages associated with this buffer
  * are clean.  This is used for delayed writes where the data is
  * going to go to disk eventually without additional VM intervention.
  *
  * Note that while we only really need to clean through to b_bcount, we
  * just go ahead and clean through to b_bufsize.
  */
 static void
 vfs_clean_pages_dirty_buf(struct buf *bp)
 {
 	vm_ooffset_t foff;
 	vm_page_t m;
 	int i;
 
 	/* Only VMIO buffers that have storage carry pages to clean. */
 	if ((bp->b_flags & B_VMIO) == 0 || bp->b_bufsize == 0)
 		return;
 
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_clean_pages_dirty_buf: no buffer offset"));
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	vfs_drain_busy_pages(bp);
 	/* Update the buffer's dirty range before the pages are cleaned. */
 	vfs_setdirty_locked_object(bp);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		vfs_page_set_validclean(bp, foff, m);
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  * Merge the range spanned by the buffer's dirty pages into
  * b_dirtyoff/b_dirtyend.  The VM object must be write-locked by the
  * caller, as asserted below.
  */
 static void
 vfs_setdirty_locked_object(struct buf *bp)
 {
 	vm_object_t object;
 	int i;
 
 	object = bp->b_bufobj->bo_object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * We qualify the scan for modified pages on whether the
 	 * object has been flushed yet.
 	 */
 	if ((object->flags & OBJ_MIGHTBEDIRTY) != 0) {
 		vm_offset_t boffset;
 		vm_offset_t eoffset;
 
 		/*
 		 * test the pages to see if they have been modified directly
 		 * by users through the VM system.
 		 */
 		for (i = 0; i < bp->b_npages; i++)
 			vm_page_test_dirty(bp->b_pages[i]);
 
 		/*
 		 * Calculate the encompassing dirty range, boffset and eoffset,
 		 * (eoffset - boffset) bytes.
 		 */
 
 		/* Index of the first dirty page (b_npages if none). */
 		for (i = 0; i < bp->b_npages; i++) {
 			if (bp->b_pages[i]->dirty)
 				break;
 		}
 		/*
 		 * NOTE(review): boffset is unsigned; if i == 0 and b_offset
 		 * is not page-aligned this subtraction appears to wrap to a
 		 * large value, which would make the merge test below fail --
 		 * confirm whether that case can occur.
 		 */
 		boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/* Index of the last dirty page (-1 if none). */
 		for (i = bp->b_npages - 1; i >= 0; --i) {
 			if (bp->b_pages[i]->dirty) {
 				break;
 			}
 		}
 		eoffset = ((i + 1) << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK);
 
 		/*
 		 * Fit it to the buffer.
 		 */
 
 		if (eoffset > bp->b_bcount)
 			eoffset = bp->b_bcount;
 
 		/*
 		 * If we have a good dirty range, merge with the existing
 		 * dirty range.
 		 */
 
 		if (boffset < eoffset) {
 			if (bp->b_dirtyoff > boffset)
 				bp->b_dirtyoff = boffset;
 			if (bp->b_dirtyend < eoffset)
 				bp->b_dirtyend = eoffset;
 		}
 	}
 }
 
 /*
  * Allocate the KVA mapping for an existing buffer.
  * If an unmapped buffer is provided but a mapped buffer is requested, also
  * take care to properly set up the mappings between pages and KVA.
  */
 static void
 bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
 {
 	int bsize, maxsize, need_mapping, need_kva;
 	off_t offset;
 
 	/*
 	 * need_mapping: the buffer is unmapped but the caller did not ask
 	 *               for an unmapped buffer.
 	 * need_kva:     the buffer has neither mapping nor reserved KVA
 	 *               and the caller asked for a KVA reservation.
 	 */
 	need_mapping = bp->b_data == unmapped_buf &&
 	    (gbflags & GB_UNMAPPED) == 0;
 	need_kva = bp->b_kvabase == unmapped_buf &&
 	    bp->b_data == unmapped_buf &&
 	    (gbflags & GB_KVAALLOC) != 0;
 	if (!need_mapping && !need_kva)
 		return;
 
 	BUF_CHECK_UNMAPPED(bp);
 
 	if (need_mapping && bp->b_kvabase != unmapped_buf) {
 		/*
 		 * Buffer is not mapped, but the KVA was already
 		 * reserved at the time of the instantiation.  Use the
 		 * allocated space.
 		 */
 		goto has_addr;
 	}
 
 	/*
 	 * Calculate the amount of the address space we would reserve
 	 * if the buffer was mapped.
 	 */
 	bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
 	KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 	offset = blkno * bsize;
 	maxsize = size + (offset & PAGE_MASK);
 	maxsize = imax(maxsize, bsize);
 
 	/*
 	 * Retry the KVA allocation until it succeeds, waiting for buffer
 	 * space between attempts.  A failure with GB_NOWAIT_BD set
 	 * panics instead of waiting.
 	 */
 	while (bufkva_alloc(bp, maxsize, gbflags) != 0) {
 		if ((gbflags & GB_NOWAIT_BD) != 0) {
 			/*
 			 * XXXKIB: defragmentation cannot
 			 * succeed, not sure what else to do.
 			 */
 			panic("GB_NOWAIT_BD and GB_UNMAPPED %p", bp);
 		}
 		atomic_add_int(&mappingrestarts, 1);
 		bufspace_wait(bp->b_vp, gbflags, 0, 0);
 	}
 has_addr:
 	if (need_mapping) {
 		/* b_offset is handled by bpmap_qenter. */
 		bp->b_data = bp->b_kvabase;
 		BUF_CHECK_MAPPED(bp);
 		bpmap_qenter(bp);
 	}
 }
 
 /*
  *	getblk:
  *
  *	Get a block given a specified block and offset into a file/device.
  *	The buffer's B_DONE bit will be cleared on return, making it almost
  *	ready for an I/O initiation.  B_INVAL may or may not be set on
  *	return.  The caller should clear B_INVAL prior to initiating a
  *	READ.
  *
  *	For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for
  *	an existing buffer.
  *
  *	For a VMIO buffer, B_CACHE is modified according to the backing VM.
  *	If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set
  *	and then cleared based on the backing VM.  If the previous buffer is
  *	non-0-sized but invalid, B_CACHE will be cleared.
  *
  *	If getblk() must create a new buffer, the new buffer is returned with
  *	both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which
  *	case it is returned with B_INVAL clear and B_CACHE set based on the
  *	backing VM.
  *
  *	getblk() also forces a bwrite() for any B_DELWRI buffer whose
  *	B_CACHE bit is clear.
  *
  *	What this means, basically, is that the caller should use B_CACHE to
  *	determine whether the buffer is fully valid or not and should clear
  *	B_INVAL prior to issuing a read.  If the caller intends to validate
  *	the buffer by loading its data area with something, the caller needs
  *	to clear B_INVAL.  If the caller does this without issuing an I/O,
  *	the caller should set B_CACHE ( as an optimization ), else the caller
  *	should issue the I/O and biodone() will set B_CACHE if the I/O was
  *	a write attempt or if it was a successful read.  If the caller
  *	intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
  *	prior to issuing the READ.  biodone() will *not* clear B_INVAL.
  *
  *	Arguments:
  *	  vp		vnode whose bufobj is searched; must be locked.
  *	  blkno		logical block number of the wanted buffer.
  *	  size		requested byte count; panics if > maxbcachebuf.
  *	  slpflag,
  *	  slptimeo	passed to the buffer lock and to getnewbuf().
  *	  flags		GB_* flags.  GB_UNMAPPED and GB_KVAALLOC are
  *			stripped when unmapped buffers are not allowed or
  *			when a new non-VMIO buffer is created.
  *
  *	Returns the locked buffer, or NULL when:
  *	  - locking an in-core buffer fails with anything but ENOLCK
  *	    (timeout, interruption, or LK_NOWAIT via GB_LOCK_NOWAIT);
  *	  - a pinned B_DELWRI buffer of the wrong size is found and
  *	    GB_LOCK_NOWAIT is set;
  *	  - the buffer is not in core and GB_NOCREAT is set;
  *	  - the buffer is not in core, there are no free buffers, and the
  *	    caller is the idle thread;
  *	  - getnewbuf() fails while slpflag or slptimeo is non-zero.
  */
 struct buf *
 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo,
     int flags)
 {
 	struct buf *bp;
 	struct bufobj *bo;
 	int bsize, error, maxsize, vmio;
 	off_t offset;
 
 	CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
 	KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
 	    ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
 	ASSERT_VOP_LOCKED(vp, "getblk");
 	if (size > maxbcachebuf)
 		panic("getblk: size(%d) > maxbcachebuf(%d)\n", size,
 		    maxbcachebuf);
 	if (!unmapped_buf_allowed)
 		flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 
 	bo = &vp->v_bufobj;
 	/*
 	 * Every path that releases or rewrites a buffer it cannot use
 	 * restarts the lookup from here.
 	 */
 loop:
 	BO_RLOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
 		/*
 		 * Buffer is in-core.  If the buffer is not busy nor managed,
 		 * it must be on a queue.
 		 */
 		lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
 
 		if (flags & GB_LOCK_NOWAIT)
 			lockflags |= LK_NOWAIT;
 
 		/* LK_INTERLOCK: the bufobj lock is handed to the lock call. */
 		error = BUF_TIMELOCK(bp, lockflags,
 		    BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
 
 		/*
 		 * If we slept and got the lock we have to restart in case
 		 * the buffer changed identities.
 		 */
 		if (error == ENOLCK)
 			goto loop;
 		/* We timed out or were interrupted. */
 		else if (error)
 			return (NULL);
 		/* If recursed, assume caller knows the rules. */
 		else if (BUF_LOCKRECURSED(bp))
 			goto end;
 
 		/*
 		 * The buffer is locked.  B_CACHE is cleared if the buffer is
 		 * invalid.  Otherwise, for a non-VMIO buffer, B_CACHE is set
 		 * and for a VMIO buffer B_CACHE is adjusted according to the
 		 * backing VM cache.
 		 */
 		if (bp->b_flags & B_INVAL)
 			bp->b_flags &= ~B_CACHE;
 		else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
 			bp->b_flags |= B_CACHE;
 		if (bp->b_flags & B_MANAGED)
 			MPASS(bp->b_qindex == QUEUE_NONE);
 		else
 			bremfree(bp);
 
 		/*
 		 * Check for size inconsistencies.  A non-VMIO buffer of the
 		 * wrong size, or a VMIO buffer whose KVA is too small for
 		 * the request, cannot be resized in place: get rid of it and
 		 * restart the lookup.
 		 */
 		if (bp->b_bcount != size) {
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
 					/*
 					 * If the buffer is pinned and the
 					 * caller does not want to sleep
 					 * waiting for it to be unpinned,
 					 * bail out.
 					 */
 					if (bp->b_pin_count > 0) {
 						if (flags & GB_LOCK_NOWAIT) {
 							bqrelse(bp);
 							return (NULL);
 						} else {
 							bunpin_wait(bp);
 						}
 					}
 					/* Dirty: write it out and discard. */
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
 					/*
 					 * Clean: release it outright unless
 					 * it still has dependencies attached,
 					 * in which case it is written first.
 					 */
 					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
 						bp->b_flags |= B_NOCACHE;
 						bwrite(bp);
 					}
 				}
 				goto loop;
 			}
 		}
 
 		/*
 		 * Handle the case of unmapped buffer which should
 		 * become mapped, or the buffer for which KVA
 		 * reservation is requested.
 		 */
 		bp_unmapped_get_kva(bp, blkno, size, flags);
 
 		/*
 		 * If the size is inconsistent in the VMIO case, we can resize
 		 * the buffer.  This might lead to B_CACHE getting set or
 		 * cleared.  If the size has not changed, B_CACHE remains
 		 * unchanged from its previous state.
 		 */
 		allocbuf(bp, size);
 
 		KASSERT(bp->b_offset != NOOFFSET,
 		    ("getblk: no buffer offset"));
 
 		/*
 		 * A buffer with B_DELWRI set and B_CACHE clear must
 		 * be committed before we can return the buffer in
 		 * order to prevent the caller from issuing a read
 		 * ( due to B_CACHE not being set ) and overwriting
 		 * it.
 		 *
 		 * Most callers, including NFS and FFS, need this to
 		 * operate properly either because they assume they
 		 * can issue a read if B_CACHE is not set, or because
 		 * ( for example ) an uncached B_DELWRI might loop due
 		 * to softupdates re-dirtying the buffer.  In the latter
 		 * case, B_CACHE is set after the first write completes,
 		 * preventing further loops.
 		 * NOTE!  b*write() sets B_CACHE.  If we cleared B_CACHE
 		 * above while extending the buffer, we cannot allow the
 		 * buffer to remain with B_CACHE set after the write
 		 * completes or it will represent a corrupt state.  To
 		 * deal with this we set B_NOCACHE to scrap the buffer
 		 * after the write.
 		 *
 		 * We might be able to do something fancy, like setting
 		 * B_CACHE in bwrite() except if B_DELWRI is already set,
 		 * so the below call doesn't set B_CACHE, but that gets real
 		 * confusing.  This is much easier.
 		 */
 
 		if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) {
 			bp->b_flags |= B_NOCACHE;
 			bwrite(bp);
 			goto loop;
 		}
 		bp->b_flags &= ~B_DONE;
 	} else {
 		/*
 		 * Buffer is not in-core, create new buffer.  The buffer
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
 		BO_RUNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
 		 */
 		if (flags & GB_NOCREAT)
 			return NULL;
 		/* Give up if no buffers are free and we are the idle thread. */
 		if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
 			return NULL;
 
 		/*
 		 * Size the KVA reservation: for VMIO buffers include the
 		 * offset of the block within its first page; in all cases
 		 * reserve at least one block.
 		 */
 		bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
 		KASSERT(bsize != 0, ("bsize == 0, check bo->bo_bsize"));
 		offset = blkno * bsize;
 		vmio = vp->v_object != NULL;
 		if (vmio) {
 			maxsize = size + (offset & PAGE_MASK);
 		} else {
 			maxsize = size;
 			/* Do not allow non-VMIO notmapped buffers. */
 			flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
 		}
 		maxsize = imax(maxsize, bsize);
 
 		bp = getnewbuf(vp, slpflag, slptimeo, maxsize, flags);
 		if (bp == NULL) {
 			if (slpflag || slptimeo)
 				return NULL;
 			/*
 			 * XXX This is here until the sleep path is diagnosed
 			 * enough to work under very low memory conditions.
 			 *
 			 * There's an issue on low memory, 4BSD+non-preempt
 			 * systems (eg MIPS routers with 32MB RAM) where buffer
 			 * exhaustion occurs without sleeping for buffer
 			 * reclamation.  This just sticks in a loop and
 			 * constantly attempts to allocate a buffer, which
 			 * hits exhaustion and tries to wakeup bufdaemon.
 			 * This never happens because we never yield.
 			 *
 			 * The real solution is to identify and fix these cases
 			 * so we aren't effectively busy-waiting in a loop
 			 * until the reclamation path has cycles to run.
 			 */
 			kern_yield(PRI_USER);
 			goto loop;
 		}
 
 		/*
 		 * This code is used to make sure that a buffer is not
 		 * created while the getnewbuf routine is blocked.
 		 * This can be a problem whether the vnode is locked or not.
 		 * If the buffer is created out from under us, we have to
 		 * throw away the one we just created.
 		 *
 		 * Note: this must occur before we associate the buffer
 		 * with the vp especially considering limitations in
 		 * the splay tree implementation when dealing with duplicate
 		 * lblkno's.
 		 */
 		BO_LOCK(bo);
 		if (gbincore(bo, blkno)) {
 			BO_UNLOCK(bo);
 			bp->b_flags |= B_INVAL;
 			brelse(bp);
 			bufspace_release(maxsize);
 			goto loop;
 		}
 
 		/*
 		 * Insert the buffer into the hash, so that it can
 		 * be found by incore.
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bp->b_offset = offset;
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
 		/*
 		 * set B_VMIO bit.  allocbuf() the buffer bigger.  Since the
 		 * buffer size starts out as 0, B_CACHE will be set by
 		 * allocbuf() for the VMIO case prior to it testing the
 		 * backing store for validity.
 		 */
 
 		if (vmio) {
 			bp->b_flags |= B_VMIO;
 			KASSERT(vp->v_object == bp->b_bufobj->bo_object,
 			    ("ARGH! different b_bufobj->bo_object %p %p %p\n",
 			    bp, vp->v_object, bp->b_bufobj->bo_object));
 		} else {
 			bp->b_flags &= ~B_VMIO;
 			KASSERT(bp->b_bufobj->bo_object == NULL,
 			    ("ARGH! has b_bufobj->bo_object %p %p\n",
 			    bp, bp->b_bufobj->bo_object));
 			BUF_CHECK_MAPPED(bp);
 		}
 
 		allocbuf(bp, size);
 		bufspace_release(maxsize);
 		bp->b_flags &= ~B_DONE;
 	}
 	CTR4(KTR_BUF, "getblk(%p, %ld, %d) = %p", vp, (long)blkno, size, bp);
 	BUF_ASSERT_HELD(bp);
 	/* The recursive-lock path jumps here, skipping the trace above. */
 end:
 	KASSERT(bp->b_bufobj == bo,
 	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
 	return (bp);
 }
 
 /*
  * Get an empty buffer of the given size that is not associated with any
  * vnode.  The buffer is returned held and marked B_INVAL.
  *
  * Returns NULL only when getnewbuf() fails, GB_NOWAIT_BD is set in flags
  * and the current thread has TDP_BUFNEED set; otherwise the allocation is
  * retried until it succeeds.
  */
 struct buf *
 geteblk(int size, int flags)
 {
 	struct buf *nbp;
 	int kvasize;
 
 	/* Round the request up to the KVA allocation granule. */
 	kvasize = (size + BKVAMASK) & ~BKVAMASK;
 	for (;;) {
 		nbp = getnewbuf(NULL, 0, 0, kvasize, flags);
 		if (nbp != NULL)
 			break;
 		if ((flags & GB_NOWAIT_BD) != 0 &&
 		    (curthread->td_pflags & TDP_BUFNEED) != 0)
 			return (NULL);
 	}
 	allocbuf(nbp, size);
 	bufspace_release(kvasize);
 	/* b_dep was already cleared by getnewbuf(). */
 	nbp->b_flags |= B_INVAL;
 	BUF_ASSERT_HELD(nbp);
 	return (nbp);
 }
 
 /*
  * Shrink the backing store of a non-VMIO buffer to newbsize bytes.
  *
  * Page-backed buffers give back the pages past the new size.  Malloc-backed
  * buffers are never shrunk; their memory is released only when the buffer
  * is truncated to zero length.
  */
 static void
 vfs_nonvmio_truncate(struct buf *bp, int newbsize)
 {
 
 	if ((bp->b_flags & B_MALLOC) == 0) {
 		vm_hold_free_pages(bp, newbsize);
 		bufspace_adjust(bp, newbsize);
 		return;
 	}
 
 	/* Malloc-backed: nothing to do unless the buffer becomes empty. */
 	if (newbsize != 0)
 		return;
 	bufmallocadjust(bp, 0);
 	free(bp->b_data, M_BIOBUF);
 	bp->b_data = bp->b_kvabase;
 	bp->b_flags &= ~B_MALLOC;
 }
 
 /*
  * Grow the backing store of a non-VMIO buffer to newbsize bytes.
  *
  * A buffer's first, small (at most half a page) allocation is served from
  * malloc'ed memory while the malloc'ed buffer space budget allows it.  Any
  * later growth switches the buffer to wired pages mapped at b_kvabase,
  * copying over whatever the malloc'ed area held.
  */
 static void
 vfs_nonvmio_extend(struct buf *bp, int newbsize)
 {
 	caddr_t oldmem;
 	int oldlen;
 
 	/*
 	 * First allocation and small enough: use malloc.
 	 *
 	 * The check against maxbufmallocspace is not serialized, so
 	 * bufmallocspace may slightly overshoot the limit on SMP.  That is
 	 * expected to be rare and harmless.
 	 */
 	if (bp->b_bufsize == 0 && newbsize <= PAGE_SIZE/2 &&
 	    bufmallocspace < maxbufmallocspace) {
 		bp->b_data = malloc(newbsize, M_BIOBUF, M_WAITOK);
 		bp->b_flags |= B_MALLOC;
 		bufmallocadjust(bp, newbsize);
 		return;
 	}
 
 	/*
 	 * Growing a buffer that currently lives in malloc'ed memory:
 	 * remember the old storage, point b_data back at the KVA, drop the
 	 * malloc accounting, and fall through to page allocation.
 	 */
 	if ((bp->b_flags & B_MALLOC) != 0) {
 		oldmem = bp->b_data;
 		oldlen = bp->b_bufsize;
 		bp->b_data = bp->b_kvabase;
 		bufmallocadjust(bp, 0);
 		bp->b_flags &= ~B_MALLOC;
 		newbsize = round_page(newbsize);
 	} else {
 		oldmem = NULL;
 		oldlen = 0;
 	}
 
 	/* Back the range [b_bufsize, newbsize) with freshly wired pages. */
 	vm_hold_load_pages(bp, (vm_offset_t) bp->b_data + bp->b_bufsize,
 	    (vm_offset_t) bp->b_data + newbsize);
 
 	/* Carry the old contents over and release the malloc'ed storage. */
 	if (oldmem != NULL) {
 		bcopy(oldmem, bp->b_data, oldlen);
 		free(oldmem, M_BIOBUF);
 	}
 	bufspace_adjust(bp, newbsize);
 }
 
 /*
  * This code constitutes the buffer memory from either anonymous system
  * memory (in the case of non-VMIO operations) or from an associated
  * VM object (in the case of VMIO operations).  This code is able to
  * resize a buffer up or down.
  *
  * Note that this code is tricky, and has many complications to resolve
  * deadlock or inconsistent data situations.  Tread lightly!!!
  * There are B_CACHE and B_DELWRI interactions that must be dealt with by
  * the caller.  Calling this code willy nilly can result in the loss of data.
  *
  * allocbuf() only adjusts B_CACHE for VMIO buffers.  getblk() deals with
  * B_CACHE for the non-VMIO case.
  *
  * The buffer must be held by the caller.  size is the new b_bcount; the
  * backing store is sized to size rounded up to DEV_BSIZE (and, for
  * page-backed non-VMIO buffers, further rounded up to a page).  Panics if
  * the buffer has a KVA reservation smaller than size, or if a VMIO buffer
  * is found with B_MALLOC set.
  *
  * Always returns 1.
  */
 int
 allocbuf(struct buf *bp, int size)
 {
 	int newbsize;
 
 	BUF_ASSERT_HELD(bp);
 
 	/* Nothing to do when the requested size is already in effect. */
 	if (bp->b_bcount == size)
 		return (1);
 
 	if (bp->b_kvasize != 0 && bp->b_kvasize < size)
 		panic("allocbuf: buffer too small");
 
 	newbsize = roundup2(size, DEV_BSIZE);
 	if ((bp->b_flags & B_VMIO) == 0) {
 		/* Malloc-backed buffers keep sub-page sizes. */
 		if ((bp->b_flags & B_MALLOC) == 0)
 			newbsize = round_page(newbsize);
 		/*
 		 * Just get anonymous memory from the kernel.  Don't
 		 * mess with B_CACHE.
 		 */
 		if (newbsize < bp->b_bufsize)
 			vfs_nonvmio_truncate(bp, newbsize);
 		else if (newbsize > bp->b_bufsize)
 			vfs_nonvmio_extend(bp, newbsize);
 	} else {
 		int desiredpages;
 
 		/*
 		 * Number of pages needed to cover the new size, counting the
 		 * offset of the buffer within its first page.
 		 */
 		desiredpages = (size == 0) ? 0 :
 		    num_pages((bp->b_offset & PAGE_MASK) + newbsize);
 
 		if (bp->b_flags & B_MALLOC)
 			panic("allocbuf: VMIO buffer can't be malloced");
 		/*
 		 * Set B_CACHE initially if buffer is 0 length or will become
 		 * 0-length.
 		 */
 		if (size == 0 || bp->b_bufsize == 0)
 			bp->b_flags |= B_CACHE;
 
 		if (newbsize < bp->b_bufsize)
 			vfs_vmio_truncate(bp, desiredpages);
 		/* XXX This looks as if it should be newbsize > b_bufsize */
 		else if (size > bp->b_bcount)
 			vfs_vmio_extend(bp, desiredpages, size);
 		bufspace_adjust(bp, newbsize);
 	}
 	bp->b_bcount = size;		/* requested buffer size. */
 	return (1);
 }
 
 extern int inflight_transient_maps;
 
 /*
  * Complete a bio.
  *
  * If the bio carries a transient KVA mapping, tear it down first: the bio
  * is turned back into an unmapped one, the pages are removed from the
  * mapping, the address range is returned to transient_arena and the count
  * of in-flight transient mappings is decremented.
  *
  * Then either invoke the bio's completion callback, or, when there is
  * none, mark the bio BIO_DONE and wake up any thread sleeping on it (see
  * biowait()).
  */
 void
 biodone(struct bio *bp)
 {
 	struct mtx *lock;
 	void (*callback)(struct bio *);
 	vm_offset_t kva_start, kva_end;
 
 	if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
 		bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
 		bp->bio_flags |= BIO_UNMAPPED;
 		kva_start = trunc_page((vm_offset_t)bp->bio_data);
 		kva_end = round_page((vm_offset_t)bp->bio_data +
 		    bp->bio_length);
 		bp->bio_data = unmapped_buf;
 		pmap_qremove(kva_start, atop(kva_end - kva_start));
 		vmem_free(transient_arena, kva_start, kva_end - kva_start);
 		atomic_add_int(&inflight_transient_maps, -1);
 	}
 
 	callback = bp->bio_done;
 	if (callback != NULL) {
 		callback(bp);
 		return;
 	}
 
 	/* No callback: signal completion to a synchronous waiter. */
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->bio_flags |= BIO_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  * Sleep until the bio is marked BIO_DONE, using wchan as the wait message.
  *
  * Returns the bio's error code if one is recorded, EIO if only the
  * BIO_ERROR flag is set, and 0 otherwise.
  */
 int
 biowait(struct bio *bp, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	while (!(bp->bio_flags & BIO_DONE))
 		msleep(bp, lock, PRIBIO, wchan, 0);
 	mtx_unlock(lock);
 
 	if (bp->bio_error != 0)
 		return (bp->bio_error);
 	return ((bp->bio_flags & BIO_ERROR) != 0 ? EIO : 0);
 }
 
 /*
  * Finish a bio: record a non-zero error on it, close the devstat
  * transaction when a devstat is supplied, and complete it via biodone().
  */
 void
 biofinish(struct bio *bp, struct devstat *stat, int error)
 {
 
 	if (error != 0) {
 		bp->bio_error = error;
 		bp->bio_flags |= BIO_ERROR;
 	}
 
 	if (stat != NULL) {
 		devstat_end_transaction_bio(stat, bp);
 	}
 
 	biodone(bp);
 }
 
 /*
  *	bufwait:
  *
  *	Sleep until I/O on the buffer has completed and report its status.
  *	The wait message is "biord" for reads and "biowr" for everything
  *	else.  A pending B_EINTR is consumed and reported as EINTR.
  *	Otherwise, if BIO_ERROR is set the buffer's error code is returned
  *	(EIO when none was recorded); 0 means success.
  */
 int
 bufwait(struct buf *bp)
 {
 	const char *wmesg;
 
 	wmesg = (bp->b_iocmd == BIO_READ) ? "biord" : "biowr";
 	bwait(bp, PRIBIO, wmesg);
 
 	if ((bp->b_flags & B_EINTR) != 0) {
 		bp->b_flags &= ~B_EINTR;
 		return (EINTR);
 	}
 	if ((bp->b_ioflags & BIO_ERROR) == 0)
 		return (0);
 	return (bp->b_error != 0 ? bp->b_error : EIO);
 }
 
 /*
  *	bufdone:
  *
  *	Finish I/O on a buffer.  This is usually called from an interrupt
  *	so process blocking is not allowed.
  *
  *	If the buffer has a completion function (b_iodone) it is detached
  *	from the buffer and called, and it takes over all further
  *	processing.  Otherwise bufdone_finish() does the default
  *	completion work, including the B_CACHE update for B_VMIO buffers
  *	and the release or wakeup of the buffer.
  *
  *	For writes, the bufobj's count of in-progress writes is dropped
  *	once completion processing is done.  The bufobj pointer is sampled
  *	before completion runs and the buffer is not touched afterwards.
  *
  *	B_INVAL is left alone, allowing the I/O routine or the initiator
  *	to leave B_INVAL set to brelse the buffer out of existence during
  *	completion.
  */
 void
 bufdone(struct buf *bp)
 {
 	struct bufobj *wbo;
 	void (*iodone)(struct buf *);
 
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	wbo = NULL;
 
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 	BUF_ASSERT_HELD(bp);
 
 	runningbufwakeup(bp);
 	if (bp->b_iocmd == BIO_WRITE)
 		wbo = bp->b_bufobj;
 
 	iodone = bp->b_iodone;
 	if (iodone != NULL) {
 		/* Call the optional completion function exactly once. */
 		bp->b_iodone = NULL;
 		iodone(bp);
 	} else {
 		bufdone_finish(bp);
 	}
 
 	if (wbo != NULL)
 		bufobj_wdrop(wbo);
 }
 
 /*
  * Default I/O completion processing for a held buffer:
  *  - run dependency completion if any dependencies are attached;
  *  - for B_VMIO buffers, set B_CACHE after a successful read of a
  *    cacheable buffer and finish the VM side of the I/O;
  *  - release asynchronous buffers, or mark synchronous ones done and
  *    wake the waiter.
  */
 void
 bufdone_finish(struct buf *bp)
 {
 	BUF_ASSERT_HELD(bp);
 
 	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 
 	if ((bp->b_flags & B_VMIO) != 0) {
 		/*
 		 * A normal read that completed without error makes the
 		 * buffer contents valid.  Writes get B_CACHE set in the
 		 * b*write() routines instead.
 		 */
 		if (bp->b_iocmd == BIO_READ &&
 		    (bp->b_flags & (B_INVAL|B_NOCACHE)) == 0 &&
 		    (bp->b_ioflags & BIO_ERROR) == 0)
 			bp->b_flags |= B_CACHE;
 		vfs_vmio_iodone(bp);
 	}
 
 	/* Synchronous I/O: someone is waiting, always wake them up. */
 	if ((bp->b_flags & B_ASYNC) == 0) {
 		bdone(bp);
 		return;
 	}
 
 	/*
 	 * Asynchronous I/O: nobody will come back for the buffer, so
 	 * release it now.  The release routines do any wakeup that is
 	 * needed.  Buffers that are uncacheable, invalid, to be released,
 	 * or failed go through brelse(); the rest are requeued.
 	 */
 	if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_RELBUF)) != 0 ||
 	    (bp->b_ioflags & BIO_ERROR) != 0)
 		brelse(bp);
 	else
 		bqrelse(bp);
 }
 
 /*
  * This routine is called in lieu of iodone in the case of
  * incomplete I/O.  This keeps the busy status for pages
  * consistent.
  *
  * For a B_VMIO buffer, every page of the buffer has its shared busy
  * reference dropped and the object's paging-in-progress count is reduced
  * by the number of pages.  Any bogus_page placeholder found in b_pages is
  * first replaced by the real page looked up in the object; the routine
  * panics if that page no longer exists.  Non-VMIO buffers only get the
  * runningbufwakeup() call.
  */
 void
 vfs_unbusy_pages(struct buf *bp)
 {
 	int i;
 	vm_object_t obj;
 	vm_page_t m;
 
 	runningbufwakeup(bp);
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_WLOCK(obj);
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
 			/* Swap the placeholder for the object's real page. */
 			m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i);
 			if (!m)
 				panic("vfs_unbusy_pages: page missing\n");
 			bp->b_pages[i] = m;
 			/*
 			 * Refresh the KVA mapping so it points at the real
 			 * page.  Note that the whole buffer is re-entered
 			 * each time a placeholder is replaced.
 			 */
 			if (buf_mapped(bp)) {
 				BUF_CHECK_MAPPED(bp);
 				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 				    bp->b_pages, bp->b_npages);
 			} else
 				BUF_CHECK_UNMAPPED(bp);
 		}
 		vm_page_sunbusy(m);
 	}
 	vm_object_pip_wakeupn(obj, bp->b_npages);
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * vfs_page_set_valid:
  *
  *	Mark as valid the part of page m that starts at file offset off
  *	and extends to the end of that page or to the end of the buffer's
  *	data (b_offset + b_bcount), whichever comes first.  Nothing is
  *	done when that range is empty.
  *
  *	This routine is typically called after a read completes.
  */
 static void
 vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, rangeend;
 
 	/*
 	 * The range ends at the next page boundary after off, clipped to
 	 * the end of the buffer.  The end of the buffer here means
 	 * b_bcount (our file EOF), not the buffer's allocation size.
 	 */
 	rangeend = (off + PAGE_SIZE) & ~(vm_ooffset_t)PAGE_MASK;
 	bufend = bp->b_offset + bp->b_bcount;
 	if (rangeend > bufend)
 		rangeend = bufend;
 
 	if (rangeend <= off)
 		return;
 
 	/* Typically this covers the entire page. */
 	vm_page_set_valid_range(m, off & PAGE_MASK, rangeend - off);
 }
 
 /*
  * vfs_page_set_validclean:
  *
  *	Mark valid and clean the part of page m that starts at file offset
  *	off and extends to the end of that page or to the end of the
  *	buffer's data (b_offset + b_bcount), whichever comes first.
  *	Nothing is done when that range is empty.
  */
 static void
 vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off, vm_page_t m)
 {
 	vm_ooffset_t bufend, rangeend;
 
 	/*
 	 * The range may cross neither a page boundary nor the end of the
 	 * buffer.  The end of the buffer here means b_bcount (our file
 	 * EOF), not the buffer's allocation size.
 	 */
 	rangeend = (off + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	bufend = bp->b_offset + bp->b_bcount;
 	if (rangeend > bufend)
 		rangeend = bufend;
 
 	if (rangeend <= off)
 		return;
 
 	/* Typically this covers the entire page. */
 	vm_page_set_validclean(m, (vm_offset_t) (off & PAGE_MASK),
 	    (vm_offset_t) (rangeend - off));
 }
 
 /*
  * Ensure that all buffer pages are not exclusive busied.  If any page is
  * exclusive busy, drain it.
  *
  * The backing object must be write-locked by the caller.  The lock is
  * dropped while sleeping on a busy page and re-taken before returning to
  * the scan, so object state may change across this call.
  */
 void
 vfs_drain_busy_pages(struct buf *bp)
 {
 	vm_page_t m;
 	int i, last_busied;
 
 	VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
 	/* Pages [0, last_busied) currently hold a shared busy from us. */
 	last_busied = 0;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (vm_page_xbusied(m)) {
 			/*
 			 * Before sleeping, shared-busy every page already
 			 * scanned that we have not yet busied.
 			 * NOTE(review): presumably this keeps those pages
 			 * from becoming exclusive busy while the object
 			 * lock is dropped -- confirm.
 			 */
 			for (; last_busied < i; last_busied++)
 				vm_page_sbusy(bp->b_pages[last_busied]);
 			/* Sleep until the page is no longer exclusive busy. */
 			while (vm_page_xbusied(m)) {
 				vm_page_lock(m);
 				VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 				vm_page_busy_sleep(m, "vbpage", true);
 				VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 			}
 		}
 	}
 	/* Drop the shared busy references taken above. */
 	for (i = 0; i < last_busied; i++)
 		vm_page_sunbusy(bp->b_pages[i]);
 }
 
 /*
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
  * almost as being exclusive busy.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistent.
  *
  * Since I/O has not been initiated yet, certain buffer flags
  * such as BIO_ERROR or B_INVAL may be in an inconsistent state
  * and should be ignored.
  *
  * clear_modify is non-zero when the buffer is being readied for a write:
  * each page is write-protected and its buffer-covered range is marked
  * valid and clean.  When it is zero (a read), fully valid pages of a
  * buffer without B_CACHE are replaced by bogus_page.
  *
  * Does nothing for non-VMIO buffers.
  */
 void
 vfs_busy_pages(struct buf *bp, int clear_modify)
 {
 	vm_object_t obj;
 	vm_ooffset_t foff;
 	vm_page_t m;
 	int i;
 	bool bogus;
 
 	if (!(bp->b_flags & B_VMIO))
 		return;
 
 	obj = bp->b_bufobj->bo_object;
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
 	VM_OBJECT_WLOCK(obj);
 	/* Wait out any exclusive busy holders before busying the pages. */
 	vfs_drain_busy_pages(bp);
 	if (bp->b_bufsize != 0)
 		vfs_setdirty_locked_object(bp);
 	bogus = false;
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
 		/*
 		 * B_CLUSTER buffers skip the paging-in-progress and busy
 		 * accounting here.
 		 */
 		if ((bp->b_flags & B_CLUSTER) == 0) {
 			vm_object_pip_add(obj, 1);
 			vm_page_sbusy(m);
 		}
 		/*
 		 * When readying a buffer for a read ( i.e
 		 * clear_modify == 0 ), it is important to do
 		 * bogus_page replacement for valid pages in
 		 * partially instantiated buffers.  Partially
 		 * instantiated buffers can, in turn, occur when
 		 * reconstituting a buffer from its VM backing store
 		 * base.  We only have to do this if B_CACHE is
 		 * clear ( which causes the I/O to occur in the
 		 * first place ).  The replacement prevents the read
 		 * I/O from overwriting potentially dirty VM-backed
 		 * pages.  XXX bogus page replacement is, uh, bogus.
 		 * It may not work properly with small-block devices.
 		 * We need to find a better way.
 		 */
 		if (clear_modify) {
 			pmap_remove_write(m);
 			vfs_page_set_validclean(bp, foff, m);
 		} else if (m->valid == VM_PAGE_BITS_ALL &&
 		    (bp->b_flags & B_CACHE) == 0) {
 			bp->b_pages[i] = bogus_page;
 			bogus = true;
 		}
 		/* Advance to the start of the next page. */
 		foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 	}
 	VM_OBJECT_WUNLOCK(obj);
 	/*
 	 * If any page was replaced, re-enter the whole page array once so
 	 * that a mapped buffer's KVA reflects the substitutions.
 	 */
 	if (bogus && buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
 		    bp->b_pages, bp->b_npages);
 	}
 }
 
 /*
  *	vfs_bio_set_valid:
  *
  *	Mark the byte range [base, base + size) of the buffer as valid in
  *	the underlying pages.  The range is relative to the start of the
  *	buffer (b_offset), which itself may begin part-way into the first
  *	page.  The range is silently clipped to the pages the buffer
  *	holds.  Does nothing for non-VMIO buffers.
  */
 void
 vfs_bio_set_valid(struct buf *bp, int base, int size)
 {
 	int chunk, pgidx;
 
 	if ((bp->b_flags & B_VMIO) == 0)
 		return;
 
 	/*
 	 * Rebase the range onto the start of the first page, then work out
 	 * how many bytes of the first affected page can be validated.
 	 */
 	base += (bp->b_offset & PAGE_MASK);
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 	pgidx = base / PAGE_SIZE;
 
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	while (size > 0 && pgidx < bp->b_npages) {
 		if (chunk > size)
 			chunk = size;
 		vm_page_set_valid_range(bp->b_pages[pgidx], base & PAGE_MASK,
 		    chunk);
 		base += chunk;
 		size -= chunk;
 		/* Every page after the first is covered from its start. */
 		chunk = PAGE_SIZE;
 		pgidx++;
 	}
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 }
 
 /*
  *	vfs_bio_clrbuf:
  *
  *	If the specified buffer is a non-VMIO buffer, clear the entire
  *	buffer.  If the specified buffer is a VMIO buffer, clear and
  *	validate only the previously invalid portions of the buffer.
  *	This routine essentially fakes an I/O, so we need to clear
  *	BIO_ERROR and B_INVAL.
  *
  *	Note that while we only theoretically need to clear through b_bcount,
  *	we go ahead and clear through b_bufsize.
  *
  *	Validity is tracked per DEV_BSIZE chunk: bit j of a page's valid
  *	field corresponds to the j-th DEV_BSIZE chunk of that page.
  *	Pages that have been replaced by bogus_page are skipped.
  */
 void
 vfs_bio_clrbuf(struct buf *bp)
 {
 	int i, j, mask, sa, ea, slide;
 
 	/* Non-VMIO (or malloc-backed) buffers are simply cleared. */
 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) != B_VMIO) {
 		clrbuf(bp);
 		return;
 	}
 	bp->b_flags &= ~B_INVAL;
 	bp->b_ioflags &= ~BIO_ERROR;
 	VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
 	/*
 	 * Fast path: a page-aligned buffer smaller than a page that lives
 	 * in a single page.  If the covered chunks are all valid there is
 	 * nothing to do; if none is valid the whole range is zeroed at
 	 * once.  A partially valid range falls through to the general loop.
 	 */
 	if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
 	    (bp->b_offset & PAGE_MASK) == 0) {
 		if (bp->b_pages[0] == bogus_page)
 			goto unlock;
 		mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
 		if ((bp->b_pages[0]->valid & mask) == mask)
 			goto unlock;
 		if ((bp->b_pages[0]->valid & mask) == 0) {
 			pmap_zero_page_area(bp->b_pages[0], 0, bp->b_bufsize);
 			bp->b_pages[0]->valid |= mask;
 			goto unlock;
 		}
 	}
 	/*
 	 * General case: for each page, [sa, ea) is the byte range of the
 	 * page covered by the buffer.  Only the first page can start at a
 	 * non-zero offset.
 	 */
 	sa = bp->b_offset & PAGE_MASK;
 	slide = 0;
 	for (i = 0; i < bp->b_npages; i++, sa = 0) {
 		/*
 		 * NOTE(review): the clip value uses the full b_offset rather
 		 * than its in-page part, and is passed through imin() --
 		 * confirm this is intended for buffers at large offsets.
 		 */
 		slide = imin(slide + PAGE_SIZE, bp->b_offset + bp->b_bufsize);
 		ea = slide & PAGE_MASK;
 		if (ea == 0)
 			ea = PAGE_SIZE;
 		if (bp->b_pages[i] == bogus_page)
 			continue;
 		/* Bits for the DEV_BSIZE chunks in [sa, ea). */
 		j = sa / DEV_BSIZE;
 		mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
 		VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
 		if ((bp->b_pages[i]->valid & mask) == mask)
 			continue;
 		if ((bp->b_pages[i]->valid & mask) == 0)
 			pmap_zero_page_area(bp->b_pages[i], sa, ea - sa);
 		else {
 			/* Mixed: zero only the chunks that are not valid. */
 			for (; sa < ea; sa += DEV_BSIZE, j++) {
 				if ((bp->b_pages[i]->valid & (1 << j)) == 0) {
 					pmap_zero_page_area(bp->b_pages[i],
 					    sa, DEV_BSIZE);
 				}
 			}
 		}
 		bp->b_pages[i]->valid |= mask;
 	}
 unlock:
 	VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
 	bp->b_resid = 0;
 }
 
 /*
  * Zero size bytes of the buffer starting at byte offset base.
  *
  * A mapped buffer is cleared through its kernel mapping.  An unmapped
  * buffer is cleared page by page; the range is silently clipped to the
  * pages the buffer holds.
  *
  * NOTE(review): unlike vfs_bio_set_valid(), the unmapped path does not add
  * the in-page part of b_offset to base -- confirm that unmapped buffers
  * always start on a page boundary.
  */
 void
 vfs_bio_bzero_buf(struct buf *bp, int base, int size)
 {
 	int chunk, pgidx;
 
 	if (buf_mapped(bp)) {
 		BUF_CHECK_MAPPED(bp);
 		bzero(bp->b_data + base, size);
 		return;
 	}
 
 	BUF_CHECK_UNMAPPED(bp);
 	/* Bytes available in the first affected page. */
 	chunk = PAGE_SIZE - (base & PAGE_MASK);
 	pgidx = base / PAGE_SIZE;
 	while (size > 0 && pgidx < bp->b_npages) {
 		if (chunk > size)
 			chunk = size;
 		pmap_zero_page_area(bp->b_pages[pgidx], base & PAGE_MASK,
 		    chunk);
 		base += chunk;
 		size -= chunk;
 		/* Every page after the first is cleared from its start. */
 		chunk = PAGE_SIZE;
 		pgidx++;
 	}
 }
 
 /*
  * vm_hold_load_pages and vm_hold_free_pages manage the anonymous pages
  * that back a mapped buffer's address space.  The pages are not associated
  * with a file object.
  *
  * vm_hold_load_pages() allocates a wired page for every page-sized step in
  * [from, to) (both rounded up to a page boundary), enters each into the
  * kernel map at its address, records it in b_pages, and sets b_npages to
  * the index just past the last page loaded.
  */
 static void
 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
 {
 	vm_offset_t va;
 	vm_page_t pg;
 	int slot;
 
 	BUF_CHECK_MAPPED(bp);
 
 	to = round_page(to);
 	from = round_page(from);
 	slot = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	for (va = from; va < to; va += PAGE_SIZE, slot++) {
 		/*
 		 * Keep trying until a page is available.  System pages must
 		 * be used since blocking here could interfere with paging
 		 * I/O, no matter which process we are.
 		 */
 		for (;;) {
 			pg = vm_page_alloc(NULL, 0, VM_ALLOC_SYSTEM |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 			    VM_ALLOC_COUNT((to - va) >> PAGE_SHIFT));
 			if (pg != NULL)
 				break;
 			VM_WAIT;
 		}
 		pmap_qenter(va, &pg, 1);
 		bp->b_pages[slot] = pg;
 	}
 	bp->b_npages = slot;
 }
 
 /*
  * Return to the VM system the pages of a mapped buffer that lie beyond
  * newbsize bytes: remove them from the kernel mapping, unwire and free
  * each one, adjust the global wired page count, and update b_npages.
  */
 static void
 vm_hold_free_pages(struct buf *bp, int newbsize)
 {
 	vm_offset_t cut;
 	vm_page_t pg;
 	int i, keep;
 
 	BUF_CHECK_MAPPED(bp);
 
 	/* First address, and page count, that are no longer needed. */
 	cut = round_page((vm_offset_t)bp->b_data + newbsize);
 	keep = (cut - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
 
 	if (keep < bp->b_npages)
 		pmap_qremove(cut, bp->b_npages - keep);
 	for (i = keep; i < bp->b_npages; i++) {
 		pg = bp->b_pages[i];
 		bp->b_pages[i] = NULL;
 		pg->wire_count--;
 		vm_page_free(pg);
 	}
 	atomic_subtract_int(&vm_cnt.v_wire_count, bp->b_npages - keep);
 	bp->b_npages = keep;
 }
 
 /*
  * Map an IO request into kernel virtual address space.
  *
  * The pages behind the current process's address range
  * [b_data, b_data + b_bufsize) are faulted in and held, and recorded in
  * b_pages/b_npages.  b_offset is set to the offset of the data within its
  * first page.  If mapbuf is non-zero, or unmapped buffers are not allowed,
  * the pages are entered at b_kvabase and b_data is pointed there;
  * otherwise b_data is set to unmapped_buf.
  *
  * b_bufsize, not b_bcount, determines the size of the mapping, since
  * b_bcount might be modified by the driver.
  *
  * Returns 0 on success and -1 on failure (negative b_bufsize, or the pages
  * could not be held).  Even if the caller determines that the address
  * space should be valid, a race or a smaller file mapped into a larger
  * space may cause failure, so all callers MUST check the return value.
  *
  * This function only works with pager buffers.
  */
 int
 vmapbuf(struct buf *bp, int mapbuf)
 {
 	vm_prot_t prot;
 	int held;
 
 	if (bp->b_bufsize < 0)
 		return (-1);
 
 	/*
 	 * A BIO_READ stores into the caller's memory, so the pages need
 	 * write permission in that case.
 	 */
 	prot = VM_PROT_READ;
 	if (bp->b_iocmd == BIO_READ)
 		prot |= VM_PROT_WRITE;
 
 	held = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
 	    (vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
 	    btoc(MAXPHYS));
 	if (held < 0)
 		return (-1);
 
 	bp->b_npages = held;
 	bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
 	if (!mapbuf && unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 		return (0);
 	}
 	pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_pages, held);
 	bp->b_data = bp->b_kvabase + bp->b_offset;
 	return (0);
 }
 
 /*
  * Undo vmapbuf(): remove the kernel mapping of the buffer's pages if the
  * buffer is mapped, release the hold on the pages, and leave b_data set
  * to unmapped_buf.
  *
  * This function only works with pager buffers.
  */
 void
 vunmapbuf(struct buf *bp)
 {
 	int count;
 
 	count = bp->b_npages;
 	if (buf_mapped(bp)) {
 		pmap_qremove(trunc_page((vm_offset_t)bp->b_data), count);
 	}
 	vm_page_unhold_pages(bp->b_pages, count);
 
 	bp->b_data = unmapped_buf;
 }
 
 /*
  * Mark the buffer B_DONE and wake up any thread sleeping on it in bwait().
  * The flag update and the wakeup happen under the pool mutex associated
  * with the buffer.
  */
 void
 bdone(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	bp->b_flags |= B_DONE;
 	wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  * Sleep at priority pri, with wait message wchan, until the buffer is
  * marked B_DONE (see bdone()).
  */
 void
 bwait(struct buf *bp, u_char pri, const char *wchan)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 
 	mtx_lock(lock);
 	while (!(bp->b_flags & B_DONE)) {
 		msleep(bp, lock, pri, wchan, 0);
 	}
 	mtx_unlock(lock);
 }
 
 /*
  * Sync a buffer object by issuing VOP_FSYNC() on the vnode recorded in the
  * bufobj, on behalf of the current thread.  waitfor is passed through
  * unchanged, and the VOP's return value is returned to the caller.
  */
 int
 bufsync(struct bufobj *bo, int waitfor)
 {
 
 	return (VOP_FSYNC(bo->__bo_vnode, waitfor, curthread));
 }
 
 void
 bufstrategy(struct bufobj *bo, struct buf *bp)
 {
 	int i = 0;
 	struct vnode *vp;
 
 	vp = bp->b_vp;
 	KASSERT(vp == bo->bo_private, ("Inconsistent vnode bufstrategy"));
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("Wrong vnode in bufstrategy(bp=%p, vp=%p)", bp, vp));
 	i = VOP_STRATEGY(vp, bp);
 	KASSERT(i == 0, ("VOP_STRATEGY failed bp=%p vp=%p", bp, bp->b_vp));
 }
 
 /*
  * Account for one more write in progress on the buffer object.
  *
  * This is the variant for callers that already hold the bufobj lock
  * exclusively (asserted); bufobj_wref() takes the lock itself.
  */
 void
 bufobj_wrefl(struct bufobj *bo)
 {
 
 	/* Name this function, not bufobj_wref(), in the panic message. */
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wrefl"));
 	ASSERT_BO_WLOCKED(bo);
 	bo->bo_numoutput++;
 }
 
 /*
  * Account for one more write in progress on the buffer object, taking
  * and releasing the bufobj lock around the update.
  */
 void
 bufobj_wref(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
 
 	BO_LOCK(bo);
 	bo->bo_numoutput += 1;
 	BO_UNLOCK(bo);
 }
 
 /*
  * Account for the completion of one write on the buffer object.  When
  * the count reaches zero and a waiter has set BO_WWAIT, clear the flag
  * and wake up the threads sleeping on &bo->bo_numoutput.
  */
 void
 bufobj_wdrop(struct bufobj *bo)
 {
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wdrop"));
 	BO_LOCK(bo);
 	KASSERT(bo->bo_numoutput > 0, ("bufobj_wdrop non-positive count"));
 	bo->bo_numoutput--;
 	if (bo->bo_numoutput == 0 && (bo->bo_flag & BO_WWAIT) != 0) {
 		bo->bo_flag &= ~BO_WWAIT;
 		wakeup(&bo->bo_numoutput);
 	}
 	BO_UNLOCK(bo);
 }
 
 int
 bufobj_wwait(struct bufobj *bo, int slpflag, int timeo)
 {
 	int error;
 
 	KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
 	ASSERT_BO_WLOCKED(bo);
 	error = 0;
 	while (bo->bo_numoutput) {
 		bo->bo_flag |= BO_WWAIT;
 		error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
 		    slpflag | (PRIBIO + 1), "bo_wwait", timeo);
 		if (error)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Increment the buffer's pin count under its sleep-pool mutex.
  */
 void
 bpin(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->b_pin_count += 1;
 	mtx_unlock(lock);
 }
 
 /*
  * Decrement the buffer's pin count under its sleep-pool mutex and, when
  * it reaches zero, wake up the threads sleeping on the buffer.
  */
 void
 bunpin(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	bp->b_pin_count--;
 	if (bp->b_pin_count == 0)
 		wakeup(bp);
 	mtx_unlock(lock);
 }
 
 /*
  * Sleep, without a timeout, until the buffer's pin count is no longer
  * positive.  The count is re-tested after every wakeup while holding
  * the buffer's sleep-pool mutex.
  */
 void
 bunpin_wait(struct buf *bp)
 {
 	struct mtx *lock;
 
 	lock = mtx_pool_find(mtxpool_sleep, bp);
 	mtx_lock(lock);
 	for (;;) {
 		if (bp->b_pin_count <= 0)
 			break;
 		msleep(bp, lock, PRIBIO, "bwunpin", 0);
 	}
 	mtx_unlock(lock);
 }
 
 /*
  * Describe the data of a struct buf in a struct bio.
  *
  * A mapped buffer is passed by address: bio_data is set to b_data and
  * bio_ma is cleared.  An unmapped buffer is passed as a page run:
  * bio_ma/bio_ma_n get the buffer's page array and page count,
  * bio_ma_offset the offset into the first page, bio_data is set to
  * unmapped_buf and BIO_UNMAPPED is added to bio_flags.
  */
 void
 bdata2bio(struct buf *bp, struct bio *bip)
 {
 
 	if (buf_mapped(bp)) {
 		bip->bio_data = bp->b_data;
 		bip->bio_ma = NULL;
 		return;
 	}
 
 	KASSERT(unmapped_buf_allowed, ("unmapped"));
 	bip->bio_ma = bp->b_pages;
 	bip->bio_ma_n = bp->b_npages;
 	bip->bio_data = unmapped_buf;
 	bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 	bip->bio_flags |= BIO_UNMAPPED;
 	/* The page run must cover exactly offset + length bytes. */
 	KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
 	    PAGE_SIZE == bp->b_npages,
 	    ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
 	    (long long)bip->bio_length, bip->bio_ma_n));
 }
 
 /*
  * vfs.buf_pager_relbuf: read-write sysctl that can also be set as a
  * tunable.  When non-zero, vfs_bio_getpages() sets B_RELBUF on every
  * buffer without dependencies that it reads, instead of only on those
  * whose page is still not fully valid after the read.
  */
 static int buf_pager_relbuf;
 SYSCTL_INT(_vfs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
     &buf_pager_relbuf, 0,
     "Make buffer pager release buffers after reading");
 
 /*
  * The buffer pager.  It uses buffer reads to validate pages.
  *
  * In contrast to the generic local pager from vm/vnode_pager.c, this
  * pager correctly and easily handles volumes where the underlying
  * device block size is greater than the machine page size.  The
  * buffer cache transparently extends the requested page run to be
  * aligned at the block boundary, and does the necessary bogus page
  * replacements in the addends to avoid obliterating already valid
  * pages.
  *
  * The only non-trivial issue is that the exclusive busy state for
  * pages, which is assumed by the vm_pager_getpages() interface, is
  * incompatible with the VMIO buffer cache's desire to share-busy the
  * pages.  This function performs a trivial downgrade of the pages'
  * state before reading buffers, and a less trivial upgrade from the
  * shared-busy to excl-busy state after the read.
  *
  * Arguments:
  *	vp		vnode whose v_object owns the pages.
  *	ma, count	the requested pages, ma[0] .. ma[count - 1].
  *	rbehind, rahead	optional (may be NULL); receive the number of
  *			pages before ma[0] and after ma[count - 1] that
  *			are covered by the block-aligned read.
  *	get_lblkno	maps a byte offset in the file to a logical block.
  *	get_blksize	returns the size in bytes of a logical block.
  *
  * Returns VM_PAGER_BAD when the last requested page starts at or beyond
  * the end of the object, VM_PAGER_ERROR when a buffer read failed, and
  * VM_PAGER_OK otherwise.
  */
 int
 vfs_bio_getpages(struct vnode *vp, vm_page_t *ma, int count,
     int *rbehind, int *rahead, vbg_get_lblkno_t get_lblkno,
     vbg_get_blksize_t get_blksize)
 {
 	vm_page_t m;
 	vm_object_t object;
 	struct buf *bp;
 	struct mount *mp;
 	daddr_t lbn, lbnp;
 	vm_ooffset_t la, lb, poff, poffe;
 	long bsize;
 	int bo_bs, br_flags, error, i, pgsin, pgsin_a, pgsin_b;
 	bool redo, lpart;
 
 	object = vp->v_object;
 	mp = vp->v_mount;
 	la = IDX_TO_OFF(ma[count - 1]->pindex);
 	if (la >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 
 	/*
 	 * Change the meaning of la from where the last requested page
 	 * starts to where it ends, because that is the end of the
 	 * requested region and the start of the potential read-ahead
 	 * region.  Computing the read-ahead from the start of the page
 	 * counted the last requested page itself, or reported zero when
 	 * that page began on a block boundary.
 	 */
 	la += PAGE_SIZE;
 	lpart = la > object->un_pager.vnp.vnp_size;
 	bo_bs = get_blksize(vp, get_lblkno(vp, IDX_TO_OFF(ma[0]->pindex)));
 
 	/*
 	 * Calculate read-ahead, behind and total pages.
 	 */
 	pgsin = count;
 	lb = IDX_TO_OFF(ma[0]->pindex);
 	pgsin_b = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
 	pgsin += pgsin_b;
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	pgsin_a = OFF_TO_IDX(roundup2(la, bo_bs) - la);
 	if (la + IDX_TO_OFF(pgsin_a) >= object->un_pager.vnp.vnp_size)
 		pgsin_a = OFF_TO_IDX(roundup2(object->un_pager.vnp.vnp_size,
 		    PAGE_SIZE) - la);
 	pgsin += pgsin_a;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	PCPU_INC(cnt.v_vnodein);
 	PCPU_ADD(cnt.v_vnodepgsin, pgsin);
 
 	br_flags = (mp != NULL && (mp->mnt_kern_flag & MNTK_UNMAPPED_BUFS)
 	    != 0) ? GB_UNMAPPED : 0;
 
 	/*
 	 * No read has failed yet.  Without this, error would be read
 	 * uninitialized below when every page passes the validity test
 	 * and no buffer is read at all.
 	 */
 	error = 0;
 	VM_OBJECT_WLOCK(object);
 again:
 	for (i = 0; i < count; i++)
 		vm_page_busy_downgrade(ma[i]);
 	VM_OBJECT_WUNLOCK(object);
 
 	lbnp = -1;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 
 		/*
 		 * Pages are shared busy and the object lock is not
 		 * owned, which together allow for the pages'
 		 * invalidation.  The racy test for validity avoids
 		 * useless creation of the buffer for the most typical
 		 * case when invalidation is not used in redo or for
 		 * parallel read.  The shared->excl upgrade loop at
 		 * the end of the function catches the race in a
 		 * reliable way (protected by the object lock).
 		 */
 		if (m->valid == VM_PAGE_BITS_ALL)
 			continue;
 
 		poff = IDX_TO_OFF(m->pindex);
 		poffe = MIN(poff + PAGE_SIZE, object->un_pager.vnp.vnp_size);
 		for (; poff < poffe; poff += bsize) {
 			lbn = get_lblkno(vp, poff);
 			/* This block was already read for the prior page. */
 			if (lbn == lbnp)
 				goto next_page;
 			lbnp = lbn;
 
 			bsize = get_blksize(vp, lbn);
 			error = bread_gb(vp, lbn, bsize, curthread->td_ucred,
 			    br_flags, &bp);
 			if (error != 0)
 				goto end_pages;
 			if (LIST_EMPTY(&bp->b_dep)) {
 				/*
 				 * Invalidation clears m->valid, but
 				 * may leave B_CACHE flag if the
 				 * buffer existed at the invalidation
 				 * time.  In this case, recycle the
 				 * buffer to do real read on next
 				 * bread() after redo.
 				 *
 				 * Otherwise B_RELBUF is not strictly
 				 * necessary, enable to reduce buf
 				 * cache pressure.
 				 */
 				if (buf_pager_relbuf ||
 				    m->valid != VM_PAGE_BITS_ALL)
 					bp->b_flags |= B_RELBUF;
 
 				bp->b_flags &= ~B_NOCACHE;
 				brelse(bp);
 			} else {
 				bqrelse(bp);
 			}
 		}
 		KASSERT(1 /* racy, enable for debugging */ ||
 		    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
 		    ("buf %d %p invalid", i, m));
 		/* Zero the invalid parts of a last page that crosses EOF. */
 		if (i == count - 1 && lpart) {
 			VM_OBJECT_WLOCK(object);
 			if (m->valid != 0 &&
 			    m->valid != VM_PAGE_BITS_ALL)
 				vm_page_zero_invalid(m, TRUE);
 			VM_OBJECT_WUNLOCK(object);
 		}
 next_page:;
 	}
 end_pages:
 
 	VM_OBJECT_WLOCK(object);
 	redo = false;
 	for (i = 0; i < count; i++) {
 		vm_page_sunbusy(ma[i]);
 		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
 
 		/*
 		 * Since the pages were only sbusy while neither the
 		 * buffer nor the object lock was held by us, or
 		 * reallocated while vm_page_grab() slept for busy
 		 * relinquish, they could have been invalidated.
 		 * Recheck the valid bits and re-read as needed.
 		 *
 		 * Note that the last page is made fully valid in the
 		 * read loop, and partial validity for the page at
 		 * index count - 1 could mean that the page was
 		 * invalidated or removed, so we must restart for
 		 * safety as well.
 		 */
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			redo = true;
 	}
 	if (redo && error == 0)
 		goto again;
 	VM_OBJECT_WUNLOCK(object);
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
 
 /* DDB command to show buffer data */
 DB_SHOW_COMMAND(buffer, db_show_buffer)
 {
 	/* get args */
 	struct buf *bp = (struct buf *)addr;
 
 	if (!have_addr) {
 		db_printf("usage: show buffer <addr>\n");
 		return;
 	}
 
 	db_printf("buf at %p\n", bp);
 	db_printf("b_flags = 0x%b, b_xflags=0x%b, b_vflags=0x%b\n",
 	    (u_int)bp->b_flags, PRINT_BUF_FLAGS, (u_int)bp->b_xflags,
 	    PRINT_BUF_XFLAGS, (u_int)bp->b_vflags, PRINT_BUF_VFLAGS);
 	db_printf(
 	    "b_error = %d, b_bufsize = %ld, b_bcount = %ld, b_resid = %ld\n"
 	    "b_bufobj = (%p), b_data = %p, b_blkno = %jd, b_lblkno = %jd, "
 	    "b_dep = %p\n",
 	    bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
 	    bp->b_bufobj, bp->b_data, (intmax_t)bp->b_blkno,
 	    (intmax_t)bp->b_lblkno, bp->b_dep.lh_first);
 	db_printf("b_kvabase = %p, b_kvasize = %d\n",
 	    bp->b_kvabase, bp->b_kvasize);
 	if (bp->b_npages) {
 		int i;
 		db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages);
 		for (i = 0; i < bp->b_npages; i++) {
 			vm_page_t m;
 			m = bp->b_pages[i];
 			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
 			    (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m));
 			if ((i + 1) < bp->b_npages)
 				db_printf(",");
 		}
 		db_printf("\n");
 	}
 	db_printf(" ");
 	BUF_LOCKPRINTINFO(bp);
 }
 
 /*
  * DDB "show lockedbufs": run "show buffer" on every entry of buf[] that
  * is currently locked, stopping early once the pager has been quit.
  */
 DB_SHOW_COMMAND(lockedbufs, lockedbufs)
 {
 	struct buf *bp;
 	int idx;
 
 	for (idx = 0; idx < nbuf; idx++) {
 		bp = &buf[idx];
 		if (!BUF_ISLOCKED(bp))
 			continue;
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 		if (db_pager_quit)
 			break;
 	}
 }
 
 /*
  * DDB "show vnodebufs <addr>": run "show buffer" on every buffer on the
  * clean list and then on the dirty list of the bufobj embedded in the
  * vnode at <addr>.  As in "show lockedbufs", the walk stops as soon as
  * the pager has been quit.
  */
 DB_SHOW_COMMAND(vnodebufs, db_show_vnodebufs)
 {
 	struct vnode *vp;
 	struct buf *bp;
 
 	if (!have_addr) {
 		db_printf("usage: show vnodebufs <addr>\n");
 		return;
 	}
 	vp = (struct vnode *)addr;
 	db_printf("Clean buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_clean.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 		if (db_pager_quit)
 			return;
 	}
 	db_printf("Dirty buffers:\n");
 	TAILQ_FOREACH(bp, &vp->v_bufobj.bo_dirty.bv_hd, b_bobufs) {
 		db_show_buffer((uintptr_t)bp, 1, 0, NULL);
 		db_printf("\n");
 		if (db_pager_quit)
 			return;
 	}
 }
 
 DB_COMMAND(countfreebufs, db_coundfreebufs)
 {
 	struct buf *bp;
 	int i, used = 0, nfree = 0;
 
 	if (have_addr) {
 		db_printf("usage: countfreebufs\n");
 		return;
 	}
 
 	for (i = 0; i < nbuf; i++) {
 		bp = &buf[i];
 		if (bp->b_qindex == QUEUE_EMPTY)
 			nfree++;
 		else
 			used++;
 	}
 
 	db_printf("Counted %d free, %d used (%d tot)\n", nfree, used,
 	    nfree + used);
 	db_printf("numfreebuffers is %d\n", numfreebuffers);
 }
 #endif /* DDB */
Index: stable/11/sys/sparc64/sparc64/pmap.c
===================================================================
--- stable/11/sys/sparc64/sparc64/pmap.c	(revision 323661)
+++ stable/11/sys/sparc64/sparc64/pmap.c	(revision 323662)
@@ -1,2367 +1,2363 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *      from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Manages physical address maps.
  *
  * Since the information managed by this module is also stored by the
  * logical address mapping module, this module may throw away valid virtual
  * to physical mappings at almost any time.  However, invalidations of
  * mappings must be done as requested.
  *
  * In order to cope with hardware architectures which make virtual to
  * physical map invalidates expensive, this module may delay invalidate
  * reduced protection operations until such time as they are actually
  * necessary.  This module is given full information as to which processors
  * are currently using which maps, and to when physical maps must be made
  * correct.
  */
 
 #include "opt_kstack_pages.h"
 #include "opt_pmap.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/vmmeter.h>
 
 #include <dev/ofw/openfirm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 
 #include <machine/cache.h>
 #include <machine/frame.h>
 #include <machine/instr.h>
 #include <machine/md_var.h>
 #include <machine/metadata.h>
 #include <machine/ofw_mem.h>
 #include <machine/smp.h>
 #include <machine/tlb.h>
 #include <machine/tte.h>
 #include <machine/tsb.h>
 #include <machine/ver.h>
 
 /*
  * Virtual address of message buffer
  */
 struct msgbuf *msgbufp;
 
 /*
  * Map of physical memory regions
  */
 vm_paddr_t phys_avail[128];
 static struct ofw_mem_region mra[128];
 struct ofw_mem_region sparc64_memreg[128];
 int sparc64_nmemreg;
 static struct ofw_map translations[128];
 static int translations_size;
 
 static vm_offset_t pmap_idle_map;
 static vm_offset_t pmap_temp_map_1;
 static vm_offset_t pmap_temp_map_2;
 
 /*
  * First and last available kernel virtual addresses
  */
 vm_offset_t virtual_avail;
 vm_offset_t virtual_end;
 vm_offset_t kernel_vm_end;
 
 vm_offset_t vm_max_kernel_address;
 
 /*
  * Kernel pmap
  */
 struct pmap kernel_pmap_store;
 
 struct rwlock_padalign tte_list_global_lock;
 
 /*
  * Allocate physical memory for use in pmap_bootstrap.
  */
 static vm_paddr_t pmap_bootstrap_alloc(vm_size_t size, uint32_t colors);
 
 static void pmap_bootstrap_set_tte(struct tte *tp, u_long vpn, u_long data);
 static void pmap_cache_remove(vm_page_t m, vm_offset_t va);
 static int pmap_protect_tte(struct pmap *pm1, struct pmap *pm2,
     struct tte *tp, vm_offset_t va);
 static int pmap_unwire_tte(pmap_t pm, pmap_t pm2, struct tte *tp,
     vm_offset_t va);
 static void pmap_init_qpages(void);
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  *
  * The page queues and pmap must be locked.
  */
 static int pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, u_int flags, int8_t psind);
 
 extern int tl1_dmmu_miss_direct_patch_tsb_phys_1[];
 extern int tl1_dmmu_miss_direct_patch_tsb_phys_end_1[];
 extern int tl1_dmmu_miss_patch_asi_1[];
 extern int tl1_dmmu_miss_patch_quad_ldd_1[];
 extern int tl1_dmmu_miss_patch_tsb_1[];
 extern int tl1_dmmu_miss_patch_tsb_2[];
 extern int tl1_dmmu_miss_patch_tsb_mask_1[];
 extern int tl1_dmmu_miss_patch_tsb_mask_2[];
 extern int tl1_dmmu_prot_patch_asi_1[];
 extern int tl1_dmmu_prot_patch_quad_ldd_1[];
 extern int tl1_dmmu_prot_patch_tsb_1[];
 extern int tl1_dmmu_prot_patch_tsb_2[];
 extern int tl1_dmmu_prot_patch_tsb_mask_1[];
 extern int tl1_dmmu_prot_patch_tsb_mask_2[];
 extern int tl1_immu_miss_patch_asi_1[];
 extern int tl1_immu_miss_patch_quad_ldd_1[];
 extern int tl1_immu_miss_patch_tsb_1[];
 extern int tl1_immu_miss_patch_tsb_2[];
 extern int tl1_immu_miss_patch_tsb_mask_1[];
 extern int tl1_immu_miss_patch_tsb_mask_2[];
 
 /*
  * If a user pmap is being processed by pmap_remove and the resident
  * count drops to 0, there are no more pages to remove, so we need not
  * continue.
  */
 #define	PMAP_REMOVE_DONE(pm) \
 	((pm) != kernel_pmap && (pm)->pm_stats.resident_count == 0)
 
 /*
  * The threshold (in bytes) above which tsb_foreach() is used in pmap_remove()
  * and pmap_protect() instead of trying each virtual address.
  */
 #define	PMAP_TSB_THRESH	((TSB_SIZE / 2) * PAGE_SIZE)
 
 SYSCTL_NODE(_debug, OID_AUTO, pmap_stats, CTLFLAG_RD, 0, "");
 
 PMAP_STATS_VAR(pmap_nenter);
 PMAP_STATS_VAR(pmap_nenter_update);
 PMAP_STATS_VAR(pmap_nenter_replace);
 PMAP_STATS_VAR(pmap_nenter_new);
 PMAP_STATS_VAR(pmap_nkenter);
 PMAP_STATS_VAR(pmap_nkenter_oc);
 PMAP_STATS_VAR(pmap_nkenter_stupid);
 PMAP_STATS_VAR(pmap_nkremove);
 PMAP_STATS_VAR(pmap_nqenter);
 PMAP_STATS_VAR(pmap_nqremove);
 PMAP_STATS_VAR(pmap_ncache_enter);
 PMAP_STATS_VAR(pmap_ncache_enter_c);
 PMAP_STATS_VAR(pmap_ncache_enter_oc);
 PMAP_STATS_VAR(pmap_ncache_enter_cc);
 PMAP_STATS_VAR(pmap_ncache_enter_coc);
 PMAP_STATS_VAR(pmap_ncache_enter_nc);
 PMAP_STATS_VAR(pmap_ncache_enter_cnc);
 PMAP_STATS_VAR(pmap_ncache_remove);
 PMAP_STATS_VAR(pmap_ncache_remove_c);
 PMAP_STATS_VAR(pmap_ncache_remove_oc);
 PMAP_STATS_VAR(pmap_ncache_remove_cc);
 PMAP_STATS_VAR(pmap_ncache_remove_coc);
 PMAP_STATS_VAR(pmap_ncache_remove_nc);
 PMAP_STATS_VAR(pmap_nzero_page);
 PMAP_STATS_VAR(pmap_nzero_page_c);
 PMAP_STATS_VAR(pmap_nzero_page_oc);
 PMAP_STATS_VAR(pmap_nzero_page_nc);
 PMAP_STATS_VAR(pmap_nzero_page_area);
 PMAP_STATS_VAR(pmap_nzero_page_area_c);
 PMAP_STATS_VAR(pmap_nzero_page_area_oc);
 PMAP_STATS_VAR(pmap_nzero_page_area_nc);
 PMAP_STATS_VAR(pmap_nzero_page_idle);
 PMAP_STATS_VAR(pmap_nzero_page_idle_c);
 PMAP_STATS_VAR(pmap_nzero_page_idle_oc);
 PMAP_STATS_VAR(pmap_nzero_page_idle_nc);
 PMAP_STATS_VAR(pmap_ncopy_page);
 PMAP_STATS_VAR(pmap_ncopy_page_c);
 PMAP_STATS_VAR(pmap_ncopy_page_oc);
 PMAP_STATS_VAR(pmap_ncopy_page_nc);
 PMAP_STATS_VAR(pmap_ncopy_page_dc);
 PMAP_STATS_VAR(pmap_ncopy_page_doc);
 PMAP_STATS_VAR(pmap_ncopy_page_sc);
 PMAP_STATS_VAR(pmap_ncopy_page_soc);
 
 PMAP_STATS_VAR(pmap_nnew_thread);
 PMAP_STATS_VAR(pmap_nnew_thread_oc);
 
 static inline u_long dtlb_get_data(u_int tlb, u_int slot);
 
 /*
  * Quick sort callout for comparing memory regions
  */
 static int mr_cmp(const void *a, const void *b);
 static int om_cmp(const void *a, const void *b);
 
 /*
  * qsort() comparison function ordering struct ofw_mem_region entries by
  * ascending mr_start.
  */
 static int
 mr_cmp(const void *a, const void *b)
 {
 	const struct ofw_mem_region *lhs = a;
 	const struct ofw_mem_region *rhs = b;
 
 	if (lhs->mr_start > rhs->mr_start)
 		return (1);
 	if (lhs->mr_start < rhs->mr_start)
 		return (-1);
 	return (0);
 }
 
 /*
  * qsort() comparison function ordering struct ofw_map entries by
  * ascending om_start.
  */
 static int
 om_cmp(const void *a, const void *b)
 {
 	const struct ofw_map *lhs = a;
 	const struct ofw_map *rhs = b;
 
 	if (lhs->om_start > rhs->om_start)
 		return (1);
 	if (lhs->om_start < rhs->om_start)
 		return (-1);
 	return (0);
 }
 
 /*
  * Return the data word of dTLB entry (tlb, slot), read through
  * ASI_DTLB_DATA_ACCESS_REG with interrupts disabled.
  */
 static inline u_long
 dtlb_get_data(u_int tlb, u_int slot)
 {
 	register_t saved;
 	u_long val;
 	u_int dar;
 
 	dar = TLB_DAR_SLOT(tlb, slot);
 	saved = intr_disable();
 	/*
 	 * The register is read twice back-to-back to work around errata
 	 * of USIII and beyond; only the second value is used.
 	 */
 	(void)ldxa(dar, ASI_DTLB_DATA_ACCESS_REG);
 	val = ldxa(dar, ASI_DTLB_DATA_ACCESS_REG);
 	intr_restore(saved);
 	return (val);
 }
 
 /*
  * Bootstrap the system enough to run with virtual memory.
  *
  * In order, this: fills phys_avail from the PROM's /memory "available"
  * property (honouring hw.physmem), sizes and allocates the kernel TSB,
  * allocates the BSP dynamic per-CPU area and the message buffer, patches
  * the TSB address, mask and ASIs into the trap table, enters TTEs for
  * the kernel's 4MB pages, thread0's kernel stack and the PROM
  * translations, records /memory "reg" in sparc64_memreg, initializes
  * the kernel pmap and the global tte list lock, and finally flushes the
  * non-locked TLB entries.
  *
  * cpu_impl is the CPU implementation code; it selects how the TSB is
  * accessed and which TTE bits are masked from PROM translations.
  */
 void
 pmap_bootstrap(u_int cpu_impl)
 {
 	struct pmap *pm;
 	struct tte *tp;
 	vm_offset_t off;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	vm_size_t physsz;
 	vm_size_t virtsz;
 	u_long data;
 	u_long vpn;
 	phandle_t pmem;
 	phandle_t vmem;
 	u_int dtlb_slots_avail;
 	int i;
 	int j;
 	int sz;
 	uint32_t asi;
 	uint32_t colors;
 	uint32_t ldd;
 
 	/*
 	 * Set the kernel context.
 	 */
 	pmap_set_kctx();
 
 	colors = dcache_color_ignore != 0 ? 1 : DCACHE_COLORS;
 
 	/*
 	 * Find out what physical memory is available from the PROM and
 	 * initialize the phys_avail array.  This must be done before
 	 * pmap_bootstrap_alloc is called.
 	 */
 	if ((pmem = OF_finddevice("/memory")) == -1)
 		OF_panic("%s: finddevice /memory", __func__);
 	if ((sz = OF_getproplen(pmem, "available")) == -1)
 		OF_panic("%s: getproplen /memory/available", __func__);
 	if (sizeof(phys_avail) < sz)
 		OF_panic("%s: phys_avail too small", __func__);
 	if (sizeof(mra) < sz)
 		OF_panic("%s: mra too small", __func__);
 	bzero(mra, sz);
 	if (OF_getprop(pmem, "available", mra, sz) == -1)
 		OF_panic("%s: getprop /memory/available", __func__);
 	/* From here on sz counts regions rather than bytes. */
 	sz /= sizeof(*mra);
 #ifdef DIAGNOSTIC
 	OF_printf("pmap_bootstrap: physical memory\n");
 #endif
 	qsort(mra, sz, sizeof (*mra), mr_cmp);
 	physsz = 0;
 	getenv_quad("hw.physmem", &physmem);
 	physmem = btoc(physmem);
 	/*
 	 * Each region becomes a (start, end) pair in phys_avail.  A non-zero
 	 * physmem limit truncates the region that crosses it and ends the
 	 * loop.
 	 */
 	for (i = 0, j = 0; i < sz; i++, j += 2) {
 #ifdef DIAGNOSTIC
 		OF_printf("start=%#lx size=%#lx\n", mra[i].mr_start,
 		    mra[i].mr_size);
 #endif
 		if (physmem != 0 && btoc(physsz + mra[i].mr_size) >= physmem) {
 			if (btoc(physsz) < physmem) {
 				phys_avail[j] = mra[i].mr_start;
 				phys_avail[j + 1] = mra[i].mr_start +
 				    (ctob(physmem) - physsz);
 				physsz = ctob(physmem);
 			}
 			break;
 		}
 		phys_avail[j] = mra[i].mr_start;
 		phys_avail[j + 1] = mra[i].mr_start + mra[i].mr_size;
 		physsz += mra[i].mr_size;
 	}
 	physmem = btoc(physsz);
 
 	/*
 	 * Calculate the size of kernel virtual memory, and the size and mask
 	 * for the kernel TSB based on the physical memory size but limited
 	 * by the amount of dTLB slots available for locked entries if we have
 	 * to lock the TSB in the TLB (given that for spitfire-class CPUs all
 	 * of the dt64 slots can hold locked entries but there is no large
 	 * dTLB for unlocked ones, we don't use more than half of it for the
 	 * TSB).
 	 * Note that for reasons unknown OpenSolaris doesn't take advantage of
 	 * ASI_ATOMIC_QUAD_LDD_PHYS on UltraSPARC-III.  However, given that no
 	 * public documentation is available for these, the latter just might
 	 * not support it, yet.
 	 */
 	if (cpu_impl == CPU_IMPL_SPARC64V ||
 	    cpu_impl >= CPU_IMPL_ULTRASPARCIIIp) {
 		tsb_kernel_ldd_phys = 1;
 		/*
 		 * NOTE(review): "5 / 3" is evaluated first, in integer
 		 * arithmetic, and yields 1, so this rounds up physsz itself
 		 * rather than 5/3 of it -- confirm whether that is intended.
 		 */
 		virtsz = roundup(5 / 3 * physsz, PAGE_SIZE_4M <<
 		    (PAGE_SHIFT - TTE_SHIFT));
 	} else {
 		/* Count the dTLB slots not holding a valid locked entry. */
 		dtlb_slots_avail = 0;
 		for (i = 0; i < dtlb_slots; i++) {
 			data = dtlb_get_data(cpu_impl ==
 			    CPU_IMPL_ULTRASPARCIII ? TLB_DAR_T16 :
 			    TLB_DAR_T32, i);
 			if ((data & (TD_V | TD_L)) != (TD_V | TD_L))
 				dtlb_slots_avail++;
 		}
 #ifdef SMP
 		dtlb_slots_avail -= PCPU_PAGES;
 #endif
 		if (cpu_impl >= CPU_IMPL_ULTRASPARCI &&
 		    cpu_impl < CPU_IMPL_ULTRASPARCIII)
 			dtlb_slots_avail /= 2;
 		virtsz = roundup(physsz, PAGE_SIZE_4M <<
 		    (PAGE_SHIFT - TTE_SHIFT));
 		virtsz = MIN(virtsz, (dtlb_slots_avail * PAGE_SIZE_4M) <<
 		    (PAGE_SHIFT - TTE_SHIFT));
 	}
 	vm_max_kernel_address = VM_MIN_KERNEL_ADDRESS + virtsz;
 	tsb_kernel_size = virtsz >> (PAGE_SHIFT - TTE_SHIFT);
 	tsb_kernel_mask = (tsb_kernel_size >> TTE_SHIFT) - 1;
 
 	/*
 	 * Allocate the kernel TSB and lock it in the TLB if necessary.
 	 */
 	pa = pmap_bootstrap_alloc(tsb_kernel_size, colors);
 	if (pa & PAGE_MASK_4M)
 		OF_panic("%s: TSB unaligned", __func__);
 	tsb_kernel_phys = pa;
 	if (tsb_kernel_ldd_phys == 0) {
 		tsb_kernel =
 		    (struct tte *)(VM_MIN_KERNEL_ADDRESS - tsb_kernel_size);
 		pmap_map_tsb();
 		bzero(tsb_kernel, tsb_kernel_size);
 	} else {
 		tsb_kernel =
 		    (struct tte *)TLB_PHYS_TO_DIRECT(tsb_kernel_phys);
 		aszero(ASI_PHYS_USE_EC, tsb_kernel_phys, tsb_kernel_size);
 	}
 
 	/*
 	 * Allocate and map the dynamic per-CPU area for the BSP.
 	 */
 	pa = pmap_bootstrap_alloc(DPCPU_SIZE, colors);
 	dpcpu0 = (void *)TLB_PHYS_TO_DIRECT(pa);
 
 	/*
 	 * Allocate and map the message buffer.
 	 */
 	pa = pmap_bootstrap_alloc(msgbufsize, colors);
 	msgbufp = (struct msgbuf *)TLB_PHYS_TO_DIRECT(pa);
 
 	/*
 	 * Patch the TSB addresses and mask as well as the ASIs used to load
 	 * it into the trap table.
 	 */
 
 	/*
 	 * The first four macros build the expected instruction encodings;
 	 * each PATCH_* macro panics unless the instruction words at addr
 	 * still match that encoding with a zero immediate, then ors the new
 	 * immediate in and flushes the patched words.
 	 */
 #define	LDDA_R_I_R(rd, imm_asi, rs1, rs2)				\
 	(EIF_OP(IOP_LDST) | EIF_F3_RD(rd) | EIF_F3_OP3(INS3_LDDA) |	\
 	    EIF_F3_RS1(rs1) | EIF_F3_I(0) | EIF_F3_IMM_ASI(imm_asi) |	\
 	    EIF_F3_RS2(rs2))
 #define	OR_R_I_R(rd, imm13, rs1)					\
 	(EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_OR) |	\
 	    EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13))
 #define	SETHI(rd, imm22)						\
 	(EIF_OP(IOP_FORM2) | EIF_F2_RD(rd) | EIF_F2_OP2(INS0_SETHI) |	\
 	    EIF_IMM((imm22) >> 10, 22))
 #define	WR_R_I(rd, imm13, rs1)						\
 	(EIF_OP(IOP_MISC) | EIF_F3_RD(rd) | EIF_F3_OP3(INS2_WR) |	\
 	    EIF_F3_RS1(rs1) | EIF_F3_I(1) | EIF_IMM(imm13, 13))
 
 #define	PATCH_ASI(addr, asi) do {					\
 	if (addr[0] != WR_R_I(IF_F3_RD(addr[0]), 0x0,			\
 	    IF_F3_RS1(addr[0])))					\
 		OF_panic("%s: patched instructions have changed",	\
 		    __func__);						\
 	addr[0] |= EIF_IMM((asi), 13);					\
 	flush(addr);							\
 } while (0)
 
 #define	PATCH_LDD(addr, asi) do {					\
 	if (addr[0] != LDDA_R_I_R(IF_F3_RD(addr[0]), 0x0,		\
 	    IF_F3_RS1(addr[0]), IF_F3_RS2(addr[0])))			\
 		OF_panic("%s: patched instructions have changed",	\
 		    __func__);						\
 	addr[0] |= EIF_F3_IMM_ASI(asi);					\
 	flush(addr);							\
 } while (0)
 
 #define	PATCH_TSB(addr, val) do {					\
 	if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) ||			\
 	    addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0,			\
 	    IF_F3_RS1(addr[1]))	||					\
 	    addr[3] != SETHI(IF_F2_RD(addr[3]), 0x0))			\
 		OF_panic("%s: patched instructions have changed",	\
 		    __func__);						\
 	addr[0] |= EIF_IMM((val) >> 42, 22);				\
 	addr[1] |= EIF_IMM((val) >> 32, 10);				\
 	addr[3] |= EIF_IMM((val) >> 10, 22);				\
 	flush(addr);							\
 	flush(addr + 1);						\
 	flush(addr + 3);						\
 } while (0)
 
 #define	PATCH_TSB_MASK(addr, val) do {					\
 	if (addr[0] != SETHI(IF_F2_RD(addr[0]), 0x0) ||			\
 	    addr[1] != OR_R_I_R(IF_F3_RD(addr[1]), 0x0,			\
 	    IF_F3_RS1(addr[1])))					\
 		OF_panic("%s: patched instructions have changed",	\
 		    __func__);						\
 	addr[0] |= EIF_IMM((val) >> 10, 22);				\
 	addr[1] |= EIF_IMM((val), 10);					\
 	flush(addr);							\
 	flush(addr + 1);						\
 } while (0)
 
 	/*
 	 * Pick the ASIs and the TSB base to patch in: the virtual address
 	 * of the TSB when it is mapped through the TLB, its physical
 	 * address otherwise.
 	 */
 	if (tsb_kernel_ldd_phys == 0) {
 		asi = ASI_N;
 		ldd = ASI_NUCLEUS_QUAD_LDD;
 		off = (vm_offset_t)tsb_kernel;
 	} else {
 		asi = ASI_PHYS_USE_EC;
 		ldd = ASI_ATOMIC_QUAD_LDD_PHYS;
 		off = (vm_offset_t)tsb_kernel_phys;
 	}
 	PATCH_TSB(tl1_dmmu_miss_direct_patch_tsb_phys_1, tsb_kernel_phys);
 	PATCH_TSB(tl1_dmmu_miss_direct_patch_tsb_phys_end_1,
 	    tsb_kernel_phys + tsb_kernel_size - 1);
 	PATCH_ASI(tl1_dmmu_miss_patch_asi_1, asi);
 	PATCH_LDD(tl1_dmmu_miss_patch_quad_ldd_1, ldd);
 	PATCH_TSB(tl1_dmmu_miss_patch_tsb_1, off);
 	PATCH_TSB(tl1_dmmu_miss_patch_tsb_2, off);
 	PATCH_TSB_MASK(tl1_dmmu_miss_patch_tsb_mask_1, tsb_kernel_mask);
 	PATCH_TSB_MASK(tl1_dmmu_miss_patch_tsb_mask_2, tsb_kernel_mask);
 	PATCH_ASI(tl1_dmmu_prot_patch_asi_1, asi);
 	PATCH_LDD(tl1_dmmu_prot_patch_quad_ldd_1, ldd);
 	PATCH_TSB(tl1_dmmu_prot_patch_tsb_1, off);
 	PATCH_TSB(tl1_dmmu_prot_patch_tsb_2, off);
 	PATCH_TSB_MASK(tl1_dmmu_prot_patch_tsb_mask_1, tsb_kernel_mask);
 	PATCH_TSB_MASK(tl1_dmmu_prot_patch_tsb_mask_2, tsb_kernel_mask);
 	PATCH_ASI(tl1_immu_miss_patch_asi_1, asi);
 	PATCH_LDD(tl1_immu_miss_patch_quad_ldd_1, ldd);
 	PATCH_TSB(tl1_immu_miss_patch_tsb_1, off);
 	PATCH_TSB(tl1_immu_miss_patch_tsb_2, off);
 	PATCH_TSB_MASK(tl1_immu_miss_patch_tsb_mask_1, tsb_kernel_mask);
 	PATCH_TSB_MASK(tl1_immu_miss_patch_tsb_mask_2, tsb_kernel_mask);
 
 	/*
 	 * Enter fake 8k pages for the 4MB kernel pages, so that
 	 * pmap_kextract() will work for them.
 	 */
 	for (i = 0; i < kernel_tlb_slots; i++) {
 		pa = kernel_tlbs[i].te_pa;
 		va = kernel_tlbs[i].te_va;
 		for (off = 0; off < PAGE_SIZE_4M; off += PAGE_SIZE) {
 			tp = tsb_kvtotte(va + off);
 			vpn = TV_VPN(va + off, TS_8K);
 			data = TD_V | TD_8K | TD_PA(pa + off) | TD_REF |
 			    TD_SW | TD_CP | TD_CV | TD_P | TD_W;
 			pmap_bootstrap_set_tte(tp, vpn, data);
 		}
 	}
 
 	/*
 	 * Set the start and end of KVA.  The kernel is loaded starting
 	 * at the first available 4MB super page, so we advance to the
 	 * end of the last one used for it.
 	 */
 	virtual_avail = KERNBASE + kernel_tlb_slots * PAGE_SIZE_4M;
 	virtual_end = vm_max_kernel_address;
 	kernel_vm_end = vm_max_kernel_address;
 
 	/*
 	 * Allocate kva space for temporary mappings.
 	 */
 	pmap_idle_map = virtual_avail;
 	virtual_avail += PAGE_SIZE * colors;
 	pmap_temp_map_1 = virtual_avail;
 	virtual_avail += PAGE_SIZE * colors;
 	pmap_temp_map_2 = virtual_avail;
 	virtual_avail += PAGE_SIZE * colors;
 
 	/*
 	 * Allocate a kernel stack with guard page for thread0 and map it
 	 * into the kernel TSB.  We must ensure that the virtual address is
 	 * colored properly for corresponding CPUs, since we're allocating
 	 * from phys_avail so the memory won't have an associated vm_page_t.
 	 */
 	pa = pmap_bootstrap_alloc(KSTACK_PAGES * PAGE_SIZE, colors);
 	kstack0_phys = pa;
 	virtual_avail += roundup(KSTACK_GUARD_PAGES, colors) * PAGE_SIZE;
 	kstack0 = virtual_avail;
 	virtual_avail += roundup(KSTACK_PAGES, colors) * PAGE_SIZE;
 	if (dcache_color_ignore == 0)
 		KASSERT(DCACHE_COLOR(kstack0) == DCACHE_COLOR(kstack0_phys),
 		    ("pmap_bootstrap: kstack0 miscolored"));
 	for (i = 0; i < KSTACK_PAGES; i++) {
 		pa = kstack0_phys + i * PAGE_SIZE;
 		va = kstack0 + i * PAGE_SIZE;
 		tp = tsb_kvtotte(va);
 		vpn = TV_VPN(va, TS_8K);
 		data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_SW | TD_CP |
 		    TD_CV | TD_P | TD_W;
 		pmap_bootstrap_set_tte(tp, vpn, data);
 	}
 
 	/*
 	 * Calculate the last available physical address.
 	 */
 	/* Advance i to the last (start, end) pair before a zero start. */
 	for (i = 0; phys_avail[i + 2] != 0; i += 2)
 		;
 	Maxmem = sparc64_btop(phys_avail[i + 1]);
 
 	/*
 	 * Add the PROM mappings to the kernel TSB.
 	 */
 	if ((vmem = OF_finddevice("/virtual-memory")) == -1)
 		OF_panic("%s: finddevice /virtual-memory", __func__);
 	if ((sz = OF_getproplen(vmem, "translations")) == -1)
 		OF_panic("%s: getproplen translations", __func__);
 	if (sizeof(translations) < sz)
 		OF_panic("%s: translations too small", __func__);
 	bzero(translations, sz);
 	if (OF_getprop(vmem, "translations", translations, sz) == -1)
 		OF_panic("%s: getprop /virtual-memory/translations",
 		    __func__);
 	sz /= sizeof(*translations);
 	translations_size = sz;
 #ifdef DIAGNOSTIC
 	OF_printf("pmap_bootstrap: translations\n");
 #endif
 	qsort(translations, sz, sizeof (*translations), om_cmp);
 	for (i = 0; i < sz; i++) {
 #ifdef DIAGNOSTIC
 		OF_printf("translation: start=%#lx size=%#lx tte=%#lx\n",
 		    translations[i].om_start, translations[i].om_size,
 		    translations[i].om_tte);
 #endif
 		/* Skip invalid entries and those outside the PROM range. */
 		if ((translations[i].om_tte & TD_V) == 0)
 			continue;
 		if (translations[i].om_start < VM_MIN_PROM_ADDRESS ||
 		    translations[i].om_start > VM_MAX_PROM_ADDRESS)
 			continue;
 		for (off = 0; off < translations[i].om_size;
 		    off += PAGE_SIZE) {
 			va = translations[i].om_start + off;
 			tp = tsb_kvtotte(va);
 			vpn = TV_VPN(va, TS_8K);
 			/*
 			 * Take the PROM's TTE data with the soft bits and
 			 * the CPU-specific diag/reserved bits cleared, mark
 			 * it executable and add the offset into the
 			 * translation.
 			 */
 			data = ((translations[i].om_tte &
 			    ~((TD_SOFT2_MASK << TD_SOFT2_SHIFT) |
 			    (cpu_impl >= CPU_IMPL_ULTRASPARCI &&
 			    cpu_impl < CPU_IMPL_ULTRASPARCIII ?
 			    (TD_DIAG_SF_MASK << TD_DIAG_SF_SHIFT) :
 			    (TD_RSVD_CH_MASK << TD_RSVD_CH_SHIFT)) |
 			    (TD_SOFT_MASK << TD_SOFT_SHIFT))) | TD_EXEC) +
 			    off;
 			pmap_bootstrap_set_tte(tp, vpn, data);
 		}
 	}
 
 	/*
 	 * Get the available physical memory ranges from /memory/reg.  These
 	 * are only used for kernel dumps, but it may not be wise to do PROM
 	 * calls in that situation.
 	 */
 	if ((sz = OF_getproplen(pmem, "reg")) == -1)
 		OF_panic("%s: getproplen /memory/reg", __func__);
 	if (sizeof(sparc64_memreg) < sz)
 		OF_panic("%s: sparc64_memreg too small", __func__);
 	if (OF_getprop(pmem, "reg", sparc64_memreg, sz) == -1)
 		OF_panic("%s: getprop /memory/reg", __func__);
 	sparc64_nmemreg = sz / sizeof(*sparc64_memreg);
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 */
 	pm = kernel_pmap;
 	PMAP_LOCK_INIT(pm);
 	for (i = 0; i < MAXCPU; i++)
 		pm->pm_context[i] = TLB_CTX_KERNEL;
 	CPU_FILL(&pm->pm_active);
 
 	/*
 	 * Initialize the global tte list lock, which is more commonly
 	 * known as the pmap pv global lock.
 	 */
 	rw_init(&tte_list_global_lock, "pmap pv global");
 
 	/*
 	 * Flush all non-locked TLB entries possibly left over by the
 	 * firmware.
 	 */
 	tlb_flush_nonlocked();
 }
 
 static void
 pmap_init_qpages(void)
 {
 	struct pcpu *cpu_pc;
 	int id;
 
 	/*
 	 * Nothing is allocated when dcache_color_ignore is set.  Otherwise
 	 * every CPU visited by CPU_FOREACH gets PAGE_SIZE * DCACHE_COLORS
 	 * bytes of KVA recorded in its pc_qmap_addr; failure is fatal.
 	 */
 	if (dcache_color_ignore == 0) {
 		CPU_FOREACH(id) {
 			cpu_pc = pcpu_find(id);
 			cpu_pc->pc_qmap_addr =
 			    kva_alloc(PAGE_SIZE * DCACHE_COLORS);
 			if (cpu_pc->pc_qmap_addr == 0)
 				panic("pmap_init_qpages: unable to allocate KVA");
 		}
 	}
 }
 
 SYSINIT(qpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_qpages, NULL);
 
 /*
  * Map the 4MB kernel TSB pages.
  */
 void
 pmap_map_tsb(void)
 {
 	vm_offset_t va;
 	vm_paddr_t pa;
 	u_long data;
 	u_long tar;
 	int off;
 
 	/*
 	 * Step through the kernel TSB in 4MB strides.  For each stride the
 	 * tag access register is loaded with the kernel-context virtual
 	 * address, then the TTE data is written to the DTLB data-in register.
 	 */
 	for (off = 0; off < tsb_kernel_size; off += PAGE_SIZE_4M) {
 		va = (vm_offset_t)tsb_kernel + off;
 		pa = tsb_kernel_phys + off;
 		tar = TLB_TAR_VA(va) | TLB_TAR_CTX(TLB_CTX_KERNEL);
 		data = TD_V | TD_4M | TD_PA(pa) | TD_L | TD_CP | TD_CV |
 		    TD_P | TD_W;
 		stxa(AA_DMMU_TAR, ASI_DMMU, tar);
 		stxa_sync(0, ASI_DTLB_DATA_IN_REG, data);
 	}
 }
 
 /*
  * Set the secondary context to be the kernel context (needed for FP block
  * operations in the kernel).
  */
 void
 pmap_set_kctx(void)
 {
 
 	stxa(AA_DMMU_SCXR, ASI_DMMU, (ldxa(AA_DMMU_SCXR, ASI_DMMU) &
 	    TLB_CXR_PGSZ_MASK) | TLB_CTX_KERNEL);
 	flush(KERNBASE);
 }
 
 /*
  * Allocate a physical page of memory directly from the phys_avail map.
  * Can only be called from pmap_bootstrap before avail start and end are
  * calculated.
  */
 static vm_paddr_t
 pmap_bootstrap_alloc(vm_size_t size, uint32_t colors)
 {
 	vm_paddr_t base;
 	int idx;
 
 	/* Round the request up to a multiple of PAGE_SIZE * colors. */
 	size = roundup(size, PAGE_SIZE * colors);
 
 	/*
 	 * Carve the space off the front of the first phys_avail range that
 	 * is large enough.  The list is terminated by a zero end address.
 	 */
 	for (idx = 0; phys_avail[idx + 1] != 0; idx += 2) {
 		if (phys_avail[idx + 1] - phys_avail[idx] >= size) {
 			base = phys_avail[idx];
 			phys_avail[idx] += size;
 			return (base);
 		}
 	}
 	OF_panic("%s: no suitable region found", __func__);
 }
 
 /*
  * Set a TTE.  This function is intended as a helper for when tsb_kernel is
  * direct-mapped but we haven't taken over the trap table yet, as is the
  * case when we are taking advantage of ASI_ATOMIC_QUAD_LDD_PHYS to access
  * the kernel TSB.
  */
 void
 pmap_bootstrap_set_tte(struct tte *tp, u_long vpn, u_long data)
 {
 	vm_paddr_t tte_pa;
 
 	if (tsb_kernel_ldd_phys != 0) {
 		/*
 		 * tp is used as a physical address here, so both fields are
 		 * written with physical-ASI stores instead of through the
 		 * pointer.
 		 */
 		tte_pa = (vm_paddr_t)tp;
 		stxa(tte_pa + offsetof(struct tte, tte_vpn),
 		    ASI_PHYS_USE_EC, vpn);
 		stxa(tte_pa + offsetof(struct tte, tte_data),
 		    ASI_PHYS_USE_EC, data);
 		return;
 	}
 
 	/* Otherwise tp can be dereferenced directly. */
 	tp->tte_vpn = vpn;
 	tp->tte_data = data;
 }
 
 /*
  * Initialize a vm_page's machine-dependent fields.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	/*
 	 * A fresh page has no owning pmap, no mappings on its TTE list, and
 	 * starts out with the D-cache color of its physical address.
 	 */
 	pa = VM_PAGE_TO_PHYS(m);
 	m->md.pmap = NULL;
 	m->md.color = DCACHE_COLOR(pa);
 	TAILQ_INIT(&m->md.tte_list);
 }
 
 /*
  * Initialize the pmap module.
  */
 void
 pmap_init(void)
 {
 	vm_offset_t start, addr;
 	vm_size_t size;
 	int idx, rv;
 
 	/*
 	 * Enter every valid translation that lies in the PROM address window
 	 * into kernel_map at exactly its own start address.  Anything other
 	 * than success at that address is fatal.
 	 */
 	for (idx = 0; idx < translations_size; idx++) {
 		if ((translations[idx].om_tte & TD_V) == 0)
 			continue;
 		start = translations[idx].om_start;
 		if (start < VM_MIN_PROM_ADDRESS || start > VM_MAX_PROM_ADDRESS)
 			continue;
 		addr = start;
 		size = translations[idx].om_size;
 		rv = vm_map_find(kernel_map, NULL, 0, &addr, size, 0,
 		    VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
 		if (rv != KERN_SUCCESS || addr != start)
 			panic("pmap_init: vm_map_find");
 	}
 }
 
 /*
  * Extract the physical page address associated with the given
  * map/virtual_address pair.
  */
 vm_paddr_t
 pmap_extract(pmap_t pm, vm_offset_t va)
 {
 	struct tte *tte;
 	vm_paddr_t phys;
 
 	/* The kernel pmap is delegated to pmap_kextract(). */
 	if (pm == kernel_pmap)
 		return (pmap_kextract(va));
 
 	/* A missing TTE yields 0; otherwise page address plus page offset. */
 	phys = 0;
 	PMAP_LOCK(pm);
 	tte = tsb_tte_lookup(pm, va);
 	if (tte != NULL)
 		phys = TTE_GET_PA(tte) | (va & TTE_GET_PAGE_MASK(tte));
 	PMAP_UNLOCK(pm);
 	return (phys);
 }
 
 /*
  * Atomically extract and hold the physical page with the given
  * pmap and virtual address pair if that mapping permits the given
  * protection.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pm, vm_offset_t va, vm_prot_t prot)
 {
 	struct tte *tp;
 	vm_page_t m;
 	vm_paddr_t pa;
 
 	/*
 	 * m is the result and stays NULL unless a page is found and held.
 	 * pa is the state handed to vm_page_pa_tryrelock() and released by
 	 * PA_UNLOCK_COND() on the way out.
 	 */
 	m = NULL;
 	pa = 0;
 	PMAP_LOCK(pm);
 retry:
 	if (pm == kernel_pmap) {
 		if (va >= VM_MIN_DIRECT_ADDRESS) {
 			/*
 			 * Address at or above VM_MIN_DIRECT_ADDRESS: the page
 			 * is derived from the address itself and held right
 			 * here.  tp = NULL makes the TTE-based path below a
 			 * no-op, and prot is not consulted on this path.
 			 */
 			tp = NULL;
 			m = PHYS_TO_VM_PAGE(TLB_DIRECT_TO_PHYS(va));
 			(void)vm_page_pa_tryrelock(pm, TLB_DIRECT_TO_PHYS(va),
 			    &pa);
 			vm_page_hold(m);
 		} else {
 			/* Kernel TSB entry; only a valid (TD_V) one counts. */
 			tp = tsb_kvtotte(va);
 			if ((tp->tte_data & TD_V) == 0)
 				tp = NULL;
 		}
 	} else
 		tp = tsb_tte_lookup(pm, va);
 	/*
 	 * Hold the page only if the mapping has TD_SW set or the caller did
 	 * not ask for write access.
 	 */
 	if (tp != NULL && ((tp->tte_data & TD_SW) ||
 	    (prot & VM_PROT_WRITE) == 0)) {
 		/* A non-zero return means the lookup must be redone. */
 		if (vm_page_pa_tryrelock(pm, TTE_GET_PA(tp), &pa))
 			goto retry;
 		m = PHYS_TO_VM_PAGE(TTE_GET_PA(tp));
 		vm_page_hold(m);
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pm);
 	return (m);
 }
 
 /*
  * Extract the physical page address associated with the given kernel virtual
  * address.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	struct tte *tte;
 
 	if (va < VM_MIN_DIRECT_ADDRESS) {
 		/* Look the address up in the kernel TSB; invalid yields 0. */
 		tte = tsb_kvtotte(va);
 		if ((tte->tte_data & TD_V) != 0)
 			return (TTE_GET_PA(tte) |
 			    (va & TTE_GET_PAGE_MASK(tte)));
 		return (0);
 	}
 
 	/* Direct-mapped addresses convert arithmetically. */
 	return (TLB_DIRECT_TO_PHYS(va));
 }
 
 /*
  * Account for a new mapping of page m at virtual address va and decide
  * whether that mapping may be entered as cacheable.  Returns 1 if it may
  * and 0 if it must be non-cacheable.  The tte list global lock must be
  * write-locked.  When dcache_color_ignore is set no color bookkeeping is
  * done and 1 is always returned.
  */
 int
 pmap_cache_enter(vm_page_t m, vm_offset_t va)
 {
 	struct tte *tp;
 	int color;
 
 	rw_assert(&tte_list_global_lock, RA_WLOCKED);
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_cache_enter: fake page"));
 	PMAP_STATS_INC(pmap_ncache_enter);
 
 	if (dcache_color_ignore != 0)
 		return (1);
 
 	/*
 	 * Find the color for this virtual address and note the added mapping.
 	 */
 	color = DCACHE_COLOR(va);
 	m->md.colors[color]++;
 
 	/*
 	 * If all existing mappings have the same color, the mapping is
 	 * cacheable.
 	 */
 	if (m->md.color == color) {
 		KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] == 0,
 		    ("pmap_cache_enter: cacheable, mappings of other color"));
 		/* The two counters split on whether the color is the PA's. */
 		if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 			PMAP_STATS_INC(pmap_ncache_enter_c);
 		else
 			PMAP_STATS_INC(pmap_ncache_enter_oc);
 		return (1);
 	}
 
 	/*
 	 * If there are no mappings of the other color, and the page still has
 	 * the wrong color, this must be a new mapping.  Change the color to
 	 * match the new mapping, which is cacheable.  We must flush the page
 	 * from the cache now.
 	 */
 	if (m->md.colors[DCACHE_OTHER_COLOR(color)] == 0) {
 		KASSERT(m->md.colors[color] == 1,
 		    ("pmap_cache_enter: changing color, not new mapping"));
 		dcache_page_inval(VM_PAGE_TO_PHYS(m));
 		m->md.color = color;
 		if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 			PMAP_STATS_INC(pmap_ncache_enter_cc);
 		else
 			PMAP_STATS_INC(pmap_ncache_enter_coc);
 		return (1);
 	}
 
 	/*
 	 * If the mapping is already non-cacheable, just return.
 	 */
 	if (m->md.color == -1) {
 		PMAP_STATS_INC(pmap_ncache_enter_nc);
 		return (0);
 	}
 
 	PMAP_STATS_INC(pmap_ncache_enter_cnc);
 
 	/*
 	 * Mark all mappings as uncacheable, flush any lines with the other
 	 * color out of the dcache, and set the color to none (-1).
 	 */
 	TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) {
 		atomic_clear_long(&tp->tte_data, TD_CV);
 		tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp));
 	}
 	dcache_page_inval(VM_PAGE_TO_PHYS(m));
 	m->md.color = -1;
 	return (0);
 }
 
 /*
  * Account for the removal of a mapping of page m at virtual address va.
  * If this removes the last mapping of one color from a page that had been
  * made non-cacheable (color -1), the remaining mappings get TD_CV set again
  * and the page takes on the other color.  The tte list global lock must be
  * write-locked.  Nothing is tracked when dcache_color_ignore is set.
  */
 static void
 pmap_cache_remove(vm_page_t m, vm_offset_t va)
 {
 	struct tte *tp;
 	int color;
 
 	rw_assert(&tte_list_global_lock, RA_WLOCKED);
 	CTR3(KTR_PMAP, "pmap_cache_remove: m=%p va=%#lx c=%d", m, va,
 	    m->md.colors[DCACHE_COLOR(va)]);
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_cache_remove: fake page"));
 	PMAP_STATS_INC(pmap_ncache_remove);
 
 	if (dcache_color_ignore != 0)
 		return;
 
 	KASSERT(m->md.colors[DCACHE_COLOR(va)] > 0,
 	    ("pmap_cache_remove: no mappings %d <= 0",
 	    m->md.colors[DCACHE_COLOR(va)]));
 
 	/*
 	 * Find the color for this virtual address and note the removal of
 	 * the mapping.
 	 */
 	color = DCACHE_COLOR(va);
 	m->md.colors[color]--;
 
 	/*
 	 * If the page is cacheable, just return and keep the same color, even
 	 * if there are no longer any mappings.
 	 */
 	if (m->md.color != -1) {
 		if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 			PMAP_STATS_INC(pmap_ncache_remove_c);
 		else
 			PMAP_STATS_INC(pmap_ncache_remove_oc);
 		return;
 	}
 
 	KASSERT(m->md.colors[DCACHE_OTHER_COLOR(color)] != 0,
 	    ("pmap_cache_remove: uncacheable, no mappings of other color"));
 
 	/*
 	 * If the page is not cacheable (color is -1), and the number of
 	 * mappings for this color is not zero, just return.  There are
 	 * mappings of the other color still, so remain non-cacheable.
 	 */
 	if (m->md.colors[color] != 0) {
 		PMAP_STATS_INC(pmap_ncache_remove_nc);
 		return;
 	}
 
 	/*
 	 * The number of mappings for this color is now zero.  Recache the
 	 * other colored mappings, and change the page color to the other
 	 * color.  There should be no lines in the data cache for this page,
 	 * so flushing should not be needed.
 	 */
 	TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) {
 		atomic_set_long(&tp->tte_data, TD_CV);
 		tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp));
 	}
 	m->md.color = DCACHE_OTHER_COLOR(color);
 
 	if (m->md.color == DCACHE_COLOR(VM_PAGE_TO_PHYS(m)))
 		PMAP_STATS_INC(pmap_ncache_remove_cc);
 	else
 		PMAP_STATS_INC(pmap_ncache_remove_coc);
 }
 
 /*
  * Map a wired page into kernel virtual address space.
  */
 void
 pmap_kenter(vm_offset_t va, vm_page_t m)
 {
 	vm_offset_t ova;
 	struct tte *tp;
 	vm_page_t om;
 	u_long data;
 
 	/* The caller must hold the tte list global lock for writing. */
 	rw_assert(&tte_list_global_lock, RA_WLOCKED);
 	PMAP_STATS_INC(pmap_nkenter);
 	tp = tsb_kvtotte(va);
 	CTR4(KTR_PMAP, "pmap_kenter: va=%#lx pa=%#lx tp=%p data=%#lx",
 	    va, VM_PAGE_TO_PHYS(m), tp, tp->tte_data);
 	/* Trace and count mappings whose VA color differs from the PA's. */
 	if (DCACHE_COLOR(VM_PAGE_TO_PHYS(m)) != DCACHE_COLOR(va)) {
 		CTR5(KTR_SPARE2,
 	"pmap_kenter: off color va=%#lx pa=%#lx o=%p ot=%d pi=%#lx",
 		    va, VM_PAGE_TO_PHYS(m), m->object,
 		    m->object ? m->object->type : -1,
 		    m->pindex);
 		PMAP_STATS_INC(pmap_nkenter_oc);
 	}
 	if ((tp->tte_data & TD_V) != 0) {
 		/*
 		 * The TSB slot already holds a valid entry.  If it is this
 		 * very page at this very address there is nothing to do;
 		 * otherwise unhook the old entry from its page before the
 		 * slot is reused.
 		 */
 		om = PHYS_TO_VM_PAGE(TTE_GET_PA(tp));
 		ova = TTE_GET_VA(tp);
 		if (m == om && va == ova) {
 			PMAP_STATS_INC(pmap_nkenter_stupid);
 			return;
 		}
 		TAILQ_REMOVE(&om->md.tte_list, tp, tte_link);
 		pmap_cache_remove(om, ova);
 		/*
 		 * Only a displaced entry for a different address is demapped
 		 * here; va itself is not demapped by this function.
 		 */
 		if (va != ova)
 			tlb_page_demap(kernel_pmap, ova);
 	}
 	data = TD_V | TD_8K | VM_PAGE_TO_PHYS(m) | TD_REF | TD_SW | TD_CP |
 	    TD_P | TD_W;
 	/* TD_CV is added only if pmap_cache_enter() returns non-zero. */
 	if (pmap_cache_enter(m, va) != 0)
 		data |= TD_CV;
 	tp->tte_vpn = TV_VPN(va, TS_8K);
 	tp->tte_data = data;
 	TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link);
 }
 
 /*
  * Map a wired page into kernel virtual address space.  This additionally
  * takes a flag argument which is or'ed to the TTE data.  This is used by
  * sparc64_bus_mem_map().
  * NOTE: if the mapping is non-cacheable, it's the caller's responsibility
  * to flush entries that might still be in the cache, if applicable.
  */
 void
 pmap_kenter_flags(vm_offset_t va, vm_paddr_t pa, u_long flags)
 {
 	struct tte *tte;
 	u_long data;
 
 	tte = tsb_kvtotte(va);
 	CTR4(KTR_PMAP, "pmap_kenter_flags: va=%#lx pa=%#lx tp=%p data=%#lx",
 	    va, pa, tte, tte->tte_data);
 
 	/*
 	 * The caller's flags are or'ed into a fixed valid, 8K, referenced,
 	 * privileged base.  No per-page TTE list or color bookkeeping is
 	 * done here.
 	 */
 	data = TD_V | TD_8K | TD_PA(pa) | TD_REF | TD_P | flags;
 	tte->tte_vpn = TV_VPN(va, TS_8K);
 	tte->tte_data = data;
 }
 
 /*
  * Remove a wired page from kernel virtual address space.
  */
 void
 pmap_kremove(vm_offset_t va)
 {
 	struct tte *tte;
 	vm_page_t pg;
 
 	rw_assert(&tte_list_global_lock, RA_WLOCKED);
 	PMAP_STATS_INC(pmap_nkremove);
 	tte = tsb_kvtotte(va);
 	CTR3(KTR_PMAP, "pmap_kremove: va=%#lx tp=%p data=%#lx", va, tte,
 	    tte->tte_data);
 
 	/*
 	 * Only a valid entry needs work: unlink it from its page's TTE list,
 	 * update the page's color accounting, and clear the entry.
 	 */
 	if ((tte->tte_data & TD_V) != 0) {
 		pg = PHYS_TO_VM_PAGE(TTE_GET_PA(tte));
 		TAILQ_REMOVE(&pg->md.tte_list, tte, tte_link);
 		pmap_cache_remove(pg, va);
 		TTE_ZERO(tte);
 	}
 }
 
 /*
  * Inverse of pmap_kenter_flags, used by bus_space_unmap().
  */
 void
 pmap_kremove_flags(vm_offset_t va)
 {
 	struct tte *tte;
 
 	/*
 	 * The kernel TSB entry for va is cleared unconditionally, with no
 	 * TTE list or color bookkeeping.
 	 */
 	tte = tsb_kvtotte(va);
 	CTR3(KTR_PMAP, "pmap_kremove_flags: va=%#lx tp=%p data=%#lx", va, tte,
 	    tte->tte_data);
 	TTE_ZERO(tte);
 }
 
 /*
  * Map a range of physical addresses into kernel virtual address space.
  *
  * The value passed in *virt is a suggested virtual address for the mapping.
  * Architectures which can support a direct-mapped physical to virtual region
  * can return the appropriate address within that region, leaving '*virt'
  * unchanged.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	vm_offset_t direct_va;
 
 	/*
 	 * Only start is used: the result is its direct-mapped address.
 	 * *virt is left untouched; end and prot are ignored.
 	 */
 	direct_va = TLB_PHYS_TO_DIRECT(start);
 	return (direct_va);
 }
 
 /*
  * Map a list of wired pages into kernel virtual address space.  This is
  * intended for temporary mappings which do not need page modification or
  * references recorded.  Existing mappings in the region are overwritten.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *m, int count)
 {
 	vm_offset_t va;
 	int idx;
 
 	PMAP_STATS_INC(pmap_nqenter);
 
 	/* Enter the pages one per PAGE_SIZE step under the global lock. */
 	rw_wlock(&tte_list_global_lock);
 	for (idx = 0, va = sva; idx < count; idx++, va += PAGE_SIZE)
 		pmap_kenter(va, m[idx]);
 	rw_wunlock(&tte_list_global_lock);
 
 	/* va is now one past the last address entered. */
 	tlb_range_demap(kernel_pmap, sva, va);
 }
 
 /*
  * Remove page mappings from kernel virtual address space.  Intended for
  * temporary mappings entered by pmap_qenter.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 	int idx;
 
 	PMAP_STATS_INC(pmap_nqremove);
 
 	/* Remove the mappings one per PAGE_SIZE step under the global lock. */
 	rw_wlock(&tte_list_global_lock);
 	for (idx = 0, va = sva; idx < count; idx++, va += PAGE_SIZE)
 		pmap_kremove(va);
 	rw_wunlock(&tte_list_global_lock);
 
 	/* va is now one past the last address removed. */
 	tlb_range_demap(kernel_pmap, sva, va);
 }
 
 /*
  * Initialize the pmap associated with process 0.
  */
 void
 pmap_pinit0(pmap_t pm)
 {
 	int cpu;
 
 	PMAP_LOCK_INIT(pm);
 
 	/* This pmap gets no TSB of its own and is active on no CPU. */
 	pm->pm_tsb = NULL;
 	pm->pm_tsb_obj = NULL;
 	CPU_ZERO(&pm->pm_active);
 
 	/* Every per-CPU context slot is set to the kernel context. */
 	for (cpu = 0; cpu < MAXCPU; cpu++)
 		pm->pm_context[cpu] = TLB_CTX_KERNEL;
 
 	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure, such as one in a
  * vmspace structure.
  */
 /* Returns 0 if KVA for the TSB cannot be allocated, 1 otherwise. */
 int
 pmap_pinit(pmap_t pm)
 {
 	vm_page_t ma[TSB_PAGES];
-	vm_page_t m;
 	int i;
 
 	/*
 	 * Allocate KVA space for the TSB.
 	 */
 	/* Skipped when pm_tsb is already set. */
 	if (pm->pm_tsb == NULL) {
 		pm->pm_tsb = (struct tte *)kva_alloc(TSB_BSIZE);
 		if (pm->pm_tsb == NULL)
 			return (0);
 		}
 
 	/*
 	 * Allocate an object for it.
 	 */
 	/* Likewise skipped when pm_tsb_obj is already set. */
 	if (pm->pm_tsb_obj == NULL)
 		pm->pm_tsb_obj = vm_object_allocate(OBJT_PHYS, TSB_PAGES);
 
 	/* Set every per-CPU context slot to -1; the pmap is active nowhere. */
 	for (i = 0; i < MAXCPU; i++)
 		pm->pm_context[i] = -1;
 	CPU_ZERO(&pm->pm_active);
 
 	/*
 	 * Grab the TSB_PAGES pages backing the TSB into ma[], requesting
 	 * them wired and zeroed, then map them at pm_tsb.
 	 */
 	VM_OBJECT_WLOCK(pm->pm_tsb_obj);
-	for (i = 0; i < TSB_PAGES; i++) {
-		m = vm_page_grab(pm->pm_tsb_obj, i, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
-		m->valid = VM_PAGE_BITS_ALL;
-		m->md.pmap = pm;
-		ma[i] = m;
-	}
+	vm_page_grab_pages(pm->pm_tsb_obj, 0, VM_ALLOC_NORMAL |
+	    VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO, ma, TSB_PAGES);
 	VM_OBJECT_WUNLOCK(pm->pm_tsb_obj);
+	for (i = 0; i < TSB_PAGES; i++)
+		ma[i]->md.pmap = pm;
 	pmap_qenter((vm_offset_t)pm->pm_tsb, ma, TSB_PAGES);
 
 	bzero(&pm->pm_stats, sizeof(pm->pm_stats));
 	return (1);
 }
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  */
 void
 pmap_release(pmap_t pm)
 {
 	vm_object_t obj;
 	vm_page_t m;
 #ifdef SMP
 	struct pcpu *pc;
 #endif
 
 	CTR2(KTR_PMAP, "pmap_release: ctx=%#x tsb=%p",
 	    pm->pm_context[curcpu], pm->pm_tsb);
 	KASSERT(pmap_resident_count(pm) == 0,
 	    ("pmap_release: resident pages %ld != 0",
 	    pmap_resident_count(pm)));
 
 	/*
 	 * After the pmap was freed, it might be reallocated to a new process.
 	 * When switching, this might lead us to wrongly assume that we need
 	 * not switch contexts because old and new pmap pointer are equal.
 	 * Therefore, make sure that this pmap is not referenced by any PCPU
 	 * pointer any more.  This could happen in two cases:
 	 * - A process that referenced the pmap is currently exiting on a CPU.
 	 *   However, it is guaranteed to not switch in any more after setting
 	 *   its state to PRS_ZOMBIE.
 	 * - A process that referenced this pmap ran on a CPU, but we switched
 	 *   to a kernel thread, leaving the pmap pointer unchanged.
 	 */
 #ifdef SMP
 	/* Replace pm with NULL in the pc_pmap of every CPU on cpuhead. */
 	sched_pin();
 	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
 		atomic_cmpset_rel_ptr((uintptr_t *)&pc->pc_pmap,
 		    (uintptr_t)pm, (uintptr_t)NULL);
 	sched_unpin();
 #else
 	/* Uniprocessor: only the current CPU's pointer can refer to pm. */
 	critical_enter();
 	if (PCPU_GET(pmap) == pm)
 		PCPU_SET(pmap, NULL);
 	critical_exit();
 #endif
 
 	/*
 	 * Unmap the TSB from KVA, then free every page left in the TSB
 	 * object, dropping the page's wire count and the global wired-page
 	 * counter for each.  The KVA range and the object itself are not
 	 * released here.
 	 */
 	pmap_qremove((vm_offset_t)pm->pm_tsb, TSB_PAGES);
 	obj = pm->pm_tsb_obj;
 	VM_OBJECT_WLOCK(obj);
 	KASSERT(obj->ref_count == 1, ("pmap_release: tsbobj ref count != 1"));
 	while (!TAILQ_EMPTY(&obj->memq)) {
 		m = TAILQ_FIRST(&obj->memq);
 		m->md.pmap = NULL;
 		m->wire_count--;
 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 		vm_page_free_zero(m);
 	}
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 /*
  * Grow the number of kernel page table entries.  Unneeded.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 
 	/* addr is ignored; any call is treated as a fatal error. */
 	panic("pmap_growkernel: can't grow kernel");
 }
 
 /*
  * Tear down the single mapping described by tp, which maps va in pm.
  * The signature matches the callback passed to tsb_foreach(); pm2 is not
  * used.  Returns 0 once PMAP_REMOVE_DONE(pm) is true and 1 otherwise;
  * pmap_remove() stops its per-page loop on a 0 return.  The tte list
  * global lock must be write-locked.
  */
 int
 pmap_remove_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp,
     vm_offset_t va)
 {
 	vm_page_t m;
 	u_long data;
 
 	rw_assert(&tte_list_global_lock, RA_WLOCKED);
 	/* Read and clear tte_data in one step; work from the value read. */
 	data = atomic_readandclear_long(&tp->tte_data);
 	if ((data & TD_FAKE) == 0) {
 		m = PHYS_TO_VM_PAGE(TD_PA(data));
 		TAILQ_REMOVE(&m->md.tte_list, tp, tte_link);
 		if ((data & TD_WIRED) != 0)
 			pm->pm_stats.wired_count--;
 		if ((data & TD_PV) != 0) {
 			/*
 			 * For TD_PV entries, carry the modified and
 			 * referenced bits over to the vm_page before they
 			 * are lost.
 			 */
 			if ((data & TD_W) != 0)
 				vm_page_dirty(m);
 			if ((data & TD_REF) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/* No mappings remain, so none can be writeable. */
 			if (TAILQ_EMPTY(&m->md.tte_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 			pm->pm_stats.resident_count--;
 		}
 		pmap_cache_remove(m, va);
 	}
 	TTE_ZERO(tp);
 	if (PMAP_REMOVE_DONE(pm))
 		return (0);
 	return (1);
 }
 
 /*
  * Remove the given range of addresses from the specified map.
  */
 void
 pmap_remove(pmap_t pm, vm_offset_t start, vm_offset_t end)
 {
 	struct tte *tp;
 	vm_offset_t va;
 
 	CTR3(KTR_PMAP, "pmap_remove: ctx=%#lx start=%#lx end=%#lx",
 	    pm->pm_context[curcpu], start, end);
 	/* Checked before any lock is taken: nothing left to remove. */
 	if (PMAP_REMOVE_DONE(pm))
 		return;
 	/* Lock order: tte list global lock first, then the pmap lock. */
 	rw_wlock(&tte_list_global_lock);
 	PMAP_LOCK(pm);
 	if (end - start > PMAP_TSB_THRESH) {
 		/*
 		 * Ranges above the threshold go through tsb_foreach() and
 		 * are followed by a demap of the whole context.
 		 */
 		tsb_foreach(pm, NULL, start, end, pmap_remove_tte);
 		tlb_context_demap(pm);
 	} else {
 		/*
 		 * Smaller ranges are looked up page by page; the loop stops
 		 * early when pmap_remove_tte() returns 0.
 		 */
 		for (va = start; va < end; va += PAGE_SIZE)
 			if ((tp = tsb_tte_lookup(pm, va)) != NULL &&
 			    !pmap_remove_tte(pm, NULL, tp, va))
 				break;
 		tlb_range_demap(pm, start, end - 1);
 	}
 	PMAP_UNLOCK(pm);
 	rw_wunlock(&tte_list_global_lock);
 }
 
 /*
  * Remove every TD_PV mapping of the managed page m, from whichever pmap
  * holds it.  Entries on the page's TTE list without TD_PV are left alone.
  */
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct pmap *pm;
 	struct tte *tpn;
 	struct tte *tp;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	rw_wlock(&tte_list_global_lock);
 	/* The successor is fetched first because tp may be unlinked below. */
 	for (tp = TAILQ_FIRST(&m->md.tte_list); tp != NULL; tp = tpn) {
 		tpn = TAILQ_NEXT(tp, tte_link);
 		if ((tp->tte_data & TD_PV) == 0)
 			continue;
 		pm = TTE_GET_PMAP(tp);
 		va = TTE_GET_VA(tp);
 		PMAP_LOCK(pm);
 		/* Carry wired/referenced/modified state over before teardown. */
 		if ((tp->tte_data & TD_WIRED) != 0)
 			pm->pm_stats.wired_count--;
 		if ((tp->tte_data & TD_REF) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		if ((tp->tte_data & TD_W) != 0)
 			vm_page_dirty(m);
 		/* Invalidate the entry, demap it, then unlink and clear it. */
 		tp->tte_data &= ~TD_V;
 		tlb_page_demap(pm, va);
 		TAILQ_REMOVE(&m->md.tte_list, tp, tte_link);
 		pm->pm_stats.resident_count--;
 		pmap_cache_remove(m, va);
 		TTE_ZERO(tp);
 		PMAP_UNLOCK(pm);
 	}
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&tte_list_global_lock);
 }
 
 static int
 pmap_protect_tte(struct pmap *pm, struct pmap *pm2, struct tte *tp,
     vm_offset_t va)
 {
 	vm_page_t pg;
 	u_long old;
 
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 
 	/*
 	 * Clear TD_SW and TD_W in the entry and work from the value handed
 	 * back by atomic_clear_long().  When that value has both TD_PV and
 	 * TD_W set, the page is marked dirty.  pm2 and va are unused.
 	 */
 	old = atomic_clear_long(&tp->tte_data, TD_SW | TD_W);
 	if ((old & TD_PV) == TD_PV && (old & TD_W) == TD_W) {
 		pg = PHYS_TO_VM_PAGE(TD_PA(old));
 		vm_page_dirty(pg);
 	}
 	return (1);
 }
 
 /*
  * Set the physical protection on the specified range of this map as requested.
  */
 void
 pmap_protect(pmap_t pm, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	struct tte *tte;
 	vm_offset_t addr;
 
 	CTR4(KTR_PMAP, "pmap_protect: ctx=%#lx sva=%#lx eva=%#lx prot=%#lx",
 	    pm->pm_context[curcpu], sva, eva, prot);
 
 	/* Without read permission the range is removed outright. */
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pm, sva, eva);
 		return;
 	}
 
 	/* If write permission is kept, there is nothing to change. */
 	if ((prot & VM_PROT_WRITE) != 0)
 		return;
 
 	/*
 	 * Write-protect the range: page by page for ranges up to
 	 * PMAP_TSB_THRESH, otherwise via tsb_foreach() followed by a demap
 	 * of the whole context.
 	 */
 	PMAP_LOCK(pm);
 	if (eva - sva <= PMAP_TSB_THRESH) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
 			tte = tsb_tte_lookup(pm, addr);
 			if (tte == NULL)
 				continue;
 			pmap_protect_tte(pm, NULL, tte, addr);
 		}
 		tlb_range_demap(pm, sva, eva - 1);
 	} else {
 		tsb_foreach(pm, NULL, sva, eva, pmap_protect_tte);
 		tlb_context_demap(pm);
 	}
 	PMAP_UNLOCK(pm);
 }
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  */
 int
 pmap_enter(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	int rv;
 
 	/*
 	 * Locking wrapper around pmap_enter_locked(): take the tte list
 	 * global lock for writing, then the pmap lock, and hand back whatever
 	 * the worker returns.  Note that the global lock is released before
 	 * the pmap lock.
 	 */
 	rw_wlock(&tte_list_global_lock);
 	PMAP_LOCK(pm);
 	rv = pmap_enter_locked(pm, va, m, prot, flags, psind);
 	rw_wunlock(&tte_list_global_lock);
 	PMAP_UNLOCK(pm);
 	return (rv);
 }
 
 /*
  * Map the given physical page at the specified virtual address in the
  * target pmap with the protection requested.  If specified the page
  * will be wired down.
  *
  * The tte list global lock (write) and the pmap lock must be held.
  */
 static int
 pmap_enter_locked(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind __unused)
 {
 	struct tte *tp;
 	vm_paddr_t pa;
 	vm_page_t real;
 	u_long data;
 	boolean_t wired;
 
 	rw_assert(&tte_list_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 	PMAP_STATS_INC(pmap_nenter);
 	pa = VM_PAGE_TO_PHYS(m);
 	/* Of the flags, only PMAP_ENTER_WIRED is examined here. */
 	wired = (flags & PMAP_ENTER_WIRED) != 0;
 
 	/*
 	 * If this is a fake page from the device_pager, but it covers actual
 	 * physical memory, convert to the real backing page.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		real = vm_phys_paddr_to_vm_page(pa);
 		if (real != NULL)
 			m = real;
 	}
 
 	CTR6(KTR_PMAP,
 	    "pmap_enter_locked: ctx=%p m=%p va=%#lx pa=%#lx prot=%#x wired=%d",
 	    pm->pm_context[curcpu], m, va, pa, prot, wired);
 
 	/*
 	 * If there is an existing mapping, and the physical address has not
 	 * changed, must be protection or wiring change.
 	 */
 	if ((tp = tsb_tte_lookup(pm, va)) != NULL && TTE_GET_PA(tp) == pa) {
 		CTR0(KTR_PMAP, "pmap_enter_locked: update");
 		PMAP_STATS_INC(pmap_nenter_update);
 
 		/*
 		 * Wiring change, just update stats.
 		 */
 		if (wired) {
 			if ((tp->tte_data & TD_WIRED) == 0) {
 				tp->tte_data |= TD_WIRED;
 				pm->pm_stats.wired_count++;
 			}
 		} else {
 			if ((tp->tte_data & TD_WIRED) != 0) {
 				tp->tte_data &= ~TD_WIRED;
 				pm->pm_stats.wired_count--;
 			}
 		}
 
 		/*
 		 * Save the old bits and clear the ones we're interested in.
 		 */
 		data = tp->tte_data;
 		tp->tte_data &= ~(TD_EXEC | TD_SW | TD_W);
 
 		/*
 		 * If write permission is requested, grant it (and pre-set
 		 * TD_W for wired mappings).  Otherwise write permission is
 		 * being turned off, so sense modify status: if the old
 		 * mapping had TD_W set, mark the page dirty.
 		 */
 		if ((prot & VM_PROT_WRITE) != 0) {
 			tp->tte_data |= TD_SW;
 			if (wired)
 				tp->tte_data |= TD_W;
 			if ((m->oflags & VPO_UNMANAGED) == 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 		} else if ((data & TD_W) != 0)
 			vm_page_dirty(m);
 
 		/*
 		 * If we're turning on execute permissions, flush the icache.
 		 */
 		if ((prot & VM_PROT_EXECUTE) != 0) {
 			if ((data & TD_EXEC) == 0)
 				icache_page_inval(pa);
 			tp->tte_data |= TD_EXEC;
 		}
 
 		/*
 		 * Delete the old mapping.
 		 */
 		tlb_page_demap(pm, TTE_GET_VA(tp));
 	} else {
 		/*
 		 * If there is an existing mapping, but it's for a different
 		 * physical address, delete the old mapping.
 		 */
 		if (tp != NULL) {
 			CTR0(KTR_PMAP, "pmap_enter_locked: replace");
 			PMAP_STATS_INC(pmap_nenter_replace);
 			pmap_remove_tte(pm, NULL, tp, va);
 			tlb_page_demap(pm, va);
 		} else {
 			CTR0(KTR_PMAP, "pmap_enter_locked: new");
 			PMAP_STATS_INC(pmap_nenter_new);
 		}
 
 		/*
 		 * Now set up the data and install the new mapping.
 		 */
 		data = TD_V | TD_8K | TD_PA(pa);
 		if (pm == kernel_pmap)
 			data |= TD_P;
 		if ((prot & VM_PROT_WRITE) != 0) {
 			data |= TD_SW;
 			if ((m->oflags & VPO_UNMANAGED) == 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 		}
 		if (prot & VM_PROT_EXECUTE) {
 			data |= TD_EXEC;
 			icache_page_inval(pa);
 		}
 
 		/*
 		 * If it's wired, update stats.  We also don't need reference
 		 * or modify tracking for wired mappings, so set the bits now.
 		 */
 		if (wired) {
 			pm->pm_stats.wired_count++;
 			data |= TD_REF | TD_WIRED;
 			if ((prot & VM_PROT_WRITE) != 0)
 				data |= TD_W;
 		}
 
 		tsb_tte_enter(pm, m, va, TS_8K, data);
 	}
 
 	/* Every path through this function reports success. */
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a sequence of resident pages belonging to the same object.
  * The sequence begins with the given page m_start.  This page is
  * mapped at the given virtual address start.  Each subsequent page is
  * mapped at a virtual address that is offset from start by the same
  * amount as the page is offset from m_start within the object.  The
  * last page in the sequence is the page with the largest offset from
  * m_start that can be mapped at a virtual address less than the given
  * virtual address end.  Not every virtual page between start and end
  * is mapped; only those for which a resident page exists with the
  * corresponding offset from m_start are mapped.
  */
 void
 pmap_enter_object(pmap_t pm, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_page_t pg;
 	vm_pindex_t delta, npages;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	rw_wlock(&tte_list_global_lock);
 	PMAP_LOCK(pm);
 
 	/*
 	 * Follow the listq links from m_start, entering each page at the
 	 * address matching its offset from m_start, until the list ends or
 	 * a page falls outside [start, end).  Only read and execute
 	 * permission are passed on, and the mappings are not wired.
 	 */
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		delta = pg->pindex - m_start->pindex;
 		if (delta >= npages)
 			break;
 		pmap_enter_locked(pm, start + ptoa(delta), pg,
 		    prot & (VM_PROT_READ | VM_PROT_EXECUTE), 0, 0);
 	}
 
 	rw_wunlock(&tte_list_global_lock);
 	PMAP_UNLOCK(pm);
 }
 
 void
 pmap_enter_quick(pmap_t pm, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 
 	/*
 	 * Locked call into pmap_enter_locked() for a single page.  Only read
 	 * and execute permission are passed on, no flags are set, and the
 	 * worker's return value is discarded.
 	 */
 	rw_wlock(&tte_list_global_lock);
 	PMAP_LOCK(pm);
 	pmap_enter_locked(pm, va, m, prot & (VM_PROT_READ | VM_PROT_EXECUTE),
 	    0, 0);
 	rw_wunlock(&tte_list_global_lock);
 	PMAP_UNLOCK(pm);
 }
 
 void
 pmap_object_init_pt(pmap_t pm, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	/*
 	 * No mappings are created here.  The function only asserts that the
 	 * object is write-locked and is a device or scatter/gather object.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  * Clear TD_WIRED on a single TTE and drop pm's wired count.  The signature
  * matches the callback passed to tsb_foreach(); pm2 and va are not used.
  * Panics if the entry is not wired.  Always returns 1.
  */
 static int
 pmap_unwire_tte(pmap_t pm, pmap_t pm2, struct tte *tp, vm_offset_t va)
 {
 
 	PMAP_LOCK_ASSERT(pm, MA_OWNED);
 	if ((tp->tte_data & TD_WIRED) == 0)
 		panic("pmap_unwire_tte: tp %p is missing TD_WIRED", tp);
 	atomic_clear_long(&tp->tte_data, TD_WIRED);
 	pm->pm_stats.wired_count--;
 	return (1);
 }
 
 /*
  * Clear the wired attribute from the mappings for the specified range of
  * addresses in the given pmap.  Every valid mapping within that range must
  * have the wired attribute set.  In contrast, invalid mappings cannot have
  * the wired attribute set, so they are ignored.
  *
  * The wired attribute of the translation table entry is not a hardware
  * feature, so there is no need to invalidate any TLB entries.
  */
 void
 pmap_unwire(pmap_t pm, vm_offset_t sva, vm_offset_t eva)
 {
 	struct tte *tte;
 	vm_offset_t addr;
 
 	/*
 	 * Ranges up to PMAP_TSB_THRESH are looked up page by page; larger
 	 * ones go through tsb_foreach().
 	 */
 	PMAP_LOCK(pm);
 	if (eva - sva <= PMAP_TSB_THRESH) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE) {
 			tte = tsb_tte_lookup(pm, addr);
 			if (tte != NULL)
 				pmap_unwire_tte(pm, NULL, tte, addr);
 		}
 	} else
 		tsb_foreach(pm, NULL, sva, eva, pmap_unwire_tte);
 	PMAP_UNLOCK(pm);
 }
 
 static int
 pmap_copy_tte(pmap_t src_pmap, pmap_t dst_pmap, struct tte *tp,
     vm_offset_t va)
 {
 	vm_page_t pg;
 	u_long data;
 
 	/* Skip fake entries and addresses dst_pmap already maps. */
 	if ((tp->tte_data & TD_FAKE) != 0 ||
 	    tsb_tte_lookup(dst_pmap, va) != NULL)
 		return (1);
 
 	/*
 	 * Enter a copy of the source entry with the TD_PV, TD_REF, TD_SW,
 	 * TD_CV and TD_W bits stripped.
 	 */
 	data = tp->tte_data & ~(TD_PV | TD_REF | TD_SW | TD_CV | TD_W);
 	pg = PHYS_TO_VM_PAGE(TTE_GET_PA(tp));
 	tsb_tte_enter(dst_pmap, pg, va, TS_8K, data);
 	return (1);
 }
 
 /*
  * Copy the mappings for [src_addr, src_addr + len) from src_pmap into
  * dst_pmap via pmap_copy_tte().  Nothing is done unless the source and
  * destination addresses are identical.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 	struct tte *tp;
 	vm_offset_t va;
 
 	if (dst_addr != src_addr)
 		return;
 	rw_wlock(&tte_list_global_lock);
 	/*
 	 * Take the two pmap locks in ascending pointer order, presumably so
 	 * that concurrent copies agree on a lock order.
 	 */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	/*
 	 * Ranges above the threshold go through tsb_foreach() and demap the
 	 * whole destination context; smaller ones go page by page and demap
 	 * only the range.
 	 */
 	if (len > PMAP_TSB_THRESH) {
 		tsb_foreach(src_pmap, dst_pmap, src_addr, src_addr + len,
 		    pmap_copy_tte);
 		tlb_context_demap(dst_pmap);
 	} else {
 		for (va = src_addr; va < src_addr + len; va += PAGE_SIZE)
 			if ((tp = tsb_tte_lookup(src_pmap, va)) != NULL)
 				pmap_copy_tte(src_pmap, dst_pmap, tp, va);
 		tlb_range_demap(dst_pmap, src_addr, src_addr + len - 1);
 	}
 	rw_wunlock(&tte_list_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 void
 pmap_zero_page(vm_page_t m)
 {
 	struct tte *tte;
 	vm_offset_t kva;
 	vm_paddr_t pa;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_zero_page: fake page"));
 	PMAP_STATS_INC(pmap_nzero_page);
 	pa = VM_PAGE_TO_PHYS(m);
 
 	/*
 	 * Colors are ignored, or the page's color matches its physical
 	 * address: zero it through the direct map.
 	 */
 	if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa)) {
 		PMAP_STATS_INC(pmap_nzero_page_c);
 		kva = TLB_PHYS_TO_DIRECT(pa);
 		cpu_block_zero((void *)kva, PAGE_SIZE);
 		return;
 	}
 
 	/* Page has no color (-1): zero it by physical address. */
 	if (m->md.color == -1) {
 		PMAP_STATS_INC(pmap_nzero_page_nc);
 		aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE);
 		return;
 	}
 
 	/*
 	 * Otherwise install a temporary kernel TTE in the pmap_temp_map_1
 	 * slot selected by the page's color, zero through it, and demap it
 	 * again, all under the kernel pmap lock.
 	 */
 	PMAP_STATS_INC(pmap_nzero_page_oc);
 	PMAP_LOCK(kernel_pmap);
 	kva = pmap_temp_map_1 + (m->md.color * PAGE_SIZE);
 	tte = tsb_kvtotte(kva);
 	tte->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 	tte->tte_vpn = TV_VPN(kva, TS_8K);
 	cpu_block_zero((void *)kva, PAGE_SIZE);
 	tlb_page_demap(kernel_pmap, kva);
 	PMAP_UNLOCK(kernel_pmap);
 }
 
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	struct tte *tte;
 	vm_offset_t kva;
 	vm_paddr_t pa;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_zero_page_area: fake page"));
 	KASSERT(off + size <= PAGE_SIZE, ("pmap_zero_page_area: bad off/size"));
 	PMAP_STATS_INC(pmap_nzero_page_area);
 	pa = VM_PAGE_TO_PHYS(m);
 
 	/*
 	 * Colors are ignored, or the page's color matches its physical
 	 * address: zero the area through the direct map.
 	 */
 	if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa)) {
 		PMAP_STATS_INC(pmap_nzero_page_area_c);
 		kva = TLB_PHYS_TO_DIRECT(pa);
 		bzero((void *)(kva + off), size);
 		return;
 	}
 
 	/* Page has no color (-1): zero the area by physical address. */
 	if (m->md.color == -1) {
 		PMAP_STATS_INC(pmap_nzero_page_area_nc);
 		aszero(ASI_PHYS_USE_EC, pa + off, size);
 		return;
 	}
 
 	/*
 	 * Otherwise install a temporary kernel TTE in the pmap_temp_map_1
 	 * slot selected by the page's color, zero the area through it, and
 	 * demap it again, all under the kernel pmap lock.
 	 */
 	PMAP_STATS_INC(pmap_nzero_page_area_oc);
 	PMAP_LOCK(kernel_pmap);
 	kva = pmap_temp_map_1 + (m->md.color * PAGE_SIZE);
 	tte = tsb_kvtotte(kva);
 	tte->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 	tte->tte_vpn = TV_VPN(kva, TS_8K);
 	bzero((void *)(kva + off), size);
 	tlb_page_demap(kernel_pmap, kva);
 	PMAP_UNLOCK(kernel_pmap);
 }
 
 void
 pmap_zero_page_idle(vm_page_t m)
 {
 	struct tte *tte;
 	vm_offset_t kva;
 	vm_paddr_t pa;
 
 	KASSERT((m->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_zero_page_idle: fake page"));
 	PMAP_STATS_INC(pmap_nzero_page_idle);
 	pa = VM_PAGE_TO_PHYS(m);
 
 	/*
 	 * Colors are ignored, or the page's color matches its physical
 	 * address: zero it through the direct map.
 	 */
 	if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa)) {
 		PMAP_STATS_INC(pmap_nzero_page_idle_c);
 		kva = TLB_PHYS_TO_DIRECT(pa);
 		cpu_block_zero((void *)kva, PAGE_SIZE);
 		return;
 	}
 
 	/* Page has no color (-1): zero it by physical address. */
 	if (m->md.color == -1) {
 		PMAP_STATS_INC(pmap_nzero_page_idle_nc);
 		aszero(ASI_PHYS_USE_EC, pa, PAGE_SIZE);
 		return;
 	}
 
 	/*
 	 * Otherwise use the pmap_idle_map slot selected by the page's color.
 	 * Unlike pmap_zero_page(), the kernel pmap lock is not taken here.
 	 */
 	PMAP_STATS_INC(pmap_nzero_page_idle_oc);
 	kva = pmap_idle_map + (m->md.color * PAGE_SIZE);
 	tte = tsb_kvtotte(kva);
 	tte->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 	tte->tte_vpn = TV_VPN(kva, TS_8K);
 	cpu_block_zero((void *)kva, PAGE_SIZE);
 	tlb_page_demap(kernel_pmap, kva);
 }
 
 /*
  * Copy the contents of page "msrc" into page "mdst".
  *
  * Each page is accessed by whichever path suits its recorded colour:
  * the direct mapping when the colour matches DCACHE_COLOR() of its
  * physical address, the physical address itself (ASI_PHYS_USE_EC) when
  * the colour is -1, or a temporary kernel mapping otherwise.  The
  * branches below enumerate the source/destination combinations.
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	vm_offset_t vdst;
 	vm_offset_t vsrc;
 	vm_paddr_t pdst;
 	vm_paddr_t psrc;
 	struct tte *tp;
 
 	KASSERT((mdst->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_copy_page: fake dst page"));
 	KASSERT((msrc->flags & PG_FICTITIOUS) == 0,
 	    ("pmap_copy_page: fake src page"));
 	PMAP_STATS_INC(pmap_ncopy_page);
 	pdst = VM_PAGE_TO_PHYS(mdst);
 	psrc = VM_PAGE_TO_PHYS(msrc);
 	if (dcache_color_ignore != 0 ||
 	    (msrc->md.color == DCACHE_COLOR(psrc) &&
 	    mdst->md.color == DCACHE_COLOR(pdst))) {
 		/* Both pages usable through the direct mapping. */
 		PMAP_STATS_INC(pmap_ncopy_page_c);
 		vdst = TLB_PHYS_TO_DIRECT(pdst);
 		vsrc = TLB_PHYS_TO_DIRECT(psrc);
 		cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE);
 	} else if (msrc->md.color == -1 && mdst->md.color == -1) {
 		/* Both colours are -1: physical-to-physical copy. */
 		PMAP_STATS_INC(pmap_ncopy_page_nc);
 		ascopy(ASI_PHYS_USE_EC, psrc, pdst, PAGE_SIZE);
 	} else if (msrc->md.color == -1) {
 		/* Source by physical address; destination by virtual. */
 		if (mdst->md.color == DCACHE_COLOR(pdst)) {
 			PMAP_STATS_INC(pmap_ncopy_page_dc);
 			vdst = TLB_PHYS_TO_DIRECT(pdst);
 			ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst,
 			    PAGE_SIZE);
 		} else {
 			/* Destination needs a temporary mapping. */
 			PMAP_STATS_INC(pmap_ncopy_page_doc);
 			PMAP_LOCK(kernel_pmap);
 			vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE);
 			tp = tsb_kvtotte(vdst);
 			tp->tte_data =
 			    TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W;
 			tp->tte_vpn = TV_VPN(vdst, TS_8K);
 			ascopyfrom(ASI_PHYS_USE_EC, psrc, (void *)vdst,
 			    PAGE_SIZE);
 			tlb_page_demap(kernel_pmap, vdst);
 			PMAP_UNLOCK(kernel_pmap);
 		}
 	} else if (mdst->md.color == -1) {
 		/* Destination by physical address; source by virtual. */
 		if (msrc->md.color == DCACHE_COLOR(psrc)) {
 			PMAP_STATS_INC(pmap_ncopy_page_sc);
 			vsrc = TLB_PHYS_TO_DIRECT(psrc);
 			ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst,
 			    PAGE_SIZE);
 		} else {
 			/* Source needs a temporary mapping. */
 			PMAP_STATS_INC(pmap_ncopy_page_soc);
 			PMAP_LOCK(kernel_pmap);
 			vsrc = pmap_temp_map_1 + (msrc->md.color * PAGE_SIZE);
 			tp = tsb_kvtotte(vsrc);
 			tp->tte_data =
 			    TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W;
 			tp->tte_vpn = TV_VPN(vsrc, TS_8K);
 			ascopyto((void *)vsrc, ASI_PHYS_USE_EC, pdst,
 			    PAGE_SIZE);
 			tlb_page_demap(kernel_pmap, vsrc);
 			PMAP_UNLOCK(kernel_pmap);
 		}
 	} else {
 		/*
 		 * Remaining case: map the destination in pmap_temp_map_1
 		 * and the source in pmap_temp_map_2, copy, then demap both.
 		 */
 		PMAP_STATS_INC(pmap_ncopy_page_oc);
 		PMAP_LOCK(kernel_pmap);
 		vdst = pmap_temp_map_1 + (mdst->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(vdst);
 		tp->tte_data =
 		    TD_V | TD_8K | TD_PA(pdst) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(vdst, TS_8K);
 		vsrc = pmap_temp_map_2 + (msrc->md.color * PAGE_SIZE);
 		tp = tsb_kvtotte(vsrc);
 		tp->tte_data =
 		    TD_V | TD_8K | TD_PA(psrc) | TD_CP | TD_CV | TD_W;
 		tp->tte_vpn = TV_VPN(vsrc, TS_8K);
 		cpu_block_copy((void *)vsrc, (void *)vdst, PAGE_SIZE);
 		tlb_page_demap(kernel_pmap, vdst);
 		tlb_page_demap(kernel_pmap, vsrc);
 		PMAP_UNLOCK(kernel_pmap);
 	}
 }
 
 /*
  * Return a kernel virtual address through which page "m" can be accessed.
  *
  * When the page's colour matches (or colours are ignored) the direct-map
  * address is returned and nothing else is done.  Otherwise a critical
  * section is entered and left open on return, and the page is mapped in
  * the per-CPU qmap window at the slot whose colour equals
  * DCACHE_COLOR(pa).  pmap_quick_remove_page() undoes the mapping and
  * leaves the critical section.
  */
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 	vm_offset_t qaddr;
 	struct tte *tp;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	if (dcache_color_ignore != 0 || m->md.color == DCACHE_COLOR(pa))
 		return (TLB_PHYS_TO_DIRECT(pa));
 
 	critical_enter();
 	qaddr = PCPU_GET(qmap_addr);
 	/*
 	 * Advance within the window by the (non-negative, modulo
 	 * DCACHE_COLORS) distance from the window base's colour to pa's.
 	 */
 	qaddr += (PAGE_SIZE * ((DCACHE_COLORS + DCACHE_COLOR(pa) -
 	    DCACHE_COLOR(qaddr)) % DCACHE_COLORS));
 	tp = tsb_kvtotte(qaddr);
 
 	KASSERT(tp->tte_data == 0, ("pmap_quick_enter_page: PTE busy"));
 	
 	tp->tte_data = TD_V | TD_8K | TD_PA(pa) | TD_CP | TD_CV | TD_W;
 	tp->tte_vpn = TV_VPN(qaddr, TS_8K);
 
 	return (qaddr);
 }
 
 /*
  * Release an address obtained from pmap_quick_enter_page().
  *
  * Addresses at or above VM_MIN_DIRECT_ADDRESS are direct-map addresses
  * for which no mapping was created and no critical section was entered,
  * so they need no work.  Otherwise the TLB entries for the address are
  * demapped, the TTE is cleared and the critical section opened by the
  * enter routine is exited.
  */
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 	vm_offset_t qaddr;
 	struct tte *tp;
 
 	if (addr >= VM_MIN_DIRECT_ADDRESS)
 		return;
 
 	tp = tsb_kvtotte(addr);
 	qaddr = PCPU_GET(qmap_addr);
 	
 	/* The address must lie inside this CPU's qmap window. */
 	KASSERT((addr >= qaddr) && (addr < (qaddr + (PAGE_SIZE * DCACHE_COLORS))),
 	    ("pmap_quick_remove_page: invalid address"));
 	KASSERT(tp->tte_data != 0, ("pmap_quick_remove_page: PTE not in use"));
 	
 	/* Demap from both the data and instruction MMUs, then clear the TTE. */
 	stxa(TLB_DEMAP_VA(addr) | TLB_DEMAP_NUCLEUS | TLB_DEMAP_PAGE, ASI_DMMU_DEMAP, 0);
 	stxa(TLB_DEMAP_VA(addr) | TLB_DEMAP_NUCLEUS | TLB_DEMAP_PAGE, ASI_IMMU_DEMAP, 0);
 	flush(KERNBASE);
 	TTE_ZERO(tp);
 	critical_exit();
 }
 
 /*
  * NOTE(review): never assigned in the visible code, so it keeps its
  * zero initial value here; presumably that is deliberate because
  * pmap_copy_pages() below is unimplemented -- confirm.
  */
 int unmapped_buf_allowed;
 
 /*
  * Not implemented on this platform: any call panics.
  */
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 
 	panic("pmap_copy_pages: not implemented");
 }
 
 /*
  * Report whether pmap "pm" owns one of the first 16 managed (TD_PV)
  * mappings on the page's TTE list.  The limit is arbitrary and may be
  * tuned; callers only need TRUE for a small subset of pmaps so that
  * page aging works.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pm, vm_page_t m)
 {
 	struct tte *tte;
 	boolean_t found;
 	int checked;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	checked = 0;
 	rw_wlock(&tte_list_global_lock);
 	TAILQ_FOREACH(tte, &m->md.tte_list, tte_link) {
 		/* Entries without TD_PV do not count towards the limit. */
 		if ((tte->tte_data & TD_PV) != 0) {
 			if (TTE_GET_PMAP(tte) == pm) {
 				found = TRUE;
 				break;
 			}
 			checked++;
 			if (checked >= 16)
 				break;
 		}
 	}
 	rw_wunlock(&tte_list_global_lock);
 	return (found);
 }
 
 /*
  * Count the mappings of page "m" that are both managed (TD_PV) and
  * wired (TD_WIRED).  Unmanaged pages always yield 0.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct tte *tte;
 	u_long bits;
 	int nwired;
 
 	nwired = 0;
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		rw_wlock(&tte_list_global_lock);
 		TAILQ_FOREACH(tte, &m->md.tte_list, tte_link) {
 			/* Sample the entry once; both bits must be set. */
 			bits = tte->tte_data & (TD_PV | TD_WIRED);
 			if (bits == (TD_PV | TD_WIRED))
 				nwired++;
 		}
 		rw_wunlock(&tte_list_global_lock);
 	}
 	return (nwired);
 }
 
 /*
  * Remove all pages from specified address space, this aids process exit
  * speeds.  This is much faster than pmap_remove in the case of running down
  * an entire address space.  Only works for the current pmap.
  *
  * The body here is empty: this implementation performs no work for
  * this hook.
  */
 void
 pmap_remove_pages(pmap_t pm)
 {
 
 }
 
 /*
  * Tell whether page "m" has at least one managed (TD_PV) mapping.
  * Unmanaged pages always report FALSE.
  */
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
 	struct tte *tte;
 	boolean_t mapped;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
 	mapped = FALSE;
 	rw_wlock(&tte_list_global_lock);
 	tte = TAILQ_FIRST(&m->md.tte_list);
 	while (tte != NULL) {
 		if ((tte->tte_data & TD_PV) != 0) {
 			mapped = TRUE;
 			break;
 		}
 		tte = TAILQ_NEXT(tte, tte_link);
 	}
 	rw_wunlock(&tte_list_global_lock);
 	return (mapped);
 }
 
 #define	PMAP_TS_REFERENCED_MAX	5
 
 /*
  * Return a count of reference bits for a page, clearing those bits.
  * It is not necessary for every reference bit to be cleared, but it
  * is necessary that 0 only be returned when there are truly no
  * reference bits set.
  *
  * XXX: The exact number of bits to check and clear is a matter that
  * should be tested and standardized at some point in the future for
  * optimal aging of shared pages.
  *
  * As an optimization, update the page's dirty field if a modified bit is
  * found while counting reference bits.  This opportunistic update can be
  * performed at low cost and can eliminate the need for some future calls
  * to pmap_is_modified().  However, since this function stops after
  * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  * dirty pages.  Those dirty pages will only be detected by a future call
  * to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct tte *tpf;
 	struct tte *tpn;
 	struct tte *tp;
 	u_long data;
 	int count;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	count = 0;
 	rw_wlock(&tte_list_global_lock);
 	if ((tp = TAILQ_FIRST(&m->md.tte_list)) != NULL) {
 		/* Remember the starting entry so one full pass can be detected. */
 		tpf = tp;
 		do {
 			/*
 			 * Fetch the successor first: the current entry is
 			 * about to be moved to the tail of the list, which
 			 * rotates the list as the scan proceeds.
 			 */
 			tpn = TAILQ_NEXT(tp, tte_link);
 			TAILQ_REMOVE(&m->md.tte_list, tp, tte_link);
 			TAILQ_INSERT_TAIL(&m->md.tte_list, tp, tte_link);
 			/* "continue" jumps to the loop condition below. */
 			if ((tp->tte_data & TD_PV) == 0)
 				continue;
 			/*
 			 * NOTE(review): the tests below only make sense if
 			 * atomic_clear_long() yields the value from before
 			 * TD_REF was cleared -- confirm for this platform.
 			 */
 			data = atomic_clear_long(&tp->tte_data, TD_REF);
 			if ((data & TD_W) != 0)
 				vm_page_dirty(m);
 			/* Stop early once enough reference bits were seen. */
 			if ((data & TD_REF) != 0 && ++count >=
 			    PMAP_TS_REFERENCED_MAX)
 				break;
 		} while ((tp = tpn) != NULL && tp != tpf);
 	}
 	rw_wunlock(&tte_list_global_lock);
 	return (count);
 }
 
 /*
  * Report whether any managed mapping of page "m" has TD_W set.
  * The page's object must be write-locked by the caller.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	struct tte *tte;
 	boolean_t modified;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 	modified = FALSE;
 
 	/*
 	 * With the object locked and the page not exclusive busied,
 	 * PGA_WRITEABLE cannot be set concurrently.  A clear
 	 * PGA_WRITEABLE therefore means no TTE can carry TD_W, and the
 	 * list walk is skipped.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0) {
 		rw_wlock(&tte_list_global_lock);
 		TAILQ_FOREACH(tte, &m->md.tte_list, tte_link) {
 			if ((tte->tte_data & TD_PV) != 0 &&
 			    (tte->tte_data & TD_W) != 0) {
 				modified = TRUE;
 				break;
 			}
 		}
 		rw_wunlock(&tte_list_global_lock);
 	}
 	return (modified);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault, i.e. whether no TTE is currently found for it in
  *	the given pmap.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	boolean_t absent;
 
 	PMAP_LOCK(pmap);
 	if (tsb_tte_lookup(pmap, addr) == NULL)
 		absent = TRUE;
 	else
 		absent = FALSE;
 	PMAP_UNLOCK(pmap);
 	return (absent);
 }
 
 /*
  * Report whether any managed (TD_PV) mapping of the given physical
  * page has its reference bit (TD_REF) set.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	struct tte *tte;
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	referenced = FALSE;
 	rw_wlock(&tte_list_global_lock);
 	TAILQ_FOREACH(tte, &m->md.tte_list, tte_link) {
 		if ((tte->tte_data & TD_PV) != 0 &&
 		    (tte->tte_data & TD_REF) != 0) {
 			referenced = TRUE;
 			break;
 		}
 	}
 	rw_wunlock(&tte_list_global_lock);
 	return (referenced);
 }
 
 /*
  * This function is advisory.
  *
  * The body is empty: the advice for [sva, eva) is accepted and ignored.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 }
 
 /*
  * Clear the TD_W bit in every managed mapping of page "m", demapping
  * the TLB entry of each mapping that had the bit set.  The caller must
  * hold the page's object write-locked and the page must not be
  * exclusive busied.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct tte *tp;
 	u_long data;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no TTEs can have TD_W set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&tte_list_global_lock);
 	TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) {
 		if ((tp->tte_data & TD_PV) == 0)
 			continue;
 		/*
 		 * NOTE(review): "data" is tested for TD_W right after that
 		 * bit is cleared, so atomic_clear_long() is presumably
 		 * returning the previous value -- confirm.
 		 */
 		data = atomic_clear_long(&tp->tte_data, TD_W);
 		if ((data & TD_W) != 0)
 			tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp));
 	}
 	rw_wunlock(&tte_list_global_lock);
 }
 
 /*
  * Revoke write access from every managed mapping of page "m" by
  * clearing TD_SW and TD_W.  A mapping that had TD_W set marks the page
  * dirty and has its TLB entry demapped.  PGA_WRITEABLE is cleared on
  * the page afterwards.  The caller must hold the object write-locked.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct tte *tp;
 	u_long data;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&tte_list_global_lock);
 	TAILQ_FOREACH(tp, &m->md.tte_list, tte_link) {
 		if ((tp->tte_data & TD_PV) == 0)
 			continue;
 		data = atomic_clear_long(&tp->tte_data, TD_SW | TD_W);
 		if ((data & TD_W) != 0) {
 			vm_page_dirty(m);
 			tlb_page_demap(TTE_GET_PMAP(tp), TTE_GET_VA(tp));
 		}
 	}
 	/* Cleared while the TTE list lock is still held. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&tte_list_global_lock);
 }
 
 /*
  * Placeholder: not implemented here.  Always returns 0 and never
  * touches "locked_pa".
  */
 int
 pmap_mincore(pmap_t pm, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 
 	/* TODO; */
 	return (0);
 }
 
 /*
  * Activate a user pmap.  The pmap must be activated before its address space
  * can be accessed in any way.
  *
  * The whole operation runs inside a critical section.  It assigns the
  * pmap a TLB context number for the current CPU, marks the CPU active
  * in the pmap, publishes the pmap as the CPU's current one and loads
  * the MMU TSB and primary context registers.
  */
 void
 pmap_activate(struct thread *td)
 {
 	struct vmspace *vm;
 	struct pmap *pm;
 	int context;
 
 	critical_enter();
 	vm = td->td_proc->p_vmspace;
 	pm = vmspace_pmap(vm);
 
 	/*
 	 * Take the next per-CPU context number.  When the maximum is
 	 * reached, flush user TLB entries and restart from the minimum.
 	 */
 	context = PCPU_GET(tlb_ctx);
 	if (context == PCPU_GET(tlb_ctx_max)) {
 		tlb_flush_user();
 		context = PCPU_GET(tlb_ctx_min);
 	}
 	PCPU_SET(tlb_ctx, context + 1);
 
 	pm->pm_context[curcpu] = context;
 #ifdef SMP
 	CPU_SET_ATOMIC(PCPU_GET(cpuid), &pm->pm_active);
 	atomic_store_acq_ptr((uintptr_t *)PCPU_PTR(pmap), (uintptr_t)pm);
 #else
 	CPU_SET(PCPU_GET(cpuid), &pm->pm_active);
 	PCPU_SET(pmap, pm);
 #endif
 
 	/*
 	 * Point both MMUs at the pmap's TSB, then install the new context
 	 * number while preserving the TLB_CXR_PGSZ_MASK bits of the
 	 * existing register value.
 	 */
 	stxa(AA_DMMU_TSB, ASI_DMMU, pm->pm_tsb);
 	stxa(AA_IMMU_TSB, ASI_IMMU, pm->pm_tsb);
 	stxa(AA_DMMU_PCXR, ASI_DMMU, (ldxa(AA_DMMU_PCXR, ASI_DMMU) &
 	    TLB_CXR_PGSZ_MASK) | context);
 	flush(KERNBASE);
 	critical_exit();
 }
 
 /*
  * Instruction-cache synchronisation hook.  The body is empty, so no
  * work is performed for the range [va, va + sz) in this implementation.
  */
 void
 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 
 }
 
 /*
  * Increase the starting virtual address of the given mapping if a
  * different alignment might result in more superpage mappings.
  *
  * The body here is empty: "*addr" is left untouched.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 
 }
Index: stable/11/sys/vm/vm_glue.c
===================================================================
--- stable/11/sys/vm/vm_glue.c	(revision 323661)
+++ stable/11/sys/vm/vm_glue.c	(revision 323662)
@@ -1,1103 +1,1097 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_glue.c	8.6 (Berkeley) 1/5/94
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 #include "opt_kstack_pages.h"
 #include "opt_kstack_max_pages.h"
 #include "opt_kstack_usage_prof.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 #include <sys/shm.h>
 #include <sys/vmmeter.h>
 #include <sys/vmem.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/_kstack_cache.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/unistd.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_object.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 #include <machine/cpu.h>
 
 #ifndef NO_SWAPPING
 static int swapout(struct proc *);
 static void swapclear(struct proc *);
 static void vm_thread_swapin(struct thread *td);
 static void vm_thread_swapout(struct thread *td);
 #endif
 
 /*
  * MPSAFE
  *
  * Check whether the kernel address range [addr, addr + len) may be
  * accessed with protection "rw" (a subset of VM_PROT_ALL) according to
  * kernel_map.  Returns non-zero if the whole page-rounded range passes,
  * zero otherwise (including when the range wraps or extends past the
  * end of kernel_map).
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  In most cases
  * just checking the vm_map_entry is sufficient within the kernel's address
  * space.
  */
 int
 kernacc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_offset_t saddr, eaddr;
 	vm_prot_t prot;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to kernacc (%x)\n", rw));
 
 	/* Reject ranges beyond the map or whose end wraps around. */
 	if ((vm_offset_t)addr + len > kernel_map->max_offset ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr)
 		return (FALSE);
 
 	prot = rw;
 	saddr = trunc_page((vm_offset_t)addr);
 	eaddr = round_page((vm_offset_t)addr + len);
 	vm_map_lock_read(kernel_map);
 	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
 	vm_map_unlock_read(kernel_map);
 	return (rv == TRUE);
 }
 
 /*
  * MPSAFE
  *
  * Check whether the range [addr, addr + len) in the current process's
  * address map may be accessed with protection "rw" (a subset of
  * VM_PROT_ALL).  Returns non-zero if the whole page-rounded range
  * passes, zero otherwise (including when the range wraps or extends
  * past the end of the map).
  *
  * WARNING!  This code calls vm_map_check_protection() which only checks
  * the associated vm_map_entry range.  It does not determine whether the
  * contents of the memory is actually readable or writable.  vmapbuf(),
  * vm_fault_quick(), or copyin()/copyout()/su*()/fu*() functions should be
  * used in conjunction with this call.
  */
 int
 useracc(void *addr, int len, int rw)
 {
 	boolean_t rv;
 	vm_prot_t prot;
 	vm_map_t map;
 
 	KASSERT((rw & ~VM_PROT_ALL) == 0,
 	    ("illegal ``rw'' argument to useracc (%x)\n", rw));
 	prot = rw;
 	map = &curproc->p_vmspace->vm_map;
 	/* Reject ranges beyond the map or whose end wraps around. */
 	if ((vm_offset_t)addr + len > vm_map_max(map) ||
 	    (vm_offset_t)addr + len < (vm_offset_t)addr) {
 		return (FALSE);
 	}
 	vm_map_lock_read(map);
 	rv = vm_map_check_protection(map, trunc_page((vm_offset_t)addr),
 	    round_page((vm_offset_t)addr + len), prot);
 	vm_map_unlock_read(map);
 	return (rv == TRUE);
 }
 
 /*
  * Wire the pages of the current process that cover [addr, addr + len).
  *
  * Returns 0 on success, EINVAL if the range wraps around, ENOMEM if it
  * spans more pages than vm_page_max_wired, and EFAULT if the map
  * wiring itself fails.
  */
 int
 vslock(void *addr, size_t len)
 {
 	vm_offset_t end, last, start;
 	vm_size_t npages;
 	int error;
 
 	last = (vm_offset_t)addr + len;
 	start = trunc_page((vm_offset_t)addr);
 	end = round_page(last);
 	/* Either the raw end or the page-rounded end wrapped around. */
 	if (last < (vm_offset_t)addr || end < (vm_offset_t)addr)
 		return (EINVAL);
 	npages = atop(end - start);
 	if (npages > vm_page_max_wired)
 		return (ENOMEM);
 #if 0
 	/*
 	 * XXX - not yet
 	 *
 	 * The limit for transient usage of wired pages should be
 	 * larger than for "permanent" wired pages (mlock()).
 	 *
 	 * Also, the sysctl code, which is the only present user
 	 * of vslock(), does a hard loop on EAGAIN.
 	 */
 	if (npages + vm_cnt.v_wire_count > vm_page_max_wired)
 		return (EAGAIN);
 #endif
 	error = vm_map_wire(&curproc->p_vmspace->vm_map, start, end,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 	/*
 	 * Return EFAULT on error to match copy{in,out}() behaviour
 	 * rather than returning ENOMEM like mlock() would.
 	 */
 	return (error == KERN_SUCCESS ? 0 : EFAULT);
 }
 
 /*
  * Unwire the pages of the current process covering [addr, addr + len).
  * Any error from the unwire request is deliberately discarded.
  */
 void
 vsunlock(void *addr, size_t len)
 {
 	vm_offset_t first, limit;
 
 	/* No validation here: vslock() is relied on to have checked it. */
 	first = trunc_page((vm_offset_t)addr);
 	limit = round_page((vm_offset_t)addr + len);
 	(void)vm_map_unwire(&curproc->p_vmspace->vm_map, first, limit,
 	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 }
 
 /*
  * Pin the page contained within the given object at the given offset.  If the
  * page is not resident, allocate and load it using the given object's pager.
  * Return the pinned page if successful; otherwise, return NULL.
  *
  * The object is write-locked for the duration of the call and unlocked
  * again before returning.
  */
 static vm_page_t
 vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
 {
 	vm_page_t m;
 	vm_pindex_t pindex;
 	int rv;
 
 	VM_OBJECT_WLOCK(object);
 	pindex = OFF_TO_IDX(offset);
 	m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		/* Not fully valid: busy the page and read it in via the pager. */
 		vm_page_xbusy(m);
 		rv = vm_pager_get_pages(object, &m, 1, NULL, NULL);
 		if (rv != VM_PAGER_OK) {
 			/* Read failed: free the page and report NULL. */
 			vm_page_lock(m);
 			vm_page_free(m);
 			vm_page_unlock(m);
 			m = NULL;
 			goto out;
 		}
 		vm_page_xunbusy(m);
 	}
 	/* Hold and activate the page under the page lock. */
 	vm_page_lock(m);
 	vm_page_hold(m);
 	vm_page_activate(m);
 	vm_page_unlock(m);
 out:
 	VM_OBJECT_WUNLOCK(object);
 	return (m);
 }
 
 /*
  * Pin the page at "offset" within "object" and return a CPU-private
  * sf_buf mapping of it.  The thread is pinned to its CPU via
  * sched_pin() before the mapping is allocated.  Returns NULL when the
  * page cannot be obtained.
  */
 struct sf_buf *
 vm_imgact_map_page(vm_object_t object, vm_ooffset_t offset)
 {
 	struct sf_buf *sf;
 	vm_page_t pg;
 
 	pg = vm_imgact_hold_page(object, offset);
 	if (pg != NULL) {
 		sched_pin();
 		sf = sf_buf_alloc(pg, SFB_CPUPRIVATE);
 	} else
 		sf = NULL;
 	return (sf);
 }
 
 /*
  * Destroy the given CPU private mapping and unpin the page that it mapped.
  *
  * Reverses vm_imgact_map_page(): frees the sf_buf, releases the
  * sched_pin() taken there, and drops the hold on the page.
  */
 void
 vm_imgact_unmap_page(struct sf_buf *sf)
 {
 	vm_page_t m;
 
 	/* Fetch the page before the sf_buf is freed. */
 	m = sf_buf_page(sf);
 	sf_buf_free(sf);
 	sched_unpin();
 	vm_page_lock(m);
 	vm_page_unhold(m);
 	vm_page_unlock(m);
 }
 
 /*
  * Forward an instruction-cache synchronisation request for the range
  * starting at "va" of size "sz" to the pmap backing "map".
  */
 void
 vm_sync_icache(vm_map_t map, vm_offset_t va, vm_offset_t sz)
 {
 
 	pmap_sync_icache(map->pmap, va, sz);
 }
 
 /*
  * Cache of free kernel stacks.
  *
  * kstack_cache is the head of a singly linked list (linked through
  * next_ks_entry) and is modified under kstack_cache_mtx.
  * kstacks counts the stacks currently allocated; it is adjusted
  * atomically when a stack is created or disposed of.
  * kstack_cache_size is the tunable bound compared against kstacks when
  * deciding whether a released stack is cached or freed.
  */
 struct kstack_cache_entry *kstack_cache;
 static int kstack_cache_size = 128;
 static int kstacks;
 static struct mtx kstack_cache_mtx;
 MTX_SYSINIT(kstack_cache, &kstack_cache_mtx, "kstkch", MTX_DEF);
 
 SYSCTL_INT(_vm, OID_AUTO, kstack_cache_size, CTLFLAG_RW, &kstack_cache_size, 0,
     "");
 SYSCTL_INT(_vm, OID_AUTO, kstacks, CTLFLAG_RD, &kstacks, 0,
     "");
 
 /* Upper bound on kernel stack pages, unless the build overrides it. */
 #ifndef KSTACK_MAX_PAGES
 #define KSTACK_MAX_PAGES 32
 #endif
 
 /*
  * Create the kernel stack (including pcb for i386) for a new thread.
  * This routine directly affects the fork perf for a process and
  * create performance for a thread.
  *
  * "pages" is the requested stack size in pages; values <= 1 select
  * kstack_pages and values above KSTACK_MAX_PAGES are clamped.
  * Returns 1 on success and 0 if no kernel virtual address space could
  * be allocated for the stack.
  */
 int
 vm_thread_new(struct thread *td, int pages)
 {
 	vm_object_t ksobj;
 	vm_offset_t ks;
-	vm_page_t m, ma[KSTACK_MAX_PAGES];
+	vm_page_t ma[KSTACK_MAX_PAGES];
 	struct kstack_cache_entry *ks_ce;
 	int i;
 
 	/* Bounds check */
 	if (pages <= 1)
 		pages = kstack_pages;
 	else if (pages > KSTACK_MAX_PAGES)
 		pages = KSTACK_MAX_PAGES;
 
 	/*
 	 * Default-sized stacks may be satisfied from the cache.  A cache
 	 * entry is stored at the stack's own base address, so the entry
 	 * pointer doubles as the stack address.
 	 */
 	if (pages == kstack_pages) {
 		mtx_lock(&kstack_cache_mtx);
 		if (kstack_cache != NULL) {
 			ks_ce = kstack_cache;
 			kstack_cache = ks_ce->next_ks_entry;
 			mtx_unlock(&kstack_cache_mtx);
 
 			td->td_kstack_obj = ks_ce->ksobj;
 			td->td_kstack = (vm_offset_t)ks_ce;
 			td->td_kstack_pages = kstack_pages;
 			return (1);
 		}
 		mtx_unlock(&kstack_cache_mtx);
 	}
 
 	/*
 	 * Allocate an object for the kstack.
 	 */
 	ksobj = vm_object_allocate(OBJT_DEFAULT, pages);
 	
 	/*
 	 * Get a kernel virtual address for this thread's kstack.
 	 */
 #if defined(__mips__)
 	/*
 	 * We need to align the kstack's mapped address to fit within
 	 * a single TLB entry.
 	 */
 	if (vmem_xalloc(kernel_arena, (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE,
 	    PAGE_SIZE * 2, 0, 0, VMEM_ADDR_MIN, VMEM_ADDR_MAX,
 	    M_BESTFIT | M_NOWAIT, &ks)) {
 		ks = 0;
 	}
 #else
 	ks = kva_alloc((pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 #endif
 	if (ks == 0) {
 		printf("vm_thread_new: kstack allocation failed\n");
 		vm_object_deallocate(ksobj);
 		return (0);
 	}
 
 	atomic_add_int(&kstacks, 1);
 	/*
 	 * The guard pages sit at the low end of the allocation: unmap
 	 * them and start the usable stack just above.
 	 */
 	if (KSTACK_GUARD_PAGES != 0) {
 		pmap_qremove(ks, KSTACK_GUARD_PAGES);
 		ks += KSTACK_GUARD_PAGES * PAGE_SIZE;
 	}
 	td->td_kstack_obj = ksobj;
 	td->td_kstack = ks;
 	/*
 	 * Knowing the number of pages allocated is useful when you
 	 * want to deallocate them.
 	 */
 	td->td_kstack_pages = pages;
 	/* 
 	 * For the length of the stack, link in a real page of ram for each
 	 * page of stack.
 	 */
 	VM_OBJECT_WLOCK(ksobj);
-	for (i = 0; i < pages; i++) {
-		/*
-		 * Get a kernel stack page.
-		 */
-		m = vm_page_grab(ksobj, i, VM_ALLOC_NOBUSY |
-		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
-		ma[i] = m;
-		m->valid = VM_PAGE_BITS_ALL;
-	}
+	vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY |
+	    VM_ALLOC_WIRED, ma, pages);
+	for (i = 0; i < pages; i++)
+		ma[i]->valid = VM_PAGE_BITS_ALL;
 	VM_OBJECT_WUNLOCK(ksobj);
 	/* Map the wired pages at the stack's kernel virtual address. */
 	pmap_qenter(ks, ma, pages);
 	return (1);
 }
 
 /*
  * Tear down a kernel stack: unmap it, unwire and free each backing
  * page, drop the reference on the backing object and release the
  * kernel virtual address range, guard pages included.
  */
 static void
 vm_thread_stack_dispose(vm_object_t ksobj, vm_offset_t ks, int pages)
 {
 	vm_page_t pg;
 	int idx;
 
 	atomic_add_int(&kstacks, -1);
 	pmap_qremove(ks, pages);
 	VM_OBJECT_WLOCK(ksobj);
 	idx = 0;
 	while (idx < pages) {
 		pg = vm_page_lookup(ksobj, idx);
 		if (pg == NULL)
 			panic("vm_thread_dispose: kstack already missing?");
 		vm_page_lock(pg);
 		vm_page_unwire(pg, PQ_NONE);
 		vm_page_free(pg);
 		vm_page_unlock(pg);
 		idx++;
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	vm_object_deallocate(ksobj);
 	/* The allocation began KSTACK_GUARD_PAGES pages below "ks". */
 	kva_free(ks - (KSTACK_GUARD_PAGES * PAGE_SIZE),
 	    (pages + KSTACK_GUARD_PAGES) * PAGE_SIZE);
 }
 
 /*
  * Release a thread's kernel stack.  A default-sized stack is pushed
  * onto the kstack cache while the number of live stacks does not
  * exceed kstack_cache_size; any other stack is destroyed immediately.
  */
 void
 vm_thread_dispose(struct thread *td)
 {
 	struct kstack_cache_entry *entry;
 	vm_object_t ksobj;
 	vm_offset_t ks;
 	int pages;
 
 	/* Detach the stack from the thread before deciding its fate. */
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	ks = td->td_kstack;
 	td->td_kstack = 0;
 	td->td_kstack_pages = 0;
 
 	if (pages != kstack_pages || kstacks > kstack_cache_size) {
 		vm_thread_stack_dispose(ksobj, ks, pages);
 		return;
 	}
 
 	/* The cache entry is stored in the stack memory itself. */
 	entry = (struct kstack_cache_entry *)ks;
 	entry->ksobj = ksobj;
 	mtx_lock(&kstack_cache_mtx);
 	entry->next_ks_entry = kstack_cache;
 	kstack_cache = entry;
 	mtx_unlock(&kstack_cache_mtx);
 }
 
 static void
 vm_thread_stack_lowmem(void *nulll)
 {
 	struct kstack_cache_entry *ks_ce, *ks_ce1;
 
 	mtx_lock(&kstack_cache_mtx);
 	ks_ce = kstack_cache;
 	kstack_cache = NULL;
 	mtx_unlock(&kstack_cache_mtx);
 
 	while (ks_ce != NULL) {
 		ks_ce1 = ks_ce;
 		ks_ce = ks_ce->next_ks_entry;
 
 		vm_thread_stack_dispose(ks_ce1->ksobj, (vm_offset_t)ks_ce1,
 		    kstack_pages);
 	}
 }
 
 /*
  * Register vm_thread_stack_lowmem() as a handler for the vm_lowmem
  * event.  The argument is unused.  Run once via the SYSINIT below.
  */
 static void
 kstack_cache_init(void *nulll)
 {
 
 	EVENTHANDLER_REGISTER(vm_lowmem, vm_thread_stack_lowmem, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
 
 SYSINIT(vm_kstacks, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kstack_cache_init, NULL);
 
 #ifdef KSTACK_USAGE_PROF
 /*
  * Track maximum stack used by a thread in kernel.
  *
  * Exported read-only as debug.max_kstack_used; updated with a
  * compare-and-set loop by intr_prof_stack_use().
  */
 static int max_kstack_used;
 
 SYSCTL_INT(_debug, OID_AUTO, max_kstack_used, CTLFLAG_RD,
     &max_kstack_used, 0,
     "Maximum stack depth used by a thread in kernel");
 
 /*
  * Sample the interrupted thread's kernel stack depth and raise
  * max_kstack_used if the current depth exceeds it.
  */
 void
 intr_prof_stack_use(struct thread *td, struct trapframe *frame)
 {
 	vm_offset_t stack_top;
 	vm_offset_t current;
 	int used, prev_used;
 
 	/*
 	 * The user-mode test is only an optimisation: an interrupt taken
 	 * from user mode has nothing but the trap frame on the stack.
 	 */
 	if (TRAPF_USERMODE(frame))
 		return;
 
 	/* The address of a local approximates the current stack pointer. */
 	stack_top = td->td_kstack + td->td_kstack_pages * PAGE_SIZE;
 	current = (vm_offset_t)(uintptr_t)&stack_top;
 
 	/*
 	 * Ignore samples taken off the thread's own kernel stack; the
 	 * hardware may run interrupts on a dedicated stack.
 	 */
 	if (stack_top <= current || current < td->td_kstack)
 		return;
 
 	/* Lock-free maximum: retry until stored or no longer larger. */
 	used = stack_top - current;
 	do {
 		prev_used = max_kstack_used;
 	} while (prev_used < used &&
 	    !atomic_cmpset_int(&max_kstack_used, prev_used, used));
 }
 #endif /* KSTACK_USAGE_PROF */
 
 #ifndef NO_SWAPPING
 /*
  * Allow a thread's kernel stack to be paged out.
  *
  * The stack is unmapped from kernel virtual memory, then each backing
  * page is marked dirty and unwired onto the inactive queue.
  */
 static void
 vm_thread_swapout(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t m;
 	int i, pages;
 
 	cpu_thread_swapout(td);
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	pmap_qremove(td->td_kstack, pages);
 	VM_OBJECT_WLOCK(ksobj);
 	for (i = 0; i < pages; i++) {
 		m = vm_page_lookup(ksobj, i);
 		if (m == NULL)
 			panic("vm_thread_swapout: kstack already missing?");
 		/* Dirty the page before it is unwired. */
 		vm_page_dirty(m);
 		vm_page_lock(m);
 		vm_page_unwire(m, PQ_INACTIVE);
 		vm_page_unlock(m);
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 }
 
 /*
  * Bring the kernel stack for a specified thread back in.
  *
  * All stack pages are grabbed wired and exclusive busied.  Pages that
  * are not fully valid are read back through the pager in runs, and the
  * stack is then remapped at its kernel virtual address.
  */
 static void
 vm_thread_swapin(struct thread *td)
 {
 	vm_object_t ksobj;
 	vm_page_t ma[KSTACK_MAX_PAGES];
 	int pages;
 
 	pages = td->td_kstack_pages;
 	ksobj = td->td_kstack_obj;
 	VM_OBJECT_WLOCK(ksobj);
-	for (int i = 0; i < pages; i++)
-		ma[i] = vm_page_grab(ksobj, i, VM_ALLOC_NORMAL |
-		    VM_ALLOC_WIRED);
+	vm_page_grab_pages(ksobj, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED, ma,
+	    pages);
 	for (int i = 0; i < pages;) {
 		int j, a, count, rv;
 
 		vm_page_assert_xbusied(ma[i]);
 		/* Already valid: just release the busy state. */
 		if (ma[i]->valid == VM_PAGE_BITS_ALL) {
 			vm_page_xunbusy(ma[i]);
 			i++;
 			continue;
 		}
 		vm_object_pip_add(ksobj, 1);
 		/* Find the end of the run of pages that still need reading. */
 		for (j = i + 1; j < pages; j++)
 			if (ma[j]->valid == VM_PAGE_BITS_ALL)
 				break;
 		rv = vm_pager_has_page(ksobj, ma[i]->pindex, NULL, &a);
 		KASSERT(rv == 1, ("%s: missing page %p", __func__, ma[i]));
 		/*
 		 * Read no more than the run length and no more than the
 		 * pager's reported count "a" plus the page itself.
 		 */
 		count = min(a + 1, j - i);
 		rv = vm_pager_get_pages(ksobj, ma + i, count, NULL, NULL);
 		KASSERT(rv == VM_PAGER_OK, ("%s: cannot get kstack for proc %d",
 		    __func__, td->td_proc->p_pid));
 		vm_object_pip_wakeup(ksobj);
 		for (j = i; j < i + count; j++)
 			vm_page_xunbusy(ma[j]);
 		i += count;
 	}
 	VM_OBJECT_WUNLOCK(ksobj);
 	pmap_qenter(td->td_kstack, ma, pages);
 	cpu_thread_swapin(td);
 }
 #endif /* !NO_SWAPPING */
 
 /*
  * Implement fork's actions on an address space.
  * Here we arrange for the address space to be copied or referenced,
  * allocate a user struct (pcb and kernel stack), then call the
  * machine-dependent layer to fill those in and make the new process
  * ready to run.  The new process is set up so that it returns directly
  * to user mode to avoid stack copying and relocation problems.
  *
  * Returns 0 on success, or the error from vmspace_unshare() when the
  * caller's address space could not be unshared.
  */
 int
 vm_forkproc(struct thread *td, struct proc *p2, struct thread *td2,
     struct vmspace *vm2, int flags)
 {
 	struct proc *p1 = td->td_proc;
 	int error;
 
 	if ((flags & RFPROC) == 0) {
 		/*
 		 * Divorce the memory, if it is shared, essentially
 		 * this changes shared memory amongst threads, into
 		 * COW locally.
 		 */
 		if ((flags & RFMEM) == 0) {
 			if (p1->p_vmspace->vm_refcnt > 1) {
 				error = vmspace_unshare(p1);
 				if (error)
 					return (error);
 			}
 		}
 		cpu_fork(td, p2, td2, flags);
 		return (0);
 	}
 
 	/* RFMEM: the child shares the parent's vmspace by reference. */
 	if (flags & RFMEM) {
 		p2->p_vmspace = p1->p_vmspace;
 		atomic_add_int(&p1->p_vmspace->vm_refcnt, 1);
 	}
 
 	/* Wait until the free-page shortage is no longer severe. */
 	while (vm_page_count_severe()) {
 		VM_WAIT;
 	}
 
 	/* No RFMEM: the child uses the vmspace supplied by the caller. */
 	if ((flags & RFMEM) == 0) {
 		p2->p_vmspace = vm2;
 		if (p1->p_vmspace->vm_shm)
 			shmfork(p1, p2);
 	}
 
 	/*
 	 * cpu_fork will copy and update the pcb, set up the kernel stack,
 	 * and make the child ready to run.
 	 */
 	cpu_fork(td, p2, td2, flags);
 	return (0);
 }
 
 /*
  * Called after process has been wait(2)'ed upon and is being reaped.
  * The idea is to reclaim resources that we could not reclaim while
  * the process was still executing.
  *
  * The only work done here is handing p to vmspace_exitfree().
  */
 void
 vm_waitproc(struct proc *p)
 {
 
 	vmspace_exitfree(p);		/* and clean-out the vmspace */
 }
 
 /*
  * Make sure process p is in memory.
  *
  * The process lock must be held on entry (asserted) and is held again on
  * return.  With NO_SWAPPING the function only checks that P_INMEM is set
  * and panics otherwise.  Without it:
  *  - if P_SWAPPINGIN is already set, sleep on &p->p_flag until it clears
  *    and return;
  *  - else if P_INMEM is clear, mark the process P_SWAPPINGIN, drop the
  *    process lock, call vm_thread_swapin() on every thread, relock, run
  *    swapclear(), record the swap-in tick and wake sleepers on
  *    &p->p_flag.
  */
 void
 faultin(struct proc *p)
 {
 #ifdef NO_SWAPPING
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_INMEM) == 0)
 		panic("faultin: proc swapped out with NO_SWAPPING!");
 #else /* !NO_SWAPPING */
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/*
 	 * If another process is swapping in this process,
 	 * just wait until it finishes.
 	 */
 	if (p->p_flag & P_SWAPPINGIN) {
 		while (p->p_flag & P_SWAPPINGIN)
 			msleep(&p->p_flag, &p->p_mtx, PVM, "faultin", 0);
 		return;
 	}
 	if ((p->p_flag & P_INMEM) == 0) {
 		/*
 		 * Don't let another thread swap process p out while we are
 		 * busy swapping it in.
 		 */
 		++p->p_lock;
 		p->p_flag |= P_SWAPPINGIN;
 		PROC_UNLOCK(p);
 
 		/*
 		 * We hold no lock here because the list of threads
 		 * can not change while all threads in the process are
 		 * swapped out.
 		 */
 		FOREACH_THREAD_IN_PROC(p, td)
 			vm_thread_swapin(td);
 		PROC_LOCK(p);
 		swapclear(p);
 		p->p_swtick = ticks;
 
 		/* Release anyone waiting in the P_SWAPPINGIN loop above. */
 		wakeup(&p->p_flag);
 
 		/* Allow other threads to swap p out now. */
 		--p->p_lock;
 	}
 #endif /* NO_SWAPPING */
 }
 
 /*
  * Swap-in loop; this function never returns.
  *
  * Each pass waits until vm_page_count_min() is false, then scans all
  * processes for the swapped-out one with the highest computed priority
  * and calls faultin() on it.  A thread's priority is the number of
  * seconds its process has been swapped out plus the number of seconds
  * the thread has slept, reduced by eight times the process nice value
  * unless the thread has TDF_SWAPINREQ set.  When no candidate exists the
  * loop sleeps on &proc0 for up to MAXSLP * hz / 2 ticks.
  */
 void
 swapper(void)
 {
 	struct proc *cand, *p;
 	struct thread *td;
 	int cand_pri, pri, slptime, swtime;
 
 	for (;;) {
 		if (vm_page_count_min()) {
 			VM_WAIT;
 			continue;
 		}
 
 		cand = NULL;
 		cand_pri = INT_MIN;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			/* Only fully swapped-out, settled processes qualify. */
 			if (p->p_state == PRS_NEW || (p->p_flag &
 			    (P_SWAPPINGOUT | P_SWAPPINGIN | P_INMEM)) != 0) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			swtime = (ticks - p->p_swtick) / hz;
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				/*
 				 * An otherwise runnable thread of a
 				 * swapped-out process has only the
 				 * TDI_SWAPPED bit set.
 				 */
 				if (td->td_inhibitors != TDI_SWAPPED) {
 					thread_unlock(td);
 					continue;
 				}
 				slptime = (ticks - td->td_slptick) / hz;
 				pri = swtime + slptime;
 				if ((td->td_flags & TDF_SWAPINREQ) == 0)
 					pri -= p->p_nice * 8;
 				/* Remember the best candidate seen so far. */
 				if (pri > cand_pri) {
 					cand = p;
 					cand_pri = pri;
 				}
 				thread_unlock(td);
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 
 		/* Nothing to do, back to sleep. */
 		if (cand == NULL) {
 			tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2);
 			continue;
 		}
 		p = cand;
 		PROC_LOCK(p);
 
 		/*
 		 * The process lock was dropped after the scan, so the
 		 * candidate may already be in memory, or may be in the
 		 * middle of a swap-in or swap-out.  Only fault it in when
 		 * none of those flags are set.
 		 */
 		if ((p->p_flag &
 		    (P_INMEM | P_SWAPPINGOUT | P_SWAPPINGIN)) == 0)
 			faultin(p);
 		PROC_UNLOCK(p);
 	}
 }
 
 /*
  * Wake up any thread sleeping on &proc0.  swapper() sleeps on that
  * channel when it finds no process to swap in.
  */
 void
 kick_proc0(void)
 {
 
 	wakeup(&proc0);
 }
 
 #ifndef NO_SWAPPING
 
 /*
  * Swap_idle_threshold1 is the guaranteed swapped in time for a process.
  * It is compared against a per-thread sleep time computed as ticks / hz,
  * i.e. it is expressed in seconds.  Exported read-write as
  * vm.swap_idle_threshold1.
  */
 static int swap_idle_threshold1 = 2;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold1, CTLFLAG_RW,
     &swap_idle_threshold1, 0, "Guaranteed swapped in time for a process");
 
 /*
  * Swap_idle_threshold2 is the time that a process can be idle before
  * it will be swapped out, if idle swapping is enabled.  Same unit as
  * swap_idle_threshold1 (seconds).  Exported read-write as
  * vm.swap_idle_threshold2.
  */
 static int swap_idle_threshold2 = 10;
 SYSCTL_INT(_vm, OID_AUTO, swap_idle_threshold2, CTLFLAG_RW,
     &swap_idle_threshold2, 0, "Time before a process will be swapped out");
 
 /*
  * First, if any processes have been sleeping or stopped for at least
  * "swap_idle_threshold1" seconds, they are swapped out.  If, however,
  * no such processes exist, then the longest-sleeping or stopped
  * process is swapped out.  Finally, and only as a last resort, if
  * there are no sleeping or stopped processes, the longest-resident
  * process is swapped out.
  *
  * action is a bit mask.  With VM_SWAP_NORMAL set, a process whose
  * threads all pass the checks below is swapped out.  With only
  * VM_SWAP_IDLE set, every thread must additionally have slept for at
  * least swap_idle_threshold2 seconds, and the shortest such sleep time
  * must exceed that threshold.
  *
  * After each swapout attempt the scan restarts from the beginning of the
  * process list.  If at least one swapout succeeded, sleepers on &proc0
  * are woken before returning.
  */
 void
 swapout_procs(int action)
 {
 	struct proc *p;
 	struct thread *td;
 	int didswap = 0;
 
 retry:
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		struct vmspace *vm;
 		int minslptime = 100000;
 		int slptime;
 
 		PROC_LOCK(p);
 		/*
 		 * Watch out for a process in
 		 * creation.  It may have no
 		 * address space or lock yet.
 		 */
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * An aio daemon switches its
 		 * address space while running.
 		 * Perform a quick check whether
 		 * a process has P_SYSTEM.
 		 * Filter out exiting processes.
 		 */
 		if ((p->p_flag & (P_SYSTEM | P_WEXIT)) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * Hold the process so it stays valid while allproc_lock
 		 * and the process lock are dropped below.
 		 */
 		_PHOLD_LITE(p);
 		PROC_UNLOCK(p);
 		sx_sunlock(&allproc_lock);
 
 		/*
 		 * Do not swapout a process that
 		 * is waiting for VM data
 		 * structures as there is a possible
 		 * deadlock.  Test this first as
 		 * this may block.
 		 *
 		 * Lock the map until swapout
 		 * finishes, or a thread of this
 		 * process may attempt to alter
 		 * the map.
 		 */
 		vm = vmspace_acquire_ref(p);
 		if (vm == NULL)
 			goto nextproc2;
 		if (!vm_map_trylock(&vm->vm_map))
 			goto nextproc1;
 
 		PROC_LOCK(p);
 		/* p_lock == 1 means our hold above is the only one. */
 		if (p->p_lock != 1 || (p->p_flag & (P_STOPPED_SINGLE |
 		    P_TRACED | P_SYSTEM)) != 0)
 			goto nextproc;
 
 		/*
 		 * only aiod changes vmspace, however it will be
 		 * skipped because of the if statement above checking
 		 * for P_SYSTEM
 		 */
 		if ((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) != P_INMEM)
 			goto nextproc;
 
 		switch (p->p_state) {
 		default:
 			/* Don't swap out processes in any sort
 			 * of 'special' state. */
 			break;
 
 		case PRS_NORMAL:
 			/*
 			 * do not swapout a realtime process
 			 * Check all the thread groups..
 			 */
 			FOREACH_THREAD_IN_PROC(p, td) {
 				thread_lock(td);
 				if (PRI_IS_REALTIME(td->td_pri_class)) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 				slptime = (ticks - td->td_slptick) / hz;
 				/*
 				 * Guarantee swap_idle_threshold1
 				 * time in memory.
 				 */
 				if (slptime < swap_idle_threshold1) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 
 				/*
 				 * Do not swapout a process if it is
 				 * waiting on a critical event of some
 				 * kind or there is a thread whose
 				 * pageable memory may be accessed.
 				 *
 				 * This could be refined to support
 				 * swapping out a thread.
 				 */
 				if (!thread_safetoswapout(td)) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 				/*
 				 * If the system is under memory stress,
 				 * or if we are swapping
 				 * idle processes >= swap_idle_threshold2,
 				 * then swap the process out.
 				 */
 				if (((action & VM_SWAP_NORMAL) == 0) &&
 				    (((action & VM_SWAP_IDLE) == 0) ||
 				    (slptime < swap_idle_threshold2))) {
 					thread_unlock(td);
 					goto nextproc;
 				}
 
 				if (minslptime > slptime)
 					minslptime = slptime;
 				thread_unlock(td);
 			}
 
 			/*
 			 * If the pageout daemon didn't free enough pages,
 			 * or if this process is idle and the system is
 			 * configured to swap proactively, swap it out.
 			 */
 			if ((action & VM_SWAP_NORMAL) ||
 				((action & VM_SWAP_IDLE) &&
 				 (minslptime > swap_idle_threshold2))) {
 				/*
 				 * Drop our hold before swapping out, then
 				 * release everything and rescan from the
 				 * start of the process list.
 				 */
 				_PRELE(p);
 				if (swapout(p) == 0)
 					didswap++;
 				PROC_UNLOCK(p);
 				vm_map_unlock(&vm->vm_map);
 				vmspace_free(vm);
 				goto retry;
 			}
 		}
 		/*
 		 * Staged cleanup: each label undoes one more of the
 		 * acquisitions made above, in reverse order.
 		 */
 nextproc:
 		PROC_UNLOCK(p);
 		vm_map_unlock(&vm->vm_map);
 nextproc1:
 		vmspace_free(vm);
 nextproc2:
 		sx_slock(&allproc_lock);
 		PRELE(p);
 	}
 	sx_sunlock(&allproc_lock);
 	/*
 	 * If we swapped something out, and another process needed memory,
 	 * then wakeup the sched process.
 	 */
 	if (didswap)
 		wakeup(&proc0);
 }
 
 /*
  * Mark process p and all of its threads as resident again.
  *
  * For every thread, under the thread lock: set TDF_INMEM, clear
  * TDF_SWAPINREQ, clear the swapped inhibitor and, if the thread can now
  * run, hand it to setrunnable().  Afterwards clear P_SWAPPINGIN and
  * P_SWAPPINGOUT on the process and set P_INMEM.
  *
  * The process lock must be held by the caller (asserted).
  */
 static void
 swapclear(struct proc *p)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		td->td_flags |= TDF_INMEM;
 		td->td_flags &= ~TDF_SWAPINREQ;
 		TD_CLR_SWAPPED(td);
 		if (TD_CAN_RUN(td))
 			if (setrunnable(td)) {
 #ifdef INVARIANTS
 				/*
 				 * XXX: We just cleared TDI_SWAPPED
 				 * above and set TDF_INMEM, so this
 				 * should never happen.
 				 */
 				panic("not waking up swapper");
 #endif
 			}
 		thread_unlock(td);
 	}
 	p->p_flag &= ~(P_SWAPPINGIN|P_SWAPPINGOUT);
 	p->p_flag |= P_INMEM;
 }
 
 /*
  * Swap out process p.
  *
  * The process lock must be held on entry (asserted) and is held again on
  * return; it is dropped while vm_thread_swapout() runs on each thread.
  *
  * Returns 0 when every thread was marked swapped and passed to
  * vm_thread_swapout().  Returns EBUSY if some thread fails
  * thread_safetoswapout(); in that case swapclear() has already restored
  * the in-memory state of the process and all of its threads.
  */
 static int
 swapout(struct proc *p)
 {
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 #if defined(SWAP_DEBUG)
 	printf("swapping out %d\n", p->p_pid);
 #endif
 
 	/*
 	 * The states of this process and its threads may have changed
 	 * by now.  Assuming that there is only one pageout daemon thread,
 	 * this process should still be in memory.
 	 */
 	KASSERT((p->p_flag & (P_INMEM|P_SWAPPINGOUT|P_SWAPPINGIN)) == P_INMEM,
 		("swapout: lost a swapout race?"));
 
 	/*
 	 * remember the process resident count
 	 */
 	p->p_vmspace->vm_swrss = vmspace_resident_count(p->p_vmspace);
 	/*
 	 * Check and mark all threads before we proceed.
 	 */
 	p->p_flag &= ~P_INMEM;
 	p->p_flag |= P_SWAPPINGOUT;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		thread_lock(td);
 		if (!thread_safetoswapout(td)) {
 			/* Undo the marking done so far and give up. */
 			thread_unlock(td);
 			swapclear(p);
 			return (EBUSY);
 		}
 		td->td_flags &= ~TDF_INMEM;
 		TD_SET_SWAPPED(td);
 		thread_unlock(td);
 	}
 	/* The swap count is charged to the first thread's rusage. */
 	td = FIRST_THREAD_IN_PROC(p);
 	++td->td_ru.ru_nswap;
 	PROC_UNLOCK(p);
 
 	/*
 	 * This list is stable because all threads are now prevented from
 	 * running.  The list is only modified in the context of a running
 	 * thread in this process.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td)
 		vm_thread_swapout(td);
 
 	PROC_LOCK(p);
 	p->p_flag &= ~P_SWAPPINGOUT;
 	p->p_swtick = ticks;
 	return (0);
 }
 #endif /* !NO_SWAPPING */
Index: stable/11/sys/vm/vm_page.c
===================================================================
--- stable/11/sys/vm/vm_page.c	(revision 323661)
+++ stable/11/sys/vm/vm_page.c	(revision 323662)
@@ -1,3616 +1,3710 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.c	7.4 (Berkeley) 5/7/91
  */
 
 /*-
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *			GENERAL RULES ON VM_PAGE MANIPULATION
  *
  *	- A page queue lock is required when adding or removing a page from a
  *	  page queue regardless of other locks or the busy state of a page.
  *
  *		* In general, no thread besides the page daemon can acquire or
  *		  hold more than one page queue lock at a time.
  *
  *		* The page daemon can acquire and hold any pair of page queue
  *		  locks in any order.
  *
  *	- The object lock is required when inserting or removing
  *	  pages from an object (vm_page_insert() or vm_page_remove()).
  *
  */
 
 /*
  *	Resident memory management module.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 #include <vm/uma_int.h>
 
 #include <machine/md_var.h>
 
 /*
  *	Associated with each page of user-allocatable memory is a
  *	page structure.
  */
 
 /* Per-domain state, the free page queue mutex and the pa lock array. */
 struct vm_domain vm_dom[MAXMEMDOM];
 struct mtx_padalign vm_page_queue_free_mtx;
 
 struct mtx_padalign pa_lock[PA_LOCK_COUNT];
 
 /* The array of page structures and its bookkeeping. */
 vm_page_t vm_page_array;
 long vm_page_array_size;
 long first_page;
 int vm_page_zero_count;
 
 /* vm.boot_pages: read-only tunable, fetched by hand in vm_page_startup(). */
 static int boot_pages = UMA_BOOT_PAGES;
 SYSCTL_INT(_vm, OID_AUTO, boot_pages, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &boot_pages, 0,
     "number of pages allocated for bootstrapping the VM system");
 
 /* vm.tryrelock_restart: incremented by vm_page_pa_tryrelock(). */
 static int pa_tryrelock_restart;
 SYSCTL_INT(_vm, OID_AUTO, tryrelock_restart, CTLFLAG_RD,
     &pa_tryrelock_restart, 0, "Number of tryrelock restarts");
 
 /* Pages removed from the free lists by the blacklist code. */
 static TAILQ_HEAD(, vm_page) blacklist_head;
 static int sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, page_blacklist, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_vm_page_blacklist, "A", "Blacklist pages");
 
 /* Is the page daemon waiting for free pages? */
 static int vm_pageout_pages_needed;
 
 /* UMA zone created by vm_page_init_fakepg(). */
 static uma_zone_t fakepg_zone;
 
 /* Forward declarations of file-local helpers. */
 static void vm_page_alloc_check(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_enqueue(uint8_t queue, vm_page_t m);
 static void vm_page_free_wakeup(void);
 static void vm_page_init_fakepg(void *dummy);
 static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
 static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
     vm_page_t mpred);
 static int vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
     vm_paddr_t high);
 
 /* Run vm_page_init_fakepg() at SI_SUB_VM, SI_ORDER_SECOND. */
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 
 /*
  * SYSINIT hook: create the "fakepg" UMA zone, whose items are the size
  * of struct vm_page.  The dummy argument is not used.
  */
 static void
 vm_page_init_fakepg(void *dummy)
 {
 
 	fakepg_zone = uma_zcreate("fakepg", sizeof(struct vm_page), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
 }
 
 /*
  * Make sure that u_long is at least 64 bits when PAGE_SIZE is 32K.
  * The check is only compiled when the CTASSERT macro is available.
  */
 #if PAGE_SIZE == 32768
 #ifdef CTASSERT
 CTASSERT(sizeof(u_long) >= 8);
 #endif
 #endif
 
 /*
  * Switch the physical address lock held by the caller to the one that
  * covers pa, while a pmap is locked.
  *
  * *locked names the address whose lock is currently held (0 for none)
  * and is always updated to pa.  Returns 0 when the lock for pa was
  * obtained without releasing the pmap lock.  Otherwise the pmap is
  * unlocked, the pa lock is acquired by blocking, the pmap is locked
  * again and EAGAIN is returned; the caller should then restart its loop
  * in case the virtual to physical mapping has changed.
  */
 int
 vm_page_pa_tryrelock(pmap_t pmap, vm_paddr_t pa, vm_paddr_t *locked)
 {
 	vm_paddr_t held;
 
 	held = *locked;
 	*locked = pa;
 	if (held != 0) {
 		PA_LOCK_ASSERT(held, MA_OWNED);
 		/* Old and new address share one lock: nothing to switch. */
 		if (PA_LOCKPTR(held) == PA_LOCKPTR(pa))
 			return (0);
 		PA_UNLOCK(held);
 	}
 	if (!PA_TRYLOCK(pa)) {
 		/* Contended: drop the pmap lock before blocking. */
 		PMAP_UNLOCK(pmap);
 		atomic_add_int(&pa_tryrelock_restart, 1);
 		PA_LOCK(pa);
 		PMAP_LOCK(pmap);
 		return (EAGAIN);
 	}
 	return (0);
 }
 
 /*
  *	vm_set_page_size:
  *
  *	Establish vm_cnt.v_page_size.  A value of zero is replaced by
  *	PAGE_SIZE; any value that is not a power of two causes a panic.
  *	Must be called before any use of page-size dependent functions.
  */
 void
 vm_set_page_size(void)
 {
 	if (vm_cnt.v_page_size == 0) {
 		vm_cnt.v_page_size = PAGE_SIZE;
 	}
 	/* A power of two shares no bits with its predecessor. */
 	if ((vm_cnt.v_page_size & (vm_cnt.v_page_size - 1)) != 0) {
 		panic("vm_set_page_size: page size not a power of two");
 	}
 }
 
 /*
  *	vm_page_blacklist_next:
  *
  *	Find the next entry in the provided string of blacklist
  *	addresses.  Entries are separated by space, comma, or newline.
  *	If an invalid integer is encountered then the rest of the
  *	string is skipped.  Updates the list pointer to the next
  *	character, or NULL if the string is exhausted or invalid.
  *
  *	list is an in/out cursor into the string.  end, when not NULL,
  *	points just past the last byte to parse; when NULL the string is
  *	taken to be NUL-terminated.
  *
  *	Returns the parsed address truncated to a page boundary, or 0 when
  *	no entry was produced (in which case *list is NULL, if list itself
  *	is not NULL).
  */
 static vm_paddr_t
 vm_page_blacklist_next(char **list, char *end)
 {
 	vm_paddr_t bad;
 	char *cp, *pos;
 
 	if (list == NULL || *list == NULL)
 		return (0);
 	if (**list =='\0') {
 		*list = NULL;
 		return (0);
 	}
 
 	/*
 	 * If there's no end pointer then the buffer is coming from
 	 * the kenv and we know it's null-terminated.
 	 */
 	if (end == NULL)
 		end = *list + strlen(*list);
 
 	/*
 	 * Ensure that strtoq() won't walk off the end.  Note that this
 	 * reads, and may overwrite with a NUL, the byte at *end.
 	 */
 	if (*end != '\0') {
 		if (*end == '\n' || *end == ' ' || *end  == ',')
 			*end = '\0';
 		else {
 			printf("Blacklist not terminated, skipping\n");
 			*list = NULL;
 			return (0);
 		}
 	}
 
 	for (pos = *list; *pos != '\0'; pos = cp) {
 		bad = strtoq(pos, &cp, 0);
 		if (*cp == '\0' || *cp == ' ' || *cp == ',' || *cp == '\n') {
 			/*
 			 * A zero value is not returned as an entry: step
 			 * over the delimiter and keep scanning, or give up
 			 * once the end is reached.
 			 */
 			if (bad == 0) {
 				if (++cp < end)
 					continue;
 				else
 					break;
 			}
 		} else
 			break;
 		/* Got an entry; leave the cursor just past its delimiter. */
 		if (*cp == '\0' || ++cp >= end)
 			*list = NULL;
 		else
 			*list = cp;
 		return (trunc_page(bad));
 	}
 	printf("Garbage in RAM blacklist, skipping\n");
 	*list = NULL;
 	return (0);
 }
 
 /*
  *	vm_page_blacklist_check:
  *
  *	Walk the blacklist string delimited by list and end.  Every address
  *	that maps to a page structure is pulled out of the physical
  *	allocator's free lists; pages that were successfully removed are
  *	appended to blacklist_head, which backs the vm.page_blacklist
  *	sysctl.  A NULL list is a no-op.
  */
 static void
 vm_page_blacklist_check(char *list, char *end)
 {
 	vm_paddr_t pa;
 	vm_page_t m;
 	char *cursor;
 	int removed;
 
 	for (cursor = list; cursor != NULL;) {
 		pa = vm_page_blacklist_next(&cursor, end);
 		if (pa == 0)
 			continue;
 		m = vm_phys_paddr_to_vm_page(pa);
 		if (m == NULL)
 			continue;
 		mtx_lock(&vm_page_queue_free_mtx);
 		removed = vm_phys_unfree_page(m);
 		mtx_unlock(&vm_page_queue_free_mtx);
 		if (removed != TRUE)
 			continue;
 		TAILQ_INSERT_TAIL(&blacklist_head, m, listq);
 		if (bootverbose)
 			printf("Skipping page with pa 0x%jx\n", (uintmax_t)pa);
 	}
 }
 
 /*
  *	vm_page_blacklist_load:
  *
  *	Look up the preloaded module of type "ram_blacklist", a plain text
  *	file supplied by the user through the loader directive of the same
  *	name.  On return *list points at the module data and *end just past
  *	it; both are NULL when no such module, or no data address, exists.
  */
 static void
 vm_page_blacklist_load(char **list, char **end)
 {
 	void *mod;
 	u_char *ptr;
 	u_int len;
 
 	mod = preload_search_by_type("ram_blacklist");
 	if (mod == NULL) {
 		*list = NULL;
 		*end = NULL;
 		return;
 	}
 	ptr = preload_fetch_addr(mod);
 	len = preload_fetch_size(mod);
 	*list = ptr;
 	if (ptr == NULL)
 		*end = NULL;
 	else
 		*end = ptr + len;
 }
 
 static int
 sysctl_vm_page_blacklist(SYSCTL_HANDLER_ARGS)
 {
 	vm_page_t m;
 	struct sbuf sbuf;
 	int error, first;
 
 	first = 1;
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	TAILQ_FOREACH(m, &blacklist_head, listq) {
 		sbuf_printf(&sbuf, "%s%#jx", first ? "" : ",",
 		    (uintmax_t)m->phys_addr);
 		first = 0;
 	}
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 static void
 vm_page_domain_init(struct vm_domain *vmd)
 {
 	struct vm_pagequeue *pq;
 	int i;
 
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_name) =
 	    "vm inactive pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_INACTIVE].pq_vcnt) =
 	    &vm_cnt.v_inactive_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_name) =
 	    "vm active pagequeue";
 	*__DECONST(u_int **, &vmd->vmd_pagequeues[PQ_ACTIVE].pq_vcnt) =
 	    &vm_cnt.v_active_count;
 	*__DECONST(char **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_name) =
 	    "vm laundry pagequeue";
 	*__DECONST(int **, &vmd->vmd_pagequeues[PQ_LAUNDRY].pq_vcnt) =
 	    &vm_cnt.v_laundry_count;
 	vmd->vmd_page_count = 0;
 	vmd->vmd_free_count = 0;
 	vmd->vmd_segs = 0;
 	vmd->vmd_oom = FALSE;
 	for (i = 0; i < PQ_COUNT; i++) {
 		pq = &vmd->vmd_pagequeues[i];
 		TAILQ_INIT(&pq->pq_pl);
 		mtx_init(&pq->pq_mutex, pq->pq_name, "vm pagequeue",
 		    MTX_DEF | MTX_DUPOK);
 	}
 }
 
 /*
  *	vm_page_startup:
  *
  *	Initializes the resident memory module.  Allocates physical memory for
  *	bootstrapping UMA and some data structures that are used to manage
  *	physical pages.  Initializes these structures, and populates the free
  *	page queues.
  *
  *	vaddr is the kernel virtual address from which the boot-time
  *	mappings made here are taken (it is page-rounded first and advanced
  *	by pmap_map() and by the guard page).  The updated address is
  *	returned.
  *
  *	All physical memory consumed here is carved from the top of the
  *	largest phys_avail[] chunk, whose end is lowered accordingly.
  */
 vm_offset_t
 vm_page_startup(vm_offset_t vaddr)
 {
 	vm_offset_t mapped;
 	vm_paddr_t high_avail, low_avail, page_range, size;
 	vm_paddr_t new_end;
 	int i;
 	vm_paddr_t pa;
 	vm_paddr_t last_pa;
 	char *list, *listend;
 	vm_paddr_t end;
 	vm_paddr_t biggestsize;
 	int biggestone;
 	int pages_per_zone;
 
 	biggestsize = 0;
 	biggestone = 0;
 	vaddr = round_page(vaddr);
 
 	/* Page-align every chunk, then find the largest one. */
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		phys_avail[i] = round_page(phys_avail[i]);
 		phys_avail[i + 1] = trunc_page(phys_avail[i + 1]);
 	}
 	for (i = 0; phys_avail[i + 1]; i += 2) {
 		size = phys_avail[i + 1] - phys_avail[i];
 		if (size > biggestsize) {
 			biggestone = i;
 			biggestsize = size;
 		}
 	}
 
 	end = phys_avail[biggestone+1];
 
 	/*
 	 * Initialize the page and queue locks.
 	 */
 	mtx_init(&vm_page_queue_free_mtx, "vm page free queue", NULL, MTX_DEF);
 	for (i = 0; i < PA_LOCK_COUNT; i++)
 		mtx_init(&pa_lock[i], "vm page", NULL, MTX_DEF);
 	for (i = 0; i < vm_ndomains; i++)
 		vm_page_domain_init(&vm_dom[i]);
 
 	/*
 	 * Almost all of the pages needed for bootstrapping UMA are used
 	 * for zone structures, so if the number of CPUs results in those
 	 * structures taking more than one page each, we set aside more pages
 	 * in proportion to the zone structure size.
 	 */
 	pages_per_zone = howmany(sizeof(struct uma_zone) +
 	    sizeof(struct uma_cache) * (mp_maxid + 1), UMA_SLAB_SIZE);
 	if (pages_per_zone > 1) {
 		/* Reserve more pages so that we don't run out. */
 		boot_pages = UMA_BOOT_PAGES_ZONES * pages_per_zone;
 	}
 
 	/*
 	 * Allocate memory for use when boot strapping the kernel memory
 	 * allocator.
 	 *
 	 * CTFLAG_RDTUN doesn't work during the early boot process, so we must
 	 * manually fetch the value.
 	 */
 	TUNABLE_INT_FETCH("vm.boot_pages", &boot_pages);
 	new_end = end - (boot_pages * UMA_SLAB_SIZE);
 	new_end = trunc_page(new_end);
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)mapped, end - new_end);
 	uma_startup((void *)mapped, boot_pages);
 
 #if defined(__aarch64__) || defined(__amd64__) || defined(__arm__) || \
     defined(__i386__) || defined(__mips__)
 	/*
 	 * Allocate a bitmap to indicate that a random physical page
 	 * needs to be included in a minidump.
 	 *
 	 * The amd64 port needs this to indicate which direct map pages
 	 * need to be dumped, via calls to dump_add_page()/dump_drop_page().
 	 *
 	 * However, i386 still needs this workspace internally within the
 	 * minidump code.  In theory, they are not needed on i386, but are
 	 * included should the sf_buf code decide to use them.
 	 */
 	last_pa = 0;
 	for (i = 0; dump_avail[i + 1] != 0; i += 2)
 		if (dump_avail[i + 1] > last_pa)
 			last_pa = dump_avail[i + 1];
 	/* One bit per page up to the highest dump_avail[] address. */
 	page_range = last_pa / PAGE_SIZE;
 	vm_page_dump_size = round_page(roundup2(page_range, NBBY) / NBBY);
 	new_end -= vm_page_dump_size;
 	vm_page_dump = (void *)(uintptr_t)pmap_map(&vaddr, new_end,
 	    new_end + vm_page_dump_size, VM_PROT_READ | VM_PROT_WRITE);
 	bzero((void *)vm_page_dump, vm_page_dump_size);
 #endif
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * Include the UMA bootstrap pages and vm_page_dump in a crash dump.
 	 * When pmap_map() uses the direct map, they are not automatically
 	 * included.
 	 */
 	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif
 	phys_avail[biggestone + 1] = new_end;
 #ifdef __amd64__
 	/*
 	 * Request that the physical pages underlying the message buffer be
 	 * included in a crash dump.  Since the message buffer is accessed
 	 * through the direct map, they are not automatically included.
 	 */
 	pa = DMAP_TO_PHYS((vm_offset_t)msgbufp->msg_ptr);
 	last_pa = pa + round_page(msgbufsize);
 	while (pa < last_pa) {
 		dump_add_page(pa);
 		pa += PAGE_SIZE;
 	}
 #endif
 	/*
 	 * Compute the number of pages of memory that will be available for
 	 * use, taking into account the overhead of a page structure per page.
 	 * In other words, solve
 	 *	"available physical memory" - round_page(page_range *
 	 *	    sizeof(struct vm_page)) = page_range * PAGE_SIZE
 	 * for page_range.
 	 */
 	low_avail = phys_avail[0];
 	high_avail = phys_avail[1];
 	for (i = 0; i < vm_phys_nsegs; i++) {
 		if (vm_phys_segs[i].start < low_avail)
 			low_avail = vm_phys_segs[i].start;
 		if (vm_phys_segs[i].end > high_avail)
 			high_avail = vm_phys_segs[i].end;
 	}
 	/* Skip the first chunk.  It is already accounted for. */
 	for (i = 2; phys_avail[i + 1] != 0; i += 2) {
 		if (phys_avail[i] < low_avail)
 			low_avail = phys_avail[i];
 		if (phys_avail[i + 1] > high_avail)
 			high_avail = phys_avail[i + 1];
 	}
 	first_page = low_avail / PAGE_SIZE;
 #ifdef VM_PHYSSEG_SPARSE
 	/* Sparse: count only memory that actually exists. */
 	size = 0;
 	for (i = 0; i < vm_phys_nsegs; i++)
 		size += vm_phys_segs[i].end - vm_phys_segs[i].start;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		size += phys_avail[i + 1] - phys_avail[i];
 #elif defined(VM_PHYSSEG_DENSE)
 	/* Dense: cover the whole span from lowest to highest address. */
 	size = high_avail - low_avail;
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 
 #ifdef VM_PHYSSEG_DENSE
 	/*
 	 * In the VM_PHYSSEG_DENSE case, the number of pages can account for
 	 * the overhead of a page structure per page only if vm_page_array is
 	 * allocated from the last physical memory chunk.  Otherwise, we must
 	 * allocate page structures representing the physical memory
 	 * underlying vm_page_array, even though they will not be used.
 	 */
 	if (new_end != high_avail)
 		page_range = size / PAGE_SIZE;
 	else
 #endif
 	{
 		page_range = size / (PAGE_SIZE + sizeof(struct vm_page));
 
 		/*
 		 * If the partial bytes remaining are large enough for
 		 * a page (PAGE_SIZE) without a corresponding
 		 * 'struct vm_page', then new_end will contain an
 		 * extra page after subtracting the length of the VM
 		 * page array.  Compensate by subtracting an extra
 		 * page from new_end.
 		 */
 		if (size % (PAGE_SIZE + sizeof(struct vm_page)) >= PAGE_SIZE) {
 			if (new_end == high_avail)
 				high_avail -= PAGE_SIZE;
 			new_end -= PAGE_SIZE;
 		}
 	}
 	end = new_end;
 
 	/*
 	 * Reserve an unmapped guard page to trap access to vm_page_array[-1].
 	 * However, because this page is allocated from KVM, out-of-bounds
 	 * accesses using the direct map will not be trapped.
 	 */
 	vaddr += PAGE_SIZE;
 
 	/*
 	 * Allocate physical memory for the page structures, and map it.
 	 */
 	new_end = trunc_page(end - page_range * sizeof(struct vm_page));
 	mapped = pmap_map(&vaddr, new_end, end,
 	    VM_PROT_READ | VM_PROT_WRITE);
 	vm_page_array = (vm_page_t) mapped;
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Allocate physical memory for the reservation management system's
 	 * data structures, and map it.
 	 */
 	if (high_avail == end)
 		high_avail = new_end;
 	new_end = vm_reserv_startup(&vaddr, new_end, high_avail);
 #endif
 #if defined(__aarch64__) || defined(__amd64__) || defined(__mips__)
 	/*
 	 * Include vm_page_array and vm_reserv_array in a crash dump.
 	 */
 	for (pa = new_end; pa < end; pa += PAGE_SIZE)
 		dump_add_page(pa);
 #endif
 	phys_avail[biggestone + 1] = new_end;
 
 	/*
 	 * Add physical memory segments corresponding to the available
 	 * physical pages.
 	 */
 	for (i = 0; phys_avail[i + 1] != 0; i += 2)
 		vm_phys_add_seg(phys_avail[i], phys_avail[i + 1]);
 
 	/*
 	 * Clear all of the page structures
 	 */
 	bzero((caddr_t) vm_page_array, page_range * sizeof(struct vm_page));
 	for (i = 0; i < page_range; i++)
 		vm_page_array[i].order = VM_NFREEORDER;
 	vm_page_array_size = page_range;
 
 	/*
 	 * Initialize the physical memory allocator.
 	 */
 	vm_phys_init();
 
 	/*
 	 * Add every available physical page that is not blacklisted to
 	 * the free lists.
 	 */
 	vm_cnt.v_page_count = 0;
 	vm_cnt.v_free_count = 0;
 	for (i = 0; phys_avail[i + 1] != 0; i += 2) {
 		pa = phys_avail[i];
 		last_pa = phys_avail[i + 1];
 		while (pa < last_pa) {
 			vm_phys_add_page(pa);
 			pa += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Apply both blacklist sources: the preloaded "ram_blacklist"
 	 * module first, then the "vm.blacklist" kernel environment
 	 * variable.  Blacklisted pages are pulled back off the free lists.
 	 */
 	TAILQ_INIT(&blacklist_head);
 	vm_page_blacklist_load(&list, &listend);
 	vm_page_blacklist_check(list, listend);
 
 	list = kern_getenv("vm.blacklist");
 	vm_page_blacklist_check(list, NULL);
 
 	freeenv(list);
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Initialize the reservation management system.
 	 */
 	vm_reserv_init();
 #endif
 	return (vaddr);
 }
 
 /*
  *	vm_page_reference:
  *
  *	Set the PGA_REFERENCED flag on the given page.
  */
 void
 vm_page_reference(vm_page_t m)
 {
 
 	vm_page_aflag_set(m, PGA_REFERENCED);
 }
 
 /*
  *	vm_page_busy_downgrade:
  *
  *	Downgrade an exclusive busy page into a single shared busy page.
  *
  *	The page must be exclusive busied (asserted).  The page lock may or
  *	may not be held by the caller; if it is not and waiters are
  *	present, it is taken here around the state change and the wakeup,
  *	and released before returning.
  */
 void
 vm_page_busy_downgrade(vm_page_t m)
 {
 	u_int x;
 	bool locked;
 
 	vm_page_assert_xbusied(m);
 	/* Remember whether the caller already owns the page lock. */
 	locked = mtx_owned(vm_page_lockptr(m));
 
 	for (;;) {
 		/* Keep only the waiters bit of the current busy word. */
 		x = m->busy_lock;
 		x &= VPB_BIT_WAITERS;
 		if (x != 0 && !locked)
 			vm_page_lock(m);
 		/*
 		 * Swap "exclusive (+ waiters)" for "one sharer".  This
 		 * fails if the waiters bit changed since it was sampled.
 		 */
 		if (atomic_cmpset_rel_int(&m->busy_lock,
 		    VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1)))
 			break;
 		if (x != 0 && !locked)
 			vm_page_unlock(m);
 	}
 	/* The new busy word has no waiters bit, so wake any sleepers. */
 	if (x != 0) {
 		wakeup(m);
 		if (!locked)
 			vm_page_unlock(m);
 	}
 }
 
 /*
  *	vm_page_sbusied:
  *
  *	Report whether the page is shared busied: returns 1 when the busy
  *	word has the shared bit set and is not the unbusied value, and 0
  *	otherwise.  The busy word is sampled once.
  */
 int
 vm_page_sbusied(vm_page_t m)
 {
 	u_int state;
 
 	state = m->busy_lock;
 	if ((state & VPB_BIT_SHARED) == 0)
 		return (0);
 	return (state != VPB_UNBUSIED);
 }
 
 /*
  *	vm_page_sunbusy:
  *
  *	Shared unbusy a page.
  *
  *	The page must be shared busied and the page lock must not be held
  *	by the caller (both asserted).  Drops one sharer; when the last
  *	sharer leaves and waiters are recorded, they are woken up under
  *	the page lock.
  */
 void
 vm_page_sunbusy(vm_page_t m)
 {
 	u_int x;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	vm_page_assert_sbusied(m);
 
 	for (;;) {
 		x = m->busy_lock;
 		/* Other sharers remain: just decrement the count. */
 		if (VPB_SHARERS(x) > 1) {
 			if (atomic_cmpset_int(&m->busy_lock, x,
 			    x - VPB_ONE_SHARER))
 				break;
 			continue;
 		}
 		/* Last sharer and nobody waiting: go straight to unbusied. */
 		if ((x & VPB_BIT_WAITERS) == 0) {
 			KASSERT(x == VPB_SHARERS_WORD(1),
 			    ("vm_page_sunbusy: invalid lock state"));
 			if (atomic_cmpset_int(&m->busy_lock,
 			    VPB_SHARERS_WORD(1), VPB_UNBUSIED))
 				break;
 			continue;
 		}
 		KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS),
 		    ("vm_page_sunbusy: invalid lock state for waiters"));
 
 		/*
 		 * Last sharer with waiters: clear the word and issue the
 		 * wakeup while holding the page lock; retry if the word
 		 * changed underneath us.
 		 */
 		vm_page_lock(m);
 		if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) {
 			vm_page_unlock(m);
 			continue;
 		}
 		wakeup(m);
 		vm_page_unlock(m);
 		break;
 	}
 }
 
 /*
  *	vm_page_busy_sleep:
  *
  *	Hard path of the busying mechanism: sleep with the page pointer as
  *	the wait channel.
  *
  *	The page lock must be held on entry.  It is released on every path:
  *	explicitly when no sleep takes place, or by msleep() (PDROP) when
  *	the thread does sleep.
  *
  *	The thread does not sleep when the page is unbusied, when
  *	"nonshared" is true and the page is shared busied, or when the
  *	waiters bit could not be set because the busy word changed.
  */
 void
 vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared)
 {
 	u_int state;
 
 	vm_page_assert_locked(m);
 
 	state = m->busy_lock;
 
 	/* Nothing to wait for. */
 	if (state == VPB_UNBUSIED ||
 	    (nonshared && (state & VPB_BIT_SHARED) != 0)) {
 		vm_page_unlock(m);
 		return;
 	}
 
 	/* Advertise ourselves as a waiter unless the bit is already set. */
 	if ((state & VPB_BIT_WAITERS) == 0 &&
 	    !atomic_cmpset_int(&m->busy_lock, state,
 	    state | VPB_BIT_WAITERS)) {
 		vm_page_unlock(m);
 		return;
 	}
 
 	msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0);
 }
 
 /*
  *	vm_page_trysbusy:
  *
  *	Attempt to add one sharer to the page's busy word without sleeping.
  *	Returns 1 once the sharer has been added, or 0 as soon as the busy
  *	word is observed without the shared bit.  A lost compare-and-set
  *	race simply re-reads the busy word and tries again.
  */
 int
 vm_page_trysbusy(vm_page_t m)
 {
 	u_int state;
 
 	do {
 		state = m->busy_lock;
 		if ((state & VPB_BIT_SHARED) == 0)
 			return (0);
 	} while (!atomic_cmpset_acq_int(&m->busy_lock, state,
 	    state + VPB_ONE_SHARER));
 	return (1);
 }
 
 /*
  *	vm_page_xunbusy_locked:
  *
  *	Release the exclusive busy state of a page whose lock is held:
  *	store VPB_UNBUSIED into the busy word (release store) and wake the
  *	threads sleeping on the page.
  *
  *	The page must be exclusive busied and locked (both are asserted).
  */
 static void
 vm_page_xunbusy_locked(vm_page_t m)
 {
 
 	vm_page_assert_xbusied(m);
 	vm_page_assert_locked(m);
 
 	atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED);
 	/* There is a waiter, do wakeup() instead of vm_page_flash(). */
 	wakeup(m);
 }
 
 /*
  *	vm_page_xunbusy_maybelocked:
  *
  *	Release the exclusive busy state of a page whose lock may or may
  *	not already be owned by the caller.  The page lock is left in the
  *	state in which it was found.
  */
 void
 vm_page_xunbusy_maybelocked(vm_page_t m)
 {
 	bool held;
 
 	vm_page_assert_xbusied(m);
 
 	/*
 	 * Try the uncontended transition first.  It only succeeds when the
 	 * busy word is exactly "exclusive, no waiters", in which case no
 	 * wakeup is needed.
 	 */
 	if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER,
 	    VPB_UNBUSIED))
 		return;
 
 	/* Slow path: unbusy under the page lock, taking it only if needed. */
 	held = mtx_owned(vm_page_lockptr(m));
 	if (!held)
 		vm_page_lock(m);
 	vm_page_xunbusy_locked(m);
 	if (!held)
 		vm_page_unlock(m);
 }
 
 /*
  *	vm_page_xunbusy_hard:
  *
  *	Called after the first attempt at the exclusive unbusy of a page
  *	failed.  It is assumed that the waiters bit is on.
  *
  *	Takes the page lock, unbusies the page and wakes its waiters via
  *	vm_page_xunbusy_locked(), then drops the page lock again.
  */
 void
 vm_page_xunbusy_hard(vm_page_t m)
 {
 
 	vm_page_assert_xbusied(m);
 
 	vm_page_lock(m);
 	vm_page_xunbusy_locked(m);
 	vm_page_unlock(m);
 }
 
 /*
  *	vm_page_flash:
  *
  *	Clear the waiters bit in the page's busy word and wake every thread
  *	sleeping on the page.  The ownership bits are left untouched.  If
  *	the waiters bit is found clear, nothing is done.
  *
  *	The page lock must be held.
  */
 void
 vm_page_flash(vm_page_t m)
 {
 	u_int state;
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	do {
 		state = m->busy_lock;
 		if ((state & VPB_BIT_WAITERS) == 0)
 			return;
 	} while (!atomic_cmpset_int(&m->busy_lock, state,
 	    state & ~VPB_BIT_WAITERS));
 	wakeup(m);
 }
 
 /*
  * Switch the held page lock to the one protecting page "m".
  *
  * "*mtx" is the page lock currently held by the caller, or NULL if none is
  * held.  When it already is the lock for "m" nothing happens, which avoids
  * releasing and reacquiring the same page lock.  Otherwise any held lock is
  * dropped, the lock for "m" is acquired, and "*mtx" is updated to it.
  */
 void
 vm_page_change_lock(vm_page_t m, struct mtx **mtx)
 {
 	struct mtx *wanted;
 
 	wanted = vm_page_lockptr(m);
 	if (wanted != *mtx) {
 		if (*mtx != NULL)
 			mtx_unlock(*mtx);
 		*mtx = wanted;
 		mtx_lock(wanted);
 	}
 }
 
 /*
  * Take a hold reference on a page.
  *
  * Holding keeps the page from being freed by the page daemon.  It has
  * much the same effect as wiring, at a much lower overhead, and should
  * be used only for *very* temporary holding ("wiring").
  *
  * The page lock must be held.
  */
 void
 vm_page_hold(vm_page_t mem)
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
 	mem->hold_count += 1;
 }
 
 /*
  * Drop a hold reference on a page.  When the last hold goes away and the
  * page is flagged PG_UNHOLDFREE, the page is handed to vm_page_free_toq().
  *
  * The page lock must be held.
  */
 void
 vm_page_unhold(vm_page_t mem)
 {
 
 	vm_page_lock_assert(mem, MA_OWNED);
 	KASSERT(mem->hold_count >= 1, ("vm_page_unhold: hold count < 0!!!"));
 	mem->hold_count--;
 	if (mem->hold_count != 0)
 		return;
 	if ((mem->flags & PG_UNHOLDFREE) != 0)
 		vm_page_free_toq(mem);
 }
 
 /*
  *	vm_page_unhold_pages:
  *
  *	Drop one hold reference from each of the "count" pages in the array
  *	"ma".  The page lock is switched with vm_page_change_lock(), so
  *	consecutive pages that share a lock do not cause it to be released
  *	and reacquired.  No page lock is held on return.
  */
 void
 vm_page_unhold_pages(vm_page_t *ma, int count)
 {
 	struct mtx *held;
 	vm_page_t *cur;
 
 	held = NULL;
 	for (cur = ma; count != 0; cur++, count--) {
 		vm_page_change_lock(*cur, &held);
 		vm_page_unhold(*cur);
 	}
 	if (held != NULL)
 		mtx_unlock(held);
 }
 
 /*
  *	PHYS_TO_VM_PAGE:
  *
  *	Translate the physical address "pa" into its vm_page structure.
  *	When the address is not covered by the regular page structures, the
  *	result of vm_phys_fictitious_to_vm_page() is returned instead.
  */
 vm_page_t
 PHYS_TO_VM_PAGE(vm_paddr_t pa)
 {
 	vm_page_t m;
 
 #ifdef VM_PHYSSEG_SPARSE
 	if ((m = vm_phys_paddr_to_vm_page(pa)) != NULL)
 		return (m);
 	return (vm_phys_fictitious_to_vm_page(pa));
 #elif defined(VM_PHYSSEG_DENSE)
 	long idx;
 
 	/* Index vm_page_array[] directly when the page number is in range. */
 	idx = atop(pa);
 	if (idx < first_page || (idx - first_page) >= vm_page_array_size)
 		m = vm_phys_fictitious_to_vm_page(pa);
 	else
 		m = &vm_page_array[idx - first_page];
 	return (m);
 #else
 #error "Either VM_PHYSSEG_DENSE or VM_PHYSSEG_SPARSE must be defined."
 #endif
 }
 
 /*
  *	vm_page_getfake:
  *
  *	Create a fictitious page with the specified physical address and
  *	memory attribute.  The memory attribute is the only machine-
  *	dependent aspect of a fictitious page that must be initialized.
  *
  *	The page structure is allocated zeroed from fakepg_zone with
  *	M_WAITOK and then set up by vm_page_initfake().
  */
 vm_page_t
 vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr)
 {
 	vm_page_t m;
 
 	m = uma_zalloc(fakepg_zone, M_WAITOK | M_ZERO);
 	vm_page_initfake(m, paddr, memattr);
 	return (m);
 }
 
 /*
  *	vm_page_initfake:
  *
  *	Set up "m" as a fictitious page for physical address "paddr" with
  *	memory attribute "memattr".  A page that is already flagged
  *	PG_FICTITIOUS keeps its existing fields and only has its memory
  *	attribute updated.
  */
 void
 vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		/* First initialization of this page structure. */
 		m->phys_addr = paddr;
 		m->queue = PQ_NONE;
 		/* "segind" is not used by fictitious pages. */
 		m->flags = PG_FICTITIOUS;
 		/* Neither are "order" and "pool". */
 		m->oflags = VPO_UNMANAGED;
 		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 		m->wire_count = 1;
 		pmap_page_init(m);
 	}
 
 	/*
 	 * For a page that was initialized earlier, the memattr might have
 	 * changed since then, so the pmap is always brought up to date.
 	 */
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_putfake:
  *
  *	Release a fictitious page by returning it to fakepg_zone.
  *
  *	The page must be unmanaged and flagged PG_FICTITIOUS (both are
  *	asserted).
  */
 void
 vm_page_putfake(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0, ("managed %p", m));
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_putfake: bad page %p", m));
 	uma_zfree(fakepg_zone, m);
 }
 
 /*
  *	vm_page_updatefake:
  *
  *	Update the given fictitious page to the specified physical address and
  *	memory attribute.
  *
  *	The page must be flagged PG_FICTITIOUS (asserted).
  */
 void
 vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
 {
 
 	KASSERT((m->flags & PG_FICTITIOUS) != 0,
 	    ("vm_page_updatefake: bad page %p", m));
 	m->phys_addr = paddr;
 	pmap_page_set_memattr(m, memattr);
 }
 
 /*
  *	vm_page_free:
  *
  *	Free a page.  PG_ZERO is cleared first, so the page is not treated
  *	as zeroed when it is handed to vm_page_free_toq().
  */
 void
 vm_page_free(vm_page_t m)
 {
 
 	m->flags &= ~PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  *	vm_page_free_zero:
  *
  *	Free a page that is known to be zeroed.  PG_ZERO is set before the
  *	page is handed to vm_page_free_toq().
  */
 void
 vm_page_free_zero(vm_page_t m)
 {
 
 	m->flags |= PG_ZERO;
 	vm_page_free_toq(m);
 }
 
 /*
  * Finish a page from a getpages request that was optionally read ahead or
  * behind: place it on a paging queue and release its exclusive busy state.
  */
 void
 vm_page_readahead_finish(vm_page_t m)
 {
 
 	/* Invalid pages must never be put on the queues. */
 	KASSERT(m->valid != 0, ("%s: %p is invalid", __func__, m));
 
 	/*
 	 * This page is not the one that was actually asked for, so it is
 	 * not obvious whether to activate or deactivate it.  Empirically,
 	 * deactivation is usually the better choice; the exception is a
 	 * page that another thread is already waiting for.
 	 */
 	vm_page_lock(m);
 	if ((m->busy_lock & VPB_BIT_WAITERS) == 0)
 		vm_page_deactivate(m);
 	else
 		vm_page_activate(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 }
 
 /*
  *	vm_page_sleep_if_busy:
  *
  *	If the page is busied, drop the object lock, wait in
  *	vm_page_busy_sleep(), reacquire the object lock and return TRUE.
  *	Otherwise return FALSE without releasing anything.
  *
  *	On entry the page must be unlocked and the object containing it
  *	must be write locked.
  */
 int
 vm_page_sleep_if_busy(vm_page_t m, const char *msg)
 {
 	vm_object_t locked_obj;
 
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	if (!vm_page_busied(m))
 		return (FALSE);
 
 	/*
 	 * Remember the object before sleeping.  The page's identity can
 	 * change during the sleep, and re-reading m->object afterwards
 	 * could relock a different object.  The callers are assumed to
 	 * already hold a reference to the object.
 	 */
 	locked_obj = m->object;
 	vm_page_lock(m);
 	VM_OBJECT_WUNLOCK(locked_obj);
 	vm_page_busy_sleep(m, msg, false);
 	VM_OBJECT_WLOCK(locked_obj);
 	return (TRUE);
 }
 
 /*
  *	vm_page_dirty_KBI:		[ internal use only ]
  *
  *	Set all bits in the page's dirty field.
  *
  *	The page must be fully valid (asserted).
  *
  *	The object containing the specified page must be locked if the
  *	call is made from the machine-independent layer.
  *
  *	See vm_page_clear_dirty_mask().
  *
  *	This function should only be called by vm_page_dirty().
  */
 void
 vm_page_dirty_KBI(vm_page_t m)
 {
 
 	/* Refer to this operation by its public name. */
 	KASSERT(m->valid == VM_PAGE_BITS_ALL,
 	    ("vm_page_dirty: page is invalid!"));
 	m->dirty = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_insert:		[ internal use only ]
  *
  *	Inserts the given mem entry into the object and object list.
  *
  *	The predecessor page is looked up in the object's radix trie with
  *	vm_radix_lookup_le() and the work is delegated to
  *	vm_page_insert_after(), whose result is returned: 0 on success,
  *	1 if the radix trie insertion failed.
  *
  *	The object must be locked.
  */
 int
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t mpred;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	mpred = vm_radix_lookup_le(&object->rtree, pindex);
 	return (vm_page_insert_after(m, object, pindex, mpred));
 }
 
 /*
  *	vm_page_insert_after:
  *
  *	Insert the page "m" into "object" at offset "pindex".
  *
  *	"mpred" is the resident page that immediately precedes "pindex" in
  *	the object, or NULL when there is no such page.
  *
  *	Returns 0 on success.  Returns 1 if the radix trie insertion failed,
  *	in which case the page's object and pindex are reset.
  *
  *	The object must be locked.
  */
 static int
 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mpred)
 {
 	vm_page_t msucc;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(m->object == NULL,
 	    ("vm_page_insert_after: page already inserted"));
 
 	/* Sanity check the neighbours of the insertion point. */
 	if (mpred == NULL) {
 		msucc = TAILQ_FIRST(&object->memq);
 	} else {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_after: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < pindex,
 		    ("vm_page_insert_after: mpred doesn't precede pindex"));
 		msucc = TAILQ_NEXT(mpred, listq);
 	}
 	if (msucc != NULL)
 		KASSERT(msucc->pindex > pindex,
 		    ("vm_page_insert_after: msucc doesn't succeed pindex"));
 
 	/* The page must carry its identity before entering the trie. */
 	m->object = object;
 	m->pindex = pindex;
 
 	/* Hook the page into the radix trie; undo the identity on failure. */
 	if (vm_radix_insert(&object->rtree, m) != 0) {
 		m->object = NULL;
 		m->pindex = 0;
 		return (1);
 	}
 
 	/* Finish with the object's page list and its bookkeeping. */
 	vm_page_insert_radixdone(m, object, mpred);
 	return (0);
 }
 
 /*
  *	vm_page_insert_radixdone:
  *
  *	Complete page "m" insertion into the specified object after the
  *	radix trie hooking: link the page into the object's page list,
  *	account for it in the resident page count, hold the vnode on the
  *	first resident page of a vnode object, and mark the object as
  *	possibly dirty if the page is write mapped.
  *
  *	The page "mpred" must precede the offset "m->pindex" within the
  *	specified object; NULL means that "m" becomes the first page.
  *
  *	The object must be locked.
  */
 static void
 vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object != NULL && m->object == object,
 	    ("vm_page_insert_radixdone: page %p has inconsistent object", m));
 
 	/*
 	 * Link the page into the object's list, which is kept sorted by
 	 * pindex.  The assertion messages name this function so that a
 	 * panic points at the real call site.
 	 */
 	if (mpred != NULL) {
 		KASSERT(mpred->object == object,
 		    ("vm_page_insert_radixdone: object doesn't contain mpred"));
 		KASSERT(mpred->pindex < m->pindex,
 		    ("vm_page_insert_radixdone: mpred doesn't precede pindex"));
 		TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
 	} else
 		TAILQ_INSERT_HEAD(&object->memq, m, listq);
 
 	/*
 	 * Show that the object has one more resident page.
 	 */
 	object->resident_page_count++;
 
 	/*
 	 * Hold the vnode until the last page is released.
 	 */
 	if (object->resident_page_count == 1 && object->type == OBJT_VNODE)
 		vhold(object->handle);
 
 	/*
 	 * Since we are inserting a new and possibly dirty page,
 	 * update the object's OBJ_MIGHTBEDIRTY flag.
 	 */
 	if (pmap_page_is_write_mapped(m))
 		vm_object_set_writeable_dirty(object);
 }
 
 /*
  *	vm_page_remove:
  *
  *	Detach the page from the object that contains it.  Backing storage
  *	is not invalidated.  A page that belongs to no object is left
  *	alone.  An exclusive busy state held on the page is released as
  *	part of the removal.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
 void
 vm_page_remove(vm_page_t m)
 {
 	vm_object_t object;
 	vm_page_t mrem;
 
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_assert_locked(m);
 	object = m->object;
 	if (object == NULL)
 		return;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (vm_page_xbusied(m))
 		vm_page_xunbusy_maybelocked(m);
 
 	/* Unhook the page from the radix trie ... */
 	mrem = vm_radix_remove(&object->rtree, m->pindex);
 	KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m));
 
 	/* ... and from the object's list of backed pages. */
 	TAILQ_REMOVE(&object->memq, m, listq);
 
 	/* The object now has one fewer resident page. */
 	object->resident_page_count--;
 
 	/* With its last page gone, the vnode may be recycled. */
 	if (object->type == OBJT_VNODE && object->resident_page_count == 0)
 		vdrop(object->handle);
 
 	m->object = NULL;
 }
 
 /*
  *	vm_page_lookup:
  *
  *	Returns the page associated with the object/offset
  *	pair specified; if none is found, NULL is returned.
  *
  *	The lookup is done in the object's radix trie.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_lookup(vm_object_t object, vm_pindex_t pindex)
 {
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	return (vm_radix_lookup(&object->rtree, pindex));
 }
 
 /*
  *	vm_page_find_least:
  *
  *	Return the resident page of the object with the smallest pindex
  *	that is greater than or equal to "pindex", or NULL if there is
  *	none.
  *
  *	The object must be locked.
  */
 vm_page_t
 vm_page_find_least(vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 
 	/*
 	 * The first page on the list is the answer whenever it is already
 	 * at or beyond "pindex"; only otherwise is the radix trie searched.
 	 */
 	m = TAILQ_FIRST(&object->memq);
 	if (m != NULL && m->pindex < pindex)
 		m = vm_radix_lookup_ge(&object->rtree, pindex);
 	return (m);
 }
 
 /*
  * Return the resident page at pindex "m->pindex + 1" of the object that
  * contains "m", or NULL when that page is not resident.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_next(vm_page_t m)
 {
 	vm_page_t next;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	next = TAILQ_NEXT(m, listq);
 	if (next == NULL)
 		return (NULL);
 	MPASS(next->object == m->object);
 	/* The list neighbour only counts if it is directly adjacent. */
 	return (next->pindex == m->pindex + 1 ? next : NULL);
 }
 
 /*
  * Return the resident page at pindex "m->pindex - 1" of the object that
  * contains "m", or NULL when that page is not resident.
  *
  * The object must be locked.
  */
 vm_page_t
 vm_page_prev(vm_page_t m)
 {
 	vm_page_t prev;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	prev = TAILQ_PREV(m, pglist, listq);
 	if (prev == NULL)
 		return (NULL);
 	MPASS(prev->object == m->object);
 	/* The list neighbour only counts if it is directly adjacent. */
 	return (prev->pindex == m->pindex - 1 ? prev : NULL);
 }
 
 /*
  * Uses the page mnew as a replacement for an existing page at index
  * pindex which must be already present in the object.
  *
  * The existing page must not be on a paging queue.
  *
  * Returns the replaced page.  Its object pointer has been cleared and its
  * exclusive busy state released via vm_page_xunbusy_maybelocked().
  *
  * The object must be write locked, and mnew must not belong to an object.
  */
 vm_page_t
 vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
 {
 	vm_page_t mold;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(mnew->object == NULL,
 	    ("vm_page_replace: page already in object"));
 
 	/*
 	 * This function mostly follows vm_page_insert() and
 	 * vm_page_remove() without the radix, object count and vnode
 	 * dance.  Double check such functions for more comments.
 	 */
 
 	/* Give mnew its identity, then swap it into the radix trie. */
 	mnew->object = object;
 	mnew->pindex = pindex;
 	mold = vm_radix_replace(&object->rtree, mnew);
 	KASSERT(mold->queue == PQ_NONE,
 	    ("vm_page_replace: mold is on a paging queue"));
 
 	/* Keep the resident page list in sorted order. */
 	TAILQ_INSERT_AFTER(&object->memq, mold, mnew, listq);
 	TAILQ_REMOVE(&object->memq, mold, listq);
 
 	mold->object = NULL;
 	vm_page_xunbusy_maybelocked(mold);
 
 	/*
 	 * The object's resident_page_count does not change because we have
 	 * swapped one page for another, but OBJ_MIGHTBEDIRTY.
 	 */
 	if (pmap_page_is_write_mapped(mnew))
 		vm_object_set_writeable_dirty(object);
 	return (mold);
 }
 
 /*
  *	vm_page_rename:
  *
  *	Move the given memory entry from its
  *	current object to the specified target object/offset.
  *
  *	Note: swap associated with the page must be invalidated by the move.  We
  *	      have to do this for several reasons:  (1) we aren't freeing the
  *	      page, (2) we are dirtying the page, (3) the VM system is probably
  *	      moving the page from object A to B, and will then later move
  *	      the backing store from A to B and we can't have a conflict.
  *
  *	Note: we *always* dirty the page.  It is necessary both for the
  *	      fact that we moved it, and because we may be invalidating
  *	      swap.
  *
  *	Returns 0 on success.  Returns 1 if the page could not be inserted
  *	into the new object's radix trie; the page is then left in its
  *	original object with its original pindex.
  *
  *	The objects must be locked.
  */
 int
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
 	vm_page_t mpred;
 	vm_pindex_t opidx;
 
 	VM_OBJECT_ASSERT_WLOCKED(new_object);
 
 	mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
 	KASSERT(mpred == NULL || mpred->pindex != new_pindex,
 	    ("vm_page_rename: pindex already renamed"));
 
 	/*
 	 * Open-coded variant of vm_page_insert(): temporarily give the page
 	 * its new pindex so that it can be entered into the new object's
 	 * radix trie, and restore the old pindex if the insertion fails.
 	 */
 	opidx = m->pindex;
 	m->pindex = new_pindex;
 	if (vm_radix_insert(&new_object->rtree, m)) {
 		m->pindex = opidx;
 		return (1);
 	}
 
 	/*
 	 * The operation cannot fail anymore.  The removal must happen before
 	 * the listq iterator is tainted.  The old pindex is put back first
 	 * because vm_page_remove() uses it to find the page in the old
 	 * object's radix trie.
 	 */
 	m->pindex = opidx;
 	vm_page_lock(m);
 	vm_page_remove(m);
 
 	/* Return back to the new pindex to complete vm_page_insert(). */
 	m->pindex = new_pindex;
 	m->object = new_object;
 	vm_page_unlock(m);
 	vm_page_insert_radixdone(m, new_object, mpred);
 	vm_page_dirty(m);
 	return (0);
 }
 
 /*
  *	vm_page_alloc:
  *
  *	Allocate and return a page that is associated with the specified
  *	object and offset pair.  By default, this page is exclusive busied.
  *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	Returns NULL if VM_ALLOC_IFCACHED is requested, if the free page
  *	count is too low for the allocation class, or if the page cannot be
  *	inserted into the object.
  *
  *	If an object is given, it must be write locked.
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int req)
 {
 	vm_page_t m, mpred;
 	int flags, req_class;
 
 	mpred = NULL;	/* XXX: pacify gcc */
 	/*
 	 * VM_ALLOC_NOOBJ must be given exactly when there is no object,
 	 * VM_ALLOC_SBUSY requires an object, and VM_ALLOC_NOBUSY and
 	 * VM_ALLOC_SBUSY are mutually exclusive.
 	 */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc: inconsistent object(%p)/req(%x)", object, req));
 	if (object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/* VM_ALLOC_IFCACHED requests are always refused here. */
 	if (__predict_false((req & VM_ALLOC_IFCACHED) != 0))
 		return (NULL);
 
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Find the page preceding "pindex" now; it is used both by the
 	 * reservation allocator and by the insertion below.
 	 */
 	if (object != NULL) {
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 		KASSERT(mpred == NULL || mpred->pindex != pindex,
 		   ("vm_page_alloc: pindex already allocated"));
 	}
 
 	/*
 	 * Allocate a page if the number of free pages exceeds the minimum
 	 * for the request class.
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count > 0)) {
 		/*
 		 * Can we allocate the page from a reservation?
 		 */
 #if VM_NRESERVLEVEL > 0
 		if (object == NULL || (object->flags & (OBJ_COLORED |
 		    OBJ_FICTITIOUS)) != OBJ_COLORED || (m =
 		    vm_reserv_alloc_page(object, pindex, mpred)) == NULL)
 #endif
 		{
 			/*
 			 * If not, allocate it from the free page queues.
 			 */
 			m = vm_phys_alloc_pages(object != NULL ?
 			    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT, 0);
 #if VM_NRESERVLEVEL > 0
 			if (m == NULL && vm_reserv_reclaim_inactive()) {
 				m = vm_phys_alloc_pages(object != NULL ?
 				    VM_FREEPOOL_DEFAULT : VM_FREEPOOL_DIRECT,
 				    0);
 			}
 #endif
 		}
 	} else {
 		/*
 		 * Not allocatable, give up.  Record the shortfall and wake
 		 * the page daemon.
 		 */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
 		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 
 	/*
 	 *  At this point we had better have found a good page.
 	 */
 	KASSERT(m != NULL, ("vm_page_alloc: missing page"));
 	vm_phys_freecnt_adj(m, -1);
 	if ((m->flags & PG_ZERO) != 0)
 		vm_page_zero_count--;
 	mtx_unlock(&vm_page_queue_free_mtx);
 	vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the page.  Only the PG_ZERO flag is inherited, and only
 	 * if the caller asked for a zeroed page.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	flags &= m->flags;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	m->flags = flags;
 	m->aflags = 0;
 	m->oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
 	/* Exclusive busy is the default; the request flags may override it. */
 	m->busy_lock = VPB_UNBUSIED;
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		m->busy_lock = VPB_SINGLE_EXCLUSIVER;
 	if ((req & VM_ALLOC_SBUSY) != 0)
 		m->busy_lock = VPB_SHARERS_WORD(1);
 	if (req & VM_ALLOC_WIRED) {
 		/*
 		 * The page lock is not required for wiring a page until that
 		 * page is inserted into the object.
 		 */
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	m->act_count = 0;
 
 	if (object != NULL) {
 		if (vm_page_insert_after(m, object, pindex, mpred)) {
 			/*
 			 * The insertion failed: undo the wiring, reset the
 			 * page state and give the page back.
 			 */
 			pagedaemon_wakeup();
 			if (req & VM_ALLOC_WIRED) {
 				atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 				m->wire_count = 0;
 			}
 			KASSERT(m->object == NULL, ("page %p has object", m));
 			m->oflags = VPO_UNMANAGED;
 			m->busy_lock = VPB_UNBUSIED;
 			/* Don't change PG_ZERO. */
 			vm_page_free_toq(m);
 			return (NULL);
 		}
 
 		/* Ignore device objects; the pager sets "memattr" for them. */
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    (object->flags & OBJ_FICTITIOUS) == 0)
 			pmap_page_set_memattr(m, object->memattr);
 	} else
 		m->pindex = pindex;
 
 	/*
 	 * Don't wakeup too often - wakeup the pageout daemon when
 	 * we would be nearly out of memory.
 	 */
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 
 	return (m);
 }
 
 /*
  *	vm_page_alloc_contig:
  *
  *	Allocate a contiguous set of physical pages of the given size "npages"
  *	from the free lists.  All of the physical pages must be at or above
  *	the given physical address "low" and below the given physical address
  *	"high".  The given value "alignment" determines the alignment of the
  *	first physical page in the set.  If the given value "boundary" is
  *	non-zero, then the set of physical pages cannot cross any physical
  *	address boundary that is a multiple of that value.  Both "alignment"
  *	and "boundary" must be a power of two.
  *
  *	If the specified memory attribute, "memattr", is VM_MEMATTR_DEFAULT,
  *	then the memory attribute setting for the physical pages is configured
  *	to the object's memory attribute setting.  Otherwise, the memory
  *	attribute setting for the physical pages is configured to "memattr",
  *	overriding the object's memory attribute setting.  However, if the
  *	object's memory attribute setting is not VM_MEMATTR_DEFAULT, then the
  *	memory attribute setting for the physical pages cannot be configured
  *	to VM_MEMATTR_DEFAULT.
  *
  *	The specified object may not contain fictitious pages.
  *
  *	The caller must always specify an allocation class.
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_NOBUSY		do not exclusive busy the page
  *	VM_ALLOC_NODUMP		do not include the page in a kernel core dump
  *	VM_ALLOC_NOOBJ		page is not associated with an object and
  *				should not be exclusive busy
  *	VM_ALLOC_SBUSY		shared busy the allocated page
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	Returns the first page of the run, or NULL if the free page count is
  *	too low for the allocation class, if no suitable run is found, or if
  *	a page cannot be inserted into the object.
  *
  *	If an object is given, it must be write locked.
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
 	vm_page_t m, m_ret, mpred;
 	u_int busy_lock, flags, oflags;
 	int req_class;
 
 	mpred = NULL;	/* XXX: pacify gcc */
 	/*
 	 * VM_ALLOC_NOOBJ must be given exactly when there is no object,
 	 * VM_ALLOC_SBUSY requires an object, and VM_ALLOC_NOBUSY and
 	 * VM_ALLOC_SBUSY are mutually exclusive.
 	 */
 	KASSERT((object != NULL) == ((req & VM_ALLOC_NOOBJ) == 0) &&
 	    (object != NULL || (req & VM_ALLOC_SBUSY) == 0) &&
 	    ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) !=
 	    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)),
 	    ("vm_page_alloc_contig: inconsistent object(%p)/req(%x)", object,
 	    req));
 	if (object != NULL) {
 		VM_OBJECT_ASSERT_WLOCKED(object);
 		KASSERT((object->flags & OBJ_FICTITIOUS) == 0,
 		    ("vm_page_alloc_contig: object %p has fictitious pages",
 		    object));
 	}
 	KASSERT(npages > 0, ("vm_page_alloc_contig: npages is zero"));
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Find the page preceding "pindex" now; it is used both by the
 	 * reservation allocator and by the insertions below.
 	 */
 	if (object != NULL) {
 		mpred = vm_radix_lookup_le(&object->rtree, pindex);
 		KASSERT(mpred == NULL || mpred->pindex != pindex,
 		    ("vm_page_alloc_contig: pindex already allocated"));
 	}
 
 	/*
 	 * Can we allocate the pages without the number of free pages falling
 	 * below the lower bound for the allocation class?
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (vm_cnt.v_free_count >= npages + vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count >= npages + vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count >= npages)) {
 		/*
 		 * Can we allocate the pages from a reservation?
 		 */
 #if VM_NRESERVLEVEL > 0
 retry:
 		if (object == NULL || (object->flags & OBJ_COLORED) == 0 ||
 		    (m_ret = vm_reserv_alloc_contig(object, pindex, npages,
 		    low, high, alignment, boundary, mpred)) == NULL)
 #endif
 			/*
 			 * If not, allocate them from the free page queues.
 			 */
 			m_ret = vm_phys_alloc_contig(npages, low, high,
 			    alignment, boundary);
 	} else {
 		/* Too few free pages: record the shortfall and give up. */
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit, npages);
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 	if (m_ret != NULL) {
 		/* Account for the pages taken from the free lists. */
 		vm_phys_freecnt_adj(m_ret, -npages);
 		for (m = m_ret; m < &m_ret[npages]; m++)
 			if ((m->flags & PG_ZERO) != 0)
 				vm_page_zero_count--;
 	} else {
 #if VM_NRESERVLEVEL > 0
 		/* Retry the allocation if the reservation reclaim succeeds. */
 		if (vm_reserv_reclaim_contig(npages, low, high, alignment,
 		    boundary))
 			goto retry;
 #endif
 	}
 	mtx_unlock(&vm_page_queue_free_mtx);
 	if (m_ret == NULL)
 		return (NULL);
 	for (m = m_ret; m < &m_ret[npages]; m++)
 		vm_page_alloc_check(m);
 
 	/*
 	 * Initialize the pages.  Only the PG_ZERO flag is inherited.
 	 */
 	flags = 0;
 	if ((req & VM_ALLOC_ZERO) != 0)
 		flags = PG_ZERO;
 	if ((req & VM_ALLOC_NODUMP) != 0)
 		flags |= PG_NODUMP;
 	oflags = object == NULL || (object->flags & OBJ_UNMANAGED) != 0 ?
 	    VPO_UNMANAGED : 0;
 	/* Exclusive busy is the default; the request flags may override it. */
 	busy_lock = VPB_UNBUSIED;
 	if ((req & (VM_ALLOC_NOBUSY | VM_ALLOC_NOOBJ | VM_ALLOC_SBUSY)) == 0)
 		busy_lock = VPB_SINGLE_EXCLUSIVER;
 	if ((req & VM_ALLOC_SBUSY) != 0)
 		busy_lock = VPB_SHARERS_WORD(1);
 	/* The wire count is charged for the whole run up front. */
 	if ((req & VM_ALLOC_WIRED) != 0)
 		atomic_add_int(&vm_cnt.v_wire_count, npages);
 	if (object != NULL) {
 		if (object->memattr != VM_MEMATTR_DEFAULT &&
 		    memattr == VM_MEMATTR_DEFAULT)
 			memattr = object->memattr;
 	}
 	for (m = m_ret; m < &m_ret[npages]; m++) {
 		m->aflags = 0;
 		/*
 		 * PG_ZERO survives only if the page had it and it was
 		 * requested; PG_NODUMP is set exactly when it was requested.
 		 */
 		m->flags = (m->flags | PG_NODUMP) & flags;
 		m->busy_lock = busy_lock;
 		if ((req & VM_ALLOC_WIRED) != 0)
 			m->wire_count = 1;
 		m->act_count = 0;
 		m->oflags = oflags;
 		if (object != NULL) {
 			if (vm_page_insert_after(m, object, pindex, mpred)) {
 				/*
 				 * The insertion of page "m" failed.  Return
 				 * the wire count charged for the run and
 				 * free every page of the run.  Only the
 				 * pages up to and including the failing one
 				 * have had "wire_count" set, so only those
 				 * have it cleared.
 				 */
 				pagedaemon_wakeup();
 				if ((req & VM_ALLOC_WIRED) != 0)
 					atomic_subtract_int(
 					    &vm_cnt.v_wire_count, npages);
 				KASSERT(m->object == NULL,
 				    ("page %p has object", m));
 				mpred = m;
 				for (m = m_ret; m < &m_ret[npages]; m++) {
 					if (m <= mpred &&
 					    (req & VM_ALLOC_WIRED) != 0)
 						m->wire_count = 0;
 					m->oflags = VPO_UNMANAGED;
 					m->busy_lock = VPB_UNBUSIED;
 					/* Don't change PG_ZERO. */
 					vm_page_free_toq(m);
 				}
 				return (NULL);
 			}
 			/* The page just inserted precedes the next one. */
 			mpred = m;
 		} else
 			m->pindex = pindex;
 		if (memattr != VM_MEMATTR_DEFAULT)
 			pmap_page_set_memattr(m, memattr);
 		pindex++;
 	}
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 	return (m_ret);
 }
 
 /*
  * Check a page that has been freshly dequeued from a freelist.
  *
  * Every check is a KASSERT: the page must have no object, be on no paging
  * queue, be neither wired, held nor busied, be clean, carry the default
  * memory attribute and have no valid bits set.
  */
 static void
 vm_page_alloc_check(vm_page_t m)
 {
 
 	KASSERT(m->object == NULL, ("page %p has object", m));
 	KASSERT(m->queue == PQ_NONE,
 	    ("page %p has unexpected queue %d", m, m->queue));
 	KASSERT(m->wire_count == 0, ("page %p is wired", m));
 	KASSERT(m->hold_count == 0, ("page %p is held", m));
 	KASSERT(!vm_page_busied(m), ("page %p is busy", m));
 	KASSERT(m->dirty == 0, ("page %p is dirty", m));
 	KASSERT(pmap_page_get_memattr(m) == VM_MEMATTR_DEFAULT,
 	    ("page %p has unexpected memattr %d",
 	    m, pmap_page_get_memattr(m)));
 	KASSERT(m->valid == 0, ("free page %p is valid", m));
 }
 
 /*
  * 	vm_page_alloc_freelist:
  *
  *	Allocate one physical page from the free page list selected by
  *	"flind".  The page is returned unmanaged and without an object.
  *
  *	An allocation class must always be given in "req".
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	optional allocation flags:
  *	VM_ALLOC_COUNT(number)	the number of additional pages that the caller
  *				intends to allocate
  *	VM_ALLOC_WIRED		wire the allocated page
  *	VM_ALLOC_ZERO		prefer a zeroed page
  *
  *	Returns NULL if the free page count is too low for the allocation
  *	class or if the selected free list has no page to give.
  *
  *	This routine may not sleep.
  */
 vm_page_t
 vm_page_alloc_freelist(int flind, int req)
 {
 	vm_page_t m;
 	u_int keep;
 	int req_class;
 
 	/*
 	 * Determine the allocation class.  The page daemon may dig deeper
 	 * into the free page list than an ordinary caller.
 	 */
 	req_class = req & VM_ALLOC_CLASS_MASK;
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Reserved pages are only handed out to requests of a class that
 	 * is entitled to them.  On refusal, record the shortfall and wake
 	 * the page daemon.
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (!(vm_cnt.v_free_count > vm_cnt.v_free_reserved ||
 	    (req_class == VM_ALLOC_SYSTEM &&
 	    vm_cnt.v_free_count > vm_cnt.v_interrupt_free_min) ||
 	    (req_class == VM_ALLOC_INTERRUPT &&
 	    vm_cnt.v_free_count > 0))) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		atomic_add_int(&vm_pageout_deficit,
 		    max((u_int)req >> VM_ALLOC_COUNT_SHIFT, 1));
 		pagedaemon_wakeup();
 		return (NULL);
 	}
 
 	m = vm_phys_alloc_freelist_pages(flind, VM_FREEPOOL_DIRECT, 0);
 	if (m == NULL) {
 		mtx_unlock(&vm_page_queue_free_mtx);
 		return (NULL);
 	}
 
 	/* Account for the page while the free queue mutex is still held. */
 	vm_phys_freecnt_adj(m, -1);
 	if ((m->flags & PG_ZERO) != 0)
 		vm_page_zero_count--;
 	mtx_unlock(&vm_page_queue_free_mtx);
 	vm_page_alloc_check(m);
 
 	/*
 	 * Set the page up.  PG_ZERO is the only flag that may be carried
 	 * over, and only when a zeroed page was asked for.
 	 */
 	m->aflags = 0;
 	keep = (req & VM_ALLOC_ZERO) != 0 ? PG_ZERO : 0;
 	m->flags &= keep;
 	if ((req & VM_ALLOC_WIRED) != 0) {
 		/*
 		 * A page that belongs to no object can be wired without
 		 * holding the page lock.
 		 */
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 		m->wire_count = 1;
 	}
 	/* "act_count" is not used by unmanaged pages. */
 	m->oflags = VPO_UNMANAGED;
 	if (vm_paging_needed())
 		pagedaemon_wakeup();
 	return (m);
 }
 
 /*
  * Values for the "options" argument of vm_page_scan_contig().  They
  * restrict which pages may be counted toward a contiguous run.
  */
 #define	VPSC_ANY	0	/* No restrictions. */
 #define	VPSC_NORESERV	1	/* Skip reservations; implies VPSC_NOSUPER. */
 #define	VPSC_NOSUPER	2	/* Skip superpages. */
 
 /*
  *	vm_page_scan_contig:
  *
  *	Scan vm_page_array[] between the specified entries "m_start" and
  *	"m_end" for a run of contiguous physical pages that satisfy the
  *	specified conditions, and return the lowest page in the run.  The
  *	specified "alignment" determines the alignment of the lowest physical
  *	page in the run.  If the specified "boundary" is non-zero, then the
  *	run of physical pages cannot span a physical address that is a
  *	multiple of "boundary".
  *
  *	"m_end" is never dereferenced, so it need not point to a vm_page
  *	structure within vm_page_array[].
  *
  *	"npages" must be greater than zero.  "m_start" and "m_end" must not
  *	span a hole (or discontiguity) in the physical address space.  Both
  *	"alignment" and "boundary" must be a power of two.
  *
  *	"options" is one of the VPSC_* values and limits which pages may be
  *	part of the run.
  *
  *	Returns NULL if no run of at least "npages" pages was found.  A page
  *	extends the run if it is free, or if it is allocated but looks
  *	relocatable; nothing is reclaimed or reserved by this routine.
  */
 vm_page_t
 vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end,
     u_long alignment, vm_paddr_t boundary, int options)
 {
 	struct mtx *m_mtx;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_run;
 #if VM_NRESERVLEVEL > 0
 	int level;
 #endif
 	int m_inc, order, run_ext, run_len;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	/*
 	 * "m_run" is the first page of the candidate run and "run_len" its
 	 * length so far.  "m_inc" is how far to advance "m" on the next
 	 * iteration; every path through the loop body sets it.
 	 */
 	m_run = NULL;
 	run_len = 0;
 	m_mtx = NULL;
 	for (m = m_start; m < m_end && run_len < npages; m += m_inc) {
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
 		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
 
 		/*
 		 * If the current page would be the start of a run, check its
 		 * physical address against the end, alignment, and boundary
 		 * conditions.  If it doesn't satisfy these conditions, either
 		 * terminate the scan or advance to the next page that
 		 * satisfies the failed condition.
 		 */
 		if (run_len == 0) {
 			KASSERT(m_run == NULL, ("m_run != NULL"));
 			if (m + npages > m_end)
 				break;
 			pa = VM_PAGE_TO_PHYS(m);
 			if ((pa & (alignment - 1)) != 0) {
 				m_inc = atop(roundup2(pa, alignment) - pa);
 				continue;
 			}
 			if (rounddown2(pa ^ (pa + ptoa(npages) - 1),
 			    boundary) != 0) {
 				m_inc = atop(roundup2(pa, boundary) - pa);
 				continue;
 			}
 		} else
 			KASSERT(m_run != NULL, ("m_run == NULL"));
 
 		/*
 		 * Make "m_mtx" the lock for the current page, then default to
 		 * a single-page step; the branches below may enlarge it.
 		 */
 		vm_page_change_lock(m, &m_mtx);
 		m_inc = 1;
 retry:
 		/* "run_ext" is the number of pages this step adds to the run. */
 		if (m->wire_count != 0 || m->hold_count != 0)
 			run_ext = 0;
 #if VM_NRESERVLEVEL > 0
 		else if ((level = vm_reserv_level(m)) >= 0 &&
 		    (options & VPSC_NORESERV) != 0) {
 			run_ext = 0;
 			/* Advance to the end of the reservation. */
 			pa = VM_PAGE_TO_PHYS(m);
 			m_inc = atop(roundup2(pa + 1, vm_reserv_size(level)) -
 			    pa);
 		}
 #endif
 		else if ((object = m->object) != NULL) {
 			/*
 			 * The page is considered eligible for relocation if
 			 * and only if it could be laundered or reclaimed by
 			 * the page daemon.
 			 */
 			/*
 			 * Only a try-lock is attempted while the page lock is
 			 * held.  On failure the page lock is dropped, the
 			 * object lock is acquired by blocking, the page lock
 			 * is retaken, and the page's state is rechecked
 			 * because it may have changed in the meantime.
 			 */
 			if (!VM_OBJECT_TRYRLOCK(object)) {
 				mtx_unlock(m_mtx);
 				VM_OBJECT_RLOCK(object);
 				mtx_lock(m_mtx);
 				if (m->object != object) {
 					/*
 					 * The page may have been freed.
 					 */
 					VM_OBJECT_RUNLOCK(object);
 					goto retry;
 				} else if (m->wire_count != 0 ||
 				    m->hold_count != 0) {
 					run_ext = 0;
 					goto unlock;
 				}
 			}
 			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 			    ("page %p is PG_UNHOLDFREE", m));
 			/* Don't care: PG_NODUMP, PG_ZERO. */
 			if (object->type != OBJT_DEFAULT &&
 			    object->type != OBJT_SWAP &&
 			    object->type != OBJT_VNODE) {
 				run_ext = 0;
 #if VM_NRESERVLEVEL > 0
 			} else if ((options & VPSC_NOSUPER) != 0 &&
 			    (level = vm_reserv_level_iffullpop(m)) >= 0) {
 				run_ext = 0;
 				/* Advance to the end of the superpage. */
 				pa = VM_PAGE_TO_PHYS(m);
 				m_inc = atop(roundup2(pa + 1,
 				    vm_reserv_size(level)) - pa);
 #endif
 			} else if (object->memattr == VM_MEMATTR_DEFAULT &&
 			    m->queue != PQ_NONE && !vm_page_busied(m)) {
 				/*
 				 * The page is allocated but eligible for
 				 * relocation.  Extend the current run by one
 				 * page.
 				 */
 				KASSERT(pmap_page_get_memattr(m) ==
 				    VM_MEMATTR_DEFAULT,
 				    ("page %p has an unexpected memattr", m));
 				KASSERT((m->oflags & (VPO_SWAPINPROG |
 				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 				    ("page %p has unexpected oflags", m));
 				/* Don't care: VPO_NOSYNC. */
 				run_ext = 1;
 			} else
 				run_ext = 0;
 unlock:
 			VM_OBJECT_RUNLOCK(object);
 #if VM_NRESERVLEVEL > 0
 		} else if (level >= 0) {
 			/*
 			 * The page is reserved but not yet allocated.  In
 			 * other words, it is still free.  Extend the current
 			 * run by one page.
 			 */
 			run_ext = 1;
 #endif
 		} else if ((order = m->order) < VM_NFREEORDER) {
 			/*
 			 * The page is enqueued in the physical memory
 			 * allocator's free page queues.  Moreover, it is the
 			 * first page in a power-of-two-sized run of
 			 * contiguous free pages.  Add these pages to the end
 			 * of the current run, and jump ahead.
 			 */
 			run_ext = 1 << order;
 			m_inc = 1 << order;
 		} else {
 			/*
 			 * Skip the page for one of the following reasons: (1)
 			 * It is enqueued in the physical memory allocator's
 			 * free page queues.  However, it is not the first
 			 * page in a run of contiguous free pages.  (This case
 			 * rarely occurs because the scan is performed in
 			 * ascending order.) (2) It is not reserved, and it is
 			 * transitioning from free to allocated.  (Conversely,
 			 * the transition from allocated to free for managed
 			 * pages is blocked by the page lock.) (3) It is
 			 * allocated but not contained by an object and not
 			 * wired, e.g., allocated by Xen's balloon driver.
 			 */
 			run_ext = 0;
 		}
 
 		/*
 		 * Extend or reset the current run of pages.
 		 */
 		if (run_ext > 0) {
 			if (run_len == 0)
 				m_run = m;
 			run_len += run_ext;
 		} else {
 			if (run_len > 0) {
 				m_run = NULL;
 				run_len = 0;
 			}
 		}
 	}
 	/* Release whichever page lock is still held from the scan. */
 	if (m_mtx != NULL)
 		mtx_unlock(m_mtx);
 	if (run_len >= npages)
 		return (m_run);
 	return (NULL);
 }
 
 /*
  *	vm_page_reclaim_run:
  *
  *	Try to relocate each of the allocated virtual pages within the
  *	specified run of physical pages to a new physical address.  Free the
  *	physical pages underlying the relocated virtual pages.  A virtual page
  *	is relocatable if and only if it could be laundered or reclaimed by
  *	the page daemon.  Whenever possible, a virtual page is relocated to a
  *	physical address above "high".
  *
  *	Returns 0 if every physical page within the run was already free or
  *	just freed by a successful relocation.  Otherwise, returns a non-zero
  *	value indicating why the last attempt to relocate a virtual page was
  *	unsuccessful:
  *	EBUSY	the page is wired, held, busied, or not on a paging queue
  *	EINVAL	the page's object has an unsupported type or memory
  *		attribute, or the page has no object and is not free
  *	ENOMEM	no replacement page could be allocated
  *
  *	The scan stops at the first failure.  Pages that were already
  *	relocated before the failure are still freed.
  *
  *	"req_class" must be an allocation class.  "m_run" is the first page
  *	of the run and "npages" its length.
  */
 static int
 vm_page_reclaim_run(int req_class, u_long npages, vm_page_t m_run,
     vm_paddr_t high)
 {
 	struct mtx *m_mtx;
 	struct spglist free;
 	vm_object_t object;
 	vm_paddr_t pa;
 	vm_page_t m, m_end, m_new;
 	int error, order, req;
 
 	KASSERT((req_class & VM_ALLOC_CLASS_MASK) == req_class,
 	    ("req_class is not an allocation class"));
 	/* "free" collects the vacated pages; they are freed after the loop. */
 	SLIST_INIT(&free);
 	error = 0;
 	m = m_run;
 	m_end = m_run + npages;
 	m_mtx = NULL;
 	for (; error == 0 && m < m_end; m++) {
 		KASSERT((m->flags & (PG_FICTITIOUS | PG_MARKER)) == 0,
 		    ("page %p is PG_FICTITIOUS or PG_MARKER", m));
 
 		/*
 		 * Avoid releasing and reacquiring the same page lock.
 		 */
 		vm_page_change_lock(m, &m_mtx);
 retry:
 		if (m->wire_count != 0 || m->hold_count != 0)
 			error = EBUSY;
 		else if ((object = m->object) != NULL) {
 			/*
 			 * The page is relocated if and only if it could be
 			 * laundered or reclaimed by the page daemon.
 			 */
 			/*
 			 * Only a try-lock is attempted while the page lock is
 			 * held.  On failure, drop the page lock, block on the
 			 * object lock, retake the page lock, and recheck the
 			 * page because its state may have changed.
 			 */
 			if (!VM_OBJECT_TRYWLOCK(object)) {
 				mtx_unlock(m_mtx);
 				VM_OBJECT_WLOCK(object);
 				mtx_lock(m_mtx);
 				if (m->object != object) {
 					/*
 					 * The page may have been freed.
 					 */
 					VM_OBJECT_WUNLOCK(object);
 					goto retry;
 				} else if (m->wire_count != 0 ||
 				    m->hold_count != 0) {
 					error = EBUSY;
 					goto unlock;
 				}
 			}
 			KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 			    ("page %p is PG_UNHOLDFREE", m));
 			/* Don't care: PG_NODUMP, PG_ZERO. */
 			if (object->type != OBJT_DEFAULT &&
 			    object->type != OBJT_SWAP &&
 			    object->type != OBJT_VNODE)
 				error = EINVAL;
 			else if (object->memattr != VM_MEMATTR_DEFAULT)
 				error = EINVAL;
 			else if (m->queue != PQ_NONE && !vm_page_busied(m)) {
 				KASSERT(pmap_page_get_memattr(m) ==
 				    VM_MEMATTR_DEFAULT,
 				    ("page %p has an unexpected memattr", m));
 				KASSERT((m->oflags & (VPO_SWAPINPROG |
 				    VPO_SWAPSLEEP | VPO_UNMANAGED)) == 0,
 				    ("page %p has unexpected oflags", m));
 				/* Don't care: VPO_NOSYNC. */
 				if (m->valid != 0) {
 					/*
 					 * First, try to allocate a new page
 					 * that is above "high".  Failing
 					 * that, try to allocate a new page
 					 * that is below "m_run".  Allocate
 					 * the new page between the end of
 					 * "m_run" and "high" only as a last
 					 * resort.
 					 */
 					req = req_class | VM_ALLOC_NOOBJ;
 					if ((m->flags & PG_NODUMP) != 0)
 						req |= VM_ALLOC_NODUMP;
 					if (trunc_page(high) !=
 					    ~(vm_paddr_t)PAGE_MASK) {
 						m_new = vm_page_alloc_contig(
 						    NULL, 0, req, 1,
 						    round_page(high),
 						    ~(vm_paddr_t)0,
 						    PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					} else
 						m_new = NULL;
 					if (m_new == NULL) {
 						pa = VM_PAGE_TO_PHYS(m_run);
 						m_new = vm_page_alloc_contig(
 						    NULL, 0, req, 1,
 						    0, pa - 1, PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					}
 					if (m_new == NULL) {
 						/*
 						 * "pa" was set by the previous
 						 * attempt; move it to the end
 						 * of the run.
 						 */
 						pa += ptoa(npages);
 						m_new = vm_page_alloc_contig(
 						    NULL, 0, req, 1,
 						    pa, high, PAGE_SIZE, 0,
 						    VM_MEMATTR_DEFAULT);
 					}
 					if (m_new == NULL) {
 						error = ENOMEM;
 						goto unlock;
 					}
 					/*
 					 * These assertions are about the new
 					 * page, so report "m_new".
 					 */
 					KASSERT(m_new->wire_count == 0,
 					    ("page %p is wired", m_new));
 
 					/*
 					 * Replace "m" with the new page.  For
 					 * vm_page_replace(), "m" must be busy
 					 * and dequeued.  Finally, change "m"
 					 * as if vm_page_free() was called.
 					 */
 					if (object->ref_count != 0)
 						pmap_remove_all(m);
 					m_new->aflags = m->aflags;
 					KASSERT(m_new->oflags == VPO_UNMANAGED,
 					    ("page %p is managed", m_new));
 					m_new->oflags = m->oflags & VPO_NOSYNC;
 					pmap_copy_page(m, m_new);
 					m_new->valid = m->valid;
 					m_new->dirty = m->dirty;
 					m->flags &= ~PG_ZERO;
 					vm_page_xbusy(m);
 					vm_page_remque(m);
 					vm_page_replace_checked(m_new, object,
 					    m->pindex, m);
 					m->valid = 0;
 					vm_page_undirty(m);
 
 					/*
 					 * The new page must be deactivated
 					 * before the object is unlocked.
 					 */
 					vm_page_change_lock(m_new, &m_mtx);
 					vm_page_deactivate(m_new);
 				} else {
 					/*
 					 * The page holds no valid data, so
 					 * there is nothing to copy; just
 					 * detach it from its queue and object.
 					 */
 					m->flags &= ~PG_ZERO;
 					vm_page_remque(m);
 					vm_page_remove(m);
 					KASSERT(m->dirty == 0,
 					    ("page %p is dirty", m));
 				}
 				SLIST_INSERT_HEAD(&free, m, plinks.s.ss);
 			} else
 				error = EBUSY;
 unlock:
 			VM_OBJECT_WUNLOCK(object);
 		} else {
 			mtx_lock(&vm_page_queue_free_mtx);
 			order = m->order;
 			if (order < VM_NFREEORDER) {
 				/*
 				 * The page is enqueued in the physical memory
 				 * allocator's free page queues.  Moreover, it
 				 * is the first page in a power-of-two-sized
 				 * run of contiguous free pages.  Jump ahead
 				 * to the last page within that run, and
 				 * continue from there.
 				 */
 				m += (1 << order) - 1;
 			}
 #if VM_NRESERVLEVEL > 0
 			else if (vm_reserv_is_page_free(m))
 				order = 0;
 #endif
 			mtx_unlock(&vm_page_queue_free_mtx);
 			/*
 			 * "order" still equal to VM_NFREEORDER means that the
 			 * page was found neither in the free queues nor free
 			 * within a reservation.
 			 */
 			if (order == VM_NFREEORDER)
 				error = EINVAL;
 		}
 	}
 	if (m_mtx != NULL)
 		mtx_unlock(m_mtx);
 	/* Return every vacated page to the free lists in one locked pass. */
 	if ((m = SLIST_FIRST(&free)) != NULL) {
 		mtx_lock(&vm_page_queue_free_mtx);
 		do {
 			SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 			vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 			if (!vm_reserv_free_page(m))
 #else
 			if (true)
 #endif
 				vm_phys_free_pages(m, 0);
 		} while ((m = SLIST_FIRST(&free)) != NULL);
 		vm_page_zero_idle_wakeup();
 		vm_page_free_wakeup();
 		mtx_unlock(&vm_page_queue_free_mtx);
 	}
 	return (error);
 }
 
 /*
  * Number of candidate runs that vm_page_reclaim_contig() remembers per
  * scan.  It must be a power of two because RUN_INDEX() masks with
  * NRUNS - 1.
  */
 #define	NRUNS	16
 
 CTASSERT(powerof2(NRUNS));
 
 /* Map a running count of recorded runs to a slot in the NRUNS-entry array. */
 #define	RUN_INDEX(count)	((count) & (NRUNS - 1))
 
 /*
  * vm_page_reclaim_contig() returns success as soon as at least this many
  * pages have been reclaimed.
  */
 #define	MIN_RECLAIM	8
 
 /*
  *	vm_page_reclaim_contig:
  *
  *	Reclaim allocated, contiguous physical memory satisfying the specified
  *	conditions by relocating the virtual pages using that physical memory.
  *	Returns true if reclamation is successful and false otherwise.  Since
  *	relocation requires the allocation of physical pages, reclamation may
  *	fail due to a shortage of free pages.  When reclamation fails, callers
  *	are expected to perform VM_WAIT before retrying a failed allocation
  *	operation, e.g., vm_page_alloc_contig().
  *
  *	The caller must always specify an allocation class through "req".
  *
  *	allocation classes:
  *	VM_ALLOC_NORMAL		normal process request
  *	VM_ALLOC_SYSTEM		system *really* needs a page
  *	VM_ALLOC_INTERRUPT	interrupt time request
  *
  *	The optional allocation flags are ignored.
  *
  *	"npages" must be greater than zero.  Both "alignment" and "boundary"
  *	must be a power of two.
  *
  *	"low" and "high" bound the physical address range that is scanned.
  */
 bool
 vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low, vm_paddr_t high,
     u_long alignment, vm_paddr_t boundary)
 {
 	vm_paddr_t curr_low;
 	vm_page_t m_run, m_runs[NRUNS];
 	u_long count, reclaimed;
 	int error, i, options, req_class;
 
 	KASSERT(npages > 0, ("npages is 0"));
 	KASSERT(powerof2(alignment), ("alignment is not a power of 2"));
 	KASSERT(powerof2(boundary), ("boundary is not a power of 2"));
 	req_class = req & VM_ALLOC_CLASS_MASK;
 
 	/*
 	 * The page daemon is allowed to dig deeper into the free page list.
 	 */
 	if (curproc == pageproc && req_class != VM_ALLOC_INTERRUPT)
 		req_class = VM_ALLOC_SYSTEM;
 
 	/*
 	 * Return if the number of free pages cannot satisfy the requested
 	 * allocation.
 	 */
 	count = vm_cnt.v_free_count;
 	if (count < npages + vm_cnt.v_free_reserved || (count < npages +
 	    vm_cnt.v_interrupt_free_min && req_class == VM_ALLOC_SYSTEM) ||
 	    (count < npages && req_class == VM_ALLOC_INTERRUPT))
 		return (false);
 
 	/*
 	 * Scan up to three times, relaxing the restrictions ("options") on
 	 * the reclamation of reservations and superpages each time.
 	 */
 	for (options = VPSC_NORESERV;;) {
 		/*
 		 * Find the highest runs that satisfy the given constraints
 		 * and restrictions, and record them in "m_runs".
 		 */
 		/*
 		 * Each scan restarts just past the previous run, so runs are
 		 * found in ascending order.  "m_runs" is used as a ring, so
 		 * once more than NRUNS runs are found the oldest (lowest)
 		 * entries are overwritten.
 		 */
 		curr_low = low;
 		count = 0;
 		for (;;) {
 			m_run = vm_phys_scan_contig(npages, curr_low, high,
 			    alignment, boundary, options);
 			if (m_run == NULL)
 				break;
 			curr_low = VM_PAGE_TO_PHYS(m_run) + ptoa(npages);
 			m_runs[RUN_INDEX(count)] = m_run;
 			count++;
 		}
 
 		/*
 		 * Reclaim the highest runs in LIFO (descending) order until
 		 * the number of reclaimed pages, "reclaimed", is at least
 		 * MIN_RECLAIM.  Reset "reclaimed" each time because each
 		 * reclamation is idempotent, and runs will (likely) recur
 		 * from one scan to the next as restrictions are relaxed.
 		 */
 		reclaimed = 0;
 		/* "i" caps the walk at the NRUNS entries the ring can hold. */
 		for (i = 0; count > 0 && i < NRUNS; i++) {
 			count--;
 			m_run = m_runs[RUN_INDEX(count)];
 			error = vm_page_reclaim_run(req_class, npages, m_run,
 			    high);
 			if (error == 0) {
 				reclaimed += npages;
 				if (reclaimed >= MIN_RECLAIM)
 					return (true);
 			}
 		}
 
 		/*
 		 * Either relax the restrictions on the next scan or return if
 		 * the last scan had no restrictions.
 		 */
 		if (options == VPSC_NORESERV)
 			options = VPSC_NOSUPER;
 		else if (options == VPSC_NOSUPER)
 			options = VPSC_ANY;
 		else if (options == VPSC_ANY)
 			return (reclaimed != 0);
 	}
 }
 
 /*
  *	vm_wait:	(also see VM_WAIT macro)
  *
  *	Block the caller until free pages become available.
  *	- Used in many places ahead of a memory allocation.
  *	- The page daemon itself sleeps on a separate channel.
  */
 void
 vm_wait(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 	if (curproc == pageproc) {
 		/* The page daemon waits on its own wakeup channel. */
 		vm_pageout_pages_needed = 1;
 		msleep(&vm_pageout_pages_needed, &vm_page_queue_free_mtx,
 		    PDROP | PSWP, "VMWait", 0);
 		return;
 	}
 
 	if (__predict_false(pageproc == NULL))
 		panic("vm_wait in early boot");
 
 	/* Make sure the page daemon has been asked to run. */
 	if (!vm_pageout_wanted) {
 		vm_pageout_wanted = true;
 		wakeup(&vm_pageout_wanted);
 	}
 
 	/* PDROP: the free queue mutex is released by msleep(). */
 	vm_pages_needed = true;
 	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx, PDROP | PVM,
 	    "vmwait", 0);
 }
 
 /*
  *	vm_waitpfault:	(also see VM_WAITPFAULT macro)
  *
  *	Block the caller until free pages become available.
  *	- Meant for vm_fault only, which makes faulting processes easy
  *	  to identify.
  *	- The sleep priority is lower than the one used by vm_wait(), so
  *	  vm_wait()ing processes get memory first.  Do not change this
  *	  balance without careful testing first.
  */
 void
 vm_waitpfault(void)
 {
 
 	mtx_lock(&vm_page_queue_free_mtx);
 
 	/* Make sure the page daemon has been asked to run. */
 	if (!vm_pageout_wanted) {
 		vm_pageout_wanted = true;
 		wakeup(&vm_pageout_wanted);
 	}
 
 	/* PDROP: the free queue mutex is released by msleep(). */
 	vm_pages_needed = true;
 	msleep(&vm_cnt.v_free_count, &vm_page_queue_free_mtx,
 	    PDROP | PUSER, "pfault", 0);
 }
 
 /*
  * Return the page queue structure that corresponds to the page's current
  * "queue" value.  Pages in the laundry always use domain 0's queues; all
  * other pages use the queues of their own physical domain.
  */
 struct vm_pagequeue *
 vm_page_pagequeue(vm_page_t m)
 {
 	struct vm_pagequeue *pq;
 
 	pq = vm_page_in_laundry(m) ?
 	    &vm_dom[0].vmd_pagequeues[m->queue] :
 	    &vm_phys_domain(m)->vmd_pagequeues[m->queue];
 	return (pq);
 }
 
 /*
  *	vm_page_dequeue:
  *
  *	Take the given page off the page queue it is currently on.  The
  *	page queue lock is acquired and released internally.
  *
  *	The page must be locked.
  */
 void
 vm_page_dequeue(vm_page_t m)
 {
 	struct vm_pagequeue *pagequeue;
 
 	vm_page_assert_locked(m);
 	KASSERT(m->queue < PQ_COUNT, ("vm_page_dequeue: page %p is not queued",
 	    m));
 
 	/* Look up the queue before "m->queue" is reset below. */
 	pagequeue = vm_page_pagequeue(m);
 
 	vm_pagequeue_lock(pagequeue);
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pagequeue->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pagequeue);
 	vm_pagequeue_unlock(pagequeue);
 }
 
 /*
  *	vm_page_dequeue_locked:
  *
  *	Take the given page off the page queue it is currently on, for
  *	callers that already hold the page queue lock.
  *
  *	The page and page queue must be locked.
  */
 void
 vm_page_dequeue_locked(vm_page_t m)
 {
 	struct vm_pagequeue *pagequeue;
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	/* Look up the queue before "m->queue" is reset below. */
 	pagequeue = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pagequeue);
 
 	m->queue = PQ_NONE;
 	TAILQ_REMOVE(&pagequeue->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_dec(pagequeue);
 }
 
 /*
  *	vm_page_enqueue:
  *
  *	Append the given page to the tail of the requested page queue.
  *	The laundry queue always lives in domain 0; every other queue is
  *	taken from the page's own physical domain.
  *
  *	The page must be locked.
  */
 static void
 vm_page_enqueue(uint8_t queue, vm_page_t m)
 {
 	struct vm_pagequeue *pagequeue;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(queue < PQ_COUNT,
 	    ("vm_page_enqueue: invalid queue %u request for page %p",
 	    queue, m));
 
 	pagequeue = (queue == PQ_LAUNDRY) ?
 	    &vm_dom[0].vmd_pagequeues[queue] :
 	    &vm_phys_domain(m)->vmd_pagequeues[queue];
 
 	vm_pagequeue_lock(pagequeue);
 	m->queue = queue;
 	TAILQ_INSERT_TAIL(&pagequeue->pq_pl, m, plinks.q);
 	vm_pagequeue_cnt_inc(pagequeue);
 	vm_pagequeue_unlock(pagequeue);
 }
 
 /*
  *	vm_page_requeue:
  *
  *	Reposition the given page at the tail of the page queue it is
  *	already on.  The page queue lock is acquired and released
  *	internally; the queue's page count is not changed.
  *
  *	The page must be locked.
  */
 void
 vm_page_requeue(vm_page_t m)
 {
 	struct vm_pagequeue *pagequeue;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue: page %p is not queued", m));
 
 	pagequeue = vm_page_pagequeue(m);
 
 	vm_pagequeue_lock(pagequeue);
 	TAILQ_REMOVE(&pagequeue->pq_pl, m, plinks.q);
 	TAILQ_INSERT_TAIL(&pagequeue->pq_pl, m, plinks.q);
 	vm_pagequeue_unlock(pagequeue);
 }
 
 /*
  *	vm_page_requeue_locked:
  *
  *	Reposition the given page at the tail of the page queue it is
  *	already on, for callers that already hold the page queue lock.
  *
  *	The page queue must be locked.
  */
 void
 vm_page_requeue_locked(vm_page_t m)
 {
 	struct vm_pagequeue *pagequeue;
 
 	KASSERT(m->queue != PQ_NONE,
 	    ("vm_page_requeue_locked: page %p is not queued", m));
 
 	pagequeue = vm_page_pagequeue(m);
 	vm_pagequeue_assert_locked(pagequeue);
 
 	TAILQ_REMOVE(&pagequeue->pq_pl, m, plinks.q);
 	TAILQ_INSERT_TAIL(&pagequeue->pq_pl, m, plinks.q);
 }
 
 /*
  *	vm_page_activate:
  *
  *	Move the specified page to the active queue (if appropriate).
  *	"act_count" is raised to ACT_INIT when it is lower, and is left
  *	alone otherwise.  Wired or unmanaged pages are not queued.
  *
  *	The page must be locked.
  */
 void
 vm_page_activate(vm_page_t m)
 {
 	int oldq;
 
 	vm_page_lock_assert(m, MA_OWNED);
 	oldq = m->queue;
 
 	/* A wired or unmanaged page must not sit on any queue. */
 	if (oldq != PQ_ACTIVE &&
 	    (m->wire_count != 0 || (m->oflags & VPO_UNMANAGED) != 0)) {
 		KASSERT(oldq == PQ_NONE,
 		    ("vm_page_activate: wired page %p is queued", m));
 		return;
 	}
 
 	if (m->act_count < ACT_INIT)
 		m->act_count = ACT_INIT;
 
 	/* Already active: only the activity count needed attention. */
 	if (oldq == PQ_ACTIVE)
 		return;
 
 	if (oldq != PQ_NONE)
 		vm_page_dequeue(m);
 	vm_page_enqueue(PQ_ACTIVE, m);
 }
 
 /*
  *	vm_page_free_wakeup:
  *
  *	Helper for the routines that add pages to the free queues, such
  *	as vm_page_free_toq().  It wakes up whoever is sleeping for free
  *	pages once enough of them are available.
  *
  *	The free page queue mutex must be held.
  */
 static inline void
 vm_page_free_wakeup(void)
 {
 
 	mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
 
 	/*
 	 * The pageout daemon asked for pages: let it know once the free
 	 * count has reached its minimum.
 	 */
 	if (vm_pageout_pages_needed != 0 &&
 	    vm_cnt.v_free_count >= vm_cnt.v_pageout_free_min) {
 		wakeup(&vm_pageout_pages_needed);
 		vm_pageout_pages_needed = 0;
 	}
 
 	/*
 	 * Threads sleeping in vm_wait() or vm_waitpfault() are woken up
 	 * once the free count is no longer at the minimum.
 	 */
 	if (vm_pages_needed && !vm_page_count_min()) {
 		vm_pages_needed = false;
 		wakeup(&vm_cnt.v_free_count);
 	}
 }
 
 /*
  *	vm_page_free_toq:
  *
  *	Give the page back to the free list and break its association
  *	with any VM object.  A page that is still held is not freed here;
  *	it is only tagged PG_UNHOLDFREE.
  *
  *	The object must be locked.  The page must be locked if it is managed.
  */
 void
 vm_page_free_toq(vm_page_t m)
 {
 
 	if ((m->oflags & VPO_UNMANAGED) != 0) {
 		KASSERT(m->queue == PQ_NONE,
 		    ("vm_page_free_toq: unmanaged page %p is queued", m));
 	} else {
 		vm_page_lock_assert(m, MA_OWNED);
 		KASSERT(!pmap_page_is_mapped(m),
 		    ("vm_page_free_toq: freeing mapped page %p", m));
 	}
 	PCPU_INC(cnt.v_tfree);
 
 	if (vm_page_sbusied(m))
 		panic("vm_page_free: freeing busy page %p", m);
 
 	/*
 	 * Unqueue, then remove page.  Note that we cannot destroy
 	 * the page here because we do not want to call the pager's
 	 * callback routine until after we've put the page on the
 	 * appropriate free queue.
 	 */
 	vm_page_remque(m);
 	vm_page_remove(m);
 
 	/*
 	 * Nothing more is done for a fictitious page.
 	 */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	m->valid = 0;
 	vm_page_undirty(m);
 
 	if (m->wire_count != 0)
 		panic("vm_page_free: freeing wired page %p", m);
 
 	/*
 	 * A held page cannot go onto the free queues yet; tag it and stop.
 	 */
 	if (m->hold_count != 0) {
 		m->flags &= ~PG_ZERO;
 		KASSERT((m->flags & PG_UNHOLDFREE) == 0,
 		    ("vm_page_free: freeing PG_UNHOLDFREE page %p", m));
 		m->flags |= PG_UNHOLDFREE;
 		return;
 	}
 
 	/*
 	 * Restore the default memory attribute to the page.
 	 */
 	if (pmap_page_get_memattr(m) != VM_MEMATTR_DEFAULT)
 		pmap_page_set_memattr(m, VM_MEMATTR_DEFAULT);
 
 	/*
 	 * Insert the page into the physical memory allocator's free
 	 * page queues.
 	 */
 	mtx_lock(&vm_page_queue_free_mtx);
 	vm_phys_freecnt_adj(m, 1);
 #if VM_NRESERVLEVEL > 0
 	if (!vm_reserv_free_page(m))
 #else
 	if (TRUE)
 #endif
 		vm_phys_free_pages(m, 0);
 	if ((m->flags & PG_ZERO) != 0)
 		++vm_page_zero_count;
 	else
 		vm_page_zero_idle_wakeup();
 	vm_page_free_wakeup();
 	mtx_unlock(&vm_page_queue_free_mtx);
 }
 
 /*
  *	vm_page_wire:
  *
  *	Add one wiring to the page.  On the first wiring the page is
  *	taken off its paging queue, if it is on one, and the global
  *	wired page count is incremented.
  *
  *	If the page is fictitious, then its wire count must remain one.
  *
  *	The page must be locked.
  */
 void
 vm_page_wire(vm_page_t m)
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
 
 	/* Fictitious pages keep a constant wire count of one. */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 		    ("vm_page_wire: fictitious page %p's wire count isn't one",
 		    m));
 		return;
 	}
 
 	/*
 	 * The wire statistics change only on the first wiring, which is
 	 * also the only time the page may need to leave a queue.  An
 	 * unmanaged page is expected to be off the queues already.
 	 */
 	if (m->wire_count == 0) {
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0 ||
 		    m->queue == PQ_NONE,
 		    ("vm_page_wire: unmanaged page %p is queued", m));
 		vm_page_remque(m);
 		atomic_add_int(&vm_cnt.v_wire_count, 1);
 	}
 
 	m->wire_count++;
 	KASSERT(m->wire_count != 0, ("vm_page_wire: wire_count overflow m=%p", m));
 }
 
 /*
  * vm_page_unwire:
  *
  * Drop one wiring of the specified page, which may make it eligible for
  * page out again.  TRUE is returned when this was the last wiring and
  * FALSE otherwise.
  *
  * Only managed pages belonging to an object can be paged out.  When the
  * last wiring goes away and the page is eligible for page out, the page is
  * put on the specified paging queue (unless PQ_NONE is specified).
  *
  * If a page is fictitious, then its wire count must always be one.
  *
  * A managed page must be locked.
  */
 boolean_t
 vm_page_unwire(vm_page_t m, uint8_t queue)
 {
 
 	KASSERT(queue < PQ_COUNT || queue == PQ_NONE,
 	    ("vm_page_unwire: invalid queue %u request for page %p",
 	    queue, m));
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		vm_page_assert_locked(m);
 
 	/* Fictitious pages keep a constant wire count of one. */
 	if ((m->flags & PG_FICTITIOUS) != 0) {
 		KASSERT(m->wire_count == 1,
 	    ("vm_page_unwire: fictitious page %p's wire count isn't one", m));
 		return (FALSE);
 	}
 
 	if (m->wire_count > 0) {
 		m->wire_count--;
 		if (m->wire_count != 0)
 			return (FALSE);
 
 		/* That was the last wiring. */
 		atomic_subtract_int(&vm_cnt.v_wire_count, 1);
 		if ((m->oflags & VPO_UNMANAGED) == 0 &&
 		    m->object != NULL && queue != PQ_NONE)
 			vm_page_enqueue(queue, m);
 		return (TRUE);
 	}
 
 	/* Unwiring a page that is not wired is a fatal error. */
 	panic("vm_page_unwire: page %p's wire count is zero", m);
 }
 
 /*
  * Place the specified page on the inactive queue.
  *
  * With "noreuse" FALSE, the page goes to the tail of the queue, which
  * gives LRU ordering.  With "noreuse" TRUE, the page is inserted just
  * before the domain's "vmd_inacthead" marker instead, so that it is
  * reclaimed sooner; the page is not unmapped from any address space.
  *
  * Wired and unmanaged pages are left untouched.
  *
  * The page must be locked.
  */
 static inline void
 _vm_page_deactivate(vm_page_t m, boolean_t noreuse)
 {
 	struct vm_pagequeue *inactq;
 	int oldq;
 
 	vm_page_assert_locked(m);
 	oldq = m->queue;
 
 	/*
 	 * Nothing to do for a page that is already inactive, unless the
 	 * caller wants it moved toward the head of the queue.
 	 */
 	if (oldq == PQ_INACTIVE && !noreuse)
 		return;
 	if (m->wire_count != 0 || (m->oflags & VPO_UNMANAGED) != 0)
 		return;
 
 	inactq = &vm_phys_domain(m)->vmd_pagequeues[PQ_INACTIVE];
 
 	/*
 	 * Leave this block with the inactive queue locked and the page
 	 * off every queue.  Avoid multiple acquisitions of the inactive
 	 * queue lock when the page is already on that queue.
 	 */
 	if (oldq == PQ_INACTIVE) {
 		vm_pagequeue_lock(inactq);
 		vm_page_dequeue_locked(m);
 	} else {
 		if (oldq != PQ_NONE)
 			vm_page_dequeue(m);
 		vm_pagequeue_lock(inactq);
 	}
 
 	m->queue = PQ_INACTIVE;
 	if (noreuse) {
 		TAILQ_INSERT_BEFORE(&vm_phys_domain(m)->vmd_inacthead,
 		    m, plinks.q);
 	} else {
 		TAILQ_INSERT_TAIL(&inactq->pq_pl, m, plinks.q);
 	}
 	vm_pagequeue_cnt_inc(inactq);
 	vm_pagequeue_unlock(inactq);
 }
 
 /*
  * Move the specified page to the inactive queue.
  *
  * This is _vm_page_deactivate() with "noreuse" set to FALSE, i.e., the
  * page is inserted at the tail of the inactive queue.
  *
  * The page must be locked.
  */
 void
 vm_page_deactivate(vm_page_t m)
 {
 
 	_vm_page_deactivate(m, FALSE);
 }
 
 /*
  * Move the specified page to the inactive queue with the expectation
  * that it is unlikely to be reused.
  *
  * This is _vm_page_deactivate() with "noreuse" set to TRUE, i.e., the
  * page is inserted before the inactive queue's marker rather than at the
  * tail.
  *
  * The page must be locked.
  */
 void
 vm_page_deactivate_noreuse(vm_page_t m)
 {
 
 	_vm_page_deactivate(m, TRUE);
 }
 
 /*
  * vm_page_launder
  *
  * 	Move a page to the laundry queue, unless it is already there.
  * 	Wired and unmanaged pages are not queued.
  *
  * 	The page must be locked.
  */
 void
 vm_page_launder(vm_page_t m)
 {
 	int oldq;
 
 	vm_page_assert_locked(m);
 	oldq = m->queue;
 	if (oldq == PQ_LAUNDRY)
 		return;
 
 	/* A wired or unmanaged page must not sit on any queue. */
 	if (m->wire_count != 0 || (m->oflags & VPO_UNMANAGED) != 0) {
 		KASSERT(oldq == PQ_NONE,
 		    ("wired page %p is queued", m));
 		return;
 	}
 
 	if (oldq != PQ_NONE)
 		vm_page_dequeue(m);
 	vm_page_enqueue(PQ_LAUNDRY, m);
 }
 
 /*
  * vm_page_try_to_free()
  *
  *	Free the page if that is possible, and otherwise leave it alone.
  *	A page that is dirty, held, wired, unmanaged or busied is not
  *	freed.  Returns 1 if the page was freed and 0 if it was not.
  *
  *	The page must be locked, and so must its object, if it has one.
  */
 int
 vm_page_try_to_free(vm_page_t m)
 {
 
 	vm_page_lock_assert(m, MA_OWNED);
 	if (m->object != NULL)
 		VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	if (m->dirty != 0 || m->hold_count != 0 || m->wire_count != 0 ||
 	    (m->oflags & VPO_UNMANAGED) != 0 || vm_page_busied(m))
 		return (0);
 
 	/*
 	 * The dirty field is tested again after the mappings are removed,
 	 * since it may have been set in the meantime.
 	 */
 	pmap_remove_all(m);
 	if (m->dirty != 0)
 		return (0);
 
 	vm_page_free(m);
 	return (1);
 }
 
 /*
  * vm_page_advise
  *
  * 	Act on a piece of madvise-style advice for the given page.
  * 	MADV_WILLNEED activates the page.  MADV_FREE and MADV_DONTNEED
  * 	make the page an early candidate for reclamation.  Any other
  * 	advice is ignored.
  *
  *	The object and page must be locked.
  */
 void
 vm_page_advise(vm_page_t m, int advice)
 {
 
 	vm_page_assert_locked(m);
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	switch (advice) {
 	case MADV_FREE:
 		/*
 		 * Mark the page clean.  This will allow the page to be freed
 		 * without first paging it out.  MADV_FREE pages are often
 		 * quickly reused by malloc(3), so we do not do anything that
 		 * would result in a page fault on a later access.
 		 */
 		vm_page_undirty(m);
 		break;
 	case MADV_DONTNEED:
 		break;
 	case MADV_WILLNEED:
 		vm_page_activate(m);
 		return;
 	default:
 		return;
 	}
 
 	/*
 	 * Clear any references to the page.  Otherwise, the page daemon will
 	 * immediately reactivate the page.
 	 */
 	vm_page_aflag_clear(m, PGA_REFERENCED);
 
 	/* For MADV_DONTNEED, pick up a modification recorded by the pmap. */
 	if (advice != MADV_FREE && m->dirty == 0 && pmap_is_modified(m))
 		vm_page_dirty(m);
 
 	/*
 	 * Place clean pages near the head of the inactive queue rather than
 	 * the tail, thus defeating the queue's LRU operation and ensuring that
 	 * the page will be reused quickly.  Dirty pages not already in the
 	 * laundry are moved there.
 	 */
 	if (m->dirty != 0)
 		vm_page_launder(m);
 	else
 		vm_page_deactivate_noreuse(m);
 }
 
 /*
  * Look up the page at "pindex" in "object" and return it.  If the page
  * exists but is busy, sleep until its state changes and look it up
  * again.  If the page doesn't exist, allocate it, and zero it when
  * VM_ALLOC_ZERO was requested and the new page is not already zeroed.
  *
  * With VM_ALLOC_NOWAIT, NULL is returned instead of sleeping.
  *
  * This routine may sleep.
  *
  * The object must be locked on entry.  The lock will, however, be released
  * and reacquired if the routine sleeps.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
 {
 	vm_page_t page;
 	int must_wait;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
 	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
 	    ("vm_page_grab: VM_ALLOC_SBUSY/VM_ALLOC_IGN_SBUSY mismatch"));
 
 	for (;;) {
 		page = vm_page_lookup(object, pindex);
 		if (page == NULL) {
 			/* No resident page: allocate one, waiting if needed. */
 			page = vm_page_alloc(object, pindex, allocflags);
 			if (page != NULL)
 				break;
 			if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 				return (NULL);
 			VM_OBJECT_WUNLOCK(object);
 			VM_WAIT;
 			VM_OBJECT_WLOCK(object);
 			continue;
 		}
 
 		/* With VM_ALLOC_IGN_SBUSY only an exclusive busy blocks us. */
 		if ((allocflags & VM_ALLOC_IGN_SBUSY) != 0)
 			must_wait = vm_page_xbusied(page);
 		else
 			must_wait = vm_page_busied(page);
 
 		if (!must_wait) {
 			if ((allocflags & VM_ALLOC_WIRED) != 0) {
 				vm_page_lock(page);
 				vm_page_wire(page);
 				vm_page_unlock(page);
 			}
 			if ((allocflags &
 			    (VM_ALLOC_NOBUSY | VM_ALLOC_SBUSY)) == 0)
 				vm_page_xbusy(page);
 			if ((allocflags & VM_ALLOC_SBUSY) != 0)
 				vm_page_sbusy(page);
 			return (page);
 		}
 
 		if ((allocflags & VM_ALLOC_NOWAIT) != 0)
 			return (NULL);
 
 		/*
 		 * Reference the page before unlocking and
 		 * sleeping so that the page daemon is less
 		 * likely to reclaim it.
 		 */
 		vm_page_aflag_set(page, PGA_REFERENCED);
 		vm_page_lock(page);
 		VM_OBJECT_WUNLOCK(object);
 		vm_page_busy_sleep(page, "pgrbwt", (allocflags &
 		    VM_ALLOC_IGN_SBUSY) != 0);
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/* Only a freshly allocated page reaches this point. */
 	if ((allocflags & VM_ALLOC_ZERO) != 0 && (page->flags & PG_ZERO) == 0)
 		pmap_zero_page(page);
 	return (page);
 }
 
 /*
+ * Return the specified range of pages from the given object.  For each
+ * page offset within the range, if a page already exists within the object
+ * at that offset and it is busy, then wait for it to change state.  If,
+ * instead, the page doesn't exist, then allocate it.
+ *
+ * The pages are stored in ma[0] through ma[count - 1], which correspond
+ * to the page indices pindex through pindex + count - 1.  Nothing is
+ * done if count is zero.
+ *
+ * The caller must always specify an allocation class.
+ *
+ * allocation classes:
+ *	VM_ALLOC_NORMAL		normal process request
+ *	VM_ALLOC_SYSTEM		system *really* needs the pages
+ *
+ * The caller must always specify that the pages are to be busied and/or
+ * wired.
+ *
+ * optional allocation flags:
+ *	VM_ALLOC_IGN_SBUSY	do not sleep on soft busy pages
+ *	VM_ALLOC_NOBUSY		do not exclusive busy the page
+ *	VM_ALLOC_SBUSY		set page to sbusy state
+ *	VM_ALLOC_WIRED		wire the pages
+ *	VM_ALLOC_ZERO		zero and validate any invalid pages
+ *
+ * The caller must not encode a VM_ALLOC_COUNT() value in allocflags; this
+ * routine supplies its own when allocating.
+ *
+ * The object must be write locked on entry.  The lock is released and
+ * reacquired if the routine sleeps.
+ *
+ * This routine may sleep.
+ */
+void
+vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+    vm_page_t *ma, int count)
+{
+	vm_page_t m;
+	int i;
+	bool sleep;
+
+	VM_OBJECT_ASSERT_WLOCKED(object);
+	KASSERT(((u_int)allocflags >> VM_ALLOC_COUNT_SHIFT) == 0,
+	    ("vm_page_grab_pages: VM_ALLOC_COUNT() is not allowed"));
+	KASSERT((allocflags & VM_ALLOC_NOBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_WIRED) != 0,
+	    ("vm_page_grab_pages: the pages must be busied or wired"));
+	KASSERT((allocflags & VM_ALLOC_SBUSY) == 0 ||
+	    (allocflags & VM_ALLOC_IGN_SBUSY) != 0,
+	    ("vm_page_grab_pages: VM_ALLOC_SBUSY/IGN_SBUSY mismatch"));
+	if (count == 0)
+		return;
+	i = 0;
+retrylookup:
+	/*
+	 * (Re)start at the first index not yet stored in ma[]; "i" keeps
+	 * its value across retries, so finished pages are not revisited.
+	 */
+	m = vm_page_lookup(object, pindex + i);
+	for (; i < count; i++) {
+		if (m != NULL) {
+			sleep = (allocflags & VM_ALLOC_IGN_SBUSY) != 0 ?
+			    vm_page_xbusied(m) : vm_page_busied(m);
+			if (sleep) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+				 */
+				vm_page_aflag_set(m, PGA_REFERENCED);
+				vm_page_lock(m);
+				VM_OBJECT_WUNLOCK(object);
+				vm_page_busy_sleep(m, "grbmaw", (allocflags &
+				    VM_ALLOC_IGN_SBUSY) != 0);
+				VM_OBJECT_WLOCK(object);
+				goto retrylookup;
+			}
+			if ((allocflags & VM_ALLOC_WIRED) != 0) {
+				vm_page_lock(m);
+				vm_page_wire(m);
+				vm_page_unlock(m);
+			}
+			if ((allocflags & (VM_ALLOC_NOBUSY |
+			    VM_ALLOC_SBUSY)) == 0)
+				vm_page_xbusy(m);
+			if ((allocflags & VM_ALLOC_SBUSY) != 0)
+				vm_page_sbusy(m);
+		} else {
+			/*
+			 * VM_ALLOC_IGN_SBUSY is stripped before allocating,
+			 * and the number of pages still wanted is passed
+			 * along via VM_ALLOC_COUNT().
+			 */
+			m = vm_page_alloc(object, pindex + i, (allocflags &
+			    ~VM_ALLOC_IGN_SBUSY) | VM_ALLOC_COUNT(count - i));
+			if (m == NULL) {
+				VM_OBJECT_WUNLOCK(object);
+				VM_WAIT;
+				VM_OBJECT_WLOCK(object);
+				goto retrylookup;
+			}
+		}
+		/*
+		 * Unlike vm_page_grab(), VM_ALLOC_ZERO here also marks a
+		 * wholly invalid page as fully valid once it is zero.
+		 */
+		if (m->valid == 0 && (allocflags & VM_ALLOC_ZERO) != 0) {
+			if ((m->flags & PG_ZERO) == 0)
+				pmap_zero_page(m);
+			m->valid = VM_PAGE_BITS_ALL;
+		}
+		ma[i] = m;
+		/*
+		 * NOTE(review): relies on vm_page_next() yielding NULL when
+		 * no page is resident at the following index -- confirm.
+		 */
+		m = vm_page_next(m);
+	}
+}
+
+/*
  * Compute the valid/dirty bitmask that covers a byte range of a page.
  *
  * Each bit of the result stands for one DEV_BSIZE chunk, and every chunk
  * that the range [base, base + size) touches is included.  The range must
  * lie within a single page.  An empty range yields 0.
  */
 vm_page_bits_t
 vm_page_bits(int base, int size)
 {
 	int hi, lo;
 
 	KASSERT(base + size <= PAGE_SIZE,
 	    ("vm_page_bits: illegal base/size %d/%d", base, size));
 
 	/* An empty range covers no chunks at all. */
 	if (size == 0)
 		return (0);
 
 	lo = base >> DEV_BSHIFT;
 	hi = (base + size - 1) >> DEV_BSHIFT;
 
 	/* Bits lo through hi inclusive: (2 << hi) - (1 << lo). */
 	return (((vm_page_bits_t)2 << hi) - ((vm_page_bits_t)1 << lo));
 }
 
 /*
  *	vm_page_set_valid_range:
  *
  *	Sets portions of a page valid.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zeroed.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  *
  *	The page's object must be write locked.  Nothing is done when
  *	size is zero.
  */
 void
 vm_page_set_valid_range(vm_page_t m, int base, int size)
 {
 	int endoff, frag;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 *
 	 * The bit is built from a vm_page_bits_t, not a plain int, so
 	 * that the shift stays well defined for every width of the valid
 	 * field, as in vm_page_set_validclean().
 	 */
 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Assert that no previously invalid block that is now being validated
 	 * is already dirty.
 	 */
 	KASSERT((~m->valid & vm_page_bits(base, size) & m->dirty) == 0,
 	    ("vm_page_set_valid_range: page %p is dirty", m));
 
 	/*
 	 * Set valid bits inclusive of any overlap.
 	 */
 	m->valid |= vm_page_bits(base, size);
 }
 
 /*
  * Clear the given bits from the specified page's dirty field.
  *
  * The page's object must be write locked.  A plain read-modify-write is
  * used when the field cannot be set concurrently by the pmap layer;
  * otherwise the bits are cleared with an atomic operation whose width is
  * selected at compile time from PAGE_SIZE.
  */
 static __inline void
 vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits)
 {
 	uintptr_t addr;
 #if PAGE_SIZE < 16384
 	int shift;
 #endif
 
 	/*
 	 * If the object is locked and the page is neither exclusive busy nor
 	 * write mapped, then the page's dirty field cannot possibly be
 	 * set by a concurrent pmap operation.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && !pmap_page_is_write_mapped(m))
 		m->dirty &= ~pagebits;
 	else {
 		/*
 		 * The pmap layer can call vm_page_dirty() without
 		 * holding a distinguished lock.  The combination of
 		 * the object's lock and an atomic operation suffice
 		 * to guarantee consistency of the page dirty field.
 		 *
 		 * For PAGE_SIZE == 32768 case, compiler already
 		 * properly aligns the dirty field, so no forcible
 		 * alignment is needed. Only require existence of
 		 * atomic_clear_64 when page size is 32768.
 		 */
 		addr = (uintptr_t)&m->dirty;
 #if PAGE_SIZE == 32768
 		atomic_clear_64((uint64_t *)addr, pagebits);
 #elif PAGE_SIZE == 16384
 		atomic_clear_32((uint32_t *)addr, pagebits);
 #else		/* PAGE_SIZE <= 8192 */
 		/*
 		 * Use a trick to perform a 32-bit atomic on the
 		 * containing aligned word, to not depend on the existence
 		 * of atomic_clear_{8, 16}.
 		 *
 		 * "shift" starts as the byte offset of the dirty field
 		 * within that word and is then converted to the bit
 		 * position of the field; on big-endian machines the
 		 * position is counted from the other end of the word.
 		 */
 		shift = addr & (sizeof(uint32_t) - 1);
 #if BYTE_ORDER == BIG_ENDIAN
 		shift = (sizeof(uint32_t) - sizeof(m->dirty) - shift) * NBBY;
 #else
 		shift *= NBBY;
 #endif
 		/* Round the address down to the aligned 32-bit word. */
 		addr &= ~(sizeof(uint32_t) - 1);
 		atomic_clear_32((uint32_t *)addr, pagebits << shift);
 #endif		/* PAGE_SIZE */
 	}
 }
 
 /*
  *	vm_page_set_validclean:
  *
  *	Sets portions of a page valid and clean.  The arguments are expected
  *	to be DEV_BSIZE aligned but if they aren't the bitmap is inclusive
  *	of any partial chunks touched by the range.  The invalid portion of
  *	such chunks will be zero'd.
  *
  *	(base + size) must be less than or equal to PAGE_SIZE.
  *
  *	The page's object must be write locked.  Nothing is done when
  *	size is zero.
  */
 void
 vm_page_set_validclean(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t oldvalid, pagebits;
 	int endoff, frag;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (size == 0)	/* handle degenerate case */
 		return;
 
 	/*
 	 * If the base is not DEV_BSIZE aligned and the valid
 	 * bit is clear, we have to zero out a portion of the
 	 * first block.
 	 */
 	if ((frag = rounddown2(base, DEV_BSIZE)) != base &&
 	    (m->valid & ((vm_page_bits_t)1 << (base >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, frag, base - frag);
 
 	/*
 	 * If the ending offset is not DEV_BSIZE aligned and the
 	 * valid bit is clear, we have to zero out a portion of
 	 * the last block.
 	 */
 	endoff = base + size;
 	if ((frag = rounddown2(endoff, DEV_BSIZE)) != endoff &&
 	    (m->valid & ((vm_page_bits_t)1 << (endoff >> DEV_BSHIFT))) == 0)
 		pmap_zero_page_area(m, endoff,
 		    DEV_BSIZE - (endoff & (DEV_BSIZE - 1)));
 
 	/*
 	 * Set valid, clear dirty bits.  If validating the entire
 	 * page we can safely clear the pmap modify bit.  We also
 	 * use this opportunity to clear the VPO_NOSYNC flag.  If a process
 	 * takes a write fault on a MAP_NOSYNC memory area the flag will
 	 * be set again.
 	 *
 	 * We set valid bits inclusive of any overlap, but we can only
 	 * clear dirty bits for DEV_BSIZE chunks that are fully within
 	 * the range.
 	 *
 	 * For a partial range, the dirty bits are cleared with a plain
 	 * update when the page was not fully valid beforehand, and through
 	 * vm_page_clear_dirty_mask() otherwise.
 	 */
 	oldvalid = m->valid;
 	pagebits = vm_page_bits(base, size);
 	m->valid |= pagebits;
 #if 0	/* NOT YET */
 	if ((frag = base & (DEV_BSIZE - 1)) != 0) {
 		frag = DEV_BSIZE - frag;
 		base += frag;
 		size -= frag;
 		if (size < 0)
 			size = 0;
 	}
 	pagebits = vm_page_bits(base, size & (DEV_BSIZE - 1));
 #endif
 	if (base == 0 && size == PAGE_SIZE) {
 		/*
 		 * The page can only be modified within the pmap if it is
 		 * mapped, and it can only be mapped if it was previously
 		 * fully valid.
 		 */
 		if (oldvalid == VM_PAGE_BITS_ALL)
 			/*
 			 * Perform the pmap_clear_modify() first.  Otherwise,
 			 * a concurrent pmap operation, such as
 			 * pmap_protect(), could clear a modification in the
 			 * pmap and set the dirty field on the page before
 			 * pmap_clear_modify() had begun and after the dirty
 			 * field was cleared here.
 			 */
 			pmap_clear_modify(m);
 		m->dirty = 0;
 		m->oflags &= ~VPO_NOSYNC;
 	} else if (oldvalid != VM_PAGE_BITS_ALL)
 		m->dirty &= ~pagebits;
 	else
 		vm_page_clear_dirty_mask(m, pagebits);
 }
 
 /*
  *	vm_page_clear_dirty:
  *
  *	Clears the dirty bits of every DEV_BSIZE chunk touched by the
  *	byte range [base, base + size) of the page.
  */
 void
 vm_page_clear_dirty(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t pagebits;
 
 	pagebits = vm_page_bits(base, size);
 	vm_page_clear_dirty_mask(m, pagebits);
 }
 
 /*
  *	vm_page_set_invalid:
  *
  *	Invalidates DEV_BSIZE'd chunks within a page.  Both the
  *	valid and dirty bits for the affected areas are cleared.
  *
  *	The page's object must be write locked.
  */
 void
 vm_page_set_invalid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t bits;
 	vm_object_t object;
 
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * For a vnode object, a range that starts at the beginning of the
 	 * page and reaches or passes vnp_size (presumably the file size --
 	 * confirm) invalidates the whole page.
 	 */
 	if (object->type == OBJT_VNODE && base == 0 && IDX_TO_OFF(m->pindex) +
 	    size >= object->un_pager.vnp.vnp_size)
 		bits = VM_PAGE_BITS_ALL;
 	else
 		bits = vm_page_bits(base, size);
 	/*
 	 * A fully valid page of a referenced object has all of its
 	 * mappings removed before any part of it becomes invalid.
 	 */
 	if (object->ref_count != 0 && m->valid == VM_PAGE_BITS_ALL &&
 	    bits != 0)
 		pmap_remove_all(m);
 	KASSERT((bits == 0 && m->valid == VM_PAGE_BITS_ALL) ||
 	    !pmap_page_is_mapped(m),
 	    ("vm_page_set_invalid: page %p is mapped", m));
 	m->valid &= ~bits;
 	m->dirty &= ~bits;
 }
 
 /*
  * vm_page_zero_invalid()
  *
  *	Zero every DEV_BSIZE chunk of the page whose valid bit is clear.
  *
  *	The kernel treats the invalid parts of a page as garbage, yet user
  *	code may map such a page and must then see zeroes there.  This
  *	mostly happens when the end of a file whose size is not page
  *	aligned is mapped into memory.
  *
  *	If setvalid is TRUE, the whole page is marked valid afterwards.
  */
 void
 vm_page_zero_invalid(vm_page_t m, boolean_t setvalid)
 {
 	int chunk, start;
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * Walk the valid bits, treating the end of the page like a valid
 	 * chunk so that a trailing invalid run is zeroed as well.  "start"
 	 * is the first chunk of the current invalid run.  Invalid
 	 * sub-DEV_BSIZE areas of chunks whose valid bit is set have
 	 * already been zeroed by vm_page_set_validclean().
 	 */
 	start = 0;
 	for (chunk = 0; chunk <= PAGE_SIZE / DEV_BSIZE; chunk++) {
 		if (chunk != (PAGE_SIZE / DEV_BSIZE) &&
 		    (m->valid & ((vm_page_bits_t)1 << chunk)) == 0)
 			continue;
 		if (chunk > start)
 			pmap_zero_page_area(m, start << DEV_BSHIFT,
 			    (chunk - start) << DEV_BSHIFT);
 		start = chunk + 1;
 	}
 
 	/*
 	 * The caller passes setvalid as TRUE only when marking the zeroed
 	 * areas valid raises no cache consistency issues, e.g., it is fine
 	 * for UFS but not for NFS.
 	 */
 	if (setvalid)
 		m->valid = VM_PAGE_BITS_ALL;
 }
 
 /*
  *	vm_page_is_valid:
  *
  *	Reports whether every DEV_BSIZE chunk touched by the byte range
  *	[base, base + size) of the page is valid.  A page with no valid
  *	chunk at all never qualifies, so for size == 0 the result is
  *	FALSE for an entirely invalid page and TRUE otherwise.
  */
 int
 vm_page_is_valid(vm_page_t m, int base, int size)
 {
 	vm_page_bits_t wanted;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	wanted = vm_page_bits(base, size);
 	if (m->valid == 0)
 		return (0);
 	return ((m->valid & wanted) == wanted);
 }
 
 /*
  *	vm_page_ps_is_valid:
  *
  *	Returns TRUE if every base page making up the (super)page that
  *	starts at "m" is fully valid, and FALSE otherwise.
  */
 boolean_t
 vm_page_ps_is_valid(vm_page_t m)
 {
 	int idx, npages;
 
 	VM_OBJECT_ASSERT_LOCKED(m->object);
 	npages = atop(pagesizes[m->psind]);
 
 	/*
 	 * A superpage, i.e., a page whose page size index ("psind") is
 	 * greater than zero, is built from physically contiguous pages,
 	 * and those occupy adjacent entries in vm_page_array[].  Hence
 	 * the constituent pages can be reached by indexing from "m".
 	 */
 	idx = 0;
 	while (idx < npages) {
 		if (m[idx].valid != VM_PAGE_BITS_ALL)
 			return (FALSE);
 		idx++;
 	}
 	return (TRUE);
 }
 
 /*
  * Mark the page dirty if the pmap layer reports it as modified.  The pmap
  * is not consulted when the page's dirty field is already fully set.
  */
 void
 vm_page_test_dirty(vm_page_t m)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (m->dirty == VM_PAGE_BITS_ALL)
 		return;
 	if (pmap_is_modified(m))
 		vm_page_dirty(m);
 }
 
 /*
  * Function form of vm_page_lock(): acquire the mutex returned by
  * vm_page_lockptr(m), recording the caller's file and line.  vm_page.h
  * maps vm_page_lock() onto this function when KLD_MODULE is defined.
  */
 void
 vm_page_lock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_lock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 /*
  * Function form of vm_page_unlock(): release the mutex returned by
  * vm_page_lockptr(m), recording the caller's file and line.  vm_page.h
  * maps vm_page_unlock() onto this function when KLD_MODULE is defined.
  */
 void
 vm_page_unlock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	mtx_unlock_flags_(vm_page_lockptr(m), 0, file, line);
 }
 
 /*
  * Function form of vm_page_trylock(): try to acquire the mutex returned
  * by vm_page_lockptr(m) and hand back the result of the underlying
  * mtx_trylock_flags_() call.  vm_page.h maps vm_page_trylock() onto this
  * function when KLD_MODULE is defined.
  */
 int
 vm_page_trylock_KBI(vm_page_t m, const char *file, int line)
 {
 
 	return (mtx_trylock_flags_(vm_page_lockptr(m), 0, file, line));
 }
 
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 /*
  * Assert that the page's lock is owned (MA_OWNED).  vm_page.h maps
  * vm_page_assert_locked() onto this function when INVARIANTS is defined.
  */
 void
 vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line)
 {
 
 	vm_page_lock_assert_KBI(m, MA_OWNED, file, line);
 }
 
 /*
  * Apply the mutex assertion "a" to the mutex returned by
  * vm_page_lockptr(m), attributing it to the given file and line.
  * vm_page.h maps vm_page_lock_assert() onto this function when
  * INVARIANTS is defined.
  */
 void
 vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line)
 {
 
 	mtx_assert_(vm_page_lockptr(m), a, file, line);
 }
 #endif
 
 #ifdef INVARIANTS
 /*
  * Assert that the page's object is write locked, unless the page belongs
  * to no object or is exclusive busied.
  */
 void
 vm_page_object_lock_assert(vm_page_t m)
 {
 
 	/*
 	 * Certain of the page's fields may only be modified by the
 	 * holder of the containing object's lock or the exclusive busy
 	 * holder.  Unfortunately, the holder of the exclusive busy is
 	 * not recorded, and thus cannot be checked here.
 	 */
 	if (m->object != NULL && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_WLOCKED(m->object);
 }
 
 /*
  * Check the preconditions for setting PGA_WRITEABLE on a page.  Nothing
  * is checked when "bits" does not include that flag.
  */
 void
 vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits)
 {
 
 	if ((bits & PGA_WRITEABLE) != 0) {
 		/*
 		 * PGA_WRITEABLE may be set only on a managed page, and
 		 * only while the page is exclusive busied or its object
 		 * is locked.  Currently, pmap_enter() is the only place
 		 * that sets this flag.
 		 */
 		KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 		    ("PGA_WRITEABLE on unmanaged page"));
 		if (!vm_page_xbusied(m))
 			VM_OBJECT_ASSERT_LOCKED(m->object);
 	}
 }
 #endif
 
 #include "opt_ddb.h"
 #ifdef DDB
 #include <sys/kernel.h>
 
 #include <ddb/ddb.h>
 
 /*
  * DDB "show page" command: print the global page counters and
  * thresholds held in vm_cnt.
  */
 DB_SHOW_COMMAND(page, vm_page_print_page_info)
 {
 
 	db_printf("vm_cnt.v_free_count: %d\n", vm_cnt.v_free_count);
 	db_printf("vm_cnt.v_inactive_count: %d\n", vm_cnt.v_inactive_count);
 	db_printf("vm_cnt.v_active_count: %d\n", vm_cnt.v_active_count);
 	db_printf("vm_cnt.v_laundry_count: %d\n", vm_cnt.v_laundry_count);
 	db_printf("vm_cnt.v_wire_count: %d\n", vm_cnt.v_wire_count);
 	db_printf("vm_cnt.v_free_reserved: %d\n", vm_cnt.v_free_reserved);
 	db_printf("vm_cnt.v_free_min: %d\n", vm_cnt.v_free_min);
 	db_printf("vm_cnt.v_free_target: %d\n", vm_cnt.v_free_target);
 	db_printf("vm_cnt.v_inactive_target: %d\n", vm_cnt.v_inactive_target);
 }
 
 /*
  * DDB "show pageq" command: print the global free page count followed
  * by one line per memory domain giving its page count, free count and
  * the lengths of its active, inactive and laundry queues.
  */
 DB_SHOW_COMMAND(pageq, vm_page_print_pageq_info)
 {
 	int dom;
 
 	db_printf("pq_free %d\n", vm_cnt.v_free_count);
 	for (dom = 0; dom < vm_ndomains; dom++) {
 		db_printf(
 	    "dom %d page_cnt %d free %d pq_act %d pq_inact %d pq_laund %d\n",
 		    dom,
 		    vm_dom[dom].vmd_page_count,
 		    vm_dom[dom].vmd_free_count,
 		    vm_dom[dom].vmd_pagequeues[PQ_ACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_INACTIVE].pq_cnt,
 		    vm_dom[dom].vmd_pagequeues[PQ_LAUNDRY].pq_cnt);
 	}
 }
 
 /*
  * DDB "show pginfo" command: print the fields of one vm_page structure.
  *
  * An address is required.  It is taken to be the address of the vm_page
  * structure itself unless the command modifier contains 'p', in which
  * case it is a physical address that is translated with
  * PHYS_TO_VM_PAGE().
  */
 DB_SHOW_COMMAND(pginfo, vm_page_print_pginfo)
 {
 	vm_page_t m;
 	boolean_t phys;
 
 	if (!have_addr) {
 		db_printf("show pginfo addr\n");
 		return;
 	}
 
 	phys = strchr(modif, 'p') != NULL;
 	if (phys)
 		m = PHYS_TO_VM_PAGE(addr);
 	else
 		m = (vm_page_t)addr;
 	/*
 	 * The valid and dirty fields are vm_page_bits_t, which can be as
 	 * wide as 64 bits, so they are widened to uintmax_t and printed
 	 * with %jx rather than passed to a plain %x.
 	 */
 	db_printf(
     "page %p obj %p pidx 0x%jx phys 0x%jx q %d hold %d wire %d\n"
     "  af 0x%x of 0x%x f 0x%x act %d busy %x valid 0x%jx dirty 0x%jx\n",
 	    m, m->object, (uintmax_t)m->pindex, (uintmax_t)m->phys_addr,
 	    m->queue, m->hold_count, m->wire_count, m->aflags, m->oflags,
 	    m->flags, m->act_count, m->busy_lock, (uintmax_t)m->valid,
 	    (uintmax_t)m->dirty);
 }
 #endif /* DDB */
Index: stable/11/sys/vm/vm_page.h
===================================================================
--- stable/11/sys/vm/vm_page.h	(revision 323661)
+++ stable/11/sys/vm/vm_page.h	(revision 323662)
@@ -1,718 +1,724 @@
 /*-
  * Copyright (c) 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_page.h	8.2 (Berkeley) 12/13/93
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  *
  * $FreeBSD$
  */
 
 /*
  *	Resident memory system definitions.
  */
 
 #ifndef	_VM_PAGE_
 #define	_VM_PAGE_
 
 #include <vm/pmap.h>
 
 /*
  *	Management of resident (logical) pages.
  *
  *	A small structure is kept for each resident
  *	page, indexed by page number.  Each structure
  *	is an element of several collections:
  *
  *		A radix tree used to quickly
  *		perform object/offset lookups
  *
  *		A list of all pages for a given object,
  *		so they can be quickly deactivated at
  *		time of deallocation.
  *
  *		An ordered list of pages due for pageout.
  *
  *	In addition, the structure contains the object
  *	and offset to which this page belongs (for pageout),
  *	and sundry status bits.
  *
  *	In general, operations on this structure's mutable fields are
  *	synchronized using either one of or a combination of the lock on the
  *	object that the page belongs to (O), the pool lock for the page (P),
  *	or the lock for either the free or paging queue (Q).  If a field is
  *	annotated below with two of these locks, then holding either lock is
  *	sufficient for read access, but both locks are required for write
  *	access.
  *
  *	In contrast, the synchronization of accesses to the page's
  *	dirty field is machine dependent (M).  In the
  *	machine-independent layer, the lock on the object that the
  *	page belongs to must be held in order to operate on the field.
  *	However, the pmap layer is permitted to set all bits within
  *	the field without holding that lock.  If the underlying
  *	architecture does not support atomic read-modify-write
  *	operations on the field's type, then the machine-independent
  *	layer uses a 32-bit atomic on the aligned 32-bit word that
  *	contains the dirty field.  In the machine-independent layer,
  *	the implementation of read-modify-write operations on the
  *	field is encapsulated in vm_page_clear_dirty_mask().
  */
 
 /*
  * vm_page_bits_t is the type of a page's valid and dirty bitmaps, and
  * VM_PAGE_BITS_ALL is the value with every bit of that type set.  The
  * width is selected from PAGE_SIZE; the widths chosen here work out to
  * one bit per 512 bytes of page (presumably DEV_BSIZE -- confirm).
  * Neither name is defined for any other PAGE_SIZE.
  */
 #if PAGE_SIZE == 4096
 #define VM_PAGE_BITS_ALL 0xffu
 typedef uint8_t vm_page_bits_t;
 #elif PAGE_SIZE == 8192
 #define VM_PAGE_BITS_ALL 0xffffu
 typedef uint16_t vm_page_bits_t;
 #elif PAGE_SIZE == 16384
 #define VM_PAGE_BITS_ALL 0xffffffffu
 typedef uint32_t vm_page_bits_t;
 #elif PAGE_SIZE == 32768
 #define VM_PAGE_BITS_ALL 0xfffffffffffffffflu
 typedef uint64_t vm_page_bits_t;
 #endif
 
 /*
  * Descriptor kept for each resident page.  The letters in parentheses
  * in the field comments (O, P, Q, M) name the synchronization rules
  * explained in the block comment above.
  */
 struct vm_page {
 	union {
 		TAILQ_ENTRY(vm_page) q; /* page queue or free list (Q) */
 		struct {
 			SLIST_ENTRY(vm_page) ss; /* private slists */
 			void *pv;
 		} s;
 		struct {
 			u_long p;
 			u_long v;
 		} memguard;
 	} plinks;
 	TAILQ_ENTRY(vm_page) listq;	/* pages in same object (O) */
 	vm_object_t object;		/* which object am I in (O,P) */
 	vm_pindex_t pindex;		/* offset into object (O,P) */
 	vm_paddr_t phys_addr;		/* physical address of page */
 	struct md_page md;		/* machine dependent stuff */
 	u_int wire_count;		/* wired down maps refs (P) */
 	volatile u_int busy_lock;	/* busy owners lock */
 	uint16_t hold_count;		/* page hold count (P) */
 	uint16_t flags;			/* page PG_* flags (P) */
 	uint8_t aflags;			/* access is atomic */
 	uint8_t oflags;			/* page VPO_* flags (O) */
 	uint8_t	queue;			/* page queue index (P,Q) */
 	int8_t psind;			/* pagesizes[] index (O) */
 	/* NOTE(review): presumably the physical segment index -- confirm */
 	int8_t segind;
 	uint8_t	order;			/* index of the buddy queue */
 	/* NOTE(review): presumably the free page pool index -- confirm */
 	uint8_t pool;
 	u_char	act_count;		/* page usage count (P) */
 	/* NOTE that these must support one bit per DEV_BSIZE in a page */
 	/* so, on normal X86 kernels, they must be at least 8 bits wide */
 	vm_page_bits_t valid;		/* map of valid DEV_BSIZE chunks (O) */
 	vm_page_bits_t dirty;		/* map of dirty DEV_BSIZE chunks (M) */
 };
 
 /*
  * Page flags stored in oflags:
  *
  * Access to these page flags is synchronized by the lock on the object
  * containing the page (O).
  *
  * Note: VPO_UNMANAGED (used by OBJT_DEVICE, OBJT_PHYS and OBJT_SG)
  * 	 indicates that the page is not under PV management but
  * 	 otherwise should be treated as a normal page.  Pages not
  * 	 under PV management cannot be paged out via the
  * 	 object/vm_page_t because there is no knowledge of their pte
  * 	 mappings, and such pages are also not on any PQ queue.
  *
  */
 #define	VPO_UNUSED01	0x01		/* --available-- */
 #define	VPO_SWAPSLEEP	0x02		/* waiting for swap to finish */
 #define	VPO_UNMANAGED	0x04		/* no PV management for page */
 #define	VPO_SWAPINPROG	0x08		/* swap I/O in progress on page */
 #define	VPO_NOSYNC	0x10		/* do not collect for syncer */
 
 /*
  * Busy page implementation details.
  * The algorithm is mostly taken from the rwlock(9) and sx(9) lock
  * implementations, except that support for owner identity has been removed
  * because of size constraints.  Checks on lock recursion are therefore not
  * possible, and the effectiveness of lock assertions is somewhat reduced.
  */
 #define	VPB_BIT_SHARED		0x01
 #define	VPB_BIT_EXCLUSIVE	0x02
 #define	VPB_BIT_WAITERS		0x04
 #define	VPB_BIT_FLAGMASK						\
 	(VPB_BIT_SHARED | VPB_BIT_EXCLUSIVE | VPB_BIT_WAITERS)
 
 #define	VPB_SHARERS_SHIFT	3
 #define	VPB_SHARERS(x)							\
 	(((x) & ~VPB_BIT_FLAGMASK) >> VPB_SHARERS_SHIFT)
 #define	VPB_SHARERS_WORD(x)	((x) << VPB_SHARERS_SHIFT | VPB_BIT_SHARED)
 #define	VPB_ONE_SHARER		(1 << VPB_SHARERS_SHIFT)
 
 #define	VPB_SINGLE_EXCLUSIVER	VPB_BIT_EXCLUSIVE
 
 #define	VPB_UNBUSIED		VPB_SHARERS_WORD(0)
 
 #define	PQ_NONE		255
 #define	PQ_INACTIVE	0
 #define	PQ_ACTIVE	1
 #define	PQ_LAUNDRY	2
 #define	PQ_COUNT	3
 
 TAILQ_HEAD(pglist, vm_page);
 SLIST_HEAD(spglist, vm_page);
 
 /*
  * One paging queue.  The structure is aligned to CACHE_LINE_SIZE.
  */
 struct vm_pagequeue {
 	struct mtx	pq_mutex;	/* taken by vm_pagequeue_lock() */
 	struct pglist	pq_pl;		/* tail queue of vm_page structures */
 	int		pq_cnt;		/* see vm_pagequeue_cnt_add() */
 	u_int		* const pq_vcnt; /* counter updated along with pq_cnt */
 	const char	* const pq_name;
 } __aligned(CACHE_LINE_SIZE);
 
 
 /*
  * Per-memory-domain paging state; vm_dom[] holds one entry per domain.
  */
 struct vm_domain {
 	struct vm_pagequeue vmd_pagequeues[PQ_COUNT]; /* indexed by PQ_* */
 	u_int vmd_page_count;
 	u_int vmd_free_count;
 	long vmd_segs;	/* bitmask of the segments */
 	boolean_t vmd_oom;
 	int vmd_oom_seq;
 	int vmd_last_active_scan;
 	struct vm_page vmd_laundry_marker;
 	struct vm_page vmd_marker; /* marker for pagedaemon private use */
 	struct vm_page vmd_inacthead; /* marker for LRU-defeating insertions */
 };
 
 extern struct vm_domain vm_dom[MAXMEMDOM];
 
 #define	vm_pagequeue_assert_locked(pq)	mtx_assert(&(pq)->pq_mutex, MA_OWNED)
 #define	vm_pagequeue_lock(pq)		mtx_lock(&(pq)->pq_mutex)
 #define	vm_pagequeue_lockptr(pq)	(&(pq)->pq_mutex)
 #define	vm_pagequeue_unlock(pq)		mtx_unlock(&(pq)->pq_mutex)
 
 #ifdef _KERNEL
 /*
  * Adjust a page queue's length by "addend", which is negative when
  * called through vm_pagequeue_cnt_dec().  Both the queue's own pq_cnt
  * and, atomically, the counter that pq_vcnt points to are updated.  The
  * queue lock assertion is compiled in only if "notyet" is defined.
  */
 static __inline void
 vm_pagequeue_cnt_add(struct vm_pagequeue *pq, int addend)
 {
 
 #ifdef notyet
 	vm_pagequeue_assert_locked(pq);
 #endif
 	pq->pq_cnt += addend;
 	atomic_add_int(pq->pq_vcnt, addend);
 }
 #define	vm_pagequeue_cnt_inc(pq)	vm_pagequeue_cnt_add((pq), 1)
 #define	vm_pagequeue_cnt_dec(pq)	vm_pagequeue_cnt_add((pq), -1)
 #endif	/* _KERNEL */
 
 extern struct mtx_padalign vm_page_queue_free_mtx;
 extern struct mtx_padalign pa_lock[];
 
 #if defined(__arm__)
 #define	PDRSHIFT	PDR_SHIFT
 #elif !defined(PDRSHIFT)
 #define PDRSHIFT	21
 #endif
 
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	PA_LOCKPTR(pa)	((struct mtx *)(&pa_lock[pa_index(pa) % PA_LOCK_COUNT]))
 #define	PA_LOCKOBJPTR(pa)	((struct lock_object *)PA_LOCKPTR((pa)))
 #define	PA_LOCK(pa)	mtx_lock(PA_LOCKPTR(pa))
 #define	PA_TRYLOCK(pa)	mtx_trylock(PA_LOCKPTR(pa))
 #define	PA_UNLOCK(pa)	mtx_unlock(PA_LOCKPTR(pa))
 #define	PA_UNLOCK_COND(pa) 			\
 	do {		   			\
 		if ((pa) != 0) {		\
 			PA_UNLOCK((pa));	\
 			(pa) = 0;		\
 		}				\
 	} while (0)
 
 #define	PA_LOCK_ASSERT(pa, a)	mtx_assert(PA_LOCKPTR(pa), (a))
 
 #ifdef KLD_MODULE
 #define	vm_page_lock(m)		vm_page_lock_KBI((m), LOCK_FILE, LOCK_LINE)
 #define	vm_page_unlock(m)	vm_page_unlock_KBI((m), LOCK_FILE, LOCK_LINE)
 #define	vm_page_trylock(m)	vm_page_trylock_KBI((m), LOCK_FILE, LOCK_LINE)
 #else	/* !KLD_MODULE */
 #define	vm_page_lockptr(m)	(PA_LOCKPTR(VM_PAGE_TO_PHYS((m))))
 #define	vm_page_lock(m)		mtx_lock(vm_page_lockptr((m)))
 #define	vm_page_unlock(m)	mtx_unlock(vm_page_lockptr((m)))
 #define	vm_page_trylock(m)	mtx_trylock(vm_page_lockptr((m)))
 #endif
 #if defined(INVARIANTS)
 #define	vm_page_assert_locked(m)		\
     vm_page_assert_locked_KBI((m), __FILE__, __LINE__)
 #define	vm_page_lock_assert(m, a)		\
     vm_page_lock_assert_KBI((m), (a), __FILE__, __LINE__)
 #else
 #define	vm_page_assert_locked(m)
 #define	vm_page_lock_assert(m, a)
 #endif
 
 /*
  * The vm_page's aflags are updated using atomic operations.  To set or clear
  * these flags, the functions vm_page_aflag_set() and vm_page_aflag_clear()
  * must be used.  Neither these flags nor these functions are part of the KBI.
  *
  * PGA_REFERENCED may be cleared only if the page is locked.  It is set by
  * both the MI and MD VM layers.  However, kernel loadable modules should not
  * directly set this flag.  They should call vm_page_reference() instead.
  *
  * PGA_WRITEABLE is set exclusively on managed pages by pmap_enter().
  * When it does so, the object must be locked, or the page must be
  * exclusive busied.  The MI VM layer must never access this flag
  * directly.  Instead, it should call pmap_page_is_write_mapped().
  *
  * PGA_EXECUTABLE may be set by pmap routines, and indicates that a page has
  * at least one executable mapping.  It is not consumed by the MI VM layer.
  */
 #define	PGA_WRITEABLE	0x01		/* page may be mapped writeable */
 #define	PGA_REFERENCED	0x02		/* page has been referenced */
 #define	PGA_EXECUTABLE	0x04		/* page may be mapped executable */
 
 /*
  * Page flags.  If changed at any other time than page allocation or
  * freeing, the modification must be protected by the vm_page lock.
  */
 #define	PG_FICTITIOUS	0x0004		/* physical page doesn't exist */
 #define	PG_ZERO		0x0008		/* page is zeroed */
 #define	PG_MARKER	0x0010		/* special queue marker page */
 #define	PG_NODUMP	0x0080		/* don't include this page in a dump */
 #define	PG_UNHOLDFREE	0x0100		/* delayed free of a held page */
 
 /*
  * Misc constants.
  */
 #define ACT_DECLINE		1
 #define ACT_ADVANCE		3
 #define ACT_INIT		5
 #define ACT_MAX			64
 
 #ifdef _KERNEL
 
 #include <sys/systm.h>
 
 #include <machine/atomic.h>
 
 /*
  * Each pageable resident page falls into one of four lists:
  *
  *	free
  *		Available for allocation now.
  *
  *	inactive
  *		Low activity, candidates for reclamation.
  *		This list is approximately LRU ordered.
  *
  *	laundry
  *		This is the list of pages that should be
  *		paged out next.
  *
  *	active
  *		Pages that are "active", i.e., they have been
  *		recently referenced.
  *
  */
 
 extern int vm_page_zero_count;
 
 extern vm_page_t vm_page_array;		/* First resident page in table */
 extern long vm_page_array_size;		/* number of vm_page_t's */
 extern long first_page;			/* first physical page number */
 
 #define VM_PAGE_TO_PHYS(entry)	((entry)->phys_addr)
 
 /*
  * PHYS_TO_VM_PAGE() returns the vm_page_t object that represents a memory
  * page to which the given physical address belongs. The correct vm_page_t
  * object is returned for addresses that are not page-aligned.
  */
 vm_page_t PHYS_TO_VM_PAGE(vm_paddr_t pa);
 
 /*
  * Page allocation parameters for vm_page for the functions
  * vm_page_alloc(), vm_page_grab(), vm_page_alloc_contig() and
  * vm_page_alloc_freelist().  Some functions support only a subset
  * of the flags, and ignore others, see the flags legend.
  *
+ * The meaning of VM_ALLOC_ZERO differs slightly between the vm_page_alloc*()
+ * and the vm_page_grab*() functions.  See these functions for details.
+ *
  * Bits 0 - 1 define class.
  * Bits 2 - 15 dedicated for flags.
  * Legend:
  * (a) - vm_page_alloc() supports the flag.
  * (c) - vm_page_alloc_contig() supports the flag.
  * (f) - vm_page_alloc_freelist() supports the flag.
  * (g) - vm_page_grab() supports the flag.
+ * (p) - vm_page_grab_pages() supports the flag.
  * Bits above 15 define the count of additional pages that the caller
  * intends to allocate.
  */
 #define VM_ALLOC_NORMAL		0
 #define VM_ALLOC_INTERRUPT	1
 #define VM_ALLOC_SYSTEM		2
 #define	VM_ALLOC_CLASS_MASK	3
-#define	VM_ALLOC_WIRED		0x0020	/* (acfg) Allocate non pageable page */
-#define	VM_ALLOC_ZERO		0x0040	/* (acfg) Try to obtain a zeroed page */
+#define	VM_ALLOC_WIRED		0x0020	/* (acfgp) Allocate a wired page */
+#define	VM_ALLOC_ZERO		0x0040	/* (acfgp) Allocate a prezeroed page */
 #define	VM_ALLOC_NOOBJ		0x0100	/* (acg) No associated object */
-#define	VM_ALLOC_NOBUSY		0x0200	/* (acg) Do not busy the page */
+#define	VM_ALLOC_NOBUSY		0x0200	/* (acgp) Do not excl busy the page */
 #define	VM_ALLOC_IFCACHED	0x0400
 #define	VM_ALLOC_IFNOTCACHED	0x0800
-#define	VM_ALLOC_IGN_SBUSY	0x1000	/* (g) Ignore shared busy flag */
+#define	VM_ALLOC_IGN_SBUSY	0x1000	/* (gp) Ignore shared busy flag */
 #define	VM_ALLOC_NODUMP		0x2000	/* (ag) don't include in dump */
-#define	VM_ALLOC_SBUSY		0x4000	/* (acg) Shared busy the page */
+#define	VM_ALLOC_SBUSY		0x4000	/* (acgp) Shared busy the page */
 #define	VM_ALLOC_NOWAIT		0x8000	/* (g) Do not sleep, return NULL */
 #define	VM_ALLOC_COUNT_SHIFT	16
 #define	VM_ALLOC_COUNT(count)	((count) << VM_ALLOC_COUNT_SHIFT)
 
 #ifdef M_NOWAIT
 /*
  * Translate malloc(9)-style M_* flags into a VM_ALLOC_* request value.
  *
  * M_USE_RESERVE selects VM_ALLOC_INTERRUPT, otherwise VM_ALLOC_SYSTEM is
  * used.  M_ZERO and M_NODUMP map to VM_ALLOC_ZERO and VM_ALLOC_NODUMP.
  * M_USE_RESERVE without M_NOWAIT trips the KASSERT below.
  */
 static inline int
 malloc2vm_flags(int malloc_flags)
 {
 	int req;
 
 	KASSERT((malloc_flags & M_USE_RESERVE) == 0 ||
 	    (malloc_flags & M_NOWAIT) != 0,
 	    ("M_USE_RESERVE requires M_NOWAIT"));
 
 	/* Choose the base value first, then OR in the optional flags. */
 	if ((malloc_flags & M_USE_RESERVE) != 0)
 		req = VM_ALLOC_INTERRUPT;
 	else
 		req = VM_ALLOC_SYSTEM;
 	req |= (malloc_flags & M_ZERO) != 0 ? VM_ALLOC_ZERO : 0;
 	req |= (malloc_flags & M_NODUMP) != 0 ? VM_ALLOC_NODUMP : 0;
 	return (req);
 }
 #endif
 
 void vm_page_busy_downgrade(vm_page_t m);
 void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared);
 void vm_page_flash(vm_page_t m);
 void vm_page_hold(vm_page_t mem);
 void vm_page_unhold(vm_page_t mem);
 void vm_page_free(vm_page_t m);
 void vm_page_free_zero(vm_page_t m);
 
 void vm_page_activate (vm_page_t);
 void vm_page_advise(vm_page_t m, int advice);
 vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
 vm_page_t vm_page_alloc_contig(vm_object_t object, vm_pindex_t pindex, int req,
     u_long npages, vm_paddr_t low, vm_paddr_t high, u_long alignment,
     vm_paddr_t boundary, vm_memattr_t memattr);
 vm_page_t vm_page_alloc_freelist(int, int);
 void vm_page_change_lock(vm_page_t m, struct mtx **mtx);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
+void vm_page_grab_pages(vm_object_t object, vm_pindex_t pindex, int allocflags,
+    vm_page_t *ma, int count);
 int vm_page_try_to_free (vm_page_t);
 void vm_page_deactivate (vm_page_t);
 void vm_page_deactivate_noreuse(vm_page_t);
 void vm_page_dequeue(vm_page_t m);
 void vm_page_dequeue_locked(vm_page_t m);
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
 void vm_page_launder(vm_page_t m);
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 vm_page_t vm_page_next(vm_page_t m);
 int vm_page_pa_tryrelock(pmap_t, vm_paddr_t, vm_paddr_t *);
 struct vm_pagequeue *vm_page_pagequeue(vm_page_t m);
 vm_page_t vm_page_prev(vm_page_t m);
 boolean_t vm_page_ps_is_valid(vm_page_t m);
 void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
 bool vm_page_reclaim_contig(int req, u_long npages, vm_paddr_t low,
     vm_paddr_t high, u_long alignment, vm_paddr_t boundary);
 void vm_page_reference(vm_page_t m);
 void vm_page_remove (vm_page_t);
 int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
 vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object,
     vm_pindex_t pindex);
 void vm_page_requeue(vm_page_t m);
 void vm_page_requeue_locked(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);
 vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start,
     vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options);
 void vm_page_set_valid_range(vm_page_t m, int base, int size);
 int vm_page_sleep_if_busy(vm_page_t m, const char *msg);
 vm_offset_t vm_page_startup(vm_offset_t vaddr);
 void vm_page_sunbusy(vm_page_t m);
 int vm_page_trysbusy(vm_page_t m);
 void vm_page_unhold_pages(vm_page_t *ma, int count);
 boolean_t vm_page_unwire(vm_page_t m, uint8_t queue);
 void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_wire (vm_page_t);
 void vm_page_xunbusy_hard(vm_page_t m);
 void vm_page_xunbusy_maybelocked(vm_page_t m);
 void vm_page_set_validclean (vm_page_t, int, int);
 void vm_page_clear_dirty (vm_page_t, int, int);
 void vm_page_set_invalid (vm_page_t, int, int);
 int vm_page_is_valid (vm_page_t, int, int);
 void vm_page_test_dirty (vm_page_t);
 vm_page_bits_t vm_page_bits(int base, int size);
 void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid);
 void vm_page_free_toq(vm_page_t m);
 void vm_page_zero_idle_wakeup(void);
 
 void vm_page_dirty_KBI(vm_page_t m);
 void vm_page_lock_KBI(vm_page_t m, const char *file, int line);
 void vm_page_unlock_KBI(vm_page_t m, const char *file, int line);
 int vm_page_trylock_KBI(vm_page_t m, const char *file, int line);
 #if defined(INVARIANTS) || defined(INVARIANT_SUPPORT)
 void vm_page_assert_locked_KBI(vm_page_t m, const char *file, int line);
 void vm_page_lock_assert_KBI(vm_page_t m, int a, const char *file, int line);
 #endif
 
 /*
  * KASSERT that vm_page_sbusied(m) is true.  The failure message reports the
  * page pointer and the file/line where the macro was expanded.
  */
 #define	vm_page_assert_sbusied(m)					\
 	KASSERT(vm_page_sbusied(m),					\
 	    ("vm_page_assert_sbusied: page %p not shared busy @ %s:%d", \
 	    (m), __FILE__, __LINE__))
 
 /*
  * KASSERT that vm_page_busied(m) is false.  The failure message reports the
  * page pointer and the file/line where the macro was expanded.
  */
 #define	vm_page_assert_unbusied(m)					\
 	KASSERT(!vm_page_busied(m),					\
 	    ("vm_page_assert_unbusied: page %p busy @ %s:%d",		\
 	    (m), __FILE__, __LINE__))
 
 /*
  * KASSERT that vm_page_xbusied(m) is true.  The failure message reports the
  * page pointer and the file/line where the macro was expanded.
  */
 #define	vm_page_assert_xbusied(m)					\
 	KASSERT(vm_page_xbusied(m),					\
 	    ("vm_page_assert_xbusied: page %p not exclusive busy @ %s:%d", \
 	    (m), __FILE__, __LINE__))
 
 /* True when the page's busy_lock holds anything other than VPB_UNBUSIED. */
 #define	vm_page_busied(m)						\
 	((m)->busy_lock != VPB_UNBUSIED)
 
 /*
  * Shared-busy the page via vm_page_trysbusy() and panic if that attempt
  * reports failure (returns zero).
  */
 #define	vm_page_sbusy(m) do {						\
 	if (!vm_page_trysbusy(m))					\
 		panic("%s: page %p failed shared busying", __func__,	\
 		    (m));						\
 } while (0)
 
 /*
  * Try to exclusive-busy the page: an acquire compare-and-set that replaces
  * VPB_UNBUSIED in busy_lock with VPB_SINGLE_EXCLUSIVER.  The expression
  * evaluates to the result of atomic_cmpset_acq_int().
  */
 #define	vm_page_tryxbusy(m)						\
 	(atomic_cmpset_acq_int(&(m)->busy_lock, VPB_UNBUSIED,		\
 	    VPB_SINGLE_EXCLUSIVER))
 
 /* True when the VPB_SINGLE_EXCLUSIVER bit is set in the page's busy_lock. */
 #define	vm_page_xbusied(m)						\
 	(((m)->busy_lock & VPB_SINGLE_EXCLUSIVER) != 0)
 
 /*
  * Exclusive-busy the page via vm_page_tryxbusy() and panic if the
  * compare-and-set does not succeed.
  */
 #define	vm_page_xbusy(m) do {						\
 	if (!vm_page_tryxbusy(m))					\
 		panic("%s: page %p failed exclusive busying", __func__,	\
 		    (m));						\
 } while (0)
 
 /*
  * Release an exclusive busy.  First try a release compare-and-set of
  * busy_lock from exactly VPB_SINGLE_EXCLUSIVER to VPB_UNBUSIED; if busy_lock
  * holds any other value, defer to vm_page_xunbusy_hard().
  *
  * Note: page m's lock must not be owned by the caller.
  */
 #define	vm_page_xunbusy(m) do {						\
 	if (!atomic_cmpset_rel_int(&(m)->busy_lock,			\
 	    VPB_SINGLE_EXCLUSIVER, VPB_UNBUSIED))			\
 		vm_page_xunbusy_hard(m);				\
 } while (0)
 
 /*
  * Assertion hooks.  With INVARIANTS defined the macros forward to
  * vm_page_object_lock_assert() and vm_page_assert_pga_writeable();
  * otherwise both expand to (void)0.
  */
 #ifdef INVARIANTS
 void vm_page_object_lock_assert(vm_page_t m);
 #define	VM_PAGE_OBJECT_LOCK_ASSERT(m)	vm_page_object_lock_assert(m)
 void vm_page_assert_pga_writeable(vm_page_t m, uint8_t bits);
 #define	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits)				\
 	vm_page_assert_pga_writeable(m, bits)
 #else
 #define	VM_PAGE_OBJECT_LOCK_ASSERT(m)	(void)0
 #define	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits)	(void)0
 #endif
 
 /*
  * We want to use atomic updates for the aflags field, which is 8 bits wide.
  * However, not all architectures support atomic operations on 8-bit
  * destinations.  In order that we can easily use a 32-bit operation, we
  * require that the aflags field be 32-bit aligned.
  */
 CTASSERT(offsetof(struct vm_page, aflags) % sizeof(uint32_t) == 0);
 
 /*
  *	vm_page_aflag_clear:
  *
  *	Atomically clear the requested bits in the page's aflags field.
  *	When PGA_REFERENCED is among them, the page is asserted to be locked.
  */
 static inline void
 vm_page_aflag_clear(vm_page_t m, uint8_t bits)
 {
 	uint32_t *word, mask;
 
 	/* PGA_REFERENCED may only be cleared while the page is locked. */
 	if ((bits & PGA_REFERENCED) != 0)
 		vm_page_assert_locked(m);
 
 	/*
 	 * Widen the 8-bit mask to the 32-bit word that starts at aflags.  On
 	 * big-endian machines the first byte of that word is its most
 	 * significant one, so the mask is moved into the top byte.
 	 */
 #if BYTE_ORDER == BIG_ENDIAN
 	mask = (uint32_t)bits << 24;
 #else
 	mask = bits;
 #endif
 
 	/*
 	 * Update the whole containing word atomically.  Parallel non-atomic
 	 * updates to the other fields within this word are handled properly
 	 * by the atomic update.
 	 */
 	word = (void *)&m->aflags;
 	KASSERT(((uintptr_t)word % sizeof(uint32_t)) == 0,
 	    ("vm_page_aflag_clear: aflags is misaligned"));
 	atomic_clear_32(word, mask);
 }
 
 /*
  *	vm_page_aflag_set:
  *
  *	Atomically set the requested bits in the page's aflags field, after
  *	running the VM_PAGE_ASSERT_PGA_WRITEABLE() check on them.
  */
 static inline void
 vm_page_aflag_set(vm_page_t m, uint8_t bits)
 {
 	uint32_t *word, mask;
 
 	VM_PAGE_ASSERT_PGA_WRITEABLE(m, bits);
 
 	/*
 	 * Widen the 8-bit mask to the 32-bit word that starts at aflags.  On
 	 * big-endian machines the first byte of that word is its most
 	 * significant one, so the mask is moved into the top byte.
 	 */
 #if BYTE_ORDER == BIG_ENDIAN
 	mask = (uint32_t)bits << 24;
 #else
 	mask = bits;
 #endif
 
 	/*
 	 * Update the whole containing word atomically.  Parallel non-atomic
 	 * updates to the other fields within this word are handled properly
 	 * by the atomic update.
 	 */
 	word = (void *)&m->aflags;
 	KASSERT(((uintptr_t)word % sizeof(uint32_t)) == 0,
 	    ("vm_page_aflag_set: aflags is misaligned"));
 	atomic_set_32(word, mask);
 }
 
 /*
  *	vm_page_dirty:
  *
  *	Mark the whole page dirty by setting every bit of its dirty field.
  *
  *	When called from the machine-independent layer, the object that
  *	contains the page must be locked.
  *
  *	See vm_page_clear_dirty_mask().
  */
 static __inline void
 vm_page_dirty(vm_page_t m)
 {
 
 #if !defined(KLD_MODULE) && !defined(INVARIANTS)
 	/* Neither KLD_MODULE nor INVARIANTS: store the bits directly. */
 	m->dirty = VM_PAGE_BITS_ALL;
 #else
 	/* Use vm_page_dirty_KBI() under INVARIANTS to save memory. */
 	vm_page_dirty_KBI(m);
 #endif
 }
 
 /*
  *	vm_page_remque:
  *
  *	Take the page off its page queue, if it is on one.  A page whose
  *	queue field is PQ_NONE is left untouched.
  *
  *	The page must be locked.
  */
 static inline void
 vm_page_remque(vm_page_t m)
 {
 
 	/* Nothing to do for a page that is not on any queue. */
 	if (m->queue == PQ_NONE)
 		return;
 	vm_page_dequeue(m);
 }
 
 /*
  *	vm_page_undirty:
  *
  *	Mark the page as not dirty by zeroing its dirty field, after running
  *	VM_PAGE_OBJECT_LOCK_ASSERT() on the page.
  *
  *	Note: this does not clear the pmap modify bits.
  */
 static __inline void
 vm_page_undirty(vm_page_t m)
 {
 
 	VM_PAGE_OBJECT_LOCK_ASSERT(m);
 	m->dirty = 0;
 }
 
 /*
  *	vm_page_replace_checked:
  *
  *	Call vm_page_replace(mnew, object, pindex) and assert that the page
  *	it returns is the one the caller expected, "mold".
  */
 static inline void
 vm_page_replace_checked(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mold)
 {
 	vm_page_t replaced;
 
 	replaced = vm_page_replace(mnew, object, pindex);
 	KASSERT(mold == replaced,
 	    ("invalid page replacement, mold=%p, mret=%p", mold, replaced));
 
 	/* Both are only consumed by the KASSERT; unused if !INVARIANTS. */
 	(void)replaced;
 	(void)mold;
 }
 
 /* Return true if the page's queue field is PQ_ACTIVE. */
 static inline bool
 vm_page_active(vm_page_t m)
 {
 
 	return (m->queue == PQ_ACTIVE);
 }
 
 /* Return true if the page's queue field is PQ_INACTIVE. */
 static inline bool
 vm_page_inactive(vm_page_t m)
 {
 
 	return (m->queue == PQ_INACTIVE);
 }
 
 /* Return true if the page's queue field is PQ_LAUNDRY. */
 static inline bool
 vm_page_in_laundry(vm_page_t m)
 {
 
 	return (m->queue == PQ_LAUNDRY);
 }
 
 #endif				/* _KERNEL */
 #endif				/* !_VM_PAGE_ */
Index: stable/11
===================================================================
--- stable/11	(revision 323661)
+++ stable/11	(revision 323662)

Property changes on: stable/11
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r322296