Index: head/sys/vm/swap_pager.c
===================================================================
--- head/sys/vm/swap_pager.c	(revision 349790)
+++ head/sys/vm/swap_pager.c	(revision 349791)
@@ -1,3004 +1,3004 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *				New Swap System
  *				Matthew Dillon
  *
  * Radix Bitmap 'blists'.
  *
  *	- The new swapper uses the new radix bitmap code.  This should scale
  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
  *	  arbitrary degree of fragmentation.
  *
  * Features:
  *
  *	- on the fly reallocation of swap during putpages.  The new system
  *	  does not try to keep previously allocated swap blocks for dirty
  *	  pages.
  *
  *	- on the fly deallocation of swap
  *
  *	- No more garbage collection required.  Unnecessarily allocated swap
  *	  blocks only exist for dirty vm_page_t's now and these are already
  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
  *	  removal of invalidated swap blocks when a page is destroyed
  *	  or renamed.
  *
  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  *
  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
  *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/blist.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/disk.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/malloc.h>
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 /*
  * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
  * The 64-page limit is due to the radix code (kern/subr_blist.c).
  */
 #ifndef MAX_PAGEOUT_CLUSTER
 #define	MAX_PAGEOUT_CLUSTER	32
 #endif
 
 #if !defined(SWB_NPAGES)
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
 #define	SWAP_META_PAGES		PCTRIE_COUNT
 
 /*
  * A swblk structure maps each page index within a
  * SWAP_META_PAGES-aligned and sized range to the address of an
  * on-disk swap block (or SWAPBLK_NONE). The collection of these
  * mappings for an entire vm object is implemented as a pc-trie.
  */
 struct swblk {
 	vm_pindex_t	p;	/* key field used by the SWAP pc-trie */
 	daddr_t		d[SWAP_META_PAGES]; /* swap block per page, or SWAPBLK_NONE */
 };
 
 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
 static struct mtx sw_dev_mtx;
 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 int swap_pager_avail;
 static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 
 static u_long swap_reserved;
 static u_long swap_total;
 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_reserved, 0, sysctl_page_shift, "A", 
     "Amount of swap storage needed to back all allocated anonymous memory.");
 SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_total, 0, sysctl_page_shift, "A", 
     "Total amount of available swap storage.");
 
 static int overcommit = 0;
 SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 static unsigned long swzone;
 SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
     "Actual size of swap metadata zone");
 static unsigned long swap_maxpages;
 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
     "Maximum amount of swap supported");
 
 /* bits from overcommit */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
 #define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
 #define	SWAP_RESERVE_ALLOW_NONWIRED	(1 << 2)
 
 /*
  * Sysctl handler that exports a page count as a byte count: the u_long
  * that arg1 points at is shifted left by PAGE_SHIFT and the 64-bit
  * result is handed to sysctl_handle_64().
  */
 static int
 sysctl_page_shift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t bytes;
 
 	bytes = (uint64_t)*(u_long *)arg1 << PAGE_SHIFT;
 	return (sysctl_handle_64(oidp, &bytes, 0, req));
 }
 
 /*
  * Reserve "incr" bytes of swap accounting against the credentials of
  * the calling thread.  Returns whatever swap_reserve_by_cred() returns.
  */
 int
 swap_reserve(vm_ooffset_t incr)
 {
 	struct ucred *cred;
 
 	cred = curthread->td_ucred;
 	return (swap_reserve_by_cred(incr, cred));
 }
 
 /*
  * Reserve swap accounting for "incr" bytes, which must be a multiple of
  * the page size, against the real-uid accounting of "cred".
  *
  * The request is charged to the current process's RACCT swap resource
  * (when RACCT is compiled in and enabled), to the global swap_reserved
  * counter and to the per-uid ui_vmsize counter.  The bits of the
  * "overcommit" variable decide which limits are enforced.  Every charge
  * already made is backed out again if a later check refuses the request.
  *
  * Returns 1 if the reservation was granted and 0 if it was refused.
  */
 int
 swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
 {
 	u_long r, s, prev, pincr;
 	int res, error;
 	static int curfail;
 	static struct timeval lastfail;
 	struct uidinfo *uip;
 
 	uip = cred->cr_ruidinfo;
 
 	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
 	    (uintmax_t)incr));
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(curproc);
 		error = racct_add(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 		if (error != 0)
 			return (0);
 	}
 #endif
 
 	/*
 	 * Charge the global counter optimistically, then decide whether the
 	 * new total "r" fits under the allowance "s".
 	 */
 	pincr = atop(incr);
 	res = 0;
 	prev = atomic_fetchadd_long(&swap_reserved, pincr);
 	r = prev + pincr;
 	if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
 		s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
 		    vm_wire_count();
 	} else
 		s = 0;
 	s += swap_total;
 	if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
 	    (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
 		res = 1;
 	} else {
 		/* Refused: take the optimistic global charge back. */
 		prev = atomic_fetchadd_long(&swap_reserved, -pincr);
 		if (prev < pincr)
 			panic("swap_reserved < incr on overcommit fail");
 	}
 	if (res) {
 		/* Same optimistic charge/check/rollback for the per-uid total. */
 		prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr);
 		if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
 		    prev + pincr > lim_cur(curthread, RLIMIT_SWAP) &&
 		    priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) {
 			res = 0;
 			prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr);
 			if (prev < pincr)
 				panic("uip->ui_vmsize < incr on overcommit fail");
 		}
 	}
 	/* Failure reports are rate-limited through ppsratecheck(). */
 	if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
 		printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
 		    uip->ui_uid, curproc->p_pid, (intmax_t)incr);
 	}
 
 #ifdef RACCT
 	if (racct_enable && !res) {
 		PROC_LOCK(curproc);
 		racct_sub(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 	}
 #endif
 
 	return (res);
 }
 
 /*
  * Unconditionally charge "incr" bytes (a multiple of the page size) of
  * swap reservation to the current process: no limit is checked and the
  * function cannot fail.
  */
 void
 swap_reserve_force(vm_ooffset_t incr)
 {
 	struct uidinfo *uip;
 	u_long npages;
 
 	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
 	    (uintmax_t)incr));
 
 	npages = atop(incr);
 
 	PROC_LOCK(curproc);
 #ifdef RACCT
 	if (racct_enable)
 		racct_add_force(curproc, RACCT_SWAP, incr);
 #endif
 	atomic_add_long(&swap_reserved, npages);
 	uip = curproc->p_ucred->cr_ruidinfo;
 	atomic_add_long(&uip->ui_vmsize, npages);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Release "decr" bytes of swap reservation held against the current
  * process's credentials.  The process lock is held across the call to
  * swap_release_by_cred().
  */
 void
 swap_release(vm_ooffset_t decr)
 {
 
 	PROC_LOCK(curproc);
 	swap_release_by_cred(decr, curproc->p_ucred);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Release "decr" bytes (a multiple of the page size) of swap reservation
  * from the global counter, from the per-uid counter of "cred" and, when
  * RACCT is enabled, from the RACCT swap resource of "cred".
  */
 void
 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 {
 	struct uidinfo *uip;
 	u_long npages, old;
 
 	uip = cred->cr_ruidinfo;
 
 	KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__,
 	    (uintmax_t)decr));
 
 	npages = atop(decr);
 
 	/* Underflow of the global counter is fatal. */
 	old = atomic_fetchadd_long(&swap_reserved, -npages);
 	if (old < npages)
 		panic("swap_reserved < decr");
 
 	/* Underflow of the per-uid counter is only reported. */
 	old = atomic_fetchadd_long(&uip->ui_vmsize, -npages);
 	if (old < npages)
 		printf("negative vmsize for uid = %d\n", uip->ui_uid);
 #ifdef RACCT
 	if (racct_enable)
 		racct_sub_cred(cred, RACCT_SWAP, decr);
 #endif
 }
 
 #define SWM_POP		0x01	/* pop out			*/
 
 static int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
 static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
 static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum			*/
 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
 
 static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
     "Maximum running async swap ops");
 static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
     "Swap Fragmentation Info");
 
 static struct sx sw_alloc_sx;
 
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
  */
 
 #define NOBJLISTS		8
 
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
 
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
 static uma_zone_t swwbuf_zone;
 static uma_zone_t swrbuf_zone;
 static uma_zone_t swblk_zone;
 static uma_zone_t swpctrie_zone;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
  * (see vm/swap_pager.h).
  */
 static vm_object_t
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 		    vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
 static void	swap_pager_dealloc(vm_object_t object);
 static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
     int *);
 static int	swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
     int *, pgo_getpages_iodone_t, void *);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
 static void	swap_pager_init(void);
 static void	swap_pager_unswapped(vm_page_t);
 static void	swap_pager_swapoff(struct swdevt *sp);
 
 struct pagerops swappagerops = {
 	.pgo_init =	swap_pager_init,	/* early system initialization of pager	*/
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
 	.pgo_getpages =	swap_pager_getpages,	/* pagein				*/
 	.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async)		*/
 	.pgo_putpages =	swap_pager_putpages,	/* pageout				*/
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page	*/
 	.pgo_pageunswapped = swap_pager_unswapped,	/* remove swap related to page		*/
 };
 
 /*
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
 static int nswap_lowat = 128;	/* in pages, swap_pager_almost_full warn */
 static int nswap_hiwat = 512;	/* in pages, swap_pager_almost_full warn */
 
 SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
     "Maximum size of a swap block in pages");
 
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static bool	swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
 static int	swapongeom(struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);
 
 /*
  * Swap bitmap functions
  */
 static void	swp_pager_freeswapspace(daddr_t blk, daddr_t npages);
 static daddr_t	swp_pager_getswapspace(int *npages, int limit);
 
 /*
  * Metadata functions
  */
 static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 
 /*
  * Reset a batched free range to "empty": no pages, starting nowhere.
  */
 static void
 swp_pager_init_freerange(daddr_t *start, daddr_t *num)
 {
 
 	*num = 0;
 	*start = SWAPBLK_NONE;
 }
 
 /*
  * Add swap block "addr" to the batched free range described by
  * *start / *num.  A block that directly follows the range extends it;
  * any other block causes the current range to be freed and a new
  * one-block range to be started at "addr".
  */
 static void
 swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr)
 {
 
 	if (*start + *num == addr) {
 		(*num)++;
 		return;
 	}
 	swp_pager_freeswapspace(*start, *num);
 	*start = addr;
 	*num = 1;
 }
 
 /*
  * Node allocator handed to PCTRIE_DEFINE(): a non-sleeping allocation
  * from swpctrie_zone, which may additionally use the reserve when the
  * caller is the pageout process.
  */
 static void *
 swblk_trie_alloc(struct pctrie *ptree)
 {
 	int flags;
 
 	flags = M_NOWAIT;
 	if (curproc == pageproc)
 		flags |= M_USE_RESERVE;
 	return (uma_zalloc(swpctrie_zone, flags));
 }
 
 /*
  * Node deallocator handed to PCTRIE_DEFINE(): returns "node" to
  * swpctrie_zone.  The "ptree" argument is not used.
  */
 static void
 swblk_trie_free(struct pctrie *ptree, void *node)
 {
 
 	uma_zfree(swpctrie_zone, node);
 }
 
 PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
 
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
  *
  *	update the swap_pager_almost_full indication and warn when we are
  *	about to run out of swap space, using lowat/hiwat hysteresis.
  *
  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
  *
  *	No restrictions on call
  *	This routine may not block.
  */
 static void
 swp_sizecheck(void)
 {
 
 	/* At or above the low watermark: swap is no longer "full". */
 	if (swap_pager_avail >= nswap_lowat) {
 		swap_pager_full = 0;
 		/* The almost-full flag clears only above the high watermark. */
 		if (swap_pager_avail > nswap_hiwat)
 			swap_pager_almost_full = 0;
 		return;
 	}
 
 	/* Below the low watermark: warn once per transition. */
 	if (swap_pager_almost_full == 0) {
 		printf("swap_pager: out of swap space\n");
 		swap_pager_almost_full = 1;
 	}
 }
 
 /*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run
  *	before much else so be careful what you depend on.  Most of the VM
  *	system has yet to be initialized at this point.
  */
 static void
 swap_pager_init(void)
 {
 	int lst;
 
 	/* Start with every hashed list of named swap objects empty. */
 	for (lst = 0; lst < NOBJLISTS; lst++)
 		TAILQ_INIT(&swap_pager_object_list[lst]);
 
 	/* Create the swap device mutex and the two sx locks. */
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 	sx_init(&sw_alloc_sx, "swspsx");
 	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
 /*
  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  *
  *	Expected to be started from pageout process once, prior to entering
  *	its main loop.
  */
 void
 swap_pager_swap_init(void)
 {
 	unsigned long n, n2;
 
 	/*
 	 * Number of in-transit swap bp operations.  Don't
 	 * exhaust the pbufs completely.  Make sure we
 	 * initialize workable values (0 will work for hysteresis
 	 * but it isn't very efficient).
 	 *
 	 * The nsw_cluster_max is constrained by the bp->b_pages[]
-	 * array (MAXPHYS/PAGE_SIZE) and our locally defined
+	 * array MAXPHYS / PAGE_SIZE and our locally defined
 	 * MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
 	 * constrained by the swap device interleave stripe size.
 	 *
 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
 	 * designed to prevent other I/O from having high latencies due to
 	 * our pageout I/O.  The value 4 works well for one or two active swap
 	 * devices but is probably a little low if you have more.  Even so,
 	 * a higher value would probably generate only a limited improvement
 	 * with three or four active swap devices since the system does not
 	 * typically have to pageout at extreme bandwidths.   We will want
 	 * at least 2 per swap device, and 4 is a pretty good value if you
 	 * have one NFS swap device due to the command/ack latency over NFS.
 	 * So it all works out pretty well.
 	 */
-	nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
+	nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
 
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
 	mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
 
 	/*
 	 * Separate pbuf zones for swap writes and swap reads, sized at a
 	 * quarter and a half of nswbuf respectively.
 	 */
 	swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4);
 	swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2);
 
 	/*
 	 * Initialize our zone, taking the user's requested size or
 	 * estimating the number we need based on the number of pages
 	 * in the system.
 	 */
 	n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
 	    vm_cnt.v_page_count / 2;
 	swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
 	    pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
 	if (swpctrie_zone == NULL)
 		panic("failed to create swap pctrie zone.");
 	swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
 	    NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM);
 	if (swblk_zone == NULL)
 		panic("failed to create swap blk zone.");
 	/* Remember the requested entry count so a reduction can be reported. */
 	n2 = n;
 	do {
 		if (uma_zone_reserve_kva(swblk_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
 		 * size of the previous attempt.
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
 
 	/*
 	 * Often uma_zone_reserve_kva() cannot reserve exactly the
 	 * requested size.  Account for the difference when
 	 * calculating swap_maxpages.
 	 */
 	n = uma_zone_get_max(swblk_zone);
 
 	if (n < n2)
 		printf("Swap blk zone entries changed from %lu to %lu.\n",
 		    n2, n);
 	/* Each swblk entry covers SWAP_META_PAGES pages. */
 	swap_maxpages = n * SWAP_META_PAGES;
 	swzone = n * sizeof(struct swblk);
 	/* The trie node zone is reserved with the same entry count. */
 	if (!uma_zone_reserve_kva(swpctrie_zone, n))
 		printf("Cannot reserve swap pctrie zone, "
 		    "reduce kern.maxswzone.\n");
 }
 
 /*
  * Create a new OBJT_SWAP object for swap_pager_alloc().
  *
  * When "cred" is not NULL, "size" bytes of swap are reserved against it
  * first; NULL is returned if that reservation is refused.  On success
  * the credential is held and recorded in the object together with the
  * charged size.
  */
 static vm_object_t
 swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
     vm_ooffset_t offset)
 {
 	vm_object_t object;
 	vm_pindex_t npages;
 
 	if (cred != NULL) {
 		if (swap_reserve_by_cred(size, cred) == 0)
 			return (NULL);
 		crhold(cred);
 	}
 
 	/*
 	 * The un_pager.swp.swp_blks trie is initialized by
 	 * vm_object_allocate() to ensure the correct order of
 	 * visibility to other threads.
 	 */
 	npages = OFF_TO_IDX(offset + PAGE_MASK + size);
 	object = vm_object_allocate(OBJT_SWAP, npages);
 
 	object->handle = handle;
 	if (cred == NULL)
 		return (object);
 	object->cred = cred;
 	object->charge = size;
 	return (object);
 }
 
 /*
  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
  *	OBJT_SWAP object.
  *
  *	This routine must ensure that no live duplicate is created for
  *	the named object request, which is protected against by
  *	holding the sw_alloc_sx lock in case handle != NULL.
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
 
 	/* Unnamed objects need neither the lookup nor the lock. */
 	if (handle == NULL)
 		return (swap_pager_alloc_init(handle, cred, size, offset));
 
 	/*
 	 * Reference existing named region or allocate new one.  There
 	 * should not be a race here against swp_pager_meta_build()
 	 * as called from vm_page_remove() in regards to the lookup
 	 * of the handle.
 	 */
 	sx_xlock(&sw_alloc_sx);
 	object = vm_pager_object_lookup(NOBJLIST(handle), handle);
 	if (object == NULL) {
 		object = swap_pager_alloc_init(handle, cred, size, offset);
 		if (object != NULL) {
 			TAILQ_INSERT_TAIL(NOBJLIST(object->handle), object,
 			    pager_object_list);
 		}
 	}
 	sx_xunlock(&sw_alloc_sx);
 	return (object);
 }
 
 /*
  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
  *
  *	The swap backing for the object is destroyed.  The code is
  *	designed such that we can reinstantiate it later, but this
  *	routine is typically called only when the entire object is
  *	about to be destroyed.
  *
  *	The object must be locked.
  */
 static void
 swap_pager_dealloc(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
 
 	/*
 	 * Remove from list right away so lookups will fail if we block for
 	 * pageout completion.  Only named objects (handle != NULL) are on a
 	 * list.  The object lock is dropped before sw_alloc_sx is taken and
 	 * reacquired after it has been released.
 	 */
 	if (object->handle != NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(object->handle), object,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/* NOTE(review): presumably sleeps until in-flight paging drains. */
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
 	 * Free all remaining metadata.  We only bother to free it from
 	 * the swap meta data.  We do not attempt to free swapblk's still
 	 * associated with vm_page_t's for this object.  We do not care
 	 * if paging is still in progress on some objects.
 	 */
 	swp_pager_meta_free_all(object);
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 }
 
 /************************************************************************
  *			SWAP PAGER BITMAP ROUTINES			*
  ************************************************************************/
 
 /*
  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
  *
  *	Allocate swap for up to the requested number of pages, and at
  *	least a minimum number of pages.  The starting swap block number
  *	(a page index) is returned or SWAPBLK_NONE if the allocation
  *	failed.
  *
  *	Also has the side effect of advising that somebody made a mistake
  *	when they configured swap and didn't configure enough.
  *
  *	This routine may not sleep.
  *
  *	We allocate in round-robin fashion from the configured devices.
  */
 static daddr_t
 swp_pager_getswapspace(int *io_npages, int limit)
 {
 	daddr_t blk;
 	struct swdevt *sp;
 	int mpages, npages;
 
 	blk = SWAPBLK_NONE;
 	/*
 	 * mpages and npages are the two page counts handed to
 	 * blist_alloc(); npages starts capped at BLIST_MAX_ALLOC.
 	 */
 	mpages = *io_npages;
 	npages = imin(BLIST_MAX_ALLOC, mpages);
 	mtx_lock(&sw_dev_mtx);
 	/* Resume the rotation at the device after the last one used. */
 	sp = swdevhd;
 	while (!TAILQ_EMPTY(&swtailq)) {
 		if (sp == NULL)
 			sp = TAILQ_FIRST(&swtailq);
 		/* Devices being closed are never allocated from. */
 		if ((sp->sw_flags & SW_CLOSING) == 0)
 			blk = blist_alloc(sp->sw_blist, &npages, mpages);
 		if (blk != SWAPBLK_NONE)
 			break;
 		sp = TAILQ_NEXT(sp, sw_list);
 		/*
 		 * Back at the starting device: every device has been
 		 * tried at this size.  Give up once npages is down to
 		 * "limit"; otherwise halve the request and go around again.
 		 */
 		if (swdevhd == sp) {
 			if (npages <= limit)
 				break;
 			mpages = npages - 1;
 			npages >>= 1;
 		}
 	}
 	if (blk != SWAPBLK_NONE) {
 		/* Report the size obtained and bias blk by the device base. */
 		*io_npages = npages;
 		blk += sp->sw_first;
 		sp->sw_used += npages;
 		swap_pager_avail -= npages;
 		swp_sizecheck();
 		swdevhd = TAILQ_NEXT(sp, sw_list);
 	} else {
 		/* Log the failure only on the transition into the full state. */
 		if (swap_pager_full != 2) {
 			printf("swp_pager_getswapspace(%d): failed\n",
 			    *io_npages);
 			swap_pager_full = 2;
 			swap_pager_almost_full = 1;
 		}
 		swdevhd = NULL;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (blk);
 }
 
 /*
  * Report whether swap block "blk" lies in the half-open block range
  * [sw_first, sw_end) of device "sp".
  */
 static bool
 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
 {
 
 	if (blk < sp->sw_first)
 		return (false);
 	return (blk < sp->sw_end);
 }
 
 /*
  * Hand buffer "bp" to the strategy routine of the swap device whose block
  * range contains bp->b_blkno.  Panics if no configured device matches.
  */
 static void
 swp_pager_strategy(struct buf *bp)
 {
 	struct swdevt *dev;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(bp->b_blkno, dev))
 			continue;
 		mtx_unlock(&sw_dev_mtx);
 		if ((dev->sw_flags & SW_UNMAPPED) == 0 ||
 		    !unmapped_buf_allowed) {
 			/* Mapped I/O: enter the pages at bp->b_data. */
 			pmap_qenter((vm_offset_t)bp->b_data,
 			    &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
 		} else {
 			/* Unmapped I/O: no kernel mapping is set up. */
 			bp->b_data = unmapped_buf;
 			bp->b_offset = 0;
 		}
 		dev->sw_strategy(bp, dev);
 		return;
 	}
 	panic("Swapdev not found");
 }
 
 
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_freeswapspace(daddr_t blk, daddr_t npages)
 {
 	struct swdevt *dev;
 
 	/* An empty range needs no work (and no device lookup). */
 	if (npages == 0)
 		return;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(blk, dev))
 			continue;
 		dev->sw_used -= npages;
 		/*
 		 * If we are attempting to stop swapping on
 		 * this device, we don't want to mark any
 		 * blocks free lest they be reused.
 		 */
 		if ((dev->sw_flags & SW_CLOSING) == 0) {
 			blist_free(dev->sw_blist, blk - dev->sw_first,
 			    npages);
 			swap_pager_avail += npages;
 			swp_sizecheck();
 		}
 		mtx_unlock(&sw_dev_mtx);
 		return;
 	}
 	panic("Swapdev not found");
 }
 
 /*
  * SYSCTL_SWAP_FRAGMENTATION() -	produce raw swap space stats
  */
 static int
 sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct swdevt *sp;
 	const char *devname;
 	int error;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (vn_isdisk(sp->sw_vp, NULL))
 			devname = devtoname(sp->sw_vp->v_rdev);
 		else
 			devname = "[file]";
 		sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
 		blist_stats(sp->sw_blist, &sbuf);
 	}
 	mtx_unlock(&sw_dev_mtx);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
  *	This is a globally accessible routine.
  *
  *	This routine removes swapblk assignments from swap metadata.
  *
  *	The external callers of this routine typically have already destroyed
  *	or renamed vm_page_t's associated with this range in the object so
  *	we should be ok.
  *
  *	The object must be locked.
  */
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 
 	/* Exported wrapper: the work is done by swp_pager_meta_free(). */
 	swp_pager_meta_free(object, start, size);
 }
 
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
  *	Assigns swap blocks to the specified range within the object.  The
  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  *
  *	Returns 0 on success, -1 on failure.
  */
 int
 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 	daddr_t addr, blk, n_free, s_free;
 	int i, j, n;
 
 	/*
 	 * Blocks displaced by the new assignments are batched in
 	 * (s_free, n_free) and returned to the allocator in runs.
 	 */
 	swp_pager_init_freerange(&s_free, &n_free);
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += n) {
 		/* Ask for everything still missing; n returns what we got. */
 		n = size - i;
 		blk = swp_pager_getswapspace(&n, 1);
 		if (blk == SWAPBLK_NONE) {
 			/*
 			 * Out of swap: undo the assignments made so far and
 			 * also release the displaced blocks still batched in
 			 * the pending free range, which would otherwise be
 			 * leaked.
 			 */
 			swp_pager_meta_free(object, start, i);
 			swp_pager_freeswapspace(s_free, n_free);
 			VM_OBJECT_WUNLOCK(object);
 			return (-1);
 		}
 		for (j = 0; j < n; ++j) {
 			/* A return other than SWAPBLK_NONE is the old block. */
 			addr = swp_pager_meta_build(object,
 			    start + i + j, blk + j);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&s_free, &n_free,
 				    addr);
 		}
 	}
 	/* Flush whatever is left in the batched free range. */
 	swp_pager_freeswapspace(s_free, n_free);
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  *			and destroy the source.
  *
  *	Copy any valid swapblks from the source to the destination.  In
  *	cases where both the source and destination have a valid swapblk,
  *	we keep the destination's.
  *
  *	This routine is allowed to sleep.  It may sleep allocating metadata
  *	indirectly through swp_pager_meta_build() or if paging is still in
  *	progress on the source.
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
  *	The source object is of type OBJT_SWAP.
  *
  *	The source and destination objects must be locked.
  *	Both object locks may temporarily be released.
  */
 void
 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t offset, int destroysource)
 {
 	vm_pindex_t i;
 	daddr_t dstaddr, n_free, s_free, srcaddr;
 
 	VM_OBJECT_ASSERT_WLOCKED(srcobject);
 	VM_OBJECT_ASSERT_WLOCKED(dstobject);
 
 	/*
 	 * If destroysource is set, we remove the source object from the
 	 * swap_pager internal queue now.  Both object locks are dropped
 	 * while sw_alloc_sx is held; each object's paging-in-progress count
 	 * is raised before its unlock and lowered after the relock.
 	 */
 	if (destroysource && srcobject->handle != NULL) {
 		vm_object_pip_add(srcobject, 1);
 		VM_OBJECT_WUNLOCK(srcobject);
 		vm_object_pip_add(dstobject, 1);
 		VM_OBJECT_WUNLOCK(dstobject);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(dstobject);
 		vm_object_pip_wakeup(dstobject);
 		VM_OBJECT_WLOCK(srcobject);
 		vm_object_pip_wakeup(srcobject);
 	}
 
 	/*
 	 * Transfer source to destination.  Destination index i corresponds
 	 * to source index i + offset.  Source blocks that are not needed
 	 * are batched in (s_free, n_free) and freed in runs.
 	 */
 	swp_pager_init_freerange(&s_free, &n_free);
 	for (i = 0; i < dstobject->size; ++i) {
 		/* SWM_POP removes the block from the source metadata. */
 		srcaddr = swp_pager_meta_ctl(srcobject, i + offset, SWM_POP);
 		if (srcaddr == SWAPBLK_NONE)
 			continue;
 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
 		if (dstaddr != SWAPBLK_NONE) {
 			/*
 			 * Destination has valid swapblk or it is represented
 			 * by a resident page.  We destroy the source block.
 			 */
 			swp_pager_update_freerange(&s_free, &n_free, srcaddr);
 			continue;
 		}
 
 		/*
 		 * Destination has no swapblk and is not resident,
 		 * copy source.
 		 *
 		 * swp_pager_meta_build() can sleep.
 		 */
 		vm_object_pip_add(srcobject, 1);
 		VM_OBJECT_WUNLOCK(srcobject);
 		vm_object_pip_add(dstobject, 1);
 		dstaddr = swp_pager_meta_build(dstobject, i, srcaddr);
 		KASSERT(dstaddr == SWAPBLK_NONE,
 		    ("Unexpected destination swapblk"));
 		vm_object_pip_wakeup(dstobject);
 		VM_OBJECT_WLOCK(srcobject);
 		vm_object_pip_wakeup(srcobject);
 	}
 	/* Flush whatever is left in the batched free range. */
 	swp_pager_freeswapspace(s_free, n_free);
 
 	/*
 	 * Free left over swap blocks in source.
 	 *
 	 * We have to revert the type to OBJT_DEFAULT so we do not accidentally
 	 * double-remove the object from the swap queues.
 	 */
 	if (destroysource) {
 		swp_pager_meta_free_all(srcobject);
 		/*
 		 * Reverting the type is not necessary, the caller is going
 		 * to destroy srcobject directly, but I'm doing it here
 		 * for consistency since we've removed the object from its
 		 * queues.
 		 */
 		srcobject->type = OBJT_DEFAULT;
 	}
 }
 
 /*
  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
  *				the requested page.
  *
  *	We determine whether good backing store exists for the requested
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
  *	store exists before and after the requested page.
  */
 static boolean_t
 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
 {
 	daddr_t base;
 	int n;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 
 	/*
 	 * No swap block at the requested index: nothing is contiguous
 	 * with it either.
 	 */
 	base = swp_pager_meta_ctl(object, pindex, 0);
 	if (base == SWAPBLK_NONE) {
 		if (before != NULL)
 			*before = 0;
 		if (after != NULL)
 			*after = 0;
 		return (FALSE);
 	}
 
 	/*
 	 * Count preceding pages whose swap blocks run contiguously up to
 	 * "base", without stepping below page index 0.
 	 */
 	if (before != NULL) {
 		for (n = 1; n < SWB_NPAGES && n <= pindex; n++) {
 			if (swp_pager_meta_ctl(object, pindex - n, 0) !=
 			    base - n)
 				break;
 		}
 		*before = n - 1;
 	}
 
 	/*
 	 * Count following pages whose swap blocks continue contiguously
 	 * after "base".
 	 */
 	if (after != NULL) {
 		for (n = 1; n < SWB_NPAGES; n++) {
 			if (swp_pager_meta_ctl(object, pindex + n, 0) !=
 			    base + n)
 				break;
 		}
 		*after = n - 1;
 	}
 	return (TRUE);
 }
 
 /*
  * SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
  *
  *	This removes any associated swap backing store, whether valid or
  *	not, from the page.
  *
  *	This routine is typically called when a page is made dirty, at
  *	which point any associated swap can be freed.  MADV_FREE also
  *	calls us in a special-case situation
  *
  *	NOTE!!!  If the page is clean and the swap was valid, the caller
  *	should make the page dirty before calling this routine.  This routine
  *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
  *	depends on it.
  *
  *	This routine may not sleep.
  *
  *	The object containing the page must be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	daddr_t blk;
 
 	/* Detach the page's swap block, if any, from the object's metadata. */
 	blk = swp_pager_meta_ctl(m->object, m->pindex, SWM_POP);
 	if (blk == SWAPBLK_NONE)
 		return;
 
 	/* SWM_POP only removes the assignment; release the block itself. */
 	swp_pager_freeswapspace(blk, 1);
 }
 
 /*
  * swap_pager_getpages() - bring pages in from swap
  *
  *	Attempt to page in the pages in array "ma" of length "count".  The
  *	caller may optionally specify that additional pages preceding and
  *	succeeding the specified range be paged in.  The number of such pages
  *	is returned in the "rbehind" and "rahead" parameters, and they will
  *	be in the inactive queue upon return.
  *
  *	The pages in "ma" must be busied and will remain busied upon return.
  */
 static int
 swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
 	struct buf *bp;
 	vm_page_t bm, mpred, msucc, p;
 	vm_pindex_t pindex;
 	daddr_t blk;
 	int i, maxahead, maxbehind, reqcount;
 
 	/*
 	 * Remember how many pages the caller asked for; "count" grows below
 	 * to include any read-behind and read-ahead pages.
 	 */
 	reqcount = count;
 
 	/*
 	 * Determine the final number of read-behind pages and
 	 * allocate them BEFORE releasing the object lock.  Otherwise,
 	 * there can be a problematic race with vm_object_split().
 	 * Specifically, vm_object_split() might first transfer pages
 	 * that precede ma[0] in the current object to a new object,
 	 * and then this function incorrectly recreates those pages as
 	 * read-behind pages in the current object.
 	 */
 	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
 		return (VM_PAGER_FAIL);
 
 	/*
 	 * Clip the readahead and readbehind ranges to exclude resident pages.
 	 * The neighbours of the requested pages on the object's page list
 	 * bound how far each range may extend.
 	 */
 	if (rahead != NULL) {
 		KASSERT(reqcount - 1 <= maxahead,
 		    ("page count %d extends beyond swap block", reqcount));
 		*rahead = imin(*rahead, maxahead - (reqcount - 1));
 		pindex = ma[reqcount - 1]->pindex;
 		msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
 		if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
 			*rahead = msucc->pindex - pindex - 1;
 	}
 	if (rbehind != NULL) {
 		*rbehind = imin(*rbehind, maxbehind);
 		pindex = ma[0]->pindex;
 		mpred = TAILQ_PREV(ma[0], pglist, listq);
 		if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
 			*rbehind = pindex - mpred->pindex - 1;
 	}
 
 	/*
 	 * bm tracks the lowest-indexed page of the I/O.  It starts at ma[0]
 	 * and moves down as read-behind pages are allocated.
 	 */
 	bm = ma[0];
 	for (i = 0; i < count; i++)
 		ma[i]->oflags |= VPO_SWAPINPROG;
 
 	/*
 	 * Allocate readahead and readbehind pages.  A failed allocation
 	 * simply shortens the corresponding range.
 	 */
 	if (rbehind != NULL) {
 		for (i = 1; i <= *rbehind; i++) {
 			p = vm_page_alloc(object, ma[0]->pindex - i,
 			    VM_ALLOC_NORMAL);
 			if (p == NULL)
 				break;
 			p->oflags |= VPO_SWAPINPROG;
 			bm = p;
 		}
 		*rbehind = i - 1;
 	}
 	if (rahead != NULL) {
 		for (i = 0; i < *rahead; i++) {
 			p = vm_page_alloc(object,
 			    ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				break;
 			p->oflags |= VPO_SWAPINPROG;
 		}
 		*rahead = i;
 	}
 	if (rbehind != NULL)
 		count += *rbehind;
 	if (rahead != NULL)
 		count += *rahead;
 
 	/*
 	 * One paging-in-progress reference per page in the I/O; they are
 	 * dropped by swp_pager_async_iodone() via vm_object_pip_wakeupn().
 	 */
 	vm_object_pip_add(object, count);
 
 	/* The I/O starts at the swap block backing the lowest page. */
 	pindex = bm->pindex;
 	blk = swp_pager_meta_ctl(object, pindex, 0);
 	KASSERT(blk != SWAPBLK_NONE,
 	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
 
 	VM_OBJECT_WUNLOCK(object);
 	bp = uma_zalloc(swrbuf_zone, M_WAITOK);
 	/* Pages cannot leave the object while busy. */
 	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
 		MPASS(p->pindex == bm->pindex + i);
 		bp->b_pages[i] = p;
 	}
 
 	bp->b_flags |= B_PAGING;
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
 	bp->b_blkno = blk;
 	bp->b_bcount = PAGE_SIZE * count;
 	bp->b_bufsize = PAGE_SIZE * count;
 	bp->b_npages = count;
 	/*
 	 * Record the read-behind/read-ahead counts so that the completion
 	 * routine can tell the extra pages from the requested ones.
 	 */
 	bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
 	bp->b_pgafter = rahead != NULL ? *rahead : 0;
 
 	VM_CNT_INC(v_swapin);
 	VM_CNT_ADD(v_swappgsin, count);
 
 	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
 	 * The other pages in our ma[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 	 */
 	BUF_KERNPROC(bp);
 	swp_pager_strategy(bp);
 
 	/*
 	 * Wait for the pages we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the metadata for each page in the request.
 	 */
 	VM_OBJECT_WLOCK(object);
 	while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
 		/* Ask the completion routine to wake us up. */
 		ma[0]->oflags |= VPO_SWAPSLEEP;
 		VM_CNT_INC(v_intrans);
 		/*
 		 * NOTE(review): the comment above says bp cannot be
 		 * considered valid once the I/O has been started, yet this
 		 * diagnostic dereferences bp.  Confirm that this is safe on
 		 * the timeout path.
 		 */
 		if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
 		    "swread", hz * 20)) {
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 		}
 	}
 
 	/*
 	 * If we had an unrecoverable read error pages will not be valid.
 	 * Only the pages the caller requested are checked.
 	 */
 	for (i = 0; i < reqcount; i++)
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			return (VM_PAGER_ERROR);
 
 	return (VM_PAGER_OK);
 
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
 	 * the page clean when we return, causing the page to possibly revert
 	 * to all zeros later.
 	 */
 }
 
 /*
  * 	swap_pager_getpages_async():
  *
  *	Right now this is emulation of asynchronous operation on top of
  *	swap_pager_getpages().
  */
 static int
 swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
     int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
 {
 	int error, rv;
 
 	/* Do the read synchronously, then report it through the callback. */
 	rv = swap_pager_getpages(object, ma, count, rbehind, rahead);
 	VM_OBJECT_WUNLOCK(object);
 
 	/* Translate the pager status into the errno passed to iodone. */
 	if (rv == VM_PAGER_OK)
 		error = 0;
 	else if (rv == VM_PAGER_ERROR)
 		error = EIO;
 	else if (rv == VM_PAGER_FAIL)
 		error = EINVAL;
 	else
 		panic("unhandled swap_pager_getpages() error %d", rv);
 
 	/* The callback runs without the object lock held. */
 	(iodone)(arg, ma, count, error);
 	VM_OBJECT_WLOCK(object);
 
 	return (rv);
 }
 
 /*
  *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
  *
  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
  *	are automatically converted to SWAP objects.
  *
  *	In a low memory situation we may block in VOP_STRATEGY(), but the new
  *	vm_page reservation system coupled with properly written VFS devices
  *	should ensure that no low-memory deadlock occurs.  This is an area
  *	which needs work.
  *
  *	The parent has N vm_object_pip_add() references prior to
  *	calling us and will remove references for rtvals[] that are
  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
  *	completion.
  *
  *	The parent has soft-busy'd the pages it passes us and will unbusy
  *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
 static void
 swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
     int flags, int *rtvals)
 {
 	int i, n;
 	boolean_t sync;
 	daddr_t addr, n_free, s_free;
 
 	/*
 	 * s_free/n_free accumulate swap blocks displaced by the new
 	 * assignments below; they are released in one call before returning.
 	 */
 	swp_pager_init_freerange(&s_free, &n_free);
 	if (count && ma[0]->object != object) {
 		panic("swap_pager_putpages: object mismatch %p/%p",
 		    object,
 		    ma[0]->object
 		);
 	}
 
 	/*
 	 * Step 1
 	 *
 	 * Turn object into OBJT_SWAP
 	 * check for bogus sysops
 	 * force sync if not pageout process
 	 */
 	if (object->type != OBJT_SWAP) {
 		/*
 		 * Called with SWAPBLK_NONE purely for its side effect of
 		 * converting the object to OBJT_SWAP.
 		 */
 		addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 		KASSERT(addr == SWAPBLK_NONE,
 		    ("unexpected object swap block"));
 	}
 	/*
 	 * The object lock is dropped here; it is retaken briefly for each
 	 * cluster below and again just before returning.
 	 */
 	VM_OBJECT_WUNLOCK(object);
 
 	n = 0;
 	if (curproc != pageproc)
 		sync = TRUE;
 	else
 		sync = (flags & VM_PAGER_PUT_SYNC) != 0;
 
 	/*
 	 * Step 2
 	 *
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
 	 */
 	for (i = 0; i < count; i += n) {
 		int j;
 		struct buf *bp;
 		daddr_t blk;
 
 		/* Maximum I/O size is limited by maximum swap block size. */
 		n = min(count - i, nsw_cluster_max);
 
 		/* Get a block of swap of size up to size n. */
 		blk = swp_pager_getswapspace(&n, 4);
 		if (blk == SWAPBLK_NONE) {
 			/*
 			 * Fail the whole cluster and move on.  Assumes
 			 * swp_pager_getswapspace() leaves n untouched on
 			 * failure -- TODO confirm.
 			 */
 			for (j = 0; j < n; ++j)
 				rtvals[i+j] = VM_PAGER_FAIL;
 			continue;
 		}
 
 		/*
 		 * All I/O parameters have been satisfied, build the I/O
 		 * request and assign the swap space.
 		 */
 		if (sync != TRUE) {
 			/*
 			 * Throttle async writes: wait until nsw_wcount_async
 			 * is nonzero and consume one slot.  The slot is given
 			 * back in swp_pager_async_iodone().
 			 */
 			mtx_lock(&swbuf_mtx);
 			while (nsw_wcount_async == 0)
 				msleep(&nsw_wcount_async, &swbuf_mtx, PVM,
 				    "swbufa", 0);
 			nsw_wcount_async--;
 			mtx_unlock(&swbuf_mtx);
 		}
 		bp = uma_zalloc(swwbuf_zone, M_WAITOK);
 		if (sync != TRUE)
 			bp->b_flags = B_ASYNC;
 		bp->b_flags |= B_PAGING;
 		bp->b_iocmd = BIO_WRITE;
 
 		bp->b_rcred = crhold(thread0.td_ucred);
 		bp->b_wcred = crhold(thread0.td_ucred);
 		bp->b_bcount = PAGE_SIZE * n;
 		bp->b_bufsize = PAGE_SIZE * n;
 		bp->b_blkno = blk;
 
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
 			vm_page_t mreq = ma[i+j];
 
 			/*
 			 * Record the new swap block for this page.  Any block
 			 * it replaces is queued for freeing.
 			 */
 			addr = swp_pager_meta_build(mreq->object, mreq->pindex,
 			    blk + j);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&s_free, &n_free,
 				    addr);
 			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
 		VM_OBJECT_WUNLOCK(object);
 		bp->b_npages = n;
 		/*
 		 * Must set dirty range for NFS to work.
 		 */
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bcount;
 
 		VM_CNT_INC(v_swapout);
 		VM_CNT_ADD(v_swappgsout, bp->b_npages);
 
 		/*
 		 * We unconditionally set rtvals[] to VM_PAGER_PEND so that we
 		 * can call the async completion routine at the end of a
 		 * synchronous I/O operation.  Otherwise, our caller would
 		 * perform duplicate unbusy and wakeup operations on the page
 		 * and object, respectively.
 		 */
 		for (j = 0; j < n; j++)
 			rtvals[i + j] = VM_PAGER_PEND;
 
 		/*
 		 * asynchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 		 */
 		if (sync == FALSE) {
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
 			continue;
 		}
 
 		/*
 		 * synchronous
 		 *
 		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 		 */
 		bp->b_iodone = bdone;
 		swp_pager_strategy(bp);
 
 		/*
 		 * Wait for the sync I/O to complete.
 		 */
 		bwait(bp, PVM, "swwrt");
 
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
 		 */
 		swp_pager_async_iodone(bp);
 	}
 	/* Return with the object lock held, as on entry. */
 	VM_OBJECT_WLOCK(object);
 	swp_pager_freeswapspace(s_free, n_free);
 }
 
 /*
  *	swp_pager_async_iodone:
  *
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_async_iodone(struct buf *bp)
 {
 	int i;
 	vm_object_t object = NULL;
 
 	/*
 	 * Report error - unless we ran out of memory, in which case
 	 * we've already logged it in swapgeom_strategy().
 	 */
 	if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) {
 		printf(
 		    "swap_pager: I/O error - %s failed; blkno %ld,"
 			"size %ld, error %d\n",
 		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 		    (long)bp->b_blkno,
 		    (long)bp->b_bcount,
 		    bp->b_error
 		);
 	}
 
 	/*
 	 * remove the mapping for kernel virtual
 	 */
 	if (buf_mapped(bp))
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 	else
 		bp->b_data = bp->b_kvabase;
 
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/*
 	 * cleanup pages.  If an error occurs writing to swap, we are in
 	 * very serious trouble.  If it happens to be a disk error, though,
 	 * we may be able to recover by reassigning the swap later on.  So
 	 * in this case we remove the m->swapblk assignment for the page
 	 * but do not free it in the rlist.  The errornous block(s) are thus
 	 * never reallocated as swap.  Redirty the page and continue.
 	 */
 	for (i = 0; i < bp->b_npages; ++i) {
 		vm_page_t m = bp->b_pages[i];
 
 		m->oflags &= ~VPO_SWAPINPROG;
 		if (m->oflags & VPO_SWAPSLEEP) {
 			m->oflags &= ~VPO_SWAPSLEEP;
 			wakeup(&object->paging_in_progress);
 		}
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
 			 * away without freeing it back to swapspace, so it
 			 * can never be used again.  But I can't from an
 			 * interrupt.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				m->valid = 0;
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
 				MPASS(m->dirty == VM_PAGE_BITS_ALL);
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);
 				vm_page_sunbusy(m);
 			}
 		} else if (bp->b_iocmd == BIO_READ) {
 			/*
 			 * NOTE: for reads, m->dirty will probably be
 			 * overridden by the original caller of getpages so
 			 * we cannot set them in order to free the underlying
 			 * swap in a low-swap situation.  I don't think we'd
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
 			KASSERT(m->dirty == 0,
 			    ("swp_pager_async_iodone: page %p is dirty", m));
 
 			m->valid = VM_PAGE_BITS_ALL;
 			if (i < bp->b_pgbefore ||
 			    i >= bp->b_npages - bp->b_pgafter)
 				vm_page_readahead_finish(m);
 		} else {
 			/*
 			 * For write success, clear the dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
 			 * A page is only written to swap after a period of
 			 * inactivity.  Therefore, we do not expect it to be
 			 * reused.
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
 			vm_page_lock(m);
 			vm_page_deactivate_noreuse(m);
 			vm_page_unlock(m);
 			vm_page_sunbusy(m);
 		}
 	}
 
 	/*
 	 * adjust pip.  NOTE: the original parent may still have its own
 	 * pip refs on the object.
 	 */
 	if (object != NULL) {
 		vm_object_pip_wakeupn(object, bp->b_npages);
 		VM_OBJECT_WUNLOCK(object);
 	}
 
 	/*
 	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling
 	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
 	 * trigger a KASSERT in relpbuf().
 	 */
 	if (bp->b_vp) {
 		    bp->b_vp = NULL;
 		    bp->b_bufobj = NULL;
 	}
 	/*
 	 * release the physical I/O buffer
 	 */
 	if (bp->b_flags & B_ASYNC) {
 		mtx_lock(&swbuf_mtx);
 		if (++nsw_wcount_async == 1)
 			wakeup(&nsw_wcount_async);
 		mtx_unlock(&swbuf_mtx);
 	}
 	uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp);
 }
 
 int
 swap_pager_nswapdev(void)
 {
 	int ndevs;
 
 	/* Report the current value of the nswapdev counter. */
 	ndevs = nswapdev;
 	return (ndevs);
 }
 
 /*
  * Mark a page dirty, release its exclusive busy state and then drop any
  * swap block still assigned to it.
  */
 static void
 swp_pager_force_dirty(vm_page_t m)
 {
 
 	/* Dirty the page before its swap copy is released below. */
 	vm_page_dirty(m);
 #ifdef INVARIANTS
 	/* Sanity check: the page must be either wired or on a page queue. */
 	vm_page_lock(m);
 	if (!vm_page_wired(m) && m->queue == PQ_NONE)
 		panic("page %p is neither wired nor queued", m);
 	vm_page_unlock(m);
 #endif
 	vm_page_xunbusy(m);
 	/* Pop the page's swap block from the metadata and free it. */
 	swap_pager_unswapped(m);
 }
 
 /*
  * Mark a page dirty, hand it to vm_page_launder(), release its exclusive
  * busy state and then drop any swap block still assigned to it.
  */
 static void
 swp_pager_force_launder(vm_page_t m)
 {
 
 	/* Dirty the page before its swap copy is released below. */
 	vm_page_dirty(m);
 	/* vm_page_launder() is called with the page lock held. */
 	vm_page_lock(m);
 	vm_page_launder(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 	/* Pop the page's swap block from the metadata and free it. */
 	swap_pager_unswapped(m);
 }
 
 /*
  * SWP_PAGER_FORCE_PAGEIN() - force swap blocks to be paged in
  *
  *	This routine dissociates pages starting at the given index within an
  *	object from their backing store, paging them in if they do not reside
  *	in memory.  Pages that are paged in are marked dirty and placed in the
  *	laundry queue.  Pages are marked dirty because they no longer have
  *	backing store.  They are placed in the laundry queue because they have
  *	not been accessed recently.  Otherwise, they would already reside in
  *	memory.
  */
 static void
 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex, int npages)
 {
 	/* Variable-length array; npages is bounded by the KASSERT below. */
 	vm_page_t ma[npages];
 	int i, j;
 
 	KASSERT(npages > 0, ("%s: No pages", __func__));
 	KASSERT(npages <= MAXPHYS / PAGE_SIZE,
 	    ("%s: Too many pages: %d", __func__, npages));
 	/* Paging-in-progress references are held for the whole operation. */
 	vm_object_pip_add(object, npages);
 	vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL, ma, npages);
 	/*
 	 * i scans the pages; j trails it and marks the start of the current
 	 * run [j, i) of pages that are not fully valid.  A fully valid page,
 	 * or the end of the array, terminates the run.
 	 */
 	for (i = j = 0;; i++) {
 		/* Count nonresident pages, to page-in all at once. */
 		if (i < npages && ma[i]->valid != VM_PAGE_BITS_ALL)
 			continue;
 		if (j < i) {
 			/* Page-in nonresident pages. Mark for laundering. */
 			if (swap_pager_getpages(object, &ma[j], i - j, NULL,
 			    NULL) != VM_PAGER_OK)
 				panic("%s: read from swap failed", __func__);
 			do {
 				swp_pager_force_launder(ma[j]);
 			} while (++j < i);
 		}
 		if (i == npages)
 			break;
 		/* Mark dirty a resident page.  Here j == i, so j catches up. */
 		swp_pager_force_dirty(ma[j++]);
 	}
 	vm_object_pip_wakeupn(object, npages);
 }
 
 /*
  *	swap_pager_swapoff_object:
  *
  *	Page in all of the pages that have been paged out for an object
  *	to a swap device.
  */
 static void
 swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object)
 {
 	struct swblk *sb;
 	vm_pindex_t pi, s_pindex;
 	daddr_t blk, n_blks, s_blk;
 	int i;
 
 	/*
 	 * n_blks is the length of the run currently being accumulated;
 	 * s_blk and s_pindex are its first swap block and page index and
 	 * are only meaningful while n_blks > 0.
 	 */
 	n_blks = 0;
 	for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
 	    &object->un_pager.swp.swp_blks, pi)) != NULL; ) {
 		for (i = 0; i < SWAP_META_PAGES; i++) {
 			/* Blocks on other devices are treated as absent. */
 			blk = sb->d[i];
 			if (!swp_pager_isondev(blk, sp))
 				blk = SWAPBLK_NONE;
 
 			/*
 			 * If there are no blocks/pages accumulated, start a new
 			 * accumulation here.
 			 */
 			if (n_blks == 0) {
 				if (blk != SWAPBLK_NONE) {
 					s_blk = blk;
 					s_pindex = sb->p + i;
 					n_blks = 1;
 				}
 				continue;
 			}
 
 			/*
 			 * If the accumulation can be extended without breaking
 			 * the sequence of consecutive blocks and pages that
 			 * swp_pager_force_pagein() depends on, do so.
 			 */
 			if (n_blks < MAXPHYS / PAGE_SIZE &&
 			    s_blk + n_blks == blk &&
 			    s_pindex + n_blks == sb->p + i) {
 				++n_blks;
 				continue;
 			}
 
 			/*
 			 * The sequence of consecutive blocks and pages cannot
 			 * be extended, so page them all in here.  Then,
 			 * because doing so involves releasing and reacquiring
 			 * a lock that protects the swap block pctrie, do not
 			 * rely on the current swap block.  Break this loop and
 			 * re-fetch the same pindex from the pctrie again.
 			 */
 			swp_pager_force_pagein(object, s_pindex, n_blks);
 			n_blks = 0;
 			break;
 		}
 		/*
 		 * Advance to the next swblk only when this one was scanned to
 		 * the end; after a page-in, pi is unchanged and the lookup is
 		 * repeated.
 		 */
 		if (i == SWAP_META_PAGES)
 			pi = sb->p + SWAP_META_PAGES;
 	}
 	/* Page in whatever run was still being accumulated. */
 	if (n_blks > 0)
 		swp_pager_force_pagein(object, s_pindex, n_blks);
 }
 
 /*
  *	swap_pager_swapoff:
  *
  *	Page in all of the pages that have been paged out to the
  *	given device.  The corresponding blocks in the bitmap must be
  *	marked as allocated and the device must be flagged SW_CLOSING.
  *	There may be no processes swapped out to the device.
  *
  *	This routine may block.
  */
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
 	vm_object_t object;
 	int retries;
 
 	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 
 	retries = 0;
 full_rescan:
 	/*
 	 * Walk the global object list.  The list mutex is dropped while an
 	 * object is processed and retaken before moving to the next one.
 	 */
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		/* Unlocked pre-check; repeated below under the object lock. */
 		if (object->type != OBJT_SWAP)
 			continue;
 		mtx_unlock(&vm_object_list_mtx);
 		/* Depends on type-stability. */
 		VM_OBJECT_WLOCK(object);
 
 		/*
 		 * Dead objects are eventually terminated on their own.
 		 */
 		if ((object->flags & OBJ_DEAD) != 0)
 			goto next_obj;
 
 		/*
 		 * Sync with fences placed after pctrie
 		 * initialization.  We must not access pctrie below
 		 * unless we checked that our object is swap and not
 		 * dead.
 		 */
 		atomic_thread_fence_acq();
 		if (object->type != OBJT_SWAP)
 			goto next_obj;
 
 		swap_pager_swapoff_object(sp, object);
 next_obj:
 		VM_OBJECT_WUNLOCK(object);
 		mtx_lock(&vm_object_list_mtx);
 	}
 	mtx_unlock(&vm_object_list_mtx);
 
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
 		 * removed, so we will miss their pages and need to
 		 * make another pass.  We have marked this device as
 		 * SW_CLOSING, so the activity should finish soon.
 		 */
 		retries++;
 		/* Give up after more than 100 passes. */
 		if (retries > 100) {
 			panic("swapoff: failed to locate %d swap blocks",
 			    sp->sw_used);
 		}
 		/* Pause for hz / 20 ticks between passes. */
 		pause("swpoff", hz / 20);
 		goto full_rescan;
 	}
 	/* No blocks remain in use on the device; notify swapoff listeners. */
 	EVENTHANDLER_INVOKE(swapoff, sp);
 }
 
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
  *
  *	These routines manipulate the swap metadata stored in the
  *	OBJT_SWAP object.
  *
  *	Swap metadata is kept in a per-object pctrie (un_pager.swp.swp_blks)
  *	of swblk structures.  Each swblk covers SWAP_META_PAGES consecutive
  *	page indices and records the swap block assigned to each of them.
  */
 
 /*
  * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free?
  */
 static bool
 swp_pager_swblk_empty(struct swblk *sb, int start, int limit)
 {
 	int idx;
 
 	MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES);
 
 	/* Skip over unassigned slots; stop at the first assigned one. */
 	idx = start;
 	while (idx < limit && sb->d[idx] == SWAPBLK_NONE)
 		idx++;
 
 	/* The range [start, limit) is empty iff the scan ran to the end. */
 	return (idx >= limit);
 }
    
 /*
  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
  *
  *	We first convert the object to a swap object if it is a default
  *	object.
  *
  *	The specified swapblk is added to the object's swap metadata.  If
  *	the swapblk is not valid, it is freed instead.  Any previously
  *	assigned swapblk is returned.
  */
 static daddr_t
 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 {
 	/* Latches so that zone exhaustion and recovery are logged once each. */
 	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
 	struct swblk *sb, *sb1;
 	vm_pindex_t modpi, rdpi;
 	daddr_t prev_swapblk;
 	int error, i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Convert default object to swap object if necessary
 	 */
 	if (object->type != OBJT_SWAP) {
 		pctrie_init(&object->un_pager.swp.swp_blks);
 
 		/*
 		 * Ensure that swap_pager_swapoff()'s iteration over
 		 * object_list does not see a garbage pctrie.
 		 */
 		atomic_thread_fence_rel();
 
 		object->type = OBJT_SWAP;
 		KASSERT(object->handle == NULL, ("default pager with handle"));
 	}
 
 	/* rdpi is the index of the first page covered by the swblk. */
 	rdpi = rounddown(pindex, SWAP_META_PAGES);
 	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
 	if (sb == NULL) {
 		/* Clearing an entry that does not exist is a no-op. */
 		if (swapblk == SWAPBLK_NONE)
 			return (SWAPBLK_NONE);
 		/*
 		 * Allocate a swblk without sleeping while the object lock is
 		 * held.  On failure, drop the lock, wait, and retry.
 		 */
 		for (;;) {
 			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
 			    pageproc ? M_USE_RESERVE : 0));
 			if (sb != NULL) {
 				sb->p = rdpi;
 				for (i = 0; i < SWAP_META_PAGES; i++)
 					sb->d[i] = SWAPBLK_NONE;
 				/* Log recovery if exhaustion was reported. */
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
 				    1, 0))
 					printf("swblk zone ok\n");
 				break;
 			}
 			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swblk_zone)) {
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
 				    0, 1))
 					printf("swap blk zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonxb", 10);
 			} else
 				uma_zwait(swblk_zone);
 			VM_OBJECT_WLOCK(object);
 			/* The trie may have changed while the lock was dropped. */
 			sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
 			    rdpi);
 			if (sb != NULL)
 				/*
 				 * Somebody swapped out a nearby page,
 				 * allocating swblk at the rdpi index,
 				 * while we dropped the object lock.
 				 */
 				goto allocated;
 		}
 		/*
 		 * Insert the new swblk into the trie, using the same
 		 * drop-lock-and-retry scheme when the insertion fails.
 		 */
 		for (;;) {
 			error = SWAP_PCTRIE_INSERT(
 			    &object->un_pager.swp.swp_blks, sb);
 			if (error == 0) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    1, 0))
 					printf("swpctrie zone ok\n");
 				break;
 			}
 			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swpctrie_zone)) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    0, 1))
 					printf("swap pctrie zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonxp", 10);
 			} else
 				uma_zwait(swpctrie_zone);
 			VM_OBJECT_WLOCK(object);
 			sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
 			    rdpi);
 			if (sb1 != NULL) {
 				/*
 				 * A swblk for rdpi was inserted while the lock
 				 * was dropped; discard ours and use that one.
 				 */
 				uma_zfree(swblk_zone, sb);
 				sb = sb1;
 				goto allocated;
 			}
 		}
 	}
 allocated:
 	MPASS(sb->p == rdpi);
 
 	/* modpi is the page's slot within the swblk. */
 	modpi = pindex % SWAP_META_PAGES;
 	/* Return prior contents of metadata. */
 	prev_swapblk = sb->d[modpi];
 	/* Enter block into metadata. */
 	sb->d[modpi] = swapblk;
 
 	/*
 	 * Free the swblk if we end up with the empty page run.
 	 */
 	if (swapblk == SWAPBLK_NONE &&
 	    swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
 		SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, rdpi);
 		uma_zfree(swblk_zone, sb);
 	}
 	return (prev_swapblk);
 }
 
 /*
  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
  *
  *	The requested range of blocks is freed, with any associated swap
  *	returned to the swap bitmap.
  *
  *	This routine will free swap metadata structures as they are cleaned
  *	out.  This routine does *NOT* operate on swap metadata associated
  *	with resident pages.
  */
 static void
 swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
 {
 	struct swblk *sb;
 	daddr_t n_free, s_free;
 	vm_pindex_t end;
 	int first, slot, stop;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_SWAP || count == 0)
 		return;
 
 	/* Freed blocks are batched and released in one call at the end. */
 	swp_pager_init_freerange(&s_free, &n_free);
 	end = pindex + count;
 	while ((sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 	    rounddown(pindex, SWAP_META_PAGES))) != NULL && sb->p < end) {
 		/* Clip the requested range to the slots of this swblk. */
 		if (pindex > sb->p)
 			first = pindex - sb->p;
 		else
 			first = 0;
 		if (end - sb->p < SWAP_META_PAGES)
 			stop = end - sb->p;
 		else
 			stop = SWAP_META_PAGES;
 
 		/* Unassign every block in [first, stop). */
 		for (slot = first; slot < stop; slot++) {
 			if (sb->d[slot] != SWAPBLK_NONE) {
 				swp_pager_update_freerange(&s_free, &n_free,
 				    sb->d[slot]);
 				sb->d[slot] = SWAPBLK_NONE;
 			}
 		}
 
 		/* Continue with the swblk that follows this one. */
 		pindex = sb->p + SWAP_META_PAGES;
 
 		/* Drop the swblk if nothing outside the range remains. */
 		if (swp_pager_swblk_empty(sb, 0, first) &&
 		    swp_pager_swblk_empty(sb, stop, SWAP_META_PAGES)) {
 			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
 			    sb->p);
 			uma_zfree(swblk_zone, sb);
 		}
 	}
 	swp_pager_freeswapspace(s_free, n_free);
 }
 
 /*
  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
  *
  *	This routine locates and destroys all swap metadata associated with
  *	an object.
  */
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
 	struct swblk *sb;
 	daddr_t n_free, s_free;
 	vm_pindex_t next;
 	int slot;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return;
 
 	/* Freed blocks are batched and released in one call at the end. */
 	swp_pager_init_freerange(&s_free, &n_free);
 	next = 0;
 	for (;;) {
 		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 		    next);
 		if (sb == NULL)
 			break;
 		/* Resume the search just past this swblk. */
 		next = sb->p + SWAP_META_PAGES;
 
 		/* Queue every assigned block, then discard the swblk. */
 		for (slot = 0; slot < SWAP_META_PAGES; slot++) {
 			if (sb->d[slot] != SWAPBLK_NONE)
 				swp_pager_update_freerange(&s_free, &n_free,
 				    sb->d[slot]);
 		}
 		SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
 		uma_zfree(swblk_zone, sb);
 	}
 	swp_pager_freeswapspace(s_free, n_free);
 }
 
 /*
  * SWP_PAGER_METACTL() -  misc control of swap meta data.
  *
  *	This routine is capable of looking up, or removing swapblk
  *	assignments in the swap meta data.  It returns the swapblk being
  *	looked-up, popped, or SWAPBLK_NONE if the block was invalid.
  *
  *	When acting on a busy resident page and paging is in progress, we
  *	have to wait until paging is complete but otherwise can act on the
  *	busy page.
  *
  *	SWM_POP		remove from meta data but do not free it
  */
 static daddr_t
 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 {
 	struct swblk *sb;
 	vm_pindex_t base;
 	daddr_t blk;
 	int slot;
 	bool pop;
 
 	/* Popping modifies the metadata and so needs the write lock. */
 	pop = (flags & SWM_POP) != 0;
 	if (pop)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 	else
 		VM_OBJECT_ASSERT_LOCKED(object);
 
 	/*
 	 * Only OBJT_SWAP objects carry swap metadata, and even then the
 	 * swblk covering this index may not exist.
 	 */
 	if (object->type != OBJT_SWAP)
 		return (SWAPBLK_NONE);
 
 	base = rounddown(pindex, SWAP_META_PAGES);
 	slot = pindex % SWAP_META_PAGES;
 	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, base);
 	if (sb == NULL)
 		return (SWAPBLK_NONE);
 
 	blk = sb->d[slot];
 	if (blk != SWAPBLK_NONE && pop) {
 		/* Unassign the block; free the swblk if it is now empty. */
 		sb->d[slot] = SWAPBLK_NONE;
 		if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
 			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
 			    base);
 			uma_zfree(swblk_zone, sb);
 		}
 	}
 	return (blk);
 }
 
 /*
  * Returns the least page index which is greater than or equal to the
  * parameter pindex and for which there is a swap block allocated.
  * Returns object's size if the object's type is not swap or if there
  * are no allocated swap blocks for the object after the requested
  * pindex.
  */
 vm_pindex_t
 swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
 {
 	struct swblk *sb;
 	int slot;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return (object->size);
 
 	sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 	    rounddown(pindex, SWAP_META_PAGES));
 	if (sb == NULL)
 		return (object->size);
 
 	if (sb->p < pindex) {
 		/*
 		 * pindex falls inside this swblk: look only at the slots
 		 * at or after it.
 		 */
 		slot = pindex % SWAP_META_PAGES;
 		while (slot < SWAP_META_PAGES) {
 			if (sb->d[slot] != SWAPBLK_NONE)
 				return (sb->p + slot);
 			slot++;
 		}
 
 		/* Nothing there; move on to the next swblk, if any. */
 		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 		    roundup(pindex, SWAP_META_PAGES));
 		if (sb == NULL)
 			return (object->size);
 	}
 
 	/* This swblk lies wholly at or beyond pindex; scan all of it. */
 	slot = 0;
 	while (slot < SWAP_META_PAGES) {
 		if (sb->d[slot] != SWAPBLK_NONE)
 			return (sb->p + slot);
 		slot++;
 	}
 
 	/*
 	 * Reaching this point means the trie holds a swblk that maps no
 	 * blocks at all.
 	 */
 	MPASS(0);
 	return (object->size);
 }
 
 /*
  * System call swapon(name) enables swapping on device name,
  * which must be in the swdevsw.  Return EBUSY
  * if already swapping on this device.
  */
 #ifndef _SYS_SYSPROTO_H_
 /* Argument structure for the swapon system call. */
 struct swapon_args {
 	char *name;	/* user-space pathname, looked up with namei() */
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_swapon(struct thread *td, struct swapon_args *uap)
 {
 	struct vattr attr;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	/* The caller needs the PRIV_SWAPON privilege. */
 	error = priv_check(td, PRIV_SWAPON);
 	if (error)
 		return (error);
 
 	/* Held exclusively until the "done" label. */
 	sx_xlock(&swdev_syscall_lock);
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swblk_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
 
 	/* Resolve the user-supplied pathname to a vnode. */
 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * vn_isdisk() is handed &error; when neither branch below assigns
 	 * error again, the value it left there is what gets returned
 	 * (presumably a failure code for a non-disk vnode -- confirm).
 	 */
 	if (vn_isdisk(vp, &error)) {
 		error = swapongeom(vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
 		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 	}
 
 	/*
 	 * On failure, release the vnode reference.  On success it is kept;
 	 * presumably the new swap device owns it -- confirm in
 	 * swapongeom()/swaponvp().
 	 */
 	if (error)
 		vrele(vp);
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
 /*
  * Check that the total amount of swap currently configured does not
  * exceed half the theoretical maximum.  If it does, print a warning
  * message.
  */
 static void
 swapon_check_swzone(void)
 {
 	unsigned long advised, ceiling, configured;
 
 	configured = swap_total;
 
 	/* Upper bound on mappable pages if every swblk were fully used. */
 	ceiling = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES;
 
 	/* The recommendation is to stay at or below half of that bound. */
 	advised = ceiling / 2;
 	if (configured <= advised)
 		return;
 
 	printf("warning: total configured swap (%lu pages) "
 	    "exceeds maximum recommended amount (%lu pages).\n",
 	    configured, advised);
 	printf("warning: increase kern.maxswzone "
 	    "or reduce amount of swap.\n");
 }
 
 /*
  * Common tail of swapon for both backends.  Builds a swdevt for the new
  * device, places it after every existing device in the swap block
  * address space, appends it to swtailq and updates the global counters.
  *
  *	vp		vnode recorded in sw_vp
  *	id		backend cookie stored in sw_id (the callers in this
  *			file pass a g_consumer or the vnode itself)
  *	nblks		device size in DEV_BSIZE blocks
  *	strategy, close	backend methods stored in the swdevt
  *	dev		value stored in sw_dev
  *	flags		initial value of sw_flags
  */
 static void
 swaponsomething(struct vnode *vp, void *id, u_long nblks,
     sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
 	u_long mblocks;
 
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
 	 *
 	 * sp->sw_nblks is in page-sized chunks now too.
 	 */
 	nblks &= ~(ctodb(1) - 1);
 	nblks = dbtoc(nblks);
 
 	/*
 	 * If we go beyond this, we get overflows in the radix
 	 * tree bitmap code.
 	 */
 	mblocks = 0x40000000 / BLIST_META_RADIX;
 	if (nblks > mblocks) {
 		printf(
     "WARNING: reducing swap size to maximum of %luMB per unit\n",
 		    mblocks / 1024 / 1024 * PAGE_SIZE);
 		nblks = mblocks;
 	}
 
 	/* M_ZERO: fields not assigned below start out as zero. */
 	sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 	sp->sw_flags = flags;
 
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
 	/*
 	 * Do not free the first two blocks in order to avoid overwriting
 	 * any bsd label at the front of the partition
 	 *
 	 * NOTE(review): nblks is unsigned, so "nblks - 2" wraps if fewer
 	 * than two pages remain after the conversion above; nothing in
 	 * this function rejects that case -- confirm callers prevent it.
 	 */
 	blist_free(sp->sw_blist, 2, nblks - 2);
 
 	/*
 	 * Pick a base address past the end of every device already on
 	 * the list.
 	 */
 	dvbase = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 		if (tsp->sw_end >= dvbase) {
 			/*
 			 * We put one uncovered page between the devices
 			 * in order to definitively prevent any cross-device
 			 * I/O requests
 			 */
 			dvbase = tsp->sw_end + 1;
 		}
 	}
 	sp->sw_first = dvbase;
 	sp->sw_end = dvbase + nblks;
 	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 	nswapdev++;
 	/* The two reserved leading blocks are not counted as available. */
 	swap_pager_avail += nblks - 2;
 	swap_total += nblks;
 	swapon_check_swzone();
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 	/* Handlers are invoked after sw_dev_mtx has been dropped. */
 	EVENTHANDLER_INVOKE(swapon, sp);
 }
 
 /*
  * SYSCALL: swapoff(devname)
  *
  * Disable swapping on the given device.
  *
  * XXX: Badly designed system call: it should use a device index
  * rather than filename as specification.  We keep sw_vp around
  * only to make this work.
  */
 #ifndef _SYS_SYSPROTO_H_
 /* Fallback definition of the swapoff system call arguments. */
 struct swapoff_args {
 	char *name;	/* path of the swap device or file, in user space */
 };
 #endif
 
 /*
  * MPSAFE
  *
  * Disable swapping on the device whose vnode is named by uap->name.
  * The caller needs PRIV_SWAPOFF.  Returns 0 on success, EINVAL if the
  * vnode does not match any configured swap device, or the error from
  * the path lookup or from swapoff_one().
  */
 /* ARGSUSED */
 int
 sys_swapoff(struct thread *td, struct swapoff_args *uap)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	struct swdevt *sp;
 	int error;
 
 	error = priv_check(td, PRIV_SWAPOFF);
 	if (error)
 		return (error);
 
 	/* The same lock is taken by sys_swapon() and swapoff_all(). */
 	sx_xlock(&swdev_syscall_lock);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
 	    td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/* Devices are identified by comparing the vnode with sw_vp. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_vp == vp)
 			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (sp == NULL)
 		error = EINVAL;
 	else
 		error = swapoff_one(sp, td->td_ucred);
 
 	/*
 	 * The lookup result was needed only to identify the device.
 	 * Release the reference that namei() returned on it; previously
 	 * it was leaked on every path that got past the lookup.  This is
 	 * done after sw_dev_mtx is dropped, with only the sx lock held,
 	 * as sys_swapon() does for its own vrele().
 	 */
 	vrele(vp);
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
 /*
  * Remove a single swap device.  The caller must hold swdev_syscall_lock
  * exclusively.  Returns 0 on success, ENOMEM when free memory plus the
  * remaining swap cannot absorb the device's size, or (with MAC) the
  * error from the MAC check.  On success sp has been freed.
  */
 static int
 swapoff_one(struct swdevt *sp, struct ucred *cred)
 {
 	u_long nblks;
 #ifdef MAC
 	int error;
 #endif
 
 	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 #ifdef MAC
 	/* The MAC hook is called with the vnode exclusively locked. */
 	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
 	(void) VOP_UNLOCK(sp->sw_vp, 0);
 	if (error != 0)
 		return (error);
 #endif
 	nblks = sp->sw_nblks;
 
 	/*
 	 * We can turn off this swap device safely only if the
 	 * available virtual memory in the system will fit the amount
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
 	if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat)
 		return (ENOMEM);
 
 	/*
 	 * Prevent further allocations on this device.
 	 *
 	 * The value returned by blist_fill() is subtracted from the
 	 * available count (presumably the number of blocks that were
 	 * still free -- confirm against blist_fill()).
 	 */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_flags |= SW_CLOSING;
 	swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
 	swap_total -= nblks;
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * Page in the contents of the device and close it.
 	 */
 	swap_pager_swapoff(sp);
 
 	sp->sw_close(curthread, sp);
 	/* Unlink the device and fix up the global state under the mutex. */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_id = NULL;
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
 	if (nswapdev == 0) {
 		/* No swap devices remain. */
 		swap_pager_full = 2;
 		swap_pager_almost_full = 1;
 	}
 	/* Do not leave swdevhd pointing at the device being freed. */
 	if (swdevhd == sp)
 		swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	blist_destroy(sp->sw_blist);
 	free(sp, M_VMPGDATA);
 	return (0);
 }
 
 /*
  * Try to remove every configured swap device in list order.  A device
  * that cannot be removed is reported on the console and skipped; a
  * successful removal is reported only when bootverbose is set.
  */
 void
 swapoff_all(void)
 {
 	struct swdevt *sp, *spt;
 	const char *devname;
 	int error;
 
 	sx_xlock(&swdev_syscall_lock);
 
 	mtx_lock(&sw_dev_mtx);
 	/*
 	 * The _SAFE variant is needed because swapoff_one() unlinks and
 	 * frees sp on success.
 	 */
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
 		/* swapoff_one() takes sw_dev_mtx itself; drop it here. */
 		mtx_unlock(&sw_dev_mtx);
 		/* Capture the name first: sp may be gone afterwards. */
 		if (vn_isdisk(sp->sw_vp, NULL))
 			devname = devtoname(sp->sw_vp->v_rdev);
 		else
 			devname = "[file]";
 		error = swapoff_one(sp, thread0.td_ucred);
 		if (error != 0) {
 			printf("Cannot remove swap device %s (error=%d), "
 			    "skipping.\n", devname, error);
 		} else if (bootverbose) {
 			printf("Swap device %s removed.\n", devname);
 		}
 		mtx_lock(&sw_dev_mtx);
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	sx_xunlock(&swdev_syscall_lock);
 }
 
 void
 swap_pager_status(int *total, int *used)
 {
 	struct swdevt *sp;
 
 	*total = 0;
 	*used = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		*total += sp->sw_nblks;
 		*used += sp->sw_used;
 	}
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * Describe the swap device at position "name" in the device list.
  *
  *	name	zero-based index into swtailq
  *	xs	filled in with the device's version, dev, flags, size
  *		and usage
  *	devname	if not NULL, receives the device name, or "[file]" for
  *		a device that is not a disk
  *	len	size of the devname buffer in bytes
  *
  * Returns 0 on success or ENOENT if there is no device at that index.
  */
 int
 swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
 {
 	struct swdevt *sp;
 	const char *tmp_devname;
 	int error, n;
 
 	n = 0;
 	error = ENOENT;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		/* Skip entries until the requested index is reached. */
 		if (n != name) {
 			n++;
 			continue;
 		}
 		xs->xsw_version = XSWDEV_VERSION;
 		xs->xsw_dev = sp->sw_dev;
 		xs->xsw_flags = sp->sw_flags;
 		xs->xsw_nblks = sp->sw_nblks;
 		xs->xsw_used = sp->sw_used;
 		if (devname != NULL && len > 0) {
 			if (vn_isdisk(sp->sw_vp, NULL))
 				tmp_devname = devtoname(sp->sw_vp->v_rdev);
 			else
 				tmp_devname = "[file]";
 			/*
 			 * strncpy() zero-fills the unused tail of the
 			 * buffer but does not terminate the string when
 			 * the name is len bytes or longer, so terminate
 			 * it explicitly.
 			 */
 			strncpy(devname, tmp_devname, len);
 			devname[len - 1] = '\0';
 		}
 		error = 0;
 		break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 #define XSWDEV_VERSION_11	1
 /*
  * Record layout returned by sysctl_vm_swap_info() when the caller's
  * buffer has this structure's size; xsw_dev is only 32 bits wide here.
  */
 struct xswdev11 {
 	u_int	xsw_version;
 	uint32_t xsw_dev;
 	int	xsw_flags;
 	int	xsw_nblks;
 	int     xsw_used;
 };
 #endif
 
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 /*
  * Record layout returned by sysctl_vm_swap_info() when the caller's
  * buffer has this structure's size; the device number is carried as
  * two 32-bit halves, low half in xsw_dev1 and high half in xsw_dev2.
  */
 struct xswdev32 {
 	u_int	xsw_version;
 	u_int	xsw_dev1, xsw_dev2;
 	int	xsw_flags;
 	int	xsw_nblks;
 	int     xsw_used;
 };
 #endif
 
 /*
  * Sysctl handler that exports the record for one swap device.  The
  * device index is the single name component passed in arg1/arg2.
  * The output layout is chosen by comparing the size of the caller's
  * buffer (req->oldlen) with the compat structures; otherwise the
  * native struct xswdev is returned.  Returns EINVAL for a wrong name
  * length, the error from swap_dev_info(), or the SYSCTL_OUT() result.
  */
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {
 	struct xswdev xs;
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 	struct xswdev32 xs32;
 #endif
 #if defined(COMPAT_FREEBSD11)
 	struct xswdev11 xs11;
 #endif
 	int error;
 
 	if (arg2 != 1)			/* name length */
 		return (EINVAL);
 	/* No device name is requested here, only the numeric fields. */
 	error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
 	if (error != 0)
 		return (error);
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 	if (req->oldlen == sizeof(xs32)) {
 		xs32.xsw_version = XSWDEV_VERSION;
 		/* Split the device number into low and high 32 bits. */
 		xs32.xsw_dev1 = xs.xsw_dev;
 		xs32.xsw_dev2 = xs.xsw_dev >> 32;
 		xs32.xsw_flags = xs.xsw_flags;
 		xs32.xsw_nblks = xs.xsw_nblks;
 		xs32.xsw_used = xs.xsw_used;
 		error = SYSCTL_OUT(req, &xs32, sizeof(xs32));
 		return (error);
 	}
 #endif
 #if defined(COMPAT_FREEBSD11)
 	if (req->oldlen == sizeof(xs11)) {
 		xs11.xsw_version = XSWDEV_VERSION_11;
 		xs11.xsw_dev = xs.xsw_dev; /* truncation */
 		xs11.xsw_flags = xs.xsw_flags;
 		xs11.xsw_nblks = xs.xsw_nblks;
 		xs11.xsw_used = xs.xsw_used;
 		error = SYSCTL_OUT(req, &xs11, sizeof(xs11));
 		return (error);
 	}
 #endif
 	/* Default: the native layout. */
 	error = SYSCTL_OUT(req, &xs, sizeof(xs));
 	return (error);
 }
 
 /* vm.nswapdev: read-only export of the nswapdev counter. */
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
 /* vm.swap_info: node whose requests are served by sysctl_vm_swap_info(). */
 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
     sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
  * Count the approximate swap usage in pages for a vmspace.  The
  * shadowed or not yet copied on write swap blocks are not accounted.
  * The map must be locked.
  */
 long
 vmspace_swap_count(struct vmspace *vmspace)
 {
 	vm_map_t map;
 	vm_map_entry_t cur;
 	vm_object_t object;
 	struct swblk *sb;
 	vm_pindex_t e, pi;
 	long count;
 	int i;
 
 	map = &vmspace->vm_map;
 	count = 0;
 
 	/* Visit every map entry that is directly backed by a swap object. */
 	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 		object = cur->object.vm_object;
 		if (object == NULL || object->type != OBJT_SWAP)
 			continue;
 		VM_OBJECT_RLOCK(object);
 		/* Check the type again now that the object lock is held. */
 		if (object->type != OBJT_SWAP)
 			goto unlock;
 		/* [pi, e) is the range of object pages this entry covers. */
 		pi = OFF_TO_IDX(cur->offset);
 		e = pi + OFF_TO_IDX(cur->end - cur->start);
 		/* Step through the swblks at or after pi, one per pass. */
 		for (;; pi = sb->p + SWAP_META_PAGES) {
 			sb = SWAP_PCTRIE_LOOKUP_GE(
 			    &object->un_pager.swp.swp_blks, pi);
 			if (sb == NULL || sb->p >= e)
 				break;
 			/* Count allocated slots that lie below e. */
 			for (i = 0; i < SWAP_META_PAGES; i++) {
 				if (sb->p + i < e &&
 				    sb->d[i] != SWAPBLK_NONE)
 					count++;
 			}
 		}
 unlock:
 		VM_OBJECT_RUNLOCK(object);
 	}
 	return (count);
 }
 
 /*
  * GEOM backend
  *
  * Swapping onto disk devices.
  *
  */
 
 static g_orphan_t swapgeom_orphan;
 
 /*
  * GEOM class used for the geom that swapongeom_locked() creates; its
  * orphan method is swapgeom_orphan().
  */
 static struct g_class g_swap_class = {
 	.name = "SWAP",
 	.version = G_VERSION,
 	.orphan = swapgeom_orphan,
 };
 
 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 
 
 /*
  * Event handler that tears down a swap consumer; arg is the consumer.
  */
 static void
 swapgeom_close_ev(void *arg, int flags)
 {
 	struct g_consumer *consumer = arg;
 
 	/* Undo the g_access(cp, 1, 1, 0) taken when swap was enabled. */
 	g_access(consumer, -1, -1, 0);
 	g_detach(consumer);
 	g_destroy_consumer(consumer);
 }
 
 /*
  * Add a reference to the g_consumer for an inflight transaction.
  */
 static void
 swapgeom_acquire(struct g_consumer *cp)
 {
 
 	/* cp->index is the reference count; sw_dev_mtx protects it. */
 	mtx_assert(&sw_dev_mtx, MA_OWNED);
 	cp->index += 1;
 }
 
 /*
  * Remove a reference from the g_consumer.  Post a close event if all
  * references go away, since the function might be called from the
  * biodone context.
  */
 static void
 swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
 {
 
 	mtx_assert(&sw_dev_mtx, MA_OWNED);
 
 	/* Nothing more to do while other references remain. */
 	if (--cp->index != 0)
 		return;
 
 	/*
 	 * Last reference: queue the close, and clear sw_id only if the
 	 * event was posted successfully.
 	 */
 	if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
 		sp->sw_id = NULL;
 }
 
 /*
  * Completion callback for bios issued by swapgeom_strategy().  Copies
  * the outcome of the bio into the originating buf, completes the buf,
  * drops the consumer reference taken for the request and frees the bio.
  */
 static void
 swapgeom_done(struct bio *bp2)
 {
 	struct swdevt *sp;
 	struct buf *bp;
 	struct g_consumer *cp;
 
 	/* bio_caller2 was set to the buf by swapgeom_strategy(). */
 	bp = bp2->bio_caller2;
 	cp = bp2->bio_from;
 	bp->b_ioflags = bp2->bio_flags;
 	if (bp2->bio_error)
 		bp->b_ioflags |= BIO_ERROR;
 	/* Residual = requested byte count minus what was completed. */
 	bp->b_resid = bp->b_bcount - bp2->bio_completed;
 	bp->b_error = bp2->bio_error;
 	bp->b_caller1 = NULL;
 	bufdone(bp);
 	/* bio_caller1 was set to the swdevt by swapgeom_strategy(). */
 	sp = bp2->bio_caller1;
 	mtx_lock(&sw_dev_mtx);
 	swapgeom_release(cp, sp);
 	mtx_unlock(&sw_dev_mtx);
 	g_destroy_bio(bp2);
 }
 
 /*
  * Strategy method of the GEOM backend: translate the buf into a bio
  * and send it to the device's consumer.  The buf is completed with
  * ENXIO if the device has no consumer any more, and with ENOMEM if no
  * bio could be allocated.
  */
 static void
 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct bio *bio;
 	struct g_consumer *cp;
 
 	mtx_lock(&sw_dev_mtx);
 	cp = sp->sw_id;
 	if (cp == NULL) {
 		/* sw_id has been cleared: fail the request. */
 		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENXIO;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 	/* Hold a consumer reference for the lifetime of this request. */
 	swapgeom_acquire(cp);
 	mtx_unlock(&sw_dev_mtx);
 	/*
 	 * Writes and reads use different bio allocators (presumably
 	 * g_new_bio() can fail where g_alloc_bio() waits -- confirm);
 	 * either way a NULL result is handled below.
 	 */
 	if (bp->b_iocmd == BIO_WRITE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
 	if (bio == NULL) {
 		/* Give back the reference taken above. */
 		mtx_lock(&sw_dev_mtx);
 		swapgeom_release(cp, sp);
 		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENOMEM;
 		bp->b_ioflags |= BIO_ERROR;
 		printf("swap_pager: cannot allocate bio\n");
 		bufdone(bp);
 		return;
 	}
 
 	/* Cross-link buf and bio; swapgeom_done() reads these back. */
 	bp->b_caller1 = bio;
 	bio->bio_caller1 = sp;
 	bio->bio_caller2 = bp;
 	bio->bio_cmd = bp->b_iocmd;
 	/* b_blkno counts pages from sw_first; convert to a byte offset. */
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
 	if (!buf_mapped(bp)) {
 		/* Unmapped buf: pass the page array instead of an address. */
 		bio->bio_ma = bp->b_pages;
 		bio->bio_data = unmapped_buf;
 		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 		bio->bio_ma_n = bp->b_npages;
 		bio->bio_flags |= BIO_UNMAPPED;
 	} else {
 		bio->bio_data = bp->b_data;
 		bio->bio_ma = NULL;
 	}
 	g_io_request(bio, cp);
 	return;
 }
 
 /*
  * Orphan method of the swap GEOM class.  Marks the swap device that
  * uses consumer cp (if any) as closing, drops the consumer's initial
  * reference, and tears the consumer down right away if that was the
  * last reference.
  */
 static void
 swapgeom_orphan(struct g_consumer *cp)
 {
 	struct swdevt *sp;
 	int destroy;
 
 	mtx_lock(&sw_dev_mtx);
 	/* sp is NULL after the loop if no device uses this consumer. */
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id == cp) {
 			sp->sw_flags |= SW_CLOSING;
 			break;
 		}
 	}
 	/*
 	 * Drop reference we were created with. Do directly since we're in a
 	 * special context where we don't have to queue the call to
 	 * swapgeom_close_ev().
 	 */
 	cp->index--;
 	destroy = ((sp != NULL) && (cp->index == 0));
 	if (destroy)
 		sp->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	/* The teardown itself runs without sw_dev_mtx held. */
 	if (destroy)
 		swapgeom_close_ev(cp, 0);
 }
 
 /*
  * Close method of the GEOM backend: detach the consumer from the
  * swdevt and, if there was one, wait for the events thread to tear it
  * down.
  */
 static void
 swapgeom_close(struct thread *td, struct swdevt *sw)
 {
 	struct g_consumer *cp;
 
 	/* Take the consumer pointer and clear sw_id under the mutex. */
 	mtx_lock(&sw_dev_mtx);
 	cp = sw->sw_id;
 	sw->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * swapgeom_close() may be called from the biodone context,
 	 * where we cannot perform topology changes.  Delegate the
 	 * work to the events thread.
 	 */
 	if (cp != NULL)
 		g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
 }
 
 /*
  * Attach a new consumer to the provider behind dev and register it as
  * a swap device.  swapongeom() calls this with the GEOM topology lock
  * held.  Returns ENODEV if dev has no provider, EBUSY if the provider
  * is already used for swap, the g_access() error, or 0 on success.
  */
 static int
 swapongeom_locked(struct cdev *dev, struct vnode *vp)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	static struct g_geom *gp;
 	struct swdevt *sp;
 	u_long nblks;
 	int error;
 
 	pp = g_dev_getprovider(dev);
 	if (pp == NULL)
 		return (ENODEV);
 	/* Refuse a provider that an existing swap device already uses. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 	/* gp is static: the geom is created once and then reused. */
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
 	cp->index = 1;	/* Number of active I/Os, plus one for being active. */
 	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 	/*
 	 * XXX: Every time you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
 	if (error != 0) {
 		/* Undo the attach and discard the consumer. */
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		return (error);
 	}
 	/* The size is passed on in DEV_BSIZE units. */
 	nblks = pp->mediasize / DEV_BSIZE;
 	swaponsomething(vp, cp, nblks, swapgeom_strategy,
 	    swapgeom_close, dev2udev(dev),
 	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
 	return (0);
 }
 
 /*
  * Enable swapping on the disk device behind vp.  Returns ENOENT when
  * vp is not a character device or is doomed; otherwise returns the
  * result of swapongeom_locked().
  */
 static int
 swapongeom(struct vnode *vp)
 {
 	int error;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VCHR && (vp->v_iflag & VI_DOOMED) == 0) {
 		/* The real work is done under the GEOM topology lock. */
 		g_topology_lock();
 		error = swapongeom_locked(vp->v_rdev, vp);
 		g_topology_unlock();
 	} else {
 		error = ENOENT;
 	}
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * VNODE backend
  *
  * This is used mainly for network filesystem (read: probably only tested
  * with NFS) swapfiles.
  *
  */
 
 /*
  * Strategy method of the vnode backend: retarget the buf at the swap
  * file's vnode and pass it to bstrategy().
  */
 static void
 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct vnode *vp2;
 
 	/* Make b_blkno device-relative and convert it with ctodb(). */
 	bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 
 	/* For this backend sw_id holds the vnode (see swaponvp()). */
 	vp2 = sp->sw_id;
 	vhold(vp2);
 	if (bp->b_iocmd == BIO_WRITE) {
 		/*
 		 * For a write, bufobj_wdrop() is applied to the buf's
 		 * previous bufobj, if any, and bufobj_wref() to the
 		 * vnode's bufobj.
 		 */
 		if (bp->b_bufobj)
 			bufobj_wdrop(bp->b_bufobj);
 		bufobj_wref(&vp2->v_bufobj);
 	}
 	if (bp->b_bufobj != &vp2->v_bufobj)
 		bp->b_bufobj = &vp2->v_bufobj;
 	bp->b_vp = vp2;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 	return;
 }
 
 /*
  * Close method of the vnode backend: close the swap file and release
  * the vnode recorded in sw_vp.
  */
 static void
 swapdev_close(struct thread *td, struct swdevt *sp)
 {
 	struct vnode *swapvp;
 
 	swapvp = sp->sw_vp;
 	VOP_CLOSE(swapvp, FREAD | FWRITE, td->td_ucred, td);
 	vrele(swapvp);
 }
 
 
 /*
  * Enable swapping on the file behind vp, whose size is nblks blocks of
  * DEV_BSIZE bytes.  Returns ENXIO for an empty file, EBUSY if the
  * vnode is already a swap device, the MAC or VOP_OPEN() error, or 0
  * on success.
  */
 static int
 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 {
 	struct swdevt *sp;
 	int error;
 
 	/*
 	 * NOTE(review): this check is made in DEV_BSIZE units, while
 	 * swaponsomething() rounds the size down to whole pages; a file
 	 * smaller than a page passes this test -- confirm that case is
 	 * acceptable.
 	 */
 	if (nblks == 0)
 		return (ENXIO);
 	/* Refuse a vnode that is already registered as a swap device. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id == vp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	/* The MAC check (if compiled in) and the open run vnode-locked. */
 	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
 	if (error == 0)
 #endif
 		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
 	(void) VOP_UNLOCK(vp, 0);
 	if (error)
 		return (error);
 
 	/* The vnode serves as both sw_vp and the backend cookie sw_id. */
 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 	    NODEV, 0);
 	return (0);
 }
 
 /*
  * Sysctl handler for nsw_wcount_async_max.  Reads return the current
  * value.  Writes accept values in the range [1, nswbuf / 2] and return
  * EINVAL otherwise; the new limit is applied under swbuf_mtx, sleeping
  * if necessary until it can be applied in full.
  */
 static int
 sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
 {
 	int error, new, n;
 
 	new = nsw_wcount_async_max;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	/* Stop here on error, or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (new > nswbuf / 2 || new < 1)
 		return (EINVAL);
 
 	mtx_lock(&swbuf_mtx);
 	while (nsw_wcount_async_max != new) {
 		/*
 		 * Adjust difference.  If the current async count is too low,
 		 * we will need to squeeze our update slowly in.  Sleep with a
 		 * higher priority than getpbuf() to finish faster.
 		 */
 		n = new - nsw_wcount_async_max;
 		if (nsw_wcount_async + n >= 0) {
 			/* The whole difference can be applied at once. */
 			nsw_wcount_async += n;
 			nsw_wcount_async_max += n;
 			wakeup(&nsw_wcount_async);
 		} else {
 			/*
 			 * Take away what is currently available, then sleep
 			 * on nsw_wcount_async before trying again.
 			 */
 			nsw_wcount_async_max -= nsw_wcount_async;
 			nsw_wcount_async = 0;
 			msleep(&nsw_wcount_async, &swbuf_mtx, PSWP,
 			    "swpsysctl", 0);
 		}
 	}
 	mtx_unlock(&swbuf_mtx);
 
 	return (0);
 }
Index: head/sys/vm/vm_pageout.c
===================================================================
--- head/sys/vm/vm_pageout.c	(revision 349790)
+++ head/sys/vm/vm_pageout.c	(revision 349791)
@@ -1,2112 +1,2112 @@
 /*-
  * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU)
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005 Yahoo! Technologies Norway AS
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
  *
  *
  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
  * All rights reserved.
  *
  * Authors: Avadis Tevanian, Jr., Michael Wayne Young
  *
  * Permission to use, copy, modify and distribute this software and
  * its documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
  *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
  *
  * Carnegie Mellon requests users of this software to return to
  *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
  */
 
 /*
  *	The proverbial page-out daemon.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/ktr.h>
 #include <sys/mount.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_pagequeue.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 /*
  * System initialization
  */
 
 /* Forward declarations; vm_pageout() is the "vm_pageout" kernel process. */
 static void vm_pageout(void);
 static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t m, int *numpagedout);
 static int vm_pageout_cluster(vm_page_t m);
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage);
 
 /* vm_pageout_init() is registered with SI_ORDER_FIRST. */
 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
     NULL);
 
 struct proc *pageproc;
 
 /* Descriptor handed to kproc_start(): name, entry point, proc pointer. */
 static struct kproc_desc page_kp = {
 	"pagedaemon",
 	vm_pageout,
 	&pageproc
 };
 /* The process start is registered in the same subsystem, ordered second. */
 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
 SDT_PROVIDER_DEFINE(vm);
 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
 
 /* Pagedaemon activity rates, in subdivisions of one second. */
 #define	VM_LAUNDER_RATE		10
 #define	VM_INACT_SCAN_RATE	10
 
 /* Default for the vm.pageout_oom_seq sysctl. */
 static int vm_pageout_oom_seq = 12;
 
 static int vm_pageout_update_period;
 static int disable_swap_pageouts;
 static int lowmem_period = 10;
 static int swapdev_enabled;
 
 static int vm_panic_on_oom = 0;
 
 SYSCTL_INT(_vm, OID_AUTO, panic_on_oom,
 	CTLFLAG_RWTUN, &vm_panic_on_oom, 0,
 	"panic on out of memory instead of killing the largest process");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_update_period,
 	CTLFLAG_RWTUN, &vm_pageout_update_period, 0,
 	"Maximum active LRU update period");
   
 SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0,
 	"Low memory callback period");
 
 /* Note: the sysctl name differs from the variable name. */
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
 /* Read-only counter. */
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
 SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
 	CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0,
 	"back-to-back calls to oom detector to start OOM");
 
 static int act_scan_laundry_weight = 3;
 SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN,
     &act_scan_laundry_weight, 0,
     "weight given to clean vs. dirty pages in active queue scans");
 
 /* Expressed in kilobytes per second, per the description string. */
 static u_int vm_background_launder_rate = 4096;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN,
     &vm_background_launder_rate, 0,
     "background laundering rate, in kilobytes per second");
 
 /* Expressed in kilobytes, per the description string. */
 static u_int vm_background_launder_max = 20 * 1024;
 SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN,
     &vm_background_launder_max, 0, "background laundering cap, in kilobytes");
 
 /* Cluster size limit used by vm_pageout_cluster(); not static. */
 int vm_pageout_page_count = 32;
 
 u_long vm_page_max_user_wired;
 SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW,
     &vm_page_max_user_wired, 0,
     "system-wide limit to user-wired page count");
 
 static u_int isqrt(u_int num);
 static int vm_pageout_launder(struct vm_domain *vmd, int launder,
     bool in_shortfall);
 static void vm_pageout_laundry_worker(void *arg);
 
 /* State of one page queue scan, carried between batch collections. */
 struct scan_state {
 	struct vm_batchqueue bq;	/* pages collected for processing */
 	struct vm_pagequeue *pq;	/* queue being scanned */
 	vm_page_t	marker;		/* marker page holding the position */
 	int		maxscan;	/* limit on the value of scanned */
 	int		scanned;	/* queue entries stepped over so far */
 };
 
 /*
  * Begin a scan of page queue pq.  The marker is inserted after page
  * "after", or at the head of the queue when "after" is NULL, and the
  * scan state is reset.  The queue must be locked on entry and is
  * unlocked on return.
  */
 static void
 vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq,
     vm_page_t marker, vm_page_t after, int maxscan)
 {
 
 	vm_pagequeue_assert_locked(pq);
 	KASSERT((marker->aflags & PGA_ENQUEUED) == 0,
 	    ("marker %p already enqueued", marker));
 
 	if (after == NULL)
 		TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q);
 	else
 		TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q);
 	vm_page_aflag_set(marker, PGA_ENQUEUED);
 
 	vm_batchqueue_init(&ss->bq);
 	ss->pq = pq;
 	ss->marker = marker;
 	ss->maxscan = maxscan;
 	ss->scanned = 0;
 	/* The lock taken by the caller is released here. */
 	vm_pagequeue_unlock(pq);
 }
 
 static void
 vm_pageout_end_scan(struct scan_state *ss)
 {
 	struct vm_pagequeue *pq;
 
 	pq = ss->pq;
 	vm_pagequeue_assert_locked(pq);
 	KASSERT((ss->marker->aflags & PGA_ENQUEUED) != 0,
 	    ("marker %p not enqueued", ss->marker));
 
 	TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q);
 	vm_page_aflag_clear(ss->marker, PGA_ENQUEUED);
 	pq->pq_pdpages += ss->scanned;
 }
 
 /*
  * Add a small number of queued pages to a batch queue for later processing
  * without the corresponding queue lock held.  The caller must have enqueued a
  * marker page at the desired start point for the scan.  Pages will be
  * physically dequeued if the caller so requests.  Otherwise, the returned
  * batch may contain marker pages, and it is up to the caller to handle them.
  *
  * When processing the batch queue, vm_page_queue() must be used to
  * determine whether the page has been logically dequeued by another thread.
  * Once this check is performed, the page lock guarantees that the page will
  * not be disassociated from the queue.
  */
 static __always_inline void
 vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue)
 {
 	struct vm_pagequeue *pq;
 	vm_page_t m, marker, n;
 
 	marker = ss->marker;
 	pq = ss->pq;
 
 	KASSERT((marker->aflags & PGA_ENQUEUED) != 0,
 	    ("marker %p not enqueued", ss->marker));
 
 	vm_pagequeue_lock(pq);
 	/*
 	 * Walk forward from the marker until the queue ends, the scan
 	 * limit is reached or the batch is full.
 	 */
 	for (m = TAILQ_NEXT(marker, plinks.q); m != NULL &&
 	    ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE;
 	    m = n, ss->scanned++) {
 		/* Fetch the successor first; m may be unlinked below. */
 		n = TAILQ_NEXT(m, plinks.q);
 		if ((m->flags & PG_MARKER) == 0) {
 			KASSERT((m->aflags & PGA_ENQUEUED) != 0,
 			    ("page %p not enqueued", m));
 			KASSERT((m->flags & PG_FICTITIOUS) == 0,
 			    ("Fictitious page %p cannot be in page queue", m));
 			KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("Unmanaged page %p cannot be in page queue", m));
 		} else if (dequeue)
 			/* Marker pages are skipped when dequeueing. */
 			continue;
 
 		(void)vm_batchqueue_insert(&ss->bq, m);
 		if (dequeue) {
 			TAILQ_REMOVE(&pq->pq_pl, m, plinks.q);
 			vm_page_aflag_clear(m, PGA_ENQUEUED);
 		}
 	}
 	/* Move our marker to just before the first page not visited. */
 	TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q);
 	if (__predict_true(m != NULL))
 		TAILQ_INSERT_BEFORE(m, marker, plinks.q);
 	else
 		TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q);
 	/* Account for the pages that were taken off the queue. */
 	if (dequeue)
 		vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt);
 	vm_pagequeue_unlock(pq);
 }
 
 /* Return the next page to be scanned, or NULL if the scan is complete. */
 static __always_inline vm_page_t
 vm_pageout_next(struct scan_state *ss, const bool dequeue)
 {
 	struct vm_batchqueue *batch;
 
 	batch = &ss->bq;
 
 	/* Refill the batch from the page queue once it has drained. */
 	if (batch->bq_cnt == 0)
 		vm_pageout_collect_batch(ss, dequeue);
 	return (vm_batchqueue_pop(batch));
 }
 
 /*
  * Scan for pages at adjacent offsets within the given page's object that are
  * eligible for laundering, form a cluster of these pages and the given page,
  * and launder that cluster.
  */
 static int
 vm_pageout_cluster(vm_page_t m)
 {
 	vm_object_t object;
 	vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps;
 	vm_pindex_t pindex;
 	int ib, is, page_base, pageout_count;
 
 	vm_page_assert_locked(m);
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	pindex = m->pindex;
 
 	vm_page_assert_unbusied(m);
 	KASSERT(!vm_page_held(m), ("page %p is held", m));
 
 	pmap_remove_write(m);
 	vm_page_unlock(m);
 
 	/*
 	 * The given page goes in the middle of mc[] so that the cluster
 	 * can grow downward (page_base decreases) and upward.  pb and ps
 	 * track the lowest and highest pages taken so far; ib and is are
 	 * the backward and forward distances of the next candidates from
 	 * pindex.
 	 */
 	mc[vm_pageout_page_count] = pb = ps = m;
 	pageout_count = 1;
 	page_base = vm_pageout_page_count;
 	ib = 1;
 	is = 1;
 
 	/*
 	 * We can cluster only if the page is not clean, busy, or held, and
 	 * the page is in the laundry queue.
 	 *
 	 * During heavy mmap/modification loads the pageout
 	 * daemon can really fragment the underlying file
 	 * due to flushing pages out of order and not trying to
 	 * align the clusters (which leaves sporadic out-of-order
 	 * holes).  To solve this problem we do the reverse scan
 	 * first and attempt to align our cluster, then do a
 	 * forward scan if room remains.
 	 */
 more:
 	/* Reverse scan; ib == 0 means it has been abandoned for good. */
 	while (ib != 0 && pageout_count < vm_pageout_page_count) {
 		/* There is no page below index 0. */
 		if (ib > pindex) {
 			ib = 0;
 			break;
 		}
 		if ((p = vm_page_prev(pb)) == NULL || vm_page_busied(p)) {
 			ib = 0;
 			break;
 		}
 		vm_page_test_dirty(p);
 		if (p->dirty == 0) {
 			ib = 0;
 			break;
 		}
 		vm_page_lock(p);
 		if (vm_page_held(p) || !vm_page_in_laundry(p)) {
 			vm_page_unlock(p);
 			ib = 0;
 			break;
 		}
 		pmap_remove_write(p);
 		vm_page_unlock(p);
 		mc[--page_base] = pb = p;
 		++pageout_count;
 		++ib;
 
 		/*
 		 * We are at an alignment boundary.  Stop here, and switch
 		 * directions.  Do not clear ib.
 		 */
 		if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
 			break;
 	}
 	/* Forward scan, bounded by the cluster size and the object size. */
 	while (pageout_count < vm_pageout_page_count && 
 	    pindex + is < object->size) {
 		if ((p = vm_page_next(ps)) == NULL || vm_page_busied(p))
 			break;
 		vm_page_test_dirty(p);
 		if (p->dirty == 0)
 			break;
 		vm_page_lock(p);
 		if (vm_page_held(p) || !vm_page_in_laundry(p)) {
 			vm_page_unlock(p);
 			break;
 		}
 		pmap_remove_write(p);
 		vm_page_unlock(p);
 		mc[page_base + pageout_count] = ps = p;
 		++pageout_count;
 		++is;
 	}
 
 	/*
 	 * If we exhausted our forward scan, continue with the reverse scan
 	 * when possible, even past an alignment boundary.  This catches
 	 * boundary conditions.
 	 */
 	if (ib != 0 && pageout_count < vm_pageout_page_count)
 		goto more;
 
 	/* Launder the cluster, starting at its lowest collected page. */
 	return (vm_pageout_flush(&mc[page_base], pageout_count,
 	    VM_PAGER_PUT_NOREUSE, 0, NULL, NULL));
 }
 
 /*
  * vm_pageout_flush() - launder the given pages
  *
  *	The given pages are laundered.  Note that we set up for the start of
  *	I/O (i.e., busy each page with vm_page_sbusy()), verify that each
  *	page is fully valid and not writeable, and bump the object's
  *	paging-in-progress count all in here rather than in the parent.  If
  *	we want the parent to do more sophisticated things we may have to
  *	change the ordering.
  *
  *	mc[] holds "count" pages.  The object is taken from mc[0] and must be
  *	write-locked by the caller.  "flags" is passed through unchanged to
  *	vm_pager_put_pages().  "mreq" is the index in mc[] from which the
  *	returned run length is measured.
  *
  *	Returned runlen is the count of pages between mreq and first
  *	page after mreq with status VM_PAGER_AGAIN.
  *	*eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL
  *	for any page in runlen set.
  *	Both prunlen and eio are optional and may be NULL.
  *
  *	The return value is the number of pages whose status was VM_PAGER_OK
  *	or VM_PAGER_PEND, plus any pages of an OBJT_SWAP object that failed
  *	with VM_PAGER_FAIL and were handed to vm_page_unswappable().
  */
 int
 vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen,
     boolean_t *eio)
 {
 	vm_object_t object = mc[0]->object;
 	/* Per-page pager status, filled in by vm_pager_put_pages(). */
 	int pageout_status[count];
 	int numpagedout = 0;
 	int i, runlen;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Initiate I/O.  Mark the pages busy and verify that they're valid
 	 * and read-only.
 	 *
 	 * We do not have to fixup the clean/dirty bits here... we can
 	 * allow the pager to do it after the I/O completes.
 	 *
 	 * NOTE! mc[i]->dirty may be partial or fragmented due to an
 	 * edge case with file fragments.
 	 */
 	for (i = 0; i < count; i++) {
 		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
 		    ("vm_pageout_flush: partially invalid page %p index %d/%d",
 			mc[i], i, count));
 		KASSERT((mc[i]->aflags & PGA_WRITEABLE) == 0,
 		    ("vm_pageout_flush: writeable page %p", mc[i]));
 		vm_page_sbusy(mc[i]);
 	}
 	/* One paging-in-progress count per page; dropped per page below. */
 	vm_object_pip_add(object, count);
 
 	vm_pager_put_pages(object, mc, count, flags, pageout_status);
 
 	/*
 	 * Assume the run extends to the end of the array; the first
 	 * VM_PAGER_AGAIN at or after mreq shortens it below.
 	 */
 	runlen = count - mreq;
 	if (eio != NULL)
 		*eio = FALSE;
 	for (i = 0; i < count; i++) {
 		vm_page_t mt = mc[i];
 
 		KASSERT(pageout_status[i] == VM_PAGER_PEND ||
 		    !pmap_page_is_write_mapped(mt),
 		    ("vm_pageout_flush: page %p is not write protected", mt));
 		switch (pageout_status[i]) {
 		case VM_PAGER_OK:
 			/*
 			 * The write completed.  If the page is still in the
 			 * laundry, move it out via
 			 * vm_page_deactivate_noreuse().
 			 */
 			vm_page_lock(mt);
 			if (vm_page_in_laundry(mt))
 				vm_page_deactivate_noreuse(mt);
 			vm_page_unlock(mt);
 			/* FALLTHROUGH */
 		case VM_PAGER_PEND:
 			numpagedout++;
 			break;
 		case VM_PAGER_BAD:
 			/*
 			 * The page is outside the object's range.  We pretend
 			 * that the page out worked and clean the page, so the
 			 * changes will be lost if the page is reclaimed by
 			 * the page daemon.
 			 */
 			vm_page_undirty(mt);
 			vm_page_lock(mt);
 			if (vm_page_in_laundry(mt))
 				vm_page_deactivate_noreuse(mt);
 			vm_page_unlock(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
 			/*
 			 * If the page couldn't be paged out to swap because the
 			 * pager wasn't able to find space, place the page in
 			 * the PQ_UNSWAPPABLE holding queue.  This is an
 			 * optimization that prevents the page daemon from
 			 * wasting CPU cycles on pages that cannot be reclaimed
 			 * because no swap device is configured.
 			 *
 			 * Otherwise, reactivate the page so that it doesn't
 			 * clog the laundry and inactive queues.  (We will try
 			 * paging it out again later.)
 			 */
 			vm_page_lock(mt);
 			if (object->type == OBJT_SWAP &&
 			    pageout_status[i] == VM_PAGER_FAIL) {
 				vm_page_unswappable(mt);
 				numpagedout++;
 			} else
 				vm_page_activate(mt);
 			vm_page_unlock(mt);
 			/* Only errors inside the current run are reported. */
 			if (eio != NULL && i >= mreq && i - mreq < runlen)
 				*eio = TRUE;
 			break;
 		case VM_PAGER_AGAIN:
 			/* Truncate the run at the first such page. */
 			if (i >= mreq && i - mreq < runlen)
 				runlen = i - mreq;
 			break;
 		}
 
 		/*
 		 * If the operation is still going, leave the page busy to
 		 * block all other accesses. Also, leave the paging in
 		 * progress indicator set so that we don't attempt an object
 		 * collapse.
 		 */
 		if (pageout_status[i] != VM_PAGER_PEND) {
 			vm_object_pip_wakeup(object);
 			vm_page_sunbusy(mt);
 		}
 	}
 	if (prunlen != NULL)
 		*prunlen = runlen;
 	return (numpagedout);
 }
 
 /*
  * "swapon" event handler: publish, with release semantics, that swap is
  * available by setting swapdev_enabled to 1.  Both arguments are unused.
  */
 static void
 vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused)
 {
 	atomic_store_rel_int(&swapdev_enabled, 1);
 }
 
 /*
  * "swapoff" event handler: clear swapdev_enabled, with release semantics,
  * but only when swap_pager_nswapdev() reports exactly one device.
  * NOTE(review): presumably the handler runs before the departing device is
  * subtracted from that count, so "1" means the last device is going away;
  * confirm against the swapoff path.  Both arguments are unused.
  */
 static void
 vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused)
 {
 	if (swap_pager_nswapdev() != 1)
 		return;
 	atomic_store_rel_int(&swapdev_enabled, 0);
 }
 
 /*
  * Attempt to acquire all of the necessary locks to launder a page and
  * then call through the clustering layer to PUTPAGES.  Wait a short
  * time for a vnode lock.
  *
  * Requires the page and object lock on entry, releases both before return.
  * Returns 0 on success and an errno otherwise:
  *
  *	EDEADLK	vn_start_write() or the timed vget() failed
  *	ENOENT	the vnode and object were disassociated while unlocked
  *	ENXIO	the page left the laundry, changed identity, or was cleaned
  *	EBUSY	the page became busy or held while unlocked
  *	EIO	vm_pageout_cluster() reported that no pages were paged out
  *
  * *numpagedout is written only when vm_pageout_cluster() is reached; on the
  * earlier error exits it is left untouched.
  */
 static int
 vm_pageout_clean(vm_page_t m, int *numpagedout)
 {
 	struct vnode *vp;
 	struct mount *mp;
 	vm_object_t object;
 	vm_pindex_t pindex;
 	int error, lockmode;
 
 	vm_page_assert_locked(m);
 	object = m->object;
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	error = 0;
 	vp = NULL;
 	mp = NULL;
 
 	/*
 	 * The object is already known NOT to be dead.   It
 	 * is possible for the vget() to block the whole
 	 * pageout daemon, but the new low-memory handling
 	 * code should prevent it.
 	 *
 	 * We can't wait forever for the vnode lock, we might
 	 * deadlock due to a vn_read() getting stuck in
 	 * vm_wait while holding this vnode.  We skip the
 	 * vnode if we can't get it in a reasonable amount
 	 * of time.
 	 */
 	if (object->type == OBJT_VNODE) {
 		vm_page_unlock(m);
 		vp = object->handle;
 		if (vp->v_type == VREG &&
 		    vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 			/*
 			 * Clear mp so that the exit path skips the
 			 * vput()/deallocate/finished_write sequence.
 			 */
 			mp = NULL;
 			error = EDEADLK;
 			goto unlock_all;
 		}
 		KASSERT(mp != NULL,
 		    ("vp %p with NULL v_mount", vp));
 		/*
 		 * Take an object reference and remember the page's index
 		 * before dropping the object lock, so that the page's
 		 * identity can be revalidated afterwards.
 		 */
 		vm_object_reference_locked(object);
 		pindex = m->pindex;
 		VM_OBJECT_WUNLOCK(object);
 		lockmode = MNT_SHARED_WRITES(vp->v_mount) ?
 		    LK_SHARED : LK_EXCLUSIVE;
 		if (vget(vp, lockmode | LK_TIMELOCK, curthread)) {
 			/*
 			 * No vnode to vput().  The object lock is not held
 			 * here, hence the jump past unlock_all.
 			 */
 			vp = NULL;
 			error = EDEADLK;
 			goto unlock_mp;
 		}
 		VM_OBJECT_WLOCK(object);
 
 		/*
 		 * Ensure that the object and vnode were not disassociated
 		 * while locks were dropped.
 		 */
 		if (vp->v_object != object) {
 			error = ENOENT;
 			goto unlock_all;
 		}
 		vm_page_lock(m);
 
 		/*
 		 * While the object and page were unlocked, the page
 		 * may have been:
 		 * (1) moved to a different queue,
 		 * (2) reallocated to a different object,
 		 * (3) reallocated to a different offset, or
 		 * (4) cleaned.
 		 */
 		if (!vm_page_in_laundry(m) || m->object != object ||
 		    m->pindex != pindex || m->dirty == 0) {
 			vm_page_unlock(m);
 			error = ENXIO;
 			goto unlock_all;
 		}
 
 		/*
 		 * The page may have been busied or referenced while the object
 		 * and page locks were released.
 		 */
 		if (vm_page_busied(m) || vm_page_held(m)) {
 			vm_page_unlock(m);
 			error = EBUSY;
 			goto unlock_all;
 		}
 	}
 
 	/*
 	 * If a page is dirty, then it is either being washed
 	 * (but not yet cleaned) or it is still in the
 	 * laundry.  If it is still in the laundry, then we
 	 * start the cleaning operation.
 	 *
 	 * NOTE(review): the page lock is still held here and is not
 	 * dropped in this function on this path; presumably
 	 * vm_pageout_cluster() releases it, as the assertion below
 	 * requires.
 	 */
 	if ((*numpagedout = vm_pageout_cluster(m)) == 0)
 		error = EIO;
 
 unlock_all:
 	VM_OBJECT_WUNLOCK(object);
 
 unlock_mp:
 	vm_page_lock_assert(m, MA_NOTOWNED);
 	/*
 	 * A non-NULL mp means that vn_start_write() succeeded and an object
 	 * reference was taken above; undo both, and release the vnode if
 	 * vget() also succeeded.
 	 */
 	if (mp != NULL) {
 		if (vp != NULL)
 			vput(vp);
 		vm_object_deallocate(object);
 		vn_finished_write(mp);
 	}
 
 	return (error);
 }
 
 /*
  * Attempt to launder the specified number of pages.
  *
  * "vmd" is the domain whose laundry queues are scanned, "launder" is the
  * number of pages to try to launder, and "in_shortfall" indicates a
  * shortfall (as opposed to background) laundering run.  In a background run,
  * pages that get reactivated also count towards the target.
  *
  * Returns the number of pages successfully laundered, computed as the
  * starting target minus what remains of it.  Because a single pageout may
  * cover a whole cluster, the remainder can go negative and the return value
  * can therefore exceed "launder".
  */
 static int
 vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall)
 {
 	struct scan_state ss;
 	struct vm_pagequeue *pq;
 	struct mtx *mtx;
 	vm_object_t object;
 	vm_page_t m, marker;
 	int act_delta, error, numpagedout, queue, starting_target;
 	int vnodes_skipped;
 	bool pageout_ok;
 
 	/*
 	 * "mtx" and "object" track the page lock and object lock currently
 	 * held by the scan; NULL means none.
 	 */
 	mtx = NULL;
 	object = NULL;
 	starting_target = launder;
 	vnodes_skipped = 0;
 
 	/*
 	 * Scan the laundry queues for pages eligible to be laundered.  We stop
 	 * once the target number of dirty pages have been laundered, or once
 	 * we've reached the end of the queue.  A single iteration of this loop
 	 * may cause more than one page to be laundered because of clustering.
 	 *
 	 * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no
 	 * swap devices are configured.
 	 */
 	if (atomic_load_acq_int(&swapdev_enabled))
 		queue = PQ_UNSWAPPABLE;
 	else
 		queue = PQ_LAUNDRY;
 
 scan:
 	marker = &vmd->vmd_markers[queue];
 	pq = &vmd->vmd_pagequeues[queue];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 	while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) {
 		if (__predict_false((m->flags & PG_MARKER) != 0))
 			continue;
 
 		vm_page_change_lock(m, &mtx);
 
 recheck:
 		/*
 		 * The page may have been disassociated from the queue
 		 * while locks were dropped.
 		 */
 		if (vm_page_queue(m) != queue)
 			continue;
 
 		/*
 		 * A requeue was requested, so this page gets a second
 		 * chance.
 		 */
 		if ((m->aflags & PGA_REQUEUE) != 0) {
 			vm_page_requeue(m);
 			continue;
 		}
 
 		/*
 		 * Held pages are essentially stuck in the queue.
 		 *
 		 * Wired pages may not be freed.  Complete their removal
 		 * from the queue now to avoid needless revisits during
 		 * future scans.
 		 */
 		if (m->hold_count != 0)
 			continue;
 		if (vm_page_wired(m)) {
 			vm_page_dequeue_deferred(m);
 			continue;
 		}
 
 		/*
 		 * Switch object locks if this page belongs to a different
 		 * object than the previous one.  If the try-lock fails, drop
 		 * the page lock, block on the object lock, retake the page
 		 * lock, and revalidate the page from "recheck".
 		 */
 		if (object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 			object = m->object;
 			if (!VM_OBJECT_TRYWLOCK(object)) {
 				mtx_unlock(mtx);
 				/* Depends on type-stability. */
 				VM_OBJECT_WLOCK(object);
 				mtx_lock(mtx);
 				goto recheck;
 			}
 		}
 
 		if (vm_page_busied(m))
 			continue;
 
 		/*
 		 * Invalid pages can be easily freed.  They cannot be
 		 * mapped; vm_page_free() asserts this.
 		 */
 		if (m->valid == 0)
 			goto free_page;
 
 		/*
 		 * If the page has been referenced and the object is not dead,
 		 * reactivate or requeue the page depending on whether the
 		 * object is mapped.
 		 *
 		 * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 		 * that a reference from a concurrently destroyed mapping is
 		 * observed here and now.
 		 */
 		if (object->ref_count != 0)
 			act_delta = pmap_ts_referenced(m);
 		else {
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("page %p is mapped", m));
 			act_delta = 0;
 		}
 		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
 			act_delta++;
 		}
 		if (act_delta != 0) {
 			if (object->ref_count != 0) {
 				VM_CNT_INC(v_reactivated);
 				vm_page_activate(m);
 
 				/*
 				 * Increase the activation count if the page
 				 * was referenced while in the laundry queue.
 				 * This makes it less likely that the page will
 				 * be returned prematurely to the inactive
 				 * queue.
 				 */
 				m->act_count += act_delta + ACT_ADVANCE;
 
 				/*
 				 * If this was a background laundering, count
 				 * activated pages towards our target.  The
 				 * purpose of background laundering is to ensure
 				 * that pages are eventually cycled through the
 				 * laundry queue, and an activation is a valid
 				 * way out.
 				 */
 				if (!in_shortfall)
 					launder--;
 				continue;
 			} else if ((object->flags & OBJ_DEAD) == 0) {
 				vm_page_requeue(m);
 				continue;
 			}
 		}
 
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
 		 * anticipation of freeing it.  If, however, any of the page's
 		 * mappings allow write access, then the page may still be
 		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
 			if (m->dirty == 0)
 				pmap_remove_all(m);
 		}
 
 		/*
 		 * Clean pages are freed, and dirty pages are paged out unless
 		 * they belong to a dead object.  Requeueing dirty pages from
 		 * dead objects is pointless, as they are being paged out and
 		 * freed by the thread that destroyed the object.
 		 */
 		if (m->dirty == 0) {
 free_page:
 			vm_page_free(m);
 			VM_CNT_INC(v_dfree);
 		} else if ((object->flags & OBJ_DEAD) == 0) {
 			/*
 			 * Only OBJT_SWAP and OBJT_DEFAULT objects are subject
 			 * to disable_swap_pageouts; pages of every other
 			 * object type may always be paged out.
 			 */
 			if (object->type != OBJT_SWAP &&
 			    object->type != OBJT_DEFAULT)
 				pageout_ok = true;
 			else if (disable_swap_pageouts)
 				pageout_ok = false;
 			else
 				pageout_ok = true;
 			if (!pageout_ok) {
 				vm_page_requeue(m);
 				continue;
 			}
 
 			/*
 			 * Form a cluster with adjacent, dirty pages from the
 			 * same object, and page out that entire cluster.
 			 *
 			 * The adjacent, dirty pages must also be in the
 			 * laundry.  However, their mappings are not checked
 			 * for new references.  Consequently, a recently
 			 * referenced page may be paged out.  However, that
 			 * page will not be prematurely reclaimed.  After page
 			 * out, the page will be placed in the inactive queue,
 			 * where any new references will be detected and the
 			 * page reactivated.
 			 */
 			error = vm_pageout_clean(m, &numpagedout);
 			if (error == 0) {
 				launder -= numpagedout;
 				ss.scanned += numpagedout;
 			} else if (error == EDEADLK) {
 				pageout_lock_miss++;
 				vnodes_skipped++;
 			}
 			/*
 			 * vm_pageout_clean() released both the page lock and
 			 * the object lock, so forget them.
 			 */
 			mtx = NULL;
 			object = NULL;
 		}
 	}
 	if (mtx != NULL) {
 		mtx_unlock(mtx);
 		mtx = NULL;
 	}
 	if (object != NULL) {
 		VM_OBJECT_WUNLOCK(object);
 		object = NULL;
 	}
 	vm_pagequeue_lock(pq);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
 	/*
 	 * If the target was not met from PQ_UNSWAPPABLE, make a second pass
 	 * over PQ_LAUNDRY.
 	 */
 	if (launder > 0 && queue == PQ_UNSWAPPABLE) {
 		queue = PQ_LAUNDRY;
 		goto scan;
 	}
 
 	/*
 	 * Wakeup the sync daemon if we skipped a vnode in a writeable object
 	 * and we didn't launder enough pages.
 	 */
 	if (vnodes_skipped > 0 && launder > 0)
 		(void)speedup_syncer();
 
 	return (starting_target - launder);
 }
 
 /*
  * Return the integer square root of "num", i.e., the largest value whose
  * square does not exceed "num".
  *
  * The result is built one binary digit at a time.  "place" starts at the
  * largest power of four that is not greater than "num" and is divided by
  * four on every step.
  */
 static u_int
 isqrt(u_int num)
 {
 	u_int place, result, trial;
 
 	if (num == 0)
 		return (0);
 
 	result = 0;
 	for (place = 1u << ((fls(num) - 1) & ~1); place != 0; place >>= 2) {
 		trial = result + place;
 		result >>= 1;
 		if (trial <= num) {
 			num -= trial;
 			result += place;
 		}
 	}
 	return (result);
 }
 
 /*
  * Perform the work of the laundry thread: periodically wake up and determine
  * whether any pages need to be laundered.  If so, determine the number of pages
  * that need to be laundered, and launder them.
  *
  * "arg" carries the index of the VM domain served by this thread, encoded as
  * an integer in the pointer value.  The function never returns.
  *
  * Loop state:
  *	target		pages left to launder in the current run
  *	shortfall	size of a newly requested shortfall run, or 0
  *	in_shortfall	a shortfall run is in progress
  *	shortfall_cycle	iterations left over which the shortfall target is
  *			spread
  *	nfreed		clean pages freed by the page daemon, accumulated
  *			from vmd_clean_pages_freed, since it was last reset
  *	last_target	value of "target" when nfreed was last reset
  */
 static void
 vm_pageout_laundry_worker(void *arg)
 {
 	struct vm_domain *vmd;
 	struct vm_pagequeue *pq;
 	uint64_t nclean, ndirty, nfreed;
 	int domain, last_target, launder, shortfall, shortfall_cycle, target;
 	bool in_shortfall;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 
 	shortfall = 0;
 	in_shortfall = false;
 	shortfall_cycle = 0;
 	last_target = target = 0;
 	nfreed = 0;
 
 	/*
 	 * Calls to these handlers are serialized by the swap syscall lock.
 	 */
 	(void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd,
 	    EVENTHANDLER_PRI_ANY);
 	(void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd,
 	    EVENTHANDLER_PRI_ANY);
 
 	/*
 	 * The pageout laundry worker is never done, so loop forever.
 	 */
 	for (;;) {
 		KASSERT(target >= 0, ("negative target %d", target));
 		KASSERT(shortfall_cycle >= 0,
 		    ("negative cycle %d", shortfall_cycle));
 		launder = 0;
 
 		/*
 		 * First determine whether we need to launder pages to meet a
 		 * shortage of free pages.
 		 */
 		if (shortfall > 0) {
 			in_shortfall = true;
 			shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE;
 			target = shortfall;
 		} else if (!in_shortfall)
 			goto trybackground;
 		else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) {
 			/*
 			 * We recently entered shortfall and began laundering
 			 * pages.  If we have completed that laundering run
 			 * (and we are no longer in shortfall) or we have met
 			 * our laundry target through other activity, then we
 			 * can stop laundering pages.
 			 */
 			in_shortfall = false;
 			target = 0;
 			goto trybackground;
 		}
 		/*
 		 * Spread the remaining shortfall target evenly over the
 		 * remaining cycles.  shortfall_cycle is non-zero here: it was
 		 * either just set above or tested against zero.
 		 */
 		launder = target / shortfall_cycle--;
 		goto dolaundry;
 
 		/*
 		 * There's no immediate need to launder any pages; see if we
 		 * meet the conditions to perform background laundering:
 		 *
 		 * 1. The ratio of dirty to clean inactive pages exceeds the
 		 *    background laundering threshold, or
 		 * 2. we haven't yet reached the target of the current
 		 *    background laundering run.
 		 *
 		 * The background laundering threshold is not a constant.
 		 * Instead, it is a slowly growing function of the number of
 		 * clean pages freed by the page daemon since the last
 		 * background laundering.  Thus, as the ratio of dirty to
 		 * clean inactive pages grows, the amount of memory pressure
 		 * required to trigger laundering decreases.  We ensure
 		 * that the threshold is non-zero after an inactive queue
 		 * scan, even if that scan failed to free a single clean page.
 		 */
 trybackground:
 		nclean = vmd->vmd_free_count +
 		    vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt;
 		ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt;
 		if (target == 0 && ndirty * isqrt(howmany(nfreed + 1,
 		    vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) {
 			target = vmd->vmd_background_launder_target;
 		}
 
 		/*
 		 * We have a non-zero background laundering target.  If we've
 		 * laundered up to our maximum without observing a page daemon
 		 * request, just stop.  This is a safety belt that ensures we
 		 * don't launder an excessive amount if memory pressure is low
 		 * and the ratio of dirty to clean pages is large.  Otherwise,
 		 * proceed at the background laundering rate.
 		 */
 		if (target > 0) {
 			if (nfreed > 0) {
 				nfreed = 0;
 				last_target = target;
 			} else if (last_target - target >=
 			    vm_background_launder_max * PAGE_SIZE / 1024) {
 				target = 0;
 			}
 			/*
 			 * NOTE(review): the "* PAGE_SIZE / 1024" scaling
 			 * suggests that these tunables are expressed in
 			 * kilobytes; confirm against their definitions.
 			 */
 			launder = vm_background_launder_rate * PAGE_SIZE / 1024;
 			launder /= VM_LAUNDER_RATE;
 			if (launder > target)
 				launder = target;
 		}
 
 dolaundry:
 		if (launder > 0) {
 			/*
 			 * Because of I/O clustering, the number of laundered
 			 * pages could exceed "target" by the maximum size of
 			 * a cluster minus one.  Clamp the subtraction so that
 			 * "target" never goes negative.
 			 */
 			target -= min(vm_pageout_launder(vmd, launder,
 			    in_shortfall), target);
 			pause("laundp", hz / VM_LAUNDER_RATE);
 		}
 
 		/*
 		 * If we're not currently laundering pages and the page daemon
 		 * hasn't posted a new request, sleep until the page daemon
 		 * kicks us.
 		 */
 		vm_pagequeue_lock(pq);
 		if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE)
 			(void)mtx_sleep(&vmd->vmd_laundry_request,
 			    vm_pagequeue_lockptr(pq), PVM, "launds", 0);
 
 		/*
 		 * If the pagedaemon has indicated that it's in shortfall, start
 		 * a shortfall laundering unless we're already in the middle of
 		 * one.  This may preempt a background laundering.
 		 */
 		if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL &&
 		    (!in_shortfall || shortfall_cycle == 0)) {
 			shortfall = vm_laundry_target(vmd) +
 			    vmd->vmd_pageout_deficit;
 			target = 0;
 		} else
 			shortfall = 0;
 
 		/*
 		 * With no run in progress the request has been consumed, so
 		 * reset it.  Also collect the count of clean pages freed
 		 * since the previous iteration.
 		 */
 		if (target == 0)
 			vmd->vmd_laundry_request = VM_LAUNDRY_IDLE;
 		nfreed += vmd->vmd_clean_pages_freed;
 		vmd->vmd_clean_pages_freed = 0;
 		vm_pagequeue_unlock(pq);
 	}
 }
 
 /*
  * Return the number of pages that the active queue scan should try to move
  * to the inactive or laundry queue for domain "vmd".
  *
  * The goal is the inactive target plus the paging target.  Against it we
  * credit every inactive page in full, and every laundry page at
  * 1/act_scan_laundry_weight of a page, since a dirty page must be laundered
  * before it can be reused and so does less to relieve a free page shortage.
  * The difference is then scaled up by act_scan_laundry_weight.  Besides
  * expressing the shortage in weighted units, this makes the scan deactivate
  * dirty pages more aggressively, which helps clustering.
  */
 static int
 vm_pageout_active_target(struct vm_domain *vmd)
 {
 	int shortage;
 
 	shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) -
 	    (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
 	    vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight);
 	return (shortage * act_scan_laundry_weight);
 }
 
 /*
  * Scan the active queue.  If there is no shortage of inactive pages, scan a
  * small portion of the queue in order to maintain quasi-LRU.
  *
  * "vmd" is the domain whose active queue is scanned.  "page_shortage" is the
  * number of pages that the caller would like moved out of the active queue.
  * When it is positive, up to the whole queue may be scanned; otherwise only
  * the number of pages needed to keep up with vm_pageout_update_period is
  * scanned.
  */
 static void
 vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage)
 {
 	struct scan_state ss;
 	struct mtx *mtx;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	long min_scan;
 	int act_delta, max_scan, scan_tick;
 
 	marker = &vmd->vmd_markers[PQ_ACTIVE];
 	pq = &vmd->vmd_pagequeues[PQ_ACTIVE];
 	vm_pagequeue_lock(pq);
 
 	/*
 	 * If we're just idle polling attempt to visit every
 	 * active page within 'update_period' seconds.
 	 */
 	scan_tick = ticks;
 	if (vm_pageout_update_period != 0) {
 		/*
 		 * Queue length scaled by the fraction of the update period
 		 * that has elapsed since the last recorded scan.
 		 */
 		min_scan = pq->pq_cnt;
 		min_scan *= scan_tick - vmd->vmd_last_active_scan;
 		min_scan /= hz * vm_pageout_update_period;
 	} else
 		min_scan = 0;
 	if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0))
 		vmd->vmd_last_active_scan = scan_tick;
 
 	/*
 	 * Scan the active queue for pages that can be deactivated.  Update
 	 * the per-page activity counter and use it to identify deactivation
 	 * candidates.  Held pages may be deactivated.
 	 *
 	 * To avoid requeuing each page that remains in the active queue, we
 	 * implement the CLOCK algorithm.  To keep the implementation of the
 	 * enqueue operation consistent for all page queues, we use two hands,
 	 * represented by marker pages. Scans begin at the first hand, which
 	 * precedes the second hand in the queue.  When the two hands meet,
 	 * they are moved back to the head and tail of the queue, respectively,
 	 * and scanning resumes.
 	 */
 	max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan;
 	mtx = NULL;
 act_scan:
 	vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan);
 	while ((m = vm_pageout_next(&ss, false)) != NULL) {
 		if (__predict_false(m == &vmd->vmd_clock[1])) {
 			/*
 			 * The first hand caught up with the second.  Reset
 			 * the hands to the head and tail of the queue, charge
 			 * the pages already scanned against the budget, and
 			 * restart the scan.
 			 */
 			vm_pagequeue_lock(pq);
 			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 			TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q);
 			TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0],
 			    plinks.q);
 			TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1],
 			    plinks.q);
 			max_scan -= ss.scanned;
 			vm_pageout_end_scan(&ss);
 			goto act_scan;
 		}
 		if (__predict_false((m->flags & PG_MARKER) != 0))
 			continue;
 
 		vm_page_change_lock(m, &mtx);
 
 		/*
 		 * The page may have been disassociated from the queue
 		 * while locks were dropped.
 		 */
 		if (vm_page_queue(m) != PQ_ACTIVE)
 			continue;
 
 		/*
 		 * Wired pages are dequeued lazily.
 		 */
 		if (vm_page_wired(m)) {
 			vm_page_dequeue_deferred(m);
 			continue;
 		}
 
 		/*
 		 * Check to see "how much" the page has been used.
 		 *
 		 * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 		 * that a reference from a concurrently destroyed mapping is
 		 * observed here and now.
 		 *
 		 * Perform an unsynchronized object ref count check.  While
 		 * the page lock ensures that the page is not reallocated to
 		 * another object, in particular, one with unmanaged mappings
 		 * that cannot support pmap_ts_referenced(), two races are,
 		 * nonetheless, possible:
 		 * 1) The count was transitioning to zero, but we saw a non-
 		 *    zero value.  pmap_ts_referenced() will return zero
 		 *    because the page is not mapped.
 		 * 2) The count was transitioning to one, but we saw zero.
 		 *    This race delays the detection of a new reference.  At
 		 *    worst, we will deactivate and reactivate the page.
 		 */
 		if (m->object->ref_count != 0)
 			act_delta = pmap_ts_referenced(m);
 		else
 			act_delta = 0;
 		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
 			act_delta++;
 		}
 
 		/*
 		 * Advance or decay the act_count based on recent usage.
 		 * The count is clamped to the range [0, ACT_MAX].
 		 */
 		if (act_delta != 0) {
 			m->act_count += ACT_ADVANCE + act_delta;
 			if (m->act_count > ACT_MAX)
 				m->act_count = ACT_MAX;
 		} else
 			m->act_count -= min(m->act_count, ACT_DECLINE);
 
 		if (m->act_count == 0) {
 			/*
 			 * When not short for inactive pages, let dirty pages go
 			 * through the inactive queue before moving to the
 			 * laundry queues.  This gives them some extra time to
 			 * be reactivated, potentially avoiding an expensive
 			 * pageout.  However, during a page shortage, the
 			 * inactive queue is necessarily small, and so dirty
 			 * pages would only spend a trivial amount of time in
 			 * the inactive queue.  Therefore, we might as well
 			 * place them directly in the laundry queue to reduce
 			 * queuing overhead.
 			 */
 			if (page_shortage <= 0)
 				vm_page_deactivate(m);
 			else {
 				/*
 				 * Calling vm_page_test_dirty() here would
 				 * require acquisition of the object's write
 				 * lock.  However, during a page shortage,
 				 * directing dirty pages into the laundry
 				 * queue is only an optimization and not a
 				 * requirement.  Therefore, we simply rely on
 				 * the opportunistic updates to the page's
 				 * dirty field by the pmap.
 				 *
 				 * A clean page reduces the shortage by
 				 * act_scan_laundry_weight, a dirty page by
 				 * only one.
 				 */
 				if (m->dirty == 0) {
 					vm_page_deactivate(m);
 					page_shortage -=
 					    act_scan_laundry_weight;
 				} else {
 					vm_page_launder(m);
 					page_shortage--;
 				}
 			}
 		}
 	}
 	if (mtx != NULL) {
 		mtx_unlock(mtx);
 		mtx = NULL;
 	}
 	/*
 	 * Park the first clock hand right after the scan marker, i.e., at the
 	 * position where this scan stopped, before ending the scan.
 	 */
 	vm_pagequeue_lock(pq);
 	TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q);
 	TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 }
 
 /*
  * Put a single page back onto the inactive queue being scanned by "ss".
  *
  * Nothing is done, and 0 is returned, if the page no longer belongs to
  * PQ_INACTIVE or is already marked PGA_ENQUEUED.  Otherwise the page is
  * marked PGA_ENQUEUED and linked in at a position chosen by its requeue
  * flags:
  *
  *	PGA_REQUEUE_HEAD	immediately before the domain's vmd_inacthead
  *	PGA_REQUEUE		at the tail of the queue
  *	neither			immediately before the scan marker
  *
  * In the first two cases both requeue flags are cleared.  Returns 1 when the
  * page was linked in, so that the caller can adjust the queue's page count.
  */
 static int
 vm_pageout_reinsert_inactive_page(struct scan_state *ss, vm_page_t m)
 {
 	struct vm_domain *vmd;
 
 	if (m->queue != PQ_INACTIVE)
 		return (0);
 	if ((m->aflags & PGA_ENQUEUED) != 0)
 		return (0);
 
 	vm_page_aflag_set(m, PGA_ENQUEUED);
 	if ((m->aflags & PGA_REQUEUE_HEAD) != 0) {
 		vmd = vm_pagequeue_domain(m);
 		TAILQ_INSERT_BEFORE(&vmd->vmd_inacthead, m, plinks.q);
 		vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 		return (1);
 	}
 	if ((m->aflags & PGA_REQUEUE) != 0) {
 		TAILQ_INSERT_TAIL(&ss->pq->pq_pl, m, plinks.q);
 		vm_page_aflag_clear(m, PGA_REQUEUE | PGA_REQUEUE_HEAD);
 		return (1);
 	}
 	TAILQ_INSERT_BEFORE(ss->marker, m, plinks.q);
 	return (1);
 }
 
 /*
  * Return stuck pages to the inactive queue so that the next scan looks at
  * them again.  A page whose queue state changed after it was physically
  * unlinked in vm_pageout_collect_batch() is left alone.
  *
  * When "m" is non-NULL it is first offered to the batch "bq"; if the batch
  * accepts it, the work is deferred and the function returns at once.
  * Otherwise, or when "m" is NULL, the page queue lock is taken, "m" (if any)
  * and every page drained from "bq" are reinserted, the queue's page count is
  * raised by the number of pages actually linked in, and "bq" is reset.
  */
 static void
 vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq,
     vm_page_t m)
 {
 	struct vm_pagequeue *queue;
 	vm_page_t page;
 	int added;
 
 	/* Fast path: there is still room in the batch. */
 	if (m != NULL && vm_batchqueue_insert(bq, m))
 		return;
 
 	queue = ss->pq;
 	added = 0;
 	vm_pagequeue_lock(queue);
 	if (m != NULL)
 		added += vm_pageout_reinsert_inactive_page(ss, m);
 	for (page = vm_batchqueue_pop(bq); page != NULL;
 	    page = vm_batchqueue_pop(bq))
 		added += vm_pageout_reinsert_inactive_page(ss, page);
 	vm_pagequeue_cnt_add(queue, added);
 	vm_pagequeue_unlock(queue);
 	vm_batchqueue_init(bq);
 }
 
 /*
  * Attempt to reclaim the requested number of pages from the inactive queue.
  * Returns true if the shortage was addressed.
  */
 static int
 vm_pageout_scan_inactive(struct vm_domain *vmd, int shortage,
     int *addl_shortage)
 {
 	struct scan_state ss;
 	struct vm_batchqueue rq;
 	struct mtx *mtx;
 	vm_page_t m, marker;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
 	int act_delta, addl_page_shortage, deficit, page_shortage;
 	int starting_page_shortage;
 
 	/*
 	 * The addl_page_shortage is an estimate of the number of temporarily
 	 * stuck pages in the inactive queue.  In other words, the
 	 * number of pages from the inactive count that should be
 	 * discounted in setting the target for the active queue scan.
 	 */
 	addl_page_shortage = 0;
 
 	/*
 	 * vmd_pageout_deficit counts the number of pages requested in
 	 * allocations that failed because of a free page shortage.  We assume
 	 * that the allocations will be reattempted and thus include the deficit
 	 * in our scan target.
 	 */
 	deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit);
 	starting_page_shortage = page_shortage = shortage + deficit;
 
 	mtx = NULL;
 	object = NULL;
 	vm_batchqueue_init(&rq);
 
 	/*
 	 * Start scanning the inactive queue for pages that we can free.  The
 	 * scan will stop when we reach the target or we have scanned the
 	 * entire queue.  (Note that m->act_count is not used to make
 	 * decisions for the inactive queue, only for the active queue.)
 	 */
 	marker = &vmd->vmd_markers[PQ_INACTIVE];
 	pq = &vmd->vmd_pagequeues[PQ_INACTIVE];
 	vm_pagequeue_lock(pq);
 	vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt);
 	while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) {
 		KASSERT((m->flags & PG_MARKER) == 0,
 		    ("marker page %p was dequeued", m));
 
 		vm_page_change_lock(m, &mtx);
 
 recheck:
 		/*
 		 * The page may have been disassociated from the queue
 		 * while locks were dropped.
 		 */
 		if (vm_page_queue(m) != PQ_INACTIVE) {
 			addl_page_shortage++;
 			continue;
 		}
 
 		/*
 		 * The page was re-enqueued after the page queue lock was
 		 * dropped, or a requeue was requested.  This page gets a second
 		 * chance.
 		 */
 		if ((m->aflags & (PGA_ENQUEUED | PGA_REQUEUE |
 		    PGA_REQUEUE_HEAD)) != 0)
 			goto reinsert;
 
 		/*
 		 * Held pages are essentially stuck in the queue.  So,
 		 * they ought to be discounted from the inactive count.
 		 * See the description of addl_page_shortage above.
 		 *
 		 * Wired pages may not be freed.  Complete their removal
 		 * from the queue now to avoid needless revisits during
 		 * future scans.
 		 */
 		if (m->hold_count != 0) {
 			addl_page_shortage++;
 			goto reinsert;
 		}
 		if (vm_page_wired(m)) {
 			vm_page_dequeue_deferred(m);
 			continue;
 		}
 
 		if (object != m->object) {
 			if (object != NULL)
 				VM_OBJECT_WUNLOCK(object);
 			object = m->object;
 			if (!VM_OBJECT_TRYWLOCK(object)) {
 				mtx_unlock(mtx);
 				/* Depends on type-stability. */
 				VM_OBJECT_WLOCK(object);
 				mtx_lock(mtx);
 				goto recheck;
 			}
 		}
 
 		if (vm_page_busied(m)) {
 			/*
 			 * Don't mess with busy pages.  Leave them at
 			 * the front of the queue.  Most likely, they
 			 * are being paged out and will leave the
 			 * queue shortly after the scan finishes.  So,
 			 * they ought to be discounted from the
 			 * inactive count.
 			 */
 			addl_page_shortage++;
 			goto reinsert;
 		}
 
 		/*
 		 * Invalid pages can be easily freed. They cannot be
 		 * mapped, vm_page_free() asserts this.
 		 */
 		if (m->valid == 0)
 			goto free_page;
 
 		/*
 		 * If the page has been referenced and the object is not dead,
 		 * reactivate or requeue the page depending on whether the
 		 * object is mapped.
 		 *
 		 * Test PGA_REFERENCED after calling pmap_ts_referenced() so
 		 * that a reference from a concurrently destroyed mapping is
 		 * observed here and now.
 		 */
 		if (object->ref_count != 0)
 			act_delta = pmap_ts_referenced(m);
 		else {
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("page %p is mapped", m));
 			act_delta = 0;
 		}
 		if ((m->aflags & PGA_REFERENCED) != 0) {
 			vm_page_aflag_clear(m, PGA_REFERENCED);
 			act_delta++;
 		}
 		if (act_delta != 0) {
 			if (object->ref_count != 0) {
 				VM_CNT_INC(v_reactivated);
 				vm_page_activate(m);
 
 				/*
 				 * Increase the activation count if the page
 				 * was referenced while in the inactive queue.
 				 * This makes it less likely that the page will
 				 * be returned prematurely to the inactive
 				 * queue.
  				 */
 				m->act_count += act_delta + ACT_ADVANCE;
 				continue;
 			} else if ((object->flags & OBJ_DEAD) == 0) {
 				vm_page_aflag_set(m, PGA_REQUEUE);
 				goto reinsert;
 			}
 		}
 
 		/*
 		 * If the page appears to be clean at the machine-independent
 		 * layer, then remove all of its mappings from the pmap in
 		 * anticipation of freeing it.  If, however, any of the page's
 		 * mappings allow write access, then the page may still be
 		 * modified until the last of those mappings are removed.
 		 */
 		if (object->ref_count != 0) {
 			vm_page_test_dirty(m);
 			if (m->dirty == 0)
 				pmap_remove_all(m);
 		}
 
 		/*
 		 * Clean pages can be freed, but dirty pages must be sent back
 		 * to the laundry, unless they belong to a dead object.
 		 * Requeueing dirty pages from dead objects is pointless, as
 		 * they are being paged out and freed by the thread that
 		 * destroyed the object.
 		 */
 		if (m->dirty == 0) {
 free_page:
 			/*
 			 * Because we dequeued the page and have already
 			 * checked for concurrent dequeue and enqueue
 			 * requests, we can safely disassociate the page
 			 * from the inactive queue.
 			 */
 			KASSERT((m->aflags & PGA_QUEUE_STATE_MASK) == 0,
 			    ("page %p has queue state", m));
 			m->queue = PQ_NONE;
 			vm_page_free(m);
 			page_shortage--;
 		} else if ((object->flags & OBJ_DEAD) == 0)
 			vm_page_launder(m);
 		continue;
 reinsert:
 		vm_pageout_reinsert_inactive(&ss, &rq, m);
 	}
 	if (mtx != NULL)
 		mtx_unlock(mtx);
 	if (object != NULL)
 		VM_OBJECT_WUNLOCK(object);
 	vm_pageout_reinsert_inactive(&ss, &rq, NULL);
 	vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL);
 	vm_pagequeue_lock(pq);
 	vm_pageout_end_scan(&ss);
 	vm_pagequeue_unlock(pq);
 
 	VM_CNT_ADD(v_dfree, starting_page_shortage - page_shortage);
 
 	/*
 	 * Wake up the laundry thread so that it can perform any needed
 	 * laundering.  If we didn't meet our target, we're in shortfall and
 	 * need to launder more aggressively.  If PQ_LAUNDRY is empty and no
 	 * swap devices are configured, the laundry thread has no work to do, so
 	 * don't bother waking it up.
 	 *
 	 * The laundry thread uses the number of inactive queue scans elapsed
 	 * since the last laundering to determine whether to launder again, so
 	 * keep count.
 	 */
 	if (starting_page_shortage > 0) {
 		pq = &vmd->vmd_pagequeues[PQ_LAUNDRY];
 		vm_pagequeue_lock(pq);
 		if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE &&
 		    (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) {
 			if (page_shortage > 0) {
 				vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL;
 				VM_CNT_INC(v_pdshortfalls);
 			} else if (vmd->vmd_laundry_request !=
 			    VM_LAUNDRY_SHORTFALL)
 				vmd->vmd_laundry_request =
 				    VM_LAUNDRY_BACKGROUND;
 			wakeup(&vmd->vmd_laundry_request);
 		}
 		vmd->vmd_clean_pages_freed +=
 		    starting_page_shortage - page_shortage;
 		vm_pagequeue_unlock(pq);
 	}
 
 	/*
 	 * Wakeup the swapout daemon if we didn't free the targeted number of
 	 * pages.
 	 */
 	if (page_shortage > 0)
 		vm_swapout_run();
 
 	/*
 	 * If the inactive queue scan fails repeatedly to meet its
 	 * target, kill the largest process.
 	 */
 	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
 
 	/*
 	 * Reclaim pages by swapping out idle processes, if configured to do so.
 	 */
 	vm_swapout_run_idle();
 
 	/*
 	 * See the description of addl_page_shortage above.
 	 */
 	*addl_shortage = addl_page_shortage + deficit;
 
 	return (page_shortage <= 0);
 }
 
 /*
  * Number of page daemon threads that currently vote for an OOM kill.
  * vm_pageout_mightbe_oom() adjusts it atomically and compares the previous
  * value against vm_ndomains - 1 to detect the last voter.
  */
 static int vm_pageout_oom_vote;
 
 /*
  * The pagedaemon threads randomly select one to perform the
  * OOM.  Trying to kill processes before all pagedaemons
  * failed to reach free target is premature.
  */
 static void
 vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
     int starting_page_shortage)
 {
 	int votes_before;
 
 	/*
 	 * Extend the run of failed scans only when there was a shortage to
 	 * begin with and the scan left it unchanged; anything else restarts
 	 * the run.
 	 */
 	if (starting_page_shortage > 0 &&
 	    starting_page_shortage == page_shortage)
 		vmd->vmd_oom_seq++;
 	else
 		vmd->vmd_oom_seq = 0;
 
 	/*
 	 * The run is still too short to justify an OOM kill.  Withdraw this
 	 * domain's vote if one is outstanding.
 	 */
 	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 		if (vmd->vmd_oom) {
 			vmd->vmd_oom = FALSE;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
 		}
 		return;
 	}
 
 	/*
 	 * Start counting afresh so that this path is not taken again until
 	 * another full run of failed scans has accumulated.
 	 */
 	vmd->vmd_oom_seq = 0;
 
 	if (!vmd->vmd_oom) {
 		/* Cast this domain's vote. */
 		vmd->vmd_oom = TRUE;
 		votes_before = atomic_fetchadd_int(&vm_pageout_oom_vote, 1);
 		if (votes_before == vm_ndomains - 1) {
 			/*
 			 * Every other domain had already voted, so this
 			 * thread completes the quorum and picks the victim.
 			 */
 			vm_pageout_oom(VM_OOM_MEM);
 
 			/*
 			 * Recall the vote after one round; with vmd_oom
 			 * cleared this domain can vote again on a later pass
 			 * if memory is still short.
 			 */
 			vmd->vmd_oom = FALSE;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
 		}
 	}
 }
 
 /*
  * The OOM killer is the page daemon's action of last resort when
  * memory allocation requests have been stalled for a prolonged period
  * of time because it cannot reclaim memory.  This function computes
  * the approximate number of physical pages that could be reclaimed if
  * the specified address space is destroyed.
  *
  * Private, anonymous memory owned by the address space is the
  * principal resource that we expect to recover after an OOM kill.
  * Since the physical pages mapped by the address space's COW entries
  * are typically shared pages, they are unlikely to be released and so
  * they are not counted.
  *
  * To get to the point where the page daemon runs the OOM killer, its
  * efforts to write-back vnode-backed pages may have stalled.  This
  * could be caused by a memory allocation deadlock in the write path
  * that might be resolved by an OOM kill.  Therefore, physical pages
  * belonging to vnode-backed objects are counted, because they might
  * be freed without being written out first if the address space holds
  * the last reference to an unlinked vnode.
  *
  * Similarly, physical pages belonging to OBJT_PHYS objects are
  * counted because the address space might hold the last reference to
  * the object.
  */
 static long
 vm_pageout_oom_pagecount(struct vmspace *vmspace)
 {
 	vm_map_t map;
 	vm_map_entry_t cur;
 	vm_object_t backing;
 	long npages;
 	int kind;
 
 	map = &vmspace->vm_map;
 	KASSERT(!map->system_map, ("system map"));
 	sx_assert(&map->lock, SA_LOCKED);
 
 	npages = 0;
 	for (cur = map->header.next; cur != &map->header; cur = cur->next) {
 		/* Submaps and entries without a backing object add nothing. */
 		if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) != 0 ||
 		    (backing = cur->object.vm_object) == NULL)
 			continue;
 
 		/*
 		 * A needs-copy entry is counted only when it holds the sole
 		 * reference to its object.
 		 */
 		if ((cur->eflags & MAP_ENTRY_NEEDS_COPY) == 0 ||
 		    backing->ref_count == 1) {
 			kind = backing->type;
 			if (kind == OBJT_DEFAULT || kind == OBJT_SWAP ||
 			    kind == OBJT_PHYS || kind == OBJT_VNODE)
 				npages += backing->resident_page_count;
 		}
 	}
 	return (npages);
 }
 
 /*
  * Select the process with the largest measured footprint and kill it, or
  * panic instead when vm_panic_on_oom is non-zero.
  *
  * The footprint of a process is vmspace_swap_count() of its vmspace, plus
  * vm_pageout_oom_pagecount() when shortage equals VM_OOM_MEM.  A process is
  * never considered when it is not in PRS_NORMAL state, carries any of
  * P_INEXEC, P_PROTECTED, P_SYSTEM or P_WEXIT, has pid 1, is already killed,
  * or has a pid below 48 while swap_pager_avail is non-zero.
  */
 void
 vm_pageout_oom(int shortage)
 {
 	struct proc *p, *bigproc;
 	vm_offset_t size, bigsize;
 	struct thread *td;
 	struct vmspace *vm;
 	bool breakout;
 
 	/*
 	 * We keep the process bigproc locked once we find it to keep anyone
 	 * from messing with it; however, there is a possibility of
 	 * deadlock if process B is bigproc and one of its child processes
 	 * attempts to propagate a signal to B while we are waiting for A's
 	 * lock while walking this list.  To avoid this, we don't block on
 	 * the process lock but just skip a process if it is already locked.
 	 *
 	 * NOTE(review): the code below keeps bigproc held through
 	 * _PHOLD_LITE()/PRELE() rather than locked, and takes PROC_LOCK()
 	 * unconditionally; the paragraph above may be out of date.
 	 */
 	bigproc = NULL;
 	bigsize = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 
 		/*
 		 * If this is a system, protected or killed process, skip it.
 		 */
 		if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC |
 		    P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 ||
 		    p->p_pid == 1 || P_KILLED(p) ||
 		    (p->p_pid < 48 && swap_pager_avail != 0)) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * If the process is in a non-running type state,
 		 * don't touch it.  Check all the threads individually.
 		 */
 		breakout = false;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			thread_lock(td);
 			if (!TD_ON_RUNQ(td) &&
 			    !TD_IS_RUNNING(td) &&
 			    !TD_IS_SLEEPING(td) &&
 			    !TD_IS_SUSPENDED(td) &&
 			    !TD_IS_SWAPPED(td)) {
 				thread_unlock(td);
 				breakout = true;
 				break;
 			}
 			thread_unlock(td);
 		}
 		if (breakout) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * get the process size
 		 */
 		vm = vmspace_acquire_ref(p);
 		if (vm == NULL) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/*
 		 * Take a hold on the process before the process lock and
 		 * allproc_lock are dropped for the map inspection.  The hold
 		 * is released with PRELE() as soon as the candidate is
 		 * rejected or replaced by a bigger one.
 		 */
 		_PHOLD_LITE(p);
 		PROC_UNLOCK(p);
 		sx_sunlock(&allproc_lock);
 		/*
 		 * Only try the map lock; a process whose map is busy is
 		 * skipped rather than waited for.
 		 */
 		if (!vm_map_trylock_read(&vm->vm_map)) {
 			vmspace_free(vm);
 			sx_slock(&allproc_lock);
 			PRELE(p);
 			continue;
 		}
 		size = vmspace_swap_count(vm);
 		if (shortage == VM_OOM_MEM)
 			size += vm_pageout_oom_pagecount(vm);
 		vm_map_unlock_read(&vm->vm_map);
 		vmspace_free(vm);
 		sx_slock(&allproc_lock);
 
 		/*
 		 * If this process is bigger than the biggest one,
 		 * remember it.
 		 */
 		if (size > bigsize) {
 			if (bigproc != NULL)
 				PRELE(bigproc);
 			bigproc = p;
 			bigsize = size;
 		} else {
 			PRELE(p);
 		}
 	}
 	sx_sunlock(&allproc_lock);
 	if (bigproc != NULL) {
 		if (vm_panic_on_oom != 0)
 			panic("out of swap space");
 		PROC_LOCK(bigproc);
 		killproc(bigproc, "out of swap space");
 		sched_nice(bigproc, PRIO_MIN);
 		/* Drop the hold that was kept on the winner during the scan. */
 		_PRELE(bigproc);
 		PROC_UNLOCK(bigproc);
 	}
 }
 
 /*
  * Run the low-memory handlers and reclaim UMA caches, but no more often
  * than once per lowmem_period.  Returns true when the handlers were run
  * by this call and false otherwise.
  */
 static bool
 vm_pageout_lowmem(void)
 {
 	static int lowmem_ticks = 0;
 	int seen;
 
 	/*
 	 * Claim the current period by swapping our tick value into
 	 * lowmem_ticks.  A failed compare-and-set refreshes "seen", so the
 	 * elapsed-time test is repeated against the newer value.
 	 */
 	seen = atomic_load_int(&lowmem_ticks);
 	for (;;) {
 		if ((u_int)(ticks - seen) / hz < lowmem_period)
 			return (false);
 		if (atomic_fcmpset_int(&lowmem_ticks, &seen, ticks) != 0)
 			break;
 	}
 
 	/*
 	 * Decrease registered cache sizes.
 	 */
 	SDT_PROBE0(vm, , , vm__lowmem_scan);
 	EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES);
 
 	/*
 	 * The UMA reclaim is done explicitly, after the handlers above have
 	 * drained their caches.
 	 */
 	uma_reclaim();
 	return (true);
 }
 
 /*
  * Main loop of a per-domain page daemon thread.
  *
  * arg carries the domain index, passed through uintptr_t.  Each iteration
  * sleeps or pauses as appropriate, asks the PID controller for the current
  * shortage, scans the inactive queue when that shortage is positive, and
  * then always scans the active queue.  The loop never terminates.
  */
 static void
 vm_pageout_worker(void *arg)
 {
 	struct vm_domain *vmd;
 	u_int ofree;
 	int addl_shortage, domain, shortage;
 	bool target_met;
 
 	domain = (uintptr_t)arg;
 	vmd = VM_DOMAIN(domain);
 	shortage = 0;
 	/* Start as "met" so that the first pass does not pause. */
 	target_met = true;
 
 	/*
 	 * XXXKIB It could be useful to bind pageout daemon threads to
 	 * the cores belonging to the domain, from which vm_page_array
 	 * is allocated.
 	 */
 
 	KASSERT(vmd->vmd_segs != 0, ("domain without segments"));
 	vmd->vmd_last_active_scan = ticks;
 
 	/*
 	 * The pageout daemon worker is never done, so loop forever.
 	 */
 	while (TRUE) {
 		vm_domain_pageout_lock(vmd);
 
 		/*
 		 * We need to clear wanted before we check the limits.  This
 		 * prevents races with wakers who will check wanted after they
 		 * reach the limit.
 		 */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 0);
 
 		/*
 		 * Might the page daemon need to run again?
 		 */
 		if (vm_paging_needed(vmd, vmd->vmd_free_count)) {
 			/*
 			 * Yes.  If the scan failed to produce enough free
 			 * pages, sleep uninterruptibly for some time in the
 			 * hope that the laundry thread will clean some pages.
 			 */
 			vm_domain_pageout_unlock(vmd);
 			if (!target_met)
 				pause("pwait", hz / VM_INACT_SCAN_RATE);
 		} else {
 			/*
 			 * No, sleep until the next wakeup or until pages
 			 * need to have their reference stats updated.
 			 * PDROP leaves the pageout lock released on return,
 			 * and only a zero return from mtx_sleep() is counted
 			 * as a wakeup.
 			 */
 			if (mtx_sleep(&vmd->vmd_pageout_wanted,
 			    vm_domain_pageout_lockptr(vmd), PDROP | PVM,
 			    "psleep", hz / VM_INACT_SCAN_RATE) == 0)
 				VM_CNT_INC(v_pdwakeups);
 		}
 
 		/* Prevent spurious wakeups by ensuring that wanted is set. */
 		atomic_store_int(&vmd->vmd_pageout_wanted, 1);
 
 		/*
 		 * Use the controller to calculate how many pages to free in
 		 * this interval, and scan the inactive queue.  If the lowmem
 		 * handlers appear to have freed up some pages, subtract the
 		 * difference from the inactive queue scan target.
 		 */
 		shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count);
 		if (shortage > 0) {
 			/*
 			 * Snapshot the free count so that the gain from the
 			 * lowmem handlers can be measured; min() keeps the
 			 * credit from exceeding the shortage itself.
 			 */
 			ofree = vmd->vmd_free_count;
 			if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree)
 				shortage -= min(vmd->vmd_free_count - ofree,
 				    (u_int)shortage);
 			target_met = vm_pageout_scan_inactive(vmd, shortage,
 			    &addl_shortage);
 		} else
 			addl_shortage = 0;
 
 		/*
 		 * Scan the active queue.  A positive value for shortage
 		 * indicates that we must aggressively deactivate pages to avoid
 		 * a shortfall.
 		 */
 		shortage = vm_pageout_active_target(vmd) + addl_shortage;
 		vm_pageout_scan_active(vmd, shortage);
 	}
 }
 
 /*
  *	vm_pageout_init_domain initialises the basic pageout daemon settings
  *	of a single domain: the free-page thresholds, the inactive target,
  *	the wakeup threshold, the background laundering target and the PID
  *	controller together with its sysctl node.
  */
 static void
 vm_pageout_init_domain(int domain)
 {
 	struct vm_domain *vmd;
 	struct sysctl_oid *oid;
 
 	vmd = VM_DOMAIN(domain);
 	vmd->vmd_interrupt_free_min = 2;
 
 	/*
 	 * v_free_reserved needs to include enough for the largest
 	 * swap pager structures plus enough for any pv_entry structs
 	 * when paging.
 	 */
 	if (vmd->vmd_page_count > 1024)
 		vmd->vmd_free_min = 4 + (vmd->vmd_page_count - 1024) / 200;
 	else
 		vmd->vmd_free_min = 4;
-	vmd->vmd_pageout_free_min = (2*MAXBSIZE)/PAGE_SIZE +
+	vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE +
 	    vmd->vmd_interrupt_free_min;
 	vmd->vmd_free_reserved = vm_pageout_page_count +
 	    vmd->vmd_pageout_free_min + (vmd->vmd_page_count / 768);
 	/*
 	 * free_severe and free_target are derived from the base free_min
 	 * computed above; the reserve is added to free_min and free_severe
 	 * only afterwards.
 	 */
 	vmd->vmd_free_severe = vmd->vmd_free_min / 2;
 	vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved;
 	vmd->vmd_free_min += vmd->vmd_free_reserved;
 	vmd->vmd_free_severe += vmd->vmd_free_reserved;
 	/* Inactive target: 1.5 times the free target, capped at free / 3. */
 	vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2;
 	if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3)
 		vmd->vmd_inactive_target = vmd->vmd_free_count / 3;
 
 	/*
 	 * Set the default wakeup threshold to be 10% below the paging
 	 * target.  This keeps the steady state out of shortfall.
 	 */
 	vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9;
 
 	/*
 	 * Target amount of memory to move out of the laundry queue during a
 	 * background laundering.  This is proportional to the amount of system
 	 * memory.
 	 */
 	vmd->vmd_background_launder_target = (vmd->vmd_free_target -
 	    vmd->vmd_free_min) / 10;
 
 	/* Initialize the pageout daemon pid controller. */
 	pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE,
 	    vmd->vmd_free_target, PIDCTRL_BOUND,
 	    PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD);
 	/* Expose the controller under a "pidctrl" node of the domain's oid. */
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO,
 	    "pidctrl", CTLFLAG_RD, NULL, "");
 	pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid));
 }
 
 static void
 vm_pageout_init(void)
 {
 	u_int freecount;
 	int i;
 
 	/*
 	 * Initialize some paging parameters.
 	 */
 	if (vm_cnt.v_page_count < 2000)
 		vm_pageout_page_count = 8;
 
 	freecount = 0;
 	for (i = 0; i < vm_ndomains; i++) {
 		struct vm_domain *vmd;
 
 		vm_pageout_init_domain(i);
 		vmd = VM_DOMAIN(i);
 		vm_cnt.v_free_reserved += vmd->vmd_free_reserved;
 		vm_cnt.v_free_target += vmd->vmd_free_target;
 		vm_cnt.v_free_min += vmd->vmd_free_min;
 		vm_cnt.v_inactive_target += vmd->vmd_inactive_target;
 		vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min;
 		vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min;
 		vm_cnt.v_free_severe += vmd->vmd_free_severe;
 		freecount += vmd->vmd_free_count;
 	}
 
 	/*
 	 * Set interval in seconds for active scan.  We want to visit each
 	 * page at least once every ten minutes.  This is to prevent worst
 	 * case paging behaviors with stale active LRU.
 	 */
 	if (vm_pageout_update_period == 0)
 		vm_pageout_update_period = 600;
 
 	if (vm_page_max_user_wired == 0)
 		vm_page_max_user_wired = freecount / 3;
 }
 
 /*
  *     vm_pageout is the high level pageout daemon.
  *
  *     It initialises the swap pager, starts a laundry thread for every
  *     non-empty domain and a page daemon worker thread for every non-empty
  *     domain except the first one found, starts the UMA reclaim helper,
  *     and finally runs the worker for that first domain in the calling
  *     thread.  Failure to create any of the threads is fatal.
  */
 static void
 vm_pageout(void)
 {
 	struct proc *p;
 	struct thread *td;
 	int error, first, i;
 
 	p = curproc;
 	td = curthread;
 
 	swap_pager_swap_init();
 	for (first = -1, i = 0; i < vm_ndomains; i++) {
 		if (VM_DOMAIN_EMPTY(i)) {
 			if (bootverbose)
 				printf("domain %d empty; skipping pageout\n",
 				    i);
 			continue;
 		}
 		/*
 		 * The first non-empty domain is served by this thread (see
 		 * the end of the function); every later one gets its own
 		 * worker thread.
 		 */
 		if (first == -1)
 			first = i;
 		else {
 			error = kthread_add(vm_pageout_worker,
 			    (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i);
 			if (error != 0)
 				panic("starting pageout for domain %d: %d\n",
 				    i, error);
 		}
 		error = kthread_add(vm_pageout_laundry_worker,
 		    (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i);
 		if (error != 0)
 			panic("starting laundry for domain %d: %d", i, error);
 	}
 	error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma");
 	if (error != 0)
 		panic("starting uma_reclaim helper, error %d\n", error);
 
 	/* Rename this thread after its domain and become that worker. */
 	snprintf(td->td_name, sizeof(td->td_name), "dom%d", first);
 	vm_pageout_worker((void *)(uintptr_t)first);
 }
 
 /*
  * Perform an advisory wakeup of the page daemon.
  */
 void
 pagedaemon_wakeup(int domain)
 {
 	struct vm_domain *dom;
 
 	dom = VM_DOMAIN(domain);
 	vm_domain_pageout_assert_unlocked(dom);
 
 	/* Nothing to do when called from the page daemon process itself. */
 	if (curproc == pageproc)
 		return;
 
 	/*
 	 * Only the caller that finds the wanted counter at zero goes on to
 	 * take the lock and issue the wakeup.
 	 */
 	if (atomic_fetchadd_int(&dom->vmd_pageout_wanted, 1) != 0)
 		return;
 
 	vm_domain_pageout_lock(dom);
 	atomic_store_int(&dom->vmd_pageout_wanted, 1);
 	wakeup(&dom->vmd_pageout_wanted);
 	vm_domain_pageout_unlock(dom);
 }
Index: head/sys/vm/vnode_pager.c
===================================================================
--- head/sys/vm/vnode_pager.c	(revision 349790)
+++ head/sys/vm/vnode_pager.c	(revision 349791)
@@ -1,1589 +1,1589 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1991 The Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1993, 1994 John S. Dyson
  * Copyright (c) 1995, David Greenman
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from: @(#)vnode_pager.c	7.5 (Berkeley) 4/20/91
  */
 
 /*
  * Page to/from files (vnodes).
  */
 
 /*
  * TODO:
  *	Implement VOP_GETPAGES/PUTPAGES interface for filesystems. Will
  *	greatly re-simplify the vnode_pager.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/vmmeter.h>
 #include <sys/ktr.h>
 #include <sys/limits.h>
 #include <sys/conf.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/domainset.h>
 
 #include <machine/atomic.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_map.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 /*
  * Prototypes for the routines installed in vnodepagerops below and for the
  * file-local helpers they rely on.
  */
 static int vnode_pager_addr(struct vnode *vp, vm_ooffset_t address,
     daddr_t *rtaddress, int *run);
 static int vnode_pager_input_smlfs(vm_object_t object, vm_page_t m);
 static int vnode_pager_input_old(vm_object_t object, vm_page_t m);
 static void vnode_pager_dealloc(vm_object_t);
 static int vnode_pager_getpages(vm_object_t, vm_page_t *, int, int *, int *);
 static int vnode_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
     int *, vop_getpages_iodone_t, void *);
 static void vnode_pager_putpages(vm_object_t, vm_page_t *, int, int, int *);
 static boolean_t vnode_pager_haspage(vm_object_t, vm_pindex_t, int *, int *);
 static vm_object_t vnode_pager_alloc(void *, vm_ooffset_t, vm_prot_t,
     vm_ooffset_t, struct ucred *cred);
 static int vnode_pager_generic_getpages_done(struct buf *);
 static void vnode_pager_generic_getpages_done_async(struct buf *);
 
 /* Pager operations table of the vnode pager. */
 struct pagerops vnodepagerops = {
 	.pgo_alloc =	vnode_pager_alloc,
 	.pgo_dealloc =	vnode_pager_dealloc,
 	.pgo_getpages =	vnode_pager_getpages,
 	.pgo_getpages_async = vnode_pager_getpages_async,
 	.pgo_putpages =	vnode_pager_putpages,
 	.pgo_haspage =	vnode_pager_haspage,
 };
 
 /*
  * NUMA policy stored into domain.dr_policy of every vnode object created by
  * vnode_pager_alloc(); adjustable through the debug.vnode_domainset sysctl.
  */
 static struct domainset *vnode_domainset = NULL;
 
 SYSCTL_PROC(_debug, OID_AUTO, vnode_domainset, CTLTYPE_STRING | CTLFLAG_RW,
     &vnode_domainset, 0, sysctl_handle_domainset, "A",
     "Default vnode NUMA policy");
 
 /*
  * Number of pbufs for the vnode pager.  The value is computed, and the
  * vm.vnode_pbufs tunable fetched, in vnode_pager_init().
  */
 static int nvnpbufs;
 SYSCTL_INT(_vm, OID_AUTO, vnode_pbufs, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &nvnpbufs, 0, "number of physical buffers allocated for vnode pager");
 
 /* Zone from which the vnode pager allocates its pbufs. */
 static uma_zone_t vnode_pbuf_zone;
 
 /*
  * Size and create the pbuf zone used by the vnode pager.  The default count
  * is derived from nswbuf and may be overridden by the vm.vnode_pbufs
  * tunable.
  */
 static void
 vnode_pager_init(void *dummy)
 {
 	int dflt;
 
 	/* LP64 platforms get twice nswbuf, all others half of it. */
 #ifdef __LP64__
 	dflt = nswbuf * 2;
 #else
 	dflt = nswbuf / 2;
 #endif
 	nvnpbufs = dflt;
 	TUNABLE_INT_FETCH("vm.vnode_pbufs", &nvnpbufs);
 	vnode_pbuf_zone = pbuf_zsecond_create("vnpbuf", nvnpbufs);
 }
 SYSINIT(vnode_pager, SI_SUB_CPU, SI_ORDER_ANY, vnode_pager_init, NULL);
 
 /*
  * Create the VM system backing object for this vnode.
  *
  * Nothing is done when the vnode is neither a disk nor VMIO-capable, or
  * when it already has an object that is not dead.  When isize is zero the
  * size is IDX_TO_OFF(INT_MAX) for disks and the va_size reported by
  * VOP_GETATTR() otherwise.  Every path visible here returns 0.
  */
 int
 vnode_create_vobject(struct vnode *vp, off_t isize, struct thread *td)
 {
 	vm_object_t object;
 	vm_ooffset_t size = isize;
 	struct vattr va;
 
 	if (!vn_isdisk(vp, NULL) && vn_canvmio(vp) == FALSE)
 		return (0);
 
 	/*
 	 * While a dead object is still attached, drop the vnode lock, ask to
 	 * be woken via OBJ_DISCONNECTWNT, sleep (PDROP releases the object
 	 * lock), then relock the vnode and look again.
 	 */
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if (!(object->flags & OBJ_DEAD)) {
 			VM_OBJECT_WUNLOCK(object);
 			return (0);
 		}
 		VOP_UNLOCK(vp, 0);
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vodead", 0);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 
 	if (size == 0) {
 		if (vn_isdisk(vp, NULL)) {
 			size = IDX_TO_OFF(INT_MAX);
 		} else {
 			/* A failed attribute lookup is not reported. */
 			if (VOP_GETATTR(vp, &va, td->td_ucred))
 				return (0);
 			size = va.va_size;
 		}
 	}
 
 	object = vnode_pager_alloc(vp, size, 0, 0, td->td_ucred);
 	/*
 	 * Dereference the reference we just created.  This assumes
 	 * that the object is associated with the vp.
 	 */
 	VM_OBJECT_WLOCK(object);
 	object->ref_count--;
 	VM_OBJECT_WUNLOCK(object);
 	/* Undo the vrefact() performed by vnode_pager_alloc(). */
 	vrele(vp);
 
 	KASSERT(vp->v_object != NULL, ("vnode_create_vobject: NULL object"));
 
 	return (0);
 }
 
 /*
  * Tear down the VM object attached to a vnode, if there is one.
  *
  * The vnode must be exclusively locked (asserted below).  An unreferenced
  * object is terminated unless it is already dead; a referenced object only
  * has its pager deallocated.  On return vp->v_object is asserted to be NULL.
  */
 void
 vnode_destroy_vobject(struct vnode *vp)
 {
 	struct vm_object *obj;
 
 	obj = vp->v_object;
 	if (obj == NULL)
 		return;
 	ASSERT_VOP_ELOCKED(vp, "vnode_destroy_vobject");
 	VM_OBJECT_WLOCK(obj);
 	umtx_shm_object_terminated(obj);
 	if (obj->ref_count == 0) {
 		/*
 		 * don't double-terminate the object
 		 *
 		 * NOTE(review): no unlock follows vm_object_terminate() here,
 		 * so it presumably releases the object lock itself.
 		 */
 		if ((obj->flags & OBJ_DEAD) == 0) {
 			vm_object_terminate(obj);
 		} else {
 			/*
 			 * Waiters were already handled during object
 			 * termination.  The exclusive vnode lock hopefully
 			 * prevented new waiters from referencing the dying
 			 * object.
 			 */
 			KASSERT((obj->flags & OBJ_DISCONNECTWNT) == 0,
 			    ("OBJ_DISCONNECTWNT set obj %p flags %x",
 			    obj, obj->flags));
 			vp->v_object = NULL;
 			VM_OBJECT_WUNLOCK(obj);
 		}
 	} else {
 		/*
 		 * Woe to the process that tries to page now :-).
 		 */
 		vm_pager_deallocate(obj);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	KASSERT(vp->v_object == NULL, ("vp %p obj %p", vp, vp->v_object));
 }
 
 
 /*
  * Allocate (or lookup) pager for a vnode.
  * Handle is a vnode pointer.
  *
  * Returns NULL when handle is NULL.  Otherwise returns the vnode's object,
  * either a newly allocated one or the existing one with ref_count
  * incremented, after calling vrefact() on the vnode.  The prot, offset and
  * cred arguments are not used by the code below.
  *
  * MPSAFE
  */
 vm_object_t
 vnode_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t object;
 	struct vnode *vp;
 
 	/*
 	 * Pageout to vnode, no can do yet.
 	 */
 	if (handle == NULL)
 		return (NULL);
 
 	vp = (struct vnode *) handle;
 
 	/*
 	 * If the object is being terminated, wait for it to
 	 * go away.
 	 *
 	 * The loop exits either with object == NULL or, through the break,
 	 * with a live object whose lock is still held.
 	 */
 retry:
 	while ((object = vp->v_object) != NULL) {
 		VM_OBJECT_WLOCK(object);
 		if ((object->flags & OBJ_DEAD) == 0)
 			break;
 		vm_object_set_flag(object, OBJ_DISCONNECTWNT);
 		VM_OBJECT_SLEEP(object, object, PDROP | PVM, "vadead", 0);
 	}
 
 	KASSERT(vp->v_usecount != 0, ("vnode_pager_alloc: no vnode reference"));
 
 	if (object == NULL) {
 		/*
 		 * Add an object of the appropriate size
 		 */
 		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 
 		object->un_pager.vnp.vnp_size = size;
 		object->un_pager.vnp.writemappings = 0;
 		object->domain.dr_policy = vnode_domainset;
 
 		object->handle = handle;
 		VI_LOCK(vp);
 		if (vp->v_object != NULL) {
 			/*
 			 * Object has been created while we were sleeping
 			 *
 			 * Discard the object just allocated and start over
 			 * so that the one now attached to the vnode is used.
 			 */
 			VI_UNLOCK(vp);
 			VM_OBJECT_WLOCK(object);
 			KASSERT(object->ref_count == 1,
 			    ("leaked ref %p %d", object, object->ref_count));
 			object->type = OBJT_DEAD;
 			object->ref_count = 0;
 			VM_OBJECT_WUNLOCK(object);
 			vm_object_destroy(object);
 			goto retry;
 		}
 		vp->v_object = object;
 		VI_UNLOCK(vp);
 	} else {
 		/* Existing live object: take another reference and unlock. */
 		object->ref_count++;
 #if VM_NRESERVLEVEL > 0
 		vm_object_color(object, 0);
 #endif
 		VM_OBJECT_WUNLOCK(object);
 	}
 	vrefact(vp);
 	return (object);
 }
 
 /*
  *	The object must be locked.
  *
  *	Disconnects the object from its vnode: the object becomes OBJT_DEAD
  *	with a NULL handle, and the vnode's v_object is cleared.  The object
  *	lock is released while the vnode references are dropped and is held
  *	again on return.  Panics if the object has no vnode.
  */
 static void
 vnode_pager_dealloc(vm_object_t object)
 {
 	struct vnode *vp;
 	int refs;
 
 	vp = object->handle;
 	if (vp == NULL)
 		panic("vnode_pager_dealloc: pager already dealloced");
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	vm_object_pip_wait(object, "vnpdea");
 	/* Remember how many vunref() calls are owed once the lock is dropped. */
 	refs = object->ref_count;
 
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 	/* Wake any thread sleeping on the object with OBJ_DISCONNECTWNT set. */
 	if (object->flags & OBJ_DISCONNECTWNT) {
 		vm_object_clear_flag(object, OBJ_DISCONNECTWNT);
 		wakeup(object);
 	}
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
 	if (object->un_pager.vnp.writemappings > 0) {
 		object->un_pager.vnp.writemappings = 0;
 		VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 		CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 		    __func__, vp, vp->v_writecount);
 	}
 	vp->v_object = NULL;
 	VI_LOCK(vp);
 
 	/*
 	 * vm_map_entry_set_vnode_text() cannot reach this vnode by
 	 * following object->handle.  Clear all text references now.
 	 * This also clears the transient references from
 	 * kern_execve(), which is fine because dead_vnodeops uses nop
 	 * for VOP_UNSET_TEXT().
 	 */
 	if (vp->v_writecount < 0)
 		vp->v_writecount = 0;
 	VI_UNLOCK(vp);
 	VM_OBJECT_WUNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
 	VM_OBJECT_WLOCK(object);
 }
 
 /*
  * Report whether the backing file has storage for page pindex.
  *
  * Returns FALSE when there is no usable vnode, when the page lies at or
  * beyond vnp_size, or when VOP_BMAP() reports block number -1.  Returns
  * TRUE otherwise, including when VOP_BMAP() fails.  After a successful
  * lookup, non-NULL before/after are converted from filesystem blocks to
  * pages.  The object must be write-locked; the lock is dropped around
  * VOP_BMAP() and reacquired.
  */
 static boolean_t
 vnode_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
 {
 	struct vnode *vp = object->handle;
 	daddr_t bn;
 	int err;
 	daddr_t reqblock;
 	int poff;
 	int bsize;
 	int pagesperblock, blocksperpage;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	/*
 	 * If no vp or vp is doomed, we do not have the page.
 	 */
 	if (vp == NULL || vp->v_iflag & VI_DOOMED)
 		return FALSE;
 	/*
 	 * If the offset is beyond end of file we do
 	 * not have the page.
 	 */
 	if (IDX_TO_OFF(pindex) >= object->un_pager.vnp.vnp_size)
 		return FALSE;
 
 	/*
 	 * Translate the page index into a filesystem block number.  A zero
 	 * pagesperblock means the block is smaller than a page, in which
 	 * case blocksperpage is used instead.
 	 */
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	pagesperblock = bsize / PAGE_SIZE;
 	blocksperpage = 0;
 	if (pagesperblock > 0) {
 		reqblock = pindex / pagesperblock;
 	} else {
 		blocksperpage = (PAGE_SIZE / bsize);
 		reqblock = pindex * blocksperpage;
 	}
 	VM_OBJECT_WUNLOCK(object);
 	err = VOP_BMAP(vp, reqblock, NULL, &bn, after, before);
 	VM_OBJECT_WLOCK(object);
 	/* A failed lookup is reported as "page present". */
 	if (err)
 		return TRUE;
 	if (bn == -1)
 		return FALSE;
 	if (pagesperblock > 0) {
 		/* poff: index of the page within its block. */
 		poff = pindex - (reqblock * pagesperblock);
 		if (before) {
 			*before *= pagesperblock;
 			*before += poff;
 		}
 		if (after) {
 			/*
 			 * The BMAP vop can report a partial block in the
 			 * 'after', but must not report blocks after EOF.
 			 * Assert the latter, and truncate 'after' in case
 			 * of the former.
 			 */
 			KASSERT((reqblock + *after) * pagesperblock <
 			    roundup2(object->size, pagesperblock),
 			    ("%s: reqblock %jd after %d size %ju", __func__,
 			    (intmax_t )reqblock, *after,
 			    (uintmax_t )object->size));
 			*after *= pagesperblock;
 			*after += pagesperblock - (poff + 1);
 			if (pindex + *after >= object->size)
 				*after = object->size - 1 - pindex;
 		}
 	} else {
 		if (before) {
 			*before /= blocksperpage;
 		}
 
 		if (after) {
 			*after /= blocksperpage;
 		}
 	}
 	return TRUE;
 }
 
 /*
  * Lets the VM system know about a change in size for a file.
  * We adjust our own internal size and flush any cached pages in
  * the associated object that are affected by the size change.
  *
  * Nothing is done when the vnode has no object, when the object is
  * already OBJT_DEAD, or when nsize equals the recorded size.
  *
  * Note: this routine may be invoked as a result of a pager put
  * operation (possibly at object termination time), so we must be careful.
  */
 void
 vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t nobjsize;
 
 	if ((object = vp->v_object) == NULL)
 		return;
 /* 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_setsize and not locked vnode"); */
 	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_DEAD) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	KASSERT(object->type == OBJT_VNODE,
 	    ("not vnode-backed object %p", object));
 	if (nsize == object->un_pager.vnp.vnp_size) {
 		/*
 		 * Hasn't changed size
 		 */
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 	/* New object size in pages, rounding a partial last page up. */
 	nobjsize = OFF_TO_IDX(nsize + PAGE_MASK);
 	if (nsize < object->un_pager.vnp.vnp_size) {
 		/*
 		 * File has shrunk. Toss any cached pages beyond the new EOF.
 		 */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 		/*
 		 * this gets rid of garbage at the end of a page that is now
 		 * only partially backed by the vnode.
 		 *
 		 * XXX for some reason (I don't know yet), if we take a
 		 * completely invalid page and mark it partially valid
 		 * it can screw up NFS reads, so we don't allow the case.
 		 */
 		if ((nsize & PAGE_MASK) &&
 		    (m = vm_page_lookup(object, OFF_TO_IDX(nsize))) != NULL &&
 		    m->valid != 0) {
 			/* base: offset of the new EOF within its page. */
 			int base = (int)nsize & PAGE_MASK;
 			int size = PAGE_SIZE - base;
 
 			/*
 			 * Clear out partial-page garbage in case
 			 * the page has been mapped.
 			 */
 			pmap_zero_page_area(m, base, size);
 
 			/*
 			 * Update the valid bits to reflect the blocks that
 			 * have been zeroed.  Some of these valid bits may
 			 * have already been set.
 			 */
 			vm_page_set_valid_range(m, base, size);
 
 			/*
 			 * Round "base" to the next block boundary so that the
 			 * dirty bit for a partially zeroed block is not
 			 * cleared.
 			 */
 			base = roundup2(base, DEV_BSIZE);
 
 			/*
 			 * Clear out partial-page dirty bits.
 			 *
 			 * note that we do not clear out the valid
 			 * bits.  This would prevent bogus_page
 			 * replacement from working properly.
 			 */
 			vm_page_clear_dirty(m, base, PAGE_SIZE - base);
 		}
 	}
 	/* Record the new size in bytes and in pages. */
 	object->un_pager.vnp.vnp_size = nsize;
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 }
 
 /*
  * calculate the linear (byte) disk address of specified virtual
  * file address
  *
  * Returns -1 when address is negative or the vnode is doomed; otherwise
  * returns the result of VOP_BMAP().  On success *rtaddress, unless it is
  * -1, is advanced by the offset within the block expressed in DEV_BSIZE
  * units, and *run (when run is non-NULL) is converted from a count of
  * additional blocks to a count of pages starting at address.
  */
 static int
 vnode_pager_addr(struct vnode *vp, vm_ooffset_t address, daddr_t *rtaddress,
     int *run)
 {
 	int bsize;
 	int err;
 	daddr_t vblock;
 	daddr_t voffset;
 
 	if (address < 0)
 		return -1;
 
 	if (vp->v_iflag & VI_DOOMED)
 		return -1;
 
 	/* Split the byte address into a block number and an offset in it. */
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	vblock = address / bsize;
 	voffset = address % bsize;
 
 	err = VOP_BMAP(vp, vblock, NULL, rtaddress, run, NULL);
 	if (err == 0) {
 		if (*rtaddress != -1)
 			*rtaddress += voffset / DEV_BSIZE;
 		if (run) {
 			/*
 			 * Count the requested block too, scale blocks to
 			 * pages, then drop the pages that precede address
 			 * within its block.
 			 */
 			*run += 1;
-			*run *= bsize/PAGE_SIZE;
-			*run -= voffset/PAGE_SIZE;
+			*run *= bsize / PAGE_SIZE;
+			*run -= voffset / PAGE_SIZE;
 		}
 	}
 
 	return (err);
 }
 
 /*
  * small block filesystem vnode pager input
  */
 static int
 vnode_pager_input_smlfs(vm_object_t object, vm_page_t m)
 {
 	struct sf_buf *sf;
 	struct bufobj *bo;
 	struct vnode *vp;
 	struct buf *bp;
 	vm_page_bits_t bits;
 	vm_offset_t bsize;
 	daddr_t fileaddr;
 	int blk, error;
 
 	vp = object->handle;
 	if ((vp->v_iflag & VI_DOOMED) != 0)
 		return (VM_PAGER_BAD);
 
 	error = 0;
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 
 	/* Only the buffer object is wanted from this lookup. */
 	VOP_BMAP(vp, 0, &bo, 0, NULL, NULL);
 
 	sf = sf_buf_alloc(m, 0);
 
 	/* Fill the page one filesystem block at a time. */
 	for (blk = 0; blk < PAGE_SIZE / bsize; blk++) {
 		vm_ooffset_t address;
 		vm_offset_t boff;
 
 		boff = blk * bsize;
 		bits = vm_page_bits(boff, bsize);
 
 		/* Skip blocks that already have valid bits set. */
 		if ((m->valid & bits) != 0)
 			continue;
 
 		address = IDX_TO_OFF(m->pindex) + boff;
 		if (address >= object->un_pager.vnp.vnp_size) {
 			/* Past the recorded file size: nothing to read. */
 			fileaddr = -1;
 		} else {
 			error = vnode_pager_addr(vp, address, &fileaddr, NULL);
 			if (error)
 				break;
 		}
 
 		if (fileaddr == -1) {
 			/* No backing block: supply zeroes instead. */
 			bzero((caddr_t)sf_buf_kva(sf) + boff, bsize);
 		} else {
 			bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
 
 			/* Set up just enough of the buffer for one read. */
 			bp->b_iocmd = BIO_READ;
 			bp->b_iodone = bdone;
 			KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 			KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 			bp->b_rcred = crhold(curthread->td_ucred);
 			bp->b_wcred = crhold(curthread->td_ucred);
 			bp->b_data = (caddr_t)sf_buf_kva(sf) + boff;
 			bp->b_blkno = fileaddr;
 			pbgetbo(bo, bp);
 			bp->b_vp = vp;
 			bp->b_bcount = bsize;
 			bp->b_bufsize = bsize;
 			bp->b_runningbufspace = bp->b_bufsize;
 			atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 
 			/* Issue the read and sleep until it completes. */
 			bp->b_iooffset = dbtob(bp->b_blkno);
 			bstrategy(bp);
 
 			bwait(bp, PVM, "vnsrd");
 
 			if ((bp->b_ioflags & BIO_ERROR) != 0)
 				error = EIO;
 
 			/* Detach the buffer and hand it back to its zone. */
 			bp->b_vp = NULL;
 			pbrelbo(bp);
 			uma_zfree(vnode_pbuf_zone, bp);
 			if (error)
 				break;
 		}
 
 		KASSERT((m->dirty & bits) == 0,
 		    ("vnode_pager_input_smlfs: page %p is dirty", m));
 
 		/* Record the freshly filled block as valid. */
 		VM_OBJECT_WLOCK(object);
 		m->valid |= bits;
 		VM_OBJECT_WUNLOCK(object);
 	}
 	sf_buf_free(sf);
 
 	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 }
 
 /*
  * old style vnode pager input routine
  */
 static int
 vnode_pager_input_old(vm_object_t object, vm_page_t m)
 {
 	struct vnode *vp;
 	struct sf_buf *sf;
 	struct iovec aiov;
 	struct uio auio;
 	int error, nread, size;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/* A page starting at or beyond the current EOF cannot be read. */
 	if (IDX_TO_OFF(m->pindex) >= object->un_pager.vnp.vnp_size)
 		return (VM_PAGER_BAD);
 
 	/* Clip the transfer so that it does not run past EOF. */
 	size = PAGE_SIZE;
 	if (IDX_TO_OFF(m->pindex) + size > object->un_pager.vnp.vnp_size)
 		size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(m->pindex);
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 
 	/*
 	 * Obtain a kernel mapping of the page and describe it with a
 	 * single-segment uio so that VOP_READ can fill it.
 	 */
 	sf = sf_buf_alloc(m, 0);
 
 	aiov.iov_base = (caddr_t)sf_buf_kva(sf);
 	aiov.iov_len = size;
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_offset = IDX_TO_OFF(m->pindex);
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_resid = size;
 	auio.uio_td = curthread;
 
 	error = VOP_READ(vp, &auio, 0, curthread->td_ucred);
 	if (error == 0) {
 		nread = size - auio.uio_resid;
 		if (nread == 0) {
 			/* Nothing was transferred at all. */
 			error = EINVAL;
 		} else if (nread != PAGE_SIZE) {
 			/* Zero whatever the read left untouched. */
 			bzero((caddr_t)sf_buf_kva(sf) + nread,
 			    PAGE_SIZE - nread);
 		}
 	}
 	sf_buf_free(sf);
 
 	VM_OBJECT_WLOCK(object);
 
 	KASSERT(m->dirty == 0, ("vnode_pager_input_old: page %p is dirty", m));
 	if (error != 0)
 		return (VM_PAGER_ERROR);
 	m->valid = VM_PAGE_BITS_ALL;
 	return (VM_PAGER_OK);
 }
 
 /*
  * generic vnode pager input routine
  */
 
 /*
  * Local media VFS's that do not implement their own VOP_GETPAGES
  * should have their VOP_GETPAGES call to vnode_pager_generic_getpages()
  * to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_GETPAGES.
  */
 static int
 vnode_pager_getpages(vm_object_t object, vm_page_t *m, int count, int *rbehind,
     int *rahead)
 {
 	struct vnode *vp;
 	int rv;
 
 	/* Fetch the vnode, then drop the object lock across the VOP call. */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 
 	rv = VOP_GETPAGES(vp, m, count, rbehind, rahead);
 	KASSERT(rv != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages not implemented\n"));
 
 	/* Return with the object write-locked again. */
 	VM_OBJECT_WLOCK(object);
 	return (rv);
 }
 
 static int
 vnode_pager_getpages_async(vm_object_t object, vm_page_t *m, int count,
     int *rbehind, int *rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	struct vnode *vp;
 	int rv;
 
 	/* Fetch the vnode, then drop the object lock across the VOP call. */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 
 	rv = VOP_GETPAGES_ASYNC(vp, m, count, rbehind, rahead, iodone, arg);
 	KASSERT(rv != EOPNOTSUPP,
 	    ("vnode_pager: FS getpages_async not implemented\n"));
 
 	/* Return with the object write-locked again. */
 	VM_OBJECT_WLOCK(object);
 	return (rv);
 }
 
 /*
  * The implementation of VOP_GETPAGES() and VOP_GETPAGES_ASYNC() for
  * local filesystems, where partially valid pages can only occur at
  * the end of file.
  */
 int
 vnode_pager_local_getpages(struct vop_getpages_args *ap)
 {
 	int rv;
 
 	/* Synchronous flavour: no completion callback or argument. */
 	rv = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, NULL, NULL);
 	return (rv);
 }
 
 int
 vnode_pager_local_getpages_async(struct vop_getpages_async_args *ap)
 {
 	int rv;
 
 	/* Asynchronous flavour: forward the caller's callback and argument. */
 	rv = vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count,
 	    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
 	return (rv);
 }
 
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_GETPAGES.
  *
  * Reads the "count" pages in m[] for vnode "vp", optionally extending the
  * transfer with read-behind and read-ahead pages.  On entry *a_rbehind and
  * *a_rahead (when the pointers are non-NULL) hold the requested number of
  * extra pages; once the buffer has been assembled they are overwritten
  * with the numbers actually used.  When "iodone" is non-NULL the read is
  * issued asynchronously and completion is reported through "iodone" with
  * "arg"; otherwise the function waits for the I/O to finish.
  *
  * Returns VM_PAGER_BAD for a doomed vnode, VM_PAGER_ERROR when VOP_BMAP()
  * or the synchronous read fails, and VM_PAGER_OK otherwise.  The fallback
  * paths used when VOP_BMAP() is unsupported or the filesystem block is
  * smaller than a page return the status of the last per-page helper call.
  */
 int
 vnode_pager_generic_getpages(struct vnode *vp, vm_page_t *m, int count,
     int *a_rbehind, int *a_rahead, vop_getpages_iodone_t iodone, void *arg)
 {
 	vm_object_t object;
 	struct bufobj *bo;
 	struct buf *bp;
 	off_t foff;
 #ifdef INVARIANTS
 	off_t blkno0;
 #endif
 	int bsize, pagesperblock;
 	int error, before, after, rbehind, rahead, poff, i;
 	int bytecount, secmask;
 
 	KASSERT(vp->v_type != VCHR && vp->v_type != VBLK,
 	    ("%s does not support devices", __func__));
 
 	if (vp->v_iflag & VI_DOOMED)
 		return (VM_PAGER_BAD);
 
 	object = vp->v_object;
 	foff = IDX_TO_OFF(m[0]->pindex);
 	bsize = vp->v_mount->mnt_stat.f_iosize;
 	/* Zero when the filesystem block is smaller than a page. */
 	pagesperblock = bsize / PAGE_SIZE;
 
 	KASSERT(foff < object->un_pager.vnp.vnp_size,
 	    ("%s: page %p offset beyond vp %p size", __func__, m[0], vp));
 	KASSERT(count <= nitems(bp->b_pages),
 	    ("%s: requested %d pages", __func__, count));
 
 	/*
 	 * The last page has valid blocks.  Invalid part can only
 	 * exist at the end of file, and the page is made fully valid
 	 * by zeroing in vm_pager_get_pages().
 	 */
 	if (m[count - 1]->valid != 0 && --count == 0) {
 		/* Nothing is left to read; report completion right away. */
 		if (iodone != NULL)
 			iodone(arg, m, 1, 0);
 		return (VM_PAGER_OK);
 	}
 
 	bp = uma_zalloc(vnode_pbuf_zone, M_WAITOK);
 
 	/*
 	 * Get the underlying device blocks for the file with VOP_BMAP().
 	 * If the file system doesn't support VOP_BMAP, use old way of
 	 * getting pages via VOP_READ.
 	 */
 	error = VOP_BMAP(vp, foff / bsize, &bo, &bp->b_blkno, &after, &before);
 	if (error == EOPNOTSUPP) {
 		uma_zfree(vnode_pbuf_zone, bp);
 		/* vnode_pager_input_old() asserts the object write lock. */
 		VM_OBJECT_WLOCK(object);
 		for (i = 0; i < count; i++) {
 			VM_CNT_INC(v_vnodein);
 			VM_CNT_INC(v_vnodepgsin);
 			error = vnode_pager_input_old(object, m[i]);
 			if (error)
 				break;
 		}
 		VM_OBJECT_WUNLOCK(object);
 		return (error);
 	} else if (error != 0) {
 		uma_zfree(vnode_pbuf_zone, bp);
 		return (VM_PAGER_ERROR);
 	}
 
 	/*
 	 * If the file system supports BMAP, but blocksize is smaller
 	 * than a page size, then use special small filesystem code.
 	 */
 	if (pagesperblock == 0) {
 		uma_zfree(vnode_pbuf_zone, bp);
 		for (i = 0; i < count; i++) {
 			VM_CNT_INC(v_vnodein);
 			VM_CNT_INC(v_vnodepgsin);
 			error = vnode_pager_input_smlfs(object, m[i]);
 			if (error)
 				break;
 		}
 		return (error);
 	}
 
 	/*
 	 * A sparse file can be encountered only for a single page request,
 	 * which may not be preceded by call to vm_pager_haspage().
 	 */
 	if (bp->b_blkno == -1) {
 		KASSERT(count == 1,
 		    ("%s: array[%d] request to a sparse file %p", __func__,
 		    count, vp));
 		uma_zfree(vnode_pbuf_zone, bp);
 		/* No backing block: the page is simply zero-filled. */
 		pmap_zero_page(m[0]);
 		KASSERT(m[0]->dirty == 0, ("%s: page %p is dirty",
 		    __func__, m[0]));
 		VM_OBJECT_WLOCK(object);
 		m[0]->valid = VM_PAGE_BITS_ALL;
 		VM_OBJECT_WUNLOCK(object);
 		return (VM_PAGER_OK);
 	}
 
 #ifdef INVARIANTS
 	/* Remember the block start for the offset assertion further down. */
 	blkno0 = bp->b_blkno;
 #endif
 	/* Advance to the DEV_BSIZE sector of m[0] within its block. */
 	bp->b_blkno += (foff % bsize) / DEV_BSIZE;
 
 	/* Recalculate blocks available after/before to pages. */
 	poff = (foff % bsize) / PAGE_SIZE;
 	before *= pagesperblock;
 	before += poff;
 	after *= pagesperblock;
 	after += pagesperblock - (poff + 1);
 	/* Do not count pages beyond the end of the object. */
 	if (m[0]->pindex + after >= object->size)
 		after = object->size - 1 - m[0]->pindex;
 	KASSERT(count <= after + 1, ("%s: %d pages asked, can do only %d",
 	    __func__, count, after + 1));
 	/* "after" now counts pages past the last requested page. */
 	after -= count - 1;
 
 	/* Trim requested rbehind/rahead to possible values. */
 	rbehind = a_rbehind ? *a_rbehind : 0;
 	rahead = a_rahead ? *a_rahead : 0;
 	rbehind = min(rbehind, before);
 	rbehind = min(rbehind, m[0]->pindex);
 	rahead = min(rahead, after);
 	rahead = min(rahead, object->size - m[count - 1]->pindex);
 	/*
 	 * Check that total amount of pages fit into buf.  Trim rbehind and
 	 * rahead evenly if not.
 	 */
 	if (rbehind + rahead + count > nitems(bp->b_pages)) {
 		int trim, sum;
 
 		trim = rbehind + rahead + count - nitems(bp->b_pages) + 1;
 		sum = rbehind + rahead;
 		if (rbehind == before) {
 			/* Roundup rbehind trim to block size. */
 			rbehind -= roundup(trim * rbehind / sum, pagesperblock);
 			if (rbehind < 0)
 				rbehind = 0;
 		} else
 			rbehind -= trim * rbehind / sum;
 		rahead -= trim * rahead / sum;
 	}
 	KASSERT(rbehind + rahead + count <= nitems(bp->b_pages),
 	    ("%s: behind %d ahead %d count %d", __func__,
 	    rbehind, rahead, count));
 
 	/*
 	 * Fill in the bp->b_pages[] array with requested and optional
 	 * read behind or read ahead pages.  Read behind pages are looked
 	 * up in a backward direction, down to a first cached page.  Same
 	 * for read ahead pages, but there is no need to shift the array
 	 * in case of encountering a cached page.
 	 */
 	i = bp->b_npages = 0;
 	if (rbehind) {
 		vm_pindex_t startpindex, tpindex;
 		vm_page_t p;
 
 		VM_OBJECT_WLOCK(object);
 		startpindex = m[0]->pindex - rbehind;
 		/* Stop short of the nearest page already in the object. */
 		if ((p = TAILQ_PREV(m[0], pglist, listq)) != NULL &&
 		    p->pindex >= startpindex)
 			startpindex = p->pindex + 1;
 
 		/* tpindex is unsigned; beware of numeric underflow. */
 		for (tpindex = m[0]->pindex - 1;
 		    tpindex >= startpindex && tpindex < m[0]->pindex;
 		    tpindex--, i++) {
 			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
 			if (p == NULL) {
 				/* Shift the array. */
 				for (int j = 0; j < i; j++)
 					bp->b_pages[j] = bp->b_pages[j + 
 					    tpindex + 1 - startpindex]; 
 				break;
 			}
 			bp->b_pages[tpindex - startpindex] = p;
 		}
 
 		/* "i" is the number of read-behind pages actually obtained. */
 		bp->b_pgbefore = i;
 		bp->b_npages += i;
 		/* Move the starting sector back to cover those pages. */
 		bp->b_blkno -= IDX_TO_OFF(i) / DEV_BSIZE;
 	} else
 		bp->b_pgbefore = 0;
 
 	/* Requested pages. */
 	for (int j = 0; j < count; j++, i++)
 		bp->b_pages[i] = m[j];
 	bp->b_npages += count;
 
 	if (rahead) {
 		vm_pindex_t endpindex, tpindex;
 		vm_page_t p;
 
 		/* The lock may still be held from the read-behind pass. */
 		if (!VM_OBJECT_WOWNED(object))
 			VM_OBJECT_WLOCK(object);
 		endpindex = m[count - 1]->pindex + rahead + 1;
 		/* Stop at the next page already in the object, or its end. */
 		if ((p = TAILQ_NEXT(m[count - 1], listq)) != NULL &&
 		    p->pindex < endpindex)
 			endpindex = p->pindex;
 		if (endpindex > object->size)
 			endpindex = object->size;
 
 		for (tpindex = m[count - 1]->pindex + 1;
 		    tpindex < endpindex; i++, tpindex++) {
 			p = vm_page_alloc(object, tpindex, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				break;
 			bp->b_pages[i] = p;
 		}
 
 		bp->b_pgafter = i - bp->b_npages;
 		bp->b_npages = i;
 	} else
 		bp->b_pgafter = 0;
 
 	if (VM_OBJECT_WOWNED(object))
 		VM_OBJECT_WUNLOCK(object);
 
 	/* Report back actual behind/ahead read. */
 	if (a_rbehind)
 		*a_rbehind = bp->b_pgbefore;
 	if (a_rahead)
 		*a_rahead = bp->b_pgafter;
 
 #ifdef INVARIANTS
 	KASSERT(bp->b_npages <= nitems(bp->b_pages),
 	    ("%s: buf %p overflowed", __func__, bp));
 	/* Verify that the non-bogus pages have consecutive indices. */
 	for (int j = 1, prev = 0; j < bp->b_npages; j++) {
 		if (bp->b_pages[j] == bogus_page)
 			continue;
 		KASSERT(bp->b_pages[j]->pindex - bp->b_pages[prev]->pindex ==
 		    j - prev, ("%s: pages array not consecutive, bp %p",
 		     __func__, bp));
 		prev = j;
 	}
 #endif
 
 	/*
 	 * Recalculate first offset and bytecount with regards to read behind.
 	 * Truncate bytecount to vnode real size and round up physical size
 	 * for real devices.
 	 */
 	foff = IDX_TO_OFF(bp->b_pages[0]->pindex);
 	bytecount = bp->b_npages << PAGE_SHIFT;
 	if ((foff + bytecount) > object->un_pager.vnp.vnp_size)
 		bytecount = object->un_pager.vnp.vnp_size - foff;
 	secmask = bo->bo_bsize - 1;
 	KASSERT(secmask < PAGE_SIZE && secmask > 0,
 	    ("%s: sector size %d too large", __func__, secmask + 1));
 	bytecount = (bytecount + secmask) & ~secmask;
 
 	/*
 	 * And map the pages to be read into the kva, if the filesystem
 	 * requires mapped buffers.
 	 */
 	if ((vp->v_mount->mnt_kern_flag & MNTK_UNMAPPED_BUFS) != 0 &&
 	    unmapped_buf_allowed) {
 		bp->b_data = unmapped_buf;
 		bp->b_offset = 0;
 	} else {
 		bp->b_data = bp->b_kvabase;
 		pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
 	}
 
 	/* Build a minimal buffer header. */
 	bp->b_iocmd = BIO_READ;
 	KASSERT(bp->b_rcred == NOCRED, ("leaking read ucred"));
 	KASSERT(bp->b_wcred == NOCRED, ("leaking write ucred"));
 	bp->b_rcred = crhold(curthread->td_ucred);
 	bp->b_wcred = crhold(curthread->td_ucred);
 	pbgetbo(bo, bp);
 	bp->b_vp = vp;
 	bp->b_bcount = bp->b_bufsize = bp->b_runningbufspace = bytecount;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	/* Cross-check the starting sector against the first buffer page. */
 	KASSERT(IDX_TO_OFF(m[0]->pindex - bp->b_pages[0]->pindex) ==
 	    (blkno0 - bp->b_blkno) * DEV_BSIZE +
 	    IDX_TO_OFF(m[0]->pindex) % bsize,
 	    ("wrong offsets bsize %d m[0] %ju b_pages[0] %ju "
 	    "blkno0 %ju b_blkno %ju", bsize,
 	    (uintmax_t)m[0]->pindex, (uintmax_t)bp->b_pages[0]->pindex,
 	    (uintmax_t)blkno0, (uintmax_t)bp->b_blkno));
 
 	atomic_add_long(&runningbufspace, bp->b_runningbufspace);
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, bp->b_npages);
 
 	if (iodone != NULL) { /* async */
 		/*
 		 * The completion handler finishes the pages, calls the
 		 * caller's iodone and releases the buffer.
 		 */
 		bp->b_pgiodone = iodone;
 		bp->b_caller1 = arg;
 		bp->b_iodone = vnode_pager_generic_getpages_done_async;
 		bp->b_flags |= B_ASYNC;
 		BUF_KERNPROC(bp);
 		bstrategy(bp);
 		return (VM_PAGER_OK);
 	} else {
 		/* Synchronous: wait, finish the pages, release the buffer. */
 		bp->b_iodone = bdone;
 		bstrategy(bp);
 		bwait(bp, PVM, "vnread");
 		error = vnode_pager_generic_getpages_done(bp);
 		for (i = 0; i < bp->b_npages; i++)
 			bp->b_pages[i] = NULL;
 		bp->b_vp = NULL;
 		pbrelbo(bp);
 		uma_zfree(vnode_pbuf_zone, bp);
 		return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
 	}
 }
 
 static void
 vnode_pager_generic_getpages_done_async(struct buf *bp)
 {
 	int error, idx;
 
 	error = vnode_pager_generic_getpages_done(bp);
 
 	/*
 	 * Hand only the originally requested pages to the caller's
 	 * completion routine, leaving out the read-behind and read-ahead
 	 * pages on either side.
 	 */
 	bp->b_pgiodone(bp->b_caller1, &bp->b_pages[bp->b_pgbefore],
 	    bp->b_npages - bp->b_pgbefore - bp->b_pgafter, error);
 
 	/* Drop the page references and give the buffer back. */
 	for (idx = 0; idx < bp->b_npages; idx++)
 		bp->b_pages[idx] = NULL;
 	bp->b_vp = NULL;
 	pbrelbo(bp);
 	uma_zfree(vnode_pbuf_zone, bp);
 }
 
 static int
 vnode_pager_generic_getpages_done(struct buf *bp)
 {
 	vm_object_t object;
 	off_t tfoff, nextoff;
 	int i, error;
 
 	error = (bp->b_ioflags & BIO_ERROR) != 0 ? EIO : 0;
 	object = bp->b_vp->v_object;
 
 	if (error == 0 && bp->b_bcount != bp->b_npages * PAGE_SIZE) {
 		if (!buf_mapped(bp)) {
 			bp->b_data = bp->b_kvabase;
 			pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages,
 			    bp->b_npages);
 		}
 		bzero(bp->b_data + bp->b_bcount,
 		    PAGE_SIZE * bp->b_npages - bp->b_bcount);
 	}
 	if (buf_mapped(bp)) {
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 		bp->b_data = unmapped_buf;
 	}
 
 	VM_OBJECT_WLOCK(object);
 	for (i = 0, tfoff = IDX_TO_OFF(bp->b_pages[0]->pindex);
 	    i < bp->b_npages; i++, tfoff = nextoff) {
 		vm_page_t mt;
 
 		nextoff = tfoff + PAGE_SIZE;
 		mt = bp->b_pages[i];
 
 		if (nextoff <= object->un_pager.vnp.vnp_size) {
 			/*
 			 * Read filled up entire page.
 			 */
 			mt->valid = VM_PAGE_BITS_ALL;
 			KASSERT(mt->dirty == 0,
 			    ("%s: page %p is dirty", __func__, mt));
 			KASSERT(!pmap_page_is_mapped(mt),
 			    ("%s: page %p is mapped", __func__, mt));
 		} else {
 			/*
 			 * Read did not fill up entire page.
 			 *
 			 * Currently we do not set the entire page valid,
 			 * we just try to clear the piece that we couldn't
 			 * read.
 			 */
 			vm_page_set_valid_range(mt, 0,
 			    object->un_pager.vnp.vnp_size - tfoff);
 			KASSERT((mt->dirty & vm_page_bits(0,
 			    object->un_pager.vnp.vnp_size - tfoff)) == 0,
 			    ("%s: page %p is dirty", __func__, mt));
 		}
 
 		if (i < bp->b_pgbefore || i >= bp->b_npages - bp->b_pgafter)
 			vm_page_readahead_finish(mt);
 	}
 	VM_OBJECT_WUNLOCK(object);
 	if (error != 0)
 		printf("%s: I/O read error %d\n", __func__, error);
 
 	return (error);
 }
 
 /*
  * EOPNOTSUPP is no longer legal.  For local media VFS's that do not
  * implement their own VOP_PUTPAGES, their VOP_PUTPAGES should call to
  * vnode_pager_generic_putpages() to implement the previous behaviour.
  *
  * All other FS's should use the bypass to get to the local media
  * backing vp's VOP_PUTPAGES.
  */
 static void
 vnode_pager_putpages(vm_object_t object, vm_page_t *m, int count,
     int flags, int *rtvals)
 {
 	struct vnode *vp;
 	int bytes, rtval;
 
 	bytes = count * PAGE_SIZE;
 
 	/*
 	 * When free memory is critically short, switch to synchronous
 	 * writes to avoid a low-memory deadlock: VOP operations frequently
 	 * have to allocate memory (for a BMAP, for instance) before they
 	 * can start the I/O.  The swap pager copes by capping its
 	 * asynchronous I/O, but doing the same for the vnode pager would
 	 * take considerable work.
 	 *
 	 * XXX The backing vnode's iodone routine may also fail to wake up
 	 * the pageout daemon; this should probably be addressed.
 	 */
 	if (vm_page_count_min())
 		flags |= VM_PAGER_PUT_SYNC;
 
 	/* Hand the pages to the filesystem with the object unlocked. */
 	vp = object->handle;
 	VM_OBJECT_WUNLOCK(object);
 	rtval = VOP_PUTPAGES(vp, m, bytes, flags, rtvals);
 	KASSERT(rtval != EOPNOTSUPP,
 	    ("vnode_pager: stale FS putpages\n"));
 	VM_OBJECT_WLOCK(object);
 }
 
 static int
 vn_off2bidx(vm_ooffset_t offset)
 {
 	vm_ooffset_t inpage;
 
 	/* Offset within the page, converted to a DEV_BSIZE block index. */
 	inpage = offset & PAGE_MASK;
 	return (inpage / DEV_BSIZE);
 }
 
 static bool
 vn_dirty_blk(vm_page_t m, vm_ooffset_t offset)
 {
 	vm_page_bits_t blkbit;
 
 	/* The offset has to fall inside the page being examined. */
 	KASSERT(IDX_TO_OFF(m->pindex) <= offset &&
 	    offset < IDX_TO_OFF(m->pindex + 1),
 	    ("page %p pidx %ju offset %ju", m, (uintmax_t)m->pindex,
 	    (uintmax_t)offset));
 
 	/* Test the dirty bit of the DEV_BSIZE block holding the offset. */
 	blkbit = (vm_page_bits_t)1 << vn_off2bidx(offset);
 	return ((m->dirty & blkbit) != 0);
 }
 
 /*
  * This is now called from local media FS's to operate against their
  * own vnodes if they fail to implement VOP_PUTPAGES.
  *
  * This is typically called indirectly via the pageout daemon and
  * clustering has already typically occurred, so in general we ask the
  * underlying filesystem to write the data out asynchronously rather
  * than delayed.
  *
  * Writes the dirty DEV_BSIZE blocks of the bytecount / PAGE_SIZE pages in
  * ma[] with VOP_WRITE(), one run of consecutive dirty blocks at a time,
  * and stores a VM_PAGER_* status for every page in rtvals[].  Returns
  * rtvals[0], or VM_PAGER_BAD when the first page has a negative pindex.
  */
 int
 vnode_pager_generic_putpages(struct vnode *vp, vm_page_t *ma, int bytecount,
     int flags, int *rtvals)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_ooffset_t maxblksz, next_offset, poffset, prev_offset;
 	struct uio auio;
 	struct iovec aiov;
 	off_t prev_resid, wrsz;
 	int count, error, i, maxsize, ncount, pgoff, ppscheck;
 	bool in_hole;
 	/* Rate-limiting state for the diagnostics, shared by all calls. */
 	static struct timeval lastfail;
 	static int curfail;
 
 	object = vp->v_object;
 	count = bytecount / PAGE_SIZE;
 
 	/* Assume failure until a page is known to have been handled. */
 	for (i = 0; i < count; i++)
 		rtvals[i] = VM_PAGER_ERROR;
 
 	if ((int64_t)ma[0]->pindex < 0) {
 		printf("vnode_pager_generic_putpages: "
 		    "attempt to write meta-data 0x%jx(%lx)\n",
 		    (uintmax_t)ma[0]->pindex, (u_long)ma[0]->dirty);
 		rtvals[0] = VM_PAGER_BAD;
 		return (VM_PAGER_BAD);
 	}
 
 	maxsize = count * PAGE_SIZE;
 	ncount = count;
 
 	poffset = IDX_TO_OFF(ma[0]->pindex);
 
 	/*
 	 * If the page-aligned write is larger than the actual file we
 	 * have to invalidate pages occurring beyond the file EOF.  However,
 	 * there is an edge case where a file may not be page-aligned where
 	 * the last page is partially invalid.  In this case the filesystem
 	 * may not properly clear the dirty bits for the entire page (which
 	 * could be VM_PAGE_BITS_ALL due to the page having been mmap()d).
 	 * With the page locked we are free to fix-up the dirty bits here.
 	 *
 	 * We do not under any circumstances truncate the valid bits, as
 	 * this will screw up bogus page replacement.
 	 */
 	VM_OBJECT_RLOCK(object);
 	if (maxsize + poffset > object->un_pager.vnp.vnp_size) {
 		if (!VM_OBJECT_TRYUPGRADE(object)) {
 			/*
 			 * The lock is dropped while switching to a write
 			 * lock, so the size check has to be repeated.
 			 */
 			VM_OBJECT_RUNLOCK(object);
 			VM_OBJECT_WLOCK(object);
 			if (maxsize + poffset <= object->un_pager.vnp.vnp_size)
 				goto downgrade;
 		}
 		if (object->un_pager.vnp.vnp_size > poffset) {
 			/* Clip the write to the end of file. */
 			maxsize = object->un_pager.vnp.vnp_size - poffset;
 			ncount = btoc(maxsize);
 			if ((pgoff = (int)maxsize & PAGE_MASK) != 0) {
 				pgoff = roundup2(pgoff, DEV_BSIZE);
 
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 		("vnode_pager_generic_putpages: page %p is not read-only", m));
 				MPASS(m->dirty != 0);
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
 		} else {
 			/* The whole run lies at or beyond EOF. */
 			maxsize = 0;
 			ncount = 0;
 		}
 		/* Pages entirely past EOF are reported as bad. */
 		for (i = ncount; i < count; i++)
 			rtvals[i] = VM_PAGER_BAD;
 downgrade:
 		VM_OBJECT_LOCK_DOWNGRADE(object);
 	}
 
 	auio.uio_iov = &aiov;
 	auio.uio_segflg = UIO_NOCOPY;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = NULL;
 	/* End of the range to scan, rounded up to a DEV_BSIZE boundary. */
 	maxblksz = roundup2(poffset + maxsize, DEV_BSIZE);
 
 	for (prev_offset = poffset; prev_offset < maxblksz;) {
 		/* Skip clean blocks. */
 		for (in_hole = true; in_hole && prev_offset < maxblksz;) {
 			m = ma[OFF_TO_IDX(prev_offset - poffset)];
 			for (i = vn_off2bidx(prev_offset);
 			    i < sizeof(vm_page_bits_t) * NBBY &&
 			    prev_offset < maxblksz; i++) {
 				if (vn_dirty_blk(m, prev_offset)) {
 					in_hole = false;
 					break;
 				}
 				prev_offset += DEV_BSIZE;
 			}
 		}
 		/* No dirty block was found before the end of the range. */
 		if (in_hole)
 			goto write_done;
 
 		/* Find longest run of dirty blocks. */
 		for (next_offset = prev_offset; next_offset < maxblksz;) {
 			m = ma[OFF_TO_IDX(next_offset - poffset)];
 			for (i = vn_off2bidx(next_offset);
 			    i < sizeof(vm_page_bits_t) * NBBY &&
 			    next_offset < maxblksz; i++) {
 				if (!vn_dirty_blk(m, next_offset))
 					goto start_write;
 				next_offset += DEV_BSIZE;
 			}
 		}
 start_write:
 		/* Never write beyond the (possibly clipped) end of the run. */
 		if (next_offset > poffset + maxsize)
 			next_offset = poffset + maxsize;
 
 		/*
 		 * Getting here requires finding a dirty block in the
 		 * 'skip clean blocks' loop.
 		 */
 		MPASS(prev_offset < next_offset);
 
 		/* The object lock is not held across VOP_WRITE(). */
 		VM_OBJECT_RUNLOCK(object);
 		aiov.iov_base = NULL;
 		auio.uio_iovcnt = 1;
 		auio.uio_offset = prev_offset;
 		prev_resid = auio.uio_resid = aiov.iov_len = next_offset -
 		    prev_offset;
 		error = VOP_WRITE(vp, &auio,
 		    vnode_pager_putpages_ioflags(flags), curthread->td_ucred);
 
 		/* Number of bytes the filesystem consumed. */
 		wrsz = prev_resid - auio.uio_resid;
 		if (wrsz == 0) {
 			if (ppsratecheck(&lastfail, &curfail, 1) != 0) {
 				vn_printf(vp, "vnode_pager_putpages: "
 				    "zero-length write at %ju resid %zd\n",
 				    auio.uio_offset, auio.uio_resid);
 			}
 			/* Retake the lock that write_done expects to drop. */
 			VM_OBJECT_RLOCK(object);
 			break;
 		}
 
 		/* Adjust the starting offset for next iteration. */
 		prev_offset += wrsz;
 		MPASS(auio.uio_offset == prev_offset);
 
 		ppscheck = 0;
 		if (error != 0 && (ppscheck = ppsratecheck(&lastfail,
 		    &curfail, 1)) != 0)
 			vn_printf(vp, "vnode_pager_putpages: I/O error %d\n",
 			    error);
 		if (auio.uio_resid != 0 && (ppscheck != 0 ||
 		    ppsratecheck(&lastfail, &curfail, 1) != 0))
 			vn_printf(vp, "vnode_pager_putpages: residual I/O %zd "
 			    "at %ju\n", auio.uio_resid,
 			    (uintmax_t)ma[0]->pindex);
 		VM_OBJECT_RLOCK(object);
 		/* Stop after an error or a short write. */
 		if (error != 0 || auio.uio_resid != 0)
 			break;
 	}
 write_done:
 	/* Mark completely processed pages. */
 	for (i = 0; i < OFF_TO_IDX(prev_offset - poffset); i++)
 		rtvals[i] = VM_PAGER_OK;
 	/* Mark partial EOF page. */
 	if (prev_offset == poffset + maxsize && (prev_offset & PAGE_MASK) != 0)
 		rtvals[i++] = VM_PAGER_OK;
 	/* Unwritten pages in range, free bonus if the page is clean. */
 	for (; i < ncount; i++)
 		rtvals[i] = ma[i]->dirty == 0 ? VM_PAGER_OK : VM_PAGER_ERROR;
 	VM_OBJECT_RUNLOCK(object);
 	VM_CNT_ADD(v_vnodepgsout, i);
 	VM_CNT_INC(v_vnodeout);
 	return (rtvals[0]);
 }
 
 int
 vnode_pager_putpages_ioflags(int pager_flags)
 {
 	int ioflags;
 
 	/*
 	 * Pageout requests arrive already clustered.  Max out the
 	 * sequential heuristic so that large ranges cluster, and ask for
 	 * IO_ASYNC so that a bawrite() is used instead of a bdwrite(),
 	 * which keeps paging I/O from saturating the buffer cache.  With
 	 * neither IO_SYNC nor IO_ASYNC set, the system decides how to
 	 * cluster.
 	 */
 	ioflags = IO_VMIO | (IO_SEQMAX << IO_SEQSHIFT);
 
 	if ((pager_flags & (VM_PAGER_PUT_SYNC | VM_PAGER_PUT_INVAL)) != 0)
 		ioflags |= IO_SYNC;
 	else if ((pager_flags & VM_PAGER_CLUSTER_OK) == 0)
 		ioflags |= IO_ASYNC;
 
 	if ((pager_flags & VM_PAGER_PUT_INVAL) != 0)
 		ioflags |= IO_INVAL;
 	if ((pager_flags & VM_PAGER_PUT_NOREUSE) != 0)
 		ioflags |= IO_NOREUSE;
 
 	return (ioflags);
 }
 
 /*
  * vnode_pager_undirty_pages().
  *
  * A helper to mark pages as clean after pageout that was possibly
  * done with a short write.  The lpos argument specifies the page run
  * length in bytes, and the written argument specifies how many bytes
  * were actually written.  eof is the offset past the last valid byte
  * in the vnode using the absolute file position of the first byte in
  * the run as the base from which it is computed.
  */
 void
 vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written, off_t eof,
     int lpos)
 {
 	vm_object_t obj;
 	int i, pos, pos_devb;
 
 	/* Nothing was written and nothing lies past EOF: no work to do. */
 	if (written == 0 && eof >= lpos)
 		return;
 
 	obj = ma[0]->object;
 	VM_OBJECT_WLOCK(obj);
 
 	/* First pass: account for the bytes that were written. */
 	for (i = 0, pos = 0; pos < written; i++, pos += PAGE_SIZE) {
 		if (pos >= trunc_page(written)) {
 			/* Only the leading part of this page was written. */
 			rtvals[i] = VM_PAGER_AGAIN;
 			vm_page_clear_dirty(ma[i], 0, written & PAGE_MASK);
 		} else {
 			rtvals[i] = VM_PAGER_OK;
 			vm_page_undirty(ma[i]);
 		}
 	}
 
 	/* Second pass: only needed when the run extends past EOF. */
 	if (eof < lpos) {
 		pos = eof;
 		i = OFF_TO_IDX(trunc_page(pos));
 		while (pos < lpos) {
 			if (pos == trunc_page(pos)) {
 				/* vm_pageout_flush() clears dirty */
 				rtvals[i] = VM_PAGER_BAD;
 				pos += PAGE_SIZE;
 			} else {
 				/*
 				 * This page holds the last valid byte of
 				 * the vnode.  Clean everything after it,
 				 * which may leave the whole page clean.
 				 */
 				pos_devb = roundup2(pos & PAGE_MASK, DEV_BSIZE);
 				vm_page_clear_dirty(ma[i], pos_devb, PAGE_SIZE -
 				    pos_devb);
 
 				/*
 				 * A page that ended up clean is reported as
 				 * successfully paged out, so that msync()
 				 * stops issuing write requests for it and
 				 * creating dirty buffers without end.
 				 */
 				if (ma[i]->dirty == 0)
 					rtvals[i] = VM_PAGER_OK;
 
 				pos = round_page(pos);
 			}
 			i++;
 		}
 	}
 
 	VM_OBJECT_WUNLOCK(obj);
 }
 
 void
 vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 	struct vnode *vp;
 	vm_ooffset_t prev_wm;
 
 	VM_OBJECT_WLOCK(object);
 	if (object->type == OBJT_VNODE) {
 		/* Adjust the mapped-for-write byte count by (end - start). */
 		prev_wm = object->un_pager.vnp.writemappings;
 		object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
 		vp = object->handle;
 
 		/*
 		 * The vnode's write count changes only when the byte count
 		 * moves between zero and non-zero.
 		 */
 		if (prev_wm == 0) {
 			if (object->un_pager.vnp.writemappings != 0) {
 				ASSERT_VOP_LOCKED(vp, "v_writecount inc");
 				VOP_ADD_WRITECOUNT_CHECKED(vp, 1);
 				CTR3(KTR_VFS,
 				    "%s: vp %p v_writecount increased to %d",
 				    __func__, vp, vp->v_writecount);
 			}
 		} else if (object->un_pager.vnp.writemappings == 0) {
 			ASSERT_VOP_LOCKED(vp, "v_writecount dec");
 			VOP_ADD_WRITECOUNT_CHECKED(vp, -1);
 			CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
 			    __func__, vp, vp->v_writecount);
 		}
 	}
 	VM_OBJECT_WUNLOCK(object);
 }
 
 void
 vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
     vm_offset_t end)
 {
 	struct mount *mp;
 	struct vnode *vp;
 	vm_offset_t span;
 
 	VM_OBJECT_WLOCK(object);
 
 	/*
 	 * The vnode may have been reclaimed in the meantime, so check
 	 * the object type again now that the lock is held.
 	 */
 	if (object->type != OBJT_VNODE) {
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	/*
 	 * Fast path: writemappings stays non-zero after the release, so
 	 * the subtraction is all that is needed.
 	 */
 	span = end - start;
 	if (object->un_pager.vnp.writemappings != span) {
 		object->un_pager.vnp.writemappings -= span;
 		VM_OBJECT_WUNLOCK(object);
 		return;
 	}
 
 	/*
 	 * Slow path: writemappings is about to reach zero.  Hold the
 	 * vnode, drop the object lock, then start the write and lock
 	 * the vnode.
 	 */
 	vp = object->handle;
 	vhold(vp);
 	VM_OBJECT_WUNLOCK(object);
 	mp = NULL;
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
 	/*
 	 * Passing "end" and "start" swapped makes
 	 * vnode_pager_update_writecount() subtract the range from the
 	 * object's writemappings.  Unless this raced with vnode
 	 * reclamation, the vnode's v_writecount is decremented as well.
 	 */
 	vnode_pager_update_writecount(object, end, start);
 	VOP_UNLOCK(vp, 0);
 	vdrop(vp);
 	if (mp != NULL)
 		vn_finished_write(mp);
 }